How to cluster customers into groups based on shopping behavior using Clustering Algorithm in R?

Description

To cluster customers into groups based on the shopping behavior using Clustering Algorithm in R.

Step 1: Loading the required packages and import the data

Step 2: Data Preparation : Scaling (Normalizing) the input data and filling the missing values .

Step 3: Visualizing the data using ggplot2 and plotly R packages.

Step 4: Finding the best Clustering Algorithm for our data set using two validation techniques internal and stability.

Step 5: Computing Hierarchical Clustering and plotting dendrogram

Step 6: Compare the average values in each of the variables for the 2 clusters (the centroids of the clusters).

#Online Shopping Survey

#Reading data from Excel

#install.packages(“xlsx”)
library(“xlsx”)
my_input<-read.xlsx(“OnlineShopping.xlsx”)
View(my_input)

#Descriptive Statistics

str(my_input)
summary(my_input)

#Data Preparation

#Checking Missing values

sum(is.na(my_input))

#ggplot2 age,name,online Shoppers

library(“ggplot2”)

#install.packages(“ggpubr”)
library(“ggpubr”)
library(“plotly”)

dot<-ggdotchart(my_input,x=”name”,y= “age”,color = “online_shopping”,group = “online_shopping”,ggtheme = theme_pubclean(),add=”segments”,sorting=”desc”,add.params = list(color = “lightgray”,size=3),dot.size = 4) + labs(title=”Plotting of Age, Name versus Online Shoppers using ggplotly”,x=”Name”,y=”Age”)
ggplotly(dot)

#Plotting of Commodity versus age

#install.packages(“ggridges”)

library(“ggridges”)

ggplot(my_input, aes(x = age, y = as.factor(commodity))) +
geom_density_ridges(aes(fill = as.factor(commodity))) +
scale_fill_manual(values = c(“#00AFBB”, “#E7B800”, “#FC4E07”,391),
labels = c(“Metal(Gold,Silver,etc)”, “Cosmetics”,”Electronic Items”,”Home Appliances”),
name = “Commodity”) +
labs(title=”Plotting Age versus Commodity”,x=”Age”,y=”Commodity”) +
theme(legend.background = element_rect(fill=”gray90″)) + theme_pubclean()

#Online Shoppers and thier problems

ggplot(my_input, aes(x= what_problem, group=online_shopping)) +
geom_bar(aes(y = ..prop.., fill = factor(..x..))) +
geom_text(aes( label = scales::percent(..prop..),
y= ..prop.. ), stat= “count”, vjust = -.5) +
labs(y = “Percentage”, fill=”what_problem”) +
facet_grid(~online_shopping) +
scale_y_continuous(labels = scales::percent) +
theme_pubclean() +
scale_fill_manual(values = c(“#00AFBB”, “#E7B800”, “#FC4E07”,391,78,87),labels = c(“No Problem”,”Delay in Delivery”,”Cheap Quality of Product”,”Product Damage”,”Non-Delivery”,”Payment Problem”)) +
labs(title=”Online Shoppers and thier problems”,x=”Problem in Online Shopping”,y=”Percentage”,face=”bold”) +
theme(legend.title = element_text(face=”bold”),legend.background = element_rect(fill=”darkgrey”))

#Age Versus Salary

as<-ggplot(my_input,aes(x=age,y=income)) + geom_bar(color=”red”,fill=”yellow”,stat = “identity”) + scale_fill_grey() + theme(legend.position = “none”) +
labs(title=”Age Versus Salary”,x=”Age”,y=”Income”) + theme_pubclean()

ggplotly(as)

#Favourite Online Shopping Websites

fav<-ggplot(my_input,aes(x=favourite)) + geom_bar(fill=”steelblue”,width = 0.70) + coord_flip() + theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(color =”red”,face = “bold”,size = 10)) +
labs(title=”Favourite Online Websites”)

ggplotly(fav)

#Finding the best Clustering algorithm for our data

#install.packages(“clValid”)
library(clValid)

#Internal Validation Measures

#Compute clValid
clmethods<-c(“hierarchical”,”kmeans”,”pam”)
intern<-clValid(my_input[,c(3:15)], nClust = 2:6,
clMethods = clmethods, validation = “internal”)

str(my_input[,c(3:15)])
# Summary
summary(intern)

#Ploting the summary
plot(intern)

#Stability Validation Measures

stab<-clValid(my_input[,c(3:15)],nClust=2:6,clMethods = clmethods,validation = “stability”)

#Summary
summary(stab)

#Plotting the summary
plot(stab)

#Hierarchical Clustering

#Finding the more appropriate method for more strongest clustering structure

#install.packages(“purrr”)
library(“purrr”)

m<-c(“complete”,”single”,”ward”,”average”)
names(m)<-c(“complete”,”single”,”ward”,”average”)
agg_coef<-function(x){
agnes(my_input[,3:15],method = x)$ac
}

map_dbl(m,agg_coef)

#Compute hclust
h_dist<-dist(my_input[,3:15],method = “euclidean”)
h_data<-hclust(h_dist, method = “ward.D”)

#Plotting Dendrogram
plot(h_data)
rect.hclust(h_data,k=2)

#Finding the optimal No of clusters
library(“factoextra”)

fviz_nbclust(my_input,hcut,method = “silhouette”)

clusgrp<-cutree(h_data,k=2)
table(clusgrp)

age<-tapply(my_input[,3], clusgrp, mean)

cat(“Age of two groups\n”)
print(age)

inc<-tapply(my_input[,4], clusgrp, mean)

cat(“Income of two groups\n”)
print(inc)

ed<-tapply(my_input[,5], clusgrp, mean)

cat(“Education of two groups\n”)
print(ed)

mode<-function(f){
uni<-unique(f)
uni[which.max(tabulate(match(f,uni)))]
}

onshop<-tapply(my_input[,6], clusgrp, mode)

cat(“Online Shopping of two groups\n”)
print(onshop)

save<-tapply(my_input$save_time, clusgrp, mode)

cat(“Shopping on Internet saves time”)
print(save)

any_time<-tapply(my_input$any_time, clusgrp, mode)

cat(“It is a great advantage to be able to shop at any time of the day”)
print(any_time)

traditional<-tapply(my_input$traditional_online, clusgrp, mode)

cat(“I prefer traditional shopping to online shopping”)
print(traditional)

credit<-tapply(my_input$credit_card, clusgrp, mode)

cat(“While shopping online I hesitate to give my credit card number”)
print(credit)

cash<-tapply(my_input$cash_on_delivery, clusgrp, mode)

cat(“I cash on delivery than payment via credit/debit card”)
print(cash)

#Bar chart of “Shopping on Internet saves time”

fav<-ggplot(my_input,aes(x=as.factor(save_time))) + geom_bar(fill=”steelblue”,width = 0.40)+ theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(color =”red”,face = “bold”,size = 10)) +
labs(title=”Shopping on Internet saves time”)

ggplotly(fav)

#Bar chart of “It is a great advantage to be able to shop at any time of the day”

fav<-ggplot(my_input,aes(x=as.factor(any_time))) + geom_bar(fill=”steelblue”,width = 0.40)+ theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(color =”red”,face = “bold”,size = 10)) +
labs(title=”It is a great advantage to be able to shop at any time of the day”)

ggplotly(fav)

#Bar chart of “I prefer traditional shopping to online shopping”

fav<-ggplot(my_input,aes(x=as.factor(traditional_online))) + geom_bar(fill=”steelblue”,width = 0.40)+ theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(color =”red”,face = “bold”,size = 10)) +
labs(title=”I prefer traditional shopping to online shopping”)

ggplotly(fav)

#Bar chart of “While shopping online I hesitate to give my credit card number”

fav<-ggplot(my_input,aes(x=as.factor(credit_card))) + geom_bar(fill=”steelblue”,width = 0.40)+ theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(color =”red”,face = “bold”,size = 10)) +
labs(title=”While shopping online I hesitate to give my credit card number”)

ggplotly(fav)

#Bar chart of “I prefer cash on delivery than payment via credit/debit card”

fav<-ggplot(my_input,aes(x=as.factor(cash_on_delivery)))+ geom_bar(fill=”steelblue”,width = 0.40)+ theme_light() +
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.title = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(color =”red”,face = “bold”,size = 10)) +
labs(title=”I prefer cash on delivery than payment via credit/debit card”)

ggplotly(fav)

  • Cluster 1 has the aged group people and obviously has highest income
  • Cluster 1 has not doing online shopping
  • Cluster 2 are lower aged people and has lower income but interested in Online shopping
  • Cluster 2 has education more or less similar to Cluster 1

On taking rating regarding online Shopping:

  • 1) Shopping on Internet saves time : Cluster 1 has higher rating
  • 2) It is a great advantage to be able to shop at any time of the day : Cluster 1 has higher rating
  • 3) I prefer traditional shopping to online shopping : Cluster 2 has higher rating
  • 4) While shopping online I hesitate to give my credit card number : Cluster 1 has higher rating
  • 5) I prefer cash on delivery than payment via credit/debit card : Cluster 1 has higher rating
Leave Comment

Your email address will not be published. Required fields are marked *

clear formSubmit