Research breakthrough possible @S-Logix pro@slogix.in

Office Address

Social List

How to cluster customers into groups based on shopping behavior using Clustering Algorithm in R?

Description

To cluster customers into groups based on the shopping behavior using Clustering Algorithm in R.

Process

Step 1: Loading the required packages and import the data

Step 2: Data Preparation: scaling (normalizing) the input data and filling in the missing values.

Step 3: Visualizing the data using ggplot2 and plotly R packages.

Step 4: Finding the best clustering algorithm for our data set using two validation techniques: internal and stability.

Step 5: Computing Hierarchical Clustering and plotting dendrogram

Step 6: Compare the average values in each of the variables for the 2 clusters (the centroids of the clusters).

Sample Code

# Online Shopping Survey

# Reading data from Excel

# install.packages("xlsx")
library("xlsx")

# read.xlsx() has no default sheet; one of sheetIndex / sheetName is required.
# NOTE(review): assumes the survey is on the first sheet - confirm.
my_input <- read.xlsx("OnlineShopping.xlsx", sheetIndex = 1)

# View() opens a data viewer window, so only call it in interactive sessions.
if (interactive()) {
  View(my_input)
}

# Descriptive Statistics

str(my_input)
summary(my_input)

# Data Preparation

# Checking Missing values: total number of NA cells in the whole data frame.
# NOTE(review): this script only counts missing values; it does not fill or
# scale them before clustering - confirm that is intended.

sum(is.na(my_input))

# ggplot2 age,name,online Shoppers

library("ggplot2")

# install.packages("ggpubr")
library("ggpubr")
library("plotly")

# Dot chart of age per respondent name, coloured and grouped by the
# online_shopping column, with light-gray segments drawn to each dot.
dot <- ggdotchart(
  my_input,
  x = "name",
  y = "age",
  color = "online_shopping",
  group = "online_shopping",
  ggtheme = theme_pubclean(),
  add = "segments",
  sorting = "desc",
  add.params = list(color = "lightgray", size = 3),
  dot.size = 4
) +
  labs(
    title = "Plotting of Age, Name versus Online Shoppers using ggplotly",
    x = "Name",
    y = "Age"
  )

# Convert the static ggplot into an interactive plotly widget.
ggplotly(dot)

# Plotting of Commodity versus age

# install.packages("ggridges")

library("ggridges")

# Ridge-line density of age, one ridge per commodity code.
ggplot(my_input, aes(x = age, y = as.factor(commodity))) +
  geom_density_ridges(aes(fill = as.factor(commodity))) +
  scale_fill_manual(
    # NOTE(review): the bare 391 is coerced to the string "391" by c();
    # confirm which colour was intended for the fourth commodity.
    values = c("#00AFBB", "#E7B800", "#FC4E07", 391),
    labels = c(
      "Metal(Gold,Silver,etc)", "Cosmetics",
      "Electronic Items", "Home Appliances"
    ),
    name = "Commodity"
  ) +
  labs(title = "Plotting Age versus Commodity", x = "Age", y = "Commodity") +
  # The complete theme must come first: adding it after theme() would discard
  # the legend background tweak below.
  theme_pubclean() +
  theme(legend.background = element_rect(fill = "gray90"))

# Online Shoppers and their problems

# Share of each reported problem, computed within each online_shopping group
# (group = online_shopping makes ..prop.. a within-group proportion), with one
# facet per group and the percentage printed above each bar.
ggplot(my_input, aes(x = what_problem, group = online_shopping)) +
  geom_bar(aes(y = ..prop.., fill = factor(..x..))) +
  geom_text(
    aes(label = scales::percent(..prop..), y = ..prop..),
    stat = "count",
    vjust = -.5
  ) +
  facet_grid(~online_shopping) +
  scale_y_continuous(labels = scales::percent) +
  theme_pubclean() +
  scale_fill_manual(
    # NOTE(review): the bare numbers are coerced to strings by c(); confirm
    # the intended colours for the last three problem categories.
    values = c("#00AFBB", "#E7B800", "#FC4E07", 391, 78, 87),
    labels = c(
      "No Problem", "Delay in Delivery", "Cheap Quality of Product",
      "Product Damage", "Non-Delivery", "Payment Problem"
    )
  ) +
  # Single labs() call; labs() has no `face` argument (bold text is set
  # through theme(element_text(face = ...)) instead).
  labs(
    title = "Online Shoppers and their problems",
    x = "Problem in Online Shopping",
    y = "Percentage",
    fill = "what_problem"
  ) +
  theme(
    legend.title = element_text(face = "bold"),
    legend.background = element_rect(fill = "darkgrey")
  )

# Age Versus Salary

# Bars of income by age; geom_col() is the identity-stat form of geom_bar().
# NOTE(review): the variable name `as` masks methods::as as a value; kept
# unchanged here in case other code refers to it.
as <- ggplot(my_input, aes(x = age, y = income)) +
  geom_col(color = "red", fill = "yellow") +
  labs(title = "Age Versus Salary", x = "Age", y = "Income") +
  # Complete theme first, so the legend setting below is not overwritten.
  theme_pubclean() +
  theme(legend.position = "none")

ggplotly(as)

# Favourite Online Shopping Websites

# Horizontal bar chart counting respondents per favourite website, with the
# grid, border, axis titles, ticks and count-axis labels all hidden.
fav <- ggplot(my_input, aes(x = favourite)) +
  geom_bar(fill = "steelblue", width = 0.70) +
  coord_flip() +
  theme_light() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "red", face = "bold", size = 10)
  ) +
  labs(title = "Favourite Online Websites")

ggplotly(fav)

# Finding the best Clustering algorithm for our data

# install.packages("clValid")
library(clValid)

# Columns 3 to 15 of the survey are the variables used for clustering.
cluster_vars <- my_input[, 3:15]
clmethods <- c("hierarchical", "kmeans", "pam")

# Internal Validation Measures

# Compute clValid for 2 to 6 clusters with each candidate algorithm.
intern <- clValid(
  cluster_vars,
  nClust = 2:6,
  clMethods = clmethods,
  validation = "internal"
)

str(cluster_vars)

# Summary
summary(intern)

# Plotting the summary
plot(intern)

# Stability Validation Measures

stab <- clValid(
  cluster_vars,
  nClust = 2:6,
  clMethods = clmethods,
  validation = "stability"
)

# Summary
summary(stab)

# Plotting the summary
plot(stab)

# Hierarchical Clustering

# Finding the linkage method that gives the strongest clustering structure

# install.packages("purrr")
library("purrr")

# Candidate linkage methods, named by themselves so that map_dbl() returns a
# labelled vector of coefficients.
m <- c("complete", "single", "ward", "average")
names(m) <- m

# Agglomerative coefficient (the `ac` component of the agnes fit) for linkage
# method `x`, computed on columns 3 to 15 of the global `my_input`.
# agnes() is called through its namespace so this does not rely on another
# package having attached `cluster`.
agg_coef <- function(x) {
  cluster::agnes(my_input[, 3:15], method = x)$ac
}

map_dbl(m, agg_coef)

# Compute hclust on the Euclidean distances between the clustering variables
# (columns 3 to 15).
# NOTE(review): hclust's "ward.D" is not the same criterion as agnes's "ward"
# (that one corresponds to "ward.D2") - confirm which is intended.
h_dist <- dist(my_input[, 3:15], method = "euclidean")
h_data <- hclust(h_dist, method = "ward.D")

# Plotting Dendrogram, with boxes drawn around the 2-cluster solution
plot(h_data)
rect.hclust(h_data, k = 2)

# Finding the optimal No of clusters
library("factoextra")

# Use the same columns that were clustered above; passing the whole data
# frame would also feed the non-clustering columns (1 and 2) to hcut().
fviz_nbclust(my_input[, 3:15], hcut, method = "silhouette")

# Assign every respondent to one of two clusters and show the cluster sizes.
clusgrp <- cutree(h_data, k = 2)
table(clusgrp)

# Per-cluster means of columns 3, 4 and 5 (reported below as age, income and
# education), grouped by the cluster assignment in `clusgrp`.
age <- tapply(my_input[, 3], clusgrp, mean)

cat("Age of two groups\n")
print(age)

inc <- tapply(my_input[, 4], clusgrp, mean)

cat("Income of two groups\n")
print(inc)

ed <- tapply(my_input[, 5], clusgrp, mean)

cat("Education of two groups\n")
print(ed)

# Most frequent value of `f`; when several values tie, the one that appears
# first in `f` wins (which.max() returns the first maximum).
mode <- function(f) {
  distinct_values <- unique(f)
  # Map every element to the index of its distinct value, then count indices.
  positions <- match(f, distinct_values)
  frequency <- tabulate(positions)
  distinct_values[which.max(frequency)]
}

# Most frequent answer per cluster for the categorical survey items, using
# the script's own mode() helper. Every heading ends with a newline so the
# printed table starts on its own line.
onshop <- tapply(my_input[, 6], clusgrp, mode)

cat("Online Shopping of two groups\n")
print(onshop)

# NOTE(review): the variable name `save` masks base::save as a value; kept
# unchanged here in case other code refers to it.
save <- tapply(my_input$save_time, clusgrp, mode)

cat("Shopping on Internet saves time\n")
print(save)

any_time <- tapply(my_input$any_time, clusgrp, mode)

cat("It is a great advantage to be able to shop at any time of the day\n")
print(any_time)

traditional <- tapply(my_input$traditional_online, clusgrp, mode)

cat("I prefer traditional shopping to online shopping\n")
print(traditional)

credit <- tapply(my_input$credit_card, clusgrp, mode)

cat("While shopping online I hesitate to give my credit card number\n")
print(credit)

cash <- tapply(my_input$cash_on_delivery, clusgrp, mode)

cat("I prefer cash on delivery than payment via credit/debit card\n")
print(cash)

# Layers shared by the five response bar charts below: a narrow steel-blue
# count bar per answer, on a light theme with the grid, border, axis titles,
# ticks and count-axis labels hidden. Adding a list to a ggplot adds each
# element in order.
response_bar_layers <- list(
  geom_bar(fill = "steelblue", width = 0.40),
  theme_light(),
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(color = "red", face = "bold", size = 10)
  )
)

# Bar chart of "Shopping on Internet saves time"

fav <- ggplot(my_input, aes(x = as.factor(save_time))) +
  response_bar_layers +
  labs(title = "Shopping on Internet saves time")

ggplotly(fav)

# Bar chart of "It is a great advantage to be able to shop at any time of the day"

fav <- ggplot(my_input, aes(x = as.factor(any_time))) +
  response_bar_layers +
  labs(
    title = "It is a great advantage to be able to shop at any time of the day"
  )

ggplotly(fav)

# Bar chart of "I prefer traditional shopping to online shopping"

fav <- ggplot(my_input, aes(x = as.factor(traditional_online))) +
  response_bar_layers +
  labs(title = "I prefer traditional shopping to online shopping")

ggplotly(fav)

# Bar chart of "While shopping online I hesitate to give my credit card number"

fav <- ggplot(my_input, aes(x = as.factor(credit_card))) +
  response_bar_layers +
  labs(title = "While shopping online I hesitate to give my credit card number")

ggplotly(fav)

# Bar chart of "I prefer cash on delivery than payment via credit/debit card"

fav <- ggplot(my_input, aes(x = as.factor(cash_on_delivery))) +
  response_bar_layers +
  labs(title = "I prefer cash on delivery than payment via credit/debit card")

ggplotly(fav)

Screenshots
How to cluster customers into groups based on shopping behavior using Clustering Algorithm in R
Loading the required packages  and import the data
Data Preparation
Finding the best Clustering Algorithm  for our data set using two validation  techniques internal and    stability
cluster customers into groups based on shopping behavior using Clustering Algorithm in R
Computing Hierarchical Clustering  and plotting dendrogram
Compare the average values in each  of the variables for the 2 clusters
Plotting of Age, Name versus Online Shoppers using ggplotly
Online Shopping Survey
#Reading data from Excel
install.packages
library
Percentage
shopping
scale_fill_manual
Delay in Delivery
legend.background
Age Versus Salary
Favourite Online Shopping Websites
Favourite Online Shopping Websites

O