Implement source code for Clustering Algorithm in R

Process

Step 1: Loading the required packages and import the data

Step 2: Data preparation: scaling (normalizing) the input data and filling in the missing values.

Step 3: Visualizing the data using ggplot2 and plotly R packages.

Step 4: Finding the best clustering algorithm for our data set using two validation techniques: internal and stability.

Step 5: Computing Hierarchical Clustering and plotting dendrogram

Step 6: Compare the average values in each of the variables for the 2 clusters (the centroids of the clusters).

Sample Code

# Online Shopping Survey ----

# Read the survey data from Excel.
# install.packages("xlsx")
library("xlsx")

# read.xlsx() stops with an error unless a sheet name or index is supplied.
# NOTE(review): assumes the survey lives on the first worksheet -- confirm.
my_input <- read.xlsx("OnlineShopping.xlsx", sheetIndex = 1)

# View() opens a data viewer and needs an interactive session; skip it when
# the script is run in batch mode.
if (interactive()) {
  View(my_input)
}

# Descriptive statistics: column types, then per-column summaries.
str(my_input)
summary(my_input)

# Data preparation: total number of missing cells in the data frame.
sum(is.na(my_input))

# Dot chart of age per respondent name, coloured and grouped by the
# online_shopping column, rendered interactively through plotly.
library("ggplot2")

# install.packages("ggpubr")
library("ggpubr")
library("plotly")

dot <- ggdotchart(
  my_input,
  x = "name",
  y = "age",
  color = "online_shopping",
  group = "online_shopping",
  ggtheme = theme_pubclean(),
  add = "segments",    # draw a segment from the axis to each dot
  sorting = "desc",
  add.params = list(color = "lightgray", size = 3),
  dot.size = 4
) +
  labs(
    title = "Plotting of Age, Name versus Online Shoppers using ggplotly",
    x = "Name",
    y = "Age"
  )
ggplotly(dot)

# Ridge-line density of age for each commodity category.

# install.packages("ggridges")
library("ggridges")

ggplot(my_input, aes(x = age, y = as.factor(commodity))) +
  geom_density_ridges(aes(fill = as.factor(commodity))) +
  scale_fill_manual(
    # c() coerces 391 to the string "391".
    # NOTE(review): presumably meant as a colour index -- confirm the
    # intended colour and replace it with an explicit name or hex code.
    values = c("#00AFBB", "#E7B800", "#FC4E07", 391),
    labels = c(
      "Metal(Gold,Silver,etc)", "Cosmetics",
      "Electronic Items", "Home Appliances"
    ),
    name = "Commodity"
  ) +
  labs(title = "Plotting Age versus Commodity", x = "Age", y = "Commodity") +
  # A complete theme replaces every earlier theme() setting, so it has to be
  # added first and the legend tweak applied on top of it.
  theme_pubclean() +
  theme(legend.background = element_rect(fill = "gray90"))

# Online shoppers and their problems: share of each reported problem,
# faceted by the online_shopping column.

ggplot(my_input, aes(x = what_problem, group = online_shopping)) +
  # ..prop.. is the within-group proportion computed by the count stat.
  geom_bar(aes(y = ..prop.., fill = factor(..x..))) +
  geom_text(
    aes(label = scales::percent(..prop..), y = ..prop..),
    stat = "count",
    vjust = -.5
  ) +
  facet_grid(~online_shopping) +
  scale_y_continuous(labels = scales::percent) +
  theme_pubclean() +
  scale_fill_manual(
    # c() coerces the bare numbers to strings.
    # NOTE(review): presumably colour indices -- confirm the intended colours.
    values = c("#00AFBB", "#E7B800", "#FC4E07", 391, 78, 87),
    labels = c(
      "No Problem", "Delay in Delivery", "Cheap Quality of Product",
      "Product Damage", "Non-Delivery", "Payment Problem"
    )
  ) +
  # Single labs() call: the original set `y` twice and passed `face`, which
  # labs() does not use for text styling.
  labs(
    title = "Online Shoppers and their problems",
    x = "Problem in Online Shopping",
    y = "Percentage",
    fill = "what_problem"
  ) +
  theme(
    legend.title = element_text(face = "bold"),
    legend.background = element_rect(fill = "darkgrey")
  )

# Age versus salary: bar heights are the income values (stat = "identity").

as <- ggplot(my_input, aes(x = age, y = income)) +
  geom_bar(color = "red", fill = "yellow", stat = "identity") +
  scale_fill_grey() +
  labs(title = "Age Versus Salary", x = "Age", y = "Income") +
  # The complete theme goes first; adding it last would discard the
  # legend.position setting below.
  theme_pubclean() +
  theme(legend.position = "none")

ggplotly(as)

# Favourite online shopping websites: horizontal bar chart of counts with
# grid, border, axis titles, ticks and the count axis text removed.

fav <- ggplot(my_input, aes(x = favourite)) +
  geom_bar(fill = "steelblue", width = 0.70) +
  coord_flip() +
  theme_light() +
  theme(
    panel.grid = element_blank(),
    panel.border = element_blank(),
    axis.title = element_blank(),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_text(color = "red", face = "bold", size = 10)
  ) +
  labs(title = "Favourite Online Websites")

ggplotly(fav)

# Finding the best clustering algorithm for our data ----

# install.packages("clValid")
library(clValid)

# Candidate algorithms, compared for 2 to 6 clusters.
clmethods <- c("hierarchical", "kmeans", "pam")

# Columns 3 to 15 are the variables used for clustering throughout the script.
cluster_vars <- my_input[, c(3:15)]

# Internal validation measures
intern <- clValid(
  cluster_vars,
  nClust = 2:6,
  clMethods = clmethods,
  validation = "internal"
)

str(cluster_vars)

# Summary
summary(intern)

# Plotting the summary
plot(intern)

# Stability validation measures
stab <- clValid(
  cluster_vars,
  nClust = 2:6,
  clMethods = clmethods,
  validation = "stability"
)

# Summary
summary(stab)

# Plotting the summary
plot(stab)

# Hierarchical clustering ----

# Find the linkage method that gives the strongest clustering structure by
# comparing agglomerative coefficients.

# install.packages("purrr")
library("purrr")
# agnes() is provided by the cluster package; attach it explicitly rather
# than relying on another package to attach it.
library("cluster")

m <- c("complete", "single", "ward", "average")
names(m) <- m   # named input so map_dbl() returns a named vector

# Agglomerative coefficient (the `ac` component of the agnes fit) of
# columns 3 to 15 of `my_input` for one linkage method.
#   x: linkage method name passed to agnes(method = ).
agg_coef <- function(x) {
  agnes(my_input[, 3:15], method = x)$ac
}

map_dbl(m, agg_coef)

# Compute hclust on Euclidean distances between columns 3 to 15.
# NOTE(review): distances use the raw columns; the process description
# mentions scaling the input -- confirm whether scale() was intended here.
h_dist <- dist(my_input[, 3:15], method = "euclidean")
h_data <- hclust(h_dist, method = "ward.D")

# Plotting dendrogram, with boxes around the two-cluster solution
plot(h_data)
rect.hclust(h_data, k = 2)

# Finding the optimal number of clusters
library("factoextra")

# Use the same columns and linkage as the tree above; the original passed
# the whole data frame, including the columns outside 3:15.
fviz_nbclust(
  my_input[, 3:15],
  hcut,
  method = "silhouette",
  hc_method = "ward.D"
)

# Cluster membership for the two-cluster cut, and the cluster sizes
clusgrp <- cutree(h_data, k = 2)
table(clusgrp)

# Per-cluster means of columns 3, 4 and 5 of my_input.
# na.rm = TRUE keeps one missing answer from turning a cluster mean into NA;
# the script counts missing values earlier but never removes them.
age <- tapply(my_input[, 3], clusgrp, mean, na.rm = TRUE)

cat("Age of two groups\n")
print(age)

inc <- tapply(my_input[, 4], clusgrp, mean, na.rm = TRUE)

cat("Income of two groups\n")
print(inc)

ed <- tapply(my_input[, 5], clusgrp, mean, na.rm = TRUE)

cat("Education of two groups\n")
print(ed)

# Most frequent value of `f`. When several values tie for the highest count,
# the one that appears first in `f` is returned.
mode <- function(f) {
  distinct_values <- unique(f)
  # Position of each element of `f` within the distinct values, then how
  # often each position occurs.
  occurrences <- tabulate(match(f, distinct_values))
  distinct_values[which.max(occurrences)]
}

# Most frequent answer per cluster for the categorical survey items.
# Every heading ends with a newline so the printed table starts on its own
# line.
onshop <- tapply(my_input[, 6], clusgrp, mode)

cat("Online Shopping of two groups\n")
print(onshop)

save <- tapply(my_input$save_time, clusgrp, mode)

cat("Shopping on Internet saves time\n")
print(save)

any_time <- tapply(my_input$any_time, clusgrp, mode)

cat("It is a great advantage to be able to shop at any time of the day\n")
print(any_time)

traditional <- tapply(my_input$traditional_online, clusgrp, mode)

cat("I prefer traditional shopping to online shopping\n")
print(traditional)

credit <- tapply(my_input$credit_card, clusgrp, mode)

cat("While shopping online I hesitate to give my credit card number\n")
print(credit)

cash <- tapply(my_input$cash_on_delivery, clusgrp, mode)

# Heading now matches the survey statement used for the bar chart title.
cat("I prefer cash on delivery than payment via credit/debit card\n")
print(cash)

# Bar charts of the answers to the five attitude statements.

# Build one answer-count bar chart.
#   mapping: aes() mapping for the x axis, e.g. aes(x = as.factor(save_time)).
#   title:   plot title (the survey statement).
#   data:    data frame to plot; defaults to the survey data `my_input`.
# Returns the ggplot object so the caller can hand it to ggplotly().
agreement_bar <- function(mapping, title, data = my_input) {
  ggplot(data, mapping) +
    geom_bar(fill = "steelblue", width = 0.40) +
    theme_light() +
    theme(
      panel.grid = element_blank(),
      panel.border = element_blank(),
      axis.title = element_blank(),
      axis.ticks = element_blank(),
      axis.text.y = element_blank(),
      axis.text.x = element_text(color = "red", face = "bold", size = 10)
    ) +
    labs(title = title)
}

# Bar chart of "Shopping on Internet saves time"
fav <- agreement_bar(
  aes(x = as.factor(save_time)),
  "Shopping on Internet saves time"
)

ggplotly(fav)

# Bar chart of "It is a great advantage to be able to shop at any time of
# the day"
fav <- agreement_bar(
  aes(x = as.factor(any_time)),
  "It is a great advantage to be able to shop at any time of the day"
)

ggplotly(fav)

# Bar chart of "I prefer traditional shopping to online shopping"
fav <- agreement_bar(
  aes(x = as.factor(traditional_online)),
  "I prefer traditional shopping to online shopping"
)

ggplotly(fav)

# Bar chart of "While shopping online I hesitate to give my credit card
# number"
fav <- agreement_bar(
  aes(x = as.factor(credit_card)),
  "While shopping online I hesitate to give my credit card number"
)

ggplotly(fav)

# Bar chart of "I prefer cash on delivery than payment via credit/debit card"
fav <- agreement_bar(
  aes(x = as.factor(cash_on_delivery)),
  "I prefer cash on delivery than payment via credit/debit card"
)

ggplotly(fav)

Office Address

Social List

How to cluster customers into groups based on shopping behavior using Clustering Algorithm in R?

Description

Process

Sample Code

Screenshots

S-Logix (OPC) Private Limited