• #5, First Floor, 4th Street Dr. Subbarayan Nagar Kodambakkam, Chennai-600 024 Landmark : Samiyar Madam
• pro@slogix.in
• +91- 81240 01111

### How to analyse SLID Datset using MLR model and Clustering techniques in R?

###### Description

To analyse the SLID Datset using MLR model and Clustering techniques using R.

#### Steps:

• Import data
• Labelling data
• Data Preparation – Resolving Missing values
• Descriptive Statistics
• Univariate Plotting – Pie Chart
• Bivariate plotting – Correlogram
• Splitting into train and test data
• Applying ML algorithms – Multiple Linear Regression Model
• Checking Normality of residuals
• Validation using RMSE R Squared and Adjusted R Squared
• Plotting Actual and Predicted Values
• Applying kmeans clustering Algorithm
###### Sapmle Code

#Import Data

my_input1<-my_input
View(my_input)
str(my_input)

#Labelling Data

library(“plyr”)

nlevels(my_input\$gender)
levels(my_input\$gender)
levels(my_input\$gender)

nlevels(my_input\$language)
levels(my_input\$language)

my_input\$language<-mapvalues(my_input\$language,
from=c(“English”,”French”,”Other”),to = c(0:2))
View(my_input)

#Data Preparation

#Number of Missing Values

sum(is.na(my_input))

#Checking for Missing Values

sapply(my_input, function(x)sum(is.na(x)))

#Recoding wages and education with mean

my_input\$wages[is.na(my_input\$wages)]<-mean(my_input\$wages,na.rm = T)
my_input\$education[is.na(my_input\$education)]<-mean(my_input\$education,na.rm = T)

#Recoding language with mode

mode<-function(f){
uni<-unique(f)
uni[which.max(tabulate(match(f,uni)))]
}

my_input\$language[is.na(my_input\$language)]<-mode(my_input\$language)

sapply(my_input, function(x)sum(is.na(x)))

#Check Outliers

library(“plotly”)

boxplot(my_input\$wages,main=”Box Plot of Wages”,ylab=”Wages”,xlab=”All Labours”,col = “red”)

boxplot(my_input\$education,main=”Box Plot of Education”,ylab=”Education”,xlab=”All Labours”,col = “red”)

boxplot(my_input\$age,main=”Box Plot of age”,ylab=”Age”,xlab=”All Labours”,col = “red”)

#Univariate Plotting

#Pie Chart for Language

tab<-table(my_input1\$language)
colors<-terrain.colors(3)

p_tab<-round(prop.table(tab)*100, digits = 2)
prop_tab<-as.data.frame(p_tab)
p_tab

lab<-sprintf(“%s – %3.1f%s”,prop_tab[,1],p_tab,”%”)

pie(p_tab,col =colors,labels = lab, clockwise = T, border = “gainsboro”,radius = 1,cex=1.5,main = “Pie Chart of Frequency of Languages”)
legend(“topright”,legend = prop_tab[,1],fill=colors,cex=1)

#Pie Chart for Gender

tab1<-table(my_input1\$gender)
colors1<-terrain.colors(2)

p_tab1<-round(prop.table(tab1)*100, digits = 2)
prop_tab1<-as.data.frame(p_tab1)
p_tab1

lab1<-sprintf(“%s – %3.1f%s”,prop_tab1[,1],p_tab1,”%”)

pie(p_tab1,col =colors1,labels = lab1, clockwise = T, border = “gainsboro”,radius = 1,cex=1.5,main = “Pie Chart of Frequency of Gender”)
legend(“topright”,legend = prop_tab1[,1],fill=colors1,cex=1)

#Bivariate Plot

#install.packages(“corrplot”)
library(“corrplot”)

library(“polycor”)

het<-hetcor(my_input[,-1])
het

corrplot(as.matrix(het),method = “pie”,tl.cex=0.7)

corrplot.mixed(as.matrix(het),tl.cex=0.7,lower = “number”,lower.col = “black”)

#Splitting into train and test

library(“caret”)

training<-createDataPartition(my_input\$wages,p=0.7,list=F)

train = my_input[training,]
test = my_input[-training,]

nrow(train)
nrow(test)

#ML Algorithms

#Multiple Linear Regression Model

mult_reg<-lm(wages~.-language,data=train[,-1])
mult_reg
summary(mult_reg)

#Check Normality of Residuals

#Histogram

hist(mult_reg\$residuals,col=62)

#install.packages(“nortest”)
library(“nortest”)

#Anderson – Darling Test

#Regression Diagnostics

par(mfrow = c(2, 2))
plot(mult_reg)

#Confidence Interval

confint(mult_reg)

#Prediction

round(predict(mult_reg,test),digits = 2)

#Validation of the MLR model

MSE RMSE<-sqrt(MSE)
RMSE

sigma(mult_reg)/mean(my_input\$wages)

#AIC and BIC

AIC(mult_reg)

BIC(mult_reg)

#Actual Predicted Plot

mult_reg1<-lm(wages~.-language,data=my_input[,-1])
my_input\$predicted<-predict(mult_reg1)

my_input\$residuals<-residuals(mult_reg1) #For Seperate Variables library(“tidyr”) my_input[1:50,-1] %>%
gather(key = “iv”, value = “x”, -wages, -predicted, -residuals) %>% # Get data into shape
ggplot(aes(x = x, y = wages)) + # Note use of `x` here and next line
geom_segment(aes(xend = x, yend = predicted)) +
geom_point(aes(color = residuals)) +
scale_color_gradient2(low = “blue”, mid = “white”, high = “red”) +
guides(color = FALSE) +
geom_point(aes(y = predicted), shape = 1) +
facet_grid(~ iv, scales = “free”) + # Split panels here by `iv`
theme_bw()

#Hierarchical Clustering

#Finding the more appropriate method for more strongest clustering structure

#install.packages(“purrr”)
library(“purrr”)
library(“cluster”)

m names(m)<-c(“complete”,”single”,”ward”,”average”)
agg_coef<-function(x){
agnes(my_input[1:100,],method = x)\$ac
}

map_dbl(m,agg_coef)

#Compute hclust
my_input_scale<-scale(my_input[,2:4])
View(my_input_scale)

my_input1<-cbind(my_input_scale,my_input[,c(1,5,6)])
h_dist<-dist(my_input1,method = “euclidean”)
h_data<-hclust(h_dist, method = “ward.D”)

#Finding the optimal No of clusters
library(“factoextra”)

fviz_nbclust(my_input1,hcut,method = “silhouette”)

clusgrp<-cutree(h_data,k=2)

table(clusgrp)

w<-tapply(my_input[,2], clusgrp, mean)

cat(“Wages\n”)
print(w)

ed<-tapply(my_input[,3],clusgrp,mean)

cat(“Education”)
print(ed)

a<-tapply(my_input[,4],clusgrp,mean)

cat(“Age”)
print(a)

g<-tapply(as.numeric(my_input[,5]),clusgrp,mean)

cat(“Gender”)
print(g)

lan<-tapply(as.numeric(my_input[,6]),clusgrp,mean)

cat(“Language”)
print(lan)