• #5, First Floor, 4th Street Dr. Subbarayan Nagar Kodambakkam, Chennai-600 024 Landmark : Samiyar Madam
• pro@slogix.in
• +91- 81240 01111

## How to choose the Best fit Classification Model for our data set in R?

###### Description

To choose the best fit Classification Model for the given data set using R.

#### Steps:

• Import data
• Data Preparation – Resolving Missing values
• Descriptive Statistics
• Univariate Plotting – Pie Chart, Box Plot
• Bivariate plotting – Correlogram
• Drop the highly Correlated Independent variables
• Splitting into train and test data
• Applying ML algorithms – Logistic Regression, Random Forest, Random Forest with PCA, KNN, SVM
• Compute Confusion Matrix for each Model
• Validation using Metrics – Accuracy, Sensitivity, Precision, Recall, F1 score
###### Sample Code

# Import Data ----
# NOTE(review): the original script never actually loads `my_input`; it is
# assumed to exist in the workspace. Load it explicitly so the script is
# self-contained — adjust the path/reader to the real data file.
my_input <- read.csv("data.csv", stringsAsFactors = TRUE)

View(my_input)
str(my_input)

# Data Preparation ----

# Total number of missing values across the whole data frame
sum(is.na(my_input))

# Rows containing at least one missing value
my_input[!complete.cases(my_input), ]

# Drop the id column (1) and the missing-value column (33)
my_input <- my_input[, c(-1, -33)]

# Confirm no missing values remain, column by column.
# vapply() is preferred over sapply(): its return type is fixed.
vapply(my_input, function(x) sum(is.na(x)), integer(1))

# Descriptive Statistics
summary(my_input)

# Univariate Plotting ----

# Pie Chart of diagnosis class frequencies
tab <- table(my_input$diagnosis)
colors <- terrain.colors(2)

# Percentage share of each diagnosis level, rounded to 2 decimals
p_tab <- round(prop.table(tab) * 100, digits = 2)
prop_tab <- as.data.frame(p_tab)
p_tab

# Labels of the form "B - 62.7%"
lab <- sprintf("%s - %3.1f%s", prop_tab[, 1], p_tab, "%")

# clockwise = TRUE (never the reassignable shorthand T)
pie(p_tab, col = colors, labels = lab, clockwise = TRUE,
    border = "gainsboro", radius = 1, cex = 1.5,
    main = "Pie Chart of Frequency of Cancer Diagnosis")

legend("topright", legend = prop_tab[, 1], fill = colors, cex = 1)

# Box Plot ----
colnames(my_input)

# Split the predictors into three groups of 10 (columns 2:11, 12:21, 22:31),
# each kept together with the diagnosis column (column 1).
# Presumably these are the mean / standard-error / worst feature groups of
# the breast-cancer data set — confirm against the actual column names.
my_mean  <- my_input[, c(1:11)]
my_se    <- my_input[, c(1, 12:21)]
my_worst <- my_input[, c(1, 22:31)]

library("ggplot2")
library("reshape2")

# One faceted box plot per variable group, coloured by diagnosis class
ggplot(melt(my_mean, id.vars = "diagnosis")) +
  geom_boxplot(aes(variable, value, color = factor(diagnosis))) +
  facet_wrap(~variable, scales = "free")

ggplot(melt(my_se, id.vars = "diagnosis")) +
  geom_boxplot(aes(variable, value, color = factor(diagnosis))) +
  facet_wrap(~variable, scales = "free")

ggplot(melt(my_worst, id.vars = "diagnosis")) +
  geom_boxplot(aes(variable, value, color = factor(diagnosis))) +
  facet_wrap(~variable, scales = "free")

# Bivariate Analysis ----
# install.packages("corrplot")
library("corrplot")

# Correlation matrix of the numeric predictors (column 1 is the response)
cor_val <- cor(my_input[, 2:31])
corrplot(cor_val, order = "hclust", tl.cex = 0.7,
         main = "Correlation in Independent Variables")

# Variables with High Correlation
# install.packages("caret")
library("caret")

# BUG FIX: findCorrelation() returns indices into cor_val, i.e. into
# my_input[, 2:31]. The original indexed colnames(my_input) directly,
# which is shifted by one (it includes diagnosis at position 1) and so
# named the wrong variables. Index the predictor subset instead.
high_cor <- colnames(my_input[, 2:31])[findCorrelation(cor_val, cutoff = 0.9)]
print(high_cor)

# Removing the highly correlated independent variables; the response is
# also excluded here because it is re-attached explicitly at the split step.
my_input1 <- my_input[, setdiff(colnames(my_input), c("diagnosis", high_cor))]
ncol(my_input1)

# Splitting into train and test ----
levels(my_input$diagnosis)

# Re-attach the response to the reduced predictor set
input <- cbind(diagnosis = my_input$diagnosis, my_input1)
View(input)
table(input$diagnosis)

# Fixed seed so the stratified 70/30 split is reproducible
set.seed(42)
training <- createDataPartition(input$diagnosis, p = 0.7, list = FALSE)
train <- input[training, ]
test  <- input[-training, ]

nrow(train)
nrow(test)

# ML Algorithms ----

# Make the factor levels syntactically valid R names — required because
# trainControl(classProbs = TRUE) uses the class levels as column names.
levels(train$diagnosis) <- make.names(levels(factor(train$diagnosis)))
levels(test$diagnosis)  <- make.names(levels(factor(test$diagnosis)))

# Shared resampling setup for every model: 5-fold cross-validation with
# class probabilities and the two-class ROC/Sens/Spec summary. Where PCA
# preprocessing is requested, keep components explaining 99% of variance.
fit <- trainControl(method = "cv",
                    number = 5,
                    preProcOptions = list(thresh = 0.99),
                    classProbs = TRUE,
                    summaryFunction = twoClassSummary)

# Logistic Regression ----
log_data <- glm(diagnosis ~ ., data = train, family = "binomial")
summary(log_data)

# Cross-validated logistic model (kept for the resamples() comparison later)
log_data_train <- train(diagnosis ~ ., data = train, method = "glm",
                        family = binomial(), trControl = fit)

# ROC curve
# install.packages("ROCR")
library("ROCR")

# Link-scale scores are monotone in the probability, so the ROC curve
# is the same as with type = "response"
pred <- prediction(predict(log_data, test), test$diagnosis)
per <- performance(pred, "tpr", "fpr")
plot(per, col = "red", main = "ROC Curve for Logit Model")

# Prediction: threshold the fitted probabilities at 0.5 (round to 0/1)
predict_log <- round(predict(log_data, test, type = "response"), digits = 0)
predict_log <- as.factor(predict_log)

library("caret")
levels(predict_log)
levels(test$diagnosis) <- 0:1

# BUG FIX: confusionMatrix(data, reference) expects predictions first and
# true labels second; the original had them swapped, which transposes the
# matrix and exchanges sensitivity/specificity. Now consistent with the
# other models below.
confusion_log <- confusionMatrix(predict_log, test$diagnosis)
confusion_log

# Random Forest ----
# install.packages("randomForest")
library("randomForest")

# NOTE(review): the original line was garbled ("ran_data trControl=fit )");
# reconstructed as a caret random-forest fit using the shared control —
# confirm method/metric against the intended tutorial code.
ran_data <- train(diagnosis ~ ., data = train, method = "rf",
                  metric = "ROC", trControl = fit)
ran_data

predict_ran <- predict(ran_data, test)

# Relabel predicted classes to 0/1 to match the relabelled test$diagnosis
levels(predict_ran) <- 0:1

table(predict_ran, test$diagnosis)
mean(predict_ran == test$diagnosis)

# Feature Importance Plot (top 10 predictors)
plot(varImp(ran_data), top = 10, main = "Random Forest")

# Confusion Matrix
confusion_ran <- confusionMatrix(predict_ran, test$diagnosis)
confusion_ran

# Random Forest with PCA ----
# method = "rf" is caret's default; stated explicitly for clarity.
# PCA preprocessing keeps components up to the variance threshold set
# in trainControl (thresh = 0.99).
ran_pca <- train(diagnosis ~ .,
                 data = train,
                 method = "rf",
                 metric = "ROC",
                 preProcess = c("center", "scale", "pca"),
                 trControl = fit)

# Prediction
predict_ran_pca <- predict(ran_pca, test)
levels(predict_ran_pca) <- 0:1

# Confusion Matrix
confusion_ran_pca <- confusionMatrix(predict_ran_pca, test$diagnosis)
confusion_ran_pca

# KNN ----
# Setting up train controls
library("caret")

# k-nearest neighbours on centred/scaled predictors; caret tunes k over
# its default grid, selecting by cross-validated ROC.
knn_data <- train(diagnosis ~ ., data = train, method = "knn",
                  preProcess = c("center", "scale"),
                  trControl = fit,
                  metric = "ROC")
knn_data
plot(knn_data)

# Prediction
predict_knn <- predict(knn_data, test)
levels(predict_knn) <- 0:1

# Confusion Matrix
confusion_knn <- confusionMatrix(predict_knn, test$diagnosis)
confusion_knn

# SVM ----
library("e1071")

# NOTE(review): the original train() call was garbled (only
# "trace=F,trControl=fit )" survived). Reconstructed as a caret SVM fit on
# centred/scaled predictors — confirm the intended method/kernel against
# the original tutorial.
svm_data <- train(diagnosis ~ ., data = train, method = "svmLinear",
                  metric = "ROC", preProcess = c("center", "scale"),
                  trace = FALSE, trControl = fit)
svm_data

# Prediction
predict_svm <- predict(svm_data, test)

# CONSISTENCY FIX: the original only printed levels(predict_svm); every
# sibling model relabels predictions to 0/1 before confusionMatrix, and
# confusionMatrix requires matching factor levels.
levels(predict_svm) <- 0:1

# Confusion Matrix
confusion_svm <- confusionMatrix(predict_svm, test$diagnosis)
confusion_svm

# Models Evaluation ----
model_list <- list(Logistic          = log_data_train,
                   Random_Forest     = ran_data,
                   Random_Forest_PCA = ran_pca,
                   KNN               = knn_data,
                   SVM               = svm_data)

# NOTE(review): the original line was truncated to a bare "re";
# resamples() collects the cross-validation results of all models so
# their ROC distributions can be compared.
re <- resamples(model_list)
bwplot(re, metric = "ROC")

# Per-class metrics (sensitivity, specificity, precision, recall, F1, ...)
# pulled from each confusion matrix and shown side by side.
confusion <- list(Logistic          = confusion_log,
                  Random_Forest     = confusion_ran,
                  Random_Forest_PCA = confusion_ran_pca,
                  KNN               = confusion_knn,
                  SVM               = confusion_svm)
res <- sapply(confusion, function(x) x$byClass)
round(res, digits = 2)