• #5, First Floor, 4th Street Dr. Subbarayan Nagar Kodambakkam, Chennai-600 024 Landmark : Samiyar Madam
• pro@slogix.in
• +91- 81240 01111

### How to predict income for a given data set using logistic regression in R?

###### Description

To predict the income for the given data set using logistic regression in R.

###### Process

Step 2: Data Preparation : Filling Missing values and Outliers

Step 3: Replacing factor with a numeric value using plyr package

Step 4: Taking sample data from the whole data set

Step 5: Finding Correlation between variables

Step 6: Splitting the data into train and test data set

Step 7: Building the Regression Model

Step 8: Prediction

Step9: Confusion Matrix

###### Sapmle Code

#install.packages(“xlsx”)
library(“xlsx”)
library(“openxlsx”) #For big Excel File
input<-data.frame
View(input)

#Data Preparation
#Filling Missing Value with Mode
View(is.na(input))
sum(is.na(input))
mode<-function(f){
uniq<-unique(f)
uniq[which.max(tabulate(match(f,uniq)))]
}
input\$JobType[is.na(input\$JobType)]<-mode(input\$JobType)
input\$occupation[is.na(input\$occupation)]<-mode(input\$occupation)
sum(is.na(input))

#Resolving Outlier
#Box Polot with Outliers
#install.packages(“plotly”)
library(“plotly”)
g<-ggplot(input,aes(x=input\$SalStat,y=input\$age,fill=input\$SalStat)) + geom_boxplot() + ggtitle(“Box Plot Of Salary Status versus Age”) + xlab(“Salary Status”) + ylab(“Age”)
ggplotly(g)
boxplot(input\$age,col = “red”,main=”Box Plot with outliers”)

#Replacing outlier with values
remove_outliers<-function(x,na.rm=TRUE) {
qnt<-quantile(x, probs=c(.25, .75))
caps<-quantile(x,probs = c(0.05,0.95),na.rm = na.rm)
H x[x < (qnt[1] – H)]<-caps[1] x[x > (qnt[2] + H)]<-caps[2]
x
}
input\$age<-remove_outliers(input\$age)
boxplot(input\$age,col = “red”,main=”Box Plot without outliers”)

#Replacing Factor with a numeric value
#install.packages(“plyr”)
library(“plyr”)
job_fact<-factor(input\$JobType)
ed_fact<-factor(input\$EdType)
mar_fact<-factor(input\$maritalstatus)
occ_fact<-factor(input\$occupation)
rel_fact<-factor(input\$relationship)
race_fact<-factor(input\$race)
gen_fact<-factor(input\$gender)
native_fact<-factor(input\$nativecountry)
sal_fact<-factor(input\$SalStat)

nlevels(job_fact)
nlevels(ed_fact)
nlevels(mar_fact)
nlevels(occ_fact)
nlevels(rel_fact)
nlevels(race_fact)
nlevels(gen_fact)
nlevels(native_fact)
nlevels(sal_fact)

print(levels(job_fact))
print(levels(ed_fact))
print(levels(mar_fact))
print(levels(occ_fact))
print(levels(rel_fact))
print(levels(race_fact))
print(levels(gen_fact))
print(levels(native_fact))
print(levels(sal_fact))

input\$JobType input\$EdType<-mapvalues(ed_fact,from = c(” 10th”,” 11th”, ” 12th”, ” 1st-4th” , ” 5th-6th”, ” 7th-8th”,” 9th” ,” Assoc-acdm”,” Assoc-voc”,” Bachelors”, ” Doctorate”,” HS-grad”,” Masters”,” Preschool”, ” Prof-school”, ” Some-college”),to=c(1:16))
input\$maritalstatus input\$occupation input\$relationship input\$race input\$gender input\$nativecountry input\$SalStat View(input)

#Taking sample data from whole dataset
set.seed(300)
input1<-input[sample(nrow(input),300),]
View(input1)
write.xlsx(input1, “IncomeSampleData.xlsx”)
View(input1)

#Correlation
#install.packages(“polycor”)
library(“polycor”)
hetcor(input1)

#Check class bias
table(input1\$SalStat)

#Splitting into Train and test data
set.seed(301)
training_1<-input1[which(input1\$SalStat==0),]
training_2<-input1[which(input1\$SalStat==1),]
train_1<-sample(1:nrow(training_1),0.8*nrow(training_1))
train_2<-sample(1:nrow(training_2),0.8*nrow(training_2))
train_one<-training_1[train_1,]
train_two<-training_2[train_2,]

#Train data
train<-rbind(train_one,train_two)

#Test Data
test_one<-training_1[-train_1,]
test_two<-training_2[-train_2,]
test<-rbind(test_one,test_two)

#Logit model

model<-glm(SalStat~relationship+capitalgain+hoursperweek,data = train,family = “binomial”)
print(model)
summary(model)

#Prediction
pp<-round(predict(model,test,type=’response’),digits = 0)

#Confusion Matrix
#install.packages(“caret”)
library(“caret”)
#install.packages(“e1071”)
library(“e1071”)
levels(as.factor(pp))
confusionMatrix(test\$SalStat,as.factor(pp))