How to create a logistic regression model for a data set of male and female heights and weights?

Description

To create a logistic regression model for the given data set and analyse the summary and goodness of the model.

  Load the data set

  Find and resolve the Missing values

  Find and resolve the Outliers

  Split the data set for training and testing with ratio 80:20 so that training and testing data has 80% and 20% of the original data set respectively

  Build the model

  Fit the model using the training data

  Test the model using the test data

  Take the summary and analyse it

Building a logistic regression model: Required libraries:

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression

from sklearn import metrics

Functions used :

To split the data - x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

To build the model - reg = LogisticRegression()

To train the model - reg.fit(x_train_data,y_train_data)

To test the model - reg.predict(x_test_data)

To take report - metrics.classification_report(y_test,y_pred)

To find confusion matrix - metrics.confusion_matrix(y_test, y_pred)

To find accuracy - metrics.accuracy_score(y_test, y_pred)

#import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import warnings
# Suppress every warning raised from here on (this includes pandas and
# scikit-learn deprecation / conversion warnings).
warnings.filterwarnings("ignore")
#Read the file
# NOTE(review): absolute, machine-specific path -- adjust it to wherever
# weight-height.csv lives before running.
data = pd.read_csv('/home/soft23/soft23/Akshaya/weight-height.csv')
def res_mv(df):
    """Fill the missing values of ``df`` in place.

    Numeric columns have their missing entries replaced by the column
    median.  Every other column is back-filled: each missing entry takes
    the next valid value found below it.  A missing entry at the very end
    of a non-numeric column has nothing below it to copy and stays missing.

    Args:
        df: pandas DataFrame to clean.  It is modified in place.

    Returns:
        None.
    """
    for col in df.select_dtypes(include="number").columns:
        if df[col].isnull().any():
            # Assign the result back instead of calling
            # fillna(inplace=True) on the df[col] selection: the chained
            # in-place form is not guaranteed to write through to df.
            df[col] = df[col].fillna(df[col].median())

    for col in df.select_dtypes(exclude="number").columns:
        if df[col].isnull().any():
            # Series.bfill() is the supported spelling of the deprecated
            # fillna(method="bfill").
            df[col] = df[col].bfill()
def outlier_detect(df):
    """Replace IQR outliers in the numeric columns of ``df`` with the median.

    For each numeric column the lower and upper fences are
    ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (``IQR = Q3 - Q1``).  Values
    strictly outside the fences are replaced by the column median, which
    is computed before any replacement.  Missing values compare as neither
    below nor above a fence, so they are left untouched.  Non-numeric
    columns are ignored.

    Args:
        df: pandas DataFrame to clean.  It is modified in place.

    Returns:
        The same DataFrame object, for convenient re-assignment.
    """
    for col in df.select_dtypes(include="number").columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_fence = q1 - 1.5 * iqr
        upper_fence = q3 + 1.5 * iqr
        is_outlier = (values < lower_fence) | (values > upper_fence)
        # One vectorised replacement instead of a per-element Python loop.
        df[col] = values.mask(is_outlier, values.median())
    print("Outliers resolved")
    return df
#To Resolve the Missing values
res_mv(data)

#To identify the outliers using boxplot
# One notched boxplot per measurement, drawn before outlier handling.
for column in ('Height', 'Weight'):
    plt.boxplot(data[column], notch=True)
    plt.title(column + ' distribution with outliers')
    plt.ylabel(column)
    plt.show()

#To Resolve the outliers
data = outlier_detect(data)

#boxplot after resolving the outliers
# Same plots again so the effect of the outlier replacement is visible.
for column in ('Height', 'Weight'):
    plt.boxplot(data[column], notch=True)
    plt.title(column + ' distribution after resolving outliers')
    plt.ylabel(column)
    plt.show()
# Feature matrix: one row per record, columns are Height and Weight.
# Selecting two columns already yields an (n, 2) array, so no reshape is
# needed.
x = data[['Height', 'Weight']].values

#Converting the categorical string data type to binomial
# The explicit category order fixes the integer codes: Male -> 0,
# Female -> 1.
data['Gender'] = data['Gender'].astype('category')
data['Gender'] = data['Gender'].cat.reorder_categories(['Male', 'Female'], ordered=True)
data['Gender'] = data['Gender'].cat.codes
# Keep the target 1-D: scikit-learn classifiers expect y of shape
# (n_samples,), not a column vector.
y = data['Gender'].values

#Splitting train and test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.20, random_state=42)

# Fit on the 80% training split, then predict the held-out 20%.
reg = LogisticRegression()
reg.fit(x_train, y_train)
y_pred = reg.predict(x_test)
print("The coefficients are \n ", reg.coef_)

#classification report
print(metrics.classification_report(y_test, y_pred))
Conf_Mat = metrics.confusion_matrix(y_test, y_pred)
print("The confusion matrix is\n", Conf_Mat)
print("Accuracy is ", metrics.accuracy_score(y_test, y_pred))

Leave Comment

Your email address will not be published. Required fields are marked *

clear formSubmit