Amazing technological breakthrough possible @S-Logix pro@slogix.in

Office Address

  • #5, First Floor, 4th Street Dr. Subbarayan Nagar Kodambakkam, Chennai-600 024 Landmark : Samiyar Madam
  • pro@slogix.in
  • +91- 81240 01111

Social List

How to implement Random Forest classifier using sklearn in python?

Description

To implement the random forest classifier algorithm using Python.

Process

Random Forest:

  It's a supervised machine learning algorithm.

  Random forest builds multiple decision trees and merges them together to get a more accurate and stable prediction.

  It can be used for both classification and regression problems.

Steps:

  Read the data set.

  Check missing values.

  Check outliers for each category.

  Calculate descriptive statistics.

  Split the data into train and test.

  Train random forest classifier.

  Fit the model.

  Print the results.

Sample Code

#import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mannwhitneyu,wilcoxon
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns
# Global seaborn theme for every plot produced below.
sns.set(style="ticks", color_codes=True)

# Load the data set from its URL; column names are supplied explicitly
# through `names`.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ["sepal-length", "sepal-width", "petal-length", "petal-width", "class"]
data = pd.read_csv(url, names=names)

df = pd.DataFrame(data)

# Checking missing values: per-column count of null entries.
print("Missing values\n\n", df.isnull().sum())
print("\n")

# Descriptive statistics
print("Descriptive statistics\n\n", df.describe(), "\n")

# Feature matrix: every column except the label. `axis` is passed by keyword
# because pandas 2.0 no longer accepts it positionally.
X = df.drop("class", axis=1)

# Target vector: the class label column.
y = df["class"]

# Checking outliers: one horizontal box plot per feature, grouped by class.
print("Outliers detection\n")
for feature in ("sepal-length", "sepal-width", "petal-length", "petal-width"):
    sns.catplot(x=feature, y="class", kind="box", data=df)
    plt.show()

# Density plot: histogram with a KDE curve overlaid, one figure per feature.
# NOTE: `sns.distplot` is deprecated in recent seaborn releases; `histplot` /
# `displot` are the documented replacements if the installed version has them.
for feature in ("sepal-length", "sepal-width", "petal-length", "petal-width"):
    plt.figure(figsize=(10, 5))
    sns.distplot(df[feature], hist=True, kde=True, kde_kws={"linewidth": 3})
    plt.title("Density plot for " + feature)
    plt.show()
print("\n")

# Hypothesis testing
# Non-parametric test (Mann-Whitney U test) comparing sepal length and
# sepal width.
print("\n")
print("Hypothesis Testing\n")
print("Mann-Whiteney Test results")
data1 = df["sepal-length"]
data2 = df["sepal-width"]
stat, p = mannwhitneyu(data1, data2)
print("Statistics:", stat, "\n", "p-value:", p)
# Interpret the p-value against a 5% significance level.
alpha = 0.05
if p > alpha:
    print("Same distribution (fail to reject H0)")
else:
    print("Different distribution (reject H0 and accepting H1)")

# Wilcoxon test comparing petal length and petal width.
print("\n")
print("Wilcoxson Test Results")
data1 = df["petal-length"]
data2 = df["petal-width"]
stat1, p1 = wilcoxon(data1, data2)
print("Statistics:", stat1, "\n", "p-value:", p1)
# Interpret the p-value against a 5% significance level.
alpha = 0.05
if p1 > alpha:
    print("Same distribution (fail to reject H0)")
else:
    print("Different distribution (reject H0 and accepting H1)")
print("\n")

# Split the data into train (70%) and test (30%) sets; the fixed
# random_state makes the split repeatable.
print("Randomforest classifier results\n")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# n_jobs=-1 lets scikit-learn use all available CPU cores.
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Predict class labels for the held-out test rows.
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))
print("Confusion Matrix\n\n", metrics.confusion_matrix(y_test, y_pred))

a = metrics.accuracy_score(y_test, y_pred)
print("\n")
print("Accuracy score :", round(a, 1))

Screenshots
implement Random Forest classifier using sklearn in python
supervised machine learning algorithm
import metrics
Read the data set
Calculate descriptive statistics
Train random forest classifier
Print the results
Descriptive statistics
Outliers detection
Load dataset
Density plot