• 2nd Floor, #7a, High School Road, Secretariat Colony Ambattur, Chennai-600053 (Landmark: SRM School) Tamil Nadu, India
• pro@slogix.in
• +91- 81240 01111

### How to implement Random Forest classifier using sklearn in python?

###### Description

To implement the random forest classifier algorithm using Python.

#### Random Forest:

It's a supervised machine learning algorithm.

Random forest builds multiple decision trees and merges them together to get a more accurate and stable prediction.

It can be used for both classification and regression problems.

#### Steps:

Check for missing values.

Check outliers for each category.

Calculate descriptive statistics.

Split the data into training and test sets.

Train random forest classifier.

Fit the model.

Print the results.

###### Sample Code

#import libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import mannwhitneyu,wilcoxon
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import seaborn as sns
# Plot styling: seaborn "ticks" theme with colour codes enabled.
sns.set(style="ticks", color_codes=True)

# Location of the Iris data file and the column labels to attach to it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']

# Read the CSV directly from the URL. Because `names` is passed explicitly,
# pandas treats every row of the file as data rather than as a header.
df = pd.read_csv(url, names=names)

# Checking missing values: number of nulls in each column.
print("Missing values\n\n", df.isnull().sum())
print("\n")

# Descriptive statistics for the numeric columns.
print("Descriptive statistics\n\n", df.describe(), "\n")

# Feature matrix: every column except the label. The axis is selected with
# the `columns=` keyword because newer pandas releases no longer accept it
# as a second positional argument.
X = df.drop(columns='class')

# Target vector: the species label.
y = df['class']

# Checking outliers: one box plot per measurement, grouped by class.
print("Outliers detection\n")
for feature in ("sepal-length", "sepal-width", "petal-length", "petal-width"):
    sns.catplot(x=feature, y="class", kind="box", data=df)
    # Show each figure before drawing the next one, as in the original
    # one-plot-at-a-time sequence.
    plt.show()

# Density plot
def _density_plot(column):
    """Draw and show a histogram with a KDE overlay for one column of ``df``.

    NOTE(review): ``sns.distplot`` is deprecated in recent seaborn releases
    in favour of ``histplot``/``displot``; it is kept here so the script
    still runs on the seaborn version it was written for.
    """
    plt.figure(figsize=(10, 5))
    sns.distplot(df[column], hist=True, kde=True, kde_kws={'linewidth': 3})
    plt.title("Density plot for " + column)
    plt.show()


_density_plot('sepal-length')
print("\n")

for _column in ('sepal-width', 'petal-length', 'petal-width'):
    _density_plot(_column)

# Hypothesis testing
# Non-parametric test (Mann-Whitney U test) comparing sepal length with
# sepal width.
print("\n")
print("Hypothesis Testing\n")
print("Mann-Whiteney Test results")
data1 = df['sepal-length']
data2 = df['sepal-width']
stat, p = mannwhitneyu(data1, data2)
print("Statistics:", stat, "\n", "p-value:", p)
# Interpret the p-value at the 5% significance level.
alpha = 0.05
if p > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0 and accepting H1)')

# Wilcoxon signed-rank test comparing petal length with petal width.
print("\n")
print("Wilcoxson Test Results")
data1 = df['petal-length']
data2 = df['petal-width']
stat1, p1 = wilcoxon(data1, data2)
print("Statistics:", stat1, "\n", "p-value:", p1)
# Interpret the p-value at the 5% significance level.
alpha = 0.05
if p1 > alpha:
    print('Same distribution (fail to reject H0)')
else:
    print('Different distribution (reject H0 and accepting H1)')
print("\n")

# Split the data into train and test: 30% is held out for testing, and the
# fixed random_state makes the split repeatable.
print("Randomforest classifier results\n")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Forest of 10000 trees with a fixed seed; n_jobs=-1 asks scikit-learn to use
# all available processors.
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)

# Train the classifier
clf.fit(X_train, y_train)

# Predict the class of each held-out sample.
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1, followed by the confusion matrix.
print(classification_report(y_test, y_pred))
print("Confusion Matrix\n\n", metrics.confusion_matrix(y_test, y_pred))

# Overall accuracy, rounded to one decimal place for display.
a = metrics.accuracy_score(y_test, y_pred)
print("\n")
print("Accuracy score :", round(a, 1))