• 2nd Floor, #7a, High School Road, Secretariat Colony Ambattur, Chennai-600053 (Landmark: SRM School) Tamil Nadu, India
• pro@slogix.in
• +91- 81240 01111

How to implement Decision Tree Classifier algorithm using sklearn in python?

Description

To implement the concept of decision tree classifier using python.

Decision Tree:

• Decision tree breaks down a data set into smaller and smaller subsets.
• It is used for both classification and regression.
• The tree is constructed from decision nodes and leaf nodes.
• The topmost decision node in a tree is the root node.
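• Nodes below the root are either internal decision nodes, which split the data further, or leaf nodes, which have no children and hold the final prediction.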
• Decision tree construction is based on impurity measures: entropy (from which information gain is computed) and the Gini index.
Sample Code

# Import libraries
import pandas as pd
import warnings
# Suppress library warnings so the tutorial output stays readable.
warnings.filterwarnings("ignore")
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn import metrics
# Global seaborn theme used by every plot below.
sns.set(style="ticks", color_codes=True)

# Load the Iris data set. The raw file has no header row, so the column
# names are supplied explicitly.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']

# The original referenced an undefined `data`; read the CSV directly instead.
df = pd.read_csv(url, names=names)

# Checking missing values: count of NaN entries per column.
print("Missing values\n", df.isnull().sum())
print("\n")

# Descriptive statistics of the numeric columns.
print("Descriptive statistics\n", df.describe())

# Feature matrix: every column except the target. `columns=` is used because
# passing the axis positionally (df.drop('class', 1)) was removed in pandas 2.0.
X = df.drop(columns='class')

# Target vector: the species label.
y = df['class']

# Checking outliers: draw one box plot per feature column.
print("\n")
print("Outliers detection\n")
for feature in ("sepal-length", "sepal-width", "petal-length", "petal-width"):
    sns.catplot(x=feature, kind="box", data=df)
    plt.show()
print("\n")

# Removing outliers: replace extreme 'sepal-width' values with the median of
# the remaining (inlier) values. The replacement is written to this column
# only; the original `df.fillna(median)` would also have filled NaN values in
# every other column with the sepal-width median.
print("Without Outliers\n")
column = 'sepal-width'

# Low-side outliers: values <= 2. The median is taken over the complement so
# the values being replaced do not influence it.
low_outliers = df[column] <= 2
df.loc[low_outliers, column] = df.loc[~low_outliers, column].median()

# High-side outliers: values > 4 (a value of exactly 4 is kept as an inlier).
high_outliers = df[column] > 4
df.loc[high_outliers, column] = df.loc[~high_outliers, column].median()

# Re-draw the box plot to confirm the outliers are gone.
sns.catplot(x="sepal-width", kind="box", data=df)
plt.show()

# Categorical scatter plot of each independent variable against the class.
print("Plot for categorical variable\n")

# (feature column, plot title) pairs; one plot is drawn per pair.
scatter_plots = [
    ("sepal-length", "Sepal Length vs Class(50 samples only)"),
    ("sepal-width", "sepal-width vs class(50 samples only)"),
    ("petal-length", "petal-length vs class(50 samples only)"),
    ("petal-width", "petal-width vs class(50 samples only)"),
]

for feature, title in scatter_plots:
    # A fresh random sample of 50 rows is drawn for every plot to keep the
    # figure uncluttered.
    df1 = df.sample(50)
    sns.catplot(x=feature, y="class", data=df1, marker='o')
    plt.title(title)
    plt.show()
    print("\n")

# Train and evaluate the decision tree classifier.
print("Decision Tree classifier results\n")

# Rebuild the features and target from `df` here. The earlier `X` was copied
# from `df` before the outlier treatment, so it would not reflect the cleaned
# 'sepal-width' values.
X = df.drop(columns="class")
y = df["class"]

# Hold out 30% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Seed the classifier as well so repeated runs build the same tree.
classifier = DecisionTreeClassifier(random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Per-class precision/recall/F1, the confusion matrix and overall accuracy.
print(metrics.classification_report(y_test, y_pred))
print("confusion_matrix\n", metrics.confusion_matrix(y_test, y_pred))
print("\n")
print("Accuracy_score", metrics.accuracy_score(y_test, y_pred))

# Decision tree visualization.
# `sklearn.externals.six` is no longer shipped with scikit-learn; the
# standard-library StringIO is a drop-in replacement for the text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the fitted tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
export_graphviz(
    classifier,
    out_file=dot_data,
    # Label nodes with the real column and class names instead of X[i] / y[i].
    feature_names=list(X.columns),
    class_names=[str(label) for label in classifier.classes_],
    filled=True,
    rounded=True,
    special_characters=True,
)

# Render the DOT text to PNG.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
# As the last expression of a notebook cell this displays the tree inline.
Image(graph.create_png())