How to implement Decision Tree Classifier algorithm using sklearn in python?


To implement the concept of decision tree classifier using python.

Decision Tree:

  • Decision tree breaks down a data set into smaller and smaller subsets.
  • It is used for both classification and regression.
  • The tree is constructed from decision nodes and leaf nodes.
  • The topmost decision node in a tree is root node.
  • The nodes at the ends of the branches, which hold the final outcome, are called leaf nodes.
  • Decision tree construction is based on impurity measures: entropy (from which information gain is computed) and the Gini index.

#import libraries
import pandas as pd
import warnings
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import pydotplus
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Load the data set from its URL.
# NOTE(review): the URL literal was empty in the original listing. The column
# names below match the UCI Iris data set, so its usual location is assumed
# here -- confirm before relying on it.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
# Column names are supplied explicitly via ``names=`` when reading the CSV.
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'class']
data = pd.read_csv(url, names=names)

# The rest of the script works on ``df``, a DataFrame built from ``data``.
df = pd.DataFrame(data)

# Checking missing values: number of nulls in each column.
print("Missing values\n", df.isnull().sum())
# Descriptive statistics
print("Descriptive statistics\n", df.describe())

# Feature matrix: every column except the target. ``axis`` is passed by
# keyword because pandas 2.0 no longer accepts it as a positional argument.
X = df.drop('class', axis=1)

# Target vector: the class label of each row.
y = df['class']

# Checking outliers: draw one box plot per numeric feature.
print("Outliers detection\n")
for feature in ("sepal-length", "sepal-width", "petal-length", "petal-width"):
    sns.catplot(x=feature, kind="box", data=df)

# Removing outliers
print("Without Outliers\n")

# Sepal widths at or below 2, or above 4, are treated as outliers (the same
# thresholds the original masks used).
sepal_width = df['sepal-width']
is_outlier = (sepal_width <= 2) | (sepal_width > 4)

# The original computed ``median`` but never used it, leaving NaN in place of
# the outliers. Replace the outliers with the median of the remaining values
# so the column stays complete.
median = sepal_width[~is_outlier].median()
df.loc[is_outlier, 'sepal-width'] = median

# Re-draw the box plot to confirm the outliers are gone.
sns.catplot(x="sepal-width", kind="box", data=df)

# Categorical scatter plot for each independent variable
print("Plot for categorical variable\n")

# (feature column, plot title) pairs; titles are kept exactly as they were.
scatter_plots = [
    ("sepal-length", "Sepal Length vs Class(50 samples only)"),
    ("sepal-width", "sepal-width vs class(50 samples only)"),
    ("petal-length", "petal-length vs class(50 samples only)"),
    ("petal-width", "petal-width vs class(50 samples only)"),
]

for column, title in scatter_plots:
    # A fresh random sample of 50 rows is drawn for every plot.
    df1 = df.sample(50)
    sns.catplot(x=column, y="class", data=df1, marker='o')
    plt.title(title)

# Train a decision tree classifier and report how it performs on held-out data.
print("Decision Tree classifier results\n")

# Split data into train and test sets: 30% is held out for testing, and the
# fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit the tree on the training split (the original line was garbled and had
# lost the ``.fit(X_train`` part of the call).
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)

# Predict the classes of the unseen test rows.
y_pred = classifier.predict(X_test)

# Evaluation: per-class report, confusion matrix and overall accuracy.
print(metrics.classification_report(y_test, y_pred))
print("confusion_matrix\n", metrics.confusion_matrix(y_test, y_pred))
print("Accuracy_score", metrics.accuracy_score(y_test, y_pred))

# Decision tree visualization
# ``sklearn.externals.six`` is no longer shipped with scikit-learn; the
# standard library's ``io.StringIO`` provides the same in-memory text buffer.
from io import StringIO
from IPython.display import Image
from sklearn.tree import export_graphviz
import pydotplus

# Export the fitted tree in Graphviz DOT format into an in-memory buffer.
dot_data = StringIO()
export_graphviz(classifier, out_file=dot_data,
                filled=True, rounded=True)

# Parse the DOT text into a pydotplus graph object.
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())

# NOTE(review): the original listing is cut off after building the graph.
# Rendering it is the presumed next step, since ``Image`` is imported but
# otherwise unused -- confirm. The image is only displayed inline when this
# is the last expression of a notebook cell.
Image(graph.create_png())

Leave Comment

Your email address will not be published. Required fields are marked *

clear formSubmit