Sentiment analysis on amazon products reviews using KNN algorithm in python?

Description

To train a machine learning model for classify products review using KNN in python.

Input

Amazon product review data set. (Kaggle)

Output

Confusion matrix, classification report and accuracy_score.

  Import library.

  Load the data set.

  Declare independent and dependent variables.

  Do pre process for text data.

  Split the data into train and test.

  Build the model.

  Fit the train data into the model.

  Evaluate the model using test data.

#import necessary libraries
import pandas as pd
import numpy as np
import re
import warnings
import seaborn as sns
warnings.filterwarnings(“ignore”)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

#column header
names = [‘comments’,’type’]

#load data
data = pd.read_table(‘…/amazon_cells_labelled.txt’,sep=”\t”,names = names)

#check missing values
print(“Checking missing values\n\n”)
print(data.isnull().sum())

#make it as a data frame
df = pd.DataFrame(data)

#counts in each class
print(“\n”)
print(“Counts in each class\n”)
print(df[‘type’].value_counts())

#Features
X = df[‘comments’]
y = df[‘type’]

#change text lower cases and removal of white spaces
lower_text = []
for i in range(0,len(X)):
s = str(X[i])
s1 = s.strip()
lower_text.append(s1.lower())
#print(“After converting text to lower case\n\n”,lower_text)

#Remove punctuation
punc_text = []
for i in range(0,len(lower_text)):
s2 = (lower_text[i])
s3 = re.sub(r'[^\w\s2]’,”,s2)
punc_text.append(s3)
#print(“After removed punctuation\n\n”,punc_text)

#Word vectorization
#Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5,max_df = 0.7,norm=’l2′, encoding=’latin-1′, ngram_range=(1, 2),
stop_words=’english’)

#transform independent variable using TF-IDF vectorizer
print(“\n”)
X_tfidf = tfidf.fit_transform(punc_text)
print(“After vectorized text data\n\n”,X_tfidf)

#Split the data into train and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.1, random_state=0)

#Print training data
print(“\n”)
print(“Training data\n\n”,X_train,”\n”,Y_train)
print(“\n\n”)

#Print testing data
print(“Testing data\n\n”,X_test)
print(“\n\n”)

#Build the KNN model
knn = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
params_knn = {‘n_neighbors’: np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gs = GridSearchCV(knn, params_knn, cv=5)
#fit model to training data
knn_gs.fit(X_train, y_train)
knn_best = knn_gs.best_estimator_
y_pred = knn_best.predict(X_test)

#classification report & confusion matrix
print(“Confusion Matrix\n”,confusion_matrix(Y_test,y_pred))
print(“\n”)
print(“Classification Report\n”,classification_report(Y_test,y_pred))
print(“\n”)
print(“Accuracy : “,accuracy_score(Y_test,y_pred)*100)

Leave Comment

Your email address will not be published. Required fields are marked *

clear formSubmit