How to implement logistic regression using Spark with Python?


To implement logistic regression using Spark with Python, follow these steps:


  Set up Spark Context and Spark session

  Load the dataset

  Deal with categorical data and convert the data to dense vectors (features and label)

  Transform the dataset to a DataFrame

  Identify categorical features, and index them.

  Split the data into train and test set

  Fit the logistic regression model

  Predict using the test set.

  Evaluate the metrics

from pyspark.sql import SparkSession
from import Pipeline
from import StringIndexer, OneHotEncoder, VectorAssembler,IndexToString
from pyspark.sql.functions import col
from import VectorIndexer
from import LogisticRegression
from sklearn.metrics import confusion_matrix
#Set up SparkContext and SparkSession
spark=SparkSession \
.builder \
.appName(“Python spark logistic regression example”)\
#Load the data set‘com.databricks.spark.csv’).options(header=’True’,inferschema=’True’).load(“/home/…../weight-height.csv”)
def get_dummy(df,categoricalCols,continuousCols,labelCol):
indexers = [ StringIndexer(inputCol=c, outputCol=”{0}_indexed”.format(c))
for c in categoricalCols ]
# default setting: dropLast=True
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(),
for indexer in indexers ]
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders]
+ continuousCols, outputCol=”features”)
pipeline = Pipeline(stages=indexers + encoders + [assembler])
data = model.transform(df)
data = data.withColumn(‘label’,col(labelCol))
catcols =[]
num_cols = [‘Height’,’Weight’]
labelCol = ‘Gender’
data = get_dummy(df,catcols,num_cols,labelCol)
# Index labels, adding metadata to the label column
labelIndexer = StringIndexer(inputCol=’label’,outputCol=’indexedLabel’).fit(data)
labelIndexer.transform(data).show(5, True)
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =VectorIndexer(inputCol=”features”, \
outputCol=”indexedFeatures”, \
# Split the data into training and test sets (20% held out for testing)
(trainingData, testData) = data.randomSplit([0.8, 0.2])
logr = LogisticRegression(featuresCol=’indexedFeatures’, labelCol=’indexedLabel’)
# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol=”prediction”, outputCol=”predictedLabel”,
# Chain indexers and tree in a Pipeline
pipeline = Pipeline(stages=[labelIndexer, featureIndexer, logr,labelConverter])
# Train model. This also runs the indexers.
model =
# Make predictions.
predictions = model.transform(testData)
# Select example rows to display.
y_true =“label”).toPandas()
y_pred =“predictedLabel”).toPandas()
class_temp =“label”).groupBy(“label”).count().sort(‘count’, ascending=False).toPandas()
class_temp = class_temp[“label”].values.tolist()
class_names = map(str, class_temp)
# # # print(class_name)
cnf_matrix = confusion_matrix(y_true, y_pred,labels=class_names)
print(“Confusion Matrix : “,cnf_matrix)

Leave Comment

Your email address will not be published. Required fields are marked *

clear formSubmit