# Implement logistic regression for a multinomial outcome using Spark with R (SparkR)
# Set up Spark home ----
# Point SPARK_HOME at the local Spark installation, then prepend its R/lib
# directory to the library search path so library(SparkR) can be resolved
# from there.
Sys.setenv(SPARK_HOME = "/…/spark-2.4.0-bin-hadoop2.7")
.libPaths(c(file.path(Sys.getenv("SPARK_HOME"), "R", "lib"), .libPaths()))

# Load the libraries ----
library(SparkR)
library("caret")
# Initialize the Spark context ----
# To run Spark on a local node give master = "local".
# NOTE(review): the right-hand sides of the `sc` and `sqlContext` assignments
# were missing from this script; they are reconstructed here with the
# Spark 2.x entry point sparkR.session() -- confirm against the intended setup.
sc <- sparkR.session(master = "local")

# Start the SparkSQL context ----
# Kept as a second name for the same session so that code referring to either
# `sc` or `sqlContext` still finds a defined object.
sqlContext <- sc

# Load the data set ----
# Read the CSV with a header row, let Spark infer column types, and treat the
# string "NA" as a missing value.
data <- read.df(
  "file:///…../iris.csv", "csv",
  header = "True", inferSchema = "true", na.strings = "NA"
)
# Split the data into train and test sets ----
# randomSplit() receives the relative weights 7:3 and the seed 30.
splt_data <- randomSplit(data, c(7, 3), 30)
trainingData <- splt_data[[1]]
testData <- splt_data[[2]]

# From the test set, keep the first four columns as predictors and the
# "species" column as the reference labels used for evaluation.
coln <- columns(data)
xtest <- select(testData, coln[1:4])
ytest <- select(testData, "species")
# Build the model ----
# NOTE(review): the original assignment to `zoo_model` was missing its
# right-hand side; it is reconstructed here as a multinomial logistic
# regression of `species` on all remaining columns of the training data.
# Confirm the formula and any tuning parameters against the intended model.
zoo_model <- spark.logit(trainingData, species ~ ., family = "multinomial")
summary(zoo_model)
# Predict using the test data ----
pred <- predict(zoo_model, xtest)
showDF(pred, 10)

# Convert the Spark data frames to R data frames ----
# NOTE(review): predictions and true labels are collected from two separate
# Spark DataFrames; comparing them row by row assumes both come back in the
# same row order -- confirm, or carry the label column through predict().
y_pred <- collect(select(pred, "prediction"), stringsAsFactors = FALSE)
y_true <- collect(select(ytest, "species"), stringsAsFactors = FALSE)

# Calculate the confusion matrix ----
# Build both factors on one shared level set so the comparison stays valid
# even when some class never appears among the predictions.
class_levels <- sort(unique(c(y_true$species, y_pred$prediction)))
confusionMatrix(
  factor(y_pred$prediction, levels = class_levels),
  factor(y_true$species, levels = class_levels)
)