I am connecting to a local instance of Spark using the spark_connect() function.
# Load sparklyr, which provides spark_install() and spark_connect().
library(sparklyr)
# Make sure Spark 2.1.0 is available locally; when it is already present the
# call only reports that the version is installed.
spark_install(version = "2.1.0")
Spark 2.1.0 for Hadoop 2.7 or later already installed.
# Open (or re-use) a connection to the local Spark instance.
spark <- spark_connect(master = "local")
Re-using existing Spark connection to local
Copying the Titanic training data set from R into the Spark cluster.
# Load the Titanic data sets (titanic_train, titanic_test) and the dplyr verbs.
library(titanic)
library(dplyr)
??titanic
# Copy the local training data frame into Spark.
# overwrite = TRUE makes the step re-runnable: without it copy_to() stops with
# "table titanic_train already exists" when the table is already registered
# in the Spark session.
# The Spark table reference gets its own name so that the local data frame
# `titanic_train` is not shadowed and stays usable by the base-R steps that
# follow (factor conversion, imputation, randomForest).
titanic_train_tbl <- copy_to(
  spark,
  titanic_train,
  name = "titanic_train",
  overwrite = TRUE
)
Error in sdf_import.default(x, sc, name, memory, repartition, overwrite, :
table titanic_train already exists (pass overwrite = TRUE to overwrite)
# List the names of the tables available through the Spark connection.
src_tbls(spark)
[1] "titanic_test" "titanic_train"
The Spark cluster already holds the Titanic training data set.
# Keep only the male passengers of the test set.
filter(titanic_test, Sex == "male")
Error in UseMethod("filter_") :
no applicable method for 'filter_' applied to an object of class "list"
# Select the survival outcome, passenger class and sex columns.
select(titanic_train, Survived, Pclass, Sex)
# Inspect the structure (column names and types) of the training data.
str(titanic_train)
'data.frame': 891 obs. of 12 variables:
$ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
$ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
$ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
$ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
$ Sex : chr "male" "female" "female" "female" ...
$ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
$ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
$ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
$ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
$ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
$ Cabin : chr "" "C85" "" "C123" ...
$ Embarked : chr "S" "C" "S" "S" ...
# Converting training variables: store the categorical predictors as factors.
titanic_train[["Sex"]] <- as.factor(titanic_train[["Sex"]])
titanic_train[["Embarked"]] <- as.factor(titanic_train[["Embarked"]])
titanic_train[["Pclass"]] <- as.factor(titanic_train[["Pclass"]])
# Fill in missing ages with the mean of the observed (non-NA) ages.
age.mean <- mean(titanic_train[["Age"]], na.rm = TRUE)
titanic_train[["Age"]][is.na(titanic_train[["Age"]])] <- age.mean
# Converting test data variables: store the categorical predictors as factors,
# mirroring the conversion applied to the training data.
titanic_test$Sex <- as.factor(titanic_test$Sex)
titanic_test$Embarked <- as.factor(titanic_test$Embarked)
# Pclass must be built from the Pclass column; it was previously overwritten
# with the Embarked values by a copy-paste slip.
titanic_test$Pclass <- as.factor(titanic_test$Pclass)
# Mean of the observed (non-NA) ages in the test set.
age.mean_test <- mean(titanic_test$Age, na.rm = TRUE)
argument is not numeric or logical: returning NA
# Replace missing test-set ages with `age.mean`.
# NOTE(review): `age.mean` is the training-set mean; `age.mean_test` is
# computed just above but never used. Confirm which value is intended here.
titanic_test$Age[is.na(titanic_test$Age)] <- age.mean
is.na() applied to non-(list or vector) of type 'NULL'
# Count the female and male passengers in the training data.
table(titanic_train[["Sex"]])
female male
314 577
# Missing value check: number of NA entries in each training column.
vapply(titanic_train, function(column) sum(is.na(column)), integer(1))
PassengerId Survived Pclass Name Sex Age
0 0 0 0 0 0
SibSp Parch Ticket Fare Cabin Embarked
0 0 0 0 0 0
# Number of missing values in the Sex column.
sum(is.na(titanic_train[["Sex"]]))
[1] 0
# Class balance of the outcome: counts of Survived = 0 and Survived = 1.
table(titanic_train[["Survived"]])
0 1
549 342
# Let's try a random forest: classify survival from the passenger attributes.
# The seed is set first so that the fitted forest is reproducible.
set.seed(111)
library(randomForest)
rf <- randomForest(
  factor(Survived) ~ Sex + Embarked + Pclass + Age + SibSp + Parch + Fare,
  data = titanic_train
)
# Plot the fitted forest's error rates.
plot(rf)
Overall error rate is less than 0.25.