# Titanic survival modelling with H2O.
# Fits a binomial GLM and a random forest on an 80/20 split of titanic_train,
# compares them on the held-out frame, then inspects variable importance and
# the random forest's predictions.

library(h2o)

# Setup ----

# Connect to (or start) the H2O cluster before anything else: h2o.removeAll()
# needs an active connection. nthreads = -1 asks H2O to use all available cores.
h2o.init(nthreads = -1)
h2o.removeAll() # Deletes all data and models from the H2O engine

# Data preparation ----

# titanic_train is provided by the `titanic` package; naming the package here
# means the dataset loads even when `titanic` is not attached.
data("titanic_train", package = "titanic")

# Convert the response and the categorical predictors to factors so that H2O
# treats them as categorical columns (and the task as classification).
titanic_train$Survived <- as.factor(titanic_train$Survived)
titanic_train$Pclass <- as.factor(titanic_train$Pclass)
titanic_train$Sex <- as.factor(titanic_train$Sex)

titanic.h2o <- as.h2o(titanic_train)

# 80/20 split; the seed makes the split reproducible.
titanic_split <- h2o.splitFrame(data = titanic.h2o, ratios = 0.8, seed = 1234)
train_data <- titanic_split[[1]] # first frame: the ~80% training portion
test_data <- titanic_split[[2]] # second frame: the ~20% held-out portion

print(paste("Training rows:", h2o.nrow(train_data)))
print(paste("Testing rows:", h2o.nrow(test_data)))

predictors <- c("Pclass", "Sex", "Age", "SibSp", "Parch", "Fare")
response <- "Survived"

# Model 1: logistic regression (binomial GLM) ----

titanic_glm <- h2o.glm(
  x = predictors,
  y = response,
  training_frame = train_data,
  validation_frame = test_data,
  family = "binomial"
)

# Author's recorded result: AUC of 0.86, so it is strong.
h2o.auc(titanic_glm, valid = TRUE)
# Author's recorded result: RMSE of 0.39, a moderate amount of error, but
# still reasonable.
h2o.rmse(titanic_glm, valid = TRUE)

# Model 2: random forest ----

titanic_rf <- h2o.randomForest(
  x = predictors,
  y = response,
  training_frame = train_data,
  validation_frame = test_data,
  ntrees = 50,
  seed = 1234
)

# Author's recorded result: AUC of 0.89, so better than the GLM.
h2o.auc(titanic_rf, valid = TRUE)
# Author's recorded result: RMSE of 0.35. The random forest has a lower error
# than the GLM, so it is the better model on this split.
h2o.rmse(titanic_rf, valid = TRUE)

# Author's reading of the plot: Sex is the most significant variable when it
# comes to predicting survival, followed by Age and Fare.
h2o.varimp_plot(titanic_rf)

# Predictions ----

pred <- h2o.predict(titanic_rf, newdata = test_data)

# Each row holds the predicted class plus class probabilities:
# p1 is the probability of survival and p0 is the probability of death.
# NOTE(review): H2O picks the predicted label by thresholding p1 at a
# model-chosen cutoff (max-F1 by default, per the H2O docs), so the label does
# not necessarily flip exactly where p1 exceeds p0 -- confirm if it matters.
head(pred)