# Let's load a couple of powerful packages whose tools we'll use throughout.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.2.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
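# A small aside: any function masked above is still reachable through an
# explicit namespace prefix (output omitted):
stats::sd(c(1, 2, 3))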
# Since we're using the H2O package, we'll need to initialize and connect to an H2O instance.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 18 minutes 59 seconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.22.1.1
## H2O cluster version age: 20 days
## H2O cluster name: H2O_started_from_R_michaelespero_xbv417
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.97 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
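# As an aside, h2o.init() accepts resource limits should we need them; these
# values are illustrative, not what was used for this run:
# h2o.init(nthreads = -1, max_mem_size = "4g")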
# Let's import the training data so it becomes an H2O data frame.
train <- h2o.importFile("house_train.csv")
|=================================================================| 100%
# Next, let's split off roughly 20% of the training data as a validation set for hyperparameter
# tuning (h2o.splitFrame splits are approximate; the seed keeps the split reproducible).
train.splits <- h2o.splitFrame(data = train, ratios = .8, seed = 1234)
train <- train.splits[[1]]
valid <- train.splits[[2]]
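# A quick sanity check (output omitted): nrow() works directly on H2O frames,
# so we can confirm the split came out close to 80/20.
nrow(train)
nrow(valid)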
# Import the test data for the Kaggle competition.
test <- h2o.importFile("house_test.csv")
|=================================================================| 100%
# H2O has a convenient describe function that summarizes every column of the data.
h2o.describe(train)
## Label Type Missing Zeros PosInf NegInf Min Max
## 1 Id int 0 0 0 0 1 1458
## 2 MSSubClass int 0 0 0 0 20 190
## 3 MSZoning enum 0 8 0 0 0 4
## 4 LotFrontage int 211 0 0 0 21 313
## 5 LotArea int 0 0 0 0 1300 215245
## 6 Street enum 0 5 0 0 0 1
## 7 Alley enum 0 37 0 0 0 2
## 8 LotShape enum 0 392 0 0 0 3
## 9 LandContour enum 0 52 0 0 0 3
## 10 Utilities enum 0 1179 0 0 0 0
## 11 LotConfig enum 0 218 0 0 0 4
## 12 LandSlope enum 0 1115 0 0 0 2
## 13 Neighborhood enum 0 9 0 0 0 24
## 14 Condition1 enum 0 38 0 0 0 8
## 15 Condition2 enum 0 1 0 0 0 7
## 16 BldgType enum 0 984 0 0 0 4
## 17 HouseStyle enum 0 126 0 0 0 7
## 18 OverallQual int 0 0 0 0 1 10
## 19 OverallCond int 0 0 0 0 1 9
## 20 YearBuilt int 0 0 0 0 1872 2010
## 21 YearRemodAdd int 0 0 0 0 1950 2010
## 22 RoofStyle enum 0 12 0 0 0 5
## 23 RoofMatl enum 0 1 0 0 0 7
## 24 Exterior1st enum 0 18 0 0 0 14
## 25 Exterior2nd enum 0 18 0 0 0 15
## 26 MasVnrType enum 0 12 0 0 0 4
## 27 MasVnrArea int 7 683 0 0 0 1378
## 28 ExterQual enum 0 40 0 0 0 3
## 29 ExterCond enum 0 3 0 0 0 4
## 30 Foundation enum 0 118 0 0 0 5
## 31 BsmtQual enum 0 100 0 0 0 4
## 32 BsmtCond enum 0 34 0 0 0 4
## 33 BsmtExposure enum 0 175 0 0 0 4
## 34 BsmtFinType1 enum 0 179 0 0 0 6
## 35 BsmtFinSF1 int 0 371 0 0 0 5644
## 36 BsmtFinType2 enum 0 18 0 0 0 6
## 37 BsmtFinSF2 int 0 1040 0 0 0 1474
## 38 BsmtUnfSF int 0 96 0 0 0 2121
## 39 TotalBsmtSF int 0 27 0 0 0 6110
## 40 Heating enum 0 1 0 0 0 5
## 41 HeatingQC enum 0 593 0 0 0 4
## 42 CentralAir enum 0 76 0 0 0 1
## 43 Electrical enum 0 78 0 0 0 5
## 44 1stFlrSF int 0 0 0 0 372 4692
## 45 2ndFlrSF int 0 655 0 0 0 2065
## 46 LowQualFinSF int 0 1156 0 0 0 572
## 47 GrLivArea int 0 0 0 0 438 5642
## 48 BsmtFullBath int 0 695 0 0 0 3
## 49 BsmtHalfBath int 0 1110 0 0 0 2
## 50 FullBath int 0 7 0 0 0 3
## 51 HalfBath int 0 733 0 0 0 2
## 52 BedroomAbvGr int 0 5 0 0 0 8
## 53 KitchenAbvGr int 0 1 0 0 0 3
## 54 KitchenQual enum 0 79 0 0 0 3
## 55 TotRmsAbvGrd int 0 0 0 0 3 14
## 56 Functional enum 0 12 0 0 0 6
## 57 Fireplaces int 0 570 0 0 0 3
## 58 FireplaceQu enum 0 21 0 0 0 5
## 59 GarageType enum 0 6 0 0 0 6
## 60 GarageYrBlt int 71 0 0 0 1900 2010
## 61 GarageFinish enum 0 275 0 0 0 3
## 62 GarageCars int 0 71 0 0 0 4
## 63 GarageArea int 0 71 0 0 0 1418
## 64 GarageQual enum 0 3 0 0 0 5
## 65 GarageCond enum 0 2 0 0 0 5
## 66 PavedDrive enum 0 69 0 0 0 2
## 67 WoodDeckSF int 0 613 0 0 0 857
## 68 OpenPorchSF int 0 538 0 0 0 547
## 69 EnclosedPorch int 0 1005 0 0 0 552
## 70 3SsnPorch int 0 1161 0 0 0 320
## 71 ScreenPorch int 0 1078 0 0 0 480
## 72 PoolArea int 0 1173 0 0 0 738
## 73 PoolQC enum 0 2 0 0 0 3
## 74 Fence enum 0 45 0 0 0 4
## 75 MiscFeature enum 0 2 0 0 0 4
## 76 MiscVal int 0 1136 0 0 0 15500
## 77 MoSold int 0 0 0 0 1 12
## 78 YrSold int 0 0 0 0 2006 2010
## 79 SaleType enum 0 34 0 0 0 8
## 80 SaleCondition enum 0 83 0 0 0 5
## 81 SalePrice int 0 0 0 0 34900 755000
## Mean Sigma Cardinality
## 1 7.340814e+02 4.228466e+02 NA
## 2 5.775657e+01 4.311405e+01 NA
## 3 NA NA 5
## 4 7.008471e+01 2.497641e+01 NA
## 5 1.059553e+04 1.074571e+04 NA
## 6 9.957591e-01 6.501140e-02 2
## 7 NA NA 3
## 8 NA NA 4
## 9 NA NA 4
## 10 0.000000e+00 0.000000e+00 2
## 11 NA NA 5
## 12 NA NA 3
## 13 NA NA 25
## 14 NA NA 9
## 15 NA NA 8
## 16 NA NA 5
## 17 NA NA 8
## 18 6.109415e+00 1.371207e+00 NA
## 19 5.595420e+00 1.137423e+00 NA
## 20 1.970992e+03 3.007284e+01 NA
## 21 1.984649e+03 2.066075e+01 NA
## 22 NA NA 6
## 23 NA NA 8
## 24 NA NA 15
## 25 NA NA 16
## 26 NA NA 5
## 27 1.045538e+02 1.766993e+02 NA
## 28 NA NA 4
## 29 NA NA 5
## 30 NA NA 6
## 31 NA NA 5
## 32 NA NA 5
## 33 NA NA 5
## 34 NA NA 7
## 35 4.503359e+02 4.644889e+02 NA
## 36 NA NA 7
## 37 4.946735e+01 1.681181e+02 NA
## 38 5.548414e+02 4.318837e+02 NA
## 39 1.054645e+03 4.388358e+02 NA
## 40 NA NA 6
## 41 NA NA 5
## 42 9.355386e-01 2.456773e-01 2
## 43 NA NA 6
## 44 1.158835e+03 3.918490e+02 NA
## 45 3.558363e+02 4.390238e+02 NA
## 46 6.300254e+00 5.051394e+01 NA
## 47 1.520972e+03 5.346957e+02 NA
## 48 4.206955e-01 5.157412e-01 NA
## 49 6.022053e-02 2.450258e-01 NA
## 50 1.568278e+00 5.491571e-01 NA
## 51 3.842239e-01 4.986796e-01 NA
## 52 2.877014e+00 8.246865e-01 NA
## 53 1.049194e+00 2.278316e-01 NA
## 54 NA NA 4
## 55 6.547922e+00 1.657939e+00 NA
## 56 NA NA 7
## 57 6.039016e-01 6.475906e-01 NA
## 58 NA NA 6
## 59 NA NA 7
## 60 1.978360e+03 2.454403e+01 NA
## 61 NA NA 4
## 62 1.756573e+00 7.536761e-01 NA
## 63 4.710212e+02 2.148926e+02 NA
## 64 NA NA 6
## 65 NA NA 6
## 66 NA NA 3
## 67 9.521968e+01 1.262547e+02 NA
## 68 4.680662e+01 6.748655e+01 NA
## 69 2.290670e+01 6.195481e+01 NA
## 70 2.860051e+00 2.399154e+01 NA
## 71 1.641985e+01 5.864067e+01 NA
## 72 2.927905e+00 4.145457e+01 NA
## 73 NA NA 4
## 74 NA NA 5
## 75 NA NA 5
## 76 4.909584e+01 5.477061e+02 NA
## 77 6.252757e+00 2.671722e+00 NA
## 78 2.007816e+03 1.336779e+00 NA
## 79 NA NA 9
## 80 NA NA 6
## 81 1.809147e+05 7.999717e+04 NA
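# Since h2o.describe() returns an ordinary R data frame, we can lean on dplyr
# to surface just the columns with missing values (a sketch; output omitted):
h2o.describe(train) %>% filter(Missing > 0)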
# We can identify the outcome of interest and the predictor variables.
y <- "SalePrice"
x <- setdiff(names(train), y)
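# One variant worth trying (not what was run here): Id is just a row label with
# no predictive meaning, so we could drop it from the predictors as well.
# x <- setdiff(names(train), c(y, "Id"))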
# We've chosen root mean squared error (RMSE) as the stopping metric; the model at the top of
# the resulting leaderboard will be used to generate predictions.
aml <- h2o.automl(
x = x, y = y,
training_frame = train,
max_runtime_secs = 60,
stopping_metric = "RMSE",
seed = 1
)
|=================================================================| 100%
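# If we wanted a more thorough (and slower) search, h2o.automl() accepts a larger
# budget; these settings are illustrative, not what produced the leaderboard below:
# aml <- h2o.automl(x = x, y = y, training_frame = train,
#                   validation_frame = valid, max_models = 20,
#                   stopping_metric = "RMSE", seed = 1)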
# The AutoML leaderboard can be extracted and printed to show the metrics for each model we trained.
lb <- aml@leaderboard
print(lb, n = nrow(lb))
## model_id
## 1 GBM_1_AutoML_20190117_185240
## 2 XGBoost_1_AutoML_20190117_185240
## 3 XGBoost_2_AutoML_20190117_185240
## 4 GBM_2_AutoML_20190117_185240
## 5 GBM_3_AutoML_20190117_185240
## 6 XGBoost_3_AutoML_20190117_185240
## 7 DRF_1_AutoML_20190117_185240
## 8 XRT_1_AutoML_20190117_185240
## 9 StackedEnsemble_AllModels_AutoML_20190117_185240
## 10 StackedEnsemble_BestOfFamily_AutoML_20190117_185240
## 11 GLM_grid_1_AutoML_20190117_185240_model_1
## mean_residual_deviance rmse mse mae rmsle
## 1 717280140 26782.09 717280140 16166.29 0.1366006
## 2 785852947 28033.07 785852947 15948.20 0.1346524
## 3 817563411 28593.07 817563411 16330.32 0.1357601
## 4 831395080 28833.92 831395080 16562.02 0.1394863
## 5 834891344 28894.49 834891344 16433.43 0.1411505
## 6 855353503 29246.43 855353503 16214.29 0.1360603
## 7 964531790 31056.91 964531790 18528.77 0.1508808
## 8 1082786402 32905.72 1082786402 19292.46 0.1629754
## 9 5576283661 74674.52 5576283661 53874.93 0.3823492
## 10 5970428375 77268.55 5970428375 55906.02 0.3957125
## 11 6399425379 79996.41 6399425379 58049.22 0.4100616
##
## [11 rows x 6 columns]
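# We can also query the leader's cross-validated RMSE directly (output omitted):
h2o.rmse(aml@leader, xval = TRUE)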
# We can make a simple plot showing the relative importance of the top predictors in the lead
# model, a gradient boosting machine.
h2o.varimp_plot(aml@leader)

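# If we want the numbers behind the plot, the importance table itself is
# available too (output omitted):
h2o.varimp(aml@leader)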
# Ideally we'd check how the top model performs on the test data:
# h2o.performance(aml@leader, test) # Whoops: the Kaggle test set has no column for the response variable.
# Instead, we'll generate predictions on the test set using the top-performing model.
pred <- h2o.predict(aml@leader, test)
|=================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'MSZoning' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'Exterior1st' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'Exterior2nd' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'KitchenQual' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'Functional' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'SaleType' has levels not trained on: [NA]
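# These warnings are informational: a few test columns contain missing (NA)
# values the model never saw in training, and H2O applies its usual missing-value
# handling when scoring those rows. We can count the NAs ourselves (output omitted):
h2o.nacnt(test[, c("MSZoning", "Exterior1st", "Exterior2nd",
                   "KitchenQual", "Functional", "SaleType")])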
# Kaggle asks us to submit predictions in a specific format, so we'll turn the H2O frame into a
# tibble (the tidyverse's flavor of data frame) and rename the predict column "SalePrice",
# the predicted sale price for each home in the test set.
pred <- pred %>%
  as_tibble() %>%
  rename("SalePrice" = predict)
# Let's make a column of house IDs, taken from the test set itself (the test-set Ids continue
# where the training Ids leave off, so we shouldn't simply count from 1).
Id <- as.data.frame(test$Id)$Id
# Now add the predicted house price column next to the Id column.
pred <- cbind(Id, pred)
# Lastly, we can write our data frame of Ids and predictions to a .csv file named "SalePrices.csv".
write.csv(pred, file = "SalePrices.csv", row.names = FALSE)
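# Before uploading, a cheap sanity check on the file we just wrote (a sketch,
# assuming the Kaggle test set's 1,459 rows):
submission <- read_csv("SalePrices.csv")
stopifnot(nrow(submission) == 1459, !any(is.na(submission$SalePrice)))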