# Let's load a couple of powerful packages whose tools we'll use throughout.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.2.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(h2o)
##
## ----------------------------------------------------------------------
##
## Your next step is to start H2O:
## > h2o.init()
##
## For H2O package documentation, ask for help:
## > ??h2o
##
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit http://docs.h2o.ai
##
## ----------------------------------------------------------------------
##
## Attaching package: 'h2o'
## The following objects are masked from 'package:stats':
##
## cor, sd, var
## The following objects are masked from 'package:base':
##
## &&, %*%, %in%, ||, apply, as.factor, as.numeric, colnames,
## colnames<-, ifelse, is.character, is.factor, is.numeric, log,
## log10, log1p, log2, round, signif, trunc
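# A small aside: any function masked above is still reachable through an
# explicit namespace prefix (output omitted):
stats::sd(c(1, 2, 3))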
# Since we're using the H2O package, we'll need to initialize and connect to an H2O instance.
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 18 minutes 59 seconds
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.22.1.1
## H2O cluster version age: 20 days
## H2O cluster name: H2O_started_from_R_michaelespero_xbv417
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.97 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
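# As an aside, h2o.init() accepts resource limits should we need them; these
# values are illustrative, not what was used for this run:
# h2o.init(nthreads = -1, max_mem_size = "4g")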
# Let's import the training data so it becomes an H2O data frame.
train <- h2o.importFile("house_train.csv")
|=================================================================| 100%
# Next, let's split off roughly 20% of the training data as a validation set for hyperparameter
# tuning (h2o.splitFrame splits are approximate; the seed keeps the split reproducible).
train.splits <- h2o.splitFrame(data = train, ratios = .8, seed = 1234)
train <- train.splits[[1]]
valid <- train.splits[[2]]
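# A quick sanity check (output omitted): nrow() works directly on H2O frames,
# so we can confirm the split came out close to 80/20.
nrow(train)
nrow(valid)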
# Import the test data for the Kaggle competition.
test <- h2o.importFile("house_test.csv")
|=================================================================| 100%
# H2O has a convenient describe function that summarizes every column of the data.
h2o.describe(train)
## Label Type Missing Zeros PosInf NegInf Min Max
## 1 Id int 0 0 0 0 1 1458
## 2 MSSubClass int 0 0 0 0 20 190
## 3 MSZoning enum 0 8 0 0 0 4
## 4 LotFrontage int 211 0 0 0 21 313
## 5 LotArea int 0 0 0 0 1300 215245
## 6 Street enum 0 5 0 0 0 1
## 7 Alley enum 0 37 0 0 0 2
## 8 LotShape enum 0 392 0 0 0 3
## 9 LandContour enum 0 52 0 0 0 3
## 10 Utilities enum 0 1179 0 0 0 0
## 11 LotConfig enum 0 218 0 0 0 4
## 12 LandSlope enum 0 1115 0 0 0 2
## 13 Neighborhood enum 0 9 0 0 0 24
## 14 Condition1 enum 0 38 0 0 0 8
## 15 Condition2 enum 0 1 0 0 0 7
## 16 BldgType enum 0 984 0 0 0 4
## 17 HouseStyle enum 0 126 0 0 0 7
## 18 OverallQual int 0 0 0 0 1 10
## 19 OverallCond int 0 0 0 0 1 9
## 20 YearBuilt int 0 0 0 0 1872 2010
## 21 YearRemodAdd int 0 0 0 0 1950 2010
## 22 RoofStyle enum 0 12 0 0 0 5
## 23 RoofMatl enum 0 1 0 0 0 7
## 24 Exterior1st enum 0 18 0 0 0 14
## 25 Exterior2nd enum 0 18 0 0 0 15
## 26 MasVnrType enum 0 12 0 0 0 4
## 27 MasVnrArea int 7 683 0 0 0 1378
## 28 ExterQual enum 0 40 0 0 0 3
## 29 ExterCond enum 0 3 0 0 0 4
## 30 Foundation enum 0 118 0 0 0 5
## 31 BsmtQual enum 0 100 0 0 0 4
## 32 BsmtCond enum 0 34 0 0 0 4
## 33 BsmtExposure enum 0 175 0 0 0 4
## 34 BsmtFinType1 enum 0 179 0 0 0 6
## 35 BsmtFinSF1 int 0 371 0 0 0 5644
## 36 BsmtFinType2 enum 0 18 0 0 0 6
## 37 BsmtFinSF2 int 0 1040 0 0 0 1474
## 38 BsmtUnfSF int 0 96 0 0 0 2121
## 39 TotalBsmtSF int 0 27 0 0 0 6110
## 40 Heating enum 0 1 0 0 0 5
## 41 HeatingQC enum 0 593 0 0 0 4
## 42 CentralAir enum 0 76 0 0 0 1
## 43 Electrical enum 0 78 0 0 0 5
## 44 1stFlrSF int 0 0 0 0 372 4692
## 45 2ndFlrSF int 0 655 0 0 0 2065
## 46 LowQualFinSF int 0 1156 0 0 0 572
## 47 GrLivArea int 0 0 0 0 438 5642
## 48 BsmtFullBath int 0 695 0 0 0 3
## 49 BsmtHalfBath int 0 1110 0 0 0 2
## 50 FullBath int 0 7 0 0 0 3
## 51 HalfBath int 0 733 0 0 0 2
## 52 BedroomAbvGr int 0 5 0 0 0 8
## 53 KitchenAbvGr int 0 1 0 0 0 3
## 54 KitchenQual enum 0 79 0 0 0 3
## 55 TotRmsAbvGrd int 0 0 0 0 3 14
## 56 Functional enum 0 12 0 0 0 6
## 57 Fireplaces int 0 570 0 0 0 3
## 58 FireplaceQu enum 0 21 0 0 0 5
## 59 GarageType enum 0 6 0 0 0 6
## 60 GarageYrBlt int 71 0 0 0 1900 2010
## 61 GarageFinish enum 0 275 0 0 0 3
## 62 GarageCars int 0 71 0 0 0 4
## 63 GarageArea int 0 71 0 0 0 1418
## 64 GarageQual enum 0 3 0 0 0 5
## 65 GarageCond enum 0 2 0 0 0 5
## 66 PavedDrive enum 0 69 0 0 0 2
## 67 WoodDeckSF int 0 613 0 0 0 857
## 68 OpenPorchSF int 0 538 0 0 0 547
## 69 EnclosedPorch int 0 1005 0 0 0 552
## 70 3SsnPorch int 0 1161 0 0 0 320
## 71 ScreenPorch int 0 1078 0 0 0 480
## 72 PoolArea int 0 1173 0 0 0 738
## 73 PoolQC enum 0 2 0 0 0 3
## 74 Fence enum 0 45 0 0 0 4
## 75 MiscFeature enum 0 2 0 0 0 4
## 76 MiscVal int 0 1136 0 0 0 15500
## 77 MoSold int 0 0 0 0 1 12
## 78 YrSold int 0 0 0 0 2006 2010
## 79 SaleType enum 0 34 0 0 0 8
## 80 SaleCondition enum 0 83 0 0 0 5
## 81 SalePrice int 0 0 0 0 34900 755000
## Mean Sigma Cardinality
## 1 7.340814e+02 4.228466e+02 NA
## 2 5.775657e+01 4.311405e+01 NA
## 3 NA NA 5
## 4 7.008471e+01 2.497641e+01 NA
## 5 1.059553e+04 1.074571e+04 NA
## 6 9.957591e-01 6.501140e-02 2
## 7 NA NA 3
## 8 NA NA 4
## 9 NA NA 4
## 10 0.000000e+00 0.000000e+00 2
## 11 NA NA 5
## 12 NA NA 3
## 13 NA NA 25
## 14 NA NA 9
## 15 NA NA 8
## 16 NA NA 5
## 17 NA NA 8
## 18 6.109415e+00 1.371207e+00 NA
## 19 5.595420e+00 1.137423e+00 NA
## 20 1.970992e+03 3.007284e+01 NA
## 21 1.984649e+03 2.066075e+01 NA
## 22 NA NA 6
## 23 NA NA 8
## 24 NA NA 15
## 25 NA NA 16
## 26 NA NA 5
## 27 1.045538e+02 1.766993e+02 NA
## 28 NA NA 4
## 29 NA NA 5
## 30 NA NA 6
## 31 NA NA 5
## 32 NA NA 5
## 33 NA NA 5
## 34 NA NA 7
## 35 4.503359e+02 4.644889e+02 NA
## 36 NA NA 7
## 37 4.946735e+01 1.681181e+02 NA
## 38 5.548414e+02 4.318837e+02 NA
## 39 1.054645e+03 4.388358e+02 NA
## 40 NA NA 6
## 41 NA NA 5
## 42 9.355386e-01 2.456773e-01 2
## 43 NA NA 6
## 44 1.158835e+03 3.918490e+02 NA
## 45 3.558363e+02 4.390238e+02 NA
## 46 6.300254e+00 5.051394e+01 NA
## 47 1.520972e+03 5.346957e+02 NA
## 48 4.206955e-01 5.157412e-01 NA
## 49 6.022053e-02 2.450258e-01 NA
## 50 1.568278e+00 5.491571e-01 NA
## 51 3.842239e-01 4.986796e-01 NA
## 52 2.877014e+00 8.246865e-01 NA
## 53 1.049194e+00 2.278316e-01 NA
## 54 NA NA 4
## 55 6.547922e+00 1.657939e+00 NA
## 56 NA NA 7
## 57 6.039016e-01 6.475906e-01 NA
## 58 NA NA 6
## 59 NA NA 7
## 60 1.978360e+03 2.454403e+01 NA
## 61 NA NA 4
## 62 1.756573e+00 7.536761e-01 NA
## 63 4.710212e+02 2.148926e+02 NA
## 64 NA NA 6
## 65 NA NA 6
## 66 NA NA 3
## 67 9.521968e+01 1.262547e+02 NA
## 68 4.680662e+01 6.748655e+01 NA
## 69 2.290670e+01 6.195481e+01 NA
## 70 2.860051e+00 2.399154e+01 NA
## 71 1.641985e+01 5.864067e+01 NA
## 72 2.927905e+00 4.145457e+01 NA
## 73 NA NA 4
## 74 NA NA 5
## 75 NA NA 5
## 76 4.909584e+01 5.477061e+02 NA
## 77 6.252757e+00 2.671722e+00 NA
## 78 2.007816e+03 1.336779e+00 NA
## 79 NA NA 9
## 80 NA NA 6
## 81 1.809147e+05 7.999717e+04 NA
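# Since h2o.describe() returns an ordinary R data frame, we can lean on dplyr
# to surface just the columns with missing values (a sketch; output omitted):
h2o.describe(train) %>% filter(Missing > 0)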
# We can identify the outcome of interest and the predictor variables.
y <- "SalePrice"
x <- setdiff(names(train), y)
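# One variant worth trying (not what was run here): Id is just a row label with
# no predictive meaning, so we could drop it from the predictors as well.
# x <- setdiff(names(train), c(y, "Id"))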
# We've chosen root mean squared error (RMSE) as the stopping metric; the model at the top of
# the resulting leaderboard will be used to generate predictions.
aml <- h2o.automl(
x = x, y = y,
training_frame = train,
max_runtime_secs = 60,
stopping_metric = "RMSE",
seed = 1
)
|=================================================================| 100%
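# If we wanted a more thorough (and slower) search, h2o.automl() accepts a larger
# budget; these settings are illustrative, not what produced the leaderboard below:
# aml <- h2o.automl(x = x, y = y, training_frame = train,
#                   validation_frame = valid, max_models = 20,
#                   stopping_metric = "RMSE", seed = 1)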
# The AutoML leaderboard can be extracted and printed to show the metrics for each model we trained.
lb <- aml@leaderboard
print(lb, n = nrow(lb))
## model_id
## 1 GBM_1_AutoML_20190117_185240
## 2 XGBoost_1_AutoML_20190117_185240
## 3 XGBoost_2_AutoML_20190117_185240
## 4 GBM_2_AutoML_20190117_185240
## 5 GBM_3_AutoML_20190117_185240
## 6 XGBoost_3_AutoML_20190117_185240
## 7 DRF_1_AutoML_20190117_185240
## 8 XRT_1_AutoML_20190117_185240
## 9 StackedEnsemble_AllModels_AutoML_20190117_185240
## 10 StackedEnsemble_BestOfFamily_AutoML_20190117_185240
## 11 GLM_grid_1_AutoML_20190117_185240_model_1
## mean_residual_deviance rmse mse mae rmsle
## 1 717280140 26782.09 717280140 16166.29 0.1366006
## 2 785852947 28033.07 785852947 15948.20 0.1346524
## 3 817563411 28593.07 817563411 16330.32 0.1357601
## 4 831395080 28833.92 831395080 16562.02 0.1394863
## 5 834891344 28894.49 834891344 16433.43 0.1411505
## 6 855353503 29246.43 855353503 16214.29 0.1360603
## 7 964531790 31056.91 964531790 18528.77 0.1508808
## 8 1082786402 32905.72 1082786402 19292.46 0.1629754
## 9 5576283661 74674.52 5576283661 53874.93 0.3823492
## 10 5970428375 77268.55 5970428375 55906.02 0.3957125
## 11 6399425379 79996.41 6399425379 58049.22 0.4100616
##
## [11 rows x 6 columns]
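# We can also query the leader's cross-validated RMSE directly (output omitted):
h2o.rmse(aml@leader, xval = TRUE)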
# We can make a simple plot showing the relative importance of the top predictors in the lead
# model, a gradient boosting machine.
h2o.varimp_plot(aml@leader)

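# If we want the numbers behind the plot, the importance table itself is
# available too (output omitted):
h2o.varimp(aml@leader)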
# Ideally we'd check how the top model performs on the test data:
# h2o.performance(aml@leader, test) # Whoops: the Kaggle test set has no column for the response variable.
# Instead, we'll generate predictions on the test set using the top-performing model.
pred <- h2o.predict(aml@leader, test)
|=================================================================| 100%
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'MSZoning' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'Exterior1st' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'Exterior2nd' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'KitchenQual' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'Functional' has levels not trained on: [NA]
## Warning in doTryCatch(return(expr), name, parentenv, handler): Test/
## Validation dataset column 'SaleType' has levels not trained on: [NA]
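# These warnings are informational: a few test columns contain missing (NA)
# values the model never saw in training, and H2O applies its usual missing-value
# handling when scoring those rows. We can count the NAs ourselves (output omitted):
h2o.nacnt(test[, c("MSZoning", "Exterior1st", "Exterior2nd",
                   "KitchenQual", "Functional", "SaleType")])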
# Kaggle asks us to submit predictions in a specific format, so we'll turn the H2O frame into a
# tibble (the tidyverse's flavor of data frame) and rename the predict column "SalePrice",
# the predicted sale price for each home in the test set.
pred <- pred %>%
  as_tibble() %>%
  rename("SalePrice" = predict)
# Let's make a column of house IDs, taken from the test set itself (the test-set Ids continue
# where the training Ids leave off, so we shouldn't simply count from 1).
Id <- as.data.frame(test$Id)$Id
# Now add the predicted house price column next to the Id column.
pred <- cbind(Id, pred)
# Lastly, we can write our data frame of Ids and predictions to a .csv file named "SalePrices.csv".
write.csv(pred, file = "SalePrices.csv", row.names = FALSE)
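# Before uploading, a cheap sanity check on the file we just wrote (a sketch,
# assuming the Kaggle test set's 1,459 rows):
submission <- read_csv("SalePrices.csv")
stopifnot(nrow(submission) == 1459, !any(is.na(submission$SalePrice)))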