Source of data and information: https://www.kaggle.com/c/house-prices-advanced-regression-techniques
Load necessary libraries
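The setup chunk is not shown, so here is a minimal sketch of the packages the rest of the post relies on; the exact set, and the use of janitor::clean_names() (inferred from the snake_case column names), are assumptions.
library(tidyverse)   # data wrangling, readr, ggplot2
library(tidymodels)  # rsample, recipes, parsnip, workflows, tune, yardstick
library(skimr)       # skim() data summaries
library(janitor)     # clean_names() -- assumed, for the snake_case column names
library(ranger)      # engine for the random forest model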
Load data from Kaggle URL
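The loading chunk is likewise not shown; a sketch assuming the competition csv files have been downloaded as train.csv and test.csv (the paths are illustrative).
train <- read_csv("train.csv") %>% clean_names()
test  <- read_csv("test.csv") %>% clean_names()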
Detailed information about the train and test datasets
skim(train)
| Name | train |
|---|---|
| Number of rows | 1460 |
| Number of columns | 81 |
| Column type frequency: | |
| character | 43 |
| numeric | 38 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ms_zoning | 0 | 1.00 | 2 | 7 | 0 | 5 | 0 |
| street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| alley | 1369 | 0.06 | 4 | 4 | 0 | 2 | 0 |
| lot_shape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| land_contour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| utilities | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| lot_config | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
| land_slope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
| condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| condition2 | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| bldg_type | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| house_style | 0 | 1.00 | 4 | 6 | 0 | 8 | 0 |
| roof_style | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| roof_matl | 0 | 1.00 | 4 | 7 | 0 | 8 | 0 |
| exterior1st | 0 | 1.00 | 5 | 7 | 0 | 15 | 0 |
| exterior2nd | 0 | 1.00 | 5 | 7 | 0 | 16 | 0 |
| mas_vnr_type | 8 | 0.99 | 4 | 7 | 0 | 4 | 0 |
| exter_qual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| exter_cond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| bsmt_qual | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_cond | 37 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_exposure | 38 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_fin_type1 | 37 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| bsmt_fin_type2 | 38 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| heating | 0 | 1.00 | 4 | 5 | 0 | 6 | 0 |
| heating_qc | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| central_air | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| electrical | 1 | 1.00 | 3 | 5 | 0 | 5 | 0 |
| kitchen_qual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| functional | 0 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| fireplace_qu | 690 | 0.53 | 2 | 2 | 0 | 5 | 0 |
| garage_type | 81 | 0.94 | 6 | 7 | 0 | 6 | 0 |
| garage_finish | 81 | 0.94 | 3 | 3 | 0 | 3 | 0 |
| garage_qual | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| garage_cond | 81 | 0.94 | 2 | 2 | 0 | 5 | 0 |
| paved_drive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| pool_qc | 1453 | 0.00 | 2 | 2 | 0 | 3 | 0 |
| fence | 1179 | 0.19 | 4 | 5 | 0 | 4 | 0 |
| misc_feature | 1406 | 0.04 | 4 | 4 | 0 | 4 | 0 |
| sale_type | 0 | 1.00 | 2 | 5 | 0 | 9 | 0 |
| sale_condition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 730.50 | 421.61 | 1 | 365.75 | 730.5 | 1095.25 | 1460 | ▇▇▇▇▇ |
| ms_sub_class | 0 | 1.00 | 56.90 | 42.30 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
| lot_frontage | 259 | 0.82 | 70.05 | 24.28 | 21 | 59.00 | 69.0 | 80.00 | 313 | ▇▃▁▁▁ |
| lot_area | 0 | 1.00 | 10516.83 | 9981.26 | 1300 | 7553.50 | 9478.5 | 11601.50 | 215245 | ▇▁▁▁▁ |
| overall_qual | 0 | 1.00 | 6.10 | 1.38 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▂▇▅▁ |
| overall_cond | 0 | 1.00 | 5.58 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| year_built | 0 | 1.00 | 1971.27 | 30.20 | 1872 | 1954.00 | 1973.0 | 2000.00 | 2010 | ▁▂▃▆▇ |
| year_remod_add | 0 | 1.00 | 1984.87 | 20.65 | 1950 | 1967.00 | 1994.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
| mas_vnr_area | 8 | 0.99 | 103.69 | 181.07 | 0 | 0.00 | 0.0 | 166.00 | 1600 | ▇▁▁▁▁ |
| bsmt_fin_sf1 | 0 | 1.00 | 443.64 | 456.10 | 0 | 0.00 | 383.5 | 712.25 | 5644 | ▇▁▁▁▁ |
| bsmt_fin_sf2 | 0 | 1.00 | 46.55 | 161.32 | 0 | 0.00 | 0.0 | 0.00 | 1474 | ▇▁▁▁▁ |
| bsmt_unf_sf | 0 | 1.00 | 567.24 | 441.87 | 0 | 223.00 | 477.5 | 808.00 | 2336 | ▇▅▂▁▁ |
| total_bsmt_sf | 0 | 1.00 | 1057.43 | 438.71 | 0 | 795.75 | 991.5 | 1298.25 | 6110 | ▇▃▁▁▁ |
| x1st_flr_sf | 0 | 1.00 | 1162.63 | 386.59 | 334 | 882.00 | 1087.0 | 1391.25 | 4692 | ▇▅▁▁▁ |
| x2nd_flr_sf | 0 | 1.00 | 346.99 | 436.53 | 0 | 0.00 | 0.0 | 728.00 | 2065 | ▇▃▂▁▁ |
| low_qual_fin_sf | 0 | 1.00 | 5.84 | 48.62 | 0 | 0.00 | 0.0 | 0.00 | 572 | ▇▁▁▁▁ |
| gr_liv_area | 0 | 1.00 | 1515.46 | 525.48 | 334 | 1129.50 | 1464.0 | 1776.75 | 5642 | ▇▇▁▁▁ |
| bsmt_full_bath | 0 | 1.00 | 0.43 | 0.52 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
| bsmt_half_bath | 0 | 1.00 | 0.06 | 0.24 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
| full_bath | 0 | 1.00 | 1.57 | 0.55 | 0 | 1.00 | 2.0 | 2.00 | 3 | ▁▇▁▇▁ |
| half_bath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
| bedroom_abv_gr | 0 | 1.00 | 2.87 | 0.82 | 0 | 2.00 | 3.0 | 3.00 | 8 | ▁▇▂▁▁ |
| kitchen_abv_gr | 0 | 1.00 | 1.05 | 0.22 | 0 | 1.00 | 1.0 | 1.00 | 3 | ▁▇▁▁▁ |
| tot_rms_abv_grd | 0 | 1.00 | 6.52 | 1.63 | 2 | 5.00 | 6.0 | 7.00 | 14 | ▂▇▇▁▁ |
| fireplaces | 0 | 1.00 | 0.61 | 0.64 | 0 | 0.00 | 1.0 | 1.00 | 3 | ▇▇▁▁▁ |
| garage_yr_blt | 81 | 0.94 | 1978.51 | 24.69 | 1900 | 1961.00 | 1980.0 | 2002.00 | 2010 | ▁▁▅▅▇ |
| garage_cars | 0 | 1.00 | 1.77 | 0.75 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▃▇▂▁ |
| garage_area | 0 | 1.00 | 472.98 | 213.80 | 0 | 334.50 | 480.0 | 576.00 | 1418 | ▂▇▃▁▁ |
| wood_deck_sf | 0 | 1.00 | 94.24 | 125.34 | 0 | 0.00 | 0.0 | 168.00 | 857 | ▇▂▁▁▁ |
| open_porch_sf | 0 | 1.00 | 46.66 | 66.26 | 0 | 0.00 | 25.0 | 68.00 | 547 | ▇▁▁▁▁ |
| enclosed_porch | 0 | 1.00 | 21.95 | 61.12 | 0 | 0.00 | 0.0 | 0.00 | 552 | ▇▁▁▁▁ |
| x3ssn_porch | 0 | 1.00 | 3.41 | 29.32 | 0 | 0.00 | 0.0 | 0.00 | 508 | ▇▁▁▁▁ |
| screen_porch | 0 | 1.00 | 15.06 | 55.76 | 0 | 0.00 | 0.0 | 0.00 | 480 | ▇▁▁▁▁ |
| pool_area | 0 | 1.00 | 2.76 | 40.18 | 0 | 0.00 | 0.0 | 0.00 | 738 | ▇▁▁▁▁ |
| misc_val | 0 | 1.00 | 43.49 | 496.12 | 0 | 0.00 | 0.0 | 0.00 | 15500 | ▇▁▁▁▁ |
| mo_sold | 0 | 1.00 | 6.32 | 2.70 | 1 | 5.00 | 6.0 | 8.00 | 12 | ▃▆▇▃▃ |
| yr_sold | 0 | 1.00 | 2007.82 | 1.33 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▅ |
| sale_price | 0 | 1.00 | 180921.20 | 79442.50 | 34900 | 129975.00 | 163000.0 | 214000.00 | 755000 | ▇▅▁▁▁ |
skim(test)
| Name | test |
|---|---|
| Number of rows | 1459 |
| Number of columns | 80 |
| Column type frequency: | |
| character | 43 |
| numeric | 37 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ms_zoning | 4 | 1.00 | 2 | 7 | 0 | 5 | 0 |
| street | 0 | 1.00 | 4 | 4 | 0 | 2 | 0 |
| alley | 1352 | 0.07 | 4 | 4 | 0 | 2 | 0 |
| lot_shape | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| land_contour | 0 | 1.00 | 3 | 3 | 0 | 4 | 0 |
| utilities | 2 | 1.00 | 6 | 6 | 0 | 1 | 0 |
| lot_config | 0 | 1.00 | 3 | 7 | 0 | 5 | 0 |
| land_slope | 0 | 1.00 | 3 | 3 | 0 | 3 | 0 |
| neighborhood | 0 | 1.00 | 5 | 7 | 0 | 25 | 0 |
| condition1 | 0 | 1.00 | 4 | 6 | 0 | 9 | 0 |
| condition2 | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| bldg_type | 0 | 1.00 | 4 | 6 | 0 | 5 | 0 |
| house_style | 0 | 1.00 | 4 | 6 | 0 | 7 | 0 |
| roof_style | 0 | 1.00 | 3 | 7 | 0 | 6 | 0 |
| roof_matl | 0 | 1.00 | 7 | 7 | 0 | 4 | 0 |
| exterior1st | 1 | 1.00 | 6 | 7 | 0 | 13 | 0 |
| exterior2nd | 1 | 1.00 | 5 | 7 | 0 | 15 | 0 |
| mas_vnr_type | 16 | 0.99 | 4 | 7 | 0 | 4 | 0 |
| exter_qual | 0 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| exter_cond | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| foundation | 0 | 1.00 | 4 | 6 | 0 | 6 | 0 |
| bsmt_qual | 44 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_cond | 45 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_exposure | 44 | 0.97 | 2 | 2 | 0 | 4 | 0 |
| bsmt_fin_type1 | 42 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| bsmt_fin_type2 | 42 | 0.97 | 3 | 3 | 0 | 6 | 0 |
| heating | 0 | 1.00 | 4 | 4 | 0 | 4 | 0 |
| heating_qc | 0 | 1.00 | 2 | 2 | 0 | 5 | 0 |
| central_air | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
| electrical | 0 | 1.00 | 5 | 5 | 0 | 4 | 0 |
| kitchen_qual | 1 | 1.00 | 2 | 2 | 0 | 4 | 0 |
| functional | 2 | 1.00 | 3 | 4 | 0 | 7 | 0 |
| fireplace_qu | 730 | 0.50 | 2 | 2 | 0 | 5 | 0 |
| garage_type | 76 | 0.95 | 6 | 7 | 0 | 6 | 0 |
| garage_finish | 78 | 0.95 | 3 | 3 | 0 | 3 | 0 |
| garage_qual | 78 | 0.95 | 2 | 2 | 0 | 4 | 0 |
| garage_cond | 78 | 0.95 | 2 | 2 | 0 | 5 | 0 |
| paved_drive | 0 | 1.00 | 1 | 1 | 0 | 3 | 0 |
| pool_qc | 1456 | 0.00 | 2 | 2 | 0 | 2 | 0 |
| fence | 1169 | 0.20 | 4 | 5 | 0 | 4 | 0 |
| misc_feature | 1408 | 0.03 | 4 | 4 | 0 | 3 | 0 |
| sale_type | 1 | 1.00 | 2 | 5 | 0 | 9 | 0 |
| sale_condition | 0 | 1.00 | 6 | 7 | 0 | 6 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1.00 | 2190.00 | 421.32 | 1461 | 1825.50 | 2190.0 | 2554.50 | 2919 | ▇▇▇▇▇ |
| ms_sub_class | 0 | 1.00 | 57.38 | 42.75 | 20 | 20.00 | 50.0 | 70.00 | 190 | ▇▅▂▁▁ |
| lot_frontage | 227 | 0.84 | 68.58 | 22.38 | 21 | 58.00 | 67.0 | 80.00 | 200 | ▃▇▁▁▁ |
| lot_area | 0 | 1.00 | 9819.16 | 4955.52 | 1470 | 7391.00 | 9399.0 | 11517.50 | 56600 | ▇▂▁▁▁ |
| overall_qual | 0 | 1.00 | 6.08 | 1.44 | 1 | 5.00 | 6.0 | 7.00 | 10 | ▁▁▇▅▁ |
| overall_cond | 0 | 1.00 | 5.55 | 1.11 | 1 | 5.00 | 5.0 | 6.00 | 9 | ▁▁▇▅▁ |
| year_built | 0 | 1.00 | 1971.36 | 30.39 | 1879 | 1953.00 | 1973.0 | 2001.00 | 2010 | ▁▂▃▆▇ |
| year_remod_add | 0 | 1.00 | 1983.66 | 21.13 | 1950 | 1963.00 | 1992.0 | 2004.00 | 2010 | ▅▂▂▃▇ |
| mas_vnr_area | 15 | 0.99 | 100.71 | 177.63 | 0 | 0.00 | 0.0 | 164.00 | 1290 | ▇▁▁▁▁ |
| bsmt_fin_sf1 | 1 | 1.00 | 439.20 | 455.27 | 0 | 0.00 | 350.5 | 753.50 | 4010 | ▇▂▁▁▁ |
| bsmt_fin_sf2 | 1 | 1.00 | 52.62 | 176.75 | 0 | 0.00 | 0.0 | 0.00 | 1526 | ▇▁▁▁▁ |
| bsmt_unf_sf | 1 | 1.00 | 554.29 | 437.26 | 0 | 219.25 | 460.0 | 797.75 | 2140 | ▇▆▂▁▁ |
| total_bsmt_sf | 1 | 1.00 | 1046.12 | 442.90 | 0 | 784.00 | 988.0 | 1305.00 | 5095 | ▇▇▁▁▁ |
| x1st_flr_sf | 0 | 1.00 | 1156.53 | 398.17 | 407 | 873.50 | 1079.0 | 1382.50 | 5095 | ▇▃▁▁▁ |
| x2nd_flr_sf | 0 | 1.00 | 325.97 | 420.61 | 0 | 0.00 | 0.0 | 676.00 | 1862 | ▇▃▂▁▁ |
| low_qual_fin_sf | 0 | 1.00 | 3.54 | 44.04 | 0 | 0.00 | 0.0 | 0.00 | 1064 | ▇▁▁▁▁ |
| gr_liv_area | 0 | 1.00 | 1486.05 | 485.57 | 407 | 1117.50 | 1432.0 | 1721.00 | 5095 | ▇▇▁▁▁ |
| bsmt_full_bath | 2 | 1.00 | 0.43 | 0.53 | 0 | 0.00 | 0.0 | 1.00 | 3 | ▇▆▁▁▁ |
| bsmt_half_bath | 2 | 1.00 | 0.07 | 0.25 | 0 | 0.00 | 0.0 | 0.00 | 2 | ▇▁▁▁▁ |
| full_bath | 0 | 1.00 | 1.57 | 0.56 | 0 | 1.00 | 2.0 | 2.00 | 4 | ▁▇▇▁▁ |
| half_bath | 0 | 1.00 | 0.38 | 0.50 | 0 | 0.00 | 0.0 | 1.00 | 2 | ▇▁▅▁▁ |
| bedroom_abv_gr | 0 | 1.00 | 2.85 | 0.83 | 0 | 2.00 | 3.0 | 3.00 | 6 | ▁▃▇▂▁ |
| kitchen_abv_gr | 0 | 1.00 | 1.04 | 0.21 | 0 | 1.00 | 1.0 | 1.00 | 2 | ▁▁▇▁▁ |
| tot_rms_abv_grd | 0 | 1.00 | 6.39 | 1.51 | 3 | 5.00 | 6.0 | 7.00 | 15 | ▅▇▃▁▁ |
| fireplaces | 0 | 1.00 | 0.58 | 0.65 | 0 | 0.00 | 0.0 | 1.00 | 4 | ▇▇▁▁▁ |
| garage_yr_blt | 78 | 0.95 | 1977.72 | 26.43 | 1895 | 1959.00 | 1979.0 | 2002.00 | 2207 | ▂▇▁▁▁ |
| garage_cars | 1 | 1.00 | 1.77 | 0.78 | 0 | 1.00 | 2.0 | 2.00 | 5 | ▅▇▂▁▁ |
| garage_area | 1 | 1.00 | 472.77 | 217.05 | 0 | 318.00 | 480.0 | 576.00 | 1488 | ▃▇▃▁▁ |
| wood_deck_sf | 0 | 1.00 | 93.17 | 127.74 | 0 | 0.00 | 0.0 | 168.00 | 1424 | ▇▁▁▁▁ |
| open_porch_sf | 0 | 1.00 | 48.31 | 68.88 | 0 | 0.00 | 28.0 | 72.00 | 742 | ▇▁▁▁▁ |
| enclosed_porch | 0 | 1.00 | 24.24 | 67.23 | 0 | 0.00 | 0.0 | 0.00 | 1012 | ▇▁▁▁▁ |
| x3ssn_porch | 0 | 1.00 | 1.79 | 20.21 | 0 | 0.00 | 0.0 | 0.00 | 360 | ▇▁▁▁▁ |
| screen_porch | 0 | 1.00 | 17.06 | 56.61 | 0 | 0.00 | 0.0 | 0.00 | 576 | ▇▁▁▁▁ |
| pool_area | 0 | 1.00 | 1.74 | 30.49 | 0 | 0.00 | 0.0 | 0.00 | 800 | ▇▁▁▁▁ |
| misc_val | 0 | 1.00 | 58.17 | 630.81 | 0 | 0.00 | 0.0 | 0.00 | 17000 | ▇▁▁▁▁ |
| mo_sold | 0 | 1.00 | 6.10 | 2.72 | 1 | 4.00 | 6.0 | 8.00 | 12 | ▅▆▇▃▃ |
| yr_sold | 0 | 1.00 | 2007.77 | 1.30 | 2006 | 2007.00 | 2008.0 | 2009.00 | 2010 | ▇▇▇▇▃ |
The train and test datasets contain many missing values. To build and run forecasting models smoothly, we either need to impute the missing data or drop the observations that contain it. To retain as many observations as possible, I choose to impute the missing values.
train <- train %>%
  select(id, sale_price, overall_qual, year_built, year_remod_add, mas_vnr_area,
         total_bsmt_sf, x1st_flr_sf, gr_liv_area, full_bath, tot_rms_abv_grd, fireplaces,
         garage_yr_blt, garage_cars, garage_area, ms_zoning, street, condition1,
         house_style, mas_vnr_type, exter_qual, bsmt_cond, bsmt_qual, bsmt_exposure) %>%
  # NA in the basement variables means "no basement", so recode it as "none"
  mutate(
    bsmt_qual = case_when(is.na(bsmt_qual) ~ "none",
                          TRUE ~ bsmt_qual),
    bsmt_cond = case_when(is.na(bsmt_cond) ~ "none",
                          TRUE ~ bsmt_cond),
    bsmt_exposure = case_when(is.na(bsmt_exposure) ~ "none",
                              TRUE ~ bsmt_exposure)) %>%
  # log-transform the outcome to reduce skew
  mutate(sale_price = log(sale_price))
OK, let's build our model.
# split our data into training and testing sets
set.seed(123)
split <- initial_split(train)
training <- training(split)
testing <- testing(split)
Create cross-validation folds for model validation
set.seed(234)
resample <- vfold_cv(training)  # 10-fold cross-validation; note vfold_cv() has no prop argument (that belongs to initial_split())
We build a recipe to preprocess our data: set variable roles, convert strings to factors, drop zero-variance predictors, and impute missing values.
rec <- recipe(sale_price ~ ., data = training) %>%
  update_role(id, new_role = "id") %>%           # keep id for bookkeeping, not as a predictor
  step_string2factor(all_nominal()) %>%          # convert character columns to factors
  step_zv(all_numeric()) %>%                     # drop zero-variance numeric columns
  step_knnimpute(mas_vnr_type) %>%               # k-nearest-neighbor imputation for mas_vnr_type
  step_meanimpute(garage_yr_blt, mas_vnr_area,   # mean imputation for the numeric predictors
                  garage_cars, garage_area)
rec
## Data Recipe
##
## Inputs:
##
## role #variables
## id 1
## outcome 1
## predictor 22
##
## Operations:
##
## Factor variables from all_nominal()
## Zero variance filter on all_numeric()
## K-nearest neighbor imputation for mas_vnr_type
## Mean Imputation for garage_yr_blt, mas_vnr_area, ...
Build a random forest model using the ranger engine and prepare a grid for tuning.
# Model specification
rf_model <- rand_forest(mtry = tune(),
trees = 750,
min_n = tune()) %>%
set_mode("regression") %>%
set_engine("ranger")
# Tuning grid
rf_grid <- grid_regular(mtry(range = c(10, 20)),
min_n(range = c(1, 8)),
levels = 5)
rf_model
## Random Forest Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = 750
## min_n = tune()
##
## Computational engine: ranger
rf_grid
## # A tibble: 25 x 2
## mtry min_n
## <int> <int>
## 1 10 1
## 2 12 1
## 3 15 1
## 4 17 1
## 5 20 1
## 6 10 2
## 7 12 2
## 8 15 2
## 9 17 2
## 10 20 2
## # ... with 15 more rows
Now, we put the recipe and the model into a workflow.
rf_wf <- workflow() %>%
add_recipe(rec) %>%
add_model(rf_model)
It’s time to tune our model
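The tuning chunk itself is not shown, so here is a minimal sketch of the call that would produce the fit object used below; the seed value and the parallel backend are assumptions, but the resamples, grid, and metrics (mae, rmse, rsq) match the output that follows.
doParallel::registerDoParallel()  # assumed: parallelize tuning across folds (optional)
set.seed(345)                     # assumed seed value
fit <- tune_grid(rf_wf,
                 resamples = resample,
                 grid = rf_grid,
                 metrics = metric_set(mae, rmse, rsq))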
Now, we see how well our model performs.
fit %>% collect_metrics()
## # A tibble: 75 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 10 1 mae standard 0.0992 9 0.00509 Preprocessor1_Model01
## 2 10 1 rmse standard 0.143 9 0.00984 Preprocessor1_Model01
## 3 10 1 rsq standard 0.871 9 0.0137 Preprocessor1_Model01
## 4 12 1 mae standard 0.0994 9 0.00510 Preprocessor1_Model02
## 5 12 1 rmse standard 0.143 9 0.00981 Preprocessor1_Model02
## 6 12 1 rsq standard 0.870 9 0.0136 Preprocessor1_Model02
## 7 15 1 mae standard 0.100 9 0.00495 Preprocessor1_Model03
## 8 15 1 rmse standard 0.145 9 0.00954 Preprocessor1_Model03
## 9 15 1 rsq standard 0.868 9 0.0133 Preprocessor1_Model03
## 10 17 1 mae standard 0.101 9 0.00515 Preprocessor1_Model04
## # ... with 65 more rows
fit %>% show_best("rsq")
## # A tibble: 5 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 10 6 rsq standard 0.871 9 0.0131 Preprocessor1_Model16
## 2 10 4 rsq standard 0.871 9 0.0133 Preprocessor1_Model11
## 3 10 8 rsq standard 0.871 9 0.0131 Preprocessor1_Model21
## 4 10 2 rsq standard 0.871 9 0.0136 Preprocessor1_Model06
## 5 10 1 rsq standard 0.871 9 0.0137 Preprocessor1_Model01
fit %>% show_best("rmse")
## # A tibble: 5 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 10 6 rmse standard 0.143 9 0.00961 Preprocessor1_Model16
## 2 10 4 rmse standard 0.143 9 0.00976 Preprocessor1_Model11
## 3 10 8 rmse standard 0.143 9 0.00969 Preprocessor1_Model21
## 4 12 4 rmse standard 0.143 9 0.00966 Preprocessor1_Model12
## 5 10 2 rmse standard 0.143 9 0.00985 Preprocessor1_Model06
best_RMSE <- select_best(fit, "rmse") # select the best model by RMSE
best_tunned_model <- finalize_model(rf_model,
best_RMSE)
tunned_wf <- workflow() %>%
add_recipe(rec) %>%
add_model(best_tunned_model)
last_fit <- tunned_wf %>% last_fit(split,
metrics = metric_set(mae, rmse, rsq))
last_fit %>% collect_metrics()
## # A tibble: 3 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 mae standard 0.114 Preprocessor1_Model1
## 2 rmse standard 0.169 Preprocessor1_Model1
## 3 rsq standard 0.834 Preprocessor1_Model1
last_fit %>% collect_predictions()
## # A tibble: 365 x 5
## id .pred .row sale_price .config
## <chr> <dbl> <int> <dbl> <chr>
## 1 train/test split 12.3 8 12.2 Preprocessor1_Model1
## 2 train/test split 11.6 10 11.7 Preprocessor1_Model1
## 3 train/test split 11.8 11 11.8 Preprocessor1_Model1
## 4 train/test split 11.7 13 11.9 Preprocessor1_Model1
## 5 train/test split 11.9 16 11.8 Preprocessor1_Model1
## 6 train/test split 11.9 19 12.0 Preprocessor1_Model1
## 7 train/test split 11.9 24 11.8 Preprocessor1_Model1
## 8 train/test split 12.5 26 12.5 Preprocessor1_Model1
## 9 train/test split 11.5 31 10.6 Preprocessor1_Model1
## 10 train/test split 11.9 34 12.0 Preprocessor1_Model1
## # ... with 355 more rows
last_fit %>% collect_predictions() %>%
  ggplot(aes(.pred, sale_price)) +
  geom_point()
Save the fitted workflow so we can predict on the test data
saved_model <- last_fit$.workflow[[1]]
test <- test %>%
  select(id, overall_qual, year_built, year_remod_add, mas_vnr_area,
         total_bsmt_sf, x1st_flr_sf, gr_liv_area, full_bath, tot_rms_abv_grd, fireplaces,
         garage_yr_blt, garage_cars, garage_area, ms_zoning, street, condition1,
         house_style, mas_vnr_type, exter_qual, bsmt_cond, bsmt_qual, bsmt_exposure) %>%
  # NA in the basement variables means "no basement", so recode it as "none"
  mutate(
    bsmt_qual = case_when(is.na(bsmt_qual) ~ "none",
                          TRUE ~ bsmt_qual),
    bsmt_cond = case_when(is.na(bsmt_cond) ~ "none",
                          TRUE ~ bsmt_cond),
    bsmt_exposure = case_when(is.na(bsmt_exposure) ~ "none",
                              TRUE ~ bsmt_exposure)) %>%
  # add a placeholder outcome so the recipe formula can be applied to the test set
  mutate(sale_price = 0)
# process the test data to impute missing data
rec2 <- recipe(sale_price~. , data = test) %>%
update_role(id, new_role = "id") %>%
step_string2factor(all_nominal()) %>%
step_zv(all_numeric()) %>%
step_knnimpute(mas_vnr_type,ms_zoning) %>%
step_meanimpute(garage_yr_blt, mas_vnr_area,
garage_cars,garage_area,total_bsmt_sf)
# prep the recipe and bake it to obtain the imputed test data
prep2 <- prep(rec2)
test2 <- bake(prep2, new_data=NULL)
Finally, we can use the fitted model to predict house prices on the test data.
pred_result <- predict(saved_model, test2)
test2$sale_price <- pred_result$.pred
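One caveat: because we modeled log(sale_price), these predictions are on the log scale. A minimal sketch of writing a submission file after back-transforming (this chunk is not part of the original post; the file name is illustrative, the column names follow the competition's sample submission):
submission <- tibble(Id = test2$id, SalePrice = exp(test2$sale_price))
write_csv(submission, "submission.csv")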