Source of data and information. https://www.kaggle.com/c/house-prices-advanced-regression-techniques

Load necessary libraries

Data loading and exploration

Load data from Kaggle URL

Detail infomation about the train dataset and test dataset

skim(train)
Data summary
Name train
Number of rows 1460
Number of columns 81
_______________________
Column type frequency:
character 43
numeric 38
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ms_zoning 0 1.00 2 7 0 5 0
street 0 1.00 4 4 0 2 0
alley 1369 0.06 4 4 0 2 0
lot_shape 0 1.00 3 3 0 4 0
land_contour 0 1.00 3 3 0 4 0
utilities 0 1.00 6 6 0 2 0
lot_config 0 1.00 3 7 0 5 0
land_slope 0 1.00 3 3 0 3 0
neighborhood 0 1.00 5 7 0 25 0
condition1 0 1.00 4 6 0 9 0
condition2 0 1.00 4 6 0 8 0
bldg_type 0 1.00 4 6 0 5 0
house_style 0 1.00 4 6 0 8 0
roof_style 0 1.00 3 7 0 6 0
roof_matl 0 1.00 4 7 0 8 0
exterior1st 0 1.00 5 7 0 15 0
exterior2nd 0 1.00 5 7 0 16 0
mas_vnr_type 8 0.99 4 7 0 4 0
exter_qual 0 1.00 2 2 0 4 0
exter_cond 0 1.00 2 2 0 5 0
foundation 0 1.00 4 6 0 6 0
bsmt_qual 37 0.97 2 2 0 4 0
bsmt_cond 37 0.97 2 2 0 4 0
bsmt_exposure 38 0.97 2 2 0 4 0
bsmt_fin_type1 37 0.97 3 3 0 6 0
bsmt_fin_type2 38 0.97 3 3 0 6 0
heating 0 1.00 4 5 0 6 0
heating_qc 0 1.00 2 2 0 5 0
central_air 0 1.00 1 1 0 2 0
electrical 1 1.00 3 5 0 5 0
kitchen_qual 0 1.00 2 2 0 4 0
functional 0 1.00 3 4 0 7 0
fireplace_qu 690 0.53 2 2 0 5 0
garage_type 81 0.94 6 7 0 6 0
garage_finish 81 0.94 3 3 0 3 0
garage_qual 81 0.94 2 2 0 5 0
garage_cond 81 0.94 2 2 0 5 0
paved_drive 0 1.00 1 1 0 3 0
pool_qc 1453 0.00 2 2 0 3 0
fence 1179 0.19 4 5 0 4 0
misc_feature 1406 0.04 4 4 0 4 0
sale_type 0 1.00 2 5 0 9 0
sale_condition 0 1.00 6 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 730.50 421.61 1 365.75 730.5 1095.25 1460 ▇▇▇▇▇
ms_sub_class 0 1.00 56.90 42.30 20 20.00 50.0 70.00 190 ▇▅▂▁▁
lot_frontage 259 0.82 70.05 24.28 21 59.00 69.0 80.00 313 ▇▃▁▁▁
lot_area 0 1.00 10516.83 9981.26 1300 7553.50 9478.5 11601.50 215245 ▇▁▁▁▁
overall_qual 0 1.00 6.10 1.38 1 5.00 6.0 7.00 10 ▁▂▇▅▁
overall_cond 0 1.00 5.58 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
year_built 0 1.00 1971.27 30.20 1872 1954.00 1973.0 2000.00 2010 ▁▂▃▆▇
year_remod_add 0 1.00 1984.87 20.65 1950 1967.00 1994.0 2004.00 2010 ▅▂▂▃▇
mas_vnr_area 8 0.99 103.69 181.07 0 0.00 0.0 166.00 1600 ▇▁▁▁▁
bsmt_fin_sf1 0 1.00 443.64 456.10 0 0.00 383.5 712.25 5644 ▇▁▁▁▁
bsmt_fin_sf2 0 1.00 46.55 161.32 0 0.00 0.0 0.00 1474 ▇▁▁▁▁
bsmt_unf_sf 0 1.00 567.24 441.87 0 223.00 477.5 808.00 2336 ▇▅▂▁▁
total_bsmt_sf 0 1.00 1057.43 438.71 0 795.75 991.5 1298.25 6110 ▇▃▁▁▁
x1st_flr_sf 0 1.00 1162.63 386.59 334 882.00 1087.0 1391.25 4692 ▇▅▁▁▁
x2nd_flr_sf 0 1.00 346.99 436.53 0 0.00 0.0 728.00 2065 ▇▃▂▁▁
low_qual_fin_sf 0 1.00 5.84 48.62 0 0.00 0.0 0.00 572 ▇▁▁▁▁
gr_liv_area 0 1.00 1515.46 525.48 334 1129.50 1464.0 1776.75 5642 ▇▇▁▁▁
bsmt_full_bath 0 1.00 0.43 0.52 0 0.00 0.0 1.00 3 ▇▆▁▁▁
bsmt_half_bath 0 1.00 0.06 0.24 0 0.00 0.0 0.00 2 ▇▁▁▁▁
full_bath 0 1.00 1.57 0.55 0 1.00 2.0 2.00 3 ▁▇▁▇▁
half_bath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
bedroom_abv_gr 0 1.00 2.87 0.82 0 2.00 3.0 3.00 8 ▁▇▂▁▁
kitchen_abv_gr 0 1.00 1.05 0.22 0 1.00 1.0 1.00 3 ▁▇▁▁▁
tot_rms_abv_grd 0 1.00 6.52 1.63 2 5.00 6.0 7.00 14 ▂▇▇▁▁
fireplaces 0 1.00 0.61 0.64 0 0.00 1.0 1.00 3 ▇▇▁▁▁
garage_yr_blt 81 0.94 1978.51 24.69 1900 1961.00 1980.0 2002.00 2010 ▁▁▅▅▇
garage_cars 0 1.00 1.77 0.75 0 1.00 2.0 2.00 4 ▁▃▇▂▁
garage_area 0 1.00 472.98 213.80 0 334.50 480.0 576.00 1418 ▂▇▃▁▁
wood_deck_sf 0 1.00 94.24 125.34 0 0.00 0.0 168.00 857 ▇▂▁▁▁
open_porch_sf 0 1.00 46.66 66.26 0 0.00 25.0 68.00 547 ▇▁▁▁▁
enclosed_porch 0 1.00 21.95 61.12 0 0.00 0.0 0.00 552 ▇▁▁▁▁
x3ssn_porch 0 1.00 3.41 29.32 0 0.00 0.0 0.00 508 ▇▁▁▁▁
screen_porch 0 1.00 15.06 55.76 0 0.00 0.0 0.00 480 ▇▁▁▁▁
pool_area 0 1.00 2.76 40.18 0 0.00 0.0 0.00 738 ▇▁▁▁▁
misc_val 0 1.00 43.49 496.12 0 0.00 0.0 0.00 15500 ▇▁▁▁▁
mo_sold 0 1.00 6.32 2.70 1 5.00 6.0 8.00 12 ▃▆▇▃▃
yr_sold 0 1.00 2007.82 1.33 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▅
sale_price 0 1.00 180921.20 79442.50 34900 129975.00 163000.0 214000.00 755000 ▇▅▁▁▁
skim(test)
Data summary
Name test
Number of rows 1459
Number of columns 80
_______________________
Column type frequency:
character 43
numeric 37
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ms_zoning 4 1.00 2 7 0 5 0
street 0 1.00 4 4 0 2 0
alley 1352 0.07 4 4 0 2 0
lot_shape 0 1.00 3 3 0 4 0
land_contour 0 1.00 3 3 0 4 0
utilities 2 1.00 6 6 0 1 0
lot_config 0 1.00 3 7 0 5 0
land_slope 0 1.00 3 3 0 3 0
neighborhood 0 1.00 5 7 0 25 0
condition1 0 1.00 4 6 0 9 0
condition2 0 1.00 4 6 0 5 0
bldg_type 0 1.00 4 6 0 5 0
house_style 0 1.00 4 6 0 7 0
roof_style 0 1.00 3 7 0 6 0
roof_matl 0 1.00 7 7 0 4 0
exterior1st 1 1.00 6 7 0 13 0
exterior2nd 1 1.00 5 7 0 15 0
mas_vnr_type 16 0.99 4 7 0 4 0
exter_qual 0 1.00 2 2 0 4 0
exter_cond 0 1.00 2 2 0 5 0
foundation 0 1.00 4 6 0 6 0
bsmt_qual 44 0.97 2 2 0 4 0
bsmt_cond 45 0.97 2 2 0 4 0
bsmt_exposure 44 0.97 2 2 0 4 0
bsmt_fin_type1 42 0.97 3 3 0 6 0
bsmt_fin_type2 42 0.97 3 3 0 6 0
heating 0 1.00 4 4 0 4 0
heating_qc 0 1.00 2 2 0 5 0
central_air 0 1.00 1 1 0 2 0
electrical 0 1.00 5 5 0 4 0
kitchen_qual 1 1.00 2 2 0 4 0
functional 2 1.00 3 4 0 7 0
fireplace_qu 730 0.50 2 2 0 5 0
garage_type 76 0.95 6 7 0 6 0
garage_finish 78 0.95 3 3 0 3 0
garage_qual 78 0.95 2 2 0 4 0
garage_cond 78 0.95 2 2 0 5 0
paved_drive 0 1.00 1 1 0 3 0
pool_qc 1456 0.00 2 2 0 2 0
fence 1169 0.20 4 5 0 4 0
misc_feature 1408 0.03 4 4 0 3 0
sale_type 1 1.00 2 5 0 9 0
sale_condition 0 1.00 6 7 0 6 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1.00 2190.00 421.32 1461 1825.50 2190.0 2554.50 2919 ▇▇▇▇▇
ms_sub_class 0 1.00 57.38 42.75 20 20.00 50.0 70.00 190 ▇▅▂▁▁
lot_frontage 227 0.84 68.58 22.38 21 58.00 67.0 80.00 200 ▃▇▁▁▁
lot_area 0 1.00 9819.16 4955.52 1470 7391.00 9399.0 11517.50 56600 ▇▂▁▁▁
overall_qual 0 1.00 6.08 1.44 1 5.00 6.0 7.00 10 ▁▁▇▅▁
overall_cond 0 1.00 5.55 1.11 1 5.00 5.0 6.00 9 ▁▁▇▅▁
year_built 0 1.00 1971.36 30.39 1879 1953.00 1973.0 2001.00 2010 ▁▂▃▆▇
year_remod_add 0 1.00 1983.66 21.13 1950 1963.00 1992.0 2004.00 2010 ▅▂▂▃▇
mas_vnr_area 15 0.99 100.71 177.63 0 0.00 0.0 164.00 1290 ▇▁▁▁▁
bsmt_fin_sf1 1 1.00 439.20 455.27 0 0.00 350.5 753.50 4010 ▇▂▁▁▁
bsmt_fin_sf2 1 1.00 52.62 176.75 0 0.00 0.0 0.00 1526 ▇▁▁▁▁
bsmt_unf_sf 1 1.00 554.29 437.26 0 219.25 460.0 797.75 2140 ▇▆▂▁▁
total_bsmt_sf 1 1.00 1046.12 442.90 0 784.00 988.0 1305.00 5095 ▇▇▁▁▁
x1st_flr_sf 0 1.00 1156.53 398.17 407 873.50 1079.0 1382.50 5095 ▇▃▁▁▁
x2nd_flr_sf 0 1.00 325.97 420.61 0 0.00 0.0 676.00 1862 ▇▃▂▁▁
low_qual_fin_sf 0 1.00 3.54 44.04 0 0.00 0.0 0.00 1064 ▇▁▁▁▁
gr_liv_area 0 1.00 1486.05 485.57 407 1117.50 1432.0 1721.00 5095 ▇▇▁▁▁
bsmt_full_bath 2 1.00 0.43 0.53 0 0.00 0.0 1.00 3 ▇▆▁▁▁
bsmt_half_bath 2 1.00 0.07 0.25 0 0.00 0.0 0.00 2 ▇▁▁▁▁
full_bath 0 1.00 1.57 0.56 0 1.00 2.0 2.00 4 ▁▇▇▁▁
half_bath 0 1.00 0.38 0.50 0 0.00 0.0 1.00 2 ▇▁▅▁▁
bedroom_abv_gr 0 1.00 2.85 0.83 0 2.00 3.0 3.00 6 ▁▃▇▂▁
kitchen_abv_gr 0 1.00 1.04 0.21 0 1.00 1.0 1.00 2 ▁▁▇▁▁
tot_rms_abv_grd 0 1.00 6.39 1.51 3 5.00 6.0 7.00 15 ▅▇▃▁▁
fireplaces 0 1.00 0.58 0.65 0 0.00 0.0 1.00 4 ▇▇▁▁▁
garage_yr_blt 78 0.95 1977.72 26.43 1895 1959.00 1979.0 2002.00 2207 ▂▇▁▁▁
garage_cars 1 1.00 1.77 0.78 0 1.00 2.0 2.00 5 ▅▇▂▁▁
garage_area 1 1.00 472.77 217.05 0 318.00 480.0 576.00 1488 ▃▇▃▁▁
wood_deck_sf 0 1.00 93.17 127.74 0 0.00 0.0 168.00 1424 ▇▁▁▁▁
open_porch_sf 0 1.00 48.31 68.88 0 0.00 28.0 72.00 742 ▇▁▁▁▁
enclosed_porch 0 1.00 24.24 67.23 0 0.00 0.0 0.00 1012 ▇▁▁▁▁
x3ssn_porch 0 1.00 1.79 20.21 0 0.00 0.0 0.00 360 ▇▁▁▁▁
screen_porch 0 1.00 17.06 56.61 0 0.00 0.0 0.00 576 ▇▁▁▁▁
pool_area 0 1.00 1.74 30.49 0 0.00 0.0 0.00 800 ▇▁▁▁▁
misc_val 0 1.00 58.17 630.81 0 0.00 0.0 0.00 17000 ▇▁▁▁▁
mo_sold 0 1.00 6.10 2.72 1 4.00 6.0 8.00 12 ▅▆▇▃▃
yr_sold 0 1.00 2007.77 1.30 2006 2007.00 2008.0 2009.00 2010 ▇▇▇▇▃

Train and test dataset contains lots of missing values. To build and run forecasting models smoothly, we need to impute those missing data or get rid of observation containing missing value. To retain observations as much as possible, I choose to impute those missing value.

Choose variables which are used to predict

train <- train %>% 
  select (id, sale_price, overall_qual,year_built,year_remod_add,mas_vnr_area,
          total_bsmt_sf,x1st_flr_sf,gr_liv_area,full_bath,tot_rms_abv_grd,fireplaces,
          garage_yr_blt,garage_cars,garage_area, ms_zoning, street,condition1,
          house_style,mas_vnr_type,exter_qual, bsmt_cond,bsmt_qual,bsmt_exposure) %>% 
  mutate(
    bsmt_qual = case_when(is.na(bsmt_qual) == "TRUE" ~ "none",
                          TRUE ~ bsmt_qual),
    bsmt_cond = case_when(is.na(bsmt_cond) == "TRUE" ~ "none",
                          TRUE ~ bsmt_cond),
    bsmt_exposure = case_when(is.na(bsmt_exposure) == "TRUE" ~ "none",
                              TRUE ~ bsmt_exposure)) %>%
  mutate(sale_price = log(sale_price))

BUILD FORCASTING MODEL

OK, let build our model

Prepare data

# split the our data into training and testing data
set.seed(123)
split <- initial_split(train)
training <- training(split)
testing <- testing(split)

Resampling

Create validation fold for further validation activities

set.seed(234)
resample <- vfold_cv(training, prop = 0.7)

Prepare recipe

We build a recipe to preprocess our data such as setting role, imputing missing data …

rec <- recipe(sale_price~. , data = training) %>%
  update_role(id, new_role = "id") %>%
  step_string2factor(all_nominal()) %>%
  step_zv(all_numeric()) %>%
  step_knnimpute(mas_vnr_type) %>%
  step_meanimpute(garage_yr_blt, mas_vnr_area, 
                  garage_cars,garage_area)
rec
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          1
##    outcome          1
##  predictor         22
## 
## Operations:
## 
## Factor variables from all_nominal()
## Zero variance filter on all_numeric()
## K-nearest neighbor imputation for mas_vnr_type
## Mean Imputation for garage_yr_blt, mas_vnr_area, ...

Prepare engine

Build a random forest model using ranger engine and prepare a grid for tuning purpose

# MOdel engine
rf_model <- rand_forest(mtry = tune(),
                        trees = 750,
                        min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine("ranger")

# Tuning grid
rf_grid <- grid_regular(mtry(range = c(10, 20)),
                        min_n(range = c(1, 8)),
                        levels = 5)

rf_model
## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 750
##   min_n = tune()
## 
## Computational engine: ranger
rf_grid
## # A tibble: 25 x 2
##     mtry min_n
##    <int> <int>
##  1    10     1
##  2    12     1
##  3    15     1
##  4    17     1
##  5    20     1
##  6    10     2
##  7    12     2
##  8    15     2
##  9    17     2
## 10    20     2
## # ... with 15 more rows

Create workflows

Now, we put recipe and model into a workflows

rf_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(rf_model)

tune the model on training data

It’s time to tune our model

Now, we see how well our model perform

fit %>% collect_metrics()
## # A tibble: 75 x 8
##     mtry min_n .metric .estimator   mean     n std_err .config              
##    <int> <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
##  1    10     1 mae     standard   0.0992     9 0.00509 Preprocessor1_Model01
##  2    10     1 rmse    standard   0.143      9 0.00984 Preprocessor1_Model01
##  3    10     1 rsq     standard   0.871      9 0.0137  Preprocessor1_Model01
##  4    12     1 mae     standard   0.0994     9 0.00510 Preprocessor1_Model02
##  5    12     1 rmse    standard   0.143      9 0.00981 Preprocessor1_Model02
##  6    12     1 rsq     standard   0.870      9 0.0136  Preprocessor1_Model02
##  7    15     1 mae     standard   0.100      9 0.00495 Preprocessor1_Model03
##  8    15     1 rmse    standard   0.145      9 0.00954 Preprocessor1_Model03
##  9    15     1 rsq     standard   0.868      9 0.0133  Preprocessor1_Model03
## 10    17     1 mae     standard   0.101      9 0.00515 Preprocessor1_Model04
## # ... with 65 more rows
fit %>% show_best("rsq")
## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    10     6 rsq     standard   0.871     9  0.0131 Preprocessor1_Model16
## 2    10     4 rsq     standard   0.871     9  0.0133 Preprocessor1_Model11
## 3    10     8 rsq     standard   0.871     9  0.0131 Preprocessor1_Model21
## 4    10     2 rsq     standard   0.871     9  0.0136 Preprocessor1_Model06
## 5    10     1 rsq     standard   0.871     9  0.0137 Preprocessor1_Model01
fit %>% show_best("rmse")
## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    10     6 rmse    standard   0.143     9 0.00961 Preprocessor1_Model16
## 2    10     4 rmse    standard   0.143     9 0.00976 Preprocessor1_Model11
## 3    10     8 rmse    standard   0.143     9 0.00969 Preprocessor1_Model21
## 4    12     4 rmse    standard   0.143     9 0.00966 Preprocessor1_Model12
## 5    10     2 rmse    standard   0.143     9 0.00985 Preprocessor1_Model06
best_RMSE <- select_best(fit, "rmse") # select the best models

finalize the model

best_tunned_model <- finalize_model(rf_model,
                                    best_RMSE)

rebuild the model using the best_tunned_specs

tunned_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(best_tunned_model)

lastfit using the best_tunned_model

last_fit <- tunned_wf %>% last_fit(split,
                                   metrics = metric_set(mae, rmse, rsq))

assess model performance

last_fit %>% collect_metrics()
## # A tibble: 3 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 mae     standard       0.114 Preprocessor1_Model1
## 2 rmse    standard       0.169 Preprocessor1_Model1
## 3 rsq     standard       0.834 Preprocessor1_Model1
last_fit %>% collect_predictions()
## # A tibble: 365 x 5
##    id               .pred  .row sale_price .config             
##    <chr>            <dbl> <int>      <dbl> <chr>               
##  1 train/test split  12.3     8       12.2 Preprocessor1_Model1
##  2 train/test split  11.6    10       11.7 Preprocessor1_Model1
##  3 train/test split  11.8    11       11.8 Preprocessor1_Model1
##  4 train/test split  11.7    13       11.9 Preprocessor1_Model1
##  5 train/test split  11.9    16       11.8 Preprocessor1_Model1
##  6 train/test split  11.9    19       12.0 Preprocessor1_Model1
##  7 train/test split  11.9    24       11.8 Preprocessor1_Model1
##  8 train/test split  12.5    26       12.5 Preprocessor1_Model1
##  9 train/test split  11.5    31       10.6 Preprocessor1_Model1
## 10 train/test split  11.9    34       12.0 Preprocessor1_Model1
## # ... with 355 more rows
last_fit %>% collect_predictions() %>%
  ggplot(aes(.pred, sale_price))+
  geom_point() 

save the model

Save the model to predict on test dat

saved_model <- last_fit$.workflow[[1]]

Preprocess test data

test <- test %>%
  select (id, overall_qual,year_built,year_remod_add,mas_vnr_area,
          total_bsmt_sf,x1st_flr_sf,gr_liv_area,full_bath,tot_rms_abv_grd,fireplaces,
          garage_yr_blt,garage_cars,garage_area, ms_zoning, street,condition1,
          house_style,mas_vnr_type,exter_qual, bsmt_cond,bsmt_qual,bsmt_exposure) %>% 
  mutate(
    bsmt_qual = case_when(is.na(bsmt_qual) == "TRUE" ~ "none",
                          TRUE ~ bsmt_qual),
    bsmt_cond = case_when(is.na(bsmt_cond) == "TRUE" ~ "none",
                          TRUE ~ bsmt_cond),
    bsmt_exposure = case_when(is.na(bsmt_exposure) == "TRUE" ~ "none",
                              TRUE ~ bsmt_exposure)) %>%
  mutate(sale_price=0)
  
# process the test data to impute missing data

rec2 <- recipe(sale_price~. , data = test) %>%
  update_role(id, new_role = "id") %>%
  step_string2factor(all_nominal()) %>%
  step_zv(all_numeric()) %>%
  step_knnimpute(mas_vnr_type,ms_zoning) %>%
  step_meanimpute(garage_yr_blt, mas_vnr_area, 
                  garage_cars,garage_area,total_bsmt_sf)

# check prep recipe
prep2 <- prep(rec2)
test2 <-  bake(prep2, new_data=NULL)

Predict on test data

Finally, we can fit our model on test data to predict the house price

pred_result <- predict(saved_model, test2)
test2$sale_price <- pred_result$.pred