Source of data and information. https://www.kaggle.com/c/house-prices-advanced-regression-techniques

Load necessary libraries

Data loading and exploration

Load data from Kaggle URL

Detail infomation about the train dataset and test dataset

skim(train)

Data summary
Name	train
Number of rows	1460
Number of columns	81
_______________________
Column type frequency:
character	43
numeric	38
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ms_zoning	0	1.00	2	7	5
street	0	1.00	4	4	2
alley	1369	0.06	4	4	2
lot_shape	0	1.00	3	3	4
land_contour	0	1.00	3	3	4
utilities	0	1.00	6	6	2
lot_config	0	1.00	3	7	5
land_slope	0	1.00	3	3	3
neighborhood	0	1.00	5	7	25
condition1	0	1.00	4	6	9
condition2	0	1.00	4	6	8
bldg_type	0	1.00	4	6	5
house_style	0	1.00	4	6	8
roof_style	0	1.00	3	7	6
roof_matl	0	1.00	4	7	8
exterior1st	0	1.00	5	7	15
exterior2nd	0	1.00	5	7	16
mas_vnr_type	8	0.99	4	7	4
exter_qual	0	1.00	2	2	4
exter_cond	0	1.00	2	2	5
foundation	0	1.00	4	6	6
bsmt_qual	37	0.97	2	2	4
bsmt_cond	37	0.97	2	2	4
bsmt_exposure	38	0.97	2	2	4
bsmt_fin_type1	37	0.97	3	3	6
bsmt_fin_type2	38	0.97	3	3	6
heating	0	1.00	4	5	6
heating_qc	0	1.00	2	2	5
central_air	0	1.00	1	1	2
electrical	1	1.00	3	5	5
kitchen_qual	0	1.00	2	2	4
functional	0	1.00	3	4	7
fireplace_qu	690	0.53	2	2	5
garage_type	81	0.94	6	7	6
garage_finish	81	0.94	3	3	3
garage_qual	81	0.94	2	2	5
garage_cond	81	0.94	2	2	5
paved_drive	0	1.00	1	1	3
pool_qc	1453	0.00	2	2	3
fence	1179	0.19	4	5	4
misc_feature	1406	0.04	4	4	4
sale_type	0	1.00	2	5	9
sale_condition	0	1.00	6	7	6

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	730.50	421.61	1	365.75	730.5	1095.25	1460	▇▇▇▇▇
ms_sub_class	0	1.00	56.90	42.30	20	20.00	50.0	70.00	190	▇▅▂▁▁
lot_frontage	259	0.82	70.05	24.28	21	59.00	69.0	80.00	313	▇▃▁▁▁
lot_area	0	1.00	10516.83	9981.26	1300	7553.50	9478.5	11601.50	215245	▇▁▁▁▁
overall_qual	0	1.00	6.10	1.38	1	5.00	6.0	7.00	10	▁▂▇▅▁
overall_cond	0	1.00	5.58	1.11	1	5.00	5.0	6.00	9	▁▁▇▅▁
year_built	0	1.00	1971.27	30.20	1872	1954.00	1973.0	2000.00	2010	▁▂▃▆▇
year_remod_add	0	1.00	1984.87	20.65	1950	1967.00	1994.0	2004.00	2010	▅▂▂▃▇
mas_vnr_area	8	0.99	103.69	181.07	0	0.00	0.0	166.00	1600	▇▁▁▁▁
bsmt_fin_sf1	0	1.00	443.64	456.10	0	0.00	383.5	712.25	5644	▇▁▁▁▁
bsmt_fin_sf2	0	1.00	46.55	161.32	0	0.00	0.0	0.00	1474	▇▁▁▁▁
bsmt_unf_sf	0	1.00	567.24	441.87	0	223.00	477.5	808.00	2336	▇▅▂▁▁
total_bsmt_sf	0	1.00	1057.43	438.71	0	795.75	991.5	1298.25	6110	▇▃▁▁▁
x1st_flr_sf	0	1.00	1162.63	386.59	334	882.00	1087.0	1391.25	4692	▇▅▁▁▁
x2nd_flr_sf	0	1.00	346.99	436.53	0	0.00	0.0	728.00	2065	▇▃▂▁▁
low_qual_fin_sf	0	1.00	5.84	48.62	0	0.00	0.0	0.00	572	▇▁▁▁▁
gr_liv_area	0	1.00	1515.46	525.48	334	1129.50	1464.0	1776.75	5642	▇▇▁▁▁
bsmt_full_bath	0	1.00	0.43	0.52	0	0.00	0.0	1.00	3	▇▆▁▁▁
bsmt_half_bath	0	1.00	0.06	0.24	0	0.00	0.0	0.00	2	▇▁▁▁▁
full_bath	0	1.00	1.57	0.55	0	1.00	2.0	2.00	3	▁▇▁▇▁
half_bath	0	1.00	0.38	0.50	0	0.00	0.0	1.00	2	▇▁▅▁▁
bedroom_abv_gr	0	1.00	2.87	0.82	0	2.00	3.0	3.00	8	▁▇▂▁▁
kitchen_abv_gr	0	1.00	1.05	0.22	0	1.00	1.0	1.00	3	▁▇▁▁▁
tot_rms_abv_grd	0	1.00	6.52	1.63	2	5.00	6.0	7.00	14	▂▇▇▁▁
fireplaces	0	1.00	0.61	0.64	0	0.00	1.0	1.00	3	▇▇▁▁▁
garage_yr_blt	81	0.94	1978.51	24.69	1900	1961.00	1980.0	2002.00	2010	▁▁▅▅▇
garage_cars	0	1.00	1.77	0.75	0	1.00	2.0	2.00	4	▁▃▇▂▁
garage_area	0	1.00	472.98	213.80	0	334.50	480.0	576.00	1418	▂▇▃▁▁
wood_deck_sf	0	1.00	94.24	125.34	0	0.00	0.0	168.00	857	▇▂▁▁▁
open_porch_sf	0	1.00	46.66	66.26	0	0.00	25.0	68.00	547	▇▁▁▁▁
enclosed_porch	0	1.00	21.95	61.12	0	0.00	0.0	0.00	552	▇▁▁▁▁
x3ssn_porch	0	1.00	3.41	29.32	0	0.00	0.0	0.00	508	▇▁▁▁▁
screen_porch	0	1.00	15.06	55.76	0	0.00	0.0	0.00	480	▇▁▁▁▁
pool_area	0	1.00	2.76	40.18	0	0.00	0.0	0.00	738	▇▁▁▁▁
misc_val	0	1.00	43.49	496.12	0	0.00	0.0	0.00	15500	▇▁▁▁▁
mo_sold	0	1.00	6.32	2.70	1	5.00	6.0	8.00	12	▃▆▇▃▃
yr_sold	0	1.00	2007.82	1.33	2006	2007.00	2008.0	2009.00	2010	▇▇▇▇▅
sale_price	0	1.00	180921.20	79442.50	34900	129975.00	163000.0	214000.00	755000	▇▅▁▁▁

skim(test)

Data summary
Name	test
Number of rows	1459
Number of columns	80
_______________________
Column type frequency:
character	43
numeric	37
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
ms_zoning	4	1.00	2	7	5
street	0	1.00	4	4	2
alley	1352	0.07	4	4	2
lot_shape	0	1.00	3	3	4
land_contour	0	1.00	3	3	4
utilities	2	1.00	6	6	1
lot_config	0	1.00	3	7	5
land_slope	0	1.00	3	3	3
neighborhood	0	1.00	5	7	25
condition1	0	1.00	4	6	9
condition2	0	1.00	4	6	5
bldg_type	0	1.00	4	6	5
house_style	0	1.00	4	6	7
roof_style	0	1.00	3	7	6
roof_matl	0	1.00	7	7	4
exterior1st	1	1.00	6	7	13
exterior2nd	1	1.00	5	7	15
mas_vnr_type	16	0.99	4	7	4
exter_qual	0	1.00	2	2	4
exter_cond	0	1.00	2	2	5
foundation	0	1.00	4	6	6
bsmt_qual	44	0.97	2	2	4
bsmt_cond	45	0.97	2	2	4
bsmt_exposure	44	0.97	2	2	4
bsmt_fin_type1	42	0.97	3	3	6
bsmt_fin_type2	42	0.97	3	3	6
heating	0	1.00	4	4	4
heating_qc	0	1.00	2	2	5
central_air	0	1.00	1	1	2
electrical	0	1.00	5	5	4
kitchen_qual	1	1.00	2	2	4
functional	2	1.00	3	4	7
fireplace_qu	730	0.50	2	2	5
garage_type	76	0.95	6	7	6
garage_finish	78	0.95	3	3	3
garage_qual	78	0.95	2	2	4
garage_cond	78	0.95	2	2	5
paved_drive	0	1.00	1	1	3
pool_qc	1456	0.00	2	2	2
fence	1169	0.20	4	5	4
misc_feature	1408	0.03	4	4	3
sale_type	1	1.00	2	5	9
sale_condition	0	1.00	6	7	6

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
id	0	1.00	2190.00	421.32	1461	1825.50	2190.0	2554.50	2919	▇▇▇▇▇
ms_sub_class	0	1.00	57.38	42.75	20	20.00	50.0	70.00	190	▇▅▂▁▁
lot_frontage	227	0.84	68.58	22.38	21	58.00	67.0	80.00	200	▃▇▁▁▁
lot_area	0	1.00	9819.16	4955.52	1470	7391.00	9399.0	11517.50	56600	▇▂▁▁▁
overall_qual	0	1.00	6.08	1.44	1	5.00	6.0	7.00	10	▁▁▇▅▁
overall_cond	0	1.00	5.55	1.11	1	5.00	5.0	6.00	9	▁▁▇▅▁
year_built	0	1.00	1971.36	30.39	1879	1953.00	1973.0	2001.00	2010	▁▂▃▆▇
year_remod_add	0	1.00	1983.66	21.13	1950	1963.00	1992.0	2004.00	2010	▅▂▂▃▇
mas_vnr_area	15	0.99	100.71	177.63	0	0.00	0.0	164.00	1290	▇▁▁▁▁
bsmt_fin_sf1	1	1.00	439.20	455.27	0	0.00	350.5	753.50	4010	▇▂▁▁▁
bsmt_fin_sf2	1	1.00	52.62	176.75	0	0.00	0.0	0.00	1526	▇▁▁▁▁
bsmt_unf_sf	1	1.00	554.29	437.26	0	219.25	460.0	797.75	2140	▇▆▂▁▁
total_bsmt_sf	1	1.00	1046.12	442.90	0	784.00	988.0	1305.00	5095	▇▇▁▁▁
x1st_flr_sf	0	1.00	1156.53	398.17	407	873.50	1079.0	1382.50	5095	▇▃▁▁▁
x2nd_flr_sf	0	1.00	325.97	420.61	0	0.00	0.0	676.00	1862	▇▃▂▁▁
low_qual_fin_sf	0	1.00	3.54	44.04	0	0.00	0.0	0.00	1064	▇▁▁▁▁
gr_liv_area	0	1.00	1486.05	485.57	407	1117.50	1432.0	1721.00	5095	▇▇▁▁▁
bsmt_full_bath	2	1.00	0.43	0.53	0	0.00	0.0	1.00	3	▇▆▁▁▁
bsmt_half_bath	2	1.00	0.07	0.25	0	0.00	0.0	0.00	2	▇▁▁▁▁
full_bath	0	1.00	1.57	0.56	0	1.00	2.0	2.00	4	▁▇▇▁▁
half_bath	0	1.00	0.38	0.50	0	0.00	0.0	1.00	2	▇▁▅▁▁
bedroom_abv_gr	0	1.00	2.85	0.83	0	2.00	3.0	3.00	6	▁▃▇▂▁
kitchen_abv_gr	0	1.00	1.04	0.21	0	1.00	1.0	1.00	2	▁▁▇▁▁
tot_rms_abv_grd	0	1.00	6.39	1.51	3	5.00	6.0	7.00	15	▅▇▃▁▁
fireplaces	0	1.00	0.58	0.65	0	0.00	0.0	1.00	4	▇▇▁▁▁
garage_yr_blt	78	0.95	1977.72	26.43	1895	1959.00	1979.0	2002.00	2207	▂▇▁▁▁
garage_cars	1	1.00	1.77	0.78	0	1.00	2.0	2.00	5	▅▇▂▁▁
garage_area	1	1.00	472.77	217.05	0	318.00	480.0	576.00	1488	▃▇▃▁▁
wood_deck_sf	0	1.00	93.17	127.74	0	0.00	0.0	168.00	1424	▇▁▁▁▁
open_porch_sf	0	1.00	48.31	68.88	0	0.00	28.0	72.00	742	▇▁▁▁▁
enclosed_porch	0	1.00	24.24	67.23	0	0.00	0.0	0.00	1012	▇▁▁▁▁
x3ssn_porch	0	1.00	1.79	20.21	0	0.00	0.0	0.00	360	▇▁▁▁▁
screen_porch	0	1.00	17.06	56.61	0	0.00	0.0	0.00	576	▇▁▁▁▁
pool_area	0	1.00	1.74	30.49	0	0.00	0.0	0.00	800	▇▁▁▁▁
misc_val	0	1.00	58.17	630.81	0	0.00	0.0	0.00	17000	▇▁▁▁▁
mo_sold	0	1.00	6.10	2.72	1	4.00	6.0	8.00	12	▅▆▇▃▃
yr_sold	0	1.00	2007.77	1.30	2006	2007.00	2008.0	2009.00	2010	▇▇▇▇▃

Train and test dataset contains lots of missing values. To build and run forecasting models smoothly, we need to impute those missing data or get rid of observation containing missing value. To retain observations as much as possible, I choose to impute those missing value.

Choose variables which are used to predict

train <- train %>% 
  select (id, sale_price, overall_qual,year_built,year_remod_add,mas_vnr_area,
          total_bsmt_sf,x1st_flr_sf,gr_liv_area,full_bath,tot_rms_abv_grd,fireplaces,
          garage_yr_blt,garage_cars,garage_area, ms_zoning, street,condition1,
          house_style,mas_vnr_type,exter_qual, bsmt_cond,bsmt_qual,bsmt_exposure) %>% 
  mutate(
    bsmt_qual = case_when(is.na(bsmt_qual) == "TRUE" ~ "none",
                          TRUE ~ bsmt_qual),
    bsmt_cond = case_when(is.na(bsmt_cond) == "TRUE" ~ "none",
                          TRUE ~ bsmt_cond),
    bsmt_exposure = case_when(is.na(bsmt_exposure) == "TRUE" ~ "none",
                              TRUE ~ bsmt_exposure)) %>%
  mutate(sale_price = log(sale_price))

BUILD FORCASTING MODEL

OK, let build our model

Prepare data

# split the our data into training and testing data
set.seed(123)
split <- initial_split(train)
training <- training(split)
testing <- testing(split)

Resampling

Create validation fold for further validation activities

set.seed(234)
resample <- vfold_cv(training, prop = 0.7)

Prepare recipe

We build a recipe to preprocess our data such as setting role, imputing missing data …

rec <- recipe(sale_price~. , data = training) %>%
  update_role(id, new_role = "id") %>%
  step_string2factor(all_nominal()) %>%
  step_zv(all_numeric()) %>%
  step_knnimpute(mas_vnr_type) %>%
  step_meanimpute(garage_yr_blt, mas_vnr_area, 
                  garage_cars,garage_area)
rec

## Data Recipe
## 
## Inputs:
## 
##       role #variables
##         id          1
##    outcome          1
##  predictor         22
## 
## Operations:
## 
## Factor variables from all_nominal()
## Zero variance filter on all_numeric()
## K-nearest neighbor imputation for mas_vnr_type
## Mean Imputation for garage_yr_blt, mas_vnr_area, ...

Prepare engine

Build a random forest model using ranger engine and prepare a grid for tuning purpose

# MOdel engine
rf_model <- rand_forest(mtry = tune(),
                        trees = 750,
                        min_n = tune()) %>%
  set_mode("regression") %>%
  set_engine("ranger")

# Tuning grid
rf_grid <- grid_regular(mtry(range = c(10, 20)),
                        min_n(range = c(1, 8)),
                        levels = 5)

rf_model

## Random Forest Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 750
##   min_n = tune()
## 
## Computational engine: ranger

rf_grid

## # A tibble: 25 x 2
##     mtry min_n
##    <int> <int>
##  1    10     1
##  2    12     1
##  3    15     1
##  4    17     1
##  5    20     1
##  6    10     2
##  7    12     2
##  8    15     2
##  9    17     2
## 10    20     2
## # ... with 15 more rows

Create workflows

Now, we put recipe and model into a workflows

rf_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(rf_model)

tune the model on training data

It’s time to tune our model

Now, we see how well our model perform

fit %>% collect_metrics()

## # A tibble: 75 x 8
##     mtry min_n .metric .estimator   mean     n std_err .config              
##    <int> <int> <chr>   <chr>       <dbl> <int>   <dbl> <chr>                
##  1    10     1 mae     standard   0.0992     9 0.00509 Preprocessor1_Model01
##  2    10     1 rmse    standard   0.143      9 0.00984 Preprocessor1_Model01
##  3    10     1 rsq     standard   0.871      9 0.0137  Preprocessor1_Model01
##  4    12     1 mae     standard   0.0994     9 0.00510 Preprocessor1_Model02
##  5    12     1 rmse    standard   0.143      9 0.00981 Preprocessor1_Model02
##  6    12     1 rsq     standard   0.870      9 0.0136  Preprocessor1_Model02
##  7    15     1 mae     standard   0.100      9 0.00495 Preprocessor1_Model03
##  8    15     1 rmse    standard   0.145      9 0.00954 Preprocessor1_Model03
##  9    15     1 rsq     standard   0.868      9 0.0133  Preprocessor1_Model03
## 10    17     1 mae     standard   0.101      9 0.00515 Preprocessor1_Model04
## # ... with 65 more rows

fit %>% show_best("rsq")

## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    10     6 rsq     standard   0.871     9  0.0131 Preprocessor1_Model16
## 2    10     4 rsq     standard   0.871     9  0.0133 Preprocessor1_Model11
## 3    10     8 rsq     standard   0.871     9  0.0131 Preprocessor1_Model21
## 4    10     2 rsq     standard   0.871     9  0.0136 Preprocessor1_Model06
## 5    10     1 rsq     standard   0.871     9  0.0137 Preprocessor1_Model01

fit %>% show_best("rmse")

## # A tibble: 5 x 8
##    mtry min_n .metric .estimator  mean     n std_err .config              
##   <int> <int> <chr>   <chr>      <dbl> <int>   <dbl> <chr>                
## 1    10     6 rmse    standard   0.143     9 0.00961 Preprocessor1_Model16
## 2    10     4 rmse    standard   0.143     9 0.00976 Preprocessor1_Model11
## 3    10     8 rmse    standard   0.143     9 0.00969 Preprocessor1_Model21
## 4    12     4 rmse    standard   0.143     9 0.00966 Preprocessor1_Model12
## 5    10     2 rmse    standard   0.143     9 0.00985 Preprocessor1_Model06

best_RMSE <- select_best(fit, "rmse") # select the best models

finalize the model

best_tunned_model <- finalize_model(rf_model,
                                    best_RMSE)

rebuild the model using the best_tunned_specs

tunned_wf <- workflow() %>%
  add_recipe(rec) %>%
  add_model(best_tunned_model)

lastfit using the best_tunned_model

last_fit <- tunned_wf %>% last_fit(split,
                                   metrics = metric_set(mae, rmse, rsq))

assess model performance

last_fit %>% collect_metrics()

## # A tibble: 3 x 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 mae     standard       0.114 Preprocessor1_Model1
## 2 rmse    standard       0.169 Preprocessor1_Model1
## 3 rsq     standard       0.834 Preprocessor1_Model1

last_fit %>% collect_predictions()

## # A tibble: 365 x 5
##    id               .pred  .row sale_price .config             
##    <chr>            <dbl> <int>      <dbl> <chr>               
##  1 train/test split  12.3     8       12.2 Preprocessor1_Model1
##  2 train/test split  11.6    10       11.7 Preprocessor1_Model1
##  3 train/test split  11.8    11       11.8 Preprocessor1_Model1
##  4 train/test split  11.7    13       11.9 Preprocessor1_Model1
##  5 train/test split  11.9    16       11.8 Preprocessor1_Model1
##  6 train/test split  11.9    19       12.0 Preprocessor1_Model1
##  7 train/test split  11.9    24       11.8 Preprocessor1_Model1
##  8 train/test split  12.5    26       12.5 Preprocessor1_Model1
##  9 train/test split  11.5    31       10.6 Preprocessor1_Model1
## 10 train/test split  11.9    34       12.0 Preprocessor1_Model1
## # ... with 355 more rows

last_fit %>% collect_predictions() %>%
  ggplot(aes(.pred, sale_price))+
  geom_point()

save the model

Save the model to predict on test dat

saved_model <- last_fit$.workflow[[1]]

Preprocess test data

test <- test %>%
  select (id, overall_qual,year_built,year_remod_add,mas_vnr_area,
          total_bsmt_sf,x1st_flr_sf,gr_liv_area,full_bath,tot_rms_abv_grd,fireplaces,
          garage_yr_blt,garage_cars,garage_area, ms_zoning, street,condition1,
          house_style,mas_vnr_type,exter_qual, bsmt_cond,bsmt_qual,bsmt_exposure) %>% 
  mutate(
    bsmt_qual = case_when(is.na(bsmt_qual) == "TRUE" ~ "none",
                          TRUE ~ bsmt_qual),
    bsmt_cond = case_when(is.na(bsmt_cond) == "TRUE" ~ "none",
                          TRUE ~ bsmt_cond),
    bsmt_exposure = case_when(is.na(bsmt_exposure) == "TRUE" ~ "none",
                              TRUE ~ bsmt_exposure)) %>%
  mutate(sale_price=0)
  
# process the test data to impute missing data

rec2 <- recipe(sale_price~. , data = test) %>%
  update_role(id, new_role = "id") %>%
  step_string2factor(all_nominal()) %>%
  step_zv(all_numeric()) %>%
  step_knnimpute(mas_vnr_type,ms_zoning) %>%
  step_meanimpute(garage_yr_blt, mas_vnr_area, 
                  garage_cars,garage_area,total_bsmt_sf)

# check prep recipe
prep2 <- prep(rec2)
test2 <-  bake(prep2, new_data=NULL)

Predict on test data

Finally, we can fit our model on test data to predict the house price

pred_result <- predict(saved_model, test2)
test2$sale_price <- pred_result$.pred

FORECAST HOUSE PRICE FROM KAGGLE COMPETITION

HUNG.NGUYEN

12/24/2020