1 Introduction

Today, I will experiment with applying Machine Learning to predict the sale price of properties in Iowa. I got the dataset from Kaggle and re-saved it in xlsx form to make it a little easier for me to import.

1.1 Importing the necessary data

library(readxl)
# Load the Kaggle training set from the local xlsx copy.
# NOTE(review): the preview shows LotFrontage, MasVnrArea and GarageYrBlt as
# <chr> although they hold numbers -- presumably missing values are stored as
# the text "NA"; confirm before using those columns in a model.
Iowa <- read_excel(path = "D:/Working Directory/Iowa/train.xlsx")
# Preview the first six rows.
head(Iowa, n = 6L)
## # A tibble: 6 x 81
##      Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
##   <dbl>      <dbl> <chr>    <chr>         <dbl> <chr>  <chr> <chr>   
## 1     1         60 RL       65             8450 Pave   NA    Reg     
## 2     2         20 RL       80             9600 Pave   NA    Reg     
## 3     3         60 RL       68            11250 Pave   NA    IR1     
## 4     4         70 RL       60             9550 Pave   NA    IR1     
## 5     5         60 RL       84            14260 Pave   NA    IR1     
## 6     6         50 RL       85            14115 Pave   NA    IR1     
## # ... with 73 more variables: LandContour <chr>, Utilities <chr>,
## #   LotConfig <chr>, LandSlope <chr>, Neighborhood <chr>,
## #   Condition1 <chr>, Condition2 <chr>, BldgType <chr>, HouseStyle <chr>,
## #   OverallQual <dbl>, OverallCond <dbl>, YearBuilt <dbl>,
## #   YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## #   Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## #   MasVnrArea <chr>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## #   BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>,
## #   BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, BsmtFinType2 <chr>,
## #   BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, TotalBsmtSF <dbl>, Heating <chr>,
## #   HeatingQC <chr>, CentralAir <chr>, Electrical <chr>, `1stFlrSF` <dbl>,
## #   `2ndFlrSF` <dbl>, LowQualFinSF <dbl>, GrLivArea <dbl>,
## #   BsmtFullBath <dbl>, BsmtHalfBath <dbl>, FullBath <dbl>,
## #   HalfBath <dbl>, BedroomAbvGr <dbl>, KitchenAbvGr <dbl>,
## #   KitchenQual <chr>, TotRmsAbvGrd <dbl>, Functional <chr>,
## #   Fireplaces <dbl>, FireplaceQu <chr>, GarageType <chr>,
## #   GarageYrBlt <chr>, GarageFinish <chr>, GarageCars <dbl>,
## #   GarageArea <dbl>, GarageQual <chr>, GarageCond <chr>,
## #   PavedDrive <chr>, WoodDeckSF <dbl>, OpenPorchSF <dbl>,
## #   EnclosedPorch <dbl>, `3SsnPorch` <dbl>, ScreenPorch <dbl>,
## #   PoolArea <dbl>, PoolQC <chr>, Fence <chr>, MiscFeature <chr>,
## #   MiscVal <dbl>, MoSold <dbl>, YrSold <dbl>, SaleType <chr>,
## #   SaleCondition <chr>, SalePrice <dbl>
# List every column name in the imported data.
colnames(Iowa)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "1stFlrSF"     
## [45] "2ndFlrSF"      "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "3SsnPorch"     "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
# Per-column summary: quantiles for numeric columns, length/class for text.
summary(object = Iowa)
##        Id           MSSubClass      MSZoning         LotFrontage       
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Length:1460       
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   Class :character  
##  Median : 730.5   Median : 50.0   Mode  :character   Mode  :character  
##  Mean   : 730.5   Mean   : 56.9                                        
##  3rd Qu.:1095.2   3rd Qu.: 70.0                                        
##  Max.   :1460.0   Max.   :190.0                                        
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##  LandContour         Utilities          LotConfig        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   LandSlope         Neighborhood        Condition1       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   Condition2          BldgType          HouseStyle         OverallQual    
##  Length:1460        Length:1460        Length:1460        Min.   : 1.000  
##  Class :character   Class :character   Class :character   1st Qu.: 5.000  
##  Mode  :character   Mode  :character   Mode  :character   Median : 6.000  
##                                                           Mean   : 6.099  
##                                                           3rd Qu.: 7.000  
##                                                           Max.   :10.000  
##   OverallCond      YearBuilt     YearRemodAdd   RoofStyle        
##  Min.   :1.000   Min.   :1872   Min.   :1950   Length:1460       
##  1st Qu.:5.000   1st Qu.:1954   1st Qu.:1967   Class :character  
##  Median :5.000   Median :1973   Median :1994   Mode  :character  
##  Mean   :5.575   Mean   :1971   Mean   :1985                     
##  3rd Qu.:6.000   3rd Qu.:2000   3rd Qu.:2004                     
##  Max.   :9.000   Max.   :2010   Max.   :2010                     
##    RoofMatl         Exterior1st        Exterior2nd       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   MasVnrType         MasVnrArea         ExterQual        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   ExterCond          Foundation          BsmtQual        
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##    BsmtCond         BsmtExposure       BsmtFinType1         BsmtFinSF1    
##  Length:1460        Length:1460        Length:1460        Min.   :   0.0  
##  Class :character   Class :character   Class :character   1st Qu.:   0.0  
##  Mode  :character   Mode  :character   Mode  :character   Median : 383.5  
##                                                           Mean   : 443.6  
##                                                           3rd Qu.: 712.2  
##                                                           Max.   :5644.0  
##  BsmtFinType2         BsmtFinSF2        BsmtUnfSF       TotalBsmtSF    
##  Length:1460        Min.   :   0.00   Min.   :   0.0   Min.   :   0.0  
##  Class :character   1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8  
##  Mode  :character   Median :   0.00   Median : 477.5   Median : 991.5  
##                     Mean   :  46.55   Mean   : 567.2   Mean   :1057.4  
##                     3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2  
##                     Max.   :1474.00   Max.   :2336.0   Max.   :6110.0  
##    Heating           HeatingQC          CentralAir       
##  Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   Electrical           1stFlrSF       2ndFlrSF     LowQualFinSF    
##  Length:1460        Min.   : 334   Min.   :   0   Min.   :  0.000  
##  Class :character   1st Qu.: 882   1st Qu.:   0   1st Qu.:  0.000  
##  Mode  :character   Median :1087   Median :   0   Median :  0.000  
##                     Mean   :1163   Mean   : 347   Mean   :  5.845  
##                     3rd Qu.:1391   3rd Qu.: 728   3rd Qu.:  0.000  
##                     Max.   :4692   Max.   :2065   Max.   :572.000  
##    GrLivArea     BsmtFullBath     BsmtHalfBath        FullBath    
##  Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000  
##  1st Qu.:1130   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000  
##  Median :1464   Median :0.0000   Median :0.00000   Median :2.000  
##  Mean   :1515   Mean   :0.4253   Mean   :0.05753   Mean   :1.565  
##  3rd Qu.:1777   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000  
##  Max.   :5642   Max.   :3.0000   Max.   :2.00000   Max.   :3.000  
##     HalfBath       BedroomAbvGr    KitchenAbvGr   KitchenQual       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Length:1460       
##  1st Qu.:0.0000   1st Qu.:2.000   1st Qu.:1.000   Class :character  
##  Median :0.0000   Median :3.000   Median :1.000   Mode  :character  
##  Mean   :0.3829   Mean   :2.866   Mean   :1.047                     
##  3rd Qu.:1.0000   3rd Qu.:3.000   3rd Qu.:1.000                     
##  Max.   :2.0000   Max.   :8.000   Max.   :3.000                     
##   TotRmsAbvGrd     Functional          Fireplaces    FireplaceQu       
##  Min.   : 2.000   Length:1460        Min.   :0.000   Length:1460       
##  1st Qu.: 5.000   Class :character   1st Qu.:0.000   Class :character  
##  Median : 6.000   Mode  :character   Median :1.000   Mode  :character  
##  Mean   : 6.518                      Mean   :0.613                     
##  3rd Qu.: 7.000                      3rd Qu.:1.000                     
##  Max.   :14.000                      Max.   :3.000                     
##   GarageType        GarageYrBlt        GarageFinish         GarageCars   
##  Length:1460        Length:1460        Length:1460        Min.   :0.000  
##  Class :character   Class :character   Class :character   1st Qu.:1.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :2.000  
##                                                           Mean   :1.767  
##                                                           3rd Qu.:2.000  
##                                                           Max.   :4.000  
##    GarageArea      GarageQual         GarageCond         PavedDrive       
##  Min.   :   0.0   Length:1460        Length:1460        Length:1460       
##  1st Qu.: 334.5   Class :character   Class :character   Class :character  
##  Median : 480.0   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 473.0                                                           
##  3rd Qu.: 576.0                                                           
##  Max.   :1418.0                                                           
##    WoodDeckSF      OpenPorchSF     EnclosedPorch      3SsnPorch     
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00  
##  Median :  0.00   Median : 25.00   Median :  0.00   Median :  0.00  
##  Mean   : 94.24   Mean   : 46.66   Mean   : 21.95   Mean   :  3.41  
##  3rd Qu.:168.00   3rd Qu.: 68.00   3rd Qu.:  0.00   3rd Qu.:  0.00  
##  Max.   :857.00   Max.   :547.00   Max.   :552.00   Max.   :508.00  
##   ScreenPorch        PoolArea          PoolQC             Fence          
##  Min.   :  0.00   Min.   :  0.000   Length:1460        Length:1460       
##  1st Qu.:  0.00   1st Qu.:  0.000   Class :character   Class :character  
##  Median :  0.00   Median :  0.000   Mode  :character   Mode  :character  
##  Mean   : 15.06   Mean   :  2.759                                        
##  3rd Qu.:  0.00   3rd Qu.:  0.000                                        
##  Max.   :480.00   Max.   :738.000                                        
##  MiscFeature           MiscVal             MoSold           YrSold    
##  Length:1460        Min.   :    0.00   Min.   : 1.000   Min.   :2006  
##  Class :character   1st Qu.:    0.00   1st Qu.: 5.000   1st Qu.:2007  
##  Mode  :character   Median :    0.00   Median : 6.000   Median :2008  
##                     Mean   :   43.49   Mean   : 6.322   Mean   :2008  
##                     3rd Qu.:    0.00   3rd Qu.: 8.000   3rd Qu.:2009  
##                     Max.   :15500.00   Max.   :12.000   Max.   :2010  
##    SaleType         SaleCondition        SalePrice     
##  Length:1460        Length:1460        Min.   : 34900  
##  Class :character   Class :character   1st Qu.:129975  
##  Mode  :character   Mode  :character   Median :163000  
##                                        Mean   :180921  
##                                        3rd Qu.:214000  
##                                        Max.   :755000

1.2 Importing likewise the libraries that we need

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(tidyverse) #I don't know why but i'm calling this out of habit
## -- Attaching packages --------------------------------------------------------------- tidyverse 1.2.1 --
## v tibble  1.4.2     v purrr   0.2.5
## v tidyr   0.8.1     v dplyr   0.7.6
## v readr   1.1.1     v stringr 1.3.1
## v tibble  1.4.2     v forcats 0.3.0
## -- Conflicts ------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x purrr::lift()   masks caret::lift()
library(rpart)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin

1.3 Converting price into Euro

I would like to convert the SalePrice into Euros. The exchange rate used (0.88 EUR per USD) is as of the 29th of November.

# Rescale SalePrice by the fixed exchange rate 0.88; every other column is
# carried over unchanged.
Iowa_train <- mutate(Iowa, SalePrice = SalePrice * 0.88)

2 Creating the model

I would like to fit a model for the SalePrice variable and figure out how it is related to the following predictors:

# Regression tree for SalePrice on a hand-picked set of predictors, fitted on
# the full data set with rpart's default control settings.
fit_a <- rpart(
  formula = SalePrice ~ LotArea + YearBuilt + YearRemodAdd +
    Condition1 + Condition2 + FullBath + BedroomAbvGr + TotRmsAbvGrd +
    GarageArea + OverallCond + OverallQual + GrLivArea +
    WoodDeckSF + OpenPorchSF,
  data = Iowa_train
)

# Draw the tree skeleton with uniform vertical spacing, then add the split and
# leaf labels in a small font (cex = 0.225).
plot(x = fit_a, uniform = TRUE)
text(x = fit_a, cex = 0.225)

With this fitted model, I would like to predict the sale price via the predict() function.

# Show the first six rows of the converted data.
Iowa_train %>%
  head() %>%
  print()
## # A tibble: 6 x 81
##      Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
##   <dbl>      <dbl> <chr>    <chr>         <dbl> <chr>  <chr> <chr>   
## 1     1         60 RL       65             8450 Pave   NA    Reg     
## 2     2         20 RL       80             9600 Pave   NA    Reg     
## 3     3         60 RL       68            11250 Pave   NA    IR1     
## 4     4         70 RL       60             9550 Pave   NA    IR1     
## 5     5         60 RL       84            14260 Pave   NA    IR1     
## 6     6         50 RL       85            14115 Pave   NA    IR1     
## # ... with 73 more variables: LandContour <chr>, Utilities <chr>,
## #   LotConfig <chr>, LandSlope <chr>, Neighborhood <chr>,
## #   Condition1 <chr>, Condition2 <chr>, BldgType <chr>, HouseStyle <chr>,
## #   OverallQual <dbl>, OverallCond <dbl>, YearBuilt <dbl>,
## #   YearRemodAdd <dbl>, RoofStyle <chr>, RoofMatl <chr>,
## #   Exterior1st <chr>, Exterior2nd <chr>, MasVnrType <chr>,
## #   MasVnrArea <chr>, ExterQual <chr>, ExterCond <chr>, Foundation <chr>,
## #   BsmtQual <chr>, BsmtCond <chr>, BsmtExposure <chr>,
## #   BsmtFinType1 <chr>, BsmtFinSF1 <dbl>, BsmtFinType2 <chr>,
## #   BsmtFinSF2 <dbl>, BsmtUnfSF <dbl>, TotalBsmtSF <dbl>, Heating <chr>,
## #   HeatingQC <chr>, CentralAir <chr>, Electrical <chr>, `1stFlrSF` <dbl>,
## #   `2ndFlrSF` <dbl>, LowQualFinSF <dbl>, GrLivArea <dbl>,
## #   BsmtFullBath <dbl>, BsmtHalfBath <dbl>, FullBath <dbl>,
## #   HalfBath <dbl>, BedroomAbvGr <dbl>, KitchenAbvGr <dbl>,
## #   KitchenQual <chr>, TotRmsAbvGrd <dbl>, Functional <chr>,
## #   Fireplaces <dbl>, FireplaceQu <chr>, GarageType <chr>,
## #   GarageYrBlt <chr>, GarageFinish <chr>, GarageCars <dbl>,
## #   GarageArea <dbl>, GarageQual <chr>, GarageCond <chr>,
## #   PavedDrive <chr>, WoodDeckSF <dbl>, OpenPorchSF <dbl>,
## #   EnclosedPorch <dbl>, `3SsnPorch` <dbl>, ScreenPorch <dbl>,
## #   PoolArea <dbl>, PoolQC <chr>, Fence <chr>, MiscFeature <chr>,
## #   MiscVal <dbl>, MoSold <dbl>, YrSold <dbl>, SaleType <chr>,
## #   SaleCondition <chr>, SalePrice <dbl>
# Predicted sale prices from fit_a for those same first six rows.
print(predict(fit_a, newdata = head(Iowa_train)))
##        1        2        3        4        5        6 
## 170930.1 118988.8 170930.1 170930.1 277107.2 118988.8
# Actual sale prices of the first six rows, for comparison with the
# predictions.
print(head(Iowa_train[["SalePrice"]]))
## [1] 183480 159720 196680 123200 220000 125840

2.1 Determining if our model is good

How do we know if our model is good? I figure we need to load the modelr library and make use of its mae() function.

library(modelr)

# Mean absolute error of fit_a, evaluated on the same data it was fitted to.
mae(fit_a, Iowa_train)
## [1] 24815.25

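So apparently, our model is off by an average of 24815.25 euros. Huge. Note that this error is measured on the same data the model was trained on, so it may be optimistic compared with the error on unseen data.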

3 Creating a model with partitioning

I would like to split the data I just loaded into two different datasets, a test set and a train set, using a test/train ratio of 0.25/0.75. It's been said that modelr has some interesting functions to pull this off.

# Fix the RNG seed so the random split -- and every error figure computed from
# it -- is the same each time this analysis is re-run. Without a seed the
# partition, and therefore the reported MAE values, change on every run.
set.seed(42)

# Randomly partition the rows into a 25% test piece and a 75% train piece.
Iowa_split <- resample_partition(Iowa_train, c(test = 0.25, train = 0.75))

# Check the dimensions (rows, columns) of each piece.
lapply(Iowa_split, dim)
## $test
## [1] 364  81
## 
## $train
## [1] 1096   81

3.1 Creating a new model

# Same formula as fit_a, but fitted on the training partition only so that the
# test partition stays unseen.
fit_b <- rpart(
  formula = SalePrice ~ LotArea + YearBuilt + YearRemodAdd +
    Condition1 + Condition2 + FullBath + BedroomAbvGr + TotRmsAbvGrd +
    GarageArea + OverallCond + OverallQual + GrLivArea +
    WoodDeckSF + OpenPorchSF,
  data = Iowa_split[["train"]]
)

Getting the mean absolute error (MAE) of our new model based on the test data

# Mean absolute error of fit_b on the held-out test partition.
mae(fit_b, Iowa_split[["test"]])
## [1] 24767.77

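Compared with the in-sample error above (24815.25), the error on the held-out test set (24767.77) is actually slightly lower, not higher, so there is no sign of overfitting here. And at least we didn't have to artificially clean the data up.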

4 Model improvement (Overfitting and underfitting)

I would like to employ a loop in which I can cut the decision tree down to a more viable size. Let's check the mean absolute error as we vary the tree's maximum depth from 1 to 18.

I think we need to do this via looping

#' Mean absolute error of a depth-limited rpart tree
#'
#' Fits an rpart tree of `target` on `predictors` using `training_data`, with
#' the tree depth capped at `maxdepth`, and returns the mean absolute error
#' that `mae()` reports for that tree on `testing_data`.
#'
#' @param maxdepth A single number passed to `rpart.control(maxdepth = )`.
#' @param target Name of the response column, as a single string.
#' @param predictors Character vector of right-hand-side terms; must not be
#'   empty. Non-syntactic column names need to be back-quoted by the caller.
#' @param training_data Data the tree is fitted on.
#' @param testing_data Data the error is computed on.
#' @return The value returned by `mae()` for the fitted tree.
get_mae <- function(maxdepth, target, predictors, training_data, testing_data) {
  # Fail early with a clear message instead of a formula parse error.
  stopifnot(
    is.numeric(maxdepth), length(maxdepth) == 1L,
    is.character(target), length(target) == 1L,
    is.character(predictors), length(predictors) > 0L
  )

  # Build `target ~ p1 + p2 + ...` without pasting and re-parsing a string.
  model_formula <- reformulate(termlabels = predictors, response = target)

  tree <- rpart(
    model_formula,
    data = training_data,
    control = rpart.control(maxdepth = maxdepth)
  )

  # The result is returned directly; no local variable named `mae` shadows the
  # `mae()` function any more.
  mae(tree, testing_data)
}
# Response column and the predictor columns used by the depth search.
target <- "SalePrice"
predictors <- c(
  "LotArea", "YearBuilt", "YearRemodAdd",
  "Condition1", "Condition2",
  "FullBath", "BedroomAbvGr", "TotRmsAbvGrd",
  "GarageArea", "OverallCond", "OverallQual",
  "GrLivArea", "WoodDeckSF", "OpenPorchSF"
)

# Fit one tree per depth limit from 1 to 18 on the training partition and
# print its mean absolute error on the test partition.
for (i in seq_len(18)) {
  mae <- get_mae(
    maxdepth = i,
    target = target,
    predictors = predictors,
    training_data = Iowa_split$train,
    testing_data = Iowa_split$test
  )
  print(glue::glue("Maxdepth:{i}\t MAE:{mae}"))
}
## Maxdepth:1    MAE:40156.0796403488
## Maxdepth:2    MAE:30466.6070721759
## Maxdepth:3    MAE:25511.4553019137
## Maxdepth:4    MAE:24767.7651648225
## Maxdepth:5    MAE:24767.7651648225
## Maxdepth:6    MAE:24767.7651648225
## Maxdepth:7    MAE:24767.7651648225
## Maxdepth:8    MAE:24767.7651648225
## Maxdepth:9    MAE:24767.7651648225
## Maxdepth:10   MAE:24767.7651648225
## Maxdepth:11   MAE:24767.7651648225
## Maxdepth:12   MAE:24767.7651648225
## Maxdepth:13   MAE:24767.7651648225
## Maxdepth:14   MAE:24767.7651648225
## Maxdepth:15   MAE:24767.7651648225
## Maxdepth:16   MAE:24767.7651648225
## Maxdepth:17   MAE:24767.7651648225
## Maxdepth:18   MAE:24767.7651648225

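Apparently, a Maxdepth of 4 is optimal: from a Maxdepth of 5 upwards the MAE stays exactly the same. The most likely reason is that maxdepth is only an upper limit. rpart's other default stopping rules (the complexity parameter cp = 0.01 and minsplit = 20) stop the tree from growing before it gets deeper than 4 levels, so raising the limit further changes nothing. This is also why the value matches the MAE of fit_b, which was grown with the default settings.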