Summary

I started this competition to learn to write R by testing out variation of syntaxes, creating models for prediction, and attempt to gain a better overall foundation of the process data in analytics. My main focus was to attempt the best model by working on some regression and trees/random forest.

## Set directory
# setwd('.../Documents/R/win-library/3.5')

Section 1: Process the Data

1.1 Loading libraries and data into R

Loading R packages used in script.

library(stringi) # install.packages("stringi", repos="http://cran.rstudio.com/", dependencies=TRUE) if error
library(ggplot2)
library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(forcats)
library(caret)
library(gridExtra)
library(corrplot)
library(Hmisc)
library(knitr)
library(gridExtra)
library(randomForest) 
library(ModelMetrics)
library(e1071) 

Below, I am reading the csv files as dataframes into R.

train <- read.csv('C:/Users/David Ly/Documents/R Projects/Kaggle/House Prices/train.csv', stringsAsFactors = FALSE)
test <- read.csv('C:/Users/David Ly/Documents/R Projects/Kaggle/House Prices/test.csv', stringsAsFactors = FALSE)

1.2 Data Size & Structures of Training Set

The train dataset conssit of characfter and integer variables. You can see certain factors that are ordinal as well so if you plan on changing them, be sure to update it accordingly to their specific order.

dim(train)
## [1] 1460   81
str(train[, c(1:5,81)]) # glimpse(train)
## 'data.frame':    1460 obs. of  6 variables:
##  $ Id         : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning   : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage: int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea    : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ SalePrice  : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

1.3 Check for Missing Values, Errors, Corrupted Values

There are two ways I like to write this out. You can either use colSums() or create a function.

  • Use sapply() version
sapply(train[, c(1:15)], function(x) {sum(is.na(x))}) # colSums(sapply(train, is.na))
##           Id   MSSubClass     MSZoning  LotFrontage      LotArea 
##            0            0            0          259            0 
##       Street        Alley     LotShape  LandContour    Utilities 
##            0         1369            0            0            0 
##    LotConfig    LandSlope Neighborhood   Condition1   Condition2 
##            0            0            0            0            0

1.4 Combine Datasets

test$SalePrice <- NA
all <- rbind(train, test)
sort(colSums(sapply(all, is.na)), decreasing = TRUE)
##        PoolQC   MiscFeature         Alley         Fence     SalePrice 
##          2909          2814          2721          2348          1459 
##   FireplaceQu   LotFrontage   GarageYrBlt  GarageFinish    GarageQual 
##          1420           486           159           159           159 
##    GarageCond    GarageType      BsmtCond  BsmtExposure      BsmtQual 
##           159           157            82            82            81 
##  BsmtFinType2  BsmtFinType1    MasVnrType    MasVnrArea      MSZoning 
##            80            79            24            23             4 
##     Utilities  BsmtFullBath  BsmtHalfBath    Functional   Exterior1st 
##             2             2             2             2             1 
##   Exterior2nd    BsmtFinSF1    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF 
##             1             1             1             1             1 
##    Electrical   KitchenQual    GarageCars    GarageArea      SaleType 
##             1             1             1             1             1 
##            Id    MSSubClass       LotArea        Street      LotShape 
##             0             0             0             0             0 
##   LandContour     LotConfig     LandSlope  Neighborhood    Condition1 
##             0             0             0             0             0 
##    Condition2      BldgType    HouseStyle   OverallQual   OverallCond 
##             0             0             0             0             0 
##     YearBuilt  YearRemodAdd     RoofStyle      RoofMatl     ExterQual 
##             0             0             0             0             0 
##     ExterCond    Foundation       Heating     HeatingQC    CentralAir 
##             0             0             0             0             0 
##     X1stFlrSF     X2ndFlrSF  LowQualFinSF     GrLivArea      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd    Fireplaces 
##             0             0             0             0             0 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea       MiscVal        MoSold        YrSold 
##             0             0             0             0             0 
## SaleCondition 
##             0

Section 2: Exploratory Data Analysis

I wanted to see if there were a lot of columns that had more than 20 rows per variable.

# Create a table loop for all variables, head() for any rows above 20
names <- colnames(all)
for(x in 1:10) {  # Use ncol(all) instead of 10 to show all dcol
  if (nrow((table(all[, x]))) > 20) {
    cat('\n', colnames(all[x]), '\n', print(head(all[,x])), '\n')
  } else {
    cat('\n', colnames(all[x]), '\n', print(table(all[,x])), '\n')
  }
}
## [1] 1 2 3 4 5 6
## 
##  Id 
##  1 2 3 4 5 6 
## 
##   20   30   40   45   50   60   70   75   80   85   90  120  150  160  180 
## 1079  139    6   18  287  575  128   23  118   48  109  182    1  128   17 
##  190 
##   61 
## 
##  MSSubClass 
##  1079 139 6 18 287 575 128 23 118 48 109 182 1 128 17 61 
## 
## C (all)      FV      RH      RL      RM 
##      25     139      26    2265     460 
## 
##  MSZoning 
##  25 139 26 2265 460 
## [1] 65 80 68 60 84 85
## 
##  LotFrontage 
##  65 80 68 60 84 85 
## [1]  8450  9600 11250  9550 14260 14115
## 
##  LotArea 
##  8450 9600 11250 9550 14260 14115 
## 
## Grvl Pave 
##   12 2907 
## 
##  Street 
##  12 2907 
## 
## Grvl Pave 
##  120   78 
## 
##  Alley 
##  120 78 
## 
##  IR1  IR2  IR3  Reg 
##  968   76   16 1859 
## 
##  LotShape 
##  968 76 16 1859 
## 
##  Bnk  HLS  Low  Lvl 
##  117  120   60 2622 
## 
##  LandContour 
##  117 120 60 2622 
## 
## AllPub NoSeWa 
##   2916      1 
## 
##  Utilities 
##  2916 1

2.1: Graphing & Summaries

Keep in mind that the response variable is SalePrice.

Let’s attempt to model the numerical variables into a correlation plot. The idea is to gauge for multicollinearity and have easy access to an understanding of the numerical variables if possible.

One thing I would need to clean up are the question marks. They appear because of NA values. I would like to go back to this to clean up to be more appealing visually.

# Split Numeric and Categoric variables (First split - split again after)
num_features_init <- names(which(sapply(all, is.numeric)))
cat_features_init <- names(which(sapply(all, is.character)))
all_numeric_init <- all[, names(all) %in% num_features_init]
all_categoric_init <- all[, names(all) %in% cat_features_init]
# Remove NA values to use findCorrelation() function
all_numeric_init <- all_numeric_init[, !colnames(all_numeric_init) == "SalePrice"]
# Create correlation plot variations
par(mfrow = c(1,1))
correlations <- cor(all_numeric_init, use = "everything") # cor() function to create plots
corrplot(correlations, method = "circle", sig.level = 0.01, insig = "blank") 

corrplot(correlations, method = "color", type = "lower", tl.cex = 0.75, tl.col = "black", tl.srt = 45)

# More correlation plots
corrplot(correlations, method = "number", addrect = 2, diag = F)
corrplot(correlations, sig.level = 0, insig = "p-value", method = "ellipse", type = "upper", addrect = 2, tl.pos = "n", cl.pos = "n", diag = F, add = T)

Visualization of missing data for the combined dataset. This was an interesting way to plot missing values. This was a code I found going through a few kernels. This excerpt of code below is not written by me. We can see that certain variables may not be useful in it the prediction model being created later. Missing values can greatly skew the data. If we do want to include it, we would somehow want to group it into certain values that would make sense.

# Create a function to plot missing data for each variable
plot_Missing <- function(data_in, title = NULL){
  temp_df <- as.data.frame(ifelse(is.na(data_in), 0, 1))
  temp_df <- temp_df[,order(colSums(temp_df))]
  data_temp <- expand.grid(list(x = 1:nrow(temp_df), y = colnames(temp_df)))
  data_temp$m <- as.vector(as.matrix(temp_df))
  data_temp <- data.frame(x = unlist(data_temp$x), y = unlist(data_temp$y), m = unlist(data_temp$m))
  ggplot(data_temp) + geom_tile(aes(x=x, y=y, fill=factor(m))) + scale_fill_manual(values=c("white", "black"), name="Missing\n(0=Yes, 1=No)") + theme_light() + ylab("") + xlab("") + ggtitle(title)
}

# Insert combined data where there is at least one missing value
plot_Missing(all[,colSums(is.na(all)) > 0])

2.2 Custom Theme

I am a visual person. I enjoy color as well as simplicity. I decided to create a custom theme alongside other themes provided such as theme_minimal(). This was just used to practice understanding what parameters there are. There are so many to use, I would suggest checking out the theme() document or check out this ggplot() link below.

ggplot2: Theme Reference

# create custom theme
theme_custom <- function(base_size = 11, base_family = "") {
  theme_grey(base_size = base_size, base_family = base_family) %+replace% 
    theme(axis.title =  element_text(face = "bold"),
          axis.text.x = element_text(colour = "red", face = "bold"),
          axis.text.y = element_text(colour = "red", face = "bold"),
          panel.border = element_rect(fill = NA, colour = "grey20"), 
          panel.background = element_rect(colour = "lightblue", fill = "lightblue"),
          panel.grid.major = element_line(colour = "white"),
          panel.grid.minor = element_line(colour = "white"),
          plot.background = element_rect(colour = "lightgray", fill = "lightgray"),
          plot.title = element_text(hjust = 0.5, size = "12", face = "bold", lineheight= 0.5),
          plot.subtitle = element_text(hjust = 0.5, lineheight = 0.5, face = "italic")
    )
  
}

Some other theme() parameters I would customize would be legend as well, but legends changes more often based on the type of data.

# Change plot scale values
options(scipen = 10000)

2.3 Various Plots

I wanted to visualize many variables to see how the data distribution looked like in boxplots, scatterplots, and more. I believe it is good to get a good understanding that is easy to take in. Humans are visually paired to grasp visual cues/data a lot faster than any other sensory data. We have around 70% cues in our eyes and can take in data visually in about 13ms according to a MIT study.

pairs( ~ YearBuilt + YearRemodAdd + OverallQual + TotalBsmtSF + GrLivArea, data = all,
       main = "Simple Scatterplot Matrix")

pairs( ~ YearBuilt + OverallQual + TotalBsmtSF + GrLivArea, data = all, main = "Simple Scatterplot Matrix")

Curious to know who has the highest Sale Price. Id 692 has the highest sale price based on the data. The lot area is very large at 21,535 which makes sense.

# Who has the max Sale Price?
all[which.max(all$SalePrice),]
##      Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 692 692         60       RL         104   21535   Pave  <NA>      IR1
##     LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 692         Lvl    AllPub    Corner       Gtl      NoRidge       Norm
##     Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 692       Norm     1Fam     2Story          10           6      1994
##     YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 692         1995     Gable  WdShngl     HdBoard     HdBoard    BrkFace
##     MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond
## 692       1170        Ex        TA      PConc       Ex       TA
##     BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 692           Gd          GLQ       1455          Unf          0       989
##     TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF
## 692        2444    GasA        Ex          Y      SBrkr      2444
##     X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 692      1872            0      4316            0            1        3
##     HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 692        1            4            1          Ex           10        Typ
##     Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 692          2          Ex     Attchd        1994          Fin          3
##     GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 692        832         TA         TA          Y        382          50
##     EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 692             0          0           0        0   <NA>  <NA>        <NA>
##     MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 692       0      1   2007       WD        Normal    755000

2.4 Year Built vs. Sale Price

Sale price seems to have a negative correlation to the year built. The newer the house, the lower the price based on this scatterplot. I would have expected the prices to be higher if the house was newer so this was interesting.

ggplot(data = all[all$SalePrice > 0,], aes(x = YearBuilt, y = SalePrice)) +
  geom_point(na.rm = TRUE) +
  geom_smooth(method = "loess", se = FALSE, na.rm = TRUE) +
  geom_smooth(se = FALSE, colour = "red", linetype = "solid", na.rm = TRUE) +
  theme_minimal() +
  labs(x = "Year Built",
       y = "Sale Price",
       title = "Year Built vs. Sale Price"
       ) 
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

2.5 Year Sold vs. Sale Price

ggplot(data = all[all$YrSold > 1900 & all$SalePrice > 0,], aes(x = YrSold, y = SalePrice, group = YrSold, na.rm = T)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "Year Sold",
       y = "Sale Price",
       title = "Year Sold vs. Sale Price")
## Warning: Removed 1459 rows containing missing values (stat_boxplot).

2.6 Month Sold vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = MoSold, y = SalePrice, group = MoSold)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "Month Sold",
       y = "Sales Price",
       title = "Month Sold vs. Sales Price"
  ) + scale_x_continuous(breaks = c(1,2,3,4,5,6,7,8,9,10,11,12))
## Warning: Removed 1459 rows containing missing values (stat_boxplot).

2.7 Monthly Density

ggplot(data = all[all$SalePrice > 0,], aes(x = MoSold)) +
  geom_histogram(na.rm = T) +
  stat_bin(bins = 12, binwidth = 1) +
  geom_density(aes(y = ..density..)) + # Get density plot to overlay
  theme_minimal() +
  labs(x = "Month Sold",
       y = "Sales Price",
       title = "Month Sold vs. Sales Price") +
  theme_custom() +
  scale_x_continuous(breaks = c(1,2,3,4,5,6,7,8,9,10,11,12))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1459 rows containing non-finite values (stat_bin).
## Warning: Removed 1459 rows containing non-finite values (stat_density).

2.8 SqFt. vs. Sale Price

There are high outliers in the square footage from graphing a scatterplot so I reduced it by removing high square footage homes. There were < 5 outliers but caused a large skew to the right which was misleading.

ggplot(data = all[all$X1stFlrSF < 2500 & all$SalePrice > 0,], aes(x = X1stFlrSF, y = SalePrice)) +
  geom_point(na.rm = T) +
  geom_smooth(se = FALSE, na.rm = T) +
  theme_minimal() +
  labs(x = "Square Footage Floor 1 ",
       y = "Sales Price",
       title = "Square Footage Floor 1 vs. Sales Price"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

2.9 FirePlace Quality vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = FireplaceQu, y = SalePrice)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "Fireplace Quality",
       y = "Sales Price",
       title = "Fireplace Quality vs. Sales Price")

2.10 MSZoning vs. Sale Price

ggplot(data = all[!is.na(all$MSZoning) & all$SalePrice > 0,], aes(x = MSZoning, y = SalePrice)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "MSZoning",
       y = "SalePrice",
       title = "MSZoning vs. Sale Price")

2.11 Lot Area vs. Sale Price

I removed any lot area that was too high and was considered an outlier through visual plots. The cutoff at 30,000 removed a few homes that had a large area that would skew the data. This made the graph a little easier to understand to see how the “loess” curve path.

ggplot(data = all[all$LotArea <= 30000 & all$SalePrice > 0,], aes(x = LotArea, y = SalePrice)) +
  geom_point(na.rm = T) +
  geom_smooth(se = TRUE, na.rm = T) +
  theme_minimal() +
  labs(x = "LotArea",
       y = "Sales Price",
       title = "LotArea vs. Sales Price"
  )
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

2.12 House Style vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = HouseStyle, y = SalePrice)) + geom_boxplot() + theme_minimal() + labs(x = “House Style”, y = “Sales Price”, title = “House Style vs. Sales Price” )

2.13 Lot Config vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = LotConfig, y = SalePrice)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "Lot Config",
       y = "Sales Price",
       title = "Lot Config vs. Sales Price"
  )

2.14 Year Remodeled Difference vs. Sale Price

# Create new variable for difference between remodeled time vs. built time
all <- all %>% 
  mutate(YrRemodel_Diff = YearRemodAdd - YearBuilt) 

ggplot(data = all[all$SalePrice > 0,], aes(x = YrRemodel_Diff, y = SalePrice, group = YrRemodel_Diff)) +
  geom_boxplot(na.rm = T) +
  geom_smooth(na.rm = T) +
  theme_minimal() +
  labs(x = "Year Diff",
       y = "Sales Price",
       title = "Year Remodel Difference vs. Sales Price"
  ) +
  scale_y_continuous(breaks = c(100000,200000,300000,400000,500000,600000,700000))
## Warning: Removed 1459 rows containing missing values (stat_boxplot).
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

2.15 Garage Type vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = GarageType, y = SalePrice)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "Garage Type",
       y = "Sales Price",
       title = "Garage Type vs. Sales Price"
  )

2.16 Fence vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = Fence, y = SalePrice)) +
  geom_boxplot(na.rm = T) +
  theme_minimal() +
  labs(x = "Fence",
       y = "Sales Price",
       title = "Fence vs. Sales Price"
  )

2.17 ScreenPorch vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = ScreenPorch, y = SalePrice)) +
  geom_point(na.rm = T) +
  geom_smooth(method = "lm", na.rm = T) +
  theme_minimal() +
  labs(x = "ScreenPorch",
       y = "Sales Price",
       title = "ScreenPorch vs. Sales Price"
  )

2.18 Low Quality SqFt. vs. Sale Price

ggplot(data = all[all$SalePrice > 0,], aes(x = LowQualFinSF, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm") +
  theme_minimal() +
  labs(x = "LowQualFinSF",
       y = "Sales Price",
       title = "LowQualFinSF vs. Sales Price"
  )
## Warning: Removed 1459 rows containing non-finite values (stat_smooth).
## Warning: Removed 1459 rows containing missing values (geom_point).

ggplot(all, aes(LotFrontage, SalePrice)) +
  geom_point(aes(color = Neighborhood), na.rm = T) +
  geom_smooth(method = "lm", se = FALSE, na.rm = T) +
  scale_x_continuous("LotFrontage") +
  scale_y_continuous("SalePrice") +
  theme_bw() + facet_wrap( ~ SaleType) +
  theme(legend.position = "none")

Section 3: Feature Engineering

# Recheck NA counts
sort(colSums(sapply(all, is.na)), decreasing = TRUE)
##         PoolQC    MiscFeature          Alley          Fence      SalePrice 
##           2909           2814           2721           2348           1459 
##    FireplaceQu    LotFrontage    GarageYrBlt   GarageFinish     GarageQual 
##           1420            486            159            159            159 
##     GarageCond     GarageType       BsmtCond   BsmtExposure       BsmtQual 
##            159            157             82             82             81 
##   BsmtFinType2   BsmtFinType1     MasVnrType     MasVnrArea       MSZoning 
##             80             79             24             23              4 
##      Utilities   BsmtFullBath   BsmtHalfBath     Functional    Exterior1st 
##              2              2              2              2              1 
##    Exterior2nd     BsmtFinSF1     BsmtFinSF2      BsmtUnfSF    TotalBsmtSF 
##              1              1              1              1              1 
##     Electrical    KitchenQual     GarageCars     GarageArea       SaleType 
##              1              1              1              1              1 
##             Id     MSSubClass        LotArea         Street       LotShape 
##              0              0              0              0              0 
##    LandContour      LotConfig      LandSlope   Neighborhood     Condition1 
##              0              0              0              0              0 
##     Condition2       BldgType     HouseStyle    OverallQual    OverallCond 
##              0              0              0              0              0 
##      YearBuilt   YearRemodAdd      RoofStyle       RoofMatl      ExterQual 
##              0              0              0              0              0 
##      ExterCond     Foundation        Heating      HeatingQC     CentralAir 
##              0              0              0              0              0 
##      X1stFlrSF      X2ndFlrSF   LowQualFinSF      GrLivArea       FullBath 
##              0              0              0              0              0 
##       HalfBath   BedroomAbvGr   KitchenAbvGr   TotRmsAbvGrd     Fireplaces 
##              0              0              0              0              0 
##     PavedDrive     WoodDeckSF    OpenPorchSF  EnclosedPorch     X3SsnPorch 
##              0              0              0              0              0 
##    ScreenPorch       PoolArea        MiscVal         MoSold         YrSold 
##              0              0              0              0              0 
##  SaleCondition YrRemodel_Diff 
##              0              0

Is there large difference in prices for remodeled houses?

# Find sumary differences in prices for remodeled vs. non-remodeled
train <- train %>%
  mutate(YrRemodel_Diff = YearRemodAdd - YearBuilt)

train %>%
  summarise(AvgSalePrice_Remodel = mean(train[which(train$YrRemodel_Diff > 0),]$SalePrice, na.rm = FALSE),
            AvgSalePrice_NoRemodel = mean(train[which(train$YrRemodel_Diff == 0),]$SalePrice, na.rm = FALSE),
            MaxSalePrice_Remodel = max(train[which(train$YrRemodel_Diff > 0),]$SalePrice, na.rm = FALSE),
            MaxSalePrice_NoRemodel = max(train[which(train$YrRemodel_Diff == 0),]$SalePrice, na.rm = FALSE),
            MinSalePrice_Remodel = min(train[which(train$YrRemodel_Diff > 0),]$SalePrice, na.rm = FALSE),
            MinSalePrice_NoRemodel = min(train[which(train$YrRemodel_Diff == 0),]$SalePrice, na.rm = FALSE)
  )
##   AvgSalePrice_Remodel AvgSalePrice_NoRemodel MaxSalePrice_Remodel
## 1             179096.3               182583.7               755000
##   MaxSalePrice_NoRemodel MinSalePrice_Remodel MinSalePrice_NoRemodel
## 1                 745000                34900                  55993

3.1 Fix NA values (Exploration)

Factors that contains NA values means that the property does not have that specific attribute. We either need to remove or replace the data so we can include the information into our model correctly. One way we can do this is to change any zero/missing values into “None” or “0”.

# Categorical Variables
all$MiscFeature[is.na(all$MiscFeature)] <- "None"
all$Fence[is.na(all$Fence)] <- "None"
all$PoolQC[is.na(all$PoolQC)] <- "None"
all$FireplaceQu[is.na(all$FireplaceQu)] <- "None"
all$Alley[is.na(all$Alley)] <- "None"
all$SaleType[is.na(all$SaleType)] <- "Oth"
# Garage Variables
all$GarageYrBlt[is.na(all$GarageYrBlt)] <- 0
all$GarageFinish[is.na(all$GarageFinish)] <- "None"
all$GarageQual[is.na(all$GarageQual)] <- "None"
all$GarageType[is.na(all$GarageType)] <- "None"
all$GarageCond[is.na(all$GarageCond)] <- "None"
# Bsmt Variables
all$BsmtCond[is.na(all$BsmtCond)] <- "None"
all$BsmtExposure[is.na(all$BsmtExposure)] <- "None"
all$BsmtQual[is.na(all$BsmtQual)] <- "None"
all$BsmtFinType2[is.na(all$BsmtFinType2)] <- "None"
all$BsmtFinType1[is.na(all$BsmtFinType1)] <- "None"
# Continuous variables
all$LotFrontage[is.na(all$LotFrontage)] <- 0
all$MasVnrArea[is.na(all$MasVnrArea)] <- 0
all$BsmtFullBath[is.na(all$BsmtFullBath)] <- 0
all$BsmtHalfBath[is.na(all$BsmtHalfBath)] <- 0
all$BsmtFinSF1[is.na(all$BsmtFinSF1)] <- 0
all$BsmtFinSF2[is.na(all$BsmtFinSF2)] <- 0
all$BsmtUnfSF[is.na(all$BsmtUnfSF)] <- 0
all$TotalBsmtSF[is.na(all$TotalBsmtSF)] <- 0
all$GarageCars[is.na(all$GarageCars)] <- 0
all$GarageArea[is.na(all$GarageArea)] <- 0
# Other Variables
all$SaleType[is.na(all$SaleType)] <- "Oth"
all$MSZoning[is.na(all$MSZoning)] <- "OTH"
all$Exterior1st[is.na(all$Exterior1st)] <- "Other"
all$Exterior2nd[is.na(all$Exterior2nd)] <- "Other"
all$Functional[is.na(all$Functional)] <- "Oth"
all$Utilities[is.na(all$Utilities)] <- "AllPub"
all$Electrical[is.na(all$Electrical)] <- "SBrkr"
all$KitchenQual[is.na(all$KitchenQual)] <- "TA"
all$MasVnrType[is.na(all$MasVnrType)] <- "None"
all$MSZoning[is.na(all$MSZoning)] <- "RL"

3.2 Feature Engineering New Variables

With some familiarity of houses and exploration of the data, we could create new variables that may positively affect the prediction model. The variable house age could be created by subtracting the current year and the year it was built. We want to make sure we can model with these new variables. Age and some sort of aggregation of variables can be used as a new scoring variable. For example, we can use overall condition variable and overall quality variable to sum up to a new total quality score. This may or may help, but it’s the exploration that counts in this analysis.

We gather the area of the entire house and any other square footage variable included to get an idea of how large it is as a whole.

all <- all %>%
  mutate(YrRemodel_Diff + YearRemodAdd - YearBuilt)

all$HouseAge <- (2018 - all$YearBuilt)
all$NewHouse <- (all$YearBuilt == all$YrSold) * 1
all$TotalFloorSF <- all$X1stFlrSF + all$X2ndFlrSF
all$TotalArea <- all$LotFrontage+ all$LotArea + all$MasVnrArea + all$TotalBsmtSF +
all$TotalFloorSF + all$GrLivArea + all$GarageArea + all$WoodDeckSF +
all$OpenPorchSF + all$EnclosedPorch + all$X3SsnPorch + all$ScreenPorch
all$OverallQualCond <- all$OverallCond + all$OverallQual

Let’s see if a weighted quality will be helpful based on popular area sizes or any other quantitative quality measure. In statistics, weights can provide a new value for each observation by multiplying a weight (quality) to the numerical variable. This is considered a type of arbitrary number given based on intuition. Can we trust these weights and will this provide good results?

Other descriptions on how weights can help are oversampling of groups that can cause disproportionate stratification, less variability, etc.

# Overall Quality = Quality * (numeric features)
all$Mod_year_qual <- all$YearBuilt*all$OverallQual      # overall condition
all$Mod_year_r_qual <- all$OverallQual*all$YearRemodAdd # quality x remodel
all$Mod_bsmt_qual <- all$OverallQual*all$TotalBsmtSF    # quality x basement size
all$Mod_livarea_qual <- all$OverallQual*all$GrLivArea   # quality x living area
all$Mod_qual_bath <- all$OverallQual*all$FullBath       # quality x baths
# Split Numeric and Categoric variables
num_features <- names(which(sapply(all, is.numeric)))
cat_features <- names(which(sapply(all, is.character)))

all_numeric <- all[num_features]
all_categoric <- all[cat_features]

3.3 Error Searching

# Fix NAs for Feature Engineered Variables
all$TotalArea[is.na(all$TotalArea)] <- median(all$TotalArea, na.rm = TRUE)

Create a table of the Fence and ScreenPorch variable.

kable(table(all$Fence))
Var1 Freq
GdPrv 118
GdWo 112
MnPrv 329
MnWw 12
None 2348
# kable(table(all$ScreenPorch))

After searching through our new and old variables, we want to remove any columns that looks uncorrelated. This can also be seen in the correlation plots above in section 2.

# Remove columns that do not look correlated
# Create correlation plots
all <- all[,!colnames(all) == "MiscVal"]
all <- all[,!colnames(all) == "PoolArea"]
all <- all[,!colnames(all) == "PoolQC"]
all <- all[,!colnames(all) == "X3SsnPorch"]
all <- all[,!colnames(all) == "MiscVal"]
all <- all[,!colnames(all) == "BsmtFinSF2"]
all <- all[,!colnames(all) == "MoSold"]
all <- all[,!colnames(all) == "ScreenPorch"]
all <- all[,!colnames(all) == "LowQualFinSF"]
all <- all[,!colnames(all) == "EnclosedPorch"]

Section 4: Modeling

One thing that I had trouble with beginning to model was understanding the type of variables that were being added in the model. I would get warnings or errors based on invalid variable types to not having the data in the testing model. For example, there would need to be the data point “NewHouse” in the testing model if we are going to use predict() based off of a logistic model.

I would also like to create different training/test datasets so I am always able to use a specific model and it’s values without having to go back and change the type or include/exclude certain data.

# Change characters to factors
all <- all %>%
  mutate_if(is.character, as.factor)

Create our train and test datasets.

# Initial partition to train main model
train <- all[1:1460,] # Main training model set
test <- all[1461:nrow(all),]

Used for a random forest model - Some models took too long to run so I wanted to make sure the data provided did not produce any errors in the end.

train_rf <- all[1:200,]

Create partitions based on proportion

inTrain <- createDataPartition(train$SalePrice, p = 0.60, list = FALSE)

training_part <- train[inTrain,]
test_part <- train[-inTrain,]

Random forest is a popular model which is a algorithm based off of trees. It’s an ensemble method that is considered a “black box” of methods because of it’s randomness in selecting features to use in each tree. If you have a weak computer with a large dataset, I would test it this algorithm on a small dataset before running it with large datasets.

# Create an aggregated variable to see how long the model would take to run.
start_time <- Sys.time()
set.seed(1001)
mod_rf <- train(SalePrice ~ ., data = test_part, method = "rf") # Used test only because I wanted a smaller dataset
end_time <- Sys.time()

paste("Total time it took to run the model was",difftime(end_time, start_time, units = "min"), "minutes.")
## [1] "Total time it took to run the model was 6.95568066835403 minutes."

Simple regression can also be used and be tested especially if we want to figure out a continuous variable. Here’s an example of a basic linear regression.

start_time2 <- Sys.time()
set.seed(1002)
mod_lm <- lm(SalePrice ~ LotFrontage + SaleType + HouseAge + OverallQualCond, data = training_part)
end_time2 <- Sys.time()
paste("Total time it took to run the model was",difftime(end_time2, start_time2, units = "min"), "minutes.")
## [1] "Total time it took to run the model was 0.0000664154688517253 minutes."

Was the linear regression model effective? We can test it on the testing dataset and use RMSE to measure the error.

# prediction_lm <- predict(mod_lm, newdata = test_part)
# rmse(log(test_part$SalePrice),log(prediction_lm))

Random Forest

Going back to random forest. I wanted to understand the hyperparameters of this algorithm. Here are some hyperparameters we could use: * nTree * mTry * maxnodes * nodesize

Other hyperparameters for train(): * trControl * tuneLength * importance * preProc * metric

Let’s attempt to tune a forest model. Tuning helps find the best hyperparameter inputs where the model could produce the best outputs. Remember that all this tuning is not necessary and keep in mind that “garbage in means garbage out”. Feature engineering in my opinion is the most effective way to produce an accurate model unless computational complexity and time is not an issue. Even then, good feature engineering + complex models would be better than bad feature engineering + complex models.

So here’s one way to tune it by creating loops of certain hyperparameter values. This could take some time but in the essence of practice, we will attempt this.

# Create control hyperparameter for rf tuning
control <- trainControl(method = "repeatedcv",
                        search = "random",
                        repeats = 3,
                        number = 10)

tunegrid <- expand.grid(.mtry = sqrt(ncol(training_part))) # store best mtry value
# Tune random forest model hyperparameters
# Evaluate the rf model with the default setting
set.seed(1003)
start_time3 <- Sys.time()
mod_forest <- train(SalePrice ~ .,
                data = training_part, # test dataset
                method = "rf",
                importance = TRUE
)
end_time3 <- Sys.time()
paste("Total time it took to run the model was",difftime(end_time3, start_time3, units = "min"), "minutes.")
## [1] "Total time it took to run the model was 15.6582596023877 minutes."

Include some visuals ofthe model including importance() to see which variables were significant.

print(mod_forest)
## Random Forest 
## 
## 878 samples
##  83 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 878, 878, 878, 878, 878, 878, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##     2   49749.07  0.8144015  30098.80
##   133   29633.34  0.8814876  18490.76
##   264   30403.03  0.8737648  18963.42
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 133.
plot(mod_forest)

# importance(mod_forest) # Could only use importance if we use the randomForest pkg
varImp(mod_forest) #varImpPlot if randomForest pkg
## rf variable importance
## 
##   only 20 most important variables shown (out of 264)
## 
##                  Overall
## Mod_livarea_qual  100.00
## Mod_bsmt_qual      69.95
## TotalFloorSF       57.10
## TotalArea          55.56
## Mod_year_qual      50.62
## Mod_qual_bath      50.32
## YearRemodAdd       49.34
## Mod_year_r_qual    46.54
## GrLivArea          45.49
## BsmtFinSF1         45.11
## GarageArea         43.84
## TotalBsmtSF        41.31
## HouseAge           40.97
## YearBuilt          40.89
## MSZoningRM         39.16
## X1stFlrSF          38.67
## GarageYrBlt        36.73
## KitchenQualGd      36.06
## MSSubClass         34.12
## KitchenQualTA      33.78

Find the best number of mtry

start_time4 <- Sys.time()
set.seed(9999)
tunegrid <- expand.grid(.mtry = seq(5,80,5)) # construct vector with values 1:20
rf_mtry <- train(SalePrice ~.,
                 data = training_part,
                 method = "rf",
                 tuneGrid = tunegrid,
                 trControl = control,
                 importance = TRUE
)
end_time4 <- Sys.time()
paste("Total time it took to run the model was",difftime(end_time4, start_time4, units = "min"), "minutes.")
## [1] "Total time it took to run the model was 46.1222561001778 minutes."
best_mtry <- rf_mtry$bestTune$mtry # best value of mtry is stored in here
best_mtry
## [1] 40

Largest value with best acc

tunegrid <- expand.grid(.mtry = best_mtry) # store best mtry value

Find the best number of maxnodes

store_maxnode <- list() # results of the model will be stored here
start_time5 <- Sys.time()
for (maxnodes in c(3:30)) { # compute the model with values of maxnodes starting from 3:30
  set.seed(10000)
  rf_maxnode <- train(SalePrice ~ .,
                      data = training_part,
                      method = "rf",
                      tuneGrid = tunegrid,
                      trControl = control,
                      importance = TRUE,
                      maxnodes = maxnodes
                      # ntree = 500
  )
  current_iteration <- toString(maxnodes) # store as a string variable the value of maxnode
  store_maxnode[[current_iteration]] <- rf_maxnode # save result of the model in the list
}
end_time5 <- Sys.time()
results_mtry <- resamples(store_maxnode) # arrange the results of the model
summary(results_mtry) # print summary of all the combinations
## 
## Call:
## summary.resamples(object = results_mtry)
## 
## Models: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 
## Number of resamples: 30 
## 
## MAE 
##        Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 3  24330.69 26392.85 28706.44 29709.51 33463.71 36430.05    0
## 4  22682.04 24915.79 26193.83 27032.99 28846.53 32208.99    0
## 5  21435.07 23347.76 24658.82 25405.81 27343.13 30336.57    0
## 6  20700.88 22329.75 24047.60 24659.10 26994.81 29284.06    0
## 7  19833.36 21963.79 23290.09 23780.21 25682.99 28529.53    0
## 8  19872.69 21714.14 23308.42 23407.76 24970.31 27913.95    0
## 9  19431.22 21327.85 22787.05 22855.53 24652.34 27035.41    0
## 10 19241.43 20892.32 22410.41 22458.78 24096.82 26928.59    0
## 11 19017.23 20469.37 22054.66 22112.31 23968.93 26235.71    0
## 12 18396.70 20111.11 21779.15 21898.13 23697.96 26289.22    0
## 13 18379.91 20044.54 21354.85 21581.77 23099.88 25931.63    0
## 14 18099.93 19733.22 21215.93 21373.98 22804.27 25799.11    0
## 15 18046.55 19627.78 20968.47 21148.66 22507.42 25198.27    0
## 16 18123.53 19598.25 20749.66 21064.58 22422.09 25319.14    0
## 17 17843.15 19588.31 20671.27 20908.23 22284.06 25400.00    0
## 18 17834.87 19161.38 20446.29 20724.90 22155.53 24917.03    0
## 19 17677.05 19100.83 20467.04 20586.21 21905.87 24984.63    0
## 20 17430.48 19004.77 20336.74 20460.57 21699.40 24523.46    0
## 21 17477.99 18846.08 20420.98 20337.92 21679.87 24543.89    0
## 22 17479.60 18579.22 20137.58 20230.20 21558.41 24477.90    0
## 23 17374.29 18560.81 20084.97 20134.90 21288.02 24255.21    0
## 24 17027.40 18488.21 19817.51 20036.35 21154.29 24243.51    0
## 25 16677.85 18520.13 19714.59 19914.12 21032.34 24194.69    0
## 26 17114.20 18308.69 19571.46 19844.39 21040.56 24227.90    0
## 27 16800.59 18341.25 19772.42 19853.38 21135.36 24199.63    0
## 28 16537.17 18246.24 19507.30 19754.27 21044.91 24024.05    0
## 29 16656.50 18324.68 19386.33 19682.83 20991.65 24037.73    0
## 30 16913.35 18262.52 19478.83 19634.02 20854.09 23823.28    0
## 
## RMSE 
##        Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 3  33605.61 36472.36 46171.10 48423.62 59819.62 71504.00    0
## 4  30875.62 34611.93 39743.26 41023.86 45616.55 57421.85    0
## 5  28813.54 32808.47 37648.46 39519.58 45011.09 55988.68    0
## 6  27406.15 31745.95 36641.43 38279.28 43204.62 54669.82    0
## 7  27008.43 30563.03 34691.24 36554.35 41662.16 53384.41    0
## 8  26733.76 30131.35 34397.91 34796.27 40441.58 45177.40    0
## 9  26312.58 29323.38 33518.29 34221.59 39690.84 44853.33    0
## 10 25675.72 28586.09 33447.42 33884.23 39335.15 44509.27    0
## 11 24805.61 28049.75 32715.32 33372.72 39153.04 43569.79    0
## 12 24366.82 27871.67 32441.05 33066.15 39115.05 43683.45    0
## 13 24153.12 27451.31 31884.60 32561.06 38399.10 43780.04    0
## 14 23951.45 26941.05 31455.27 32050.23 36801.63 43243.36    0
## 15 24315.87 27033.10 30845.86 31494.50 36367.29 41770.50    0
## 16 23632.10 26930.57 30234.45 31169.06 35469.54 39991.10    0
## 17 23846.31 26753.73 30704.64 31153.15 35898.06 39498.53    0
## 18 23922.51 26626.51 30182.46 30964.64 35213.18 40282.48    0
## 19 23421.13 26508.52 30179.03 30763.85 34825.79 39628.97    0
## 20 23557.89 26358.14 30211.52 30607.71 34835.45 39213.51    0
## 21 23418.40 26075.39 29651.12 30454.99 34640.67 39634.82    0
## 22 23316.24 25783.72 29744.39 30373.11 35051.99 38986.71    0
## 23 23027.14 25975.45 29545.72 30236.05 35086.05 39260.44    0
## 24 22924.88 25801.93 29342.67 30010.15 34712.42 39886.80    0
## 25 22696.01 25640.27 28835.04 29751.89 34457.56 38518.00    0
## 26 22835.78 25486.60 28593.15 29588.33 34345.24 37702.73    0
## 27 23046.51 25411.51 28982.43 29604.19 33935.25 37995.77    0
## 28 22874.31 25794.79 28255.79 29421.05 33428.76 37966.79    0
## 29 22657.51 25353.74 28432.48 29436.07 33688.28 38386.84    0
## 30 22865.91 25364.82 28432.96 29329.02 33640.55 38012.29    0
## 
## Rsquared 
##         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 3  0.5796496 0.6845099 0.7374814 0.7279922 0.7612040 0.8641782    0
## 4  0.6712924 0.7903859 0.8228302 0.8154427 0.8472560 0.8946657    0
## 5  0.6997451 0.8050739 0.8309389 0.8262620 0.8477308 0.9081267    0
## 6  0.7199763 0.8098131 0.8327009 0.8300692 0.8554024 0.9058329    0
## 7  0.7258413 0.8161144 0.8414614 0.8370847 0.8618708 0.9181125    0
## 8  0.7288277 0.8284249 0.8579893 0.8518710 0.8779208 0.9109385    0
## 9  0.7486820 0.8343482 0.8666426 0.8564015 0.8784599 0.9136328    0
## 10 0.7439521 0.8332575 0.8656597 0.8576308 0.8808872 0.9095247    0
## 11 0.7503169 0.8397417 0.8685301 0.8605824 0.8876341 0.9103198    0
## 12 0.7589720 0.8413797 0.8692904 0.8625843 0.8890018 0.9174475    0
## 13 0.7672940 0.8480747 0.8703215 0.8652565 0.8899703 0.9193817    0
## 14 0.7751405 0.8471256 0.8752343 0.8675962 0.8918611 0.9170080    0
## 15 0.7717144 0.8545221 0.8768034 0.8715162 0.8943168 0.9191054    0
## 16 0.7806857 0.8526411 0.8767857 0.8728789 0.8972333 0.9235007    0
## 17 0.7768783 0.8573965 0.8807666 0.8731725 0.8980903 0.9190218    0
## 18 0.7837237 0.8578618 0.8797686 0.8743383 0.8990742 0.9154702    0
## 19 0.7893467 0.8566647 0.8817142 0.8754172 0.8995039 0.9225871    0
## 20 0.7965325 0.8570633 0.8796508 0.8763696 0.9004599 0.9178893    0
## 21 0.7911410 0.8583008 0.8833285 0.8774807 0.8992841 0.9218667    0
## 22 0.7940618 0.8590421 0.8869511 0.8782952 0.8979896 0.9225422    0
## 23 0.7944356 0.8622902 0.8837015 0.8788190 0.9007152 0.9199747    0
## 24 0.8043073 0.8619158 0.8863002 0.8802340 0.9011828 0.9220419    0
## 25 0.7967734 0.8661965 0.8886745 0.8816267 0.9039294 0.9232574    0
## 26 0.8004329 0.8663074 0.8865630 0.8825278 0.9060013 0.9231018    0
## 27 0.8003506 0.8659602 0.8843933 0.8817429 0.9059095 0.9238067    0
## 28 0.8076826 0.8643306 0.8862904 0.8831895 0.9031490 0.9266424    0
## 29 0.8131615 0.8680206 0.8879340 0.8833469 0.9057572 0.9258199    0
## 30 0.8000701 0.8690470 0.8866398 0.8838552 0.9084561 0.9266926    0

Find the best number of ntrees

store_maxtrees <- list()
start_time6 <- Sys.time()
for (ntree in c(250, 500, 800, 1000, 2000, 2200, 2400, 2600)) {
  set.seed(5678)
  rf_maxtrees <- train(SalePrice ~ .,
                       data = training_part,
                       method = "rf",
                       tuneGrid = tunegrid,
                       trControl = control,
                       importance = TRUE,
                       maxnodes = 24, # find best maxnodes
                       ntree = ntree)
  key <- toString(ntree)
  store_maxtrees[[key]] <- rf_maxtrees
}
end6 <- Sys.time()
results_tree <- resamples(store_maxtrees)
summary(results_tree)
## 
## Call:
## summary.resamples(object = results_tree)
## 
## Models: 250, 500, 800, 1000, 2000, 2200, 2400, 2600 
## Number of resamples: 30 
## 
## MAE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 250  17007.79 18910.43 19769.81 19927.56 21050.58 22346.94    0
## 500  16857.48 18857.04 19800.60 19882.95 20885.04 22383.91    0
## 800  16764.09 18820.59 19821.81 19876.63 20953.73 22350.02    0
## 1000 16794.98 18823.67 19781.79 19873.00 20927.34 22305.02    0
## 2000 16634.70 18770.25 19822.32 19860.44 20987.06 22310.13    0
## 2200 16617.22 18798.67 19807.06 19860.61 20986.45 22323.33    0
## 2400 16613.51 18786.41 19792.92 19859.59 20992.24 22294.80    0
## 2600 16628.85 18782.41 19801.94 19864.64 20958.12 22295.55    0
## 
## RMSE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 250  22967.94 27689.18 29618.03 29923.72 31954.95 34799.18    0
## 500  22385.91 27654.27 29819.94 29849.93 32145.31 34996.82    0
## 800  22497.97 27669.53 29847.87 29867.10 32194.15 34891.56    0
## 1000 22545.87 27608.24 29910.41 29875.75 32295.63 34902.36    0
## 2000 22263.27 27646.87 29796.33 29869.52 32450.39 34748.44    0
## 2200 22184.97 27611.56 29865.23 29857.39 32475.19 34756.06    0
## 2400 22165.63 27607.07 29850.66 29859.37 32472.81 34780.29    0
## 2600 22173.70 27639.87 29860.21 29866.91 32501.65 34733.54    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 250  0.8102113 0.8611855 0.8866613 0.8830814 0.9069482 0.9237795    0
## 500  0.8098036 0.8603975 0.8854948 0.8836829 0.9087686 0.9235772    0
## 800  0.8096186 0.8611959 0.8866005 0.8836711 0.9082843 0.9238712    0
## 1000 0.8088374 0.8602644 0.8868049 0.8836105 0.9080757 0.9243142    0
## 2000 0.8078529 0.8593579 0.8865337 0.8837247 0.9080870 0.9230671    0
## 2200 0.8083910 0.8592711 0.8866915 0.8838621 0.9081223 0.9234693    0
## 2400 0.8088908 0.8590048 0.8867249 0.8838590 0.9079578 0.9237357    0
## 2600 0.8082881 0.8593761 0.8868374 0.8837730 0.9078159 0.9236367    0

Test out the new hyperparameters in one tuned model

set.seed(19151)
start_time7 <- Sys.time()
mod_rf_tuned <- train(SalePrice ~ .,
                      data = training_part,
                      method = "rf",
                      trControl = control,
                      importance = TRUE,
                      ntree = 1000,
                      maxnodes = 20
)
end_time7 <- Sys.time()
paste("Total time it took to run the ntree was",difftime(end_time5, start_time7, units = "mins"), "minutes.")
## [1] "Total time it took to run the ntree was -15.6806159337362 minutes."

Include summary statistics of the new model.

print(mod_rf_tuned)
## Random Forest 
## 
## 878 samples
##  83 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 791, 790, 790, 790, 790, 790, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    25   31215.62  0.8798028  20687.38
##   259   30596.88  0.8656499  20538.29
##   260   30627.10  0.8654599  20532.17
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 259.
plot(mod_rf_tuned, main = "Tuned RF model")

varImp(mod_rf_tuned)
## rf variable importance
## 
##   only 20 most important variables shown (out of 264)
## 
##                  Overall
## Mod_livarea_qual  100.00
## Mod_bsmt_qual      63.04
## MSZoningRM         27.68
## TotRmsAbvGrd       26.69
## YearRemodAdd       25.44
## BsmtFinSF1         24.95
## Mod_qual_bath      24.72
## Mod_year_qual      23.09
## TotalArea          21.12
## GarageCars         19.92
## KitchenQualGd      19.87
## Mod_year_r_qual    19.08
## YearBuilt          18.94
## TotalFloorSF       18.87
## TotalBsmtSF        18.29
## HouseAge           18.19
## GarageArea         17.00
## KitchenQualTA      16.28
## GarageTypeAttchd   16.09
## GrLivArea          15.14

What if we decided to preprocess the data as well? We would need to see which variables we would want to drop. The earlier variables which looked like have a lot of other data or data that looked like it would not assist in the model.

Create new dataset without the dropped variables

all2 <- all
drop <- c("MSZoning", "Utilities", "Condition2", "RoofStyle",
          "RoofMatl", "Exterior1st", "HeatingQC", "Functional",
          "MiscFeature", "Exterior2nd")
all2 <- all2[,!names(all2) %in% drop]

Now we have to create new datasets without the variables.

# Initial partition to train model
train2 <- all2[1:1460,]
test2 <- all2[1461:nrow(all2),]
# Create testing partitions
inTrain2 <- createDataPartition(train2$SalePrice, p = 0.75, list = FALSE)
training_part2 <- train2[inTrain2,]
test_part2 <- train2[-inTrain2,]
set.seed(95959)
start_time8 <- Sys.time()
mod_rf_removed_var <- train(SalePrice ~ .,
                       data = training_part2,
                       method = "rf",
                       trControl = control,
                       importance = TRUE,
                       preProc = c("center", "scale")
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix,
## SaleTypeCWD

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix,
## SaleTypeCWD

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix,
## SaleTypeCWD
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix, SaleTypeCon
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ExterCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ExterCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ExterCondPo, ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ExterCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ExterCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ExterCondPo, ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix, SaleTypeCWD

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix, SaleTypeCWD

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix, SaleTypeCWD
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix,
## SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix,
## SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix,
## SaleTypeCon
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: NeighborhoodBlueste,
## ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ExterCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ExterCondPo, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ExterCondPo, ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: LotConfigFR3, ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix,
## SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix,
## SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: BsmtCondPo, ElectricalMix,
## SaleTypeCon
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: ElectricalMix
end_time8 <- Sys.time()
paste("Total time it took to run the model was",difftime(end_time8, start_time8, units = "mins"), "minutes.")
## [1] "Total time it took to run the model was 20.6997025330861 minutes."

Include the model performance.

print(mod_rf_removed_var)
## Random Forest 
## 
## 1097 samples
##   73 predictor
## 
## Pre-processing: centered (194), scaled (194) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 987, 987, 988, 987, 988, 987, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##    60   26602.79  0.8766519  16717.81
##   105   26635.42  0.8757603  16765.55
##   150   26964.73  0.8722806  16955.89
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 60.
plot(mod_rf_removed_var, main = "Tuned RF model")

varImp(mod_rf_removed_var)
## rf variable importance
## 
##   only 20 most important variables shown (out of 194)
## 
##                  Overall
## Mod_livarea_qual  100.00
## BsmtFinSF1         83.61
## Mod_bsmt_qual      75.91
## TotalFloorSF       71.35
## TotalArea          68.61
## Mod_year_r_qual    67.42
## Mod_year_qual      64.56
## GrLivArea          62.29
## TotalBsmtSF        62.29
## GarageArea         62.06
## X1stFlrSF          59.02
## YearRemodAdd       52.10
## HouseAge           51.61
## OverallQualCond    51.04
## Mod_qual_bath      49.69
## Fireplaces         47.44
## MSSubClass         45.77
## YearBuilt          43.56
## GarageYrBlt        43.56
## OverallCond        43.08
mod_rf_lastTune <- train(SalePrice ~ .,
                            data = training_part,
                            method = "rf",
                            trControl = control,
                            importance = TRUE,
                            preProc = c("center", "scale"),
                            ntree = 1001
)
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, ExterCondPo, HeatingQCPo, FunctionalOth, MiscFeatureTenC,
## SaleConditionAdjLand

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, ExterCondPo, HeatingQCPo, FunctionalOth, MiscFeatureTenC,
## SaleConditionAdjLand

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, ExterCondPo, HeatingQCPo, FunctionalOth, MiscFeatureTenC,
## SaleConditionAdjLand
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, Exterior1stCBlock,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock, Exterior2ndOther,
## FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, Exterior1stCBlock,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock, Exterior2ndOther,
## FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, Exterior1stCBlock,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock, Exterior2ndOther,
## FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, BsmtCondPo,
## ElectricalMix, FunctionalOth, MiscFeatureOthr, MiscFeatureTenC, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, BsmtCondPo,
## ElectricalMix, FunctionalOth, MiscFeatureOthr, MiscFeatureTenC, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, BsmtCondPo,
## ElectricalMix, FunctionalOth, MiscFeatureOthr, MiscFeatureTenC, SaleTypeCon
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stAsphShn,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stAsphShn,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stAsphShn,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMembran, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMembran, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMembran, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior1stStone, Exterior2ndOther, FunctionalOth,
## FunctionalSev, MiscFeatureTenC, SaleTypeOth

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior1stStone, Exterior2ndOther, FunctionalOth,
## FunctionalSev, MiscFeatureTenC, SaleTypeOth

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior1stStone, Exterior2ndOther, FunctionalOth,
## FunctionalSev, MiscFeatureTenC, SaleTypeOth
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRAn, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRAn, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRAn, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## StreetPave, Condition1RRNe, Condition2Feedr, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## StreetPave, Condition1RRNe, Condition2Feedr, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## StreetPave, Condition1RRNe, Condition2Feedr, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosA, Condition2RRAe, RoofStyleShed, RoofMatlMetal, RoofMatlRoll,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosA, Condition2RRAe, RoofStyleShed, RoofMatlMetal, RoofMatlRoll,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosA, Condition2RRAe, RoofStyleShed, RoofMatlMetal, RoofMatlRoll,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior1stStone, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior1stStone, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior1stStone, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FoundationWood, FunctionalOth,
## FunctionalSev, MiscFeatureTenC, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FoundationWood, FunctionalOth,
## FunctionalSev, MiscFeatureTenC, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FoundationWood, FunctionalOth,
## FunctionalSev, MiscFeatureTenC, SaleTypeCon
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMembran, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMembran, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMembran, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, Condition2RRAn, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, BsmtCondPo,
## ElectricalMix, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, Condition2RRAn, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, BsmtCondPo,
## ElectricalMix, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, Condition2RRAn, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, BsmtCondPo,
## ElectricalMix, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMetal, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, HeatingQCPo, FunctionalOth,
## MiscFeatureOthr, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMetal, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, HeatingQCPo, FunctionalOth,
## MiscFeatureOthr, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlMetal, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, HeatingQCPo, FunctionalOth,
## MiscFeatureOthr, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, StreetPave,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stCBlock,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock, Exterior2ndOther,
## ExterCondPo, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, StreetPave,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stCBlock,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock, Exterior2ndOther,
## ExterCondPo, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, StreetPave,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stCBlock,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock, Exterior2ndOther,
## ExterCondPo, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2PosA, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2PosA, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2PosA, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2Feedr, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleConditionAdjLand

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2Feedr, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleConditionAdjLand

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2Feedr, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleConditionAdjLand
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stAsphShn, Exterior1stImStucc, Exterior1stOther, Exterior2ndOther,
## FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stAsphShn, Exterior1stImStucc, Exterior1stOther, Exterior2ndOther,
## FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stAsphShn, Exterior1stImStucc, Exterior1stOther, Exterior2ndOther,
## FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlRoll, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleTypeOth

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlRoll, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleTypeOth

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, RoofMatlRoll, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleTypeOth
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, BsmtCondPo, HeatingQCPo, ElectricalMix, FunctionalOth,
## FunctionalSev, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, BsmtCondPo, HeatingQCPo, ElectricalMix, FunctionalOth,
## FunctionalSev, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, BsmtCondPo, HeatingQCPo, ElectricalMix, FunctionalOth,
## FunctionalSev, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, RoofStyleShed, RoofMatlMetal,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FoundationWood,
## FunctionalOth, MiscFeatureTenC, SaleConditionAdjLand

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, RoofStyleShed, RoofMatlMetal,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FoundationWood,
## FunctionalOth, MiscFeatureTenC, SaleConditionAdjLand

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosN, Condition2RRAe, RoofStyleShed, RoofMatlMetal,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FoundationWood,
## FunctionalOth, MiscFeatureTenC, SaleConditionAdjLand
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosA, Condition2RRAe, RoofStyleShed, RoofMatlRoll,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosA, Condition2RRAe, RoofStyleShed, RoofMatlRoll,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2PosA, Condition2RRAe, RoofStyleShed, RoofMatlRoll,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior1stStone, Exterior2ndOther, FunctionalOth, MiscFeatureOthr,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior1stStone, Exterior2ndOther, FunctionalOth, MiscFeatureOthr,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior1stStone, Exterior2ndOther, FunctionalOth, MiscFeatureOthr,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC, SaleTypeCon

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19,
## uniqueCut = 10, : These variables have zero variances: MSZoningOTH,
## NeighborhoodBlueste, Condition1RRNe, Condition2RRAe, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC, SaleTypeCon
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stAsphShn,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stAsphShn,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, UtilitiesNoSeWa,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stAsphShn,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, StreetPave,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleTypeOth

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, StreetPave,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleTypeOth

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, StreetPave,
## Condition1RRNe, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC,
## SaleTypeOth
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2Feedr, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2Feedr, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2Feedr, Condition2RRAe, RoofStyleShed, Exterior1stImStucc,
## Exterior1stOther, Exterior2ndOther, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, RoofMatlMembran,
## Exterior1stCBlock, Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock,
## Exterior2ndOther, ExterCondPo, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, RoofMatlMembran,
## Exterior1stCBlock, Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock,
## Exterior2ndOther, ExterCondPo, FunctionalOth, MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, Condition2RRNn, RoofStyleShed, RoofMatlMembran,
## Exterior1stCBlock, Exterior1stImStucc, Exterior1stOther, Exterior2ndCBlock,
## Exterior2ndOther, ExterCondPo, FunctionalOth, MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2RRAe, Condition2RRAn, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2RRAe, Condition2RRAn, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, LotConfigFR3,
## Condition1RRNe, Condition2RRAe, Condition2RRAn, RoofStyleShed,
## Exterior1stImStucc, Exterior1stOther, Exterior2ndOther, FunctionalOth,
## MiscFeatureTenC
## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: MSZoningOTH, Condition1RRNe,
## Condition2RRAe, RoofStyleShed, Exterior1stImStucc, Exterior1stOther,
## Exterior2ndOther, FunctionalOth, MiscFeatureTenC

Here’s some of the times the models took to run all grouped up together.

# paste("Total time it took to run the RF model was",difftime(end_time3, start_time3, units = "mins"), "minutes.")       # First RF model
# paste("Total time it took to run the maxnodes model was",difftime(end_time4, start_time4, units = "mins"), "minutes.")   # maxnodes
# paste("Total time it took to run the nodes was",difftime(end_time5, start_time5, units = "mins"), "minutes.")    # nodes
# paste("Total time it took to run the ntree was",difftime(end_time6, start_time6, units = "mins"), "minutes.")            # ntrees
# paste("Total time it took to run the RF model was",difftime(end_time7, start_time7, units = "mins"), "minutes.")            # Tuned RF model
# paste("Total time it took to run the 2nd Pre-processed RF model was",difftime(end_time8, start_time8, units = "mins"), "minutes.") # 2nd Pre-processed RF model

Other hyperparameter summaries

# In case I miss the summary
results_tree <- resamples(store_maxtrees)
summary(results_tree)
## 
## Call:
## summary.resamples(object = results_tree)
## 
## Models: 250, 500, 800, 1000, 2000, 2200, 2400, 2600 
## Number of resamples: 30 
## 
## MAE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 250  17007.79 18910.43 19769.81 19927.56 21050.58 22346.94    0
## 500  16857.48 18857.04 19800.60 19882.95 20885.04 22383.91    0
## 800  16764.09 18820.59 19821.81 19876.63 20953.73 22350.02    0
## 1000 16794.98 18823.67 19781.79 19873.00 20927.34 22305.02    0
## 2000 16634.70 18770.25 19822.32 19860.44 20987.06 22310.13    0
## 2200 16617.22 18798.67 19807.06 19860.61 20986.45 22323.33    0
## 2400 16613.51 18786.41 19792.92 19859.59 20992.24 22294.80    0
## 2600 16628.85 18782.41 19801.94 19864.64 20958.12 22295.55    0
## 
## RMSE 
##          Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 250  22967.94 27689.18 29618.03 29923.72 31954.95 34799.18    0
## 500  22385.91 27654.27 29819.94 29849.93 32145.31 34996.82    0
## 800  22497.97 27669.53 29847.87 29867.10 32194.15 34891.56    0
## 1000 22545.87 27608.24 29910.41 29875.75 32295.63 34902.36    0
## 2000 22263.27 27646.87 29796.33 29869.52 32450.39 34748.44    0
## 2200 22184.97 27611.56 29865.23 29857.39 32475.19 34756.06    0
## 2400 22165.63 27607.07 29850.66 29859.37 32472.81 34780.29    0
## 2600 22173.70 27639.87 29860.21 29866.91 32501.65 34733.54    0
## 
## Rsquared 
##           Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 250  0.8102113 0.8611855 0.8866613 0.8830814 0.9069482 0.9237795    0
## 500  0.8098036 0.8603975 0.8854948 0.8836829 0.9087686 0.9235772    0
## 800  0.8096186 0.8611959 0.8866005 0.8836711 0.9082843 0.9238712    0
## 1000 0.8088374 0.8602644 0.8868049 0.8836105 0.9080757 0.9243142    0
## 2000 0.8078529 0.8593579 0.8865337 0.8837247 0.9080870 0.9230671    0
## 2200 0.8083910 0.8592711 0.8866915 0.8838621 0.9081223 0.9234693    0
## 2400 0.8088908 0.8590048 0.8867249 0.8838590 0.9079578 0.9237357    0
## 2600 0.8082881 0.8593761 0.8868374 0.8837730 0.9078159 0.9236367    0
results_mtry <- resamples(store_maxnode) # arrange the results of the model
summary(results_mtry) # print summary of all the combinations
## 
## Call:
## summary.resamples(object = results_mtry)
## 
## Models: 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30 
## Number of resamples: 30 
## 
## MAE 
##        Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 3  24330.69 26392.85 28706.44 29709.51 33463.71 36430.05    0
## 4  22682.04 24915.79 26193.83 27032.99 28846.53 32208.99    0
## 5  21435.07 23347.76 24658.82 25405.81 27343.13 30336.57    0
## 6  20700.88 22329.75 24047.60 24659.10 26994.81 29284.06    0
## 7  19833.36 21963.79 23290.09 23780.21 25682.99 28529.53    0
## 8  19872.69 21714.14 23308.42 23407.76 24970.31 27913.95    0
## 9  19431.22 21327.85 22787.05 22855.53 24652.34 27035.41    0
## 10 19241.43 20892.32 22410.41 22458.78 24096.82 26928.59    0
## 11 19017.23 20469.37 22054.66 22112.31 23968.93 26235.71    0
## 12 18396.70 20111.11 21779.15 21898.13 23697.96 26289.22    0
## 13 18379.91 20044.54 21354.85 21581.77 23099.88 25931.63    0
## 14 18099.93 19733.22 21215.93 21373.98 22804.27 25799.11    0
## 15 18046.55 19627.78 20968.47 21148.66 22507.42 25198.27    0
## 16 18123.53 19598.25 20749.66 21064.58 22422.09 25319.14    0
## 17 17843.15 19588.31 20671.27 20908.23 22284.06 25400.00    0
## 18 17834.87 19161.38 20446.29 20724.90 22155.53 24917.03    0
## 19 17677.05 19100.83 20467.04 20586.21 21905.87 24984.63    0
## 20 17430.48 19004.77 20336.74 20460.57 21699.40 24523.46    0
## 21 17477.99 18846.08 20420.98 20337.92 21679.87 24543.89    0
## 22 17479.60 18579.22 20137.58 20230.20 21558.41 24477.90    0
## 23 17374.29 18560.81 20084.97 20134.90 21288.02 24255.21    0
## 24 17027.40 18488.21 19817.51 20036.35 21154.29 24243.51    0
## 25 16677.85 18520.13 19714.59 19914.12 21032.34 24194.69    0
## 26 17114.20 18308.69 19571.46 19844.39 21040.56 24227.90    0
## 27 16800.59 18341.25 19772.42 19853.38 21135.36 24199.63    0
## 28 16537.17 18246.24 19507.30 19754.27 21044.91 24024.05    0
## 29 16656.50 18324.68 19386.33 19682.83 20991.65 24037.73    0
## 30 16913.35 18262.52 19478.83 19634.02 20854.09 23823.28    0
## 
## RMSE 
##        Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## 3  33605.61 36472.36 46171.10 48423.62 59819.62 71504.00    0
## 4  30875.62 34611.93 39743.26 41023.86 45616.55 57421.85    0
## 5  28813.54 32808.47 37648.46 39519.58 45011.09 55988.68    0
## 6  27406.15 31745.95 36641.43 38279.28 43204.62 54669.82    0
## 7  27008.43 30563.03 34691.24 36554.35 41662.16 53384.41    0
## 8  26733.76 30131.35 34397.91 34796.27 40441.58 45177.40    0
## 9  26312.58 29323.38 33518.29 34221.59 39690.84 44853.33    0
## 10 25675.72 28586.09 33447.42 33884.23 39335.15 44509.27    0
## 11 24805.61 28049.75 32715.32 33372.72 39153.04 43569.79    0
## 12 24366.82 27871.67 32441.05 33066.15 39115.05 43683.45    0
## 13 24153.12 27451.31 31884.60 32561.06 38399.10 43780.04    0
## 14 23951.45 26941.05 31455.27 32050.23 36801.63 43243.36    0
## 15 24315.87 27033.10 30845.86 31494.50 36367.29 41770.50    0
## 16 23632.10 26930.57 30234.45 31169.06 35469.54 39991.10    0
## 17 23846.31 26753.73 30704.64 31153.15 35898.06 39498.53    0
## 18 23922.51 26626.51 30182.46 30964.64 35213.18 40282.48    0
## 19 23421.13 26508.52 30179.03 30763.85 34825.79 39628.97    0
## 20 23557.89 26358.14 30211.52 30607.71 34835.45 39213.51    0
## 21 23418.40 26075.39 29651.12 30454.99 34640.67 39634.82    0
## 22 23316.24 25783.72 29744.39 30373.11 35051.99 38986.71    0
## 23 23027.14 25975.45 29545.72 30236.05 35086.05 39260.44    0
## 24 22924.88 25801.93 29342.67 30010.15 34712.42 39886.80    0
## 25 22696.01 25640.27 28835.04 29751.89 34457.56 38518.00    0
## 26 22835.78 25486.60 28593.15 29588.33 34345.24 37702.73    0
## 27 23046.51 25411.51 28982.43 29604.19 33935.25 37995.77    0
## 28 22874.31 25794.79 28255.79 29421.05 33428.76 37966.79    0
## 29 22657.51 25353.74 28432.48 29436.07 33688.28 38386.84    0
## 30 22865.91 25364.82 28432.96 29329.02 33640.55 38012.29    0
## 
## Rsquared 
##         Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 3  0.5796496 0.6845099 0.7374814 0.7279922 0.7612040 0.8641782    0
## 4  0.6712924 0.7903859 0.8228302 0.8154427 0.8472560 0.8946657    0
## 5  0.6997451 0.8050739 0.8309389 0.8262620 0.8477308 0.9081267    0
## 6  0.7199763 0.8098131 0.8327009 0.8300692 0.8554024 0.9058329    0
## 7  0.7258413 0.8161144 0.8414614 0.8370847 0.8618708 0.9181125    0
## 8  0.7288277 0.8284249 0.8579893 0.8518710 0.8779208 0.9109385    0
## 9  0.7486820 0.8343482 0.8666426 0.8564015 0.8784599 0.9136328    0
## 10 0.7439521 0.8332575 0.8656597 0.8576308 0.8808872 0.9095247    0
## 11 0.7503169 0.8397417 0.8685301 0.8605824 0.8876341 0.9103198    0
## 12 0.7589720 0.8413797 0.8692904 0.8625843 0.8890018 0.9174475    0
## 13 0.7672940 0.8480747 0.8703215 0.8652565 0.8899703 0.9193817    0
## 14 0.7751405 0.8471256 0.8752343 0.8675962 0.8918611 0.9170080    0
## 15 0.7717144 0.8545221 0.8768034 0.8715162 0.8943168 0.9191054    0
## 16 0.7806857 0.8526411 0.8767857 0.8728789 0.8972333 0.9235007    0
## 17 0.7768783 0.8573965 0.8807666 0.8731725 0.8980903 0.9190218    0
## 18 0.7837237 0.8578618 0.8797686 0.8743383 0.8990742 0.9154702    0
## 19 0.7893467 0.8566647 0.8817142 0.8754172 0.8995039 0.9225871    0
## 20 0.7965325 0.8570633 0.8796508 0.8763696 0.9004599 0.9178893    0
## 21 0.7911410 0.8583008 0.8833285 0.8774807 0.8992841 0.9218667    0
## 22 0.7940618 0.8590421 0.8869511 0.8782952 0.8979896 0.9225422    0
## 23 0.7944356 0.8622902 0.8837015 0.8788190 0.9007152 0.9199747    0
## 24 0.8043073 0.8619158 0.8863002 0.8802340 0.9011828 0.9220419    0
## 25 0.7967734 0.8661965 0.8886745 0.8816267 0.9039294 0.9232574    0
## 26 0.8004329 0.8663074 0.8865630 0.8825278 0.9060013 0.9231018    0
## 27 0.8003506 0.8659602 0.8843933 0.8817429 0.9059095 0.9238067    0
## 28 0.8076826 0.8643306 0.8862904 0.8831895 0.9031490 0.9266424    0
## 29 0.8131615 0.8680206 0.8879340 0.8833469 0.9057572 0.9258199    0
## 30 0.8000701 0.8690470 0.8866398 0.8838552 0.9084561 0.9266926    0
best_mtry <- rf_mtry$bestTune$mtry # best value of mtry is stored in here
best_mtry
## [1] 40

Section 5: Prediction and Validation

# Predict and test RMSE
prediction_rf2 <- predict(mod_rf_tuned, newdata = test_part)
rmse(log(test_part$SalePrice),log(prediction_rf2))
## [1] 0.1749464
ggplot(test_part, aes(x = SalePrice, y = prediction_rf2)) +
  geom_point(shape = 1) +    # Use hollow circles
  geom_smooth(method = lm)   # Add linear regression line

# Predict and test RMSE (removed var)
prediction_rf <- predict(mod_rf_removed_var, newdata = test_part2)
rmse(log(test_part2$SalePrice),log(prediction_rf))
## [1] 0.1575947
ggplot(test_part2, aes(x = SalePrice, y = prediction_rf)) +
  geom_point(shape = 1) +    # Use hollow circles
  geom_smooth(method = lm)   # Add linear regression line

Submit prediction values into CSV.

# Submit new values into test dataset
# test$submission <- prediction_rf2

Write new submission prediction values into file.

# Create new dataframe with submission values
# submission <- data.frame(Id = test$Id, SalePrice = test$submission)
# write.csv(submission, file = ".../Kaggle/House Prices/submission2.csv", row.names = FALSE)