IS605 Final

CUNY MSDA DATA 605

Tom Detzel, May 20, 2018




Modeling



Step 1: Data Preparation

We’ll get rid of variables with more than 80 percent missing data; for the most part these aren’t important real estate features in Ames, Iowa, where these homes sold. For other missing data, we’ll impute the mean or mode.

1.1 Find what’s missing

# find percent missing
pMiss <- function(x){sum(is.na(x))/length(x)*100}

missing_train <- data.frame(cbind(apply(train,2,pMiss)))
missing_train$Feature <- rownames(missing_train)
colnames(missing_train)[1] <- 'Value'

missing_test <- data.frame(cbind(apply(test,2,pMiss)))
missing_test$Feature <- rownames(missing_test)
colnames(missing_test)[1] <- 'Value'

missing <- cbind(missing_test[,1], missing_train[-81,1])
rownames(missing) <- rownames(missing_test)
colnames(missing) <- c('Test', 'Train')
missing <- data.frame(missing)

pander(missing[with(missing, order(-Train)), ][1:5,], caption='Most missing data (Pct)')
Most missing data (Pct)

                 Test    Train
PoolQC          99.79    99.52
MiscFeature     96.50    96.30
Alley           92.67    93.77
Fence           80.12    80.75
FireplaceQu     50.03    47.26


1.2 Drop variables with almost no data

It’s futile to impute data when more than 80 percent is missing, so we’ll simply delete these variables (PoolQC, MiscFeature, Alley, and Fence).

drops <- names(train) %in% c('PoolQC', 'MiscFeature', 'Alley', 'Fence')
train <- train[!drops]
drops <- names(test) %in% c('PoolQC', 'MiscFeature', 'Alley', 'Fence')
test <- test[!drops]

1.3 Drop zero, uncorrelated data

We’ll drop some additional numeric variables that are mostly zero or only faintly correlated with the target, SalePrice. The two correlograms below each show half of the 34 remaining numeric variables.

# get names of categorical, numeric variables
numerics <- colnames(train[, sapply(train, is.numeric)])
categoricals <- colnames(train[, sapply(train, is.factor)])
categoricals <- categoricals[-1]
corPlot(train[,numerics[1:17]], main='Correlation Plot I')

corPlot(train[,numerics[18:34]], main='Correlation Plot II')

# training variables to drop 
drops <- names(train) %in% c('LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'BedroomAbvGr', 'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold')

train <- train[!drops]

# test variables to drop
drops <- names(test) %in% c('LotFrontage', 'MasVnrArea', 'GarageYrBlt', 'BedroomAbvGr', 'KitchenAbvGr', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YrSold')

test <- test[!drops]


1.4 Drop redundant, near-constant variables

We take a look at barplots of the categorical variables and drop those that take mostly a single value; they won’t contribute much information. For example, almost every house is on a paved street and connected to public utilities. We’ll also remove redundant categoricals, such as “Condition1” and “Condition2”, in favor of the overall quality rating, “OverallQual”. Here are some sample plots:

# plots for categorical variables

par(mfrow=c(6,2))
for(i in 31:42) {
  plot(train[, categoricals[i]], main=categoricals[i], col='lightblue')
}

# training variables to drop 
drops <- names(train) %in% c("Street", "LotShape","LandContour", "Utilities", "LotConfig", "LandSlope", "Condition1", "Condition2", "OverallCond", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "Electrical", "Functional", "FireplaceQu", "GarageFinish",  "GarageQual", "GarageCond",  "PavedDrive", "SaleType", "SaleCondition", "KitchenQual")

train <- train[!drops]

# test variables to drop
drops <- names(test) %in% c("Street", "LotShape","LandContour", "Utilities", "LotConfig", "LandSlope", "Condition1", "Condition2", "OverallCond", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual", "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "Heating", "HeatingQC", "Electrical", "Functional", "FireplaceQu", "GarageFinish",  "GarageQual", "GarageCond",  "PavedDrive", "SaleType", "SaleCondition", "KitchenQual")

test <- test[!drops]

We’re down to 31 predictors, but we can do some further pruning to get rid of redundant measures of living area. We’ll create a variable for total living space, TotSqFt, and drop its components.

# add variable for total square feet of living space
train$TotSqFt <- train$BsmtFinSF1 + train$BsmtFinSF2 + train$`1stFlrSF` + train$`2ndFlrSF`
test$TotSqFt <- test$BsmtFinSF1 + test$BsmtFinSF2 + test$`1stFlrSF` + test$`2ndFlrSF`

# drop the components and others  
drops <- names(train) %in% c("1stFlrSF", "2ndFlrSF","MSSubClass", "RoofStyle","MasVnrArea","BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF","LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "GarageCars", "WoodDeckSF", "MiscVal", "PoolArea")
                             
train <- train[!drops]

# test variables to drop
drops <- names(test) %in% c("1stFlrSF", "2ndFlrSF","MSSubClass", "RoofStyle","MasVnrArea","BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF","LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "GarageCars", "WoodDeckSF", "MiscVal", "PoolArea")

test <- test[!drops]

1.5 Impute missing values

Now that we’ve eliminated redundant, correlated and mostly empty variables, we have 18 predictors (plus the Id column) in our model. Let’s look again at missing values in the training and test data.

# find percent missing
pMiss <- function(x){sum(is.na(x))/length(x)*100}

missing_train <- data.frame(cbind(apply(train,2,pMiss)))
missing_train$Feature <- rownames(missing_train)
colnames(missing_train)[1] <- 'Value'

missing_test <- data.frame(cbind(apply(test,2,pMiss)))
missing_test$Feature <- rownames(missing_test)
colnames(missing_test)[1] <- 'Value'

missing <- cbind(missing_test[,1], missing_train[-20,1])
rownames(missing) <- rownames(missing_test)
colnames(missing) <- c('Test', 'Train')
missing <- data.frame(missing)

pander(missing[with(missing, order(-Test, -Train)), ][1:5,], caption='Percent missing')
Percent missing

               Test       Train
GarageType     5.209      5.548
MSZoning       0.2742     0
GarageArea     0.06854    0
TotSqFt        0.06854    0
Id             0          0


Now, only four variables have missing values, mostly in the test set. How are the missing values distributed?

tempvars <- c('MSZoning', 'GarageType', 'GarageArea', 'TotSqFt')
summary(test[, tempvars])
##     MSZoning      GarageType    GarageArea        TotSqFt    
##  C (all):  15   2Types : 17   Min.   :   0.0   Min.   : 407  
##  FV     :  74   Attchd :853   1st Qu.: 318.0   1st Qu.:1481  
##  RH     :  10   Basment: 17   Median : 480.0   Median :1830  
##  RL     :1114   BuiltIn: 98   Mean   : 472.8   Mean   :1975  
##  RM     : 242   CarPort:  6   3rd Qu.: 576.0   3rd Qu.:2376  
##  NA's   :   4   Detchd :392   Max.   :1488.0   Max.   :9105  
##                 NA's   : 76   NA's   :1        NA's   :1
summary(train$GarageType)
##  2Types  Attchd Basment BuiltIn CarPort  Detchd    NA's 
##       6     870      19      88       9     387      81

We’ll set GarageType aside for the moment and impute the others: numerics get the mean, categoricals get the mode. GarageType will then be filled by randomly assigning levels in proportion to the observed sample.
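
Base R has no statistical-mode function (its mode() reports a vector’s storage type), so the mode fills below were read off summary() by hand. A small helper, sketched here under our own name stat_mode (not a base R function), does the same thing programmatically:

# helper sketch: most frequent non-NA value (stat_mode is our own utility)
stat_mode <- function(x) {
  x <- x[!is.na(x)]
  names(sort(table(x), decreasing=TRUE))[1]
}
# stat_mode(test$MSZoning) returns 'RL', matching the hand-picked fill below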

# impute Garage area
# mean(test$GarageArea, na.rm=T) # 473
# which(is.na(test$GarageArea)) # get NA values
# test$GarageArea[1117]  # check
test$GarageArea[1117] <- 473 # update
# test$GarageArea[1117]  # check

# impute total square feet
# mean(test$TotSqFt, na.rm=T) # 1974.728
# which(is.na(test$TotSqFt)) # get NA values
# test$TotSqFt[661]  # check
test$TotSqFt[661] <- 1975 # update
# test$TotSqFt[661]  # check

# impute Zoning
# mode is 'RL'
# which(is.na(test$MSZoning)) # get NA values
# test$MSZoning[c(456, 757, 791, 1445)]
test$MSZoning[c(456, 757, 791, 1445)] <- 'RL'
# test$MSZoning[c(456, 757, 791, 1445)]

# impute Garage type by randomly assigning proportional values
train_rows_to_impute <- rownames(train[which(is.na(train$GarageType)), ])
test_rows_to_impute <- rownames(test[which(is.na(test$GarageType)), ])

# training data first
# get proportions
# summary(train$GarageType)
# trgarage <- c(6, 870, 19, 88, 9, 387)
# round(prop.table(trgarage)*81,1)
# sum(0, 51, 1, 5, 1, 23)

# randomly select fills
set.seed(100) # make the random fills reproducible
Attchd <- sample(train_rows_to_impute, 51)
temp <- train_rows_to_impute[!train_rows_to_impute %in% Attchd] 
Detchd <- sample(temp, 23)
temp <- temp[!temp %in% Detchd] 
BuiltIn <- sample(temp, 5)
temp <- temp[!temp %in% BuiltIn] 
Basment <- sample(temp, 1)
CarPort <- temp[!temp %in% Basment] 

# make changes
train[Attchd, "GarageType"] <- 'Attchd'
train[Detchd, "GarageType"] <- 'Detchd'
train[BuiltIn, "GarageType"] <- 'BuiltIn'
train[Basment, "GarageType"] <- 'Basment'
train[CarPort, "GarageType"] <- 'CarPort'

# check for NAs
# length(which(is.na(train$GarageType)==TRUE))

# test data, same technique
# tsgarage<- c(17, 853, 17, 98, 6, 392)
# round(prop.table(tsgarage)*76,1)
# sum(1, 47, 1, 5, 0, 22)

# randomly select fills
set.seed(100) # make the random fills reproducible
Attchd <- sample(test_rows_to_impute, 47)
temp <- test_rows_to_impute[!test_rows_to_impute %in% Attchd] 
Detchd <- sample(temp, 22)
temp <- temp[!temp %in% Detchd] 
BuiltIn <- sample(temp, 5)
temp <- temp[!temp %in% BuiltIn] 
Basment <- sample(temp, 1)
twoTypes <- temp[!temp %in% Basment] 

# make changes
test[Attchd, "GarageType"] <- 'Attchd'
test[Detchd, "GarageType"] <- 'Detchd'
test[BuiltIn, "GarageType"] <- 'BuiltIn'
test[Basment, "GarageType"] <- 'Basment'
test[twoTypes, "GarageType"] <- '2Types'
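
The block above reproduces the exact rounded counts from the proportion tables. An equivalent, more compact approach, sketched here under the looser assumption that fills need only be proportional on average rather than matching exact counts, samples the levels directly with probability weights:

# sketch: proportional random fill in one call (probabilistic counts, not exact)
set.seed(100)
obs <- train$GarageType[!is.na(train$GarageType)]
train$GarageType[is.na(train$GarageType)] <- sample(levels(train$GarageType),
                                                    size=sum(is.na(train$GarageType)),
                                                    replace=TRUE,
                                                    prob=table(obs))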

Check for any NAs.

# check for NAs
sum(is.na(train)); sum(is.na(test))
## [1] 0
## [1] 0

Drop the unnecessary Id column and have a look at the imputed data frames.

# drop ID
test <- test[, -1]
train <- train[,-1]

pander(head(train,3))
Table continues below

MSZoning   LotArea   Neighborhood   BldgType   HouseStyle   OverallQual
RL         8450      CollgCr        1Fam       2Story       7
RL         9600      Veenker        1Fam       1Story       6
RL         11250     CollgCr        1Fam       2Story       7

Table continues below

YearBuilt   YearRemodAdd   CentralAir   FullBath   HalfBath   TotRmsAbvGrd
2003        2003           Y            2          1          8
1976        1976           Y            2          0          6
2001        2002           Y            2          1          6

Fireplaces   GarageType   GarageArea   OpenPorchSF   MoSold   SalePrice   TotSqFt
0            Attchd       548          61            2        208500      2416
1            Attchd       460          0             5        181500      2240
1            Attchd       608          42            9        223500      2272
pander(head(test,3))
Table continues below

MSZoning   LotArea   Neighborhood   BldgType   HouseStyle   OverallQual
RH         11622     NAmes          1Fam       1Story       5
RL         14267     NAmes          1Fam       1Story       6
RL         13830     Gilbert        1Fam       2Story       5

Table continues below

YearBuilt   YearRemodAdd   CentralAir   FullBath   HalfBath   TotRmsAbvGrd
1961        1961           Y            1          0          5
1958        1958           Y            1          1          6
1997        1998           Y            2          1          6

Fireplaces   GarageType   GarageArea   OpenPorchSF   MoSold   TotSqFt
0            Attchd       730          0             6        1508
0            Attchd       312          36            6        2252
1            Attchd       482          34            3        2420


1.6 Check for transforms

A last look at our numeric variables shows skew, particularly in LotArea but also in SalePrice and TotSqFt, so we’ll use Box-Cox transforms in our models.

numerics <- colnames(train[, sapply(train, is.numeric)])
describe(train[,numerics])[11] # column 11 of psych::describe is skew
##               skew
## LotArea      12.18
## YearBuilt    -0.61
## YearRemodAdd -0.50
## FullBath      0.04
## HalfBath      0.67
## TotRmsAbvGrd  0.67
## Fireplaces    0.65
## GarageArea    0.18
## OpenPorchSF   2.36
## SalePrice     1.88
## TotSqFt       2.16
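
Before modeling, it’s worth a quick look at what the Box-Cox step will do. caret exposes the transform directly through BoxCoxTrans; a minimal sketch on the most skewed variable, LotArea:

# sketch: estimate and apply a Box-Cox transform with caret
bct <- BoxCoxTrans(train$LotArea)
# bct$lambda holds the estimated power parameter
skew(predict(bct, train$LotArea)) # skew should fall sharply from 12.18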

We are now ready to model the data.



Step 2: Model Tryout

2.1 Model types

We’re going to compare four models: linear regression, random forest, stochastic gradient boosting, and Cubist. Using the caret package, we’ll run 10-fold cross validation repeated 5 times and preprocess the data (centering, scaling and Box-Cox transforms, as noted in each call).

We’ll take the best performer on root mean square error (RMSE) and submit that result to Kaggle.
I’m leaving out a good deal of experimentation in the code below. After trying multiple models, the best performance came from an even smaller set of predictors, 14 in all. It turns out that house style, central air, garage type and month of sale detract from the results.

First, some data backup.

# make copies for the modeling; make a small train and test set based on earlier testing
df_train <- train
df_test <- test
df_small <- df_train[, -c(5,9,14,17)]      # drop HouseStyle, CentralAir, GarageType, MoSold
df_test_small <- df_test[, -c(5,9,14,17)]  # same four columns in the test set

# make sure that OverallQual is a properly ordered factor
df_small$OverallQual <- factor(df_small$OverallQual, ordered=TRUE, levels=c(1:10))
df_test_small$OverallQual <- factor(df_test_small$OverallQual, ordered=TRUE, levels=c(1:10))

# using caret package, set up cross validation and our comparison metric
trainControl <- trainControl(method="repeatedcv", number=10, repeats=5)
metric <- "RMSE"

# build the four test models
# 
# LM
set.seed(100)
fit2.lm <- train(SalePrice~., data=df_small, method="lm", metric=metric, preProc=c("center", "scale", "BoxCox"), trControl=trainControl)

# Random Forest
set.seed(100)
fit2.rf <- train(SalePrice~., data=df_small, method="rf", metric=metric,
                preProc=c("BoxCox"),
                trControl=trainControl)

# Stochastic Gradient Boosting
set.seed(100)
fit2.gbm <- train(SalePrice~., data=df_small, method="gbm", metric=metric,
                 preProc=c("BoxCox"),
                 trControl=trainControl, verbose=FALSE)

# Cubist
set.seed(100)
fit2.cubist <- train(SalePrice~., data=df_small, method="cubist", metric=metric, preProc=c("BoxCox"), trControl=trainControl)

# Compare algorithms
ensembleResults <- resamples(list(LM=fit2.lm, GBM=fit2.gbm, RF=fit2.rf, CUBIST=fit2.cubist))

summary(ensembleResults)
## 
## Call:
## summary.resamples(object = ensembleResults)
## 
## Models: LM, GBM, RF, CUBIST 
## Number of resamples: 50 
## 
## MAE 
##            Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## LM     15248.89 17823.10 18967.50 18997.41 19913.60 24098.10    0
## GBM    14382.95 17754.57 18498.16 18637.92 19385.76 22336.07    0
## RF     15130.21 17290.80 17890.94 18196.91 19152.80 22828.28    0
## CUBIST 14628.41 16047.00 16905.84 17100.60 17892.40 21754.49    0
## 
## RMSE 
##            Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## LM     21567.28 26313.89 30025.19 31244.85 35064.47 50140.74    0
## GBM    19378.40 26298.35 28304.00 29827.10 33662.45 44763.65    0
## RF     21043.58 24851.40 28250.97 28886.39 32035.66 46477.02    0
## CUBIST 20433.42 23431.03 25406.07 27978.20 28625.65 48906.47    0
## 
## Rsquared 
##             Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## LM     0.6517412 0.8405932 0.8585204 0.8467941 0.8853125 0.9296673    0
## GBM    0.7229757 0.8427445 0.8725199 0.8588275 0.8947665 0.9327077    0
## RF     0.7374405 0.8428955 0.8859140 0.8668261 0.9027449 0.9279548    0
## CUBIST 0.6263156 0.8727408 0.8999249 0.8727052 0.9189315 0.9348231    0


2.2 Tune the best model

The best result, both in R2 and RMSE, is the Cubist model. Cubist is an ensemble approach that combines rule-based model trees with a k-nearest-neighbors adjustment. Its tuning parameters are the number of committees (successive rule-based model trees, each built to correct its predecessors) and the number of neighbors used for error adjustment when predicting.

# Tune the Cubist algorithm
trainControl <- trainControl(method="repeatedcv", number=10, repeats=3)
metric <- "RMSE"

set.seed(100)
grid <- expand.grid(.committees=seq(5, 15, by=1), .neighbors=c(3, 5, 7))

tune.cubist <- train(SalePrice~., data=df_small, method="cubist", metric=metric,
                  preProc=c("BoxCox"), tuneGrid=grid, trControl=trainControl)

plot(tune.cubist)

print(tune.cubist)
## Cubist 
## 
## 1460 samples
##   14 predictor
## 
## Pre-processing: Box-Cox transformation (5) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 1312, 1314, 1314, 1313, 1315, 1315, ... 
## Resampling results across tuning parameters:
## 
##   committees  neighbors  RMSE      Rsquared   MAE     
##    5          3          28357.21  0.8693731  17632.42
##    5          5          28569.87  0.8662826  17312.51
##    5          7          28753.74  0.8637860  17145.98
##    6          3          28225.24  0.8713563  17614.68
##    6          5          28454.69  0.8679802  17285.29
##    6          7          28667.53  0.8651867  17132.61
##    7          3          28099.91  0.8722904  17570.45
##    7          5          28319.09  0.8689532  17245.94
##    7          7          28530.39  0.8661625  17092.26
##    8          3          28113.11  0.8722855  17578.98
##    8          5          28330.22  0.8689958  17237.65
##    8          7          28555.22  0.8661383  17068.38
##    9          3          28050.05  0.8725720  17518.97
##    9          5          28254.74  0.8694820  17199.62
##    9          7          28472.51  0.8666915  17042.66
##   10          3          28083.65  0.8719532  17519.91
##   10          5          28324.80  0.8686410  17195.24
##   10          7          28556.72  0.8657709  17034.99
##   11          3          28015.28  0.8728235  17502.79
##   11          5          28255.28  0.8695338  17184.16
##   11          7          28488.01  0.8666170  17032.67
##   12          3          28036.96  0.8725681  17498.98
##   12          5          28281.90  0.8692311  17173.57
##   12          7          28516.69  0.8663347  17010.98
##   13          3          27989.27  0.8728778  17462.52
##   13          5          28267.70  0.8693685  17161.49
##   13          7          28505.68  0.8664372  17006.47
##   14          3          28072.24  0.8720119  17470.30
##   14          5          28334.47  0.8686006  17166.44
##   14          7          28562.41  0.8657857  16997.84
##   15          3          28032.30  0.8724700  17452.97
##   15          5          28309.62  0.8689695  17159.49
##   15          7          28543.27  0.8661202  17001.17
## 
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were committees = 13 and neighbors = 3.


2.3 Make predictions

The plots that follow compare the distributions of our Cubist model predictions, first on the training data and then on the test data, against the actual SalePrice in the training set. The distributions look comparable.

cubist.predictions <- predict(tune.cubist)

plot.new()
par(mfrow=c(1,2))
hist(cubist.predictions, freq=F,
      main='Cubist Sales Price',
      col='lightblue', breaks=30,
      xlim=c(50000, 600000))
lines(density(cubist.predictions), col='red', lwd=3)

hist(df_small$SalePrice, freq=F,
      main='Training Sale Price',
      col='lightblue', breaks=30,
      xlim=c(50000, 600000))
lines(density(df_small$SalePrice), col='red', lwd=3)

cubist_test.predictions <- predict(tune.cubist, df_test_small)

plot.new()
par(mfrow=c(1,2))
hist(cubist_test.predictions, freq=F,
      main='Cubist Test Predictions',
      col='lightblue', breaks=30,
      xlim=c(50000, 600000))
lines(density(cubist_test.predictions), col='red', lwd=3)

hist(df_small$SalePrice, freq=F,
main='Training Sale Price',
      col='lightblue', breaks=30,
      xlim=c(50000, 600000))
lines(density(df_small$SalePrice), col='red', lwd=3)


2.4 Variable importance

The Cubist model allows for a variable importance computation. Our engineered variable, TotSqFt, ranks near the top, just behind the linear term of OverallQual. Lot area, age of the home, remodeling year, fireplaces and garage area are all relatively important.

# root mean squared logarithmic error
# msle(df_small$SalePrice, cubist.predictions)^0.5

# variable importance
varImp(tune.cubist)
## cubist variable importance
## 
##   only 20 most important variables shown (out of 51)
## 
##                     Overall
## OverallQual.L        100.00
## TotSqFt               93.59
## LotArea               60.90
## YearBuilt             60.90
## YearRemodAdd          52.56
## Fireplaces            47.44
## GarageArea            44.87
## TotRmsAbvGrd          39.10
## FullBath              37.82
## OverallQual^7         36.54
## HalfBath              35.26
## OverallQual^4         33.33
## NeighborhoodBrkSide   31.41
## NeighborhoodCrawfor   31.41
## OpenPorchSF           28.21
## BldgTypeDuplex        27.56
## MSZoningRL            23.72
## OverallQual^5         21.79
## OverallQual.Q         20.51
## NeighborhoodEdwards   19.87


2.5 Final Predictions

The code below builds the final model and makes predictions on the transformed test data.

# prepare the data transforms
set.seed(100)
x <- df_small[,c(1:13,15)] # the 14 predictors (everything but column 14)
y <- df_small[,14]         # the target, SalePrice
preprocessParams <- preProcess(x, method=c("BoxCox"))
transX <- predict(preprocessParams, x)
# train the final model
finalModel <- cubist(x=transX, y=y, committees=13)

print(finalModel)
## 
## Call:
## cubist.default(x = transX, y = y, committees = 13)
## 
## Number of samples: 1460 
## Number of predictors: 14 
## 
## Number of committees: 13 
## Number of rules per committee: 11, 10, 8, 8, 10, 10, 10, 7, 9, 10, 6, 8, 7

Note that the neighbors parameter is passed at prediction time rather than when the model is fit.

# transform the test data and predict
set.seed(100)
valX <- df_test_small[, 1:14]
trans_valX <- predict(preprocessParams, valX)

# use final model to make predictions on the validation dataset
cubist_final.predictions <- predict(finalModel, newdata=trans_valX, neighbors=3)

# save the predictions for submission
write.csv(cubist_final.predictions, "kaggle_cubist.predictions.csv")
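
One caveat: write.csv on a bare prediction vector records row numbers rather than house Ids, while Kaggle expects Id and SalePrice columns. A sketch of the fix, assuming the test Id column had been stashed as test_ids (our own name) before it was dropped in step 1.5:

# sketch: properly shaped submission file (test_ids is hypothetical; the Id
# column was dropped earlier and would need to be saved beforehand)
# submission <- data.frame(Id=test_ids, SalePrice=cubist_final.predictions)
# write.csv(submission, 'kaggle_cubist.predictions.csv', row.names=FALSE)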