Exploratory Analysis

raw.data <- read.csv('C:\\Users\\Brian\\Desktop\\GradClasses\\Fall18\\605\\final\\all\\train.csv') %>%
  select(-c(Id))

Check for missing elements:

raw.data %>%
  map_dbl(~sum(is.na(.))/nrow(raw.data))
##    MSSubClass      MSZoning   LotFrontage       LotArea        Street 
##  0.0000000000  0.0000000000  0.1773972603  0.0000000000  0.0000000000 
##         Alley      LotShape   LandContour     Utilities     LotConfig 
##  0.9376712329  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0054794521 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##  0.0054794521  0.0000000000  0.0000000000  0.0000000000  0.0253424658 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##  0.0253424658  0.0260273973  0.0253424658  0.0000000000  0.0260273973 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##  0.0000000000  0.0006849315  0.0000000000  0.0000000000  0.0000000000 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##    Fireplaces   FireplaceQu    GarageType   GarageYrBlt  GarageFinish 
##  0.0000000000  0.4726027397  0.0554794521  0.0554794521  0.0554794521 
##    GarageCars    GarageArea    GarageQual    GarageCond    PavedDrive 
##  0.0000000000  0.0000000000  0.0554794521  0.0554794521  0.0000000000 
##    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000 
##      PoolArea        PoolQC         Fence   MiscFeature       MiscVal 
##  0.0000000000  0.9952054795  0.8075342466  0.9630136986  0.0000000000 
##        MoSold        YrSold      SaleType SaleCondition     SalePrice 
##  0.0000000000  0.0000000000  0.0000000000  0.0000000000  0.0000000000

Alley, PoolQC, Fence and MiscFeature are all missing so many elements that they are useless. They are removed.

raw.data %<>%
  select(-c(Alley, PoolQC, Fence, MiscFeature))

After exploratory analysis I made some modifications to the dataset. The SalePrice and LotArea are heavily skewed so I log transformed them. Several numerical columns are better suited as factors. For example OveralCond is a number but is better treated as a factor with 10 levels. Finally, a few predictors had so many 0 values (> 90%) that I decieded to turn them into a binary factor. This loses some of the details but it should make capturing differences easier.

raw.data %<>% 
  mutate(SalePrice = log(SalePrice),
         LotArea = log(LotArea),
         OverallQual = factor(OverallQual),
         OverallCond = factor(OverallCond),
         YearBuilt = factor(YearBuilt),
         YearRemodAdd = factor(YearRemodAdd),
         MoSold = factor(MoSold),
         YrSold = factor(YrSold),
         MasVnrArea = factor(ifelse(MasVnrArea > 0, 1, 0)),
         BsmtFinSF2 = factor(ifelse(BsmtFinSF2 > 0, 1, 0)),
         X2ndFlrSF = factor(ifelse(X2ndFlrSF > 0, 1, 0)),
         LowQualFinSF = factor(ifelse(LowQualFinSF > 0, 1, 0)),
         WoodDeckSF = factor(ifelse(WoodDeckSF > 0, 1, 0)),
         OpenPorchSF = factor(ifelse(OpenPorchSF > 0, 1, 0)),
         EnclosedPorch = factor(ifelse(EnclosedPorch > 0, 1, 0)),
         X3SsnPorch = factor(ifelse(X3SsnPorch > 0, 1, 0)),
         ScreenPorch = factor(ifelse(ScreenPorch > 0, 1, 0)),
         PoolArea = factor(ifelse(PoolArea > 0, 1, 0)),
         MiscVal = factor(ifelse(MiscVal > 0, 1, 0)),
         Condition1 = factor(ifelse(Condition1 == 'Norm', 1, 0)),
         Condition2 = factor(ifelse(Condition2 == 'Norm', 1, 0))
         )

Impute missing data with the mice library and confirm all missing values have been filled.

library(mice)
set.seed(123)
imputed.data <- mice::mice(raw.data[, -76], m=5, maxit=5, method='pmm', seed=500, printFlag=FALSE)
## Warning: Number of logged events: 742
data.complete <- cbind(raw.data[, 76], complete(imputed.data, 1))
data.complete %<>%
  rename(SalePrice = `raw.data[, 76]`)
data.complete %>%
  map_dbl(~sum(is.na(.))/nrow(raw.data))
##     SalePrice    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0             0             0 
##        Street      LotShape   LandContour     Utilities     LotConfig 
##             0             0             0             0             0 
##     LandSlope  Neighborhood    Condition1    Condition2      BldgType 
##             0             0             0             0             0 
##    HouseStyle   OverallQual   OverallCond     YearBuilt  YearRemodAdd 
##             0             0             0             0             0 
##     RoofStyle      RoofMatl   Exterior1st   Exterior2nd    MasVnrType 
##             0             0             0             0             0 
##    MasVnrArea     ExterQual     ExterCond    Foundation      BsmtQual 
##             0             0             0             0             0 
##      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1  BsmtFinType2 
##             0             0             0             0             0 
##    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating     HeatingQC 
##             0             0             0             0             0 
##    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF  LowQualFinSF 
##             0             0             0             0             0 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##             0             0             0             0             0 
##  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd    Functional 
##             0             0             0             0             0 
##    Fireplaces   FireplaceQu    GarageType   GarageYrBlt  GarageFinish 
##             0             0             0             0             0 
##    GarageCars    GarageArea    GarageQual    GarageCond    PavedDrive 
##             0             0             0             0             0 
##    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch   ScreenPorch 
##             0             0             0             0             0 
##      PoolArea       MiscVal        MoSold        YrSold      SaleType 
##             0             0             0             0             0 
## SaleCondition 
##             0

Model Building

I created a training and testing partitions:

set.seed(1)
part <- caret::createDataPartition(data.complete$SalePrice, p=0.8, list=FALSE)
data.training <- data.complete %>%
  filter(row_number() %in% part)
data.testing <- data.complete %>%
  filter(!row_number() %in% part)

It appears that correlation may be an issue. I will need to be careful about this.

to.plot <- data.training[, sapply(data.training, is.numeric)]
to.plot %<>% 
  select(-1)
corrplot::corrplot(cor(to.plot), method='circle')

With so many predictors and so few observations, I needed to be careful of overfitting. I tried a multiple regression techinque stepAIC that uses l1 regularization but it left me with too many observations. So I am using stepBIC that uses l2 regularization which has a bigger penalty and thus should give me fewer predictors.

lm.2 <- lm(SalePrice ~ ., data.training)
MASS::stepAIC(lm.2, k=log(nrow(data.training)), trace=0)
## 
## Call:
## lm(formula = SalePrice ~ MSZoning + LotArea + Condition1 + Condition2 + 
##     OverallQual + OverallCond + RoofMatl + BsmtQual + BsmtUnfSF + 
##     TotalBsmtSF + HeatingQC + CentralAir + GrLivArea + BsmtFullBath + 
##     FullBath + HalfBath + KitchenAbvGr + Functional + Fireplaces + 
##     GarageFinish + GarageCars + SaleCondition, data = data.training)
## 
## Coefficients:
##          (Intercept)            MSZoningFV            MSZoningRH  
##            7.134e+00             4.375e-01             4.211e-01  
##           MSZoningRL            MSZoningRM               LotArea  
##            3.836e-01             3.120e-01             6.622e-02  
##          Condition11           Condition21          OverallQual2  
##            6.129e-02             1.088e-01             2.151e-01  
##         OverallQual3          OverallQual4          OverallQual5  
##            2.999e-01             3.695e-01             4.264e-01  
##         OverallQual6          OverallQual7          OverallQual8  
##            4.812e-01             5.639e-01             6.547e-01  
##         OverallQual9         OverallQual10          OverallCond2  
##            7.598e-01             6.941e-01             3.005e-02  
##         OverallCond3          OverallCond4          OverallCond5  
##           -1.703e-01            -4.034e-02             7.069e-03  
##         OverallCond6          OverallCond7          OverallCond8  
##            3.663e-02             9.122e-02             9.901e-02  
##         OverallCond9       RoofMatlCompShg       RoofMatlMembran  
##            1.704e-01             2.503e+00             2.623e+00  
##        RoofMatlMetal          RoofMatlRoll       RoofMatlTar&Grv  
##            2.612e+00             2.506e+00             2.463e+00  
##      RoofMatlWdShake       RoofMatlWdShngl            BsmtQualFa  
##            2.468e+00             2.570e+00            -8.247e-02  
##           BsmtQualGd            BsmtQualTA             BsmtUnfSF  
##           -4.217e-02            -8.711e-02            -5.923e-05  
##          TotalBsmtSF           HeatingQCFa           HeatingQCGd  
##            1.928e-04            -7.621e-02            -2.499e-02  
##          HeatingQCTA           CentralAirY             GrLivArea  
##           -4.234e-02             7.843e-02             2.110e-04  
##         BsmtFullBath              FullBath              HalfBath  
##            3.432e-02             3.315e-02             3.250e-02  
##         KitchenAbvGr        FunctionalMaj2        FunctionalMin1  
##           -7.434e-02            -2.659e-01             2.285e-02  
##       FunctionalMin2         FunctionalMod         FunctionalSev  
##            5.258e-03            -7.174e-02            -3.515e-01  
##        FunctionalTyp            Fireplaces       GarageFinishRFn  
##            5.661e-02             3.075e-02            -2.909e-03  
##      GarageFinishUnf            GarageCars  SaleConditionAdjLand  
##           -4.189e-02             6.525e-02             9.550e-02  
##  SaleConditionAlloca   SaleConditionFamily   SaleConditionNormal  
##            4.970e-02            -9.613e-03             7.220e-02  
## SaleConditionPartial  
##            1.309e-01
lm.3 <- lm(SalePrice ~ MSZoning + LotArea + Condition1 + Condition2 + OverallQual + RoofMatl + BsmtQual + BsmtUnfSF + TotalBsmtSF + CentralAir + GrLivArea + BsmtFullBath + FullBath + HalfBath + KitchenAbvGr + Functional + Fireplaces + GarageFinish + GarageCars + SaleCondition, data=data.training)
summary(lm.3)
## 
## Call:
## lm(formula = SalePrice ~ MSZoning + LotArea + Condition1 + Condition2 + 
##     OverallQual + RoofMatl + BsmtQual + BsmtUnfSF + TotalBsmtSF + 
##     CentralAir + GrLivArea + BsmtFullBath + FullBath + HalfBath + 
##     KitchenAbvGr + Functional + Fireplaces + GarageFinish + GarageCars + 
##     SaleCondition, data = data.training)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.45806 -0.05944  0.00577  0.07202  0.51145 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.977e+00  2.128e-01  32.781  < 2e-16 ***
## MSZoningFV            4.166e-01  5.193e-02   8.023 2.58e-15 ***
## MSZoningRH            4.112e-01  5.875e-02   6.999 4.42e-12 ***
## MSZoningRL            3.668e-01  4.762e-02   7.702 2.93e-14 ***
## MSZoningRM            3.088e-01  4.822e-02   6.405 2.22e-10 ***
## LotArea               7.334e-02  9.769e-03   7.507 1.23e-13 ***
## Condition11           5.892e-02  1.230e-02   4.790 1.89e-06 ***
## Condition21           8.917e-02  4.317e-02   2.065 0.039133 *  
## OverallQual2          1.845e-01  1.224e-01   1.507 0.132051    
## OverallQual3          3.613e-01  1.031e-01   3.505 0.000475 ***
## OverallQual4          4.547e-01  9.926e-02   4.581 5.16e-06 ***
## OverallQual5          5.363e-01  9.895e-02   5.420 7.30e-08 ***
## OverallQual6          5.980e-01  9.941e-02   6.016 2.42e-09 ***
## OverallQual7          6.935e-01  1.001e-01   6.926 7.29e-12 ***
## OverallQual8          7.924e-01  1.013e-01   7.823 1.18e-14 ***
## OverallQual9          9.030e-01  1.046e-01   8.637  < 2e-16 ***
## OverallQual10         8.519e-01  1.107e-01   7.698 3.03e-14 ***
## RoofMatlCompShg       2.470e+00  1.535e-01  16.098  < 2e-16 ***
## RoofMatlMembran       2.661e+00  2.071e-01  12.847  < 2e-16 ***
## RoofMatlMetal         2.569e+00  2.043e-01  12.577  < 2e-16 ***
## RoofMatlRoll          2.443e+00  2.034e-01  12.014  < 2e-16 ***
## RoofMatlTar&Grv       2.416e+00  1.598e-01  15.119  < 2e-16 ***
## RoofMatlWdShake       2.433e+00  1.657e-01  14.683  < 2e-16 ***
## RoofMatlWdShngl       2.522e+00  1.620e-01  15.564  < 2e-16 ***
## BsmtQualFa           -5.190e-02  3.162e-02  -1.641 0.101010    
## BsmtQualGd           -3.504e-02  1.669e-02  -2.100 0.035978 *  
## BsmtQualTA           -7.047e-02  1.831e-02  -3.849 0.000125 ***
## BsmtUnfSF            -6.098e-05  1.341e-05  -4.548 6.01e-06 ***
## TotalBsmtSF           1.705e-04  1.598e-05  10.666  < 2e-16 ***
## CentralAirY           1.316e-01  1.789e-02   7.353 3.73e-13 ***
## GrLivArea             2.246e-04  1.479e-05  15.193  < 2e-16 ***
## BsmtFullBath          3.264e-02  1.062e-02   3.074 0.002165 ** 
## FullBath              2.623e-02  1.170e-02   2.243 0.025114 *  
## HalfBath              2.362e-02  1.003e-02   2.354 0.018736 *  
## KitchenAbvGr         -8.864e-02  2.133e-02  -4.155 3.50e-05 ***
## FunctionalMaj2       -3.072e-01  7.884e-02  -3.897 0.000103 ***
## FunctionalMin1        2.227e-02  5.054e-02   0.441 0.659501    
## FunctionalMin2        2.846e-02  4.793e-02   0.594 0.552734    
## FunctionalMod        -4.078e-02  5.659e-02  -0.721 0.471296    
## FunctionalSev        -3.462e-01  1.468e-01  -2.359 0.018500 *  
## FunctionalTyp         8.625e-02  4.072e-02   2.118 0.034367 *  
## Fireplaces            2.438e-02  7.377e-03   3.305 0.000981 ***
## GarageFinishRFn      -6.744e-03  1.112e-02  -0.606 0.544433    
## GarageFinishUnf      -4.056e-02  1.224e-02  -3.314 0.000949 ***
## GarageCars            5.870e-02  7.276e-03   8.068 1.82e-15 ***
## SaleConditionAdjLand  3.220e-02  6.944e-02   0.464 0.643007    
## SaleConditionAlloca   1.767e-02  5.015e-02   0.352 0.724668    
## SaleConditionFamily  -9.440e-03  3.660e-02  -0.258 0.796495    
## SaleConditionNormal   8.460e-02  1.557e-02   5.433 6.81e-08 ***
## SaleConditionPartial  1.445e-01  2.165e-02   6.677 3.84e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1313 on 1119 degrees of freedom
## Multiple R-squared:  0.8974, Adjusted R-squared:  0.8929 
## F-statistic: 199.7 on 49 and 1119 DF,  p-value: < 2.2e-16

High Colinearity on OverallQual perhaps as a result of OveralQual capturing a lot of the information already present in other variables.

car::vif(lm.3)
##                   GVIF Df GVIF^(1/(2*Df))
## MSZoning      2.248184  4        1.106570
## LotArea       1.775650  1        1.332535
## Condition1    1.173874  1        1.083455
## Condition2    1.178420  1        1.085551
## OverallQual   9.735442  9        1.134772
## RoofMatl      2.142404  7        1.055932
## BsmtQual      3.499169  3        1.232142
## BsmtUnfSF     2.344037  1        1.531025
## TotalBsmtSF   3.345304  1        1.829017
## CentralAir    1.415931  1        1.189929
## GrLivArea     4.096398  1        2.023956
## BsmtFullBath  2.074838  1        1.440430
## FullBath      2.832563  1        1.683022
## HalfBath      1.723401  1        1.312784
## KitchenAbvGr  1.470211  1        1.212523
## Functional    1.816271  6        1.050990
## Fireplaces    1.568573  1        1.252427
## GarageFinish  2.008941  2        1.190534
## GarageCars    2.082502  1        1.443088
## SaleCondition 1.908324  5        1.066756

Regression Verification

I want to make sure this is a valid regression and the results can be trusted. None of the summary plots show anything to be too concerned about. The regression appears to be struggling with very high or very low priced houses. Otherwise everything appears valid.

plot(lm.3)
## Warning: not plotting observations with leverage one:
##   221, 531, 1020, 1038

## Warning: not plotting observations with leverage one:
##   221, 531, 1020, 1038

## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced
## Warning in sqrt(crit * p * (1 - hh)/hh): NaNs produced

Make predictions with on the validation set and produce a RMSE score of \(0.08488451\)

library(Metrics)
results <- data_frame(predict=exp(predict(lm.3, newdata=data.testing, type='response')), actual=exp(data.testing$SalePrice))
rmse(data.testing$SalePrice, log(results))
## [1] 0.08488451

Plot actuals vs. predictions shows a clean, valid regression.

ggplot(results, aes(predict, actual)) +
  geom_point() +
  geom_smooth(method='lm', se=FALSE)

Predictions for Kaggle

With the model selected I will rerun the regression with all the data.

lm.final <- lm(SalePrice ~ MSZoning + LotArea + Condition1 + Condition2 + OverallQual + RoofMatl + BsmtQual + BsmtUnfSF + TotalBsmtSF + CentralAir + GrLivArea + BsmtFullBath + FullBath + HalfBath + KitchenAbvGr + Functional + Fireplaces + GarageFinish + GarageCars + SaleCondition, data=data.complete)

I will read in the Kaggle data to predict, clean it identically to the training data, and make predictions.

upload.data <- read.csv('C:\\Users\\Brian\\Desktop\\GradClasses\\Fall18\\605\\final\\all\\test.csv') %>%
  select(-c(Id, Alley, PoolQC, Fence, MiscFeature))
upload.data %<>% 
  mutate(LotArea = log(LotArea),
         OverallQual = factor(OverallQual),
         OverallCond = factor(OverallCond),
         YearBuilt = factor(YearBuilt),
         YearRemodAdd = factor(YearRemodAdd),
         MoSold = factor(MoSold),
         YrSold = factor(YrSold),
         MasVnrArea = factor(ifelse(MasVnrArea > 0, 1, 0)),
         BsmtFinSF2 = factor(ifelse(BsmtFinSF2 > 0, 1, 0)),
         X2ndFlrSF = factor(ifelse(X2ndFlrSF > 0, 1, 0)),
         LowQualFinSF = factor(ifelse(LowQualFinSF > 0, 1, 0)),
         WoodDeckSF = factor(ifelse(WoodDeckSF > 0, 1, 0)),
         OpenPorchSF = factor(ifelse(OpenPorchSF > 0, 1, 0)),
         EnclosedPorch = factor(ifelse(EnclosedPorch > 0, 1, 0)),
         X3SsnPorch = factor(ifelse(X3SsnPorch > 0, 1, 0)),
         ScreenPorch = factor(ifelse(ScreenPorch > 0, 1, 0)),
         PoolArea = factor(ifelse(PoolArea > 0, 1, 0)),
         MiscVal = factor(ifelse(MiscVal > 0, 1, 0)),
         Condition1 = factor(ifelse(Condition1 == 'Norm', 1, 0)),
         Condition2 = factor(ifelse(Condition2 == 'Norm', 1, 0))
         )
set.seed(123)
upload.data <- mice::mice(upload.data, m=5, maxit=5, method='pmm', seed=500, printFlag=FALSE)
## Warning: Number of logged events: 1011
upload.data.complete <- as_tibble(complete(upload.data, 1))
ids <- read.csv('C:\\Users\\Brian\\Desktop\\GradClasses\\Fall18\\605\\final\\all\\test.csv') %>%
  select('Id')
predictions <- exp(predict(lm.final, newdata=upload.data.complete))
write_csv(data_frame(Id=ids$Id, SalePrice=predictions), "C:\\Users\\Brian\\Desktop\\GradClasses\\Fall18\\605\\final\\all\\predictions.csv")

Results

I placed 2426 at the time of writing with a RMSE of 0.14061. The significantly higher RMSE than my training data indicates that I am still overfitting.

USERNAME: briancuny SCORE: 0.14061

https://www.youtube.com/watch?v=njCM-Iaeq4A