1 Introduction

This project is an entry in a longstanding competition hosted by Kaggle.com, in which the goal is to build a model to predict sale prices of houses sold in Ames, Iowa between 2006 and 2010. Entries are submitted to Kaggle and scored based on the final model’s Root Mean Square Error (RMSE). After exploratory analysis, cleaning the data, and feature engineering, I produced a model with an RMSE of .12691, within the top 22% of entries; with further adjustments, I hope to break the top 20%.

The data contains a response variable (SalePrice); an ID number representing the property’s parcel number within the city of Ames; and 79 predictors. Many of the predictors are related–for example, there are

  • 11 basement-related variables (BsmtQual, BsmtCond, etc)

  • 7 garage-related variables (GarageArea, GarageCars, GarageCond, etc)

  • Multiple quality-related variables (OverallQual, ExterQual, KitchenQual, etc)

  • Count variables for features such as bedrooms, bathrooms, and fireplaces

Kaggle provides 2 datafiles: one is a training set containing all 81 variables, and the other is a test set, which is missing the SalePrice column. After building my model on the training data, I’ll use it to predict SalePrice values for the test data.

2 Exploratory Analysis 1

2.1 Data

I’ve combined the test and training sets for purposes of cleaning and analysis, but will separate them later to prevent data leakage during modeling. The combined data contains 2919 observations of the 81 variables described above, with 1460 observations in the training data and 1459 in the test set.

#DATA
training <- read.csv("/Users/arcadia/Desktop/Kaggle House Prices/Data & Documentation/train.csv")
testing <- read.csv("/Users/arcadia/Desktop/Kaggle House Prices/Data & Documentation/test.csv")

testing$SalePrice <-NA 

#Bind the training and test datasets for EDA/cleaning
all <-rbind(training,testing)

str(all)
## 'data.frame':    2919 obs. of  81 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSSubClass   : int  60 20 60 70 60 50 20 60 50 190 ...
##  $ MSZoning     : chr  "RL" "RL" "RL" "RL" ...
##  $ LotFrontage  : int  65 80 68 60 84 85 75 NA 51 50 ...
##  $ LotArea      : int  8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
##  $ Street       : chr  "Pave" "Pave" "Pave" "Pave" ...
##  $ Alley        : chr  NA NA NA NA ...
##  $ LotShape     : chr  "Reg" "Reg" "IR1" "IR1" ...
##  $ LandContour  : chr  "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ Utilities    : chr  "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ LotConfig    : chr  "Inside" "FR2" "Inside" "Corner" ...
##  $ LandSlope    : chr  "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ Neighborhood : chr  "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ Condition1   : chr  "Norm" "Feedr" "Norm" "Norm" ...
##  $ Condition2   : chr  "Norm" "Norm" "Norm" "Norm" ...
##  $ BldgType     : chr  "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ HouseStyle   : chr  "2Story" "1Story" "2Story" "2Story" ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ YearRemodAdd : int  2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
##  $ RoofStyle    : chr  "Gable" "Gable" "Gable" "Gable" ...
##  $ RoofMatl     : chr  "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ Exterior1st  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ Exterior2nd  : chr  "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ MasVnrType   : chr  "BrkFace" "None" "BrkFace" "None" ...
##  $ MasVnrArea   : int  196 0 162 0 350 0 186 240 0 0 ...
##  $ ExterQual    : chr  "Gd" "TA" "Gd" "TA" ...
##  $ ExterCond    : chr  "TA" "TA" "TA" "TA" ...
##  $ Foundation   : chr  "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ BsmtQual     : chr  "Gd" "Gd" "Gd" "TA" ...
##  $ BsmtCond     : chr  "TA" "TA" "TA" "Gd" ...
##  $ BsmtExposure : chr  "No" "Gd" "Mn" "No" ...
##  $ BsmtFinType1 : chr  "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ BsmtFinSF1   : int  706 978 486 216 655 732 1369 859 0 851 ...
##  $ BsmtFinType2 : chr  "Unf" "Unf" "Unf" "Unf" ...
##  $ BsmtFinSF2   : int  0 0 0 0 0 0 0 32 0 0 ...
##  $ BsmtUnfSF    : int  150 284 434 540 490 64 317 216 952 140 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : chr  "GasA" "GasA" "GasA" "GasA" ...
##  $ HeatingQC    : chr  "Ex" "Ex" "Ex" "Gd" ...
##  $ CentralAir   : chr  "Y" "Y" "Y" "Y" ...
##  $ Electrical   : chr  "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ X1stFlrSF    : int  856 1262 920 961 1145 796 1694 1107 1022 1077 ...
##  $ X2ndFlrSF    : int  854 0 866 756 1053 566 0 983 752 0 ...
##  $ LowQualFinSF : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BsmtFullBath : int  1 0 1 1 1 1 1 1 0 1 ...
##  $ BsmtHalfBath : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ FullBath     : int  2 2 2 1 2 1 2 2 2 1 ...
##  $ HalfBath     : int  1 0 1 0 1 1 0 1 0 0 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : chr  "Gd" "TA" "Gd" "Gd" ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : chr  "Typ" "Typ" "Typ" "Typ" ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : chr  NA "TA" "TA" "Gd" ...
##  $ GarageType   : chr  "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ GarageYrBlt  : int  2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 ...
##  $ GarageFinish : chr  "RFn" "RFn" "RFn" "Unf" ...
##  $ GarageCars   : int  2 2 2 3 3 2 2 2 2 1 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : chr  "TA" "TA" "TA" "TA" ...
##  $ GarageCond   : chr  "TA" "TA" "TA" "TA" ...
##  $ PavedDrive   : chr  "Y" "Y" "Y" "Y" ...
##  $ WoodDeckSF   : int  0 298 0 0 192 40 255 235 90 0 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ EnclosedPorch: int  0 0 0 272 0 0 0 228 205 0 ...
##  $ X3SsnPorch   : int  0 0 0 0 0 320 0 0 0 0 ...
##  $ ScreenPorch  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ PoolQC       : chr  NA NA NA NA ...
##  $ Fence        : chr  NA NA NA NA ...
##  $ MiscFeature  : chr  NA NA NA NA ...
##  $ MiscVal      : int  0 0 0 0 0 700 0 350 0 0 ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : chr  "WD" "WD" "WD" "WD" ...
##  $ SaleCondition: chr  "Normal" "Normal" "Normal" "Abnorml" ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...

Initially, all variables have class character or integer: most will need to be recoded as numeric or factor.

table(sapply(all, class))
## 
## character   integer 
##        43        38

The documentation for this project indicates there are “23 nominal, 23 ordinal, 14 discrete, and 20 continuous” variables. However, variable classifications have statistical implications for modeling, so I’ll need to carefully consider the best way to classify each variable.

2.1.1 Outliers

The data contains numerous outliers and potential influential points, but for now, I will simply look at outliers in the GrLivArea variable. This is equivalent to each house’s square footage, not including the basement. The project documentation advises removing houses with GrLivArea \(\geq\) 4000 ft\(^{2}\)— however, for this contest, observations can’t be removed from the test set because Kaggle’s scoring mechanism requires predictions for all 1459 test observations. I’ve highlighted the 4 outlying points in the training set on the graph below.

Two of these observations are simply unusually large and expensive houses which are likely to increase positive skew, but not to disrupt the overall linear relationship. However, the two largest houses are genuine outliers in that they are both considerably larger than the rest of the data, but also less expensive. These points are likely to be unduly influential, so I’ll remove them.

highlight <-all %>%
  filter(GrLivArea >= 4000)

g1 <-ggplot(data = all, aes(x = GrLivArea, y = SalePrice)) + geom_point(alpha = 0.5) + geom_point(data = highlight, aes(x = GrLivArea, y = SalePrice), color = "#F21A00") + geom_smooth(method = "loess", colour = "#3B9AB2") + ggtitle("Above-Grade Living Area vs Sale Price")

no_outs <- all%>%
  filter(!is.na(SalePrice)) %>%
  filter(GrLivArea <= 4500)

g2 <-ggplot(data = no_outs, aes(x = GrLivArea, y = SalePrice)) +geom_point(alpha = 0.5) + geom_smooth(method = "loess", colour = "#3B9AB2") + ggtitle("Above-Grade Living Area vs Sale Price")

grid.arrange(g1, g2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

The test set is left intact while the training set now has 1458 observations, for a total of 2917 in the combined data.

#Update training
training <- training[! training$GrLivArea >= 4500,]

all <-rbind(training, testing)
dim(all)
## [1] 2917   81

2.1.2 Distribution of Y

The response, SalePrice, is significantly positively skewed; the bottom 75% of homes sold for \(\leq\) 214,000, while the upper quartile is much more spread out.

summary(training$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129925  163000  180933  214000  755000
y <-ggplot(data = training, aes(x = SalePrice)) +
  geom_histogram(col = "#E1AF00", fill = "#3B9AB2", bins = 50)  + 
  scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +labs(title = "Housing Data: Sale Price (Y)")

q1 <-ggplot(data = training, aes(sample = SalePrice)) + stat_qq(col = "#3B9AB2") + 
  stat_qq_line(col = "red") + labs(y = "Sale Price")

After performing a log transformation, the distribution looks much more normal.

training$log_y <-log(training$SalePrice)

logy <-ggplot(data = training, aes(x = log_y)) +
  geom_histogram(col = "#E1AF00", fill = "#3B9AB2", bins = 50)  + 
  scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  labs(title = "Housing Data: Log(Sale Price)", x = "Log(Sale Price)")

q2 <-ggplot(data = training, aes(sample = log_y)) + stat_qq(col = "#3B9AB2") + 
  stat_qq_line(col = "red") + labs(y = "Log(Sale Price)")

grid.arrange(y, logy)

The qqplot for SalePrice looks better, too; the tails are still longer than a strictly normal distribution, but the data is much less skewed.

grid.arrange(q1, q2, nrow = 1, top = "QQPlots Before and After Log Transformation")

2.2 Univariate Analysis

With so many predictors to examine, I began by creating univariate (and later, bivariate) plots for each one.

2.2.1 Numeric Variables

quant <-subset(all, select = -c(Id, SalePrice)) %>%
  select(where(is.numeric))
quant_names <-names(quant); quant_names
##  [1] "MSSubClass"    "LotFrontage"   "LotArea"       "OverallQual"  
##  [5] "OverallCond"   "YearBuilt"     "YearRemodAdd"  "MasVnrArea"   
##  [9] "BsmtFinSF1"    "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"  
## [13] "X1stFlrSF"     "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"    
## [17] "BsmtFullBath"  "BsmtHalfBath"  "FullBath"      "HalfBath"     
## [21] "BedroomAbvGr"  "KitchenAbvGr"  "TotRmsAbvGrd"  "Fireplaces"   
## [25] "GarageYrBlt"   "GarageCars"    "GarageArea"    "WoodDeckSF"   
## [29] "OpenPorchSF"   "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"  
## [33] "PoolArea"      "MiscVal"       "MoSold"        "YrSold"
#HISTOGRAMS OF NUMERICAL VARIABLES

#Skewed
#continuous
hist_lotfrontage <-ggplot(data = all, aes(x = LotFrontage)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)  
  #scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  #labs(title = "Lot Frontage"); hist_lotfrontage

#Definitely needs to be transformed (prob on a log scale)
#continuous
hist_lotarea <-ggplot(data = all, aes(x = LotArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)  
  #scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  #labs(title = "Lot Area"); hist_lotarea

#ordinal
bar_overallqual<-ggplot(data = all, aes(x = OverallQual)) +
  geom_bar(fill = "#3B9AB2")  
  #scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
 # labs(title = "Overall Quality"); bar_overallqual

#ordinal
bar_overallcond<-ggplot(data = all, aes(x = OverallCond)) +
  geom_bar(fill = "#3B9AB2")  
  #scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  #labs(title = "Overall Condition"); bar_overallcond

#YearBuilt- negatively skewed, more newer houses (DISCRETE BUT STILL...)
hist_yearbuilt<-ggplot(data = all, aes(x = YearBuilt)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Year Built"); hist_yearbuilt

#interesting distribution- something weird going on with the 0s here (meaning not remodeled)
#DISCRETE
hist_yearremod<-ggplot(data = all, aes(x = YearRemodAdd)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Year Remodeled"); hist_yearremod

#Obviously missing data issue
#continuous
hist_masvnrarea<-ggplot(data = all, aes(x = MasVnrArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Masonry Veneer Area"); hist_masvnrarea

#missing data (0 = no finished basement)
#continuous
hist_bsmtfinsf1<-ggplot(data = all, aes(x = BsmtFinSF1)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Type 1 Finished Basement SF"); hist_bsmtfinsf1

#same deal w/missing, would need to leave out 0s to see the real dist. here. 
#or do a log transform or whatever. 
#continuous
hist_bsmtfinsf2<-ggplot(data = all, aes(x = BsmtFinSF2)) +
  geom_histogram(fill = "#3B9AB2", bins = 30) 
  #labs(title = "Type 2 Finished Basement SF"); hist_bsmtfinsf2

#continuous
hist_bsmtunfsf<-ggplot(data = all, aes(x = BsmtUnfSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Unfinished Basement SF"); hist_bsmtunfsf

#continuous
hist_totalbsmtsf<-ggplot(data = all, aes(x = TotalBsmtSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Total Basement SF"); hist_totalbsmtsf

#continuous
hist_X1stFlrSF<-ggplot(data = all, aes(x = X1stFlrSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
 # labs(title = "First Floor SF"); hist_X1stFlrSF

#LOTS of missing values here (aka lots of single-story homes)
#continuous
hist_X2ndFlrSF<-ggplot(data = all, aes(x = X2ndFlrSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Second Floor SF"); hist_X2ndFlrSF

#pointless- on a log scale, or too much missing data
#maybe could set these to not = 0?
#continuous
hist_LowQualFinSF <-ggplot(data = all, aes(x = LowQualFinSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 10) 
  #labs(title = "Low Quality Finished SF"); hist_LowQualFinSF

#skewed
#continuous
hist_GrLivArea <-ggplot(data = all, aes(x = GrLivArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
 # labs(title = "Above-Ground Living Area"); hist_GrLivArea

#discrete
bar_BsmtFullBath<-ggplot(data = all, aes(x = BsmtFullBath)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "No. Basement Full Baths"); bar_BsmtFullBath

#discrete
bar_BsmtHalfBath<-ggplot(data = all, aes(x = BsmtHalfBath)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "No. Basement Half Baths"); bar_BsmtHalfBath

#discrete
bar_FullBath <-ggplot(data = all, aes(x = FullBath )) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "No. Full Baths"); bar_FullBath 

#discrete
bar_HalfBath <-ggplot(data = all, aes(x =HalfBath )) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "No. Half Baths"); bar_HalfBath 

#discrete
bar_BedroomAbvGr <-ggplot(data = all, aes(x =BedroomAbvGr )) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "No. Bedrooms Above Grade", subtitle = "Basement bedrooms not included"); bar_BedroomAbvGr

#discrete
bar_KitchenAbvGr <-ggplot(data = all, aes(x =KitchenAbvGr )) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "No. Kitchens Above Grade"); bar_KitchenAbvGr

#TOTRMSABVGRD
#(discrete)
bar_TotRmsAbvGrd<-ggplot(data = all, aes(x = TotRmsAbvGrd)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "No. Rooms Above Grade"); bar_TotRmsAbvGrd


#FIREPLACES
#(discrete)
bar_Fireplaces<-ggplot(data = all, aes(x = Fireplaces)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "No. of Fireplaces"); bar_Fireplaces

#skewed
#DISCRETE BUT STILL...
hist_GarageYrBlt <-ggplot(data = all, aes(x = GarageYrBlt)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) + 
  scale_x_continuous(limits = c(1900, 2020))
  #labs(title = "Garage: Year Built"); hist_GarageYrBlt

#(discrete) (this is absurd, clearly should be ordinal??)
bar_GarageCars<-ggplot(data = all, aes(x = GarageCars)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Garage Car Capacity"); bar_GarageCars

#0 = no garage
#CONTINUOUS
hist_GarageArea <-ggplot(data = all, aes(x = GarageArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
 # labs(title = "Garage Area"); hist_GarageArea

#continuous
hist_WoodDeckSF <-ggplot(data = all, aes(x = WoodDeckSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Wood Deck: SF"); hist_WoodDeckSF

#continuous
hist_OpenPorchSF <-ggplot(data = all, aes(x = OpenPorchSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Open Porch SF"); hist_OpenPorchSF

#continuous
hist_EnclosedPorch <-ggplot(data = all, aes(x = EnclosedPorch)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Open Porch SF"); hist_OpenPorchSF

#continuous
hist_X3SsnPorch <-ggplot(data = all, aes(x = X3SsnPorch)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "3-Season Porch SF"); hist_X3SsnPorch

#continuous
hist_ScreenPorch <-ggplot(data = all, aes(x = ScreenPorch)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Screen Porch SF"); hist_ScreenPorch

#Continuous
hist_PoolArea <-ggplot(data = all, aes(x = PoolArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Pool Area"); hist_PoolArea

#continuous
hist_MiscVal <-ggplot(data = all, aes(x = MiscVal)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) 
  #labs(title = "Value of Misc. Feature"); hist_MiscVal

#MO SOLD?? (DISCRETE) (RECODE)
bar_MoSold <-ggplot(data = all, aes(x = MoSold)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Month Sold"); bar_MoSold

#YR SOLD (discrete)
bar_YrSold <-ggplot(data = all, aes(x = YrSold)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Year Sold"); bar_YrSold
grid.arrange(hist_lotfrontage, hist_lotarea, bar_overallqual, bar_overallcond, hist_yearbuilt, hist_yearremod, hist_masvnrarea, hist_bsmtfinsf1, hist_bsmtfinsf2, hist_bsmtunfsf,  hist_totalbsmtsf,
hist_X1stFlrSF, hist_X2ndFlrSF, hist_LowQualFinSF, hist_GrLivArea, bar_BsmtFullBath, bar_BsmtHalfBath, bar_FullBath, bar_HalfBath, bar_BedroomAbvGr, bar_KitchenAbvGr, bar_TotRmsAbvGrd, bar_Fireplaces, hist_GarageYrBlt, bar_GarageCars, hist_GarageArea, hist_WoodDeckSF, hist_OpenPorchSF, hist_EnclosedPorch, hist_X3SsnPorch, hist_ScreenPorch, hist_PoolArea, hist_MiscVal, bar_MoSold, bar_YrSold, ncol = 4)

#### Comments

Looking at these plots, several trends are clear:

  • Many of the numeric variables are not normally distributed, and will need to be transformed before modeling. Often this is due to large numbers of observations coded 0, signifying properties that lack a particular feature. Some variables (PoolArea, X3SsnPorch) contain so few non-zero observations that they’re not visible on plots.

  • Other variables (LotArea, LotFrontage, MasVnrArea, etc) appear to be skewed due to the presence of outliers, which I’ll want to explore further.

  • (Multi)collinearity is likely to be an issue with this dataset. I would expect some variables to be highly correlated (e.g., GarageCars and GarageArea).

  • Some variables need to be recoded. Of the 36 variables initially coded as numeric (not including ID or SalePrice):

    • One (MSSubClass) is clearly categorical, and only appears numeric because its categories are labeled with numbers; it’s included with the categorical plots and will be recoded as a factor.

    • 4 (MoSold, YrSold, OverallQual, and OverallCond) are ambiguous- they could be treated as integers, ordinal factors, or categorical factors depending on modeling requirements. These will be revisited.

2.2.2 Categorical Variables

These plots include MSSubClass, which will be recoded as a factor.

cat <-all %>%
  select(where(is.character))
cat_names <-names(cat); cat_names
##  [1] "MSZoning"      "Street"        "Alley"         "LotShape"     
##  [5] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
##  [9] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [13] "HouseStyle"    "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [17] "Exterior2nd"   "MasVnrType"    "ExterQual"     "ExterCond"    
## [21] "Foundation"    "BsmtQual"      "BsmtCond"      "BsmtExposure" 
## [25] "BsmtFinType1"  "BsmtFinType2"  "Heating"       "HeatingQC"    
## [29] "CentralAir"    "Electrical"    "KitchenQual"   "Functional"   
## [33] "FireplaceQu"   "GarageType"    "GarageFinish"  "GarageQual"   
## [37] "GarageCond"    "PavedDrive"    "PoolQC"        "Fence"        
## [41] "MiscFeature"   "SaleType"      "SaleCondition"
#nominal- #THIS IS REALLY NOMINAL, SHOULD BE RECODED PERHAPS? 
bar_mssubclass <-ggplot(data = all, aes(x = factor(MSSubClass))) +
  geom_bar(fill = "#3B9AB2")  
  #scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  #labs(title = "Type of Dwelling"); bar_mssubclass

#nominal- best to recode names and do coord_flip
bar_MSZoning<-ggplot(data = all, aes(x = MSZoning)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "MS Zoning"); bar_MSZoning

#nominal
bar_Street<-ggplot(data = all, aes(x = Street)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Type of Road Access"); bar_Street

#nominal
bar_Alley<-ggplot(data = all, aes(x = Alley)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Type Alley Access"); bar_Alley

#ordinal (re-code?)
bar_LotShape<-ggplot(data = all, aes(x = LotShape)) +
  geom_bar(fill = "#3B9AB2")  
 # labs(title = "Lot Shape"); bar_LotShape

#nominal (re-code?)
bar_LandContour <-ggplot(data = all, aes(x = LandContour)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Land Contour: Property Flatness"); bar_LandContour

#ordinal (recode?)
bar_Utilities <-ggplot(data = all, aes(x = Utilities)) +
  geom_bar(fill = "#3B9AB2") #+ 
  #labs(title = "Type of Utilities Available"); bar_Utilities

#nominal
bar_LotConfig <-ggplot(data = all, aes(x = LotConfig)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Lot Configuration"); bar_LotConfig

#ordinal
bar_LandSlope <-ggplot(data = all, aes(x = LandSlope)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Property Slope"); bar_LandSlope

#Nominal- need to redo this one
bar_Neighborhood <-ggplot(data = all, aes(x = Neighborhood)) +
  geom_bar(fill = "#3B9AB2") + coord_flip() 
 # labs(title = "Neighborhood"); bar_Neighborhood

#nominal (Recode)
bar_Condition1 <-ggplot(data = all, aes(x = Condition1)) +
  geom_bar(fill = "#3B9AB2") + coord_flip() 
  #labs(title = "Proximity to Var. Conditions"); bar_Condition1
#
#nominal- #need to figure this out
bar_Condition2 <-ggplot(data = all, aes(x = Condition2)) +
  geom_bar(fill = "#3B9AB2") + coord_flip() 
  #labs(title = "Proximity to >1 Condition"); bar_Condition2

#nominal
bar_BldgType <-ggplot(data = all, aes(x = BldgType)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Building Type"); bar_BldgType

#nominal
bar_HouseStyle <-ggplot(data = all, aes(x = HouseStyle)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "House Style"); bar_HouseStyle

#nominal
bar_RoofStyle <-ggplot(data = all, aes(x = RoofStyle)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Roof Style"); bar_RoofStyle

#nominal
bar_RoofMatl <-ggplot(data = all, aes(x = RoofMatl)) +
  geom_bar(fill = "#3B9AB2") + coord_flip() 
  #labs(title = "Roof Material"); bar_RoofMatl

#nominal
bar_Exterior1st <-ggplot(data = all, aes(x = Exterior1st)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
  #labs(title = "Exterior Covering on House"); bar_Exterior1st

#nominal
bar_Exterior2nd <-ggplot(data = all, aes(x = Exterior2nd)) +
  geom_bar(fill = "#3B9AB2") + coord_flip() 
  #labs(title = "Exterior Covering if >1"); bar_Exterior2nd

#nominal
bar_MasVnrType <-ggplot(data = all, aes(x = MasVnrType)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Masonry Veneer Type"); bar_MasVnrType

#ordinal
bar_ExterQual <-ggplot(data = all, aes(x = ExterQual)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Exterior Quality"); bar_ExterQual

#ordinal -NEED TO RE-CODE X-AXIS FOR THIS CONDITION VAR, AND MAYBE OTHERS? 
bar_ExterCond <-ggplot(data = all, aes(x = ExterCond)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Exterior Condition"); bar_ExterCond

#nominal
bar_Foundation<-ggplot(data = all, aes(x = Foundation)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Type of Foundation"); bar_Foundation

#ordinal
bar_BsmtQual<-ggplot(data = all, aes(x = BsmtQual)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Basement Quality: Height"); bar_BsmtQual

#ordinal #AGAIN, NEED TO REORGANIZE DIRECTION ON X-AXIS, THIS IS ORDINAL
#IT'S CURRENTLY IN ALPHABETICAL ORDER
bar_BsmtCond<-ggplot(data = all, aes(x = BsmtCond)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Basement Condition"); bar_BsmtCond

#ordinal
bar_BsmtExposure<-ggplot(data = all, aes(x = BsmtExposure)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Basement Exposure (Walkout/Garden level)"); bar_BsmtExposure

#ordinal (recode labels)
bar_BsmtFinType1<-ggplot(data = all, aes(x = BsmtFinType1)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Rating of Basement Finished Area"); bar_BsmtFinType1

#ordinal
bar_BsmtFinType2<-ggplot(data = all, aes(x = BsmtFinType2)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Finished Basement Rating if >1 Area"); bar_BsmtFinType2

#nominal
bar_Heating<-ggplot(data = all, aes(x = Heating)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Type of Heating"); bar_Heating

#ordinal
bar_HeatingQC<-ggplot(data = all, aes(x = HeatingQC)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Heating Quality/Condition"); bar_HeatingQC

#nominal- also BINARY
bar_CentralAir<-ggplot(data = all, aes(x = CentralAir)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Central Air"); bar_CentralAir

#ordinal- NOTE THIS IS ORDINAL, NOT NOMINAL
bar_Electrical<-ggplot(data = all, aes(x = Electrical)) +
  geom_bar(fill = "#3B9AB2")  
 # labs(title = "Electrical System"); bar_Electrical

#ordinal- note this is a likert scale- FIX ORDER/RECODE
bar_KitchenQual<-ggplot(data = all, aes(x = KitchenQual)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Kitchen Quality"); bar_KitchenQual

#(ordinal) #maybe important?? Recode, currently in alphabetical order
bar_Functional<-ggplot(data = all, aes(x = Functional)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Home Functionality"); bar_Functional

#F(ordinal) (recode)
bar_FireplaceQu<-ggplot(data = all, aes(x = FireplaceQu)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Fireplace Quality"); bar_FireplaceQu

#(nominal) (bizarrely not ordinal)
bar_GarageType<-ggplot(data = all, aes(x = GarageType)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Garage Type/Location"); bar_GarageType

#(ordinal) (recode)
bar_GarageFinish<-ggplot(data = all, aes(x = GarageFinish)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Interior Garage Finish"); bar_GarageFinish

#(ordinal) (recode)
bar_GarageQual<-ggplot(data = all, aes(x = GarageQual)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Garage Quality"); bar_GarageQual

#(ordinal) (recode)
bar_GarageCond<-ggplot(data = all, aes(x = GarageCond)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Garage Condition"); bar_GarageCond

#(ordinal) (recode)
bar_PavedDrive<-ggplot(data = all, aes(x = PavedDrive)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Driveway Paving Material"); bar_PavedDrive

#(ordinal) (recode)
bar_PoolQC<-ggplot(data = all, aes(x = PoolQC)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Pool Quality"); bar_PoolQC

#(ordinal) (recode)
bar_Fence<-ggplot(data = all, aes(x = Fence)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Fence Quality"); bar_Fence

#(nominal)
bar_MiscFeature<-ggplot(data = all, aes(x = MiscFeature)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Misc. Feature"); bar_MiscFeature


#SALE TYPE (nominal)
bar_SaleType <-ggplot(data = all, aes(x = SaleType)) +
  geom_bar(fill = "#3B9AB2") 
  #labs(title = "Sale Type"); bar_SaleType

#SALE CONDITION (nominal)
bar_SaleCondition <-ggplot(data = all, aes(x = SaleCondition)) +
  geom_bar(fill = "#3B9AB2") 
 # labs(title = "Condition of Sale"); bar_SaleCondition
grid.arrange(bar_mssubclass, bar_MSZoning, bar_Street, bar_Alley, bar_LotShape,bar_LandContour, bar_Utilities,bar_LotConfig, bar_LandSlope, 
 bar_Neighborhood, bar_Condition1, bar_Condition2, bar_BldgType, bar_HouseStyle, bar_RoofStyle, bar_RoofMatl, bar_Exterior1st, bar_Exterior2nd, bar_MasVnrType, bar_ExterQual, bar_ExterCond, bar_Foundation, 
 bar_BsmtQual, bar_BsmtCond, bar_BsmtExposure, bar_BsmtFinType1, bar_BsmtFinType2, bar_Heating, bar_HeatingQC, bar_CentralAir,  bar_Electrical, bar_KitchenQual, bar_Functional, bar_FireplaceQu, bar_GarageType, bar_GarageFinish, bar_GarageQual, bar_GarageCond, bar_PavedDrive, bar_PoolQC, bar_Fence, bar_MiscFeature, bar_SaleType, bar_SaleCondition,ncol = 4)

#### Comments

  • As with the numeric variables, some of the categorical ones display evidence of non-normality and (multi)collinearity, while others (Street, Utilities) may not be useful predictors.

  • There are significant amounts of missing data. Many of these NA observations are likely to be structurally missing, meaning missing for a logical reason: e.g., if GarageArea = NA, the property has no garage. Missing data will require detailed exploration and imputation.

  • It might be useful to bin certain variables with large numbers of categories (e.g., Neighborhood, MSSubClass).

  • All of the categorical variables have class “character”, and will need to be recoded, since most modeling algorithms won’t accept character data.

3 Data Cleaning

3.1 Missing Data Analysis

3.1.1 Identify Missing Data

Before performing any bivariate analysis or modeling, I need to deal with missing data. First, I’ve identified all variables with missing values. Note that SalePrice is missing 1459 observations; these are the NA values in the test set, which I will later predict. Next, the aggr plot (from the VIM package) shows the proportion of missing data for each variable with > 1 missing value (not including SalePrice).

miss <-sort(colSums(is.na(all)), decreasing = TRUE)
miss[miss>0]
##       PoolQC  MiscFeature        Alley        Fence    SalePrice  FireplaceQu 
##         2908         2812         2719         2346         1459         1420 
##  LotFrontage  GarageYrBlt GarageFinish   GarageQual   GarageCond   GarageType 
##          486          159          159          159          159          157 
##     BsmtCond BsmtExposure     BsmtQual BsmtFinType2 BsmtFinType1   MasVnrType 
##           82           82           81           80           79           24 
##   MasVnrArea     MSZoning    Utilities BsmtFullBath BsmtHalfBath   Functional 
##           23            4            2            2            2            2 
##  Exterior1st  Exterior2nd   BsmtFinSF1   BsmtFinSF2    BsmtUnfSF  TotalBsmtSF 
##            1            1            1            1            1            1 
##   Electrical  KitchenQual   GarageCars   GarageArea     SaleType 
##            1            1            1            1            1
na <-sort(colSums(is.na(all)), decreasing = TRUE)
na <-na[na >1]

missing <-all[names(na)]
missing <-subset(missing, select = -SalePrice)

aggr(missing, only.miss = TRUE, cex.axis = 0.7,  bars = FALSE,
     combined = TRUE, col =c("#3B9AB2","#F21A00"), prop = TRUE, freq = TRUE, border = NA)
title("Proportion of Missing Data")

attach(all)

The plot supports my conjecture that most of the NA data is structurally missing. The variables with the greatest proportion of NAs (PoolQC, MiscFeature, Alley, Fence, FireplaceQu) represent unusual features: most properties won’t have a pool or a miscellaneous feature like an elevator, and only some will have alley access, a fence, or a fireplace. Furthermore, the plot also shows that the same observations are NA across groups of related variables (garage variables, basement variables), suggesting these properties lack garages and basements.

My strategy for replacing missing data will be as follows:

  1. Examine each variable and decide whether the NAs are structurally missing, i.e., refer to an absent feature.

  2. If so, identify/rule out any NA observations for that variable which are not structurally missing by looking at the records for related variables. For example, where FireplaceQu = NA, “Fireplaces” should = 0; if there are any cases where this is not true, I’ll assume there actually is a fireplace and that particular observation for FireplaceQu will need to be imputed.

  3. Recode all structurally missing values as either 0 (for numeric variables), or some variation of Not Present (for categorical variables—e.g., GarageCond = NA will become “No Garage”).

  4. For variables where most NAs are not structurally missing, determine whether there’s an obvious reason to classify them as MCAR (Missing Completely at Random) or MNAR (Missing Not at Random). If not, treat these NAs as MAR (Missing at Random) and impute them using the mice (multiple imputation by chain equations) package.

3.1.2 MD Mechanisms

3.1.2.1 Pool Variables

For Pool QC, 2906 out of 2914 values are NA. I assume most or all of these are structurally missing, since most properties don’t have pools. There are 9 non-NA values indicating properties that do have pools:

sum(!is.na(PoolQC)) 
## [1] 9

To look for any non-structural missing values, I cross-checked PoolQC with PoolArea to identify properties where a) PoolArea \(\neq\) 0 and PoolQC = NA, or b) PoolQC \(\neq\) NA and PoolArea = 0.

PoolArea has no NAs; I assume that any observations where PoolArea > 0 do have pools. There are 12 of these.

which(is.na(PoolArea)) 
## integer(0)
pools <-all[PoolArea > 0,]
pools <-pools[,c("Id", "PoolArea","PoolQC")]; pools
##         Id PoolArea PoolQC
## 198    198      512     Ex
## 811    811      648     Fa
## 1171  1171      576     Gd
## 1183  1183      555     Ex
## 1387  1387      519     Fa
## 1424  1424      738     Gd
## 5151  1975      144     Ex
## 9611  2421      368   <NA>
## 10441 2504      444   <NA>
## 11141 2574      228     Ex
## 11401 2600      561   <NA>
## 12511 2711      800     Gd

Since 3 properties (IDs 2421, 2504, and 2600) do have pools, these values of PoolQC are not structurally missing.

poolna <-pools[which(is.na(pools$PoolQC)),]
poolna <-poolna[,c("Id", "PoolArea","PoolQC")]
poolna
##         Id PoolArea PoolQC
## 9611  2421      368   <NA>
## 10441 2504      444   <NA>
## 11401 2600      561   <NA>

I looked at the full records for these properties, but found no obvious indication of why the data might be missing. I’ll assume they are MAR, and will impute using MICE.

3.1.2.2 MiscFeature/Val

MiscFeature and MiscVal

I matched MiscFeature records to MiscVal to identify any non-structural NAs.

105 properties have MiscFeatures: 2812 do not.

sum(!is.na(MiscFeature))
## [1] 105
sum(is.na(MiscFeature)) 
## [1] 2812

MiscVal (the dollar value of any MiscFeature) has no NAs and has 103 values > 0.

sum(is.na(MiscVal))  
## [1] 0
sum(MiscVal > 0) 
## [1] 103

This suggests that all NAs in MiscFeature are structurally missing, and 2 properties have incorrect values of 0 for MiscVal. To confirm this, I searched for records where MiscFeature \(\neq\) NA and MiscVal = 0; however, I found 3 of these properties, not 2.

ftr <-all[!(is.na(MiscFeature)),] #all properties with MiscFeatures
ftr <-ftr[ftr$MiscVal == 0,]
ftr <-ftr[,c("Id", "MiscFeature", "MiscVal")]; ftr
##        Id MiscFeature MiscVal
## 874   874        Othr       0
## 1201 1201        Shed       0
## 9721 2432        Shed       0

Therefore, there must be one additional non-structural NA observation, where MiscFeature = NA and MiscVal > 0:

val <-all[MiscVal > 0,]%>%
  filter(is.na(MiscFeature)) 
val <-val[,c("Id", "MiscFeature", "MiscVal")]; val
##         Id MiscFeature MiscVal
## 10901 2550        <NA>   17000

So, there are 102 properties with MiscVal and MiscFeature correctly identified; 3 properties where MiscFeature \(\neq\) NA but MiscVal = 0; 1 property with MiscVal > 0 but no MiscFeature; and (2917-106) = 2811 properties with no miscellaneous features, where the NAs in MiscFeature are structurally missing.

If a property has either MiscVal or MiscFeature filled in, I’ll assume it does have a miscellaneous feature. For the 3 properties where MiscFeature \(\neq\) NA and MiscVal = 0, there are several possibilities:

  1. The MiscFeatures records are incorrect and should be NA, or

  2. The MiscVal records are incorrect and should be > 0, or

  3. The records for both variables are correct: these properties have miscellaneous features that do not increase their value.

Looking at the other variables for these records, there’s no obvious indication of why this data would be missing. I considered whether the features (2 sheds, 1 unidentified “other”) might be somehow undesirable, and thus potentially detracting from property values. However, of all the MiscFeatures, the vast majority (95 out of 105) are sheds, so it seems that sheds generally do add value to a property:

table(all$MiscFeature)
## 
## Gar2 Othr Shed TenC 
##    5    4   95    1

I noticed that for Id# 874, LowQualFinSF = 232. Most properties have LowQualFinSF = 0. I’m going to assume this refers to the property’s MiscFeature and leave the MiscVal at 0. For the other two, 1201 and 2432, I’ll assume these values are MAR and impute them.

Finally, for property #2550, where MiscVal \(\neq\) 0 but MiscFeat = NA, I will impute MiscFeat as “Other.” This is the largest property in the dataset and is also quite new, so it could contain any number of unusual and expensive features.

3.1.2.3 Fireplace Variables

I cross-checked FireplaceQu with Fireplaces and found 1420 values of 0 (meaning the property has no fireplace), all for the same properties where FireplaceQu = NA. All NAs for FireplaceQu are structurally missing, so there’s no need to impute any of this data.

sum(is.na(FireplaceQu)) #1420 no-fireplace properties
## [1] 1420
#Check # of Fireplaces; 1420 = 0
table(Fireplaces)
## Fireplaces
##    0    1    2    3    4 
## 1420 1267  219   10    1
#Check to make sure these records are the same
fire <-all%>%
  filter(!is.na(FireplaceQu)) %>%
  filter(Fireplaces == 0)
dim(fire)
## [1]  0 81

3.1.2.4 Basement Variables

There are 11 basement variables: BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinSF1, BsmtFinType2, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath,and BsmtHalfBath. For the most part, NAs for these variables are structurally missing, meaning the property does not have a basement. To identify any truly missing NAs, I compared records across the variables using a dataframe where n = # of NAs for a given basement variable and p = 12 (Id + 11 basement variables). I then looked at 11 versions of this dataframe, one for each variable. This allowed easy identification of non-structural NAs: if the record for that single variable was NA, but some or all of the other variables were filled in, I assumed the property contained a basement.

For example, below, I looked at BsmtFinType2, with NA = 80. One can clearly see that the property with ID 333 does have a basement, so this NA is likely MAR.

#Sample database using BsmtFinType2
bsmt_na <-all[is.na(BsmtFinType2),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF","BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]
head(bsmt_na, 8)
##      Id BsmtCond BsmtQual BsmtFinType1 BsmtFinType2 TotalBsmtSF BsmtExposure
## 18   18     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 40   40     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 91   91     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 103 103     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 157 157     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 183 183     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 260 260     <NA>     <NA>         <NA>         <NA>           0         <NA>
## 333 333       TA       Gd          GLQ         <NA>        3206           No
##     BsmtFinSF1 BsmtFinSF2 BsmtUnfSF BsmtFullBath BsmtHalfBath
## 18           0          0         0            0            0
## 40           0          0         0            0            0
## 91           0          0         0            0            0
## 103          0          0         0            0            0
## 157          0          0         0            0            0
## 183          0          0         0            0            0
## 260          0          0         0            0            0
## 333       1124        479      1603            1            0

To first determine the number of properties with no basement, I looked at variables relevant to every record, namely BsmtQual, BsmtCond, and TotalBsmtSF. For the first 2 variables, NA means no basement; to find properties without basements using TotalBsmtSF, I looked at records coded 0.

TotalBsmtSF: 1 NA and 78 0s

  • After ensuring that all 0 values of TotalBsmtSF were coded correctly, I found 78 properties with no basement. For the single NA, ID # 2121, all other basement variables are also NA or 0. It’s reasonable to assume there’s no basement on this property, so I’ll recode this NA to 0. There are 79 properties with no basement.
table(TotalBsmtSF)[1] #Check # of 0 values
##  0 
## 78
bsmt_na <-all[is.na(TotalBsmtSF),]
bsmt_na <-bsmt_na[, c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF","BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

bsmt_na #ID 2121 has no basement, recode as 0
##        Id BsmtCond BsmtQual BsmtFinType1 BsmtFinType2 TotalBsmtSF BsmtExposure
## 6611 2121     <NA>     <NA>         <NA>         <NA>          NA         <NA>
##      BsmtFinSF1 BsmtFinSF2 BsmtUnfSF BsmtFullBath BsmtHalfBath
## 6611         NA         NA        NA           NA           NA

BsmtQual

  • BsmtQual has 81 NAs- since we know there are 79 properties with no basement, 2 of these are non-structural. Plugging BsmtQual into the dataframe reveals 2 properties (ID #s 2218 and 2219) with values for the other basement variables, so I conclude these 2 NAs are MAR and will need to be imputed.
bsmt_na <-all[is.na(BsmtQual),] 
nrow(bsmt_na)
## [1] 81
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]
#Impute 2218 and 2219

BsmtCond

  • There are 3 extra NAs corresponding to 3 properties (ID #s 2041, 2186, and 2525) that do seem to have basements. These values of BsmtCond are therefore MAR and will need to be imputed.
bsmt_na <-all[is.na(BsmtCond),]
nrow(bsmt_na)
## [1] 82
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF","BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath","BsmtHalfBath")]
#Impute IDs 2041, 2186, and 2525

Next, I looked at the other 3 variables for basement square feet, BsmtFinSF1, BsmtFinSF2, and BsmtUnfS. Here, all NAs are some non-structural form of missing data and do not mean “No Basement.” I also checked for missing data by looking at the records coded 0.

BsmtFinSF1

  • The single NA (ID #2121) in this category has no basement, so I will recode this as 0.

  • There are 929 records for which BsmtFinSF1 = 0, which I compared to BsmtFinType1 to check for incongruities (I.e., if BsmtFinSF1 = 0 but BsmtFinType1 = GLQ, this would need to be resolved). No errors were found.

#BSMTFINSF1
table(BsmtFinSF1)[1]
##   0 
## 929
bsmt_na <-all[is.na(BsmtFinSF1),]
nrow(bsmt_na)
## [1] 1
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath","BsmtHalfBath")]

#Recode Id 2121 as 0

#Check the 0s - make sure all values of BsmtFinType1 are either Unf or NA
bsmt_na <-all[BsmtFinSF1 == 0,]
bsmt_na <-bsmt_na[ , c("Id", "BsmtFinType1", "BsmtFinSF1")]
table(bsmt_na$BsmtFinType1)
## 
## Unf 
## 851
sum(is.na(bsmt_na$BsmtFinType1)) #no problem here.
## [1] 79

BsmtFinSF2

  • The single NA (ID 2121) in BsmtFinSF2 has no basement and will be recoded as 0.

  • For records where BsmtFinSF2 = 0, I checked values of BsmtFinType2 for incongruities. For Id #1471, BsmtFinSF2 = 0 but BsmtFinType2 = BLQ. I looked at other values for this property and noted that this basement has BsmtFinType1 = GLQ, BsmtFinSF1 = 1051, BsmtUnfSF = 354, and TotalBsmtSF = 1471. Since the finished and unfinished areas add up to the total area, there is no room left for a “BLQ” rated area. For this record, I assume that BsmtFinSF2 is correctly labeled 0, but BsmtFinType2 should be recoded as “Unf.”

#BSMTFINSF2
table(BsmtFinSF2)[1] #2567 = 0
##    0 
## 2569
bsmt_na <-all[is.na(BsmtFinSF2),] #The only NA is 2121- recode to 0
nrow(bsmt_na)
## [1] 1
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2",  "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

#Check the 0s - make sure all values of BsmtFinType2 are either Unf or NA
bsmt_na <-all[BsmtFinSF2 == 0,]
bsmt_na <-bsmt_na[ , c("Id","BsmtFinType2", "BsmtFinSF2")]

table(bsmt_na$BsmtFinType2) # 1 zero is coded incorrectly (as BLQ)
## 
##  BLQ  Unf 
##    1 2490
which(bsmt_na$BsmtFinType2 == "BLQ")  
## [1] 1300
bsmt_na[1298,] #Id #1471 needs to be recoded
##        Id BsmtFinType2 BsmtFinSF2
## 8100 1468          Unf          0
all[Id == 1471,] #Look at the rest of the record
##         Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 11100 1471        120       RH          26    5858   Pave  <NA>      IR1
##       LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 11100         Lvl    AllPub       FR2       Gtl        NAmes       Norm
##       Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 11100       Norm   TwnhsE     1Story           7           5      1999
##       YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 11100         1999     Gable  CompShg     MetalSd     MetalSd       None
##       MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 11100          0        Gd        TA      PConc       Gd       TA           No
##       BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 11100          GLQ       1051          BLQ          0       354        1405
##       Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 11100    GasA        Ex          Y      SBrkr      1337         0            0
##       GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 11100      1337            1            0        2        0            2
##       KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 11100            1          Gd            5        Typ          1          Fa
##       GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 11100     Attchd        1999          Fin          2        511         TA
##       GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 11100         TA          Y        203          68             0          0
##       ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 11100           0        0   <NA>  <NA>        <NA>       0      6   2010
##       SaleType SaleCondition SalePrice
## 11100       WD        Normal        NA
#Recode BsmtFinType2 Id #1471 to Unf

BsmtUnfSF

  • As with other variables, ID #2121 (a property with no basement) is also incorrectly marked NA and will be recoded as 0.

  • There are 83 records for which BsmtUnfSF = 0, while BsmtFinType2 = Unf. This reveals a problem with the coding for BsmtFinType2 (discussed below), but there are no identifiable errors with BsmtUnfSF.

#The NA has no basement --> coerce to 0 (ID 2121)
table(BsmtUnfSF)[1] #241 have no unf sf
##   0 
## 241
bsmt_na <-all[is.na(BsmtUnfSF),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

#check the 0s-  make sure where bsmtunfsf = 0, bsmtfintype1 and 2 aren't unf
bsmt_na <-all[BsmtUnfSF == 0,]
bsmt_na <-bsmt_na[ , c("Id","BsmtUnfSF", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtFinSF1", "BsmtFinSF2")]

table(bsmt_na$BsmtFinType1) 
## 
## ALQ BLQ GLQ LwQ Rec 
##  34  19  49  35  26
table(bsmt_na$BsmtFinType2) #83 records where BsmtFinType2 = Unf but Unf sqft = 0
## 
## ALQ BLQ GLQ LwQ Rec Unf 
##  13   7  15  30  15  83
unf_rows <-which(bsmt_na$BsmtFinType2 == "Unf") 
head(bsmt_na[unf_rows,], 10) #coding problem evident
##      Id BsmtUnfSF BsmtFinType1 BsmtFinType2 TotalBsmtSF BsmtFinSF1 BsmtFinSF2
## 55   55         0          ALQ          Unf         384        384          0
## 76   76         0          GLQ          Unf         462        462          0
## 121 121         0          ALQ          Unf         938        938          0
## 189 189         0          GLQ          Unf        1086       1086          0
## 203 203         0          LwQ          Unf         617        617          0
## 252 252         0          GLQ          Unf        1573       1573          0
## 263 263         0          ALQ          Unf         506        506          0
## 294 294         0          ALQ          Unf         795        795          0
## 304 304         0          ALQ          Unf         894        894          0
## 307 307         0          ALQ          Unf         700        700          0

Moving on to the remaining basement variables:

BsmtFinType1

  • There are 79 NAs, the same as the number of properties with no basement. No non-structural missing data.
#BsmtFinType1 = 79 NA (NA = no basement)-- No changes, all NAs structurally missing
bsmt_na <-all[is.na(BsmtFinType1),]
nrow(bsmt_na)
## [1] 79
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

BsmtFinType2

  • There are 80 NAs, meaning one extra.
#BSMTFINTYPE2
bsmt_na <-all[is.na(BsmtFinType2),]
nrow(bsmt_na) #80 NAs- 1 extra
## [1] 80
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2",  "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]

#NA for id 333 is wrong-
bsmt_na[bsmt_na$Id == 333,]
##      Id BsmtCond BsmtQual BsmtFinType1 BsmtFinType2 TotalBsmtSF BsmtExposure
## 333 333       TA       Gd          GLQ         <NA>        3206           No
##     BsmtFinSF1 BsmtFinSF2 BsmtUnfSF BsmtFullBath BsmtHalfBath
## 333       1124        479      1603            1            0
#479 sf for BsmtFinSF2 is large for a recroom,, and this entry only has 1 full bathroom, so prob. not a
#2nd living quarters. 
#Recode Id # 1471 to Unf and Id 333 as recroom.

BsmtExposure

  • There are 82 NAs (meaning no basement), meaning 3 NAs are MAR. ID #s 949, 1488, and 2349 do have basements, so these will need to be imputed.
bsmt_na <-all[is.na(BsmtExposure),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

BsmtFullBath

  • The 2 NAs are for properties with basements (IDs 2121 and 2189), and will be recoded as 0.
bsmt_na <-all[is.na(BsmtFullBath),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2","TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", 
"BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

BsmtHalfBath

  • Both NAs are for properties with basements (IDs 2121 and 2189) and will be recoded as 0.
bsmt_na <-all[is.na(BsmtHalfBath),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2","TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2",
"BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]

3.1.2.5 Garage Variables

There are 7 garage variables (GarageYrBlt, GarageFinish, GarageQual, GarageCond, GarageType, GarageCars, and GarageArea). As with basement variables, to identify any non-structural missing data, I created a dataframe with p = 8 (the 7 garage variables + Id) and where n = the # of NAs or # of 0s in each variable. I then rotated each variable in and out of the lead position in order to cross-check NAs against the other variables.

I began by identifying properties with no garage, which determined the expected number of structural NAs for each variable. Across all the garage variables, there were only 2 true NAs. Property #2127 has data for GarageArea, GarageType, and GarageCars, but the other 4 variables need to be imputed. Property # 2577 has only GarageType, so I need to either impute the other data or drop this property; I chose to impute.

GarageYrBlt

  • There are 159 NAs, suggesting properties with no garage. However, 2 of these- IDs 2127 and 2577- contain data for other garage variables and therefore do appear to have garages. For now, I will assume there are 157 properties with no garage and 2 MAR NAs.
garage_na <-all[is.na(GarageYrBlt),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

mar <-garage_na[!is.na(garage_na$GarageType),] #impute these

GarageFinish

  • There are 159 NAs. IDs 2127 and 2577 are also MAR for this variable.
garage_na <-all[is.na(GarageFinish),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

GarageQual

+159 NAs, 2 MAR (2127 and 2577).

garage_na <-all[is.na(GarageQual),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

GarageCond

  • 159 NAs, 2 MAR (2127 and 2577)
garage_na <-all[is.na(GarageCond),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

GarageType

  • 157 NAs, corresponding precisely with the number of properties with no garage. IDs 2127 and 2577 do contain data for GarageType (both are detached).
garage_na <-all[is.na(GarageType),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

GarageCars

  • 1 NA and 158 records where GarageCars = 0, indicating “no garage.” While the value for #2577 is missing and will need to be imputed, there is an entry for #2127 (it’s a 1-car garage). The extra NA is strangely missing data for every variable, including Id and the row number. This may be an author-induced error. We will need to see about imputing this record. Note: Yes, this is wrong!! Need to double check all of these variables. Checking for is.na(all$Id) in the data section doesn’t turn up anything.
table(GarageCars)[1]

garage_na <-all[is.na(GarageCars),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

#check 0s
garage_na <-all[GarageCars == 0,]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

GarageArea

  • There is 1 NA (ID 2577), which we will impute, and 157 records where GarageArea == 0, correctly corresponding to the number of properties with no garage.
table(GarageArea)[1]

garage_na <-all[is.na(GarageArea),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

#check 0s
garage_na <-all[GarageArea == 0,]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond", "GarageFinish", "GarageCars", "GarageYrBlt")]

We conclude there are 157 properties with no garage, for which data is mostly complete and correct. Properties #2127 and #2177 contain MAR data for multiple variables; we will impute this data using MICE.

#Find the 2 MAR
mar
##         Id GarageArea GarageType GarageQual GarageCond GarageFinish GarageCars
## 6671  2127        360     Detchd       <NA>       <NA>         <NA>          1
## 11171 2577         NA     Detchd       <NA>       <NA>         <NA>         NA
##       GarageYrBlt
## 6671           NA
## 11171          NA

3.1.2.6 Exterior Variables

I initially assumed the variables MasVnrType, MasVnrArea, Exterior1st and Exterior2nd would be consistent across categories, e.g., if Exterior1st = “BrkFace” than MasVnrType should also = “BrkFace.” I therefore analyzed them together in hopes of identifying any coding errors or MAR data.

Unfortunately, the very high number of discrepancies is suggestive of a less-obvious coding scheme—there were, for example, 84 records where MasVnrType = “None”, MasVnrArea = 0, and Exterior1st = “BrkFace.” It’s also possible these variables are riddled with errors, but as there’s no way to know, it does not make sense to try to recode this data. I therefore focused on correcting NAs.

MasVnrType

  • 24 NAs. These should all be coerced to “None” as MasVnrArea is 0 and Exterior1st and 2nd are non-masonry materials.
ext_na <-all[is.na(MasVnrType),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]
#Result: coerce these values to "None". 
  • Look for records where MasVnrType == “None”. There are 1742, too many to comb through manually for discrepancies.
#Check values of "None" 
table(MasVnrType)["None"] #yikes that's a lot of values
## None 
## 1742
ext_na <-all[MasVnrType == "None",]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]
  • Filter the “None” records by other variables to look for discrepancies, we find 84 records for which MasVnrType is “None” but Exterior1st is BrkFace. For these, you could coerce MasVnrType to BrkFace; coerce MasVnrArea to NA and impute. However, these are more discrepancies than I’m comfortable treating as errors, so I’m going to leave these alone and just focus on NAs.
#Now filter these to check for errors
#You need to change these 84 values to "BrkFace" 
coerce <-ext_na %>%
  filter(Exterior1st == "BrkFace")

MasVnrArea

  • There are 23 NAs; since all of these records coincide with MasVnrType = “None”, I assume they are structurally missing and will replace them with 0.

  • I also checked observations of MasVnrArea coded “0” for discrepancies with MasVnrType. 7 properties have observations where MasVnrType = “None” and MasVnrArea > 0; 3 of these had MasVnrArea = 1. I will recode these 3 observations (ID #s 774, 1231, and 2453) to 0, since they seem more likely than the others to be coding errors.

#Check NAs. They are all consistent with MasVnrType = NA, so replace with "O"
ext_na <-all[is.na(MasVnrArea),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

#Check "None" and look at discrepancies. 
table(MasVnrArea)[1] #1737 
##    0 
## 1738
table(MasVnrType)["None"]
## None 
## 1742
mas <-all %>%
  filter(MasVnrType == "None") %>%
  filter(MasVnrArea > 0)
mas[,c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]
##        Id MasVnrType MasVnrArea Exterior1st Exterior2nd
## 625   625       None        288     VinylSd     VinylSd
## 774   774       None          1     Wd Sdng     Wd Sdng
## 1231 1231       None          1     Plywood     Plywood
## 1301 1301       None        344     VinylSd     VinylSd
## 1335 1335       None        312     HdBoard     HdBoard
## 2101 1670       None        285     VinylSd     VinylSd
## 9931 2453       None          1     MetalSd     MetalSd

Exterior1st and Exterior2nd

  • For both of these variables, the only missing observation is Id #2152, which I will impute using MICE. There is no “none” category for either of these, so no need to check for missing data there.
ext_na <-all[is.na(Exterior1st),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

ext_na <-all[is.na(Exterior2nd),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

ext_na
##        Id MasVnrType MasVnrArea Exterior1st Exterior2nd
## 6921 2152       None          0        <NA>        <NA>

3.1.2.7 Other Variables

Alley

  • The documentation informs us that NA values of Alley indicate properties without alley access. There’s no comparison variable to check that all of these NAs are structurally missing, so I’ve simply assumed that to be the case and will coerce them to “None” when I factorize this variable.
aly <-all[is.na(Alley), ]
aly <-aly[, c("Id", "Alley")] #Coerce these 2,716 obs to "None"

Fence

  • There are 2,344 missing values of “Fence” indicating properties with no fence. These will be coerced to “No Fence.”
fnc <-all[is.na(Fence), ]
fnc <-fnc[, c("Id", "Fence")] #Coerce these 2,716 obs to "None"

Lot Frontage

  • I did some research on the LotFrontage variable and found that in almost all cases, LotFrontage should be > 0. There are exceptions (adjacent plots of rural land bordered by other rural land on all sides, or plots accessed only by long access roads), but these are quite unusual. Moreover, looking at LotFrontage next to LotType reveals many records where the lot frontage is known to be > 0 (corner lots, frontage on 2 or 3 sides) where LotFrontage is NA. There’s no way to identify “true” values where LotFrontage = NA, and a rigorous investigation of missing data mechanisms is beyonad the scope of this project, so I will treat all missing values of LotFrontage as MAR and impute them using MICE.
summary(LotFrontage) #Check for any Lot Frontage = 0; there are none
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   21.00   59.00   68.00   69.18   80.00  313.00     486
lots <-which(is.na(LotFrontage)) #Impute these values of LotFrontage

MSZoning

As with LotFrontage, there cannot be a “none” category for MSZoning- every property must have a zone. I will impute MSZoning for IDs 1916, 2217, 2251, and 2905.

msz <-all[is.na(MSZoning),]
msz <-msz[,c("Id", "MSZoning")] #Impute these values of MSZoning
msz
##         Id MSZoning
## 4561  1916     <NA>
## 7571  2217     <NA>
## 7911  2251     <NA>
## 14451 2905     <NA>

Utilities

  • There’s no structural category for “No Utilities”, so this is MAR data. Impute IDs 1916 and 1946.
util <-all[is.na(Utilities),]
util <-util[,c("Id", "Utilities")]

Functional

  • No structural missing data category here, so these 2 NAs are MAR. Impute IDs 2217 and 2474.
fnc <-all[is.na(Functional),]
fnc <-fnc[,c("Id", "Functional")]

Electrical

  • Impute ID #1380.
elec <-all[is.na(Electrical),]
elec <-elec[,c("Id", "Electrical")]

KitchenQual

  • Impute ID 1556.
kq <-all[is.na(KitchenQual),]
kq <-kq[,c("Id", "KitchenQual")]

SaleType

  • Impute ID 2490.
slty <-all[is.na(SaleType),]
slty <-slty[,c("Id", "SaleType")]

3.2 Variable Classification

As previously noted, the project documentation classifies each variable as either discrete, continuous, ordinal or nominal, while in R, all variables initially have class character or integer. One of the challenges of this project involved determining a) whether to retain the same typology listed in the documentation, and b) how to classify ambiguous variables and in particular, ordinal variables, in R, for the sake of both convenience and statistical accuracy. Some types of models will accept variables in any format (e.g., random forest), while others will accept only numerical data (e.g., LASSO).

3.2.1 Data Types

For the most part, I retained the data typology listed in the project documentation, i.e., if the documentation said a variable was ordinal I kept it that way. The only exception is MoSold, which is listed as discrete and which I think is more accurately treated as nominal. Month numbers refer only to their chronological order, not to any relationship with the response- if anything, I think the season a house was sold in might be a better predictor than the month, and in that case January and December would be grouped together, not at opposite ends of the spectrum.

Excluding SalePrice (continuous) and ID (discrete), variable types are as follows:

  • 19 Continuous: LotFrontage, LotArea, MasVarArea, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, X1stFlrSF, X2ndFlrSF, LowQualFinSF, GrLivArea, GarageArea, WoodDeckSF, OpenPorchSF, EnclosedPorch, X3SsnPorch, ScreenPorch, PoolArea, MiscVal

  • 23 Ordinal: LotShape, Utilities, LandSlope, OverallQual, OverallCond, ExteriorQual, ExteriorCond, BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2, HeatingQC, Electrical, KitchenQual, Functional, FireplaceQu, GarageFinish, GarageQual, GarageCond, PavedDrive, PoolQC, Fence

  • 24 Nominal: MSSubClass, MSZoning, Street, Alley, LandContour, LotConfig, Neighborhood, Condition1, Condition2, BldgType, HouseStyle, RoofStyle, RoofMaterial, Exterior1, Exterior2, MasVnrType, Foundation, Heating, CentralAir, GarageType, MiscFeature, SaleType, SaleCond, MoSold

  • 13 Discrete: YearBuilt, YearRemodAdd, BsmtFullBath, BsmtHalfBath, FullBath, HalfBath, BedroomAbvGrd, KitchenAbvGrd, TotRmsAbvGrd, Fireplaces, GarageYrBlt, GarageCars, YrSold

3.2.2 Recoding

I recoded nominal variables as categorical (factors), continuous as numeric, and discrete as integers. For classifying ordinal factors in R, I had 3 choices:

  1. As “ordered factors.” This is statistically accurate, but significantly limits my methodological choices, which doesn’t make sense when building a predictive model.

  2. As nominal categorical variables, which can then be converted to dummies/one-hot-encoded for algorithms that require numerical data. This is convenient, but highly undesirable and statistically inaccurate, as it tells the model to ignore the ordering in these variables. Given the number of ordinal variables that measure quality and the fact that these look like some of the strongest predictors in the data, I don’t want to use this approach.

  3. As integers. This assumes that the categories are equally spaced in terms of numerical distance, which might or might not be true. Still, this is a common approach to ordinal data in the real world, and violating this assumption is unlikely to significantly skew the results.

I chose to initially code the ordinal variables as ordered factors— this allowed me to reorder/relabel the levels as needed, and also ensured accurate (though slow) imputation. I’ll then convert to integers for further analysis and modeling.

#RECODING
detach(all)

#CONTINUOUS FEATURES --> NUMERICS
nums <-c("LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal", "SalePrice")


all[nums] <- lapply(all[nums], function(x) as.numeric(x)) #Factorize the above


#NOMINAL FEATURES --> FACTORS
factors <-c("MSSubClass", "MSZoning", "Street", "LandContour", "LotConfig",
            "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", 
            "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", 
            "Foundation", "Heating", "CentralAir", "SaleType", "SaleCondition", 
            'MoSold')


all[factors] <- lapply(all[factors], function(x) as.factor(x)) #Factorize the above

###MSSubClass
all$MSSubClass <-revalue(all$MSSubClass, 
                         c("20" = "1 story 1946 +",
                           "30" = "1 story 1945 -", 
                           "40" = "1 story finished attic", 
                           "45" = "1.5 story unfinished",
                           "50" = "1.5 story finished",
                           "60" = "2 story 1946 +",
                           "70" = "2 story 1945 -",
                           "75" = "2.5 story",
                           "80" = "Split or multilevel",
                           "85" = "Split foyer",
                           "90" = "Duplex",
                           "120" = "1 story PUD 1946 +",
                           "150" = "1.5 story PUD",
                           "160" = "2 story PUD 1946 +",
                           "180" = "PUD Multilevel",
                           "190" = "2 family conversion"))

#####Street
all$Street <-revalue(all$Street,  c("Grvl" = "Gravel", 
                                    "Pave" ="Paved"))

#####Alley 
all$Alley[is.na(all$Alley)] <-'No alley access' #Fix NAs
all$Alley <-factor(all$Alley)
all$Alley <-revalue(all$Alley, c('Grvl' = 'Gravel', 
                                 'Pave' = 'Paved'))


####Land Contour
all$LandContour <-revalue(all$LandContour, c('Bnk' = 'Banked',
                                             'HLS' = 'Hillside',
                                             'Low' = 'Depression',
                                             'Lvl' = 'Flat/Level'))

########CONDITION 1-
all$Condition1 <-revalue(all$Condition1, c('Artery' = 'Adj. to Artery St',
                                           'Feedr' = 'Adj. to Feeder St',
                                           'Norm' = 'Normal',
                                           'RRNn' = 'Near NS RR', 
                                           'RRAn' = 'Adj. to NS RR',
                                           'PosA' = 'Adj. to positive feature',
                                           'PosN' = 'Near positive feature',
                                           'RRNe' = 'Near ES RR', 
                                           'RRAe' = 'Adj. to EW RR'))

#########CONDITION2
all$Condition2 <-revalue(all$Condition2, c('Artery' = 'Adj. to Artery St',
                                           'Feedr' = 'Adj. to Feeder St',
                                           'Norm' = 'Normal',
                                           'RRNn' = 'Near NS RR', 
                                           'RRAn' = 'Adj. to NS RR',
                                           'PosA' = 'Adj. to positive feature',
                                           'PosN' = 'Near positive feature',
                                           'RRNe' = 'Near ES RR', 
                                           'RRAe' = 'Adj. to EW RR'))

######BldgType
all$BldgType <-revalue(all$BldgType, c('1Fam' = '1Family',
                                       '2fmCon' = '2Family Conversion',
                                       'Duplex' = 'Duplex', 
                                       'Twnhs' = 'Townhouse InsideUnit',
                                       'TwnhsE' = 'Townhouse EndUnit'))


##MASVNRTYPE
all$MasVnrType <-revalue(all$MasVnrType, c('BrkCmn' = 'BrickCommon', 
                                            'BrkFace' = 'BrickFace'))
all$MasVnrType[is.na(all$MasVnrType)] <- 'None' #Fix NAs

##HEATING
all$Heating <-revalue(all$Heating, c('GasA' = 'Gas Forced Air',
                                     'GasW' = 'Gas Hot Water',
                                     'Grav' = 'Gravity',
                                     'OthW' = 'HW/Steam o/t gas',
                                     'Wall' = 'Wall Furnace'))

########GARAGETYPE
all$GarageType[is.na(all$GarageType)] <- "No Garage" #Fix NAs
all$GarageType <-as.factor(all$GarageType)

########MISCFEATURE
all$MiscFeature[is.na(all$MiscFeature)] <-"None" #Fix NAs
all$MiscFeature <-as.factor(all$MiscFeature)

After converting the continuous and nominal variables to numerics and factors, all of the data types are correct except for the remaining 24 character variables. These are the ordinal factors, which will also be converted and recoded.

table(sapply(all, class)) 
## 
## character    factor   integer   numeric 
##        21        24        16        20
####ORDINAL FEATURES --> ORDERED FACTORS

#####LOTSHAPE
all$LotShape <-factor(all$LotShape, order = TRUE,
                      levels = c("IR3", "IR2", "IR1", 'Reg')) 


all$LotShape <-revalue(all$LotShape, c("IR3" ="Irregular",
                                       'IR2' = "Mod. Irregular",
                                       'IR1' = 'Sl. Irregular',
                                       'Reg' = 'Regular'))


#####UTILITIES
all$Utilities <-factor(all$Utilities, order = TRUE, levels = c('ELO', 'NoSeWa',
                                                               'NoSewr', 'AllPub'))
all$Utilities <-revalue(all$Utilities, c('ELO' = 'Elec. only',
                                         'NoSeWa' = 'Elec. and gas', 
                                         'NoSewr' = 'Elec., gas, water', 
                                         'AllPub' = 'All public utils'))


#######LandSlope
all$LandSlope <-factor(all$LandSlope, order = TRUE, levels = c('Gtl', 'Mod', 'Sev'))

#OverallQual
all$OverallQual <-factor(all$OverallQual, order = TRUE)

#OverallCond
all$OverallCond <-factor(all$OverallCond, order = TRUE)

#ExterQual
quals <-c('Fa', 'TA', 'Gd', 'Ex') #Why is this on 4 point scale??
all$ExterQual <- factor(all$ExterQual, order = TRUE, levels = quals)

#Exterior Cond 
quals <-c('Po', 'Fa', 'TA', 'Gd', 'Ex')
all$ExterCond <-factor(all$ExterCond, order = TRUE, levels = quals)

#Bsmt Qual
all$BsmtQual[is.na(all$BsmtQual)] <- "No basement" #Fix NAs
quals <-c('No basement', 'Fa', 'TA', 'Gd', 'Ex')
all$BsmtQual <-factor(all$BsmtQual, order = TRUE, levels = quals) 

#re-assign NAs to MAR datapoints- these values will be imputed by MICE
all[all$Id == 2218, "BsmtQual"] <-NA
all[all$Id == 2219, "BsmtQual"] <-NA

#Bsmt Cond
all$BsmtCond[is.na(all$BsmtCond)] <-"No basement" #Fix NAs
quals <-c('No basement', 'Po', 'Fa', 'TA', 'Gd')
all$BsmtCond <-factor(all$BsmtCond, order = TRUE, levels = quals)

#These values to be imputed by MICE
all[all$Id == 2041, "BsmtCond"] <-NA
all[all$Id == 2186, "BsmtCond"] <-NA
all[all$Id == 2525, "BsmtCond"] <-NA

#Bsmt Exposure
all$BsmtExposure[is.na(all$BsmtExposure)] <-"No basement" 
all$BsmtExposure <-factor(all$BsmtExposure, order = TRUE, 
                          levels = c('No basement', 'No', 'Mn', 'Av', 'Gd'))

all[all$Id == 949, 'BsmtExposure'] <-NA
all[all$Id == 1488, 'BsmtExposure'] <-NA
all[all$Id == 2349, 'BsmtExposure'] <-NA


#BsmtFin Type 1
all$BsmtFinType1[is.na(all$BsmtFinType1)] <- "No basement" #Fix NAs
all$BsmtFinType1 <-factor(all$BsmtFinType1, order = TRUE, 
                          levels = c('No basement', 'Unf', 'LwQ', 'Rec', 'BLQ', 
                                     'ALQ', 'GLQ'))

#BsmtFin Type 2,- 80 NA
all$BsmtFinType2[is.na(all$BsmtFinType2)] <- "No basement"
all$BsmtFinType2 <-factor(all$BsmtFinType2, order = TRUE, 
                          levels = c('No basement', 'Unf', 'LwQ', 'Rec', 'BLQ', 
                                     'ALQ', 'GLQ'))

#correct this NA- we know there is a 2nd type of space that != unfinished
all[all$Id == 333, 'BsmtFinType2'] <-'Rec' 

###Heating QC- no NA
quals <-c('Po', 'Fa', 'TA', 'Gd', 'Ex') 

all$HeatingQC <-factor(all$HeatingQC, order = TRUE, levels = quals)

 #Electrical- 1 NA (ID 1380)- impute w/MICE
all$Electrical <-factor(all$Electrical, order = TRUE, 
                        levels = c('Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'))

all$Electrical <-revalue(all$Electrical, c('Mix' = 'Mixed', 
                                           'FuseP' = 'Po. FuseBox',
                                           'FuseF' = 'Fa. FuseBox',
                                           'FuseA' = 'Av. Fusebox',
                                           'SBrkr' = 'Circuit Brkr'))

#KitchenQual
quals <-c('Fa', 'TA', 'Gd', 'Ex')
all$KitchenQual <-factor(all$KitchenQual, order = TRUE, levels = quals)

#Functional
all$Functional <-factor(all$Functional, order = TRUE, levels = 
                          c('Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'))

all$Functional <-revalue(all$Functional, c('Sev' = 'Severe', 
                                           'Maj2' = 'Maj. Deductions 2',
                                           'Maj1' = 'Maj. Deductions 1', 
                                           'Min2' = 'Min. Deductions 2',
                                           'Min1' = 'Min. Deductions 1',
                                           'Typ' = 'Typ. Functionality'))

#FireplaceQu
all$FireplaceQu[is.na(all$FireplaceQu)] <- 'No Fireplace'
quals <-c('No Fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
all$FireplaceQu <-factor(all$FireplaceQu, order = TRUE, levels = quals)

#Garage Finish, 
all$GarageFinish[is.na(all$GarageFinish)] <- 'No Garage'
all$GarageFinish <-factor(all$GarageFinish, order = TRUE, levels = 
                            c('No Garage', 'Unf', 'RFn', 'Fin'))
all$GarageFinish <-revalue(all$GarageFinish, c('RFn' = 'Rough Fin.'))

#these are MAR, impute later
all[all$Id == 2127, 'GarageFinish'] <-NA
all[all$Id == 2577, 'GarageFinish'] <-NA

#########GARAGEQUAL
all$GarageQual[is.na(all$GarageQual)] <-'No Garage'
quals <-c('No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
all$GarageQual <-factor(all$GarageQual, order = TRUE, levels = quals)

all[all$Id == 2127, 'GarageQual'] <-NA
all[all$Id == 2577, 'GarageQual'] <-NA


#####GARAGECOND
all$GarageCond[is.na(all$GarageCond)] <-'No Garage'
quals <-c('No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
all$GarageCond <-factor(all$GarageCond, order = TRUE, levels = quals)

all[all$Id == 2127, 'GarageCond'] <-NA
all[all$Id == 2577, 'GarageCond'] <-NA


#Paved Drive,
all$PavedDrive <-factor(all$PavedDrive, order = TRUE, levels = c('N', 'P', 'Y'))
all$PavedDrive <-revalue(all$PavedDrive, c('N' = 'Dirt/Grvl',
                                           'P' = 'Part. Pvmnt',
                                           'Y' = 'Paved'))

######POOLQC
all$PoolQC[is.na(all$PoolQC)] <-'No Pool'
quals <-c('No Pool','Fa', 'Gd', 'Ex')
all$PoolQC <-factor(all$PoolQC, order = TRUE, levels = quals)

#these properties do have pools and need to be imputed
all[all$Id == 2421, 'PoolQC'] <-NA
all[all$Id == 2504, 'PoolQC'] <-NA
all[all$Id == 2600, 'PoolQC'] <-NA

#####FENCE- replace all NA
all$Fence[is.na(all$Fence)] <-"No Fence"
all$Fence <-factor(all$Fence, order = TRUE, levels = c('No Fence', 'MnWw', 'GdWo',
                                                       'MnPrv', 'GdPrv'))
all$Fence <-revalue(all$Fence, c('MnWw' = 'Min. wood, wire',
                                 'GdWo' = 'Good wood',
                                 'MnPrv' = 'Min. privacy',
                                 'GdPrv' = 'Good privacy'))



#######DEAL WITH REMAINING NAS IN DISCRETE/NUMERICAL VARS
#GarageYrBlt
all$GarageYrBlt[is.na(all$GarageYrBlt)] <- 0

all[all$Id == 2127 , 'GarageYrBlt'] <-NA
all[all$Id == 2577 , 'GarageYrBlt'] <-NA

###MASVNRAREA
all$MasVnrArea[is.na(all$MasVnrArea)] <-0

all[all$Id == 774, 'MasVnrArea'] <-0
all[all$Id == 1231, 'MasVnrArea'] <-0
all[all$Id == 2453, 'MasVnrArea'] <-0

#Recode single NAs
all[all$Id == 2121, 'BsmtFinSF1'] <-0

all[all$Id == 2121, 'BsmtFinSF2']  <-0

all[all$Id == 2121, 'BsmtUnfSF'] <-0

all[all$Id == 2121, 'TotalBsmtSF'] <-0

3.3 Imputation of Missing Data

I used mice to impute the missing data. For most variables, this amounted to only a few observations; however, more than half of the data from LotFrontage was missing. A density plot of the observed and imputed data shows the imputed distribution is similar to the original one, outliers excluded.

#set.seed(1986)
#impute <- mice(all, m = 5, method = "cart")
#summary(impute)
#densityplot(impute)
#title("Density Plot")


###CHECKING VALUES OF VARIABLES
#impute$imp$PoolQC
#summary(all$PoolQC)

#summary(impute$imp$LotFrontage) 
#summary(all$LotFrontage)

#######COMPLETE THE DATA
#comp_all <-complete(impute, 4)

#sum(is.na(comp_all)) #No remaining NAs

####NOW: WRITE TRAINING AND TESTING DATASETS BASED ON COMP_ALL
#comp_training <-subset(comp_all, comp_all$Id %in% training$Id)
#comp_testing <-subset(comp_all, comp_all$Id %in% testing$Id)

#Save files to Rdata
#save(comp_training, file = "comp_training2.RData")
#save(comp_testing, file = "comp_testing2.RData")

Density Plots of Observed and Imputed Data

Obviously, I won’t be using the imputed values of SalePrice—these were converted back to NAs—but the synchronicity between the observed and imputed distributions does confirm the algorithm is working well.

Note that because this project was compiled in RMarkdown and imputation considerably slows the knitting process, I ran the code in a separate script, saved the completed data to Rdata files, and uploaded them. This allows quick access to the clean and complete data.

load(file = "comp_testing3.Rdata") #Load in completed data separately
load(file = "comp_training3.Rdata")

ctrain <-comp_training
ctest <-comp_testing

ctest$SalePrice <-NA
ctrain$SalePrice <-log(ctrain$SalePrice)
comp_all <-rbind(ctrain, ctest)

4 Exploratory Analysis 2

4.1 Bivariate Plots

4.1.1 Numeric

##################BIVARIATE ANALYSIS
#####PLOT CONTINUOUS VARS
g1 <- ggplot(ctrain, aes(x = LotFrontage, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", colour = "#3B9AB2")

g2 <- ggplot(ctrain, aes(x = LotArea, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g3 <- ggplot(ctrain, aes(x = YearBuilt, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g4 <- ggplot(ctrain, aes(x = YearRemodAdd, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g5 <- ggplot(ctrain, aes(x = MasVnrArea, y = SalePrice)) + geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g6 <- ggplot(ctrain, aes(x = BsmtFinSF1, y = SalePrice)) + geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g7 <- ggplot(ctrain, aes(x = BsmtFinSF2, y = SalePrice)) + geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g8 <- ggplot(ctrain, aes(x = BsmtUnfSF, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g9 <- ggplot(ctrain, aes(x = TotalBsmtSF, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g10 <- ggplot(ctrain, aes(x = X1stFlrSF, y = SalePrice)) + geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g11 <- ggplot(ctrain, aes(x = X2ndFlrSF, y = SalePrice)) + geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g12 <- ggplot(ctrain, aes(x = LowQualFinSF, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2") 

g13 <- ggplot(ctrain, aes(x = GrLivArea, y = SalePrice)) + geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g14 <- ggplot(ctrain, aes(x = BsmtFullBath, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g15 <- ggplot(ctrain, aes(x = BsmtHalfBath, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g16 <- ggplot(ctrain, aes(x = FullBath, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g17 <- ggplot(ctrain, aes(x = HalfBath, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g18 <- ggplot(ctrain, aes(x = BedroomAbvGr, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g19 <- ggplot(ctrain, aes(x = KitchenAbvGr, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g20 <- ggplot(ctrain, aes(x = TotRmsAbvGrd, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g21 <- ggplot(ctrain, aes(x = Fireplaces, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g22 <- ggplot(ctrain, aes(x = GarageYrBlt, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g23 <- ggplot(ctrain, aes(x = GarageCars, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g24 <- ggplot(ctrain, aes(x = GarageArea, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g25 <- ggplot(ctrain, aes(x = WoodDeckSF, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g26 <- ggplot(ctrain, aes(x = OpenPorchSF, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g27 <- ggplot(ctrain, aes(x = EnclosedPorch, y = SalePrice)) + 
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g28 <- ggplot(ctrain, aes(x = X3SsnPorch, y = SalePrice)) +
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g29 <- ggplot(ctrain, aes(x = ScreenPorch, y = SalePrice)) +
  geom_point(alpha = 0.7) + geom_smooth(method = "loess", col = "#3B9AB2")

g30 <- ggplot(ctrain, aes(x = PoolArea, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g31 <- ggplot(ctrain, aes(x = MiscVal, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

g32 <- ggplot(ctrain, aes(x = YrSold, y = SalePrice)) + geom_point(alpha = 0.7) + 
  geom_smooth(method = "loess", col = "#3B9AB2")

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15, g16, g17, g18, g19, g20, g21, g22, g23, g24, g25, g26, g27, g28, g29, g30, g31, g32, ncol = 4)

4.1.1.1 Comments

  • Outliers are an issue for most of these variables, but they appear to be particularly influential for LotFrontage and LotArea, and to a lesser extent for MasVnrArea, GarageArea, and OpenPorchSF. Some of these points are just especially large features, while others are true outliers in that they’re both very large and unusually inexpensive.

  • Most of the SF/area-related variables look like strong predictors. This isn’t surprising, since larger properties are generally more expensive.

  • A number of variables will need to be transformed, especially those with a large number of 0s. It might be helpful to convert GarageYrBlt to a factor, i.e. “No Garage,” “1920-1930,” …, “2000-2010.”

  • YrSold looks almost useless as a predictor. I’m a little surprised by this- I expected to see a more pronounced dip in prices coinciding with the recession of 2008-onwards. Also, the univariate plots showed that far fewer houses were sold in 2010 compared to previous years, but this seems to have had little relationship to sale prices.

4.1.2 Nominal

g33 <-ggplot(ctrain, aes(x = reorder(MSSubClass, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2") + coord_flip()

g34<-ggplot(ctrain, aes(x = reorder(MSZoning, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g35 <-ggplot(ctrain, aes(x = reorder(Street, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g36 <-ggplot(ctrain, aes(x = reorder(Alley,SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g37 <-ggplot(ctrain, aes(x = reorder(LandContour, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g38 <-ggplot(ctrain, aes(x = reorder(LotConfig, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g39 <-ggplot(ctrain, aes(x = reorder(Neighborhood, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2") + coord_flip()

g40 <-ggplot(ctrain, aes(x = reorder(Condition1, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g41 <-ggplot(ctrain, aes(x = Condition2, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g42 <-ggplot(ctrain, aes(x = reorder(BldgType, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g43 <-ggplot(ctrain, aes(x = reorder(HouseStyle, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g44 <-ggplot(ctrain, aes(x = reorder(RoofStyle, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g45 <-ggplot(ctrain, aes(x = reorder(RoofMatl, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g46 <-ggplot(ctrain, aes(x = reorder(Exterior1st, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g47 <-ggplot(ctrain, aes(x = reorder(Exterior2nd, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g48 <-ggplot(ctrain, aes(x = reorder(MasVnrType, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g49 <-ggplot(ctrain, aes(x = reorder(Foundation, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g50 <-ggplot(ctrain, aes(x = reorder(Heating, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g51 <-ggplot(ctrain, aes(x = CentralAir, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g52 <-ggplot(ctrain, aes(x = reorder(GarageType, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g53 <-ggplot(ctrain, aes(x = reorder(MiscFeature, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g54 <-ggplot(ctrain, aes(x = MoSold, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g55 <-ggplot(ctrain, aes(x = reorder(SaleType, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g56 <-ggplot(ctrain, aes(x = reorder(SaleCondition, SalePrice), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

grid.arrange(g33, g34, g35, g36, g37, g38, g39, g40, g41, g42, g43, g44, g45, g46, g47, g48, g49, g50, g51, g52, g53, g54, g55, g56, ncol = 3)

4.1.2.1 Comments

  • Of the nominal categorical variables, MSZoning, Street, Neighborhood, Condition1, RoofMatl, Exterior1st, MasVnrType, Foundation, Heating, CentralAir, and GarageType look to be strong predictors.

4.1.3 Ordinal

g57 <-ggplot(ctrain, aes(x = LotShape, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g58 <-ggplot(ctrain, aes(x = Utilities, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g59 <-ggplot(ctrain, aes(x = LandSlope, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g60 <- ggplot(ctrain, aes(x = factor(OverallQual), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g61 <- ggplot(ctrain, aes(x = factor(OverallCond), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g62 <-ggplot(ctrain, aes(x = ExterQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g63 <-ggplot(ctrain, aes(x = ExterCond, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g64 <-ggplot(ctrain, aes(x = BsmtQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g65 <-ggplot(ctrain, aes(x = BsmtCond, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g66 <-ggplot(ctrain, aes(x = BsmtExposure, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g67 <-ggplot(ctrain, aes(x = BsmtFinType1, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g68 <-ggplot(ctrain, aes(x = BsmtFinType2, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g69 <-ggplot(ctrain, aes(x = HeatingQC, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g70 <-ggplot(ctrain, aes(x = Electrical, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g71 <-ggplot(ctrain, aes(x = KitchenQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g72 <-ggplot(ctrain, aes(x = Functional, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g73 <-ggplot(ctrain, aes(x = FireplaceQu, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g74 <-ggplot(ctrain, aes(x = GarageFinish, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g75 <-ggplot(ctrain, aes(x = GarageQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g76 <-ggplot(ctrain, aes(x = GarageCond, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g77 <-ggplot(ctrain, aes(x = PavedDrive, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g78 <-ggplot(ctrain, aes(x = PoolQC, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g79 <-ggplot(ctrain, aes(x = Fence, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

grid.arrange(g57, g58, g59, g60, g61, g62, g63, g64, g65, g66, g67, g68, g69, g70, g71, g72, g73, g74, g75, g76, g77, g78, g79, ncol = 3)

4.1.3.1 Comments

  • The “Quality” variables look to be some of the strongest predictors, along with GarageFinish, HeatingQC, and Electrical. Other than garages, outdoor components (fences, land slope, driveways) don’t seem especially important.

  • Utilities has zero differentiation in the training set, so it has no predictive power and should be dropped.

4.2 Variable Importance

I ran a simple random forest model in order to get a rough measure of variable importance, as well as to produce a benchmark RMSE.

set.seed(1986)
rf_train <-sample(1:nrow(ctrain), nrow(ctrain)*0.6)
rf_test <-ctrain[-rf_train,]
rf_y <-rf_test$SalePrice

rf_1 <-randomForest(SalePrice~.-Id, data = ctrain, subset = rf_train, importance = TRUE,ntree = 1000)

rf_1
## 
## Call:
##  randomForest(formula = SalePrice ~ . - Id, data = ctrain, importance = TRUE,      ntree = 1000, subset = rf_train) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 26
## 
##           Mean of squared residuals: 0.01754328
##                     % Var explained: 88.86
varImpPlot(rf_1, main = "Random Forest Variable Importance")

Consistent with what we’ve seen so far, variables related to a property’s size and general quality appear much more important than “extras” like pools and porches. The single most important variable is GrLivArea, closely followed by variables likely to be correlated, such as the SF of the 1st and 2nd floors and of the basement.

The other notable result is that very few categorical variables register as important, with the exception of Neighborhood (#2) and MSSubClass (#7). The Neighborhood result isn’t surprising—“Location, location, location!”— but at first glance, I was a little surprisedto see MSSubClass on this plot, since similar variables like “HouseStyle” and “BldgType” don’t seem to be strong predictors. However, a closer look at the levels of MSSubClass shows it to be another strong indicator of property size.

4.3 Multicollinearity

Next, I want to look at multicollinearity in the data. Because this project is focused on prediction rather than inference, building a simple model isn’t a priority, and most of the methods I’ll be using still work in the presence of multicollinearity. However, multicollinearity does affect the results of variance importance measures. My approach will be to eliminate as much multicollinearity as possible without diminishing the model’s predictive ability.

Before looking at these measures, I converted the ordinal factors to integers so I could include them in the analysis.

####CONVERSION TO NUMERICS
#select ordered factors
ordfac <-comp_all %>%
  select(where(is.factor)) %>%
  select(where(is.ordered))

ordfactors <-names(ordfac) #this is correct

#convert ordinal factors to integers
comp_all[ordfactors] <- lapply(comp_all[ordfactors], function(x) as.integer(x))

#update the training and test sets
ctrain <-subset(comp_all, comp_all$Id %in% ctrain$Id)
ctest <-subset(comp_all, comp_all$Id %in% ctest$Id)

4.3.1 VIFs

When I first tried to examine the variance inflation factors (VIFs), R produced an error indicating there are aliased coefficients in the model. This means some variables are perfectly correlated:

#VIFs
lm1 <-lm(SalePrice~.-Id, data = ctrain) #linear model
options(scipen=999)
#gvif <-data.frame(vif(lm1)) #Produces error about aliased coefficients

#Identify aliased coefficients
attributes(alias(lm1)$Complete)$dimnames[[1]]
## [1] "BldgTypeDuplex"    "Exterior2ndCBlock" "TotalBsmtSF"      
## [4] "GrLivArea"
  • It’s not immediately clear why BldgType and Exterior2nd would be aliased; my guess is that Exterior2nd is correlated with Exterior1st, and BldgType may be correlated with MSSubClass (both contain a “Duplex” category). Based on the bivariate plots and random forest output, neither of these variables appears to be terribly important, so I will simply remove them from the VIF analysis.

The other two aliased variables, GrLivArea and TotalBsmtSF, are likely to be linear combinations of other square footage variables.

  • For GrLivArea, the obvious candidates are X1stFlrSF and X2ndFlrSF; LowQualFinSF is also included in GrLivArea. GrLivArea is therefore the sum of these 3 variables.
#Looking at composition of GrLivArea
df <-comp_all %>% 
  filter(comp_all$LowQualFinSF > 0) %>%
  select (X1stFlrSF, X2ndFlrSF, LowQualFinSF, GrLivArea)

head(df)
##     X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 52        816         0          360      1176
## 89       1013         0          513      1526
## 126       520         0          234       754
## 171       854         0          528      1382
## 186      1518      1518          572      3608
## 188       808       704          144      1656
  • TotalBsmtSF is the sum of BsmtFinSF1, BsmtFinSF2, and BsmtUnfSF:
df2 <-comp_all %>%
  filter(comp_all$BsmtUnfSF > 0) 

df2 <-df2 %>%
  filter(df2$BsmtFinSF2 > 0) %>%
  select(BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF)
head(df2)
##    BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 8         859         32       216        1107
## 25        188        668       204        1060
## 27        234        486       180         900
## 44        280        491       167         938
## 45        179        506       465        1150
## 74        320        362       404        1086

Dropping X1stFlrSF, X2ndFlrSF, LowQualFinSF, BsmtFinSF1,BsmtFinSF2, and BsmtUnfSF from the model used to examine the VIFs takes care of the aliased coefficients error.

#Drop aliased variables from VIF model
train_vif <-subset(ctrain, select = -c(Id, LowQualFinSF, X1stFlrSF, X2ndFlrSF, Exterior2nd, BldgType, BsmtFinSF1,BsmtFinSF2))

lm2 <-lm(SalePrice~., data = train_vif)
options(scipen=999)
gvif <-data.frame(vif(lm2))
gvif$sqr <-gvif[,3]^2

gvifs <-gvif %>%
  filter(sqr > 5)

Here, I’ve identified variables with VIF > 5. (Technically,

gvifs
##                        GVIF Df GVIF..1..2.Df..         sqr
## HouseStyle   3334203.355275  7        2.923668    8.547833
## OverallQual        5.286300  1        2.299196    5.286300
## YearBuilt         16.844665  1        4.104225   16.844665
## BsmtQual           5.116737  1        2.262021    5.116737
## TotalBsmtSF        8.385141  1        2.895711    8.385141
## GrLivArea         13.172461  1        3.629389   13.172461
## TotRmsAbvGrd       6.334269  1        2.516797    6.334269
## Fireplaces         5.446938  1        2.333868    5.446938
## FireplaceQu        5.674604  1        2.382143    5.674604
## GarageType     21775.594120  6        2.298780    5.284392
## GarageYrBlt     2084.663082  1       45.658111 2084.663082
## GarageCars         7.723441  1        2.779108    7.723441
## GarageArea         7.718973  1        2.778304    7.718973
## GarageQual        20.886691  1        4.570196   20.886691
## GarageCond        21.821513  1        4.671350   21.821513
## PoolArea           8.396921  1        2.897744    8.396921
## PoolQC             8.195349  1        2.862752    8.195349
## MiscVal           25.303336  1        5.030242   25.303336

The VIF for GarageYrBlt is enormous (> 2,103), so I’ll want to remove this before modeling. I’ll likely do the same for MiscVal, which doesn’t seem to be very important and has a VIF of 25.

Other variables with large VIFs are almost certain to be highly correlated- for example, all of the garage variables, PoolArea and PoolQC, and variables related to the size of the house (TotRmsAbvGrd and GrLivArea). I’ll likely be dropping some of these, but before doing so I want to look at a correlation matrix.

4.3.2 Correlation Matrix

In creating the correlation matrix, I excluded the same variables I removed from the VIF analysis—X1stFlrSF, X2ndFlrSF, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, and LowQualFinSF—which are already known to be perfectly correlated with GrLivArea and TotalBsmtSF. I also excluded any variable without at least one correlation \(\geq\) 0.3.

drop <-subset(ctrain, select = -c(Id, X1stFlrSF, X2ndFlrSF, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, LowQualFinSF))

quant <-drop %>%
  select(where(is.numeric))

cors <-cor(quant, use = "pairwise.complete.obs")
cors <-round(cors, 2)

corrplot1 <-ggcorrplot(cors, lab = TRUE,type= 'lower', hc.order = TRUE, lab_size = 2, tl.cex = 6)

#update the corrplot
quant <-subset(quant, select = -c(Functional, Fence, X3SsnPorch, Utilities, BsmtHalfBath, ScreenPorch, YrSold, LotShape, KitchenAbvGr, BsmtFinType2))

cors <-round(cor(quant), 2)
corrplot2 <-ggcorrplot(cors, lab = TRUE,type= 'lower', hc.order = TRUE, lab_size = 3, tl.cex = 8, tl.col = "black", show.legend = FALSE)
corrplot2

The correlation matrix indicates some variables should be dropped before modeling:

  • GarageYrBlt has a massive VIF, evidently due to its correlation of 0.95 with GarageQual and GarageCond; however, it has only a 0.35 correlation with SalePrice. \(\Rightarrow\) Drop GarageYrBlt

  • PoolQC and PoolArea are highly correlated (\(\rho\) = 0.9). Both have \(\rho\) = 0.04 with SalePrice; I will drop PoolQC, since most of its values are imputed, whereas PoolArea is composed of original data. \(\Rightarrow\) Drop PoolQC

  • GarageQual and GarageCond have \(\rho\) = 0.96, so I want to drop one of these. Their correlation with SalePrice is almost identical (0.37 and 0.36 respectively) and so are their correlation patterns with other variables. \(\Rightarrow\) Drop GarageCond.

  • Similarly, GarageCars and GarageArea have \(\rho\) = 0.89 and nearly identical correlations (0.68, 0.66) with SalePrice. \(\Rightarrow\)Drop GarageArea

  • Fireplaces and FireplaceQu have \(\rho\) = 0.86; their correlations with SalePrice are 0.49 and 0.55, respectively. FireplaceQu also appears slightly more important in the random forest model. \(\Rightarrow\) Drop Fireplaces

  • GrLivArea and TotRmsAbvGrd have \(\rho\) = 0.83. SalePrice has 0.72 with GrLivArea and only 0.53 with TotRmsAbvGrd. \(\Rightarrow\) Drop TotRmsAbvGrd

Variables related to quality, particularly ExterQual, KitchenQual, BsmtQual, and OverallQual, also show collinearity—however, as the boxplots revealed, these variables also have some of the strongest correlations with SalePrice.

Other variables highly correlated with SalePrice include those related to size (GrLivArea, TotalBsmtSF, GarageCars), consistent with results from the bivariate plots and random forest. One very surprising result was the low correlation between BedroomsAbvGr and SalePrice (\(\rho\) = 0.2). Since the number of bedrooms is usually important in a house’s price, one possibility is that for split-level houses, bedrooms in the lower-level areas are not being counted in this feature.

4.4 Feature Engineering

4.4.1 New Variables

There are many options for feature engineering in this dataset, especially related to the age of the house, binning multicategory variables, interactions, and so on. For my initial model, though, I’ve created only three new variables:

  • NeighRich. Binning Neighborhood

Neighborhoods with especially high or low property values are likely to be qualitatively different from more average neighborhoods. I initially tried creating 4 categories for this variable—one for the poorest 3 neighborhoods, one for all others with property values below the mean, one for the 3 most expensive neighborhoods, and another for all others above the mean—but it wasn’t a strong predictor. 3 categories differentiating only the most and least expensive neighborhoods from all the others seems more promising.

df <-ctrain %>%
  mutate(SalePrice =exp(SalePrice))

nb2 <- ggplot(df, aes(x=reorder(Neighborhood, SalePrice, fun = "mean"), y=SalePrice)) +geom_bar(stat='summary', fun= "mean", fill='#3B9AB2') +labs(x='Neighborhood', y="Mean SalePrice") + theme(axis.text.x = element_text(angle = 45)) + geom_hline(yintercept=180151.2, linetype="dashed", color = "#F21A00", weight = 3)

nb2

#Binning neighborhood
comp_all$NeighRich[comp_all$Neighborhood %in% c('StoneBr', 'NridgHt', 'NoRidge')] <- 2
comp_all$NeighRich[!comp_all$Neighborhood %in% c('MeadowV', 'IDOTRR', 'BrDale', 'StoneBr', 'NridgHt', 'NoRidge')] <- 1
comp_all$NeighRich[comp_all$Neighborhood %in% c('MeadowV', 'IDOTRR', 'BrDale')] <- 0

ctrain <-subset(comp_all, comp_all$Id %in% ctrain$Id)
ctest <-subset(comp_all, comp_all$Id %in% ctest$Id)
  • TotalSF, the sum of GrLivArea + TotalBsmtSF, representing the total square footage of each property.
#Total SF
comp_all$TotalSF <-comp_all$GrLivArea + comp_all$TotalBsmtSF 
  • TotalBaths. Extra bathrooms are a highly desirable feature in a house, and a single variable counting the number of baths likely will have more predictive power than separate varibles for full, half, and basement bathrooms. TotalBaths is the sum of bathrooms, with each full bath counting as 1 and each half bath as 0.5.
#Total bathrooms
comp_all$TotalBaths <-comp_all$BsmtHalfBath*0.5 + comp_all$HalfBath*0.5 + comp_all$FullBath + comp_all$BsmtFullBath

4.4.2 Drop Unneeded Variables

For now, I’m dropping only the variables discussed so far: those with significantly inflated VIFs or extremely high correlations with other variables, and those that are perfectly correlated with size-related variables. I also dropped Utilities, due to its lack of predictive power.

drop <-c("GrLivArea", "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "LowQualFinSF", "BsmtHalfBath", "HalfBath", "BsmtFullBath", "FullBath", "BldgType", "Exterior2nd", "Utilities", "GarageYrBlt", "GarageCond", "GarageArea", "PoolQC", "Fireplaces", "MiscVal", "Utilities", "TotRmsAbvGrd", "BldgType", "Exterior2nd")

comp_all <-comp_all[, !(names(comp_all) %in% drop)]

ctrain <-subset(comp_all, comp_all$Id %in% ctrain$Id) 
ctest <-subset(comp_all, comp_all$Id %in% ctest$Id)

4.5 Revisiting Variable Importance

Now that some of the multicollinearity has been removed from the data, I’d like to see how the variable importance measures have changed, so I ran a second random forest.

set.seed(1986)
rf_train <-sample(1:nrow(ctrain), nrow(ctrain)*0.6)
rf_test <-ctrain[-rf_train,]
rf_y <-rf_test$SalePrice

rf_2 <-randomForest(SalePrice~.-Id, data = ctrain, subset = rf_train, importance = TRUE, ntree = 1000)

rf_2
## 
## Call:
##  randomForest(formula = SalePrice ~ . - Id, data = ctrain, importance = TRUE,      ntree = 1000, subset = rf_train) 
##                Type of random forest: regression
##                      Number of trees: 1000
## No. of variables tried at each split: 20
## 
##           Mean of squared residuals: 0.01700892
##                     % Var explained: 89.2
varImpPlot(rf_2, main = "Random Forest Variable Importance- v.2")

Several changes are evident from the first random forest to the second:

  • First, the RMSE and % variance explained have improved slightly, from .13245 and 88.86% in the first model to .12988 and 89.28 in the second. In a predictive contest, any improvements are valued; in the real world, this might or might not be trivial, depending on the setting. In any event, this small improvement is what I would expect to see. Reducing multicollinearity hasn’t hugely improved predictive power, but it’s simplified the model somewhat and allowed other important variables to emerge.

  • TotalSF is a substantially more powerful predictor than GrLivArea. The main random forest measure of variable importance, %IncMSE, indicates roughly how much the model’s accuracy would decrease if a given variable were dropped. For GrLivArea, this was about 50%, while for TotalSF it’s around 70%.

  • TotalBaths also turns out to be a much more powerful variable than any of its component parts were—most of them didn’t even turn up in the first Variable Importance plot. Otherwise, there are not a lot of surprises in terms of which variables emerge as important.

5 Modeling

5.1 Processing

Given the number of variables in this dataset and the more or less linear relationship of most with the response, I didn’t spend a great deal of time inspecting each one for normality, outliers, etc. To correct non-normality, I adjusted the skewness on any numeric or ordinal variable with skewness > |0.8|. Standardization will be performed automatically during modeling.

ordfac <-ordfac[names(ordfac) %in% names(comp_all)] #removing factors from ordfac that were dropped from comp_all
ordfac <-comp_all[,names(ordfac)] #ordfac are now integers rather than factors

numeric <-comp_all%>%
  select(where(is.numeric))
numeric <-numeric[,!(names(numeric) %in% names(ordfac))] #true numerics only

cats <-comp_all %>%
  select(where(is.factor))

##### FIX THIS CODE
SalePrice <-comp_all$SalePrice
Id <-comp_all$Id

DFnum <-cbind(numeric, ordfac) #all numeric and ord. fac. data
drop <-c("SalePrice", "Id")

DFnum<-DFnum[,!names(DFnum) %in% drop]

#Density plots before adjustment- 
d1 <-ggplot(DFnum, aes(x = TotalSF)) + geom_density(colour = "#3B9AB2", weight = 4)
#skewness(DFnum$TotalSF)


###Adjust skewness
for(i in 1:ncol(DFnum)){
  if(abs(skew(DFnum[,i])) > 0.8){
    DFnum[,i] <-log(DFnum[,i] + 1)
  }
}

#density plots after correction

d2 <-ggplot(DFnum, aes(x = TotalSF)) + geom_density(colour = "#F21A00", weight = 4)
#skewness(DFnum$TotalSF)

grid.arrange(d1, d2, nrow = 1, top = "TotalSF Before and After Log Transform")

comp_all <-cbind(DFnum, cats)
comp_all$Id <-Id

5.2 LASSO Model

I used LASSO and the glmnet package for my first (and for now, only) model because I wanted an effective predictive algorithm that could deal with multicollinearity and would also perform variable selection. As glmnet() requires a matrix input, I processed the data using the model.matrix() function, which automatically performs one-hot-encoding on the categorical variables. glmnet() automatically standardizes numerical data, so there was no need to perform standardization separately. Finally, I used cv.glmnet() to select the tuning parameter (\(\lambda\)) by cross-validation.

x <-model.matrix(~.-Id, data = comp_all)[,-1]#remove x intercept 
y <-SalePrice
grid <-10^seq(10, -2, length = 100)

#Separate training and test sets
x.train <-x[1:1458,]
x.test <-x[1459:nrow(x),]
y.train <-y[1:1458]

#Choose lambda by cross-validation
set.seed(1986)
cv.lass <-cv.glmnet(x.train, y.train, alpha = 1)
plot(cv.lass)

bestlam <-cv.lass$lambda.min; bestlam
## [1] 0.003132975
set.seed(1986)
lasso.mod <-glmnet(x.train, y.train, alpha = 1)
#plot(lasso.mod)

pred<-predict(lasso.mod, s = bestlam, newx = x.test)
preds <-exp(pred) #back-transformation of log(y)

sub <-data.frame(Id = ctest$Id, SalePrice = preds)
names(sub) <-c("Id", "SalePrice")
write.csv(sub, file = "KaggleHousing_WriteupPreds.csv", row.names = F)

5.3 Results

The model is scored by submitting predictions to Kaggle, who returned an RMSE of 0.12648, which is the top 22nd percentile of scores.

The possibilities for future work on this project are endless; I did fairly minimal feature engineering, but I did play with some other potential variables (interaction variables with Year and YearRemodAdd, combining the porch SF variables, etc), and the results weren’t terribly promising. Ultimately, to break the 20th percentile, it would probably be necessary to use a boosted and/or ensemble model. If I do future work on this project, I’d likely start by using xgboost. It’s also possible that using the caret package would produce better LASSO results; I haven’t used this package before, but it’s popular on Kaggle.

6 Code

library(ggplot2)
library(corrplot)
library(tidyverse)
library(glmnet)
library(boot) #cross-validation
library(devtools)
library(wesanderson)
library(scales) #for label = comma in plots
library(gridExtra) #for grid.arrange
library(Rmisc) #for multiplot
library(VIM) #for missing data
library(tidyr)
library(plyr)
library(dplyr)
library(mice) #imputation
library(VIM)
library(randomForest)
library(car) #for VIFs
library(ggcorrplot)
library(psych) #for skewness
library(moments) #skewness
library(glmnet) #LASSO

Zissou1 = c("#3B9AB2", "#78B7C5", "#EBCC2A", "#E1AF00", "#F21A00")


#DATA
training <- read.csv("/Users/arcadia/Desktop/Kaggle House Prices/Data & Documentation/train.csv")
testing <- read.csv("/Users/arcadia/Desktop/Kaggle House Prices/Data & Documentation/test.csv")

testing$SalePrice <-NA

#Bind the training and test datasets for EDA/cleaning
all <-rbind(training,testing)

str(all)
table(sapply(all, class))

#OUTLIERS in GrLivArea
highlight <-all %>%
  filter(GrLivArea >= 4000)

g1 <-ggplot(data = all, aes(x = GrLivArea, y = SalePrice)) +geom_point(alpha = 0.5) +
  geom_point(data = highlight, aes(x = GrLivArea, y = SalePrice), color = "red") +
  geom_smooth(method = "loess") + ggtitle("Above-Grade Living Area vs Sale Price")

no_outs <- all%>%
  filter(!is.na(SalePrice)) %>%
  filter(GrLivArea <= 4500)

g2 <-ggplot(data = no_outs, aes(x = GrLivArea, y = SalePrice)) + geom_point(alpha = 0.5) +
  geom_smooth(method = "loess") + ggtitle("Above-Grade Living Area vs Sale Price")

grid.arrange(g1, g2)

#Remove only 2 outliers
training <- training[! training$GrLivArea >= 4500,]

all <-rbind(training, testing)
dim(all)

#Look at distribution of y
summary(training$SalePrice)

y <-ggplot(data = training, aes(x = SalePrice)) +
  geom_histogram(col = "#E1AF00", fill = "#3B9AB2", bins = 50)  +
  scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  labs(title = "Housing Data: Sale Price (Y)")

q1 <-ggplot(data = training, aes(sample = SalePrice)) + stat_qq(col = "#3B9AB2") + 
  stat_qq_line(col = "red") + labs(y = "Sale Price")

#transform y
training$log_y <-log(training$SalePrice)

logy <-ggplot(data = training, aes(x = log_y)) +
  geom_histogram(col = "#E1AF00", fill = "#3B9AB2", bins = 50)  +
  scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
  labs(title = "Housing Data: Log(Sale Price)", x = "Log(Sale Price)")

q2 <-ggplot(data = training, aes(sample = log_y)) + stat_qq(col = "#3B9AB2") + 
  stat_qq_line(col = "red") + labs(y = "Log(Sale Price)")

grid.arrange(y, logy)
grid.arrange(q1, q2, nrow = 1, top = "QQPlots Before and After Log Transformation")


#Univariate plots- find numeric variables
quant <-subset(all, select = -c(Id, SalePrice)) %>%
  select(where(is.numeric))
quant_names <-names(quant); quant_names


#HISTOGRAMS OF NUMERICAL VARIABLES

hist_lotfrontage <-ggplot(data = all, aes(x = LotFrontage)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
#labs(title = "Lot Frontage"); hist_lotfrontage

hist_lotarea <-ggplot(data = all, aes(x = LotArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
#labs(title = "Lot Area"); hist_lotarea

bar_overallqual<-ggplot(data = all, aes(x = OverallQual)) +
  geom_bar(fill = "#3B9AB2")
#scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
# labs(title = "Overall Quality"); bar_overallqual

bar_overallcond<-ggplot(data = all, aes(x = OverallCond)) +
  geom_bar(fill = "#3B9AB2")
#scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
#labs(title = "Overall Condition"); bar_overallcond

hist_yearbuilt<-ggplot(data = all, aes(x = YearBuilt)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Year Built"); hist_yearbuilt

hist_yearremod<-ggplot(data = all, aes(x = YearRemodAdd)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Year Remodeled"); hist_yearremod

hist_masvnrarea<-ggplot(data = all, aes(x = MasVnrArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Masonry Veneer Area"); hist_masvnrarea

hist_bsmtfinsf1<-ggplot(data = all, aes(x = BsmtFinSF1)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Type 1 Finished Basement SF"); hist_bsmtfinsf1

hist_bsmtfinsf2<-ggplot(data = all, aes(x = BsmtFinSF2)) +
  geom_histogram(fill = "#3B9AB2", bins = 30)
#labs(title = "Type 2 Finished Basement SF"); hist_bsmtfinsf2

hist_bsmtunfsf<-ggplot(data = all, aes(x = BsmtUnfSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Unfinished Basement SF"); hist_bsmtunfsf

hist_totalbsmtsf<-ggplot(data = all, aes(x = TotalBsmtSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Total Basement SF"); hist_totalbsmtsf

hist_X1stFlrSF<-ggplot(data = all, aes(x = X1stFlrSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
# labs(title = "First Floor SF"); hist_X1stFlrSF

hist_X2ndFlrSF<-ggplot(data = all, aes(x = X2ndFlrSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Second Floor SF"); hist_X2ndFlrSF

hist_LowQualFinSF <-ggplot(data = all, aes(x = LowQualFinSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 10)
#labs(title = "Low Quality Finished SF"); hist_LowQualFinSF

hist_GrLivArea <-ggplot(data = all, aes(x = GrLivArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
# labs(title = "Above-Ground Living Area"); hist_GrLivArea

bar_BsmtFullBath<-ggplot(data = all, aes(x = BsmtFullBath)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "No. Basement Full Baths"); bar_BsmtFullBath

bar_BsmtHalfBath<-ggplot(data = all, aes(x = BsmtHalfBath)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "No. Basement Half Baths"); bar_BsmtHalfBath

bar_FullBath <-ggplot(data = all, aes(x = FullBath )) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "No. Full Baths"); bar_FullBath

bar_HalfBath <-ggplot(data = all, aes(x =HalfBath )) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "No. Half Baths"); bar_HalfBath

bar_BedroomAbvGr <-ggplot(data = all, aes(x =BedroomAbvGr )) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "No. Bedrooms Above Grade", subtitle = "Basement bedrooms not included"); bar_BedroomAbvGr

bar_KitchenAbvGr <-ggplot(data = all, aes(x =KitchenAbvGr )) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "No. Kitchens Above Grade"); bar_KitchenAbvGr

bar_TotRmsAbvGrd<-ggplot(data = all, aes(x = TotRmsAbvGrd)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "No. Rooms Above Grade"); bar_TotRmsAbvGrd

bar_Fireplaces<-ggplot(data = all, aes(x = Fireplaces)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "No. of Fireplaces"); bar_Fireplaces

hist_GarageYrBlt <-ggplot(data = all, aes(x = GarageYrBlt)) +
  geom_histogram(fill = "#3B9AB2", bins = 50) +
  scale_x_continuous(limits = c(1900, 2020))
#labs(title = "Garage: Year Built"); hist_GarageYrBlt

bar_GarageCars<-ggplot(data = all, aes(x = GarageCars)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Garage Car Capacity"); bar_GarageCars

hist_GarageArea <-ggplot(data = all, aes(x = GarageArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
# labs(title = "Garage Area"); hist_GarageArea

hist_WoodDeckSF <-ggplot(data = all, aes(x = WoodDeckSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Wood Deck: SF"); hist_WoodDeckSF

hist_OpenPorchSF <-ggplot(data = all, aes(x = OpenPorchSF)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Open Porch SF"); hist_OpenPorchSF

hist_EnclosedPorch <-ggplot(data = all, aes(x = EnclosedPorch)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Open Porch SF"); hist_OpenPorchSF

hist_X3SsnPorch <-ggplot(data = all, aes(x = X3SsnPorch)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "3-Season Porch SF"); hist_X3SsnPorch

hist_ScreenPorch <-ggplot(data = all, aes(x = ScreenPorch)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Screen Porch SF"); hist_ScreenPorch

hist_PoolArea <-ggplot(data = all, aes(x = PoolArea)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Pool Area"); hist_PoolArea

hist_MiscVal <-ggplot(data = all, aes(x = MiscVal)) +
  geom_histogram(fill = "#3B9AB2", bins = 50)
#labs(title = "Value of Misc. Feature"); hist_MiscVal

bar_MoSold <-ggplot(data = all, aes(x = MoSold)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Month Sold"); bar_MoSold

bar_YrSold <-ggplot(data = all, aes(x = YrSold)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Year Sold"); bar_YrSold

grid.arrange(hist_lotfrontage, hist_lotarea, bar_overallqual, bar_overallcond, hist_yearbuilt,
             hist_yearremod, hist_masvnrarea, hist_bsmtfinsf1, hist_bsmtfinsf2, hist_bsmtunfsf,
             hist_totalbsmtsf, hist_X1stFlrSF, hist_X2ndFlrSF, hist_LowQualFinSF, hist_GrLivArea,
             bar_BsmtFullBath, bar_BsmtHalfBath, bar_FullBath, bar_HalfBath, bar_BedroomAbvGr,
             bar_KitchenAbvGr, bar_TotRmsAbvGrd, bar_Fireplaces, hist_GarageYrBlt, bar_GarageCars,
             hist_GarageArea, hist_WoodDeckSF, hist_OpenPorchSF, hist_EnclosedPorch, hist_X3SsnPorch,
             hist_ScreenPorch, hist_PoolArea, hist_MiscVal, bar_MoSold, bar_YrSold, ncol = 4)

#CATEGORICAL VARIABLES
cat <-all %>%
  select(where(is.character))
cat_names <-names(cat); cat_names

#Character univariate plots

bar_mssubclass <-ggplot(data = all, aes(x = factor(MSSubClass))) +
  geom_bar(fill = "#3B9AB2")
#scale_x_continuous(breaks = seq(0, 800000, by = 100000), labels = comma) +
#labs(title = "Type of Dwelling"); bar_mssubclass

bar_MSZoning<-ggplot(data = all, aes(x = MSZoning)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "MS Zoning"); bar_MSZoning

bar_Street<-ggplot(data = all, aes(x = Street)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Type of Road Access"); bar_Street

bar_Alley<-ggplot(data = all, aes(x = Alley)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Type Alley Access"); bar_Alley

bar_LotShape<-ggplot(data = all, aes(x = LotShape)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Lot Shape"); bar_LotShape

bar_LandContour <-ggplot(data = all, aes(x = LandContour)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Land Contour: Property Flatness"); bar_LandContour

bar_Utilities <-ggplot(data = all, aes(x = Utilities)) +
  geom_bar(fill = "#3B9AB2") #+
#labs(title = "Type of Utilities Available"); bar_Utilities

bar_LotConfig <-ggplot(data = all, aes(x = LotConfig)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Lot Configuration"); bar_LotConfig

bar_LandSlope <-ggplot(data = all, aes(x = LandSlope)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Property Slope"); bar_LandSlope

bar_Neighborhood <-ggplot(data = all, aes(x = Neighborhood)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
# labs(title = "Neighborhood"); bar_Neighborhood

bar_Condition1 <-ggplot(data = all, aes(x = Condition1)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
#labs(title = "Proximity to Var. Conditions"); bar_Condition1

bar_Condition2 <-ggplot(data = all, aes(x = Condition2)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
#labs(title = "Proximity to >1 Condition"); bar_Condition2

bar_BldgType <-ggplot(data = all, aes(x = BldgType)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Building Type"); bar_BldgType

bar_HouseStyle <-ggplot(data = all, aes(x = HouseStyle)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "House Style"); bar_HouseStyle

bar_RoofStyle <-ggplot(data = all, aes(x = RoofStyle)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Roof Style"); bar_RoofStyle

bar_RoofMatl <-ggplot(data = all, aes(x = RoofMatl)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
#labs(title = "Roof Material"); bar_RoofMatl

bar_Exterior1st <-ggplot(data = all, aes(x = Exterior1st)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
#labs(title = "Exterior Covering on House"); bar_Exterior1st

bar_Exterior2nd <-ggplot(data = all, aes(x = Exterior2nd)) +
  geom_bar(fill = "#3B9AB2") + coord_flip()
#labs(title = "Exterior Covering if >1"); bar_Exterior2nd

bar_MasVnrType <-ggplot(data = all, aes(x = MasVnrType)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Masonry Veneer Type"); bar_MasVnrType

bar_ExterQual <-ggplot(data = all, aes(x = ExterQual)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Exterior Quality"); bar_ExterQual

bar_ExterCond <-ggplot(data = all, aes(x = ExterCond)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Exterior Condition"); bar_ExterCond

bar_Foundation<-ggplot(data = all, aes(x = Foundation)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Type of Foundation"); bar_Foundation

bar_BsmtQual<-ggplot(data = all, aes(x = BsmtQual)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Basement Quality: Height"); bar_BsmtQual

bar_BsmtCond<-ggplot(data = all, aes(x = BsmtCond)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Basement Condition"); bar_BsmtCond

bar_BsmtExposure<-ggplot(data = all, aes(x = BsmtExposure)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Basement Exposure (Walkout/Garden level)"); bar_BsmtExposure

bar_BsmtFinType1<-ggplot(data = all, aes(x = BsmtFinType1)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Rating of Basement Finished Area"); bar_BsmtFinType1

bar_BsmtFinType2<-ggplot(data = all, aes(x = BsmtFinType2)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Finished Basement Rating if >1 Area"); bar_BsmtFinType2

bar_Heating<-ggplot(data = all, aes(x = Heating)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Type of Heating"); bar_Heating

bar_HeatingQC<-ggplot(data = all, aes(x = HeatingQC)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Heating Quality/Condition"); bar_HeatingQC

bar_CentralAir<-ggplot(data = all, aes(x = CentralAir)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Central Air"); bar_CentralAir

bar_Electrical<-ggplot(data = all, aes(x = Electrical)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Electrical System"); bar_Electrical

bar_KitchenQual<-ggplot(data = all, aes(x = KitchenQual)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Kitchen Quality"); bar_KitchenQual

bar_Functional<-ggplot(data = all, aes(x = Functional)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Home Functionality"); bar_Functional

bar_FireplaceQu<-ggplot(data = all, aes(x = FireplaceQu)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Fireplace Quality"); bar_FireplaceQu

bar_GarageType<-ggplot(data = all, aes(x = GarageType)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Garage Type/Location"); bar_GarageType

bar_GarageFinish<-ggplot(data = all, aes(x = GarageFinish)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Interior Garage Finish"); bar_GarageFinish

bar_GarageQual<-ggplot(data = all, aes(x = GarageQual)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Garage Quality"); bar_GarageQual

bar_GarageCond<-ggplot(data = all, aes(x = GarageCond)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Garage Condition"); bar_GarageCond

bar_PavedDrive<-ggplot(data = all, aes(x = PavedDrive)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Driveway Paving Material"); bar_PavedDrive

bar_PoolQC<-ggplot(data = all, aes(x = PoolQC)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Pool Quality"); bar_PoolQC

bar_Fence<-ggplot(data = all, aes(x = Fence)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Fence Quality"); bar_Fence

bar_MiscFeature<-ggplot(data = all, aes(x = MiscFeature)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Misc. Feature"); bar_MiscFeature

bar_SaleType <-ggplot(data = all, aes(x = SaleType)) +
  geom_bar(fill = "#3B9AB2")
#labs(title = "Sale Type"); bar_SaleType

bar_SaleCondition <-ggplot(data = all, aes(x = SaleCondition)) +
  geom_bar(fill = "#3B9AB2")
# labs(title = "Condition of Sale"); bar_SaleCondition


grid.arrange(bar_mssubclass, bar_MSZoning, bar_Street, bar_Alley, bar_LotShape,bar_LandContour,
             bar_Utilities,bar_LotConfig, bar_LandSlope, bar_Neighborhood, bar_Condition1,
             bar_Condition2, bar_BldgType, bar_HouseStyle, bar_RoofStyle, bar_RoofMatl,
             bar_Exterior1st, bar_Exterior2nd, bar_MasVnrType, bar_ExterQual, bar_ExterCond,
             bar_Foundation, bar_BsmtQual, bar_BsmtCond, bar_BsmtExposure, bar_BsmtFinType1,
             bar_BsmtFinType2, bar_Heating, bar_HeatingQC, bar_CentralAir,  bar_Electrical,
             bar_KitchenQual, bar_Functional, bar_FireplaceQu, bar_GarageType, bar_GarageFinish,
             bar_GarageQual, bar_GarageCond, bar_PavedDrive, bar_PoolQC, bar_Fence, bar_MiscFeature,
             bar_SaleType, bar_SaleCondition, ncol = 4)


########################MISSING DATA IDENTIFICATION#############################
miss <-sort(colSums(is.na(all)), decreasing = TRUE)
miss[miss>0]

na <-sort(colSums(is.na(all)), decreasing = TRUE)
na <-na[na >1]

missing <-all[names(na)]
missing <-subset(missing, select = -SalePrice)

aggr(missing, only.miss = TRUE, cex.axis = 0.7,  bars = FALSE,
     combined = TRUE, col =c("#3B9AB2","#F21A00"), prop = TRUE, freq = TRUE, border = NA)
title("Proportion of Missing Data")

###########DIAGNOSING MISSING DATA MECHANISMS#########################################3
attach(all)

#Pool variables
sum(!is.na(PoolQC))

which(is.na(PoolArea))
pools <-all[PoolArea > 0,]
pools <-pools[,c("Id", "PoolArea","PoolQC")]; pools

poolna <-pools[which(is.na(pools$PoolQC)),]
poolna <-poolna[,c("Id", "PoolArea","PoolQC")];poolna

#MiscFeature/MiscVal
sum(is.na(MiscVal)) #MiscVal has 0 NA's
sum(MiscVal > 0)

ftr <-all[!(is.na(MiscFeature)),] #all properties with MiscFeatures
ftr <-ftr[ftr$MiscVal == 0,]
ftr <-ftr[,c("Id", "MiscFeature", "MiscVal")]

#find the one non-structural MiscFeature NA
val <-all[MiscVal > 0,]%>%
  filter(is.na(MiscFeature))
val <-val[,c("Id", "MiscFeature", "MiscVal")]; val

table(all$MiscFeature)

#Fireplace variables
sum(is.na(FireplaceQu)) #1420 no-fireplace properties

table(Fireplaces) #Check # of Fireplaces; 1420 = 0

fire <-all%>% #Confirm these records are the same
  filter(!is.na(FireplaceQu)) %>%
  filter(Fireplaces == 0)


#BSMT VARIABLES
#Sample database using BsmtFinType2
bsmt_na <-all[is.na(BsmtFinType2),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]
head(bsmt_na, 8)

#TotalBsmtSF
table(TotalBsmtSF)[1] #Check # of 0 values

bsmt_na <-all[is.na(TotalBsmtSF),]
bsmt_na <-bsmt_na[, c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]
bsmt_na #ID 2121 has no basement, recode as 0


#BSMTQUAL
bsmt_na <-all[is.na(BsmtQual),]
nrow(bsmt_na)
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]
#Impute 2218 and 2219

#BSMTCOND
bsmt_na <-all[is.na(BsmtCond),]
nrow(bsmt_na)
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]
#Impute IDs 2041, 2186, and 2525


#BSMTFINSF1
table(BsmtFinSF1)[1]
bsmt_na <-all[is.na(BsmtFinSF1),]
nrow(bsmt_na)
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]

#Recode Id 2121 as 0

#Check the 0s - make sure all values of BsmtFinType1 are either Unf or NA
bsmt_na <-all[BsmtFinSF1 == 0,]
bsmt_na <-bsmt_na[ , c("Id", "BsmtFinType1", "BsmtFinSF1")]
table(bsmt_na$BsmtFinType1)
sum(is.na(bsmt_na$BsmtFinType1)) #no problem here.

#BSMTFINSF2
table(BsmtFinSF2)[1] #2567 = 0
bsmt_na <-all[is.na(BsmtFinSF2),] #The only NA is 2121- recode to 0
nrow(bsmt_na)
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2",  "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]



#Check the 0s - make sure all values of BsmtFinType2 are either Unf or NA
bsmt_na <-all[BsmtFinSF2 == 0,]
bsmt_na <-bsmt_na[ , c("Id","BsmtFinType2", "BsmtFinSF2")]

table(bsmt_na$BsmtFinType2) # 1 zero is coded incorrectly (as BLQ)
which(bsmt_na$BsmtFinType2 == "BLQ")
bsmt_na[1298,] #Id #1471 needs to be recoded

all[Id == 1471,] #Look at the rest of the record

#Recode BsmtFinType2 Id #1471 to Unf

#BSMTUNFSF
#The NA has no basement --> coerce to 0 (ID 2121)
table(BsmtUnfSF)[1] #241 have no unfinished sf
bsmt_na <-all[is.na(BsmtUnfSF),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath", "BsmtHalfBath")]
#Recode 2121 to 0

#check the 0s- make sure where bsmtunfsf = 0, bsmtfintype1 and 2 don't equal unfinished
bsmt_na <-all[BsmtUnfSF == 0,]
bsmt_na <-bsmt_na[ , c("Id","BsmtUnfSF", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF", "BsmtFinSF1", "BsmtFinSF2")]

table(bsmt_na$BsmtFinType1)
table(bsmt_na$BsmtFinType2) #83 records where BsmtFinType2 = Unf but Unf sqft = 0

unf_rows <-which(bsmt_na$BsmtFinType2 == "Unf")
head(bsmt_na[unf_rows,], 10) #coding problem evident


#BSMTFINTYPE1
#79 NA (NA = no basement)-- No changes, all NAs structurally missing
bsmt_na <-all[is.na(BsmtFinType1),]
nrow(bsmt_na)
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]


#BSMTFINTYPE2
bsmt_na <-all[is.na(BsmtFinType2),]
nrow(bsmt_na) #80 NAs- 1 extra
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2",  "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]

#NA for id 333 is wrong-
bsmt_na[bsmt_na$Id == 333,]

#479 sf for BsmtFinSF2 is large for a recroom,, and this entry only has 1 full bathroom, so prob. not a
#2nd living quarters.
#Recode Id # 1471 to Unf and Id 333 as recroom.

#BSMTEXPOSURE
bsmt_na <-all[is.na(BsmtExposure),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2", "TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]

#BSMTFULLBATH
bsmt_na <-all[is.na(BsmtFullBath),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2","TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]

#BSMTHALFBATH
bsmt_na <-all[is.na(BsmtHalfBath),]
bsmt_na <-bsmt_na[ , c("Id", "BsmtCond", "BsmtQual", "BsmtFinType1", "BsmtFinType2","TotalBsmtSF",
                       "BsmtExposure", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "BsmtFullBath",
                       "BsmtHalfBath")]

###############GARAGE VARIABLES

###GARAGEYRBLT
garage_na <-all[is.na(GarageYrBlt),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

mar <-garage_na[!is.na(garage_na$GarageType),]

#2127 and #2527 do have garages

###GARAGEFINISH
garage_na <-all[is.na(GarageFinish),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

###GARAGEQUAL
garage_na <-all[is.na(GarageQual),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

####GARAGECOND
garage_na <-all[is.na(GarageCond),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

###GARAGETYPE
garage_na <-all[is.na(GarageType),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]


###GARAGECARS
table(GarageCars)[1]
garage_na <-all[is.na(GarageCars),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

#check 0s
garage_na <-all[GarageCars == 0,]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]


###GARAGEAREA
table(GarageArea)[1]
garage_na <-all[is.na(GarageArea),]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

#check 0s
garage_na <-all[GarageArea == 0,]
garage_na <-garage_na[,c("Id", "GarageArea", "GarageType", "GarageQual", "GarageCond",
                         "GarageFinish", "GarageCars", "GarageYrBlt")]

#Data to be imputed across garage variables
mar

########EXTERIOR VARIABLES

###MASVNRTYPE
ext_na <-all[is.na(MasVnrType),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]
#Result: coerce these values to "None".


#Check values of "None"
table(MasVnrType)["None"] #yikes
ext_na <-all[MasVnrType == "None",]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

#Now filter these to check for errors
#You need to change these 84 values to "BrkFace"
coerce <-ext_na %>%
  filter(Exterior1st == "BrkFace")


###MASVNRAREA
#Check NAs. They are all consistent with MasVnrType = NA, so replace with "O"
ext_na <-all[is.na(MasVnrArea),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

#Check "None" and look at discrepancies.
table(MasVnrArea)[1] #1737
table(MasVnrType)["None"]

mas <-all %>%
  filter(MasVnrType == "None") %>%
  filter(MasVnrArea > 0)
mas[,c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

###EXTERIOR1ST
ext_na <-all[is.na(Exterior1st),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]

###EXTERIOR2ND
ext_na <-all[is.na(Exterior2nd),]
ext_na <-ext_na[, c("Id", "MasVnrType", "MasVnrArea", "Exterior1st", "Exterior2nd")]
ext_na


######OTHER VARIABLES
###ALLEY
aly <-all[is.na(Alley), ]
aly <-aly[, c("Id", "Alley")] #Coerce these 2,716 obs to "None"

###FENCE
fnc <-all[is.na(Fence), ]
fnc <-fnc[, c("Id", "Fence")] #Coerce these 2,716 obs to "None"


#LOTFRONTAGE
summary(LotFrontage) #Check for any Lot Frontage = 0; there are none
lots <-which(is.na(LotFrontage)) #Impute these values of LotFrontage

###UTILITIES
util <-all[is.na(Utilities),]
util <-util[,c("Id", "Utilities")]

###FUNCTIONAL
fnc <-all[is.na(Functional),]
fnc <-fnc[,c("Id", "Functional")]

###ELECTRICAL
elec <-all[is.na(Electrical),]
elec <-elec[,c("Id", "Electrical")]

###KITCHENQUAL
kq <-all[is.na(KitchenQual),]
kq <-kq[,c("Id", "KitchenQual")]

###SALETYPE
slty <-all[is.na(SaleType),]
slty <-slty[,c("Id", "SaleType")]

detach(all)

#Look at missing data for the 2 other obs
outs <-training %>%
  filter(GrLivArea > 4000)
outs

#IDS #692 and 1183. Both NA for Alley,MiscFeature. 692 is NA for Fence.
#Don't see any need for manual corrections, just for the numbers in the missing data section. 




############RECODING & IMPUTING SINGLE OBSERVATIONS#################
#CONTINUOUS FEATURES --> NUMERICS
nums <-c("LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF",
         "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea",
         "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea",
         "MiscVal", "SalePrice")


all[nums] <- lapply(all[nums], function(x) as.numeric(x))


#NOMINAL FEATURES --> FACTORS
factors <-c("MSSubClass", "MSZoning", "Street", "LandContour", "LotConfig",
            "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle",
            "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType",
            "Foundation", "Heating", "CentralAir", "SaleType", "SaleCondition",
            'MoSold')


all[factors] <- lapply(all[factors], function(x) as.factor(x)) #Factorize the above

###MSSubClass
all$MSSubClass <-revalue(all$MSSubClass, 
                         c("20" = "1 story 1946 +",
                           "30" = "1 story 1945 -", 
                           "40" = "1 story finished attic", 
                           "45" = "1.5 story unfinished",
                           "50" = "1.5 story finished",
                           "60" = "2 story 1946 +",
                           "70" = "2 story 1945 -",
                           "75" = "2.5 story",
                           "80" = "Split or multilevel",
                           "85" = "Split foyer",
                           "90" = "Duplex",
                           "120" = "1 story PUD 1946 +",
                           "150" = "1.5 story PUD",
                           "160" = "2 story PUD 1946 +",
                           "180" = "PUD Multilevel",
                           "190" = "2 family conversion"))

#####Street
all$Street <-revalue(all$Street,  c("Grvl" = "Gravel", 
                                    "Pave" ="Paved"))

#####Alley 
all$Alley[is.na(all$Alley)] <-'No alley access' #Fix NAs
all$Alley <-factor(all$Alley)
all$Alley <-revalue(all$Alley, c('Grvl' = 'Gravel', 
                                 'Pave' = 'Paved'))


####Land Contour
all$LandContour <-revalue(all$LandContour, c('Bnk' = 'Banked',
                                             'HLS' = 'Hillside',
                                             'Low' = 'Depression',
                                             'Lvl' = 'Flat/Level'))

########CONDITION 1-
all$Condition1 <-revalue(all$Condition1, c('Artery' = 'Adj. to Artery St',
                                           'Feedr' = 'Adj. to Feeder St',
                                           'Norm' = 'Normal',
                                           'RRNn' = 'Near NS RR', 
                                           'RRAn' = 'Adj. to NS RR',
                                           'PosA' = 'Adj. to positive feature',
                                           'PosN' = 'Near positive feature',
                                           'RRNe' = 'Near ES RR', 
                                           'RRAe' = 'Adj. to EW RR'))

#########CONDITION2
all$Condition2 <-revalue(all$Condition2, c('Artery' = 'Adj. to Artery St',
                                           'Feedr' = 'Adj. to Feeder St',
                                           'Norm' = 'Normal',
                                           'RRNn' = 'Near NS RR', 
                                           'RRAn' = 'Adj. to NS RR',
                                           'PosA' = 'Adj. to positive feature',
                                           'PosN' = 'Near positive feature',
                                           'RRNe' = 'Near ES RR', 
                                           'RRAe' = 'Adj. to EW RR'))

######BldgType
all$BldgType <-revalue(all$BldgType, c('1Fam' = '1Family',
                                       '2fmCon' = '2Family Conversion',
                                       'Duplex' = 'Duplex', 
                                       'Twnhs' = 'Townhouse InsideUnit',
                                       'TwnhsE' = 'Townhouse EndUnit'))


##MASVNRTYPE
all$MasVnrType <-revalue(all$MasVnrType, c('BrkCmn' = 'BrickCommon', 
                                           'BrkFace' = 'BrickFace'))
all$MasVnrType[is.na(all$MasVnrType)] <- 'None' #Fix NAs

##HEATING
all$Heating <-revalue(all$Heating, c('GasA' = 'Gas Forced Air',
                                     'GasW' = 'Gas Hot Water',
                                     'Grav' = 'Gravity',
                                     'OthW' = 'HW/Steam o/t gas',
                                     'Wall' = 'Wall Furnace'))

########GARAGETYPE
all$GarageType[is.na(all$GarageType)] <- "No Garage" #Fix NAs
all$GarageType <-as.factor(all$GarageType)

########MISCFEATURE
all$MiscFeature[is.na(all$MiscFeature)] <-"None" #Fix NAs
all$MiscFeature <-as.factor(all$MiscFeature)

table(sapply(all,class))

####ORDINAL FEATURES --> ORDERED FACTORS

#####LOTSHAPE
all$LotShape <-factor(all$LotShape, order = TRUE,
                      levels = c("IR3", "IR2", "IR1", 'Reg'))


all$LotShape <-revalue(all$LotShape, c("IR3" ="Irregular",
                                       'IR2' = "Mod. Irregular",
                                       'IR1' = 'Sl. Irregular',
                                       'Reg' = 'Regular'))


#####UTILITIES
all$Utilities <-factor(all$Utilities, order = TRUE, levels = c('ELO', 'NoSeWa',
                                                               'NoSewr', 'AllPub'))
all$Utilities <-revalue(all$Utilities, c('ELO' = 'Elec. only',
                                         'NoSeWa' = 'Elec. and gas',
                                         'NoSewr' = 'Elec., gas, water',
                                         'AllPub' = 'All public utils'))


#######LandSlope
all$LandSlope <-factor(all$LandSlope, order = TRUE, levels = c('Gtl', 'Mod', 'Sev'))

#OverallQual
all$OverallQual <-factor(all$OverallQual, order = TRUE)

#OverallCond
all$OverallCond <-factor(all$OverallCond, order = TRUE)

#ExterQual
quals <-c('Fa', 'TA', 'Gd', 'Ex') #Why is this on 4 point scale??
all$ExterQual <- factor(all$ExterQual, order = TRUE, levels = quals)

#Exterior Cond
quals <-c('Po', 'Fa', 'TA', 'Gd', 'Ex')
all$ExterCond <-factor(all$ExterCond, order = TRUE, levels = quals)

#Bsmt Qual
all$BsmtQual[is.na(all$BsmtQual)] <- "No basement" #Fix NAs
quals <-c('No basement', 'Fa', 'TA', 'Gd', 'Ex')
all$BsmtQual <-factor(all$BsmtQual, order = TRUE, levels = quals)

#re-assign NAs to MAR datapoints- these values will be imputed by MICE
all[all$Id == 2218, "BsmtQual"] <-NA
all[all$Id == 2219, "BsmtQual"] <-NA

#Bsmt Cond
all$BsmtCond[is.na(all$BsmtCond)] <-"No basement" #Fix NAs
quals <-c('No basement', 'Po', 'Fa', 'TA', 'Gd')
all$BsmtCond <-factor(all$BsmtCond, order = TRUE, levels = quals)

#These values to be imputed by MICE
all[all$Id == 2041, "BsmtCond"] <-NA
all[all$Id == 2186, "BsmtCond"] <-NA
all[all$Id == 2525, "BsmtCond"] <-NA

#Bsmt Exposure
all$BsmtExposure[is.na(all$BsmtExposure)] <-"No basement"
all$BsmtExposure <-factor(all$BsmtExposure, order = TRUE,
                          levels = c('No basement', 'No', 'Mn', 'Av', 'Gd'))

all[all$Id == 949, 'BsmtExposure'] <-NA
all[all$Id == 1488, 'BsmtExposure'] <-NA
all[all$Id == 2349, 'BsmtExposure'] <-NA


#BsmtFin Type 1
all$BsmtFinType1[is.na(all$BsmtFinType1)] <- "No basement" #Fix NAs
all$BsmtFinType1 <-factor(all$BsmtFinType1, order = TRUE,
                          levels = c('No basement', 'Unf', 'LwQ', 'Rec', 'BLQ',
                                     'ALQ', 'GLQ'))

#BsmtFin Type 2,- 80 NA
all$BsmtFinType2[is.na(all$BsmtFinType2)] <- "No basement"
all$BsmtFinType2 <-factor(all$BsmtFinType2, order = TRUE,
                          levels = c('No basement', 'Unf', 'LwQ', 'Rec', 'BLQ',
                                     'ALQ', 'GLQ'))

#correct this NA- we know there is a 2nd type of space that != unfinished
all[all$Id == 333, 'BsmtFinType2'] <-'Rec'

###Heating QC- no NA
quals <-c('Po', 'Fa', 'TA', 'Gd', 'Ex')

all$HeatingQC <-factor(all$HeatingQC, order = TRUE, levels = quals)

#Electrical- 1 NA (ID 1380)- impute w/MICE
all$Electrical <-factor(all$Electrical, order = TRUE,
                        levels = c('Mix', 'FuseP', 'FuseF', 'FuseA', 'SBrkr'))

all$Electrical <-revalue(all$Electrical, c('Mix' = 'Mixed',
                                           'FuseP' = 'Po. FuseBox',
                                           'FuseF' = 'Fa. FuseBox',
                                           'FuseA' = 'Av. Fusebox',
                                           'SBrkr' = 'Circuit Brkr'))

#KitchenQual
quals <-c('Fa', 'TA', 'Gd', 'Ex')
all$KitchenQual <-factor(all$KitchenQual, order = TRUE, levels = quals)

#Functional
all$Functional <-factor(all$Functional, order = TRUE, levels =
                          c('Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ'))

all$Functional <-revalue(all$Functional, c('Sev' = 'Severe',
                                           'Maj2' = 'Maj. Deductions 2',
                                           'Maj1' = 'Maj. Deductions 1',
                                           'Min2' = 'Min. Deductions 2',
                                           'Min1' = 'Min. Deductions 1',
                                           'Typ' = 'Typ. Functionality'))

#FireplaceQu
all$FireplaceQu[is.na(all$FireplaceQu)] <- 'No Fireplace'
quals <-c('No Fireplace', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
all$FireplaceQu <-factor(all$FireplaceQu, order = TRUE, levels = quals)

#Garage Finish,
all$GarageFinish[is.na(all$GarageFinish)] <- 'No Garage'
all$GarageFinish <-factor(all$GarageFinish, order = TRUE, levels =
                            c('No Garage', 'Unf', 'RFn', 'Fin'))
all$GarageFinish <-revalue(all$GarageFinish, c('RFn' = 'Rough Fin.'))

#these are MAR, impute later
all[all$Id == 2127, 'GarageFinish'] <-NA
all[all$Id == 2577, 'GarageFinish'] <-NA

#########GARAGEQUAL
all$GarageQual[is.na(all$GarageQual)] <-'No Garage'
quals <-c('No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
all$GarageQual <-factor(all$GarageQual, order = TRUE, levels = quals)

all[all$Id == 2127, 'GarageQual'] <-NA
all[all$Id == 2577, 'GarageQual'] <-NA


#####GARAGECOND
all$GarageCond[is.na(all$GarageCond)] <-'No Garage'
quals <-c('No Garage', 'Po', 'Fa', 'TA', 'Gd', 'Ex')
all$GarageCond <-factor(all$GarageCond, order = TRUE, levels = quals)

all[all$Id == 2127, 'GarageCond'] <-NA
all[all$Id == 2577, 'GarageCond'] <-NA


#Paved Drive,
all$PavedDrive <-factor(all$PavedDrive, order = TRUE, levels = c('N', 'P', 'Y'))
all$PavedDrive <-revalue(all$PavedDrive, c('N' = 'Dirt/Grvl',
                                           'P' = 'Part. Pvmnt',
                                           'Y' = 'Paved'))

######POOLQC
all$PoolQC[is.na(all$PoolQC)] <-'No Pool'
quals <-c('No Pool','Fa', 'Gd', 'Ex')
all$PoolQC <-factor(all$PoolQC, order = TRUE, levels = quals)

#these properties do have pools and need to be imputed
all[all$Id == 2421, 'PoolQC'] <-NA
all[all$Id == 2504, 'PoolQC'] <-NA
all[all$Id == 2600, 'PoolQC'] <-NA

#####FENCE- replace all NA
all$Fence[is.na(all$Fence)] <-"No Fence"
all$Fence <-factor(all$Fence, order = TRUE, levels = c('No Fence', 'MnWw', 'GdWo',
                                                       'MnPrv', 'GdPrv'))
all$Fence <-revalue(all$Fence, c('MnWw' = 'Min. wood, wire',
                                 'GdWo' = 'Good wood',
                                 'MnPrv' = 'Min. privacy',
                               'GdPrv' = 'Good privacy'))


##############DEAL WITH REMAINING NAS IN DISCRETE/NUMERICAL VARS
#GarageYrBlt
all$GarageYrBlt[is.na(all$GarageYrBlt)] <- 0

all[all$Id == 2127 , 'GarageYrBlt'] <-NA
all[all$Id == 2577 , 'GarageYrBlt'] <-NA

###MASVNRAREA
all$MasVnrArea[is.na(all$MasVnrArea)] <-0

all[all$Id == 774, 'MasVnrArea'] <-0
all[all$Id == 1231, 'MasVnrArea'] <-0
all[all$Id == 2453, 'MasVnrArea'] <-0

#Recode single NAs
all[all$Id == 2121, 'BsmtFinSF1'] <-0

all[all$Id == 2121, 'BsmtFinSF2']  <-0

all[all$Id == 2121, 'BsmtUnfSF'] <-0

all[all$Id == 2121, 'TotalBsmtSF'] <-0

#######IMPUTE MISSING DATA
set.seed(1986)
impute <- mice(all, m = 5, method = "cart")
summary(impute)
densityplot(impute)
title("Density Plot")

###CHECKING VALUES OF VARIABLES
impute$imp$PoolQC
summary(all$PoolQC)

summary(impute$imp$LotFrontage) #How should I choose the best imputation??
summary(all$LotFrontage)

x1 <-xyplot(impute, SalePrice ~ LotFrontage | .imp, pch = 20, cex = 1.4)
x2 <-xyplot(impute, SalePrice ~ PoolQC | .imp, pch = 20, cex = 1.4)
grid.arrange(x1, x2)

#Complete the data- was 4 before, when you hadn't removed the extra 2 outliers from GrLivArea
#at the beginning
#now try completing with 3
comp_all <-complete(impute, 3)

sum(is.na(comp_all))

####NOW: WRITE TRAINING AND TESTING DATASETS BASED ON COMP_ALL
comp_training <-subset(comp_all, comp_all$Id %in% training$Id)
comp_testing <-subset(comp_all, comp_all$Id %in% testing$Id)

#Save files to Rdata
save(comp_training, file = "comp_training3.RData")
save(comp_testing, file = "comp_testing3.RData")

rm(comp_training)
rm(comp_testing)

##########################START AGAIN HERE WITH CLEAN & IMPUTED DATASET#####################
load(file = "comp_training3.Rdata")
load(file = "comp_testing3.Rdata")

ctrain <-comp_training
ctest <-comp_testing

ctest$SalePrice <-NA
ctrain$SalePrice <-log(ctrain$SalePrice)
comp_all <-rbind(ctrain, ctest)

########################################BIVARIATE PLOTS############################
#####PLOT CONTINUOUS VARS
g1 <- ggplot(ctrain, aes(x = LotFrontage, y = SalePrice)) + geom_point(alpha = 0.6) + geom_smooth(method = "loess")

g2 <- ggplot(ctrain, aes(x = LotArea, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g3 <- ggplot(ctrain, aes(x = YearBuilt, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g4 <- ggplot(ctrain, aes(x = YearRemodAdd, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g5 <- ggplot(ctrain, aes(x = MasVnrArea, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g6 <- ggplot(ctrain, aes(x = BsmtFinSF1, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g7 <- ggplot(ctrain, aes(x = BsmtFinSF2, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g8 <- ggplot(ctrain, aes(x = BsmtUnfSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g9 <- ggplot(ctrain, aes(x = TotalBsmtSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g10 <- ggplot(ctrain, aes(x = X1stFlrSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g11 <- ggplot(ctrain, aes(x = X2ndFlrSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g12 <- ggplot(ctrain, aes(x = LowQualFinSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess") #problems with this graph, prob. becaue so many 0. 

g13 <- ggplot(ctrain, aes(x = GrLivArea, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g14 <- ggplot(ctrain, aes(x = BsmtFullBath, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g15 <- ggplot(ctrain, aes(x = BsmtHalfBath, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g16 <- ggplot(ctrain, aes(x = FullBath, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g17 <- ggplot(ctrain, aes(x = HalfBath, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g18 <- ggplot(ctrain, aes(x = BedroomAbvGr, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g19 <- ggplot(ctrain, aes(x = KitchenAbvGr, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g20 <- ggplot(ctrain, aes(x = TotRmsAbvGrd, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g21 <- ggplot(ctrain, aes(x = Fireplaces, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g22 <- ggplot(ctrain, aes(x = GarageYrBlt, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g23 <- ggplot(ctrain, aes(x = GarageCars, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g24 <- ggplot(ctrain, aes(x = GarageArea, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g25 <- ggplot(ctrain, aes(x = WoodDeckSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g26 <- ggplot(ctrain, aes(x = OpenPorchSF, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g27 <- ggplot(ctrain, aes(x = EnclosedPorch, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g28 <- ggplot(ctrain, aes(x = X3SsnPorch, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g29 <- ggplot(ctrain, aes(x = ScreenPorch, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g30 <- ggplot(ctrain, aes(x = PoolArea, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g31 <- ggplot(ctrain, aes(x = MiscVal, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

g32 <- ggplot(ctrain, aes(x = YrSold, y = SalePrice)) + geom_point() + 
  geom_smooth(method = "loess")

grid.arrange(g1, g2, g3, g4, g5, g6, g7, g8, g9, g10, g11, g12, g13, g14, g15, g16, g17, g18,
             g19, g20, g21, g22, g23, g24, g25, g26, g27, g28, g29, g30, g31, g32, ncol = 4)


g33 <-ggplot(ctrain, aes(x = reorder(MSSubClass, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2") + coord_flip()

g34<-ggplot(ctrain, aes(x = reorder(MSZoning, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g35 <-ggplot(ctrain, aes(x = reorder(Street, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g36 <-ggplot(ctrain, aes(x = reorder(Alley,SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g37 <-ggplot(ctrain, aes(x = reorder(LandContour, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g38 <-ggplot(ctrain, aes(x = reorder(LotConfig, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g39 <-ggplot(ctrain, aes(x = reorder(Neighborhood, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2") + coord_flip()

g40 <-ggplot(ctrain, aes(x = reorder(Condition1, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g41 <-ggplot(ctrain, aes(x = Condition2, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g42 <-ggplot(ctrain, aes(x = reorder(BldgType, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g43 <-ggplot(ctrain, aes(x = reorder(HouseStyle, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g44 <-ggplot(ctrain, aes(x = reorder(RoofStyle, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g45 <-ggplot(ctrain, aes(x = reorder(RoofMatl, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g46 <-ggplot(ctrain, aes(x = reorder(Exterior1st, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g47 <-ggplot(ctrain, aes(x = reorder(Exterior2nd, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g48 <-ggplot(ctrain, aes(x = reorder(MasVnrType, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g49 <-ggplot(ctrain, aes(x = reorder(Foundation, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g50 <-ggplot(ctrain, aes(x = reorder(Heating, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g51 <-ggplot(ctrain, aes(x = CentralAir, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g52 <-ggplot(ctrain, aes(x = reorder(GarageType, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g53 <-ggplot(ctrain, aes(x = reorder(MiscFeature, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

g54 <-ggplot(ctrain, aes(x = MoSold, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g55 <-ggplot(ctrain, aes(x = reorder(SaleType, SalePrice), y = SalePrice)) +
  geom_boxplot(col = "#3B9AB2")

g56 <-ggplot(ctrain, aes(x = reorder(SaleCondition, SalePrice), y = SalePrice)) + 
  geom_boxplot(col = "#3B9AB2")

grid.arrange(g33, g34, g35, g36, g37, g38, g39, g40, g41, g42, g43, g44, g45, g46, g47, g48, g49,
             g50, g51, g52, g53, g54, g55, g56, ncol = 3)


g57 <-ggplot(ctrain, aes(x = LotShape, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g58 <-ggplot(ctrain, aes(x = Utilities, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g59 <-ggplot(ctrain, aes(x = LandSlope, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g60 <- ggplot(ctrain, aes(x = factor(OverallQual), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g61 <- ggplot(ctrain, aes(x = factor(OverallCond), y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g62 <-ggplot(ctrain, aes(x = ExterQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g63 <-ggplot(ctrain, aes(x = ExterCond, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g64 <-ggplot(ctrain, aes(x = BsmtQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g65 <-ggplot(ctrain, aes(x = BsmtCond, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g66 <-ggplot(ctrain, aes(x = BsmtExposure, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g67 <-ggplot(ctrain, aes(x = BsmtFinType1, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g68 <-ggplot(ctrain, aes(x = BsmtFinType2, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g69 <-ggplot(ctrain, aes(x = HeatingQC, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g70 <-ggplot(ctrain, aes(x = Electrical, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g71 <-ggplot(ctrain, aes(x = KitchenQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g72 <-ggplot(ctrain, aes(x = Functional, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g73 <-ggplot(ctrain, aes(x = FireplaceQu, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g74 <-ggplot(ctrain, aes(x = GarageFinish, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g75 <-ggplot(ctrain, aes(x = GarageQual, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g76 <-ggplot(ctrain, aes(x = GarageCond, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g77 <-ggplot(ctrain, aes(x = PavedDrive, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g78 <-ggplot(ctrain, aes(x = PoolQC, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

g79 <-ggplot(ctrain, aes(x = Fence, y = SalePrice)) + geom_boxplot(col = "#3B9AB2")

grid.arrange(g57, g58, g59, g60, g61, g62, g63, g64, g65, g66, g67, g68, g69, g70, g71, g72, g73,
             g74, g75, g76, g77, g78, g79, ncol = 3)

################################RANDOM FOREST #1#####################################
set.seed(1986)
rf_train <-sample(1:nrow(ctrain), nrow(ctrain)*0.6)
rf_test <-ctrain[-rf_train,]
rf_y <-rf_test$SalePrice

rf_1 <-randomForest(SalePrice~.-Id, data = ctrain, subset = rf_train, importance = TRUE,ntree = 1000)
rf_1
varImpPlot(rf_1, main = "Random Forest Variable Importance")



###############################CONVERT ORDINALS TO INTEGERS########################
#select ordered factors
ordfac <-comp_all %>%
  select(where(is.factor)) %>%
  select(where(is.ordered))

ordfactors <-names(ordfac) #this is correct

#convert ordinal factors to integers
comp_all[ordfactors] <- lapply(comp_all[ordfactors], function(x) as.integer(x))

#update the training and test sets
ctrain <-subset(comp_all, comp_all$Id %in% ctrain$Id)
ctest <-subset(comp_all, comp_all$Id %in% ctest$Id)

###########################VARIANCE INFLATION FACTORS############################
#VIFs
lm1 <-lm(SalePrice~.-Id, data = ctrain) #linear model
options(scipen=999)
gvif <-data.frame(vif(lm1)) #Produces error about aliased coefficients

#Identify aliased coefficients
attributes(alias(lm1)$Complete)$dimnames[[1]]

#Check on composition of GrLivArea and TotalBsmtSF
df <-comp_all %>%
  filter(comp_all$LowQualFinSF > 0) %>%
  select (X1stFlrSF, X2ndFlrSF, LowQualFinSF, GrLivArea)
head(df)

df2 <-comp_all %>%
  filter(comp_all$BsmtUnfSF > 0) 

df2 <-df2 %>%
  filter(df2$BsmtFinSF2 > 0) %>%
  select(BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF)
head(df2)

#Drop aliased vars from VIF analysis
train_vif <-subset(ctrain, select = -c(Id, LowQualFinSF, X1stFlrSF, X2ndFlrSF, Exterior2nd, 
                                       BldgType, BsmtFinSF1,BsmtFinSF2))

lm2 <-lm(SalePrice~., data = train_vif)
options(scipen=999)
gvif <-data.frame(vif(lm2))
gvif$sqr <-gvif[,3]^2

gvifs <-gvif %>%
  filter(sqr > 5)

gvifs

#########################CORRELATION MATRIX#####################################33
drop <-subset(ctrain, select = -c(Id, X1stFlrSF, X2ndFlrSF, BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, LowQualFinSF))

quant <-drop %>%
  select(where(is.numeric))

cors <-cor(quant, use = "pairwise.complete.obs")
cors <-round(cors, 2)

corrplot1 <-ggcorrplot(cors, lab = TRUE,type= 'lower', hc.order = TRUE, lab_size = 2, tl.cex = 6)

#update the corrplot- drop variables with no corr > 0.3
quant <-subset(quant, select = -c(Functional, Fence, X3SsnPorch, Utilities, BsmtHalfBath, ScreenPorch, YrSold, LotShape, KitchenAbvGr, BsmtFinType2))

cors <-round(cor(quant), 2)
corrplot2 <-ggcorrplot(cors, lab = TRUE,type= 'lower', hc.order = TRUE, lab_size = 3, tl.cex = 8, tl.col = "black", show.legend = FALSE)
corrplot2

#######################FEATURE ENGINEERING################################
#Total SF
comp_all$TotalSF <-comp_all$GrLivArea + comp_all$TotalBsmtSF 

#Total bathrooms
comp_all$TotalBaths <-comp_all$BsmtHalfBath*0.5 + comp_all$HalfBath*0.5 + comp_all$FullBath + comp_all$BsmtFullBath


###############GRAPHICALLY EXPLORING NEIGHBORHOOD
ggplot(comp_all, aes(x = Neighborhood, y = SalePrice)) + geom_boxplot(col = "royalblue")

df <-ctrain %>%
  mutate(SalePrice =exp(SalePrice))

nb1 <- ggplot(df, aes(x=reorder(Neighborhood, SalePrice, fun = median), y=SalePrice)) +
  geom_bar(stat='summary', fun= "median", fill='#3B9AB2') +labs(x='Neighborhood', y="Median SalePrice") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma)+
  geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3) +
  geom_hline(yintercept=163000, linetype="dashed", color = "#F21A00")


median(df$SalePrice)
mean(df$SalePrice)

nb2 <- ggplot(df, aes(x=reorder(Neighborhood, SalePrice, fun = "mean"), y=SalePrice)) +
  geom_bar(stat='summary', fun= "mean", fill='#3B9AB2') +labs(x='Neighborhood', y="Mean SalePrice") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_y_continuous(breaks= seq(0, 800000, by=50000), labels = comma)+
  geom_label(stat = "count", aes(label = ..count.., y = ..count..), size=3) +
  geom_hline(yintercept=180151.2, linetype="dashed", color = "#F21A00")

grid.arrange(nb1, nb2)

#Binning neighborhood
comp_all$NeighRich[comp_all$Neighborhood %in% c('StoneBr', 'NridgHt', 'NoRidge')] <- 2
comp_all$NeighRich[!comp_all$Neighborhood %in% c('MeadowV', 'IDOTRR', 'BrDale', 'StoneBr', 'NridgHt', 'NoRidge')] <- 1
comp_all$NeighRich[comp_all$Neighborhood %in% c('MeadowV', 'IDOTRR', 'BrDale')] <- 0

ctrain <-subset(comp_all, comp_all$Id %in% ctrain$Id)
ctest <-subset(comp_all, comp_all$Id %in% ctest$Id)


#########################3#####UPDATE DATASET- DROP UNNECESSARY VARS#########################
drop <-c("GrLivArea", "TotalBsmtSF", "X1stFlrSF", "X2ndFlrSF", "BsmtFinSF1", "BsmtFinSF2", 
         "BsmtUnfSF", "LowQualFinSF", "BsmtHalfBath", "HalfBath", "BsmtFullBath", "FullBath",
         "BldgType", "Exterior2nd", "Utilities", "GarageYrBlt", "GarageCond", "GarageArea", 
         "PoolQC", "Fireplaces", "MiscVal", "Utilities", "TotRmsAbvGrd", "BldgType", "Exterior2nd")

comp_all <-comp_all[, !(names(comp_all) %in% drop)]

ctrain <-subset(comp_all, comp_all$Id %in% ctrain$Id) 
ctest <-subset(comp_all, comp_all$Id %in% ctest$Id)

############################RANDOM FOREST #2####################################
set.seed(1986)
rf_train <-sample(1:nrow(ctrain), nrow(ctrain)*0.6)
rf_test <-ctrain[-rf_train,]
rf_y <-rf_test$SalePrice

rf_2 <-randomForest(SalePrice~.-Id, data = ctrain, subset = rf_train, importance = TRUE, ntree = 1000)
rf_2
varImpPlot(rf_2, main = "Random Forest Variable Importance- v.2")



################################3FIX SKEWNESS#######################################
ordfac <-ordfac[names(ordfac) %in% names(comp_all)] #removing factors from ordfac that were dropped from comp_all
ordfac <-comp_all[,names(ordfac)] #ordfac are now integers rather than factors


numeric <-comp_all%>%
  select(where(is.numeric))
numeric <-numeric[,!(names(numeric) %in% names(ordfac))] #true numerics only, incl. id and SP

cats <-comp_all %>%
  select(where(is.factor))

##### FIX THIS CODE
SalePrice <-comp_all$SalePrice
Id <-comp_all$Id

DFnum <-cbind(numeric, ordfac) #all numeric and ord. fac. data
drop <-c("SalePrice", "Id")

DFnum<-DFnum[,!names(DFnum) %in% drop]

#DENSITY PLOTS BEFORE
d1 <-ggplot(DFnum, aes(x = LotFrontage))+ geom_density(); d1
skewness(DFnum$LotFrontage)

d2 <-ggplot(DFnum, aes(x = TotalSF)) + geom_density(); d2
skewness(DFnum$TotalSF)


###Adjust skewness
for(i in 1:ncol(DFnum)){
  if(abs(skew(DFnum[,i])) > 0.8){
    DFnum[,i] <-log(DFnum[,i] + 1)
  }
}

comp_all <-cbind(DFnum, cats)
comp_all$Id <-Id

#density plots after
d4 <-ggplot(DFnum, aes(x = LotFrontage))+ geom_density()
skewness(DFnum$LotFrontage)

d5 <-ggplot(DFnum, aes(x = TotalSF)) + geom_density()
skewness(DFnum$TotalSF)

grid.arrange(d1, d3)

grid.arrange(d2, d4)

####simple linear model####################################
set.seed(1986)

##############LASSO: CROSS-VALIDATION TO PRED. TEST ERROR######################


#################################LASSO MODEL####################################
x <-model.matrix(~.-Id, data = comp_all)[,-1]#remove x intercept 
y <-SalePrice
grid <-10^seq(10, -2, length = 100)

#Separate training and test sets
x.train <-x[1:1458,]
x.test <-x[1459:nrow(x),]
y.train <-y[1:1458]

set.seed(1986)
cv.lass <-cv.glmnet(x.train, y.train, alpha = 1)
plot(cv.lass)
bestlam <-cv.lass$lambda.min; bestlam

cv.mse <-cv.glmnet(x.train, y.train, alpha = 1, nfolds = 20, type.measure = "mse")
cv.rmse <-sqrt(cv.mse$lambda.min)
cv.rmse

set.seed(1986)
lasso.mod <-glmnet(x.train, y.train, alpha = 1)
plot(lasso.mod)

pred<-predict(lasso.mod, s = bestlam, newx = x.test)
preds <-exp(pred)

sub <-data.frame(Id = ctest$Id, SalePrice = preds)
names(sub) <-c("Id", "SalePrice")
write.csv(sub, file = "KaggleHousing_WriteupPredsOuts5.csv", row.names = F)