In this project, we are presented with a train and test set of the Ames, Iowa housing data from Kaggle. The goal is to create a model capable of predicting house prices using the available variables in the dataset. The Multiple Linear Regression model will be explored and utilized in this analysis.
# Importing the necessary libraries and packages
library(ggplot2)
library(data.table)
library(mice)
library(caTools)
library(dplyr)
library(MASS)
library(glmnet)
library(factoextra)
library(randomForest)
# Importing the data
data <- read.csv("training.csv")
# Exploring basic information about the data
colnames(data)
[1] "Id" "MSSubClass" "MSZoning" "LotFrontage" "LotArea"
[6] "Street" "Alley" "LotShape" "LandContour" "Utilities"
[11] "LotConfig" "LandSlope" "Neighborhood" "Condition1" "Condition2"
[16] "BldgType" "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
[21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
[26] "MasVnrType" "MasVnrArea" "ExterQual" "ExterCond" "Foundation"
[31] "BsmtQual" "BsmtCond" "BsmtExposure" "BsmtFinType1" "BsmtFinSF1"
[36] "BsmtFinType2" "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
[41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF" "X2ndFlrSF"
[46] "LowQualFinSF" "GrLivArea" "BsmtFullBath" "BsmtHalfBath" "FullBath"
[51] "HalfBath" "BedroomAbvGr" "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd"
[56] "Functional" "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
[61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual" "GarageCond"
[66] "PavedDrive" "WoodDeckSF" "OpenPorchSF" "EnclosedPorch" "X3SsnPorch"
[71] "ScreenPorch" "PoolArea" "PoolQC" "Fence" "MiscFeature"
[76] "MiscVal" "MoSold" "YrSold" "SaleType" "SaleCondition"
[81] "SalePrice"
str(data)
'data.frame': 1060 obs. of 81 variables:
$ Id : int 1 3 4 5 8 10 11 12 14 15 ...
$ MSSubClass : int 60 60 70 60 60 190 20 60 20 20 ...
$ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 4 4 ...
$ LotFrontage : int 65 68 60 84 NA 50 70 85 91 NA ...
$ LotArea : int 8450 11250 9550 14260 10382 7420 11200 11924 10652 10920 ...
$ Street : Factor w/ 2 levels "Grvl","Pave": 2 2 2 2 2 2 2 2 2 2 ...
$ Alley : Factor w/ 2 levels "Grvl","Pave": NA NA NA NA NA NA NA NA NA NA ...
$ LotShape : Factor w/ 4 levels "IR1","IR2","IR3",..: 4 1 1 1 1 4 4 1 1 1 ...
$ LandContour : Factor w/ 4 levels "Bnk","HLS","Low",..: 4 4 4 4 4 4 4 4 4 4 ...
$ Utilities : Factor w/ 1 level "AllPub": 1 1 1 1 1 1 1 1 1 1 ...
$ LotConfig : Factor w/ 5 levels "Corner","CulDSac",..: 5 5 1 3 1 1 5 5 5 1 ...
$ LandSlope : Factor w/ 3 levels "Gtl","Mod","Sev": 1 1 1 1 1 1 1 1 1 1 ...
$ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 6 7 14 17 4 19 16 6 13 ...
$ Condition1 : Factor w/ 9 levels "Artery","Feedr",..: 3 3 3 3 5 1 3 3 3 3 ...
$ Condition2 : Factor w/ 6 levels "Artery","Feedr",..: 3 3 3 3 3 1 3 3 3 3 ...
$ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 2 1 1 1 1 ...
$ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 6 6 6 6 2 3 6 3 3 ...
$ OverallQual : int 7 7 7 8 7 5 5 9 7 6 ...
$ OverallCond : int 5 5 5 5 6 6 5 5 5 5 ...
$ YearBuilt : int 2003 2001 1915 2000 1973 1939 1965 2005 2006 1960 ...
$ YearRemodAdd : int 2003 2002 1970 2000 1973 1950 1965 2006 2007 1960 ...
$ RoofStyle : Factor w/ 6 levels "Flat","Gable",..: 2 2 2 2 2 2 4 4 2 4 ...
$ RoofMatl : Factor w/ 7 levels "ClyTile","CompShg",..: 2 2 2 2 2 2 2 2 2 2 ...
$ Exterior1st : Factor w/ 14 levels "AsbShng","AsphShn",..: 12 12 13 12 7 8 7 14 12 8 ...
$ Exterior2nd : Factor w/ 15 levels "AsbShng","AsphShn",..: 13 13 15 13 7 9 7 15 13 9 ...
$ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 2 3 2 4 3 3 4 4 2 ...
$ MasVnrArea : int 196 162 0 350 240 0 0 286 306 212 ...
$ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 4 3 4 4 4 1 3 4 ...
$ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
$ Foundation : Factor w/ 6 levels "BrkTil","CBlock",..: 3 3 1 3 2 1 2 3 3 2 ...
$ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 4 3 3 4 4 1 3 4 ...
$ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 2 4 4 4 4 4 4 4 ...
$ BsmtExposure : Factor w/ 4 levels "Av","Gd","Mn",..: 4 3 4 1 3 4 4 4 1 4 ...
$ BsmtFinType1 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 3 3 1 3 1 3 5 3 6 2 ...
$ BsmtFinSF1 : int 706 486 216 655 859 851 906 998 0 733 ...
$ BsmtFinType2 : Factor w/ 6 levels "ALQ","BLQ","GLQ",..: 6 6 6 6 2 6 6 6 6 6 ...
$ BsmtFinSF2 : int 0 0 0 0 32 0 0 0 0 0 ...
$ BsmtUnfSF : int 150 434 540 490 216 140 134 177 1494 520 ...
$ TotalBsmtSF : int 856 920 756 1145 1107 991 1040 1175 1494 1253 ...
$ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
$ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 3 1 1 1 1 1 1 5 ...
$ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
$ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 5 5 ...
$ X1stFlrSF : int 856 920 961 1145 1107 1077 1040 1182 1494 1253 ...
$ X2ndFlrSF : int 854 866 756 1053 983 0 0 1142 0 0 ...
$ LowQualFinSF : int 0 0 0 0 0 0 0 0 0 0 ...
$ GrLivArea : int 1710 1786 1717 2198 2090 1077 1040 2324 1494 1253 ...
$ BsmtFullBath : int 1 1 1 1 1 1 1 1 0 1 ...
$ BsmtHalfBath : int 0 0 0 0 0 0 0 0 0 0 ...
$ FullBath : int 2 2 1 2 2 1 1 3 2 1 ...
$ HalfBath : int 1 1 0 1 1 0 0 0 0 1 ...
$ BedroomAbvGr : int 3 3 3 4 3 2 3 4 3 2 ...
$ KitchenAbvGr : int 1 1 1 1 1 2 1 1 1 1 ...
$ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 3 4 4 4 1 3 4 ...
$ TotRmsAbvGrd : int 8 6 7 9 7 5 5 11 7 5 ...
$ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 7 7 ...
$ Fireplaces : int 0 1 1 1 2 2 0 2 1 1 ...
$ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 3 5 5 5 NA 3 3 2 ...
$ GarageType : Factor w/ 6 levels "2Types","Attchd",..: 2 2 6 2 2 2 6 4 2 2 ...
$ GarageYrBlt : int 2003 2001 1998 2000 1973 1939 1965 2005 2006 1960 ...
$ GarageFinish : Factor w/ 3 levels "Fin","RFn","Unf": 2 2 3 2 2 2 3 1 2 2 ...
$ GarageCars : int 2 2 3 3 2 1 1 3 3 1 ...
$ GarageArea : int 548 608 642 836 484 205 384 736 840 352 ...
$ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 3 5 5 5 5 ...
$ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
$ PavedDrive : Factor w/ 3 levels "N","P","Y": 3 3 3 3 3 3 3 3 3 3 ...
$ WoodDeckSF : int 0 0 0 192 235 0 0 147 160 0 ...
$ OpenPorchSF : int 61 42 35 84 204 4 0 21 33 213 ...
$ EnclosedPorch: int 0 0 272 0 228 0 0 0 0 176 ...
$ X3SsnPorch : int 0 0 0 0 0 0 0 0 0 0 ...
$ ScreenPorch : int 0 0 0 0 0 0 0 0 0 0 ...
$ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
$ PoolQC : Factor w/ 3 levels "Ex","Fa","Gd": NA NA NA NA NA NA NA NA NA NA ...
$ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA NA NA NA NA 2 ...
$ MiscFeature : Factor w/ 4 levels "Gar2","Othr",..: NA NA NA NA 3 NA NA NA NA NA ...
$ MiscVal : int 0 0 0 0 350 0 0 0 0 0 ...
$ MoSold : int 2 9 2 12 11 1 2 7 8 5 ...
$ YrSold : int 2008 2008 2006 2008 2009 2008 2008 2006 2007 2008 ...
$ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 7 7 9 ...
$ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 1 5 5 5 5 6 6 5 ...
$ SalePrice : int 208500 223500 140000 250000 200000 118000 129500 345000 279500 157000 ...
It looks like we have 1060 observations and 80 variables, excluding the target variable ‘SalePrice’: 23 nominal, 23 ordinal, 14 discrete, and 20 continuous. It is also clear that the dataset contains missing values. Let’s determine the distribution of the missing data.
# Checking what percent of all the data is composed of missing values
(sum(is.na(data))/(nrow(data)*ncol(data)))*100
[1] 5.885162
# Let's get a breakdown of the percentage of missing values in each column
missing.col <- (colSums(is.na(data))[colSums(is.na(data)) > 0]/nrow(data))*100
missing.col.percent <- setorder(data.frame(missing.col), missing.col)
colnames(missing.col.percent) <- 'percent.missing'
missing.col.percent
It seems that ‘LotFrontage’, ‘FireplaceQu’, ‘Fence’, ‘Alley’, ‘MiscFeature’, and ‘PoolQC’ each possess a significant proportion of missing data, over 5%. More pressing is the fact that some of the variables, such as ‘PoolQC’, contain almost no data at all, and even ‘LotFrontage’, which has the least missing data of the variables just mentioned, is missing almost 20% of its values.
After consulting the data documentation, it seems that many of the variables use ‘NA’ to represent the absence of a feature. For example, ‘FireplaceQu’ represents the fireplace quality of a home. Although this variable is missing almost half of its data, that is because a missing value denotes the absence of a fireplace in the home, not a missing piece of information. So, before continuing, let’s add ‘NA’ as a factor level where it is appropriate based on the data documentation.
# Adding 'NA' as a factor level (addNA recodes NA values as an explicit level)
col_list <- c('Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature')
data[col_list] <- lapply(data[col_list], addNA)
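As a quick sanity check (a couple of lines we add here), the recoded factors now carry NA as an explicit level and report zero missing values:
# NA now appears as a level, so is.na() no longer flags these entries
levels(data$FireplaceQu)
sum(is.na(data$FireplaceQu))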
Now, let’s see how much missing data remains in the dataset.
(sum(is.na(data))/(nrow(data)*ncol(data)))*100
[1] 0.3004892
missing.col <- (colSums(is.na(data))[colSums(is.na(data)) > 0]/nrow(data))*100
missing.col.percent <- setorder(data.frame(missing.col), missing.col)
colnames(missing.col.percent) <- 'percent.missing'
missing.col.percent
str(data[,c('MasVnrType','MasVnrArea','GarageYrBlt','LotFrontage')])
'data.frame': 1060 obs. of 4 variables:
$ MasVnrType : Factor w/ 4 levels "BrkCmn","BrkFace",..: 2 2 3 2 4 3 3 4 4 2 ...
$ MasVnrArea : int 196 162 0 350 240 0 0 286 306 212 ...
$ GarageYrBlt: int 2003 2001 1998 2000 1973 1939 1965 2005 2006 1960 ...
$ LotFrontage: int 65 68 60 84 NA 50 70 85 91 NA ...
Wow, what a difference. We have reduced the total proportion of missing data from ~5.9% to ~0.3%, and where we had 18 variables with missing values, we now only have 4.
Now, we will use the ‘mice’ package to impute any missing values that remain. In order to prevent the injection of any personal bias, we will preemptively split the data into a train and test set first and impute the two data sets independently. Before that, let’s see if there are any columns we can, and should, remove altogether because they provide no additional predictive power or information.
# Checking which variables have only a single value for all observations
for (x in colnames(data)) {
  if (length(unique(data[,x])) == 1) {
    print(x)
  }
}
[1] "Utilities"
We see that ‘Utilities’ gives us no additional predictive power, as the only value present for all observations is ‘AllPub’. Furthermore, the ‘Id’ variable serves no purpose other than identification and should not be included in the model. For these reasons, both of these columns will be removed from the dataset.
# Dropping the 'Id' and 'Utilities' columns
data <- data[, !(names(data) %in% c('Id','Utilities'))]
Let’s continue with the imputation of the remaining missing values.
# Creating a train and test split
set.seed(2019)
split <- sample.split(data$SalePrice, SplitRatio = 0.8)
train <- subset(data, split == TRUE)
test <- subset(data, split == FALSE)
test.x <- test[ ,!(names(test) %in% c("SalePrice"))]
test.y <- test$SalePrice
# Imputing missing values with CART-based chained equations; m = 1 keeps a
# single completed dataset rather than pooling across multiple imputations
train.mids <- mice(train, m=1, method='cart', seed=2019)
test.mids <- mice(test.x, m=1, method='cart', seed=2019)
train.imputed <- complete(train.mids,1)
test.imputed <- complete(test.mids,1)
# Confirming that there are no more missing values in the dataset
train.imp.miss <- (sum(is.na(train.imputed))/(nrow(train.imputed)*ncol(train.imputed)))*100
test.imp.miss <- (sum(is.na(test.imputed))/(nrow(test.imputed)*ncol(test.imputed)))*100
print("Percent missing data in imputed train set: ")
[1] "Percent missing data in imputed train set: "
print(train.imp.miss)
[1] 0
print("Percent missing data in imputed test set: ")
[1] "Percent missing data in imputed test set: "
print(test.imp.miss)
[1] 0
Now, let’s continue by exploring the correlations between the variables in the dataset.
# Creating a dataframe of correlations between SalePrice and numeric vars.
train.numeric <- dplyr::select_if(data, is.numeric)
corr.matrix <- cor(train.numeric, use = 'pairwise.complete.obs')
# Flattening the upper triangle of the correlation matrix into long format
corr.df <- data.frame(row = rownames(corr.matrix)[row(corr.matrix)[upper.tri(corr.matrix)]],
                      col = colnames(corr.matrix)[col(corr.matrix)[upper.tri(corr.matrix)]],
                      corr = corr.matrix[upper.tri(corr.matrix)])
corr.df <- corr.df[order(corr.df$corr),]
sale.price.corr <- corr.df[corr.df$col == 'SalePrice',]
# The five weakest and five strongest correlations with SalePrice
rbind(head(sale.price.corr,5), tail(sale.price.corr,5))
It looks like ‘OverallQual’, ‘GrLivArea’, ‘GarageCars’, and ‘GarageArea’ all have “significant” correlations with the response variable, SalePrice; by “significant”, we mean a correlation above 0.6. These may be important predictors of SalePrice.
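As a quick check, the same dataframe can be filtered at that threshold (a convenience line we add here, not part of the original analysis):
# Numeric variables whose correlation with SalePrice exceeds 0.6 in magnitude
sale.price.corr[abs(sale.price.corr$corr) > 0.6, ]
Let us also visualize the distribution of SalePrice.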
# Plot of SalePrice distribution
sp.plot <- ggplot(data = train.imputed, aes(x = SalePrice)) +
  geom_histogram(bins = 50, color = 'white', fill = 'dodgerblue') +
  labs(title = "Histogram of 'SalePrice'") +
  xlab("SalePrice")
log.sp.plot <- ggplot(data = train.imputed, aes(x = log(SalePrice))) +
  geom_histogram(bins = 50, color = 'white', fill = 'dodgerblue') +
  labs(title = "Histogram of log('SalePrice')") +
  xlab("log('SalePrice')")
sp.plot
log.sp.plot
The distribution does seem to become more normal when a log transform is applied. This may improve the performance of a linear model. We will come back later to determine whether the log transform really has a beneficial effect.
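As a rough numeric check of that visual impression, we can compare the sample skewness before and after the transform (a small sketch using a hand-rolled skewness function, since no skewness helper is loaded above):
# Sample skewness from first principles
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3
skew(train.imputed$SalePrice)      # strongly positive: right-skewed
skew(log(train.imputed$SalePrice)) # much closer to zero after the transform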
Let’s now utilize methodical approaches to select the features that will provide our model with the most predictive power. We will explore forward, backward, and stepwise selection as well as LASSO Regression.
# 'Exterior1st' has levels in the test set that are not accounted for in the train set, so it is dropped
train.imputed <- train.imputed[ ,!(names(train.imputed) %in% c("Exterior1st"))]
test.imputed <- test.imputed[ ,!(names(test.imputed) %in% c("Exterior1st"))]
# Creating a dataframe with the log transform of SalePrice
train.imputed.log <- train.imputed
train.imputed.log$SalePrice <- log(train.imputed.log$SalePrice)
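The selection objects referenced below (step.f.log, step.b.log, step.s.log) were fit off-screen; a minimal sketch of how they could be produced with stepAIC from the MASS package (assuming the default AIC criterion) is:
# Hypothetical reconstruction of the selection step, not the original code
null.lm <- lm(SalePrice ~ 1, data = train.imputed.log)
full.lm <- lm(SalePrice ~ ., data = train.imputed.log)
step.f.log <- stepAIC(null.lm, scope = formula(full.lm), direction = "forward", trace = FALSE)
step.b.log <- stepAIC(full.lm, direction = "backward", trace = FALSE)
step.s.log <- stepAIC(full.lm, direction = "both", trace = FALSE)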
# Selection results (on log(SalePrice))
step.f.log$call # Forward results
lm(formula = SalePrice ~ OverallQual + Neighborhood + GrLivArea +
GarageCars + OverallCond + BsmtFullBath + RoofMatl + TotalBsmtSF +
YearBuilt + Condition2 + MSZoning + Fireplaces + BsmtFinSF1 +
LotArea + Functional + SaleType + SaleCondition + Heating +
BldgType + ScreenPorch + YearRemodAdd + CentralAir + WoodDeckSF +
EnclosedPorch + KitchenQual + GarageArea + PoolQC + HeatingQC +
TotRmsAbvGrd + BsmtExposure + LotConfig + LotFrontage + X3SsnPorch +
Condition1 + GarageCond + GarageYrBlt + HouseStyle + X1stFlrSF +
Fence + LandSlope + Foundation, data = train.imputed.log)
print("######################")
[1] "######################"
step.b.log$call # Backward results
lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
LotArea + LotConfig + LandSlope + Neighborhood + Condition1 +
Condition2 + HouseStyle + OverallQual + OverallCond + YearBuilt +
YearRemodAdd + RoofMatl + MasVnrArea + Foundation + BsmtFinSF1 +
Heating + HeatingQC + CentralAir + X1stFlrSF + X2ndFlrSF +
BsmtFullBath + KitchenQual + TotRmsAbvGrd + Functional +
Fireplaces + GarageYrBlt + GarageCars + GarageArea + GarageCond +
WoodDeckSF + EnclosedPorch + X3SsnPorch + ScreenPorch + PoolQC +
Fence + SaleType + SaleCondition, data = train.imputed.log)
print("######################")
[1] "######################"
step.s.log$call # Stepwise results
lm(formula = SalePrice ~ MSSubClass + MSZoning + LotFrontage +
LotArea + LotConfig + LandSlope + Neighborhood + Condition1 +
Condition2 + HouseStyle + OverallQual + OverallCond + YearBuilt +
YearRemodAdd + RoofMatl + Foundation + BsmtFinSF1 + Heating +
HeatingQC + CentralAir + X1stFlrSF + X2ndFlrSF + BsmtFullBath +
KitchenQual + TotRmsAbvGrd + Functional + Fireplaces + GarageYrBlt +
GarageCars + GarageArea + GarageCond + WoodDeckSF + EnclosedPorch +
X3SsnPorch + ScreenPorch + PoolQC + Fence + SaleType + SaleCondition +
TotalBsmtSF + KitchenAbvGr, data = train.imputed.log)
# Conducting PCA on the stepwise selected feature set using log SalePrice
# Scaling the subset of the predictors
stepwise.vars <- c('MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'LotConfig',
                   'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'HouseStyle',
                   'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofMatl',
                   'Foundation', 'BsmtFinSF1', 'Heating', 'HeatingQC', 'CentralAir',
                   'X1stFlrSF', 'X2ndFlrSF', 'BsmtFullBath', 'KitchenQual', 'TotRmsAbvGrd',
                   'Functional', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea',
                   'GarageCond', 'WoodDeckSF', 'EnclosedPorch', 'X3SsnPorch', 'ScreenPorch',
                   'PoolQC', 'Fence', 'SaleType', 'SaleCondition', 'TotalBsmtSF', 'KitchenAbvGr')
train.imputed.log.pca <- train.imputed.log[, stepwise.vars]
test.imputed.log.pca <- test.imputed[, stepwise.vars]
# Dropping factor columns that cannot be one-hot encoded consistently across
# the train and test model matrices
unusable.cols <- c("Neighborhood", "Condition1", "HouseStyle", "RoofMatl", "Heating", "GarageCond", "SaleCondition", "LandSlope", "Condition2", "Foundation", "HeatingQC", "Functional", "PoolQC", "SaleType")
pca.matrix.train <- train.imputed.log.pca[,!(names(train.imputed.log.pca) %in% unusable.cols)]
pca.matrix.train <- model.matrix( ~.-1, data=pca.matrix.train )
pca.matrix.train <- scale(pca.matrix.train)
pca.matrix.test <- test.imputed.log.pca[,!(names(test.imputed.log.pca) %in% unusable.cols)]
pca.matrix.test <- model.matrix( ~.-1, data=pca.matrix.test )
pca.matrix.test <- scale(pca.matrix.test)
# PCA
all.pca <- prcomp(pca.matrix.train, scale = FALSE)
summary(all.pca)
Importance of components:
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
Standard deviation 2.6543 1.74991 1.49577 1.40279 1.29090 1.24162 1.20703 1.18736 1.14632
Proportion of Variance 0.1807 0.07852 0.05737 0.05046 0.04273 0.03953 0.03736 0.03615 0.03369
Cumulative Proportion 0.1807 0.25917 0.31654 0.36700 0.40973 0.44925 0.48661 0.52276 0.55645
PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
Standard deviation 1.0672 1.05741 1.04090 1.02670 1.00043 0.99244 0.98070 0.96227 0.93821
Proportion of Variance 0.0292 0.02867 0.02778 0.02703 0.02566 0.02525 0.02466 0.02374 0.02257
Cumulative Proportion 0.5857 0.61433 0.64211 0.66914 0.69480 0.72005 0.74472 0.76846 0.79103
PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27
Standard deviation 0.91036 0.90483 0.88163 0.84133 0.8189 0.7876 0.75323 0.7441 0.69868
Proportion of Variance 0.02125 0.02099 0.01993 0.01815 0.0172 0.0159 0.01455 0.0142 0.01252
Cumulative Proportion 0.81228 0.83327 0.85320 0.87135 0.8885 0.9044 0.91900 0.9332 0.94571
PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36
Standard deviation 0.64126 0.57725 0.51007 0.50374 0.44622 0.39704 0.36157 0.33215 0.30779
Proportion of Variance 0.01054 0.00854 0.00667 0.00651 0.00511 0.00404 0.00335 0.00283 0.00243
Cumulative Proportion 0.95626 0.96480 0.97147 0.97798 0.98308 0.98713 0.99048 0.99331 0.99574
PC37 PC38 PC39
Standard deviation 0.30342 0.2724 9.809e-15
Proportion of Variance 0.00236 0.0019 0.000e+00
Cumulative Proportion 0.99810 1.0000 1.000e+00
# Visualize scree plot
fviz_eig(all.pca)
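To read the cumulative-variance row of the summary programmatically, we can compute the number of components needed to capture 90% of the variance (a convenience snippet added here):
# First component at which cumulative variance reaches 90% (PC24, per the summary above)
cum.var <- cumsum(all.pca$sdev^2) / sum(all.pca$sdev^2)
which(cum.var >= 0.9)[1]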
# Plot of individual groupings
#fviz_pca_ind(all.pca, col.ind = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE)
# Variable contribution to each PC
fviz_pca_var(all.pca, axes = c(1, 2), col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = FALSE)
fviz_pca_var(all.pca, axes = c(2, 3), col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = FALSE)
fviz_pca_var(all.pca, axes = c(3, 4), col.var = "contrib", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = FALSE)
# Biplot
#fviz_pca_biplot(all.pca, repel = TRUE, col.var = "#2E9FDF", col.ind = "#696969")
# Projecting each observation onto the principal components
train.pca <- data.frame(predict(all.pca, newdata = pca.matrix.train))
train.pca$SalePrice <- train.imputed.log$SalePrice
test.pca <- data.frame(predict(all.pca, newdata = pca.matrix.test))
Let’s predict the ‘SalePrice’ of each test observation to be the average of all the train ‘SalePrice’ values, and use the resulting metrics as a benchmark against which all subsequent model performances will be compared.
# Let's predict the average for each test observation and use it as a benchmark
sale.price.mean <- mean(train.imputed$SalePrice)
avg.vector <- rep(sale.price.mean, times = length(test.y))
cat("Bias: ", mean(avg.vector-test.y))
Bias: 7299.447
cat("\nMaximum Deviation: ", max(avg.vector-test.y))
Maximum Deviation: 146626.7
cat("\nMean Absolute Deviation: ", mean(abs(avg.vector-test.y)))
Mean Absolute Deviation: 53682.19
cat("\nMean Square Error: ", mean((avg.vector-test.y)**2))
Mean Square Error: 4959006479
cat("\nRoot Mean Square Error: ", sqrt(mean((avg.vector-test.y)**2)))
Root Mean Square Error: 70420.21
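Since the same five metrics are reported for every model below, they could be bundled into a small helper; this is a convenience we sketch here (report.metrics is our own name, not part of the original analysis):
# Bundles the five evaluation metrics used throughout this writeup
report.metrics <- function(pred, actual) {
  err <- pred - actual
  cat("Bias: ", mean(err),
      "\nMaximum Deviation: ", max(err),
      "\nMean Absolute Deviation: ", mean(abs(err)),
      "\nMean Square Error: ", mean(err^2),
      "\nRoot Mean Square Error: ", sqrt(mean(err^2)), "\n")
}
For example, report.metrics(avg.vector, test.y) would reproduce the benchmark figures above.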
Moving on to actually building the models.
# Linear model with forward selected features (log SalePrice)
forward.lm.log <- lm(formula(step.f.log$call), data = train.imputed.log)
# Predictions are back-transformed from the log scale with exp()
forward.predictions.log <- exp(predict(forward.lm.log, test.imputed))
summary(forward.lm.log)$adj.r.squared
[1] 0.9268791
cat("Bias: ", mean(forward.predictions.log-test.y))
Bias: -522.4962
cat("\nMaximum Deviation: ", max(forward.predictions.log-test.y))
Maximum Deviation: 94900.78
cat("\nMean Absolute Deviation: ", mean(abs(forward.predictions.log-test.y)))
Mean Absolute Deviation: 15715.34
cat("\nMean Square Error: ", mean((forward.predictions.log-test.y)**2))
Mean Square Error: 516572433
cat("\nRoot Mean Square Error: ", sqrt(mean((forward.predictions.log-test.y)**2)))
Root Mean Square Error: 22728.23
Linear model with backward selected features
# Linear model with backward selected features (log SalePrice)
backward.lm.log <- lm(formula(step.b.log$call), data = train.imputed.log)
backward.predictions.log <- exp(predict(backward.lm.log, test.imputed))
summary(backward.lm.log)$adj.r.squared
[1] 0.9265143
cat("Bias: ", mean(backward.predictions.log-test.y))
Bias: -448.4182
cat("\nMaximum Deviation: ", max(backward.predictions.log-test.y))
Maximum Deviation: 77974.29
cat("\nMean Absolute Deviation: ", mean(abs(backward.predictions.log-test.y)))
Mean Absolute Deviation: 15836.28
cat("\nMean Square Error: ", mean((backward.predictions.log-test.y)**2))
Mean Square Error: 527659567
cat("\nRoot Mean Square Error: ", sqrt(mean((backward.predictions.log-test.y)**2)))
Root Mean Square Error: 22970.84
Linear model with stepwise selected features
# Linear model with stepwise selected features (log SalePrice)
stepwise.lm.log <- lm(formula(step.s.log$call), data = train.imputed.log)
stepwise.predictions.log <- exp(predict(stepwise.lm.log, test.imputed))
summary(stepwise.lm.log)$adj.r.squared
[1] 0.9266278
cat("Bias: ", mean(stepwise.predictions.log-test.y))
Bias: -475.9706
cat("\nMaximum Deviation: ", max(stepwise.predictions.log-test.y))
Maximum Deviation: 88851.67
cat("\nMean Absolute Deviation: ", mean(abs(stepwise.predictions.log-test.y)))
Mean Absolute Deviation: 15491.4
cat("\nMean Square Error: ", mean((stepwise.predictions.log-test.y)**2))
Mean Square Error: 507986943
cat("\nRoot Mean Square Error: ", sqrt(mean((stepwise.predictions.log-test.y)**2)))
Root Mean Square Error: 22538.57
PCA
# Linear model with PCA terms from log SalePrice and stepwise variables
pca.fit <- lm(SalePrice~., data = train.pca)
pca.predictions <- exp(predict(pca.fit, test.pca))
cat("Bias: ", mean(pca.predictions-test.y))
Bias: 2950.563
cat("\nMaximum Deviation: ", max(pca.predictions-test.y))
Maximum Deviation: 264325.1
cat("\nMean Absolute Deviation: ", mean(abs(pca.predictions-test.y)))
Mean Absolute Deviation: 71223.44
cat("\nMean Square Error: ", mean((pca.predictions-test.y)**2))
Mean Square Error: 9501566441
cat("\nRoot Mean Square Error: ", sqrt(mean((pca.predictions-test.y)**2)))
Root Mean Square Error: 97475.98
LASSO Regression
# LASSO Regression (log SalePrice)
lr.train.log <- model.matrix(SalePrice~., data = train.imputed.log)
test.imputed$SalePrice <- test.y
lr.test <- model.matrix(SalePrice~., data = test.imputed)
# Cross-validating over a grid of lambda values
grid <- 10^seq(4, -2, length = 100)
set.seed(2019)
cv.lasso.log <- cv.glmnet(lr.train.log, train.imputed.log$SalePrice, alpha = 1, lambda = grid)
bestlam.log <- cv.lasso.log$lambda.min
bestlam.log # Best lambda = 0.03511192
[1] 0.03511192
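Out of curiosity, we can also inspect how aggressively the LASSO prunes at this lambda (an exploratory line added here, not in the original analysis):
# Number of nonzero coefficients retained at the cross-validated lambda
sum(coef(cv.lasso.log, s = "lambda.min") != 0)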
set.seed(2019)
# Fitting the LASSO at the cross-validated lambda (log SalePrice)
fit.lasso.log <- glmnet(lr.train.log, train.imputed.log$SalePrice, alpha = 1, lambda = bestlam.log)
lasso.predict.log <- exp(predict(fit.lasso.log, lr.test))
cat("Bias: ", mean(lasso.predict.log-test.y))
Bias: -7266.865
cat("\nMaximum Deviation: ", max(lasso.predict.log-test.y))
Maximum Deviation: 62073.74
cat("\nMean Absolute Deviation: ", mean(abs(lasso.predict.log-test.y)))
Mean Absolute Deviation: 19777
cat("\nMean Square Error: ", mean((lasso.predict.log-test.y)**2))
Mean Square Error: 868229681
cat("\nRoot Mean Square Error: ", sqrt(mean((lasso.predict.log-test.y)**2)))
Root Mean Square Error: 29465.74
A random forest was used to determine the most significant interaction terms for the stepwise feature set.
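That screening step is not shown above; a minimal sketch of how it may have been done with the randomForest package follows (train.rf and rf.fit are our own names, and the NA factor levels introduced by addNA are relabelled because randomForest will not accept a literal NA level):
# Hypothetical reconstruction of the interaction screening
train.rf <- train.imputed.log
train.rf[] <- lapply(train.rf, function(x) {
  if (is.factor(x)) { levels(x)[is.na(levels(x))] <- "None"; x } else x
})
set.seed(2019)
rf.fit <- randomForest(SalePrice ~ ., data = train.rf, importance = TRUE)
# Rank variables by permutation importance; the top variable (presumably
# OverallQual, given the interactions below) is then crossed with the others
head(importance(rf.fit)[order(-importance(rf.fit)[, "%IncMSE"]), ], 10)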
# Linear model with stepwise selected features (log SalePrice) (with interactions)
test.imputed$SalePrice <- NULL
stepwise.int.lm <- lm(SalePrice ~ MSZoning + LotFrontage + LotConfig +
    HouseStyle + OverallCond + YearBuilt + LandSlope + Condition1 + Condition2 +
    YearRemodAdd + Foundation + BsmtFinSF1 + Heating + RoofMatl +
    HeatingQC + CentralAir + X2ndFlrSF + BsmtFullBath +
    KitchenQual + TotRmsAbvGrd + Functional + Fireplaces + GarageYrBlt +
    GarageCars + GarageArea + GarageCond + WoodDeckSF + EnclosedPorch +
    X3SsnPorch + ScreenPorch + PoolQC + Fence + SaleType + SaleCondition +
    TotalBsmtSF + KitchenAbvGr + OverallQual:X1stFlrSF + OverallQual:Neighborhood +
    OverallQual:LotArea + OverallQual:BsmtUnfSF + OverallQual:BsmtFinSF1 +
    OverallQual:YearBuilt + OverallQual:MoSold + OverallQual:YearRemodAdd +
    OverallQual:BsmtExposure + OverallQual:MSSubClass, data = train.imputed.log)
stepwise.int.lm.pred <- exp(predict(stepwise.int.lm, test.imputed))
cat("Bias: ", mean(stepwise.int.lm.pred-test.y))
Bias: -1018.832
cat("\nMaximum Deviation: ", max(stepwise.int.lm.pred-test.y))
Maximum Deviation: 79370.68
cat("\nMean Absolute Deviation: ", mean(abs(stepwise.int.lm.pred-test.y)))
Mean Absolute Deviation: 15081.78
cat("\nMean Square Error: ", mean((stepwise.int.lm.pred-test.y)**2))
Mean Square Error: 456506704
cat("\nRoot Mean Square Error: ", sqrt(mean((stepwise.int.lm.pred-test.y)**2)))
Root Mean Square Error: 21366.02
summary(stepwise.int.lm)
Call:
lm(formula = interaction.fit <- lm(data = train.imputed, SalePrice ~
MSZoning + LotFrontage + LotConfig + LandSlope + Condition1 +
Condition2 + HouseStyle + OverallCond + YearBuilt + YearRemodAdd +
RoofMatl + Foundation + BsmtFinSF1 + Heating + HeatingQC +
CentralAir + X2ndFlrSF + BsmtFullBath + KitchenQual +
TotRmsAbvGrd + Functional + Fireplaces + GarageYrBlt +
GarageCars + GarageArea + GarageCond + WoodDeckSF + EnclosedPorch +
X3SsnPorch + ScreenPorch + PoolQC + Fence + SaleType +
SaleCondition + TotalBsmtSF + KitchenAbvGr + OverallQual:X1stFlrSF +
OverallQual:Neighborhood + OverallQual:LotArea + OverallQual:BsmtUnfSF +
OverallQual:BsmtFinSF1 + OverallQual:YearBuilt + OverallQual:MoSold +
OverallQual:YearRemodAdd + OverallQual:BsmtExposure +
OverallQual:MSSubClass), data = train.imputed.log)
Residuals:
Min 1Q Median 3Q Max
-0.62615 -0.05029 0.00291 0.05886 0.53931
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.804e+00 1.122e+00 1.608 0.108238
MSZoningFV 5.828e-01 7.264e-02 8.023 4.28e-15 ***
MSZoningRH 4.677e-01 6.827e-02 6.851 1.59e-11 ***
MSZoningRL 4.861e-01 5.644e-02 8.613 < 2e-16 ***
MSZoningRM 4.571e-01 5.489e-02 8.328 4.24e-16 ***
LotFrontage 7.537e-04 2.656e-04 2.837 0.004680 **
LotConfigCulDSac 6.280e-02 2.189e-02 2.868 0.004248 **
LotConfigFR2 -3.125e-02 2.674e-02 -1.169 0.242913
LotConfigFR3 -3.626e-02 9.594e-02 -0.378 0.705616
LotConfigInside -9.526e-03 1.176e-02 -0.810 0.418086
LandSlopeMod 4.208e-02 2.493e-02 1.688 0.091833 .
LandSlopeSev 7.570e-02 7.489e-02 1.011 0.312409
Condition1Feedr 4.735e-02 3.047e-02 1.554 0.120578
Condition1Norm 7.628e-02 2.363e-02 3.228 0.001302 **
Condition1PosA 4.644e-02 6.314e-02 0.736 0.462256
Condition1PosN 9.158e-02 4.546e-02 2.015 0.044318 *
Condition1RRAe -2.485e-02 5.215e-02 -0.476 0.633911
Condition1RRAn 4.030e-02 4.480e-02 0.900 0.368650
Condition1RRNe -3.793e-02 8.888e-02 -0.427 0.669672
Condition1RRNn -2.185e-02 8.157e-02 -0.268 0.788872
Condition2Feedr 1.631e-01 1.507e-01 1.083 0.279397
Condition2Norm 1.797e-02 1.303e-01 0.138 0.890400
Condition2PosN -7.096e-01 1.678e-01 -4.228 2.66e-05 ***
Condition2RRAn -1.027e-01 1.753e-01 -0.586 0.558187
Condition2RRNn 3.957e-02 1.777e-01 0.223 0.823873
HouseStyle1.5Unf -6.340e-02 4.882e-02 -1.299 0.194510
HouseStyle1Story -9.156e-02 2.358e-02 -3.883 0.000113 ***
HouseStyle2.5Fin 1.014e-01 7.576e-02 1.338 0.181206
HouseStyle2.5Unf 1.896e-02 5.703e-02 0.332 0.739673
HouseStyle2Story -5.203e-02 2.065e-02 -2.520 0.011964 *
HouseStyleSFoyer -9.202e-02 3.739e-02 -2.461 0.014078 *
HouseStyleSLvl -6.694e-02 3.114e-02 -2.150 0.031894 *
OverallCond 3.352e-02 5.734e-03 5.846 7.68e-09 ***
YearBuilt 4.374e-03 1.092e-03 4.006 6.83e-05 ***
YearRemodAdd -5.485e-04 1.033e-03 -0.531 0.595484
RoofMatlCompShg 2.731e+00 2.656e-01 10.279 < 2e-16 ***
RoofMatlMetal 2.637e+00 3.105e-01 8.495 < 2e-16 ***
RoofMatlRoll 2.783e+00 2.902e-01 9.590 < 2e-16 ***
RoofMatlTar&Grv 2.797e+00 2.588e-01 10.807 < 2e-16 ***
RoofMatlWdShake 2.733e+00 2.852e-01 9.583 < 2e-16 ***
RoofMatlWdShngl 2.856e+00 2.650e-01 10.779 < 2e-16 ***
FoundationCBlock 1.121e-02 1.996e-02 0.562 0.574601
FoundationPConc 4.915e-02 2.147e-02 2.289 0.022369 *
FoundationSlab -5.521e-02 6.023e-02 -0.917 0.359700
FoundationStone 6.305e-02 6.269e-02 1.006 0.314913
FoundationWood 1.653e-02 1.236e-01 0.134 0.893606
BsmtFinSF1 1.910e-04 5.380e-05 3.550 0.000410 ***
HeatingGasA 4.213e-02 1.283e-01 0.328 0.742658
HeatingGasW 1.314e-01 1.355e-01 0.970 0.332463
HeatingGrav -2.163e-01 1.425e-01 -1.518 0.129398
HeatingOthW 2.348e-02 1.574e-01 0.149 0.881458
HeatingWall 1.238e-01 1.477e-01 0.838 0.402259
HeatingQCFa -1.794e-02 3.231e-02 -0.555 0.578856
HeatingQCGd -3.347e-02 1.361e-02 -2.459 0.014173 *
HeatingQCPo -1.086e-01 1.368e-01 -0.794 0.427545
HeatingQCTA -3.837e-02 1.346e-02 -2.851 0.004485 **
CentralAirY 4.524e-02 2.247e-02 2.014 0.044413 *
X2ndFlrSF 1.420e-04 3.133e-05 4.531 6.89e-06 ***
BsmtFullBath 2.236e-02 1.221e-02 1.831 0.067451 .
KitchenQualFa -6.009e-02 4.521e-02 -1.329 0.184220
KitchenQualGd -5.779e-02 2.135e-02 -2.707 0.006957 **
KitchenQualTA -4.373e-02 2.483e-02 -1.761 0.078618 .
TotRmsAbvGrd 2.300e-02 5.174e-03 4.446 1.02e-05 ***
FunctionalMaj2 -2.407e-01 1.071e-01 -2.246 0.024981 *
FunctionalMin1 -4.132e-02 5.234e-02 -0.789 0.430170
FunctionalMin2 3.638e-03 5.152e-02 0.071 0.943723
FunctionalMod -1.492e-01 6.144e-02 -2.429 0.015387 *
FunctionalSev -3.773e-01 1.432e-01 -2.634 0.008613 **
FunctionalTyp 1.786e-02 4.308e-02 0.415 0.678499
Fireplaces 3.624e-02 8.538e-03 4.245 2.48e-05 ***
GarageYrBlt -7.612e-04 3.821e-04 -1.992 0.046707 *
GarageCars 4.111e-02 1.436e-02 2.863 0.004319 **
GarageArea 1.056e-04 5.136e-05 2.055 0.040245 *
GarageCondFa -1.749e-01 1.278e-01 -1.368 0.171716
GarageCondGd -9.761e-02 1.339e-01 -0.729 0.466126
GarageCondPo 3.666e-02 1.456e-01 0.252 0.801312
GarageCondTA -7.495e-02 1.223e-01 -0.613 0.540099
GarageCondNA -8.263e-02 1.259e-01 -0.656 0.511842
WoodDeckSF 1.112e-04 3.778e-05 2.944 0.003346 **
EnclosedPorch 2.256e-04 8.202e-05 2.751 0.006095 **
X3SsnPorch 2.743e-04 1.433e-04 1.915 0.055900 .
ScreenPorch 3.235e-04 7.871e-05 4.110 4.41e-05 ***
PoolQCFa -3.727e-01 1.813e-01 -2.056 0.040188 *
PoolQCGd 3.161e-01 2.030e-01 1.557 0.119977
PoolQCNA -1.970e-01 1.327e-01 -1.484 0.138257
FenceGdWo -5.835e-02 3.185e-02 -1.832 0.067358 .
FenceMnPrv -1.243e-02 2.666e-02 -0.466 0.641242
FenceMnWw -6.348e-02 5.800e-02 -1.095 0.274078
FenceNA 7.583e-04 2.432e-02 0.031 0.975135
SaleTypeCon 5.351e-02 1.198e-01 0.447 0.655369
SaleTypeConLD 1.975e-01 5.999e-02 3.293 0.001042 **
SaleTypeConLI -8.785e-02 9.067e-02 -0.969 0.332929
SaleTypeConLw 6.888e-02 7.949e-02 0.867 0.386508
SaleTypeCWD 1.408e-01 7.390e-02 1.906 0.057070 .
SaleTypeNew 6.309e-01 1.586e-01 3.978 7.66e-05 ***
SaleTypeOth 5.875e-02 7.512e-02 0.782 0.434434
SaleTypeWD -1.272e-02 2.870e-02 -0.443 0.657699
SaleConditionAdjLand 8.640e-02 8.882e-02 0.973 0.331001
SaleConditionAlloca -1.780e-02 5.885e-02 -0.302 0.762403
SaleConditionFamily 9.193e-02 5.226e-02 1.759 0.078993 .
SaleConditionNormal 6.864e-02 1.885e-02 3.642 0.000290 ***
SaleConditionPartial -5.287e-01 1.563e-01 -3.382 0.000759 ***
TotalBsmtSF 1.588e-04 3.468e-05 4.577 5.57e-06 ***
KitchenAbvGr -2.530e-02 2.795e-02 -0.905 0.365684
OverallQual:X1stFlrSF 2.739e-05 4.921e-06 5.567 3.69e-08 ***
OverallQual:NeighborhoodBlueste -6.528e-03 2.119e-02 -0.308 0.758133
OverallQual:NeighborhoodBrDale -1.859e-02 9.835e-03 -1.890 0.059163 .
OverallQual:NeighborhoodBrkSide 8.487e-04 8.478e-03 0.100 0.920288
OverallQual:NeighborhoodClearCr 4.721e-03 7.753e-03 0.609 0.542770
OverallQual:NeighborhoodCollgCr -4.921e-04 5.552e-03 -0.089 0.929389
OverallQual:NeighborhoodCrawfor 1.738e-02 7.112e-03 2.444 0.014777 *
OverallQual:NeighborhoodEdwards -2.295e-02 6.507e-03 -3.527 0.000447 ***
OverallQual:NeighborhoodGilbert -1.160e-03 6.092e-03 -0.190 0.848984
OverallQual:NeighborhoodIDOTRR -9.217e-03 9.888e-03 -0.932 0.351555
OverallQual:NeighborhoodMeadowV -4.061e-02 1.113e-02 -3.649 0.000283 ***
OverallQual:NeighborhoodMitchel -5.325e-03 6.960e-03 -0.765 0.444462
OverallQual:NeighborhoodNAmes -3.707e-03 6.309e-03 -0.588 0.556943
OverallQual:NeighborhoodNoRidge 6.057e-03 6.319e-03 0.959 0.338115
OverallQual:NeighborhoodNPkVill -2.408e-03 1.022e-02 -0.235 0.813900
OverallQual:NeighborhoodNridgHt 1.338e-02 5.534e-03 2.418 0.015879 *
OverallQual:NeighborhoodNWAmes -1.625e-03 6.304e-03 -0.258 0.796600
OverallQual:NeighborhoodOldTown -5.875e-03 8.291e-03 -0.709 0.478810
OverallQual:NeighborhoodSawyer -9.485e-03 6.922e-03 -1.370 0.171020
OverallQual:NeighborhoodSawyerW -2.044e-03 6.092e-03 -0.336 0.737331
OverallQual:NeighborhoodSomerst -2.759e-03 7.837e-03 -0.352 0.724872
OverallQual:NeighborhoodStoneBr 2.228e-02 7.037e-03 3.166 0.001611 **
OverallQual:NeighborhoodSWISU 2.798e-04 8.639e-03 0.032 0.974171
OverallQual:NeighborhoodTimber 4.355e-03 6.181e-03 0.705 0.481243
OverallQual:NeighborhoodVeenker 5.475e-03 1.279e-02 0.428 0.668703
OverallQual:LotArea 1.384e-07 9.581e-08 1.444 0.149109
OverallQual:BsmtUnfSF -1.495e-05 4.824e-06 -3.099 0.002019 **
BsmtFinSF1:OverallQual -2.948e-05 8.880e-06 -3.320 0.000948 ***
YearBuilt:OverallQual -3.135e-04 1.802e-04 -1.740 0.082318 .
OverallQual:MoSold -1.338e-04 2.374e-04 -0.563 0.573296
YearRemodAdd:OverallQual 3.313e-04 1.783e-04 1.859 0.063480 .
OverallQual:BsmtExposureGd 4.287e-03 2.830e-03 1.515 0.130277
OverallQual:BsmtExposureMn -1.859e-03 3.017e-03 -0.616 0.537908
OverallQual:BsmtExposureNo -1.124e-03 2.067e-03 -0.544 0.586821
OverallQual:BsmtExposureNA 1.607e-02 1.072e-02 1.499 0.134331
OverallQual:MSSubClass -5.146e-05 2.815e-05 -1.828 0.067943 .
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.1126 on 708 degrees of freedom
Multiple R-squared: 0.9382, Adjusted R-squared: 0.9261
F-statistic: 77.34 on 139 and 708 DF, p-value: < 2.2e-16
Based on our findings, we see a few things. The cross-validated selection of lambda does not help the LASSO Regression here: it performs worse than each of the stepwise-selected linear models, though still far better than the PCA model and the benchmark. Training on the log of SalePrice improves model performance the most. There are only a few ordinal variables, so there is little to be gained from converting them from numeric to categorical variables. In the end, we went with the multiple linear regression model using the stepwise feature set plus interaction terms, as it exhibited the most favorable performance metrics on the validation set.
Now let’s check the diagnostics for this chosen model, the stepwise linear model with interaction terms.
#Residual Plots
plot(stepwise.int.lm)
not plotting observations with leverage one:
6, 76, 157, 194, 330, 351, 357, 397, 454, 592, 687, 689, 739, 755, 766, 807, 831
# Plot of predicted versus true response test values
plot(test.y,stepwise.int.lm.pred, xlab="Observed Values", ylab="Predicted Values")
title("Observed vs. Predicted Values")
abline(0,1, col = 'red')
Final model performance on the test set
# Importing the data
test.data <- read.csv("test.csv")
# Candidate 'cols.keep' subset (drafted but left out of the final pipeline):
#cols.keep <- c("MSZoning", "LotFrontage", "LotConfig", "LandSlope", "Condition1",
#  "HouseStyle", "OverallCond", "YearBuilt", "YearRemodAdd",
#  "Foundation", "BsmtFinSF1", "Heating", "HeatingQC",
#  "CentralAir", "X2ndFlrSF", "BsmtFullBath", "KitchenQual",
#  "TotRmsAbvGrd", "Functional", "Fireplaces", "GarageYrBlt",
#  "GarageCars", "GarageArea", "GarageCond", "WoodDeckSF", "EnclosedPorch",
#  "X3SsnPorch", "ScreenPorch", "PoolQC", "Fence", "SaleType",
#  "SaleCondition", "TotalBsmtSF", "KitchenAbvGr", "OverallQual", "X1stFlrSF",
#  "Neighborhood", "LotArea", "BsmtUnfSF", "BsmtFinSF1", "YearBuilt", "MoSold",
#  "YearRemodAdd", "BsmtExposure", "MSSubClass")
# Separating the predictors from the target (SalePrice is column 81)
test.data.x <- test.data[,-81]
test.data.y <- test.data[,81]
Any missing values?
(sum(is.na(test.data.x))/(nrow(test.data.x)*ncol(test.data.x)))*100
[1] 5.975
# Adding 'NA' as a factor level
col_list <- c('Alley', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature')
test.data.x[col_list] <- lapply(test.data.x[col_list], addNA)
(sum(is.na(test.data.x))/(nrow(test.data.x)*ncol(test.data.x)))*100
[1] 0.309375
# Imputing missing values in the final test set
final.test.mids <- mice(test.data.x, m=1, method='cart', seed=2019)
iter imp variable
1 1 LotFrontage MasVnrType MasVnrArea Electrical GarageYrBlt
2 1 LotFrontage MasVnrType MasVnrArea Electrical GarageYrBlt
3 1 LotFrontage MasVnrType MasVnrArea Electrical GarageYrBlt
4 1 LotFrontage MasVnrType MasVnrArea Electrical GarageYrBlt
5 1 LotFrontage MasVnrType MasVnrArea Electrical GarageYrBlt
Number of logged events: 26
test.data.x <- complete(final.test.mids,1)
(sum(is.na(test.data.x))/(nrow(test.data.x)*ncol(test.data.x)))*100
[1] 0
test.data <- cbind(test.data.x, test.data.y)
#test.data[test.data$Condition2=="PosA" | test.data$Condition2=="RRAe",]
#test.data[test.data$RoofMatl=="Membran",]
# Dropping the three observations (identified above) whose 'Condition2' or
# 'RoofMatl' levels never appear in the training data
test.data <- test.data[-c(77,158,337),]
new.test.x <- test.data[,-81]
new.test.y <- test.data[,81]
final.pred <- exp(predict(stepwise.int.lm, new.test.x))
cat("Bias: ", mean(final.pred-new.test.y))
Bias: 1628.562
cat("\nMaximum Deviation: ", max(final.pred-new.test.y))
Maximum Deviation: 98258.15
cat("\nMean Absolute Deviation: ", mean(abs(final.pred-new.test.y)))
Mean Absolute Deviation: 14952.98
cat("\nMean Square Error: ", mean((final.pred-new.test.y)**2))
Mean Square Error: 476726595
cat("\nRoot Mean Square Error: ", sqrt(mean((final.pred-new.test.y)**2)))
Root Mean Square Error: 21834.07
Three observations were removed from the final test set because they contained levels of ‘Condition2’ and ‘RoofMatl’ that were not accounted for by the model.