Introduction

The dataset used for our project is from Kaggle and our focus is to predict the sales price of individual residential property in Ames, Iowa. The dataset consists of 2919 observations with 80 different variables which describe the quality and quantity of the many physical attributes of the property of which 23 are nominal, 23 are ordinal, 14 are discrete and 20 are continuous.

Kaggle has split the complete data into two datasets in CSV format. The train dataset consists of 1460 observations with 80 predictor variables plus SalePrice, the target variable. The test dataset consists of 1459 observations with the same 80 predictor variables but no SalePrice value. The train dataset is used to build an efficient and accurate model that predicts the missing SalePrice values in the test dataset.

The ultimate aim of this report is to analyze the given data, apply methods to clean the data, remove ‘NA’ values and choose strong predictors to build a model that provides the most accurate predictions for SalePrice of houses.

library(xlsx)
library(psych)
library(ggplot2)
library(gridExtra)
library(MASS) 
library(caret)
library(missForest)
library(e1071)
library(knitr)

# Work from the project folder that holds train.csv and test.csv.
setwd("/Users/siddharth/Desktop/Work/Sem 2/Stats and Predictive Analytics/House_Prices")

# Read both files without automatic factor conversion and drop the first
# column of each (presumably the Kaggle Id -- confirm against the CSV header).
train <- read.csv("train.csv", stringsAsFactors = FALSE)[-1]
test <- read.csv("test.csv", stringsAsFactors = FALSE)[-1]

# Give test a placeholder SalePrice of 0 so its columns line up with train.
test$SalePrice <- 0

# Stack train on top of test so the recoding below is applied to every row.
data <- rbind(train, test)

# Convert to factors

# Columns whose NA is relabelled with an explicit "feature absent" value
# before any conversion (name = column, value = replacement label).
# BsmtExposure gets "NoBase" rather than "No" -- presumably because "No" is
# already a genuine BsmtExposure value; confirm against the data dictionary.
absent_labels <- c(
  Alley = "No",
  BsmtQual = "No",
  BsmtCond = "No",
  BsmtExposure = "NoBase",
  BsmtFinType1 = "No",
  BsmtFinType2 = "No",
  FireplaceQu = "No",
  GarageType = "No",
  GarageFinish = "No",
  GarageQual = "No",
  GarageCond = "No",
  PoolQC = "No",
  Fence = "No",
  MiscFeature = "None"
)
for (col in names(absent_labels)) {
  data[[col]][is.na(data[[col]])] <- absent_labels[[col]]
}

# Columns converted to plain (unordered) factors. BsmtQual, BsmtCond,
# FireplaceQu, GarageQual and GarageCond are relabelled above but deliberately
# left as character here; they are not part of this list.
nominal_cols <- c(
  "MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour",
  "Utilities", "LotConfig", "Neighborhood", "Condition1", "Condition2",
  "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
  "Exterior2nd", "MasVnrType", "ExterQual", "Foundation", "BsmtExposure",
  "BsmtFinType1", "BsmtFinType2", "Heating", "CentralAir", "Electrical",
  "Functional", "GarageType", "GarageFinish", "PavedDrive", "PoolQC",
  "Fence", "MiscFeature", "MoSold", "SaleType", "SaleCondition"
)
data[nominal_cols] <- lapply(data[nominal_cols], factor)

# LandSlope is the one ordered factor: Gtl < Mod < Sev.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data$LandSlope <- factor(data$LandSlope,
                         levels = c("Gtl", "Mod", "Sev"), ordered = TRUE)

# Neighbourhood bin
# Each list element holds the neighbourhoods of one bin; the element's
# position (1-4) is the bin number. Anything not listed keeps the initial 0.
neighborhood_groups <- list(
  c("MeadowV", "BrDale", "IDOTRR", "BrkSide", "Edwards", "OldTown"),
  c("NPkVill", "Blueste", "SWISU", "Mitchel", "Sawyer", "NAmes"),
  c("Veenker", "Blmngtn", "SawyerW", "NWAmes", "Gilbert", "CollgCr"),
  c("ClearCr", "StoneBr", "Timber", "Crawfor", "NoRidge", "Somerst", "NridgHt")
)

data$Neighborhood_bin <- 0
for (bin in seq_along(neighborhood_groups)) {
  in_group <- data$Neighborhood %in% neighborhood_groups[[bin]]
  data$Neighborhood_bin[in_group] <- bin
}

# Store the bin as a factor so it is treated as categorical.
data$Neighborhood_bin <- factor(data$Neighborhood_bin)

# Derived predictors built from the raw columns.
# NOTE(review): these are computed before the NA imputation further down, so
# an NA in any component gives an NA in the derived column -- confirm that
# imputing the derived value independently of its components is intended.

# Binning GarageType: 1 for attached or built-in garages, 0 for anything else
attached_or_builtin <- data$GarageType == "Attchd" | data$GarageType == "BuiltIn"
data$GarageType_bin <- ifelse(attached_or_builtin, 1, 0)

# Total Living Area: above-ground living area plus basement area
# (GarageArea is not included)
data$TotalArea <- with(data, GrLivArea + TotalBsmtSF)

# Total Porch: sum of the four porch-area columns
data$TotalPorch <- with(
  data,
  OpenPorchSF + X3SsnPorch + EnclosedPorch + ScreenPorch
)

# TotalBath: full baths count 1, half baths count 0.5, basement included
data$TotalBath <- with(
  data,
  BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath
)

# TotalFlrSF: first plus second floor area
data$TotalFlrSF <- with(data, X1stFlrSF + X2ndFlrSF)

# OverallGrade: overall quality multiplied by overall condition
data$OverallGrade <- with(data, OverallQual * OverallCond)

# BsmtGrade
# Replace the basement quality/condition labels with the scores 0-5
# ("No" = 0 ... "Ex" = 5), convert both columns to integer, then multiply
# them into a single grade.
basement_scores <- c(No = 0, Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
for (col in c("BsmtQual", "BsmtCond")) {
  for (label in names(basement_scores)) {
    data[[col]][which(data[[col]] == label)] <- basement_scores[[label]]
  }
  # Any value that is not one of the labels above becomes NA here.
  data[[col]] <- as.integer(data[[col]])
}

data$BsmtGrade <- data$BsmtQual * data$BsmtCond

# GarageGrade
# Replace the garage quality/condition labels with the scores 0-5
# ("No" = 0 ... "Ex" = 5), convert both columns to integer, then multiply
# them into a single grade.
garage_scores <- c(No = 0, Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
for (col in c("GarageQual", "GarageCond")) {
  for (label in names(garage_scores)) {
    data[[col]][which(data[[col]] == label)] <- garage_scores[[label]]
  }
  # Any value that is not one of the labels above becomes NA here.
  data[[col]] <- as.integer(data[[col]])
}

data$GarageGrade <- data$GarageQual * data$GarageCond

# KitchenScore
# Replace the kitchen quality labels with the scores 1-5 ("Po" = 1 ...
# "Ex" = 5; there is no 0 level for this column), convert to integer, and
# weight the score by the number of above-ground kitchens.
kitchen_scores <- c(Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
for (label in names(kitchen_scores)) {
  data$KitchenQual[which(data$KitchenQual == label)] <- kitchen_scores[[label]]
}
data$KitchenQual <- as.integer(data$KitchenQual)

data$KitchenScore <- data$KitchenAbvGr * data$KitchenQual

# FireplaceScore
# Replace the fireplace quality labels with the scores 0-5 ("No" = 0 ...
# "Ex" = 5), convert to integer, and weight the score by the number of
# fireplaces.
fireplace_scores <- c(No = 0, Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
for (label in names(fireplace_scores)) {
  data$FireplaceQu[which(data$FireplaceQu == label)] <- fireplace_scores[[label]]
}
data$FireplaceQu <- as.integer(data$FireplaceQu)

data$FireplaceScore <- data$Fireplaces * data$FireplaceQu

# HeatingQC
# Replace the heating quality labels with the scores 1-5 ("Po" = 1 ...
# "Ex" = 5) and store the column as integer.
heating_scores <- c(Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
for (label in names(heating_scores)) {
  data$HeatingQC[which(data$HeatingQC == label)] <- heating_scores[[label]]
}
data$HeatingQC <- as.integer(data$HeatingQC)

# ExterCond
# Replace the exterior condition labels with the scores 1-5 ("Po" = 1 ...
# "Ex" = 5) and store the column as integer.
exterior_scores <- c(Po = 1, Fa = 2, TA = 3, Gd = 4, Ex = 5)
for (label in names(exterior_scores)) {
  data$ExterCond[which(data$ExterCond == label)] <- exterior_scores[[label]]
}
data$ExterCond <- as.integer(data$ExterCond)

# Replacing NA's for numeric variables
# caret's "medianImpute" pre-processing is fitted on `data` and applied back
# to it; the count below shows which columns still contain NA afterwards.
med_imp <- predict(preProcess(data, method = "medianImpute"), data)

# NA count per column, largest first. is.na() on a data frame already returns
# a logical matrix, so no sapply() is needed; TRUE is spelled out because T is
# an ordinary, reassignable variable.
sort(colSums(is.na(med_imp)), decreasing = TRUE)

##       MasVnrType         MSZoning        Utilities       Functional 
##               24                4                2                2 
##      Exterior1st      Exterior2nd       Electrical         SaleType 
##                1                1                1                1 
##       MSSubClass      LotFrontage          LotArea           Street 
##                0                0                0                0 
##            Alley         LotShape      LandContour        LotConfig 
##                0                0                0                0 
##        LandSlope     Neighborhood       Condition1       Condition2 
##                0                0                0                0 
##         BldgType       HouseStyle      OverallQual      OverallCond 
##                0                0                0                0 
##        YearBuilt     YearRemodAdd        RoofStyle         RoofMatl 
##                0                0                0                0 
##       MasVnrArea        ExterQual        ExterCond       Foundation 
##                0                0                0                0 
##         BsmtQual         BsmtCond     BsmtExposure     BsmtFinType1 
##                0                0                0                0 
##       BsmtFinSF1     BsmtFinType2       BsmtFinSF2        BsmtUnfSF 
##                0                0                0                0 
##      TotalBsmtSF          Heating        HeatingQC       CentralAir 
##                0                0                0                0 
##        X1stFlrSF        X2ndFlrSF     LowQualFinSF        GrLivArea 
##                0                0                0                0 
##     BsmtFullBath     BsmtHalfBath         FullBath         HalfBath 
##                0                0                0                0 
##     BedroomAbvGr     KitchenAbvGr      KitchenQual     TotRmsAbvGrd 
##                0                0                0                0 
##       Fireplaces      FireplaceQu       GarageType      GarageYrBlt 
##                0                0                0                0 
##     GarageFinish       GarageCars       GarageArea       GarageQual 
##                0                0                0                0 
##       GarageCond       PavedDrive       WoodDeckSF      OpenPorchSF 
##                0                0                0                0 
##    EnclosedPorch       X3SsnPorch      ScreenPorch         PoolArea 
##                0                0                0                0 
##           PoolQC            Fence      MiscFeature          MiscVal 
##                0                0                0                0 
##           MoSold           YrSold    SaleCondition        SalePrice 
##                0                0                0                0 
## Neighborhood_bin   GarageType_bin        TotalArea       TotalPorch 
##                0                0                0                0 
##        TotalBath       TotalFlrSF     OverallGrade        BsmtGrade 
##                0                0                0                0 
##      GarageGrade     KitchenScore   FireplaceScore 
##                0                0                0

# Replacing NA's for factor variables
# SalePrice is withheld from the missForest imputation: it is the modelling
# target, and the test rows only carry a 0 placeholder in that column, so it
# should not influence the imputed predictor values.
# NOTE(review): missForest is randomised; consider fixing a seed upstream if
# the imputed values need to be reproducible.
impute_input <- med_imp[, names(med_imp) != "SalePrice", drop = FALSE]
data_imp <- missForest(impute_input)

# Put SalePrice back and restore the original column order, so code that
# indexes data_imp$ximp by name or by position keeps working.
data_imp$ximp <- cbind(data_imp$ximp,
                       SalePrice = med_imp$SalePrice)[, names(med_imp)]

##   missForest iteration 1 in progress...done!
##   missForest iteration 2 in progress...done!
##   missForest iteration 3 in progress...done!

# NA count per column after missForest, largest first (all zeros expected).
sort(colSums(is.na(data_imp$ximp)), decreasing = TRUE)

##       MSSubClass         MSZoning      LotFrontage          LotArea 
##                0                0                0                0 
##           Street            Alley         LotShape      LandContour 
##                0                0                0                0 
##        Utilities        LotConfig        LandSlope     Neighborhood 
##                0                0                0                0 
##       Condition1       Condition2         BldgType       HouseStyle 
##                0                0                0                0 
##      OverallQual      OverallCond        YearBuilt     YearRemodAdd 
##                0                0                0                0 
##        RoofStyle         RoofMatl      Exterior1st      Exterior2nd 
##                0                0                0                0 
##       MasVnrType       MasVnrArea        ExterQual        ExterCond 
##                0                0                0                0 
##       Foundation         BsmtQual         BsmtCond     BsmtExposure 
##                0                0                0                0 
##     BsmtFinType1       BsmtFinSF1     BsmtFinType2       BsmtFinSF2 
##                0                0                0                0 
##        BsmtUnfSF      TotalBsmtSF          Heating        HeatingQC 
##                0                0                0                0 
##       CentralAir       Electrical        X1stFlrSF        X2ndFlrSF 
##                0                0                0                0 
##     LowQualFinSF        GrLivArea     BsmtFullBath     BsmtHalfBath 
##                0                0                0                0 
##         FullBath         HalfBath     BedroomAbvGr     KitchenAbvGr 
##                0                0                0                0 
##      KitchenQual     TotRmsAbvGrd       Functional       Fireplaces 
##                0                0                0                0 
##      FireplaceQu       GarageType      GarageYrBlt     GarageFinish 
##                0                0                0                0 
##       GarageCars       GarageArea       GarageQual       GarageCond 
##                0                0                0                0 
##       PavedDrive       WoodDeckSF      OpenPorchSF    EnclosedPorch 
##                0                0                0                0 
##       X3SsnPorch      ScreenPorch         PoolArea           PoolQC 
##                0                0                0                0 
##            Fence      MiscFeature          MiscVal           MoSold 
##                0                0                0                0 
##           YrSold         SaleType    SaleCondition        SalePrice 
##                0                0                0                0 
## Neighborhood_bin   GarageType_bin        TotalArea       TotalPorch 
##                0                0                0                0 
##        TotalBath       TotalFlrSF     OverallGrade        BsmtGrade 
##                0                0                0                0 
##      GarageGrade     KitchenScore   FireplaceScore 
##                0                0                0

# Split the imputed data back into its two parts: rows 1-1460 are the
# training rows, rows 1461-2919 the test rows. The test set drops SalePrice,
# selected by name rather than by a hard-coded column position.
train <- data_imp$ximp[1:1460, ]
test <- data_imp$ximp[1461:2919, names(data_imp$ximp) != "SalePrice"]

# Remove rows where GrLivArea > 4000
# A logical mask is used instead of train[-which(...), ]: when no row exceeds
# the threshold, -which() is an empty index and would drop every row.
is_outlier <- !is.na(train$GrLivArea) & train$GrLivArea > 4000
train <- train[!is_outlier, ]

Data modeling and Cleaning

Analysis of predictors

We combined the train and test datasets to make necessary changes to the variables and perform feature engineering. We began our analysis by checking the correlation of the numeric and categorical variables with SalePrice. This helped shortlist about 30 predictors which were highly correlated with SalePrice. We performed feature engineering on these variables.

Feature Engineering

We recoded the following factor variables:

BsmtQual - Recoded to a range of 0 - 5 where
- 0 = No basement
- 5 = Excellent height of basement
BsmtCond - Recoded to a range of 0 - 5 where
- 0 = No basement
- 5 = Excellent condition of the basement
GarageQual - Recoded to a range of 0 - 5 where
- 0 = No garage
- 5 = Excellent garage quality
GarageCond - Recoded to a range of 0 - 5 where
- 0 = No garage
- 5 = Excellent garage condition
KitchenQual - Recoded to a range of 1 - 5 where
- 1 = Poor kitchen quality
- 5 = Excellent kitchen quality
FireplaceQu - Recoded to a range of 0 - 5 where
- 0 = No fireplace
- 5 = Exceptional masonry fireplace
HeatingQC - Recoded to a range of 1 - 5 where
- 1 = Poor heating quality and condition
- 5 = Excellent heating quality and condition
ExterCond - Recoded to a range of 1 - 5 where
- 1 = Poor condition of the material on the exterior
- 5 = Excellent condition of the material on the exterior

Recoding the above variables showed that they predicted SalePrice better.

We created the following new variables:

TotalArea = GrLivArea + TotalBsmtSF
TotalPorch = OpenPorchSF + EnclosedPorch + X3SsnPorch + ScreenPorch
TotalBath = BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath
TotalFlrSF = X1stFlrSF + X2ndFlrSF
OverallGrade = OverallQual * OverallCond
BsmtGrade = BsmtQual * BsmtCond
GarageGrade = GarageQual * GarageCond
KitchenScore = KitchenAbvGr * KitchenQual
FireplaceScore = Fireplaces * FireplaceQu
Neighborhood_bin = Neighborhood binned according to SalePrice
- Bin 1 = MeadowV, BrDale, IDOTRR, BrkSide, Edwards, OldTown
- Bin 2 = NPkVill, Blueste, SWISU, Mitchel, Sawyer, NAmes
- Bin 3 = Veenker, Blmngtn, SawyerW, NWAmes, Gilbert, CollgCr
- Bin 4 = ClearCr, StoneBr, Timber, Crawfor, NoRidge, Somerst, NridgHt
GarageType_bin = GarageType binned according to Type
- Bin 0 = Detchd, 2Types, Basment, CarPort, No Garage
- Bin 1 = Attchd, BuiltIn

The above new variables possess more information, have higher correlation with SalePrice and hence tend to perform better in a linear model.

Data Cleaning

The dataset contains many numeric and categorical variables with “NA” values. The data dictionary helps understand the meaning of “NA” for different categorical variables, e.g. the presence of “NA” in the variable “GarageQual” means that the house does not have a garage. This holds true for most of the other variables. Hence, we replaced the “NA” values in each of these categorical variables with an explicit label marking the feature as absent (“No” for most variables, “NoBase” for “BsmtExposure” and “None” for “MiscFeature”). This modification is performed on the categorical variables “Alley”, “BsmtQual”, “BsmtCond”, “BsmtExposure”, “BsmtFinType1”, “BsmtFinType2”, “FireplaceQu”, “GarageType”, “GarageFinish”, “GarageQual”, “GarageCond”, “PoolQC”, “Fence” and “MiscFeature”. The true “NA” values that remained in categorical variables after this replacement were imputed using missForest().

Median impute operation is performed on the numeric variables which include “GarageArea”, “GarageCars”, “TotalBsmtSF”, “BsmtUnfSF”, “BsmtFinSF2”, “BsmtFinSF1”, “BsmtHalfBath”, “BsmtFullBath”, “LotFrontage”, “GarageYrBlt” and “MasVnrArea” which contained “NA” values.

Model and Model development

Log Transformation of Variables

We log-transform skewed variables so that their distributions are closer to normal, which gives better model performance.

# Density histogram of SalePrice with a normal curve (same mean and sd as the
# data) drawn in red for comparison.
ggplot(data = train, mapping = aes(x = SalePrice)) +
  geom_histogram(mapping = aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(train$SalePrice), sd = sd(train$SalePrice)),
    color = "red"
  ) +
  labs(title = "Plot of SalePrice Distribution") +
  theme(plot.title = element_text(size = 11))

# Same plot on the log scale.
ggplot(data = train, mapping = aes(x = log(SalePrice))) +
  geom_histogram(mapping = aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(log(train$SalePrice)),
                sd = sd(log(train$SalePrice))),
    color = "red"
  ) +
  labs(title = "Plot of log(SalePrice) Distribution") +
  theme(plot.title = element_text(size = 11))

$\textit{(a) Plot of SalePrice distribution clearly shows the data is not normally distributed but skewed to the right and has a large range. (b) Plot of log(SalePrice) shows taking a log of SalePrice makes the data normally distributed.}$

# Density histogram of LotFrontage with a normal curve (same mean and sd as
# the data) drawn in red for comparison.
ggplot(data = train, mapping = aes(x = LotFrontage)) +
  geom_histogram(mapping = aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(train$LotFrontage), sd = sd(train$LotFrontage)),
    color = "red"
  ) +
  labs(title = "Plot of LotFrontage Distribution") +
  theme(plot.title = element_text(size = 11))

# Same plot on the log scale.
ggplot(data = train, mapping = aes(x = log(LotFrontage))) +
  geom_histogram(mapping = aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(log(train$LotFrontage)),
                sd = sd(log(train$LotFrontage))),
    color = "red"
  ) +
  labs(title = "Plot of log(LotFrontage) Distribution") +
  theme(plot.title = element_text(size = 11))

$\textit{(a) Plot of LotFrontage distribution clearly shows the data is not normally distributed but skewed to the right. (b) Plot of log(LotFrontage) shows taking a log of LotFrontage makes the data more normally distributed.}$

# Density histogram of LotArea with a normal curve (same mean and sd as the
# data) drawn in red for comparison.
ggplot(data = train, mapping = aes(x = LotArea)) +
  geom_histogram(mapping = aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(train$LotArea), sd = sd(train$LotArea)),
    color = "red"
  ) +
  labs(title = "Plot of LotArea Distribution") +
  theme(plot.title = element_text(size = 11))

# Same plot on the log scale.
ggplot(data = train, mapping = aes(x = log(LotArea))) +
  geom_histogram(mapping = aes(y = ..density..)) +
  stat_function(
    fun = dnorm,
    args = list(mean = mean(log(train$LotArea)),
                sd = sd(log(train$LotArea))),
    color = "red"
  ) +
  labs(title = "Plot of log(LotArea) Distribution") +
  theme(plot.title = element_text(size = 11))

$\textit{(a) Plot of LotArea distribution clearly shows the data is not normally distributed but skewed to the right and has a large range. (b) Plot of log(LotArea) shows taking a log of LotArea makes the data more normally distributed.}$

Figure 1, 2 and 3 show that data for SalePrice, LotFrontage and LotArea are not normally distributed and log transforming them makes the data normally distributed which results in better model performance.

Non-linear Variable

# Scatter of SalePrice against TotalFlrSF with a straight-line (lm) fit and
# no confidence band, to show how far the points depart from a line.
ggplot(data = train, mapping = aes(x = TotalFlrSF, y = SalePrice)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "TotalFlrSF",
    y = "SalePrice",
    title = "Plot of SalePrice and TotalFlrSF"
  ) +
  theme(plot.title = element_text(size = 11))

$\textit{Plot of TotalFlrSF and SalePrice shows that their relation is non-linear.}$

Figure 4 shows that TotalFlrSF has a non-linear relation with SalePrice. Hence, adding a quadratic term for TotalFlrSF improves the fit and performance of the model.

Statistical Method Selection

We chose linear model because:

Transforming the outcome variable makes it normally distributed.
Plotting the predictor variables against output variable showed that the relationships are fairly linear.
Linear models are robust to overfitting.

Model

We used R², RMSE, and AIC to compare models with different significant predictors and chose the model giving the best results. We also used anova() to test whether the differences between models were statistically significant.

# Linear model for log(SalePrice) on the engineered scores, area totals,
# log-transformed lot variables, a quadratic TotalFlrSF term, and
# interactions of YearBuilt / lot area / OverallGrade / TotalFlrSF with the
# neighbourhood, garage-type, masonry and kitchen-quality factors.
#
# KitchenQual enters only as factor(KitchenQual). A separate numeric
# KitchenQual term is an exact linear combination of the intercept and the
# factor dummies, so including both makes the fit rank-deficient (one NA
# coefficient) without changing the fitted values.
linear_model <- lm(log(SalePrice) ~ OverallGrade + BsmtGrade + GarageGrade +
                     KitchenScore + FireplaceScore + TotalArea + TotalPorch +
                     TotalBath + TotalFlrSF + I(TotalFlrSF^2) +
                     WoodDeckSF + BsmtFinSF1 + log(LotFrontage) +
                     log(LotArea) + HeatingQC + ExterCond +
                     GarageArea + GarageCars + YearBuilt + YearRemodAdd +
                     factor(CentralAir) + factor(ExterQual) +
                     factor(BsmtExposure) + factor(Neighborhood_bin) +
                     factor(SaleCondition) +
                     factor(Neighborhood_bin) * YearBuilt +
                     log(LotArea) * factor(Neighborhood_bin) +
                     factor(GarageType_bin) * YearBuilt +
                     YearBuilt * factor(MasVnrType) +
                     TotalArea * factor(KitchenQual) +
                     YearBuilt * factor(KitchenQual) +
                     factor(Neighborhood_bin) * OverallGrade +
                     factor(Neighborhood_bin) * TotalFlrSF,
                   data = train)

Interactions

# OverallGrade * Neighborhood_bin
# SalePrice against OverallGrade, one colour and one lm line per
# neighbourhood bin.
ggplot(
  data = train,
  mapping = aes(x = OverallGrade, y = SalePrice,
                colour = factor(Neighborhood_bin))
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "OverallGrade",
    y = "SalePrice",
    title = "Interaction Plot: OverallGrade * Neighborhood_bin"
  ) +
  scale_colour_discrete(name = "Neighborhood",
                        labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for each neighborhood, an improvement in overall quality and condition of the house leads to a non-linear increase in price of the house.}$

Figure 5 shows the effect that higher quality and condition of a house has on sales price of a house in each of the binned neighborhoods.

Increments in sales price with increase in quality and condition of a house is lesser in neighborhoods in bin 2 compared to bin 1.
Increments in sales price with increase in quality and condition of a house is greater in neighborhoods in bin 3 compared to bin 1.
Increments in sales price with increase in quality and condition of a house is higher in neighborhoods in bin 4 compared to bin 1.

# TotalFlrSF * Neighborhood_bin
# SalePrice against TotalFlrSF, one colour and one lm line per
# neighbourhood bin.
ggplot(
  data = train,
  mapping = aes(x = TotalFlrSF, y = SalePrice,
                colour = factor(Neighborhood_bin))
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "TotalFlrSF",
    y = "SalePrice",
    title = "Interaction Plot: TotalFlrSF * Neighborhood_bin"
  ) +
  scale_colour_discrete(name = "Neighborhood",
                        labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for each neighborhood an increase in square feet area of the house leads to a non-linear increase in price of the house.}$

Figure 6 shows the effect that higher square feet area has on sales price of a house in each of the binned neighborhoods.

Increments in sales price with increase in square feet area of the house is lesser in neighborhoods in bin 2 compared to bin 1.
Increments in sales price with increase in square feet area of the house is slightly greater in neighborhoods in bin 3 compared to bin 1.
Increments in sales price with increase in square feet area of the house is greater in neighborhoods in bin 4 compared to bin 1.

# YearBuilt * Neighborhood_bin
# SalePrice against YearBuilt, one colour and one lm line per
# neighbourhood bin.
ggplot(
  data = train,
  mapping = aes(x = YearBuilt, y = SalePrice,
                colour = factor(Neighborhood_bin))
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "YearBuilt",
    y = "SalePrice",
    title = "Interaction Plot: YearBuilt * Neighborhood_bin"
  ) +
  scale_colour_discrete(name = "Neighborhood",
                        labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for each neighborhood, an increase in YearBuilt of the house leads to a non-linear increase in price of the house.}$

Figure 7 shows the effect of YearBuilt on sales price of a house in each of the binned neighborhoods. A generalization can be made that newer homes tend to have a higher sales price and have a relationship with binned neighborhoods.

Increments in sales price with increase in the year the house was built almost remains the same throughout in neighborhoods in bin 1.
Increments in sales price with increase in the year the house was built is greater in the neighborhoods in bin 3 compared to the neighborhoods in bin 2.
Increments in sales price with increase in the year the house was built is about the same for neighborhoods in bin 3 compared to the neighborhoods in bin 4.

# log(LotArea) * Neighborhood_bin
# SalePrice against log(LotArea), one colour and one lm line per
# neighbourhood bin. The x axis is labelled "log(LotArea)" because that is
# the quantity plotted (it matches the plot title).
ggplot(train, aes(log(LotArea), SalePrice, col = factor(Neighborhood_bin))) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  xlab("log(LotArea)") +
  ylab("SalePrice") +
  ggtitle("Interaction Plot: log(LotArea) * Neighborhood_bin") +
  scale_color_discrete(name = "Neighborhood",
                       labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for each neighborhood, an increase in LotArea of the house leads to a non-linear increase in price of the house.}$

Figure 8 shows the effect the area of the lot has on sales price of a house in each of the binned neighborhoods. A generalization can be made that houses with higher lot area tend to have a higher sales price and have a relationship with binned neighborhoods.

Increments in sales price with increase in lot area of the house is lesser in neighborhoods in bin 2 compared to bin 1.
Increments in sales price with increase in lot area of the house is about the same in neighborhoods in bin 3 and bin 1.
Increments in sales price with increase in lot area of the house is greater in neighborhoods in bin 4 compared to bin 1.

# YearBuilt * GarageType_bin
# SalePrice against YearBuilt, one colour and one lm line per garage-type bin
# (0 or 1).
ggplot(
  data = train,
  mapping = aes(x = YearBuilt, y = SalePrice,
                colour = factor(GarageType_bin))
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "YearBuilt",
    y = "SalePrice",
    title = "Interaction Plot: YearBuilt * GarageType_bin"
  ) +
  scale_colour_discrete(name = "GarageType_bin",
                        labels = c("Bin 0", "Bin 1")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for attached and built in garage types, an increase in the year the house was built leads to a non-linear increase in price of the house. But for all other types of garage types, an increase in the year the house was built leads to a non-linear decrease in price of the house.}$

Figure 9 shows the effect of YearBuilt on sales price of a house in each of the binned garage type.

Increments in sales price with increase in the year the house was built is lesser in garage types of bin 0 compared to garage types bin 1.

# TotalArea * factor(KitchenQual)
# SalePrice against TotalArea, one colour and one lm line per kitchen-quality
# score.
# NOTE(review): the four legend labels are matched to the factor levels by
# position, so this assumes exactly four KitchenQual values occur in train,
# in the order Fair, Typical, Good, Excellent -- confirm.
ggplot(
  data = train,
  mapping = aes(x = TotalArea, y = SalePrice, colour = factor(KitchenQual))
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "TotalArea",
    y = "SalePrice",
    title = "Interaction Plot: TotalArea * KitchenQual"
  ) +
  scale_colour_discrete(name = "KitchenQual",
                        labels = c("Fair", "Typical", "Good", "Excellent")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for each type of Kitchen quality, an increase in the total area of the house leads to non-linear increase in price of the house.}$

Figure 10 shows the effect of total area of the house on sales price of a house in each type of Kitchen Quality.

Increments in sales price with increase in the total area of the house is lesser in “Fair” quality kitchens compared to “Typical” quality kitchens.
Increments in sales price with increase in the total area of the house is lesser in “Typical” quality kitchens compared to “Good” quality kitchens.
Increments in sales price with increase in the total area of the house is lesser in “Good” quality kitchens compared to “Excellent” quality kitchens.

# YearBuilt * factor(KitchenQual)
# SalePrice against YearBuilt, one colour and one lm line per kitchen-quality
# score.
# NOTE(review): the four legend labels are matched to the factor levels by
# position, so this assumes exactly four KitchenQual values occur in train,
# in the order Fair, Typical, Good, Excellent -- confirm.
ggplot(
  data = train,
  mapping = aes(x = YearBuilt, y = SalePrice, colour = factor(KitchenQual))
) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  labs(
    x = "YearBuilt",
    y = "SalePrice",
    title = "Interaction Plot: YearBuilt * KitchenQual"
  ) +
  scale_colour_discrete(name = "KitchenQual",
                        labels = c("Fair", "Typical", "Good", "Excellent")) +
  theme(plot.title = element_text(size = 11))

$\textit{Interaction plot shows that for Typical, Good and Excellent quality Kitchen types, an increase in the year the house was built leads to non-linear increase in price of the house. But for Fair quality Kitchen types, an increase in the year the house was built leads to non-linear decrease in price of the house.}$

Figure 11 shows the effect of YearBuilt on sales price of a house in each type of Kitchen Quality.

Increments in sales price with increase in the year the house was built tends to decrease in “Fair” quality kitchens.
Increments in sales price with increase in the year the house was built is lesser in “Typical” quality kitchens compared to “Good” quality kitchens.
Increments in sales price with increase in the year the house was built is lesser in “Good” quality kitchens compared to “Excellent” quality kitchens.

Model Performance

# Root-mean-square error between predictions `yhat` and observed values `y`:
# the square root of the mean squared difference. NA in either input
# propagates to the result.
rmse <- function(yhat, y) {
  squared_error <- (yhat - y)^2
  sqrt(mean(squared_error))
}

In sample performance for the model is:

R² = 0.92
RMSE = 19083.22

# Refit the same linear model through caret so that resampled
# (out-of-sample) performance is reported alongside the fit.
# No trControl is supplied, so caret's default resampling scheme is used.
# NOTE(review): resampling is randomised and no seed is set here, so the
# reported figures will vary slightly between runs.
#
# `train` names both the data frame and caret's function; the call below
# resolves to the function because R skips non-function objects when looking
# up the name being called.
#
# KitchenQual enters only as factor(KitchenQual). A separate numeric
# KitchenQual term is an exact linear combination of the intercept and the
# factor dummies, so including both makes every fit rank-deficient without
# changing the predictions.
basic <- train(log(SalePrice) ~ OverallGrade +
                 BsmtGrade +
                 GarageGrade +
                 KitchenScore +
                 FireplaceScore +
                 TotalArea +
                 TotalPorch +
                 TotalBath +
                 TotalFlrSF +
                 I(TotalFlrSF^2) +
                 log(LotArea) +
                 WoodDeckSF +
                 log(LotFrontage) +
                 BsmtFinSF1 +
                 HeatingQC +
                 ExterCond +
                 GarageArea +
                 GarageCars +
                 YearBuilt +
                 YearRemodAdd +
                 factor(CentralAir) +
                 factor(ExterQual) +
                 factor(BsmtExposure) +
                 factor(Neighborhood_bin) +
                 factor(SaleCondition) +
                 factor(Neighborhood_bin) * YearBuilt +
                 log(LotArea) * factor(Neighborhood_bin) +
                 factor(GarageType_bin) * YearBuilt +
                 YearBuilt * factor(MasVnrType) +
                 YearBuilt * factor(KitchenQual) +
                 TotalArea * factor(KitchenQual) +
                 factor(Neighborhood_bin) * OverallGrade +
                 factor(Neighborhood_bin) * TotalFlrSF,
               data = train, method = "lm")

Out of sample performance for the model is:

R² = 0.91
RMSE = 0.1182699

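From the above values we can see that the R² values in sample and out of sample are similar; hence, the model is not overfitted to the training set. Note that the in-sample RMSE above is expressed in dollars whereas the out-of-sample RMSE is on the log(SalePrice) scale, so the two RMSE values are not directly comparable.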

Kaggle score of 0.11851 was received by submitting predicted values for test set.

                                         ** The End **

Final Project Report

Nikita Bagri & Siddharth Suresh

March 29, 2017