The dataset used for our project is from Kaggle and our focus is to predict the sales price of individual residential property in Ames, Iowa. The dataset consists of 2919 observations with 80 different variables which describe the quality and quantity of the many physical attributes of the property of which 23 are nominal, 23 are ordinal, 14 are discrete and 20 are continuous.
Kaggle has split the complete data into two datasets in CSV format. The train dataset consists of 1460 observations with 80 predictor variables and SalePrice as the target variable. The test dataset consists of 1459 observations with the same 80 predictor variables but no SalePrice value. The train dataset is used to build an efficient and accurate model to predict the missing values of SalePrice in the test dataset.
The ultimate aim of this report is to analyze the given data, apply methods to clean the data, remove ‘NA’ values and choose strong predictors to build a model that provides the most accurate predictions for SalePrice of houses.
library(xlsx)
library(psych)
library(ggplot2)
library(gridExtra)
library(MASS)
library(caret)
library(missForest)
library(e1071)
library(knitr)
# Work from the project folder that holds the two Kaggle CSV files.
setwd("/Users/siddharth/Desktop/Work/Sem 2/Stats and Predictive Analytics/House_Prices")

# Read both files with text columns kept as character, and drop the first
# column of each (presumably the row Id, which is not a predictor).
train <- read.csv("train.csv", stringsAsFactors = FALSE)[, -1]
test <- read.csv("test.csv", stringsAsFactors = FALSE)[, -1]

# Give the test rows a placeholder target so both frames share the same
# columns, then stack them (train rows first) for shared cleaning.
test[["SalePrice"]] <- 0
data <- rbind(train, test)
# Convert to factors

# For these columns NA means "the house does not have this feature", so NA is
# replaced by an explicit label before any factor conversion. That keeps the
# absence as a category of its own instead of leaving it to be imputed later.
na_labels <- c(
  Alley = "No",
  BsmtQual = "No",
  BsmtCond = "No",
  BsmtExposure = "NoBase",
  BsmtFinType1 = "No",
  BsmtFinType2 = "No",
  FireplaceQu = "No",
  GarageType = "No",
  GarageFinish = "No",
  GarageQual = "No",
  GarageCond = "No",
  PoolQC = "No",
  Fence = "No",
  MiscFeature = "None"
)
for (column_name in names(na_labels)) {
  column_is_na <- is.na(data[[column_name]])
  data[[column_name]][column_is_na] <- na_labels[[column_name]]
}

# Unordered categorical columns. BsmtQual, BsmtCond, FireplaceQu, GarageQual
# and GarageCond are intentionally absent: they stay character here because
# they are turned into numeric scores further down.
nominal_cols <- c(
  "MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour",
  "Utilities", "LotConfig", "Neighborhood", "Condition1", "Condition2",
  "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st",
  "Exterior2nd", "MasVnrType", "ExterQual", "Foundation", "BsmtExposure",
  "BsmtFinType1", "BsmtFinType2", "Heating", "CentralAir", "Electrical",
  "Functional", "GarageType", "GarageFinish", "PavedDrive", "PoolQC",
  "Fence", "MiscFeature", "MoSold", "SaleType", "SaleCondition"
)
data[nominal_cols] <- lapply(data[nominal_cols], factor)

# LandSlope is the only column stored as an ordered factor (Gtl < Mod < Sev).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
data$LandSlope <- factor(
  data$LandSlope,
  levels = c("Gtl", "Mod", "Sev"),
  ordered = TRUE
)
# Neighbourhood bin
# Collapse the individual neighbourhoods into four groups (1 to 4). Rows that
# match none of the lists keep the initial value 0.
data$Neighborhood_bin <- 0
data$Neighborhood_bin[data$Neighborhood %in% c(
  "MeadowV", "BrDale", "IDOTRR", "BrkSide", "Edwards", "OldTown"
)] <- 1
data$Neighborhood_bin[data$Neighborhood %in% c(
  "NPkVill", "Blueste", "SWISU", "Mitchel", "Sawyer", "NAmes"
)] <- 2
data$Neighborhood_bin[data$Neighborhood %in% c(
  "Veenker", "Blmngtn", "SawyerW", "NWAmes", "Gilbert", "CollgCr"
)] <- 3
data$Neighborhood_bin[data$Neighborhood %in% c(
  "ClearCr", "StoneBr", "Timber", "Crawfor", "NoRidge", "Somerst", "NridgHt"
)] <- 4
# Stored as a factor so that models treat the bin as a category, not a number.
data$Neighborhood_bin <- factor(data$Neighborhood_bin)
# Binning GarageType: 1 when the type is "Attchd" or "BuiltIn", 0 otherwise
data$GarageType_bin <- as.numeric(
  data$GarageType == "Attchd" | data$GarageType == "BuiltIn"
)

# Total Living Area: above-ground living area plus basement area
# (GarageArea is currently left out of this sum)
data$TotalArea <- with(data, GrLivArea + TotalBsmtSF)

# Total Porch: sum of the four porch-area columns
data$TotalPorch <- with(
  data,
  OpenPorchSF + X3SsnPorch + EnclosedPorch + ScreenPorch
)

# TotalBath: full baths count as 1, half baths as 0.5, basement included
data$TotalBath <- with(
  data,
  BsmtFullBath + 0.5 * BsmtHalfBath + FullBath + 0.5 * HalfBath
)

# TotalFlrSF: first-floor plus second-floor area
data$TotalFlrSF <- with(data, X1stFlrSF + X2ndFlrSF)

# OverallGrade: overall quality multiplied by overall condition
data$OverallGrade <- with(data, OverallQual * OverallCond)
# Quality codes ordered from worst to best. A code's position in the vector is
# its integer score. Columns that carry the "No" (feature absent) label use
# the longer vector and are shifted down by one so that "No" scores 0.
# Any value not in the lookup (including NA) becomes NA.
quality_labels <- c("Po", "Fa", "TA", "Gd", "Ex")
absent_or_quality <- c("No", quality_labels)

# BsmtGrade
data$BsmtQual <- match(data$BsmtQual, absent_or_quality) - 1L
data$BsmtCond <- match(data$BsmtCond, absent_or_quality) - 1L
data$BsmtGrade <- data$BsmtQual * data$BsmtCond

# GarageGrade
data$GarageQual <- match(data$GarageQual, absent_or_quality) - 1L
data$GarageCond <- match(data$GarageCond, absent_or_quality) - 1L
data$GarageGrade <- data$GarageQual * data$GarageCond

# KitchenScore
data$KitchenQual <- match(data$KitchenQual, quality_labels)
data$KitchenScore <- data$KitchenAbvGr * data$KitchenQual

# FireplaceScore
data$FireplaceQu <- match(data$FireplaceQu, absent_or_quality) - 1L
data$FireplaceScore <- data$Fireplaces * data$FireplaceQu

# HeatingQC
data$HeatingQC <- match(data$HeatingQC, quality_labels)

# ExterCond
data$ExterCond <- match(data$ExterCond, quality_labels)
# Replacing NA's for integer variables
# caret's "medianImpute" pre-processing fills the missing values of each
# numeric column with that column's median. Factor columns are not touched at
# this step and are handled by the missForest step that follows.
med_imp <- predict(preProcess(data, method = "medianImpute"), data)

# Number of NA values still left in each column, largest first.
# is.na() on a data frame already returns a logical matrix, so no sapply()
# is needed; TRUE is spelled out because T can be reassigned.
sort(colSums(is.na(med_imp)), decreasing = TRUE)
## MasVnrType MSZoning Utilities Functional
## 24 4 2 2
## Exterior1st Exterior2nd Electrical SaleType
## 1 1 1 1
## MSSubClass LotFrontage LotArea Street
## 0 0 0 0
## Alley LotShape LandContour LotConfig
## 0 0 0 0
## LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond
## 0 0 0 0
## YearBuilt YearRemodAdd RoofStyle RoofMatl
## 0 0 0 0
## MasVnrArea ExterQual ExterCond Foundation
## 0 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1
## 0 0 0 0
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## 0 0 0 0
## TotalBsmtSF Heating HeatingQC CentralAir
## 0 0 0 0
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 0 0 0 0
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 0 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0
## Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 0 0
## GarageFinish GarageCars GarageArea GarageQual
## 0 0 0 0
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 0 0 0 0
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## 0 0 0 0
## PoolQC Fence MiscFeature MiscVal
## 0 0 0 0
## MoSold YrSold SaleCondition SalePrice
## 0 0 0 0
## Neighborhood_bin GarageType_bin TotalArea TotalPorch
## 0 0 0 0
## TotalBath TotalFlrSF OverallGrade BsmtGrade
## 0 0 0 0
## GarageGrade KitchenScore FireplaceScore
## 0 0 0
# Replacing NA's for factor variables
# missForest imputes with random forests, so its result depends on the random
# number generator. Fixing the seed makes the imputed values, and everything
# fitted on them afterwards, reproducible from run to run. The seed value
# itself is arbitrary.
set.seed(123)
data_imp <- missForest(med_imp)
## missForest iteration 1 in progress...done!
## missForest iteration 2 in progress...done!
## missForest iteration 3 in progress...done!
# Confirm that no NA values remain in the imputed data ($ximp), per column.
sort(colSums(is.na(data_imp$ximp)), decreasing = TRUE)
## MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 0
## Street Alley LotShape LandContour
## 0 0 0 0
## Utilities LotConfig LandSlope Neighborhood
## 0 0 0 0
## Condition1 Condition2 BldgType HouseStyle
## 0 0 0 0
## OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond
## 0 0 0 0
## Foundation BsmtQual BsmtCond BsmtExposure
## 0 0 0 0
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 0 0 0 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 0 0 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath
## 0 0 0 0
## FullBath HalfBath BedroomAbvGr KitchenAbvGr
## 0 0 0 0
## KitchenQual TotRmsAbvGrd Functional Fireplaces
## 0 0 0 0
## FireplaceQu GarageType GarageYrBlt GarageFinish
## 0 0 0 0
## GarageCars GarageArea GarageQual GarageCond
## 0 0 0 0
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 0 0 0 0
## X3SsnPorch ScreenPorch PoolArea PoolQC
## 0 0 0 0
## Fence MiscFeature MiscVal MoSold
## 0 0 0 0
## YrSold SaleType SaleCondition SalePrice
## 0 0 0 0
## Neighborhood_bin GarageType_bin TotalArea TotalPorch
## 0 0 0 0
## TotalBath TotalFlrSF OverallGrade BsmtGrade
## 0 0 0 0
## GarageGrade KitchenScore FireplaceScore
## 0 0 0
# Split the imputed data back into its two parts. The rows were stacked with
# the 1460 training rows first, followed by the 1459 test rows.
imputed <- data_imp$ximp
train <- imputed[1:1460, ]

# The test rows only ever held a placeholder SalePrice, so that column is
# dropped. It is selected by name rather than by position (it is column 80)
# so the split does not silently drop the wrong column if columns move.
test <- imputed[1461:2919, names(imputed) != "SalePrice"]

# Remove rows where GrLivArea > 4000
# Guarded because x[-which(cond), ] returns zero rows when nothing matches:
# -integer(0) is an empty index, not "drop nothing".
oversized_rows <- which(train$GrLivArea > 4000)
if (length(oversized_rows) > 0) {
  train <- train[-oversized_rows, ]
}
We combined the train and test datasets to make necessary changes to the variables and perform feature engineering. We began our analysis by checking the correlation of the numeric and categorical variables with SalePrice. This helped shortlist about 30 predictors which were highly correlated with SalePrice. We performed feature engineering on these variables.
We recoded the following factor variables:
Recoding the above variables showed that they predicted SalePrice better.
We created the following new variables:
The above new variables possess more information, have higher correlation with SalePrice and hence tend to perform better in a linear model.
The dataset contains many numeric and categorical variables with “NA” values. The data dictionary helps understand the meaning of “NA” for different categorical variables, e.g. the presence of “NA” in the variable “GarageQual” means that the house does not have a garage. This holds true for most of the other variables. Hence, we replaced the “NA” values in each of these categorical variables with an explicit “feature absent” label (“No” in general, “NoBase” for “BsmtExposure” and “None” for “MiscFeature”). This modification is applied to the categorical variables “Alley”, “BsmtQual”, “BsmtCond”, “BsmtExposure”, “BsmtFinType1”, “BsmtFinType2”, “FireplaceQu”, “GarageType”, “GarageFinish”, “GarageQual”, “GarageCond”, “PoolQC”, “Fence” and “MiscFeature”. The true “NA” values that remained in categorical variables after this replacement were imputed using missForest().
Median impute operation is performed on the numeric variables which include “GarageArea”, “GarageCars”, “TotalBsmtSF”, “BsmtUnfSF”, “BsmtFinSF2”, “BsmtFinSF1”, “BsmtHalfBath”, “BsmtFullBath”, “LotFrontage”, “GarageYrBlt” and “MasVnrArea” which contained “NA” values.
We log-transform skewed variables in the model so that their distributions are closer to normal, which improves model performance.
# Draw a density-scaled histogram of `values` and overlay, in red, the normal
# curve that has the same mean and standard deviation, so the two shapes can
# be compared by eye.
#   values     numeric vector to plot
#   axis_label text for the x axis
#   plot_title text for the plot title
plot_against_normal <- function(values, axis_label, plot_title) {
  ggplot(data.frame(values = values), aes(values)) +
    geom_histogram(aes(y = ..density..)) +
    stat_function(
      fun = dnorm,
      color = "red",
      args = list(mean = mean(values), sd = sd(values))
    ) +
    xlab(axis_label) +
    ggtitle(plot_title) +
    theme(plot.title = element_text(size = 11))
}

# Each variable is shown twice: on its raw scale and after a log transform.
plot_against_normal(
  train$SalePrice, "SalePrice",
  "Plot of SalePrice Distribution"
)
plot_against_normal(
  log(train$SalePrice), "log(SalePrice)",
  "Plot of log(SalePrice) Distribution"
)
plot_against_normal(
  train$LotFrontage, "LotFrontage",
  "Plot of LotFrontage Distribution"
)
plot_against_normal(
  log(train$LotFrontage), "log(LotFrontage)",
  "Plot of log(LotFrontage) Distribution"
)
plot_against_normal(
  train$LotArea, "LotArea",
  "Plot of LotArea Distribution"
)
plot_against_normal(
  log(train$LotArea), "log(LotArea)",
  "Plot of log(LotArea) Distribution"
)
Figures 1, 2 and 3 show that SalePrice, LotFrontage and LotArea are not normally distributed, and that log-transforming them brings each distribution much closer to normal, which results in better model performance.
# Scatter of sale price against combined floor area, with a straight-line
# least-squares fit (no confidence band) drawn through the points.
ggplot(data = train, mapping = aes(x = TotalFlrSF, y = SalePrice)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "TotalFlrSF",
    y = "SalePrice",
    title = "Plot of SalePrice and TotalFlrSF"
  ) +
  theme(plot.title = element_text(size = 11))
Figure 4 shows that TotalFlrSF has a non-linear relationship with SalePrice. Hence, adding a quadratic term for TotalFlrSF improves the fit and performance of the model.
We chose linear model because:
We used R², RMSE, and AIC to compare models with different significant predictors and chose the model giving the best results. We also used anova() to test whether the differences between models were statistically significant.
# Final linear model, fitted with lm() on the cleaned training rows.
# Response: log(SalePrice).
# Predictors: the engineered scores and totals, a quadratic term in
# TotalFlrSF, log-transformed LotFrontage and LotArea, the numeric quality
# codes, several categorical terms, and the interactions listed at the end.
# In an R formula `a * b` expands to a + b + a:b, so main effects that are
# also written out explicitly are only entered once.
# NOTE(review): KitchenQual enters both as a number and as factor(KitchenQual);
# these look collinear, so some coefficients are presumably reported as NA.
# Confirm with summary(linear_model). Term order is left untouched because it
# decides which of the collinear terms is dropped.
linear_model <- (lm(log(SalePrice) ~ OverallGrade + BsmtGrade + GarageGrade +
KitchenScore + FireplaceScore + TotalArea + TotalPorch +
TotalBath + TotalFlrSF + I(TotalFlrSF^2) +
WoodDeckSF + BsmtFinSF1 + log(LotFrontage) +
log(LotArea) + KitchenQual + HeatingQC + ExterCond +
GarageArea + GarageCars + YearBuilt + YearRemodAdd +
factor(CentralAir) + factor(ExterQual) +
factor(BsmtExposure) + factor(Neighborhood_bin) +
factor(SaleCondition) + factor(Neighborhood_bin) * YearBuilt +
log(LotArea)*factor(Neighborhood_bin) +
factor(GarageType_bin) * YearBuilt + YearBuilt * factor(MasVnrType)+
TotalArea * factor(KitchenQual) + YearBuilt * factor(KitchenQual)+
factor(Neighborhood_bin) * OverallGrade +
factor(Neighborhood_bin) * TotalFlrSF,
data = train))
# Sale price against OverallGrade, with one colour and one straight-line fit
# per neighbourhood bin.
ggplot(
  data = train,
  mapping = aes(
    x = OverallGrade,
    y = SalePrice,
    colour = factor(Neighborhood_bin)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "OverallGrade",
    y = "SalePrice",
    title = "Interaction Plot: OverallGrade * Neighborhood_bin"
  ) +
  scale_colour_discrete(
    name = "Neighborhood",
    labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")
  ) +
  theme(plot.title = element_text(size = 11))
Figure 5 shows the effect that higher quality and condition of a house have on its sales price in each of the binned neighborhoods.
# Sale price against TotalFlrSF, with one colour and one straight-line fit
# per neighbourhood bin.
ggplot(
  data = train,
  mapping = aes(
    x = TotalFlrSF,
    y = SalePrice,
    colour = factor(Neighborhood_bin)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "TotalFlrSF",
    y = "SalePrice",
    title = "Interaction Plot: TotalFlrSF * Neighborhood_bin"
  ) +
  scale_colour_discrete(
    name = "Neighborhood",
    labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")
  ) +
  theme(plot.title = element_text(size = 11))
Figure 6 shows the effect that higher square feet area has on sales price of a house in each of the binned neighborhoods.
# YearBuilt * Neighborhood_bin
# Sale price against construction year, with one colour and one straight-line
# fit per neighbourhood bin.
ggplot(
  data = train,
  mapping = aes(
    x = YearBuilt,
    y = SalePrice,
    colour = factor(Neighborhood_bin)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "YearBuilt",
    y = "SalePrice",
    title = "Interaction Plot: YearBuilt * Neighborhood_bin"
  ) +
  scale_colour_discrete(
    name = "Neighborhood",
    labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")
  ) +
  theme(plot.title = element_text(size = 11))
Figure 7 shows the effect of YearBuilt on sales price of a house in each of the binned neighborhoods. A generalization can be made that newer homes tend to have a higher sales price and have a relationship with binned neighborhoods.
# log(LotArea) * Neighborhood_bin
# Sale price against the log of lot area, with one colour and one
# straight-line fit per neighbourhood bin.
ggplot(train, aes(log(LotArea), SalePrice, col = factor(Neighborhood_bin))) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE) +
  # The x axis shows the log-transformed value, so the label says so; it
  # previously read "LotArea", which did not match the plotted scale or title.
  xlab("log(LotArea)") +
  ylab("SalePrice") +
  ggtitle("Interaction Plot: log(LotArea) * Neighborhood_bin") +
  scale_color_discrete(name = "Neighborhood",
                       labels = c("Bin 1", "Bin 2", "Bin 3", "Bin 4")) +
  theme(plot.title = element_text(size = 11))
Figure 8 shows the effect the area of the lot has on sales price of a house in each of the binned neighborhoods. A generalization can be made that houses with higher lot area tend to have a higher sales price and have a relationship with binned neighborhoods.
# YearBuilt * GarageType_bin
# Sale price against construction year, with one colour and one straight-line
# fit per garage-type bin (0 or 1).
ggplot(
  data = train,
  mapping = aes(
    x = YearBuilt,
    y = SalePrice,
    colour = factor(GarageType_bin)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "YearBuilt",
    y = "SalePrice",
    title = "Interaction Plot: YearBuilt * GarageType_bin"
  ) +
  scale_colour_discrete(
    name = "GarageType_bin",
    labels = c("Bin 0", "Bin 1")
  ) +
  theme(plot.title = element_text(size = 11))
Figure 9 shows the effect of YearBuilt on the sales price of a house for each of the binned garage types.
# TotalArea * factor(KitchenQual)
# Sale price against total area, with one colour and one straight-line fit
# per kitchen-quality score. The four legend labels are applied in level
# order, so they assume exactly four distinct scores are present in train.
ggplot(
  data = train,
  mapping = aes(
    x = TotalArea,
    y = SalePrice,
    colour = factor(KitchenQual)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "TotalArea",
    y = "SalePrice",
    title = "Interaction Plot: TotalArea * KitchenQual"
  ) +
  scale_colour_discrete(
    name = "KitchenQual",
    labels = c("Fair", "Typical", "Good", "Excellent")
  ) +
  theme(plot.title = element_text(size = 11))
Figure 10 shows the effect of total area of the house on sales price of a house in each type of Kitchen Quality.
# YearBuilt * factor(KitchenQual)
# Sale price against construction year, with one colour and one straight-line
# fit per kitchen-quality score. The four legend labels are applied in level
# order, so they assume exactly four distinct scores are present in train.
ggplot(
  data = train,
  mapping = aes(
    x = YearBuilt,
    y = SalePrice,
    colour = factor(KitchenQual)
  )
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    x = "YearBuilt",
    y = "SalePrice",
    title = "Interaction Plot: YearBuilt * KitchenQual"
  ) +
  scale_colour_discrete(
    name = "KitchenQual",
    labels = c("Fair", "Typical", "Good", "Excellent")
  ) +
  theme(plot.title = element_text(size = 11))
Figure 11 shows the effect of YearBuilt on sales price of a house in each type of Kitchen Quality.
# Root mean squared error between predictions and observed values.
#   yhat  numeric vector of predicted values
#   y     numeric vector of observed values
# Returns a single number: the square root of the mean squared difference.
rmse <- function(yhat, y) {
  squared_error <- (yhat - y)^2
  sqrt(mean(squared_error))
}
In sample performance for the model is:
# Fit the model through caret's train() with method = "lm" and store the
# result in `basic`.
# The formula has the same set of terms as the lm() call that builds
# `linear_model`, written one term per line and in a slightly different order.
# `train(...)` is the caret function, while `data = train` is the data frame
# of the same name; R resolves the call to the function.
# NOTE(review): no trControl is passed, so caret presumably falls back to its
# default resampling scheme. Confirm that this is the intended validation
# set-up.
basic <- train(log(SalePrice) ~ OverallGrade +
BsmtGrade +
GarageGrade +
KitchenScore +
FireplaceScore +
TotalArea +
TotalPorch +
TotalBath +
TotalFlrSF +
I(TotalFlrSF^2) +
log(LotArea) +
WoodDeckSF +
log(LotFrontage) +
BsmtFinSF1 +
KitchenQual +
HeatingQC +
ExterCond +
GarageArea +
GarageCars +
YearBuilt +
YearRemodAdd +
factor(CentralAir) +
factor(ExterQual) +
factor(BsmtExposure) +
factor(Neighborhood_bin) +
factor(SaleCondition) +
factor(Neighborhood_bin) * YearBuilt +
log(LotArea)*factor(Neighborhood_bin) +
factor(GarageType_bin) * YearBuilt +
YearBuilt * factor(MasVnrType) +
YearBuilt * factor(KitchenQual) +
TotalArea * factor(KitchenQual) +
factor(Neighborhood_bin) * OverallGrade +
factor(Neighborhood_bin) * TotalFlrSF,
data = train, method = "lm")
Out of sample performance for the model is:
From the above values we can see that the in-sample and out-of-sample R² values are similar; hence, the model is not overfitted to the training set.
Kaggle score of 0.11851 was received by submitting predicted values for test set.
** The End **