Goal
Predict the sale price of houses.
Dataset Description
In this dataset, there are 1460 observations with 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa. Among explanatory variables, there are 37 integer variables, such as Id, MSSubClass, LotFrontage, and 43 factor variables, such as MSZoning, Street, LotShape. Descriptive analysis and quantitative analysis will use subsets of it depending on models.
First part of this report: Descriptive and Exploratory Analysis
Second part of this report: Predictive Analysis.
#install multiple packages, if not installed
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("ggplot2", "readr", "gplots", "repr", "rpart.plot")
ipak(packages)
## Loading required package: ggplot2
## Loading required package: readr
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
## Loading required package: repr
## Warning: package 'repr' was built under R version 3.5.2
## Loading required package: rpart.plot
## Warning: package 'rpart.plot' was built under R version 3.5.2
## Loading required package: rpart
## ggplot2 readr gplots repr rpart.plot
## TRUE TRUE TRUE TRUE TRUE
setwd("D:/Practice/house-prices-advanced-regression-techniques")
getwd()
## [1] "D:/Practice/house-prices-advanced-regression-techniques"
train <- read.csv("train.csv")
# list rows of data that have missing values
missing_row <- train[!complete.cases(train),]
head(missing_row)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1 60 RL 65 8450 Pave <NA> Reg
## 2 2 20 RL 80 9600 Pave <NA> Reg
## 3 3 60 RL 68 11250 Pave <NA> IR1
## 4 4 70 RL 60 9550 Pave <NA> IR1
## 5 5 60 RL 84 14260 Pave <NA> IR1
## 6 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
nrow(missing_row)
## [1] 1460
Select Variables
Step 1: select variables that may have greater impact on house price
Step 2: build subset of train dataset for prediction.
## show all variable names
var_name <- names(train)
var_name
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
#Here, we select these important variables by creating a vector that contains variable names
select_var <- c('Id','MSZoning','Utilities', 'Neighborhood','BldgType','HouseStyle',
'OverallQual','OverallCond','YearBuilt', 'ExterQual','ExterCond',
'BsmtQual','BsmtCond','TotalBsmtSF','Heating','HeatingQC',
'CentralAir','Electrical','GrLivArea','BedroomAbvGr','KitchenAbvGr',
'KitchenQual','TotRmsAbvGrd','Functional','Fireplaces','FireplaceQu',
'GarageArea','GarageQual','GarageCond','OpenPorchSF','PoolArea',
'Fence','MoSold','YrSold','SaleType','SaleCondition','SalePrice')
# construct subset of train dataset that is used for prediction
select_train <- train[,select_var]
head(select_train)
## Id MSZoning Utilities Neighborhood BldgType HouseStyle OverallQual
## 1 1 RL AllPub CollgCr 1Fam 2Story 7
## 2 2 RL AllPub Veenker 1Fam 1Story 6
## 3 3 RL AllPub CollgCr 1Fam 2Story 7
## 4 4 RL AllPub Crawfor 1Fam 2Story 7
## 5 5 RL AllPub NoRidge 1Fam 2Story 8
## 6 6 RL AllPub Mitchel 1Fam 1.5Fin 5
## OverallCond YearBuilt ExterQual ExterCond BsmtQual BsmtCond TotalBsmtSF
## 1 5 2003 Gd TA Gd TA 856
## 2 8 1976 TA TA Gd TA 1262
## 3 5 2001 Gd TA Gd TA 920
## 4 5 1915 TA TA TA Gd 756
## 5 5 2000 Gd TA Gd TA 1145
## 6 5 1993 TA TA Gd TA 796
## Heating HeatingQC CentralAir Electrical GrLivArea BedroomAbvGr
## 1 GasA Ex Y SBrkr 1710 3
## 2 GasA Ex Y SBrkr 1262 3
## 3 GasA Ex Y SBrkr 1786 3
## 4 GasA Gd Y SBrkr 1717 3
## 5 GasA Ex Y SBrkr 2198 4
## 6 GasA Ex Y SBrkr 1362 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageArea GarageQual GarageCond OpenPorchSF PoolArea Fence MoSold
## 1 548 TA TA 61 0 <NA> 2
## 2 460 TA TA 0 0 <NA> 5
## 3 608 TA TA 42 0 <NA> 9
## 4 642 TA TA 35 0 <NA> 2
## 5 836 TA TA 84 0 <NA> 12
## 6 480 TA TA 30 0 MnPrv 10
## YrSold SaleType SaleCondition SalePrice
## 1 2008 WD Normal 208500
## 2 2007 WD Normal 181500
## 3 2008 WD Normal 223500
## 4 2006 WD Abnorml 140000
## 5 2008 WD Normal 250000
## 6 2009 WD Normal 143000
summary(select_train)
## Id MSZoning Utilities Neighborhood BldgType
## Min. : 1.0 C (all): 10 AllPub:1459 NAmes :225 1Fam :1220
## 1st Qu.: 365.8 FV : 65 NoSeWa: 1 CollgCr:150 2fmCon: 31
## Median : 730.5 RH : 16 OldTown:113 Duplex: 52
## Mean : 730.5 RL :1151 Edwards:100 Twnhs : 43
## 3rd Qu.:1095.2 RM : 218 Somerst: 86 TwnhsE: 114
## Max. :1460.0 Gilbert: 79
## (Other):707
## HouseStyle OverallQual OverallCond YearBuilt ExterQual
## 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872 Ex: 52
## 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 Fa: 14
## 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973 Gd:488
## SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971 TA:906
## SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## ExterCond BsmtQual BsmtCond TotalBsmtSF Heating HeatingQC
## Ex: 3 Ex :121 Fa : 45 Min. : 0.0 Floor: 1 Ex:741
## Fa: 28 Fa : 35 Gd : 65 1st Qu.: 795.8 GasA :1428 Fa: 49
## Gd: 146 Gd :618 Po : 2 Median : 991.5 GasW : 18 Gd:241
## Po: 1 TA :649 TA :1311 Mean :1057.4 Grav : 7 Po: 1
## TA:1282 NA's: 37 NA's: 37 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :6110.0 Wall : 4
##
## CentralAir Electrical GrLivArea BedroomAbvGr KitchenAbvGr
## N: 95 FuseA: 94 Min. : 334 Min. :0.000 Min. :0.000
## Y:1365 FuseF: 27 1st Qu.:1130 1st Qu.:2.000 1st Qu.:1.000
## FuseP: 3 Median :1464 Median :3.000 Median :1.000
## Mix : 1 Mean :1515 Mean :2.866 Mean :1.047
## SBrkr:1334 3rd Qu.:1777 3rd Qu.:3.000 3rd Qu.:1.000
## NA's : 1 Max. :5642 Max. :8.000 Max. :3.000
##
## KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Ex:100 Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24
## Fa: 39 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33
## Gd:586 Median : 6.000 Min1: 31 Median :1.000 Gd :380
## TA:735 Mean : 6.518 Min2: 34 Mean :0.613 Po : 20
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690
## Typ :1360
## GarageArea GarageQual GarageCond OpenPorchSF
## Min. : 0.0 Ex : 3 Ex : 2 Min. : 0.00
## 1st Qu.: 334.5 Fa : 48 Fa : 35 1st Qu.: 0.00
## Median : 480.0 Gd : 14 Gd : 9 Median : 25.00
## Mean : 473.0 Po : 3 Po : 7 Mean : 46.66
## 3rd Qu.: 576.0 TA :1311 TA :1326 3rd Qu.: 68.00
## Max. :1418.0 NA's: 81 NA's: 81 Max. :547.00
##
## PoolArea Fence MoSold YrSold
## Min. : 0.000 GdPrv: 59 Min. : 1.000 Min. :2006
## 1st Qu.: 0.000 GdWo : 54 1st Qu.: 5.000 1st Qu.:2007
## Median : 0.000 MnPrv: 157 Median : 6.000 Median :2008
## Mean : 2.759 MnWw : 11 Mean : 6.322 Mean :2008
## 3rd Qu.: 0.000 NA's :1179 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :738.000 Max. :12.000 Max. :2010
##
## SaleType SaleCondition SalePrice
## WD :1267 Abnorml: 101 Min. : 34900
## New : 122 AdjLand: 4 1st Qu.:129975
## COD : 43 Alloca : 12 Median :163000
## ConLD : 9 Family : 20 Mean :180921
## ConLI : 5 Normal :1198 3rd Qu.:214000
## ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
2.4 Descriptive and exploratory analysis of SalePrice
SalePrice is our target variable and also the dependent variable for prediction. According to the assumptions of Linear Regression, data should be normally distributed. By checking the distribution of SalePrice, we can decide if we need non-linear transformation, like log term, to make better prediction.
2.4.1 Summary of Target Variable: SalePrice
# Five number summary
summary(select_train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
# Draw a higtogram to figure out the distribution of SalePrice
options(scipen=10000)
ggplot(select_train, aes(x = SalePrice, fill = ..count..)) +
geom_histogram(binwidth = 5000) +
ggtitle("Figure 1 Histogram of SalePrice") +
ylab("Count of houses") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.5))
From the histogram above, the distribution of our target variable– SalePrice is skewed to right. Thus, a log term of SalePrice should be generated for linear regression. Here, we name it lSalePrice.
#log term of SalePrice
select_train$lSalePrice <- log(select_train$SalePrice)
# Draw a higtogram to figure out the distribution of log SalePrice
ggplot(select_train, aes(x = lSalePrice, fill = ..count..)) +
geom_histogram(binwidth = 0.05) +
ggtitle("Figure 2 Histogram of log SalePrice") +
ylab("Count of houses") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.5))
Great! After fixing, lSalePrice is normally distributed. We will use this log term of SalePrice later in models.
2.4.2 Explore the distribution of SalePrice by MSZoning
When it comes to housing price, the value of house is usually related to two types of elements: internal and external. Internal elements are the key features of house itself, like total area, the number of rooms. As for External elements, environment is one of the key factors.
First, let’s figure out the variable that can indicates housing environment in our dataset. Here, I choose MSZoning as this indicator. It’s a dummy variable and the identiciation is:
MSZoning: Identifies the general zoning classification of the sale.
Therefore, in this section, I will explore the relationship between MSZoning and our target variable SalePrice.
First, let’s have a close look at MSZoning. Because it’s a dummy variable, I am curious about the total number of houses in each category.
# count house by MSZoning
options(repr.plot.width=5, repr.plot.height=4)
ggplot(select_train, aes(x = MSZoning, fill = MSZoning )) +
geom_bar()+
scale_fill_hue(c = 80)+
ggtitle("Figure 3 Distribution of MSZoning")+
theme(plot.title = element_text(hjust = 0.5),legend.position="right", legend.background = element_rect(fill="grey90",
size=0.5, linetype="solid",
colour ="black"))+
geom_text(stat='count',aes(label=..count..),vjust=-0.25)
# Distribution of MSZoning
table(select_train$MSZoning)
##
## C (all) FV RH RL RM
## 10 65 16 1151 218
From the graph and table above, it is obvious that most of houses in this dataset are built in the area of Residential Low Density(1151 houses), and follows by Residential Medium Density(218 houses). Few houes are built in Commercial, Floating Village and Residential High Density.
Since a large amount of houses comes to the categoreis of Residential Low Density and Residential Medium Density, these two areas should be paid more attention for housing price analysis.
On top of it, let’s add our target variable into analysis. How does housing price look like in each category? Here, I use boxplot to show the distribution of prices in each MSZoning.
# Change plot size to 9 x 6
options(repr.plot.width=9, repr.plot.height=6)
#boxplot of SalePrice by MSZoning
#add average value of SalePrice as red point
ggplot(select_train, aes(x=MSZoning, y=SalePrice, fill=MSZoning)) +
geom_boxplot(alpha=0.3) +
stat_summary(fun.y=mean, geom="point", shape=20, size=4, color="red", fill="red")+
theme(legend.position="none")+
ggtitle("Figure 4 Boxplot of SalePrice by MSZoning")+
theme(plot.title = element_text(hjust = 0.5))
The graph above shows the distribution of SalePrice by MSZoning. The sales in “Floating Village Residential” area have the highest average sale price, and then followed by “Residential Low Density”. While “Commercial” sales have the lowest average sale price.
It is quite strange that commercial area has the lowest average Sale Price while village area has the highest. One possible explanation could be SalePrice is also related to the size of houses. To confirm, let’s explore the average size in these area.
The variable indicates size in this dataset is called GrLivArea.
Definition: Above grade (ground) living area square feet
# Display the average hosue size in each area
library(plyr)
ddply(train, .(MSZoning), summarize, size=mean(GrLivArea))
## MSZoning size
## 1 C (all) 1191.400
## 2 FV 1574.538
## 3 RH 1510.125
## 4 RL 1551.646
## 5 RM 1322.073
It is obvious that the avarage size of houses in Commecial are is much smaller than Floating Village area, which verified our assumption above.
2.4.3 Explore the distribution of SalePrice by BldfType
Next, We are going to describe SalePrice by different cateogries of BldfType.
BldgType: Type of dwelling
1Fam Single-family Detached
2FmCon Two-family Conversion; originally built as one-family dwelling Duplx Duplex TwnhsE Townhouse End Unit TwnhsI Townhouse Inside Unit To get a quick feel about BldgType, I use a table here to count houses in each catetory and also show maximum and minimum SalePrice.
library(plyr)
ddply(train, .(BldgType), summarize,Total = length(BldgType),Max_price=max(SalePrice),Min_price=min(SalePrice))
## BldgType Total Max_price Min_price
## 1 1Fam 1220 755000 34900
## 2 2fmCon 31 228950 55000
## 3 Duplex 52 206300 82000
## 4 Twnhs 43 230000 75000
## 5 TwnhsE 114 392500 75500
In previous section I used Boxplot to describe MSZoning, while for BldgType I will use Histogram, since I am more interested in the distribution rather than summary numbers.
# historgram of housing price by BldgType
ggplot(select_train, aes(SalePrice)) +
geom_histogram(aes(fill = BldgType), position = position_stack(reverse = TRUE), binwidth = 20000) +
coord_flip() + ggtitle("Figure 5 Histogram of SalePrice") +
ylab("Count") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.5),legend.position=c(0.9,0.8), legend.background = element_rect(fill="grey90",
size=0.5, linetype="solid",
colour ="black"))
More thoughts about the graph above:
For houses with type of Single-family Detached, most of their prices are within the range from 50000 to 300000 For Two-family Conversion, Duplex, Townhouse End Unit and Townhouse Inside Unit, most of house prices are ranging from 75000 to 210000 The highest and lowest house price both come to Single-family Detached house type
Explore the distribution of SalePrice by OverallQual
The last one is OverallQual.
OverallQual: Rates the overall material and finish of the house
10 Very Excellent 9 Excellent 8 Very Good 7 Good 6 Above Average 5 Average 4 Below Average 3 Fair 2 Poor 1 Very Poor
ggplot(select_train, aes(x = SalePrice,fill = as.factor(OverallQual))) +
geom_histogram(position = "stack", binwidth = 10000) +
ggtitle("Figure 6 Histogram of SalePrice") +
ylab("Count") +
xlab("Housing Price") +
scale_fill_discrete(name="OverallQual")+
theme(plot.title = element_text(hjust = 0.5), legend.position=c(0.9,0.7), legend.background = element_rect(fill="grey90",
size=0.5, linetype="solid",
colour ="black"))
As we saw in graph above:
Most houese are with OverallQuall of 4,5,6 and 7, equivalent to “Below Average”, “Average”, “Above Average” and “Good” The higher rate of overall quality, the higher house sale price For each rate level of overall quality, the distribution of house price is almost symmetric
What kind of house will be sold for higher price?
2.5.1 Correlation Exploreation
Let’s select variables first.
Variables for correlation exploration: SalePrice’,‘OverallQual’,‘OverallCond’,‘YearBuilt’,‘ExterCond2’,‘TotalBsmtSF’,‘HeatingQC2’
In order to have a clear view of how the key variables relate to SalePrice, I decide to use Correlation Heatmap to plot correlation coefficients.
But before plotting heatmap, one more step is needed– feature engineering. In section I, we learn that some variables are factors, in order to buid heatmap we need to convert them into numerics.
Since these factor varaibles evaluate quality of house with ordered levels, such as “Ex”, “Fa”,“Gd”, “TA”, and “Po”, here, I match them to numbers: “1”,“2”,“3”,“4”, and “5”. That is, the smaller number, the higher level. After transforming, all the variables used for heatmap are numberic.
# convert factor to numeric
select_train$ExterCond2 <- as.numeric(factor(select_train$ExterCond,
levels = c("Ex", "Fa","Gd", "TA","Po"),
labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$HeatingQC2 <- as.numeric(factor(select_train$HeatingQC,
levels = c("Ex", "Fa","Gd", "TA","Po"),
labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$CentralAir2 <- as.numeric(factor(select_train$CentralAir,
levels = c("N", "Y"),
labels = c(0,1) ,ordered = TRUE))
#select variables that be used for model buidling and heat map
model_var <- c('SalePrice',
'OverallQual','OverallCond','YearBuilt','ExterCond2',
'TotalBsmtSF','HeatingQC2',
'CentralAir2','GrLivArea','BedroomAbvGr','KitchenAbvGr',
'TotRmsAbvGrd','Fireplaces',
'GarageArea','OpenPorchSF','PoolArea',
'YrSold')
heat <- select_train[,model_var]
Next, using ggplot plot correlation heatmap.
#plot correlation heatmap for SalePrice
options(repr.plot.width=8, repr.plot.height=6)
library(ggplot2)
library(reshape2)
qplot(x=Var1, y=Var2, data=melt(cor(heat, use="p")), fill=value, geom="tile") +
scale_fill_gradient2(low = "green", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1))+
coord_fixed()+
ggtitle("Figure 7 Correlation Heatmap") +
theme(plot.title = element_text(hjust = 0.4))
In this graph, Red indicates perfect positive correlation and Green indicates perfect negative correlation. As we can see, there are several variables should be paid attention to: GarageArea, Fireplaces, TotRmsAbvGrd, GrLivArea, HeatingQC, TotalBsmtSF and YearBuild.
2.5.2 Correlation between SalePrice and some numeric variables
In this section, I am going to analyze the correlation between SalePrice and numeric variables, including GrLivArea,TotalBsmtSF, TotRmsAbvGrd, GarageArea. Different from categorical variables, here I will use scatter plot and trend line to indicate the relationship.
# scatter plot of GrLiveArea
# Change plot size to 5 x 4
options(repr.plot.width=9, repr.plot.height=6)
p1 <- ggplot(select_train, aes(x=GrLivArea, y=SalePrice)) +
geom_point(shape=1) +
geom_smooth(method=lm , color="red", se=FALSE)+
ggtitle("Figure 8 Scatter plot of SalePrice and GrLivArea") +
theme(plot.title = element_text(hjust = 0.4))
# scatter plot of TotalBsmtSF
p2 <- ggplot(select_train, aes(x=TotalBsmtSF, y=SalePrice)) +
geom_point(shape=1) +
geom_smooth(method=lm , color="red", se=FALSE)+
ggtitle("Figure 9 Scatter plot of SalePrice and TotalBsmtSF") +
theme(plot.title = element_text(hjust = 0.4))
#scatter plot of TotRmsAbvGrd
p3 <- ggplot(select_train, aes(x=TotRmsAbvGrd, y=SalePrice)) +
geom_point(shape=1) +
geom_smooth(method=lm , color="red", se=FALSE)+
ggtitle("Figure 10 Scatter plot of SalePrice and TotRmsAbvGrd") +
theme(plot.title = element_text(hjust = 0.4))
#scatter plot of GarageArea
p4 <- ggplot(select_train, aes(x=GarageArea, y=SalePrice)) +
geom_point(shape=1) +
geom_smooth(method=lm , color="red", se=FALSE)+
ggtitle("Figure 11 Scatter plot of SalePrice and GarageArea") +
theme(plot.title = element_text(hjust = 0.4))
library(gridExtra)
grid.arrange(p1, p2,p3,p4)
Some thoughts about these graphs:
GrLivArea, TotalBsmtSF, TotRmsAbvGrd, and GarageArea are positively correlated with SalePrice, which means with the increase of GrLivArea, TotalBsmtSF, TotRmsAbvGrd and GarageArea, the SalePrice also increases. TotalBsmtSF has more concentrated distribution than others Part III Model Fitting
After Descriptive Analysis we are moving into Predictive Analysis section. There are three models here:
Linear Regresion Model
Classification & Regression Trees (CART) Model
Random Forest Model
3.1 Linear Regression Model
In Linear Regresion Model, the relationships between Dependent and Indepedent Variables is expressed by equation with coefficients. The aim of this model is to minimize the sum of the squared residuals. Here I select 16 variables to fit into this model.
Variables in this model
SalePrice, OverallQual, OverallCond, YearBuilt, ExterQual2, ExterCond2, TotalBsmtSF, HeatingQC2, CentralAir2, GrLivArea, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageArea, OpenPorchSF, PoolArea,YrSold
Step 1: choose variables and transfer SalePrice into log term
#prediction of lm
#build model dataset for linear regression
model_lin <- select_train[, model_var]
model_lin$lSalePrice <- log(model_lin$SalePrice)
Step 2: divide datasets into two parts – training and validation, to prepare for prediction later
#partition data
set.seed(10000)
train.index <- sample(c(1:dim(model_lin)[1]), dim(model_lin)[1]*0.8)
model_lin_train = model_lin[train.index,]
model_lin_valid <- model_lin[-train.index,]
Step 3: run regression
#use lm() to run linear regression of SalePrice on all variables in model dataset
linreg <- lm(lSalePrice~.-SalePrice, data = model_lin_train)
summary(linreg)
##
## Call:
## lm(formula = lSalePrice ~ . - SalePrice, data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.58114 -0.06702 0.00342 0.07786 0.44064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6023217895 6.1903443257 0.905 0.36565
## OverallQual 0.0672509736 0.0050771139 13.246 < 0.0000000000000002 ***
## OverallCond 0.0592300019 0.0042589866 13.907 < 0.0000000000000002 ***
## YearBuilt 0.0033446003 0.0002151885 15.543 < 0.0000000000000002 ***
## ExterCond2 0.0173620804 0.0104747653 1.658 0.09769 .
## TotalBsmtSF 0.0001781542 0.0000126056 14.133 < 0.0000000000000002 ***
## HeatingQC2 -0.0163295194 0.0035734007 -4.570 0.000005409910875536 ***
## CentralAir2 0.0578755232 0.0199820314 2.896 0.00385 **
## GrLivArea 0.0002318796 0.0000173153 13.392 < 0.0000000000000002 ***
## BedroomAbvGr -0.0118786527 0.0073986920 -1.606 0.10866
## KitchenAbvGr -0.0875970702 0.0220040022 -3.981 0.000072936382037312 ***
## TotRmsAbvGrd 0.0164602588 0.0053935815 3.052 0.00233 **
## Fireplaces 0.0631045713 0.0076248880 8.276 0.000000000000000349 ***
## GarageArea 0.0002487926 0.0000255869 9.723 < 0.0000000000000002 ***
## OpenPorchSF -0.0000001497 0.0000654107 -0.002 0.99817
## PoolArea 0.0001314590 0.0000970241 1.355 0.17571
## YrSold -0.0008613313 0.0030848002 -0.279 0.78013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1389 on 1151 degrees of freedom
## Multiple R-squared: 0.8787, Adjusted R-squared: 0.877
## F-statistic: 521 on 16 and 1151 DF, p-value: < 0.00000000000000022
Step 4: forecast and check for model accuracy
library(forecast)
#use predict() to make prediction on a new set
pred1 <- predict(linreg,model_lin_valid,type = "response")
residuals <- model_lin_valid$lSalePrice - pred1
linreg_pred <- data.frame("Predicted" = pred1, "Actual" = model_lin_valid$lSalePrice, "Residual" = residuals)
accuracy(pred1, model_lin_valid$lSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -0.01358129 0.2256403 0.1155791 -0.1262696 0.9675185
CART
# classification tree
library(rpart)
library(rpart.plot)
class.tree <- rpart(lSalePrice~.-SalePrice,
data = model_lin_train,control = rpart.control(cp = 0.01))
plotcp(class.tree)
printcp(class.tree)
##
## Regression tree:
## rpart(formula = lSalePrice ~ . - SalePrice, data = model_lin_train,
## control = rpart.control(cp = 0.01))
##
## Variables actually used in tree construction:
## [1] CentralAir2 GarageArea GrLivArea OverallCond OverallQual TotalBsmtSF
##
## Root node error: 182.93/1168 = 0.15661
##
## n= 1168
##
## CP nsplit rel error xerror xstd
## 1 0.459527 0 1.00000 1.00138 0.049656
## 2 0.086240 1 0.54047 0.54351 0.030019
## 3 0.071446 2 0.45423 0.46729 0.027024
## 4 0.049339 3 0.38279 0.40445 0.024325
## 5 0.023257 4 0.33345 0.35746 0.021596
## 6 0.019324 5 0.31019 0.34290 0.020669
## 7 0.017259 6 0.29087 0.33024 0.019701
## 8 0.012367 7 0.27361 0.30179 0.017780
## 9 0.011729 8 0.26124 0.29326 0.017140
## 10 0.010000 9 0.24951 0.28807 0.016811
rpart.plot(class.tree,
box.palette="GnBu",
branch.lty=3, shadow.col="gray", nn=TRUE)
Random Forest
#Random Forest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
RF <- randomForest(lSalePrice ~.-SalePrice, data = model_lin_train,
importance =TRUE,ntree=500,nodesize=7, na.action=na.roughfix)
# variable importance
options(repr.plot.width=9, repr.plot.height=6)
varImpPlot(RF, type=1)
#prediction
rf.pred <- predict(RF, newdata=model_lin_valid )
accuracy(rf.pred, model_lin_valid$lSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -0.01157112 0.1624179 0.1050912 -0.1171112 0.8789691
write.csv(model_lin_valid, file = "MyData.csv")
plot(rf.pred, model_lin_valid$lSalePrice, main = "Figure 9 Predicted vs. Actual log SalePrice")
abline(0,1)