Data Exploration and Prediction of House Price

Goal

Predict the sale price of houses.

Dataset Description

In this dataset, there are 1460 observations with 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa. Among explanatory variables, there are 37 integer variables, such as Id, MSSubClass, LotFrontage, and 43 factor variables, such as MSZoning, Street, LotShape. Descriptive analysis and quantitative analysis will use subsets of it depending on models.

First part of this report: Descriptive and Exploratory Analysis

Second part of this report: Predictive Analysis.

#install multiple packages, if not installed
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE)
sapply(pkg, require, character.only = TRUE)
}
packages <- c("ggplot2", "readr", "gplots", "repr", "rpart.plot")
ipak(packages)

## Loading required package: ggplot2

## Loading required package: readr

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

## Loading required package: repr

## Warning: package 'repr' was built under R version 3.5.2

## Loading required package: rpart.plot

## Warning: package 'rpart.plot' was built under R version 3.5.2

## Loading required package: rpart

##    ggplot2      readr     gplots       repr rpart.plot 
##       TRUE       TRUE       TRUE       TRUE       TRUE

setwd("D:/Practice/house-prices-advanced-regression-techniques")
getwd()

## [1] "D:/Practice/house-prices-advanced-regression-techniques"

train <- read.csv("train.csv")

# list rows of data that have missing values 
missing_row <- train[!complete.cases(train),]
head(missing_row)

##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  1         60       RL          65    8450   Pave  <NA>      Reg
## 2  2         20       RL          80    9600   Pave  <NA>      Reg
## 3  3         60       RL          68   11250   Pave  <NA>      IR1
## 4  4         70       RL          60    9550   Pave  <NA>      IR1
## 5  5         60       RL          84   14260   Pave  <NA>      IR1
## 6  6         50       RL          85   14115   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 2         Lvl    AllPub       FR2       Gtl      Veenker      Feedr
## 3         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 4         Lvl    AllPub    Corner       Gtl      Crawfor       Norm
## 5         Lvl    AllPub       FR2       Gtl      NoRidge       Norm
## 6         Lvl    AllPub    Inside       Gtl      Mitchel       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           5      2003
## 2       Norm     1Fam     1Story           6           8      1976
## 3       Norm     1Fam     2Story           7           5      2001
## 4       Norm     1Fam     2Story           7           5      1915
## 5       Norm     1Fam     2Story           8           5      2000
## 6       Norm     1Fam     1.5Fin           5           5      1993
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2003     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 2         1976     Gable  CompShg     MetalSd     MetalSd       None
## 3         2002     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Shng       None
## 5         2000     Gable  CompShg     VinylSd     VinylSd    BrkFace
## 6         1995     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        196        Gd        TA      PConc       Gd       TA           No
## 2          0        TA        TA     CBlock       Gd       TA           Gd
## 3        162        Gd        TA      PConc       Gd       TA           Mn
## 4          0        TA        TA     BrkTil       TA       Gd           No
## 5        350        Gd        TA      PConc       Gd       TA           Av
## 6          0        TA        TA       Wood       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        706          Unf          0       150         856
## 2          ALQ        978          Unf          0       284        1262
## 3          GLQ        486          Unf          0       434         920
## 4          ALQ        216          Unf          0       540         756
## 5          GLQ        655          Unf          0       490        1145
## 6          GLQ        732          Unf          0        64         796
##   Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr       856       854            0
## 2    GasA        Ex          Y      SBrkr      1262         0            0
## 3    GasA        Ex          Y      SBrkr       920       866            0
## 4    GasA        Gd          Y      SBrkr       961       756            0
## 5    GasA        Ex          Y      SBrkr      1145      1053            0
## 6    GasA        Ex          Y      SBrkr       796       566            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1710            1            0        2        1            3
## 2      1262            0            1        2        0            3
## 3      1786            1            0        2        1            3
## 4      1717            1            0        1        0            3
## 5      2198            1            0        2        1            4
## 6      1362            1            0        1        1            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        2003          RFn          2        548         TA
## 2     Attchd        1976          RFn          2        460         TA
## 3     Attchd        2001          RFn          2        608         TA
## 4     Detchd        1998          Unf          3        642         TA
## 5     Attchd        2000          RFn          3        836         TA
## 6     Attchd        1993          Unf          2        480         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1         TA          Y          0          61             0          0
## 2         TA          Y        298           0             0          0
## 3         TA          Y          0          42             0          0
## 4         TA          Y          0          35           272          0
## 5         TA          Y        192          84             0          0
## 6         TA          Y         40          30             0        320
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      2   2008
## 2           0        0   <NA>  <NA>        <NA>       0      5   2007
## 3           0        0   <NA>  <NA>        <NA>       0      9   2008
## 4           0        0   <NA>  <NA>        <NA>       0      2   2006
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA> MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000

nrow(missing_row)

## [1] 1460

Select Variables

Step 1: select variables that may have greater impact on house price

Step 2: build subset of train dataset for prediction.

## show all variable names
var_name <- names(train)
var_name

##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"

#Here, we select these important variables by creating a vector that contains variable names
select_var <- c('Id','MSZoning','Utilities', 'Neighborhood','BldgType','HouseStyle',
                'OverallQual','OverallCond','YearBuilt', 'ExterQual','ExterCond',
                'BsmtQual','BsmtCond','TotalBsmtSF','Heating','HeatingQC', 
                'CentralAir','Electrical','GrLivArea','BedroomAbvGr','KitchenAbvGr',
                'KitchenQual','TotRmsAbvGrd','Functional','Fireplaces','FireplaceQu',
               'GarageArea','GarageQual','GarageCond','OpenPorchSF','PoolArea',
                'Fence','MoSold','YrSold','SaleType','SaleCondition','SalePrice')

# construct subset of train dataset that is used for prediction
select_train <- train[,select_var]
head(select_train)

##   Id MSZoning Utilities Neighborhood BldgType HouseStyle OverallQual
## 1  1       RL    AllPub      CollgCr     1Fam     2Story           7
## 2  2       RL    AllPub      Veenker     1Fam     1Story           6
## 3  3       RL    AllPub      CollgCr     1Fam     2Story           7
## 4  4       RL    AllPub      Crawfor     1Fam     2Story           7
## 5  5       RL    AllPub      NoRidge     1Fam     2Story           8
## 6  6       RL    AllPub      Mitchel     1Fam     1.5Fin           5
##   OverallCond YearBuilt ExterQual ExterCond BsmtQual BsmtCond TotalBsmtSF
## 1           5      2003        Gd        TA       Gd       TA         856
## 2           8      1976        TA        TA       Gd       TA        1262
## 3           5      2001        Gd        TA       Gd       TA         920
## 4           5      1915        TA        TA       TA       Gd         756
## 5           5      2000        Gd        TA       Gd       TA        1145
## 6           5      1993        TA        TA       Gd       TA         796
##   Heating HeatingQC CentralAir Electrical GrLivArea BedroomAbvGr
## 1    GasA        Ex          Y      SBrkr      1710            3
## 2    GasA        Ex          Y      SBrkr      1262            3
## 3    GasA        Ex          Y      SBrkr      1786            3
## 4    GasA        Gd          Y      SBrkr      1717            3
## 5    GasA        Ex          Y      SBrkr      2198            4
## 6    GasA        Ex          Y      SBrkr      1362            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageArea GarageQual GarageCond OpenPorchSF PoolArea Fence MoSold
## 1        548         TA         TA          61        0  <NA>      2
## 2        460         TA         TA           0        0  <NA>      5
## 3        608         TA         TA          42        0  <NA>      9
## 4        642         TA         TA          35        0  <NA>      2
## 5        836         TA         TA          84        0  <NA>     12
## 6        480         TA         TA          30        0 MnPrv     10
##   YrSold SaleType SaleCondition SalePrice
## 1   2008       WD        Normal    208500
## 2   2007       WD        Normal    181500
## 3   2008       WD        Normal    223500
## 4   2006       WD       Abnorml    140000
## 5   2008       WD        Normal    250000
## 6   2009       WD        Normal    143000

summary(select_train)

##        Id            MSZoning     Utilities     Neighborhood   BldgType   
##  Min.   :   1.0   C (all):  10   AllPub:1459   NAmes  :225   1Fam  :1220  
##  1st Qu.: 365.8   FV     :  65   NoSeWa:   1   CollgCr:150   2fmCon:  31  
##  Median : 730.5   RH     :  16                 OldTown:113   Duplex:  52  
##  Mean   : 730.5   RL     :1151                 Edwards:100   Twnhs :  43  
##  3rd Qu.:1095.2   RM     : 218                 Somerst: 86   TwnhsE: 114  
##  Max.   :1460.0                                Gilbert: 79                
##                                                (Other):707                
##    HouseStyle   OverallQual      OverallCond      YearBuilt    ExterQual
##  1Story :726   Min.   : 1.000   Min.   :1.000   Min.   :1872   Ex: 52   
##  2Story :445   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   Fa: 14   
##  1.5Fin :154   Median : 6.000   Median :5.000   Median :1973   Gd:488   
##  SLvl   : 65   Mean   : 6.099   Mean   :5.575   Mean   :1971   TA:906   
##  SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000            
##  1.5Unf : 14   Max.   :10.000   Max.   :9.000   Max.   :2010            
##  (Other): 19                                                            
##  ExterCond BsmtQual   BsmtCond     TotalBsmtSF      Heating     HeatingQC
##  Ex:   3   Ex  :121   Fa  :  45   Min.   :   0.0   Floor:   1   Ex:741   
##  Fa:  28   Fa  : 35   Gd  :  65   1st Qu.: 795.8   GasA :1428   Fa: 49   
##  Gd: 146   Gd  :618   Po  :   2   Median : 991.5   GasW :  18   Gd:241   
##  Po:   1   TA  :649   TA  :1311   Mean   :1057.4   Grav :   7   Po:  1   
##  TA:1282   NA's: 37   NA's:  37   3rd Qu.:1298.2   OthW :   2   TA:428   
##                                   Max.   :6110.0   Wall :   4            
##                                                                          
##  CentralAir Electrical     GrLivArea     BedroomAbvGr    KitchenAbvGr  
##  N:  95     FuseA:  94   Min.   : 334   Min.   :0.000   Min.   :0.000  
##  Y:1365     FuseF:  27   1st Qu.:1130   1st Qu.:2.000   1st Qu.:1.000  
##             FuseP:   3   Median :1464   Median :3.000   Median :1.000  
##             Mix  :   1   Mean   :1515   Mean   :2.866   Mean   :1.047  
##             SBrkr:1334   3rd Qu.:1777   3rd Qu.:3.000   3rd Qu.:1.000  
##             NA's :   1   Max.   :5642   Max.   :8.000   Max.   :3.000  
##                                                                        
##  KitchenQual  TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu
##  Ex:100      Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24   
##  Fa: 39      1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33   
##  Gd:586      Median : 6.000   Min1:  31   Median :1.000   Gd  :380   
##  TA:735      Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20   
##              3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313   
##              Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690   
##                               Typ :1360                              
##    GarageArea     GarageQual  GarageCond   OpenPorchSF    
##  Min.   :   0.0   Ex  :   3   Ex  :   2   Min.   :  0.00  
##  1st Qu.: 334.5   Fa  :  48   Fa  :  35   1st Qu.:  0.00  
##  Median : 480.0   Gd  :  14   Gd  :   9   Median : 25.00  
##  Mean   : 473.0   Po  :   3   Po  :   7   Mean   : 46.66  
##  3rd Qu.: 576.0   TA  :1311   TA  :1326   3rd Qu.: 68.00  
##  Max.   :1418.0   NA's:  81   NA's:  81   Max.   :547.00  
##                                                           
##     PoolArea         Fence          MoSold           YrSold    
##  Min.   :  0.000   GdPrv:  59   Min.   : 1.000   Min.   :2006  
##  1st Qu.:  0.000   GdWo :  54   1st Qu.: 5.000   1st Qu.:2007  
##  Median :  0.000   MnPrv: 157   Median : 6.000   Median :2008  
##  Mean   :  2.759   MnWw :  11   Mean   : 6.322   Mean   :2008  
##  3rd Qu.:  0.000   NA's :1179   3rd Qu.: 8.000   3rd Qu.:2009  
##  Max.   :738.000                Max.   :12.000   Max.   :2010  
##                                                                
##     SaleType    SaleCondition    SalePrice     
##  WD     :1267   Abnorml: 101   Min.   : 34900  
##  New    : 122   AdjLand:   4   1st Qu.:129975  
##  COD    :  43   Alloca :  12   Median :163000  
##  ConLD  :   9   Family :  20   Mean   :180921  
##  ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  ConLw  :   5   Partial: 125   Max.   :755000  
##  (Other):   9

2.4 Descriptive and exploratory analysis of SalePrice

SalePrice is our target variable and also the dependent variable for prediction. According to the assumptions of Linear Regression, data should be normally distributed. By checking the distribution of SalePrice, we can decide if we need non-linear transformation, like log term, to make better prediction.

2.4.1 Summary of Target Variable: SalePrice

# Five number summary
summary(select_train$SalePrice)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000

# Draw a higtogram to figure out the distribution of SalePrice
options(scipen=10000)
ggplot(select_train, aes(x = SalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 5000) +
  ggtitle("Figure 1 Histogram of SalePrice") +
  ylab("Count of houses") +
  xlab("Housing Price") + 
  theme(plot.title = element_text(hjust = 0.5))

From the histogram above, the distribution of our target variable– SalePrice is skewed to right. Thus, a log term of SalePrice should be generated for linear regression. Here, we name it lSalePrice.

#log term of SalePrice
select_train$lSalePrice <- log(select_train$SalePrice)

# Draw a higtogram to figure out the distribution of log SalePrice

ggplot(select_train, aes(x = lSalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 0.05) +
  ggtitle("Figure 2 Histogram of log SalePrice") +
  ylab("Count of houses") +
  xlab("Housing Price") + 
  theme(plot.title = element_text(hjust = 0.5))

Great! After fixing, lSalePrice is normally distributed. We will use this log term of SalePrice later in models.

2.4.2 Explore the distribution of SalePrice by MSZoning

When it comes to housing price, the value of house is usually related to two types of elements: internal and external. Internal elements are the key features of house itself, like total area, the number of rooms. As for External elements, environment is one of the key factors.

First, let’s figure out the variable that can indicates housing environment in our dataset. Here, I choose MSZoning as this indicator. It’s a dummy variable and the identiciation is:

MSZoning: Identifies the general zoning classification of the sale.

Therefore, in this section, I will explore the relationship between MSZoning and our target variable SalePrice.

First, let’s have a close look at MSZoning. Because it’s a dummy variable, I am curious about the total number of houses in each category.

# count house by MSZoning
options(repr.plot.width=5, repr.plot.height=4)
ggplot(select_train, aes(x = MSZoning, fill = MSZoning )) + 
geom_bar()+ 
scale_fill_hue(c = 80)+
ggtitle("Figure 3 Distribution of MSZoning")+
theme(plot.title = element_text(hjust = 0.5),legend.position="right", legend.background = element_rect(fill="grey90",
                                                                                                           size=0.5, linetype="solid", 
                                                                                                           colour ="black"))+
geom_text(stat='count',aes(label=..count..),vjust=-0.25)

# Distribution of MSZoning
table(select_train$MSZoning)

## 
## C (all)      FV      RH      RL      RM 
##      10      65      16    1151     218

From the graph and table above, it is obvious that most of houses in this dataset are built in the area of Residential Low Density(1151 houses), and follows by Residential Medium Density(218 houses). Few houes are built in Commercial, Floating Village and Residential High Density.

Since a large amount of houses comes to the categoreis of Residential Low Density and Residential Medium Density, these two areas should be paid more attention for housing price analysis.

On top of it, let’s add our target variable into analysis. How does housing price look like in each category? Here, I use boxplot to show the distribution of prices in each MSZoning.

# Change plot size to 9 x 6
options(repr.plot.width=9, repr.plot.height=6)
#boxplot of SalePrice by MSZoning
#add average value of SalePrice as red point
ggplot(select_train, aes(x=MSZoning, y=SalePrice, fill=MSZoning)) + 
  geom_boxplot(alpha=0.3) +
  stat_summary(fun.y=mean, geom="point", shape=20, size=4, color="red", fill="red")+
  theme(legend.position="none")+
  ggtitle("Figure 4 Boxplot of SalePrice by MSZoning")+
  theme(plot.title = element_text(hjust = 0.5))

The graph above shows the distribution of SalePrice by MSZoning. The sales in “Floating Village Residential” area have the highest average sale price, and then followed by “Residential Low Density”. While “Commercial” sales have the lowest average sale price.

It is quite strange that commercial area has the lowest average Sale Price while village area has the highest. One possible explanation could be SalePrice is also related to the size of houses. To confirm, let’s explore the average size in these area.

The variable indicates size in this dataset is called GrLivArea.

Definition: Above grade (ground) living area square feet

# Display the average hosue size in each area
library(plyr)

ddply(train, .(MSZoning), summarize,  size=mean(GrLivArea))

##   MSZoning     size
## 1  C (all) 1191.400
## 2       FV 1574.538
## 3       RH 1510.125
## 4       RL 1551.646
## 5       RM 1322.073

It is obvious that the avarage size of houses in Commecial are is much smaller than Floating Village area, which verified our assumption above.

2.4.3 Explore the distribution of SalePrice by BldfType

Next, We are going to describe SalePrice by different cateogries of BldfType.

BldgType: Type of dwelling

1Fam Single-family Detached

2FmCon Two-family Conversion; originally built as one-family dwelling Duplx Duplex TwnhsE Townhouse End Unit TwnhsI Townhouse Inside Unit To get a quick feel about BldgType, I use a table here to count houses in each catetory and also show maximum and minimum SalePrice.

library(plyr)
ddply(train, .(BldgType), summarize,Total = length(BldgType),Max_price=max(SalePrice),Min_price=min(SalePrice))

##   BldgType Total Max_price Min_price
## 1     1Fam  1220    755000     34900
## 2   2fmCon    31    228950     55000
## 3   Duplex    52    206300     82000
## 4    Twnhs    43    230000     75000
## 5   TwnhsE   114    392500     75500

In previous section I used Boxplot to describe MSZoning, while for BldgType I will use Histogram, since I am more interested in the distribution rather than summary numbers.

# historgram of housing price by BldgType 
ggplot(select_train, aes(SalePrice)) +
 geom_histogram(aes(fill = BldgType), position = position_stack(reverse = TRUE), binwidth = 20000) +
 coord_flip() + ggtitle("Figure 5 Histogram of SalePrice") +
 ylab("Count") +
 xlab("Housing Price") + 
 theme(plot.title = element_text(hjust = 0.5),legend.position=c(0.9,0.8), legend.background = element_rect(fill="grey90",
                                                                                                           size=0.5, linetype="solid", 
                                                                                                           colour ="black"))

More thoughts about the graph above:

For houses with type of Single-family Detached, most of their prices are within the range from 50000 to 300000 For Two-family Conversion, Duplex, Townhouse End Unit and Townhouse Inside Unit, most of house prices are ranging from 75000 to 210000 The highest and lowest house price both come to Single-family Detached house type

Explore the distribution of SalePrice by OverallQual

The last one is OverallQual.

OverallQual: Rates the overall material and finish of the house

10 Very Excellent 9 Excellent 8 Very Good 7 Good 6 Above Average 5 Average 4 Below Average 3 Fair 2 Poor 1 Very Poor

ggplot(select_train, aes(x = SalePrice,fill = as.factor(OverallQual))) +
  geom_histogram(position = "stack", binwidth = 10000) +
  ggtitle("Figure 6 Histogram of SalePrice") +
  ylab("Count") +
  xlab("Housing Price") + 
  scale_fill_discrete(name="OverallQual")+
  theme(plot.title = element_text(hjust = 0.5), legend.position=c(0.9,0.7), legend.background = element_rect(fill="grey90",
                                                                                                           size=0.5, linetype="solid", 
                                                                                                           colour ="black"))

As we saw in graph above:

Most houese are with OverallQuall of 4,5,6 and 7, equivalent to “Below Average”, “Average”, “Above Average” and “Good” The higher rate of overall quality, the higher house sale price For each rate level of overall quality, the distribution of house price is almost symmetric

What kind of house will be sold for higher price?

2.5.1 Correlation Exploreation

Let’s select variables first.

Variables for correlation exploration: SalePrice’,‘OverallQual’,‘OverallCond’,‘YearBuilt’,‘ExterCond2’,‘TotalBsmtSF’,‘HeatingQC2’

In order to have a clear view of how the key variables relate to SalePrice, I decide to use Correlation Heatmap to plot correlation coefficients.

But before plotting heatmap, one more step is needed– feature engineering. In section I, we learn that some variables are factors, in order to buid heatmap we need to convert them into numerics.

Since these factor varaibles evaluate quality of house with ordered levels, such as “Ex”, “Fa”,“Gd”, “TA”, and “Po”, here, I match them to numbers: “1”,“2”,“3”,“4”, and “5”. That is, the smaller number, the higher level. After transforming, all the variables used for heatmap are numberic.

# convert factor to numeric
select_train$ExterCond2 <- as.numeric(factor(select_train$ExterCond, 
                                  levels = c("Ex", "Fa","Gd", "TA","Po"),
                                  labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$HeatingQC2 <- as.numeric(factor(select_train$HeatingQC, 
                                  levels = c("Ex", "Fa","Gd", "TA","Po"),
                                  labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$CentralAir2 <- as.numeric(factor(select_train$CentralAir, 
                                  levels = c("N", "Y"),
                                  labels = c(0,1) ,ordered = TRUE))

#select variables that be used for model buidling and heat map
model_var <- c('SalePrice', 
                'OverallQual','OverallCond','YearBuilt','ExterCond2',
                'TotalBsmtSF','HeatingQC2', 
                'CentralAir2','GrLivArea','BedroomAbvGr','KitchenAbvGr',
                'TotRmsAbvGrd','Fireplaces',
                'GarageArea','OpenPorchSF','PoolArea',
                 'YrSold')
heat <- select_train[,model_var]

Next, using ggplot plot correlation heatmap.

#plot correlation heatmap for SalePrice
options(repr.plot.width=8, repr.plot.height=6)
library(ggplot2)
library(reshape2)
qplot(x=Var1, y=Var2, data=melt(cor(heat, use="p")), fill=value, geom="tile") +
   scale_fill_gradient2(low = "green", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Correlation") +
   theme_minimal()+ 
   theme(axis.text.x = element_text(angle = 45, vjust = 1, size = 8, hjust = 1))+
   coord_fixed()+
   ggtitle("Figure 7 Correlation Heatmap") +
   theme(plot.title = element_text(hjust = 0.4))

In this graph, Red indicates perfect positive correlation and Green indicates perfect negative correlation. As we can see, there are several variables should be paid attention to: GarageArea, Fireplaces, TotRmsAbvGrd, GrLivArea, HeatingQC, TotalBsmtSF and YearBuild.

2.5.2 Correlation between SalePrice and some numeric variables

In this section, I am going to analyze the correlation between SalePrice and numeric variables, including GrLivArea,TotalBsmtSF, TotRmsAbvGrd, GarageArea. Different from categorical variables, here I will use scatter plot and trend line to indicate the relationship.

# scatter plot of GrLiveArea
# Change plot size to 5 x 4
options(repr.plot.width=9, repr.plot.height=6)
p1 <- ggplot(select_train, aes(x=GrLivArea, y=SalePrice)) + 
  geom_point(shape=1) +  
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Figure 8 Scatter plot of SalePrice and GrLivArea") +
  theme(plot.title = element_text(hjust = 0.4))

# scatter plot of TotalBsmtSF
p2 <- ggplot(select_train, aes(x=TotalBsmtSF, y=SalePrice)) + 
  geom_point(shape=1) +  
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Figure 9 Scatter plot of SalePrice and TotalBsmtSF") +
  theme(plot.title = element_text(hjust = 0.4))

#scatter plot of TotRmsAbvGrd
p3 <- ggplot(select_train, aes(x=TotRmsAbvGrd, y=SalePrice)) + 
  geom_point(shape=1) +  
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Figure 10 Scatter plot of SalePrice and TotRmsAbvGrd") +
  theme(plot.title = element_text(hjust = 0.4))

#scatter plot of GarageArea
p4 <- ggplot(select_train, aes(x=GarageArea, y=SalePrice)) + 
  geom_point(shape=1) +  
  geom_smooth(method=lm , color="red", se=FALSE)+
  ggtitle("Figure 11 Scatter plot of SalePrice and GarageArea") +
  theme(plot.title = element_text(hjust = 0.4))

library(gridExtra)
grid.arrange(p1, p2,p3,p4)

Some thoughts about these graphs:

GrLivArea, TotalBsmtSF, TotRmsAbvGrd, and GarageArea are positively correlated with SalePrice, which means with the increase of GrLivArea, TotalBsmtSF, TotRmsAbvGrd and GarageArea, the SalePrice also increases. TotalBsmtSF has more concentrated distribution than others Part III Model Fitting

After Descriptive Analysis we are moving into Predictive Analysis section. There are three models here:

Linear Regresion Model

Classification & Regression Trees (CART) Model

Random Forest Model

3.1 Linear Regression Model

In Linear Regresion Model, the relationships between Dependent and Indepedent Variables is expressed by equation with coefficients. The aim of this model is to minimize the sum of the squared residuals. Here I select 16 variables to fit into this model.

Variables in this model

SalePrice, OverallQual, OverallCond, YearBuilt, ExterQual2, ExterCond2, TotalBsmtSF, HeatingQC2, CentralAir2, GrLivArea, BedroomAbvGr, KitchenAbvGr, TotRmsAbvGrd, Fireplaces, GarageArea, OpenPorchSF, PoolArea,YrSold

Step 1: choose variables and transfer SalePrice into log term

#prediction of lm
#build model dataset for linear regression 
model_lin <- select_train[, model_var]
model_lin$lSalePrice <- log(model_lin$SalePrice)

Step 2: divide datasets into two parts – training and validation, to prepare for prediction later

#partition data
set.seed(10000)
train.index <- sample(c(1:dim(model_lin)[1]), dim(model_lin)[1]*0.8)
model_lin_train = model_lin[train.index,]
model_lin_valid <- model_lin[-train.index,]

Step 3: run regression

#use lm() to run linear regression of SalePrice on all variables in model dataset
linreg <- lm(lSalePrice~.-SalePrice, data = model_lin_train)
summary(linreg)

## 
## Call:
## lm(formula = lSalePrice ~ . - SalePrice, data = model_lin_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.58114 -0.06702  0.00342  0.07786  0.44064 
## 
## Coefficients:
##                   Estimate    Std. Error t value             Pr(>|t|)    
## (Intercept)   5.6023217895  6.1903443257   0.905              0.36565    
## OverallQual   0.0672509736  0.0050771139  13.246 < 0.0000000000000002 ***
## OverallCond   0.0592300019  0.0042589866  13.907 < 0.0000000000000002 ***
## YearBuilt     0.0033446003  0.0002151885  15.543 < 0.0000000000000002 ***
## ExterCond2    0.0173620804  0.0104747653   1.658              0.09769 .  
## TotalBsmtSF   0.0001781542  0.0000126056  14.133 < 0.0000000000000002 ***
## HeatingQC2   -0.0163295194  0.0035734007  -4.570 0.000005409910875536 ***
## CentralAir2   0.0578755232  0.0199820314   2.896              0.00385 ** 
## GrLivArea     0.0002318796  0.0000173153  13.392 < 0.0000000000000002 ***
## BedroomAbvGr -0.0118786527  0.0073986920  -1.606              0.10866    
## KitchenAbvGr -0.0875970702  0.0220040022  -3.981 0.000072936382037312 ***
## TotRmsAbvGrd  0.0164602588  0.0053935815   3.052              0.00233 ** 
## Fireplaces    0.0631045713  0.0076248880   8.276 0.000000000000000349 ***
## GarageArea    0.0002487926  0.0000255869   9.723 < 0.0000000000000002 ***
## OpenPorchSF  -0.0000001497  0.0000654107  -0.002              0.99817    
## PoolArea      0.0001314590  0.0000970241   1.355              0.17571    
## YrSold       -0.0008613313  0.0030848002  -0.279              0.78013    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1389 on 1151 degrees of freedom
## Multiple R-squared:  0.8787, Adjusted R-squared:  0.877 
## F-statistic:   521 on 16 and 1151 DF,  p-value: < 0.00000000000000022

Step 4: forecast and check for model accuracy

library(forecast)
#use predict() to make prediction on a new set
pred1 <- predict(linreg,model_lin_valid,type = "response")
residuals <- model_lin_valid$lSalePrice - pred1
linreg_pred <- data.frame("Predicted" = pred1, "Actual" = model_lin_valid$lSalePrice, "Residual" = residuals)
accuracy(pred1, model_lin_valid$lSalePrice)

##                   ME      RMSE       MAE        MPE      MAPE
## Test set -0.01358129 0.2256403 0.1155791 -0.1262696 0.9675185

CART

# classification tree
library(rpart)
library(rpart.plot)

class.tree <- rpart(lSalePrice~.-SalePrice,
                    data = model_lin_train,control = rpart.control(cp = 0.01))

plotcp(class.tree)

printcp(class.tree)

## 
## Regression tree:
## rpart(formula = lSalePrice ~ . - SalePrice, data = model_lin_train, 
##     control = rpart.control(cp = 0.01))
## 
## Variables actually used in tree construction:
## [1] CentralAir2 GarageArea  GrLivArea   OverallCond OverallQual TotalBsmtSF
## 
## Root node error: 182.93/1168 = 0.15661
## 
## n= 1168 
## 
##          CP nsplit rel error  xerror     xstd
## 1  0.459527      0   1.00000 1.00138 0.049656
## 2  0.086240      1   0.54047 0.54351 0.030019
## 3  0.071446      2   0.45423 0.46729 0.027024
## 4  0.049339      3   0.38279 0.40445 0.024325
## 5  0.023257      4   0.33345 0.35746 0.021596
## 6  0.019324      5   0.31019 0.34290 0.020669
## 7  0.017259      6   0.29087 0.33024 0.019701
## 8  0.012367      7   0.27361 0.30179 0.017780
## 9  0.011729      8   0.26124 0.29326 0.017140
## 10 0.010000      9   0.24951 0.28807 0.016811

rpart.plot(class.tree, 
           box.palette="GnBu",
           branch.lty=3, shadow.col="gray", nn=TRUE)

Random Forest

#Random Forest
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

RF <- randomForest(lSalePrice ~.-SalePrice, data = model_lin_train, 
                   importance =TRUE,ntree=500,nodesize=7, na.action=na.roughfix)

# variable importance
options(repr.plot.width=9, repr.plot.height=6)
varImpPlot(RF, type=1)

#prediction
rf.pred <- predict(RF, newdata=model_lin_valid )
accuracy(rf.pred, model_lin_valid$lSalePrice)

##                   ME      RMSE       MAE        MPE      MAPE
## Test set -0.01157112 0.1624179 0.1050912 -0.1171112 0.8789691

write.csv(model_lin_valid, file = "MyData.csv")

plot(rf.pred, model_lin_valid$lSalePrice, main = "Figure 9 Predicted vs. Actual log SalePrice") 
abline(0,1)