For this project, we are going to work on a Housing dataset which gives around 80 independent features of around 2900 houses and their locality in the city of Ames, Iowa. The dataset was actually created for academia by a professor using the raw data coming from the Assessor’s Office in the form of a data dump from their records system. As a part of this project, we are analyzing this raw data and understand how the sale price is dependent on these features.
Ask a home buyer to describe their dream house, and they probably won’t begin with the height of the basement ceiling or the proximity to an east-west railroad. But this playground competition’s dataset proves that much more influences price negotiations than the number of bedrooms or a white-picket fence. With 79 explanatory variables describing (almost) every aspect of residential homes in Ames, Iowa, this is a real world data set which needs to be explored further to understand how the Sale Price of a house in the city of Ames, Iowa can be determined.
Data Source: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
Out of the 80 total variables, 44 are categorical variables like: Street: Type of road access to property which has 2 values - “Grvl” (for Gravel) and “Pave” for Paved. Remaining 36 variables are numeric features. Response or Target feature is the Sale price of a unit which is going to be our element / feature of interest for the prediction - based on the independent features.
All the variables description is given in this file: https://raw.githubusercontent.com/deepakmongia/Data621/master/Final%20Project/Data/data_description.txt
Let us now start with the work. We will load our required libraries first, load the dataset, and see the basic EDA features of the dataset.
library(ggplot2)
library(ggmap)
## Warning: package 'ggmap' was built under R version 3.6.3
## Google's Terms of Service: https://cloud.google.com/maps-platform/terms/.
## Please cite ggmap if you use it! See citation("ggmap") for details.
library(cowplot)
## Warning: package 'cowplot' was built under R version 3.6.2
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggmap':
##
## theme_nothing
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(corrplot)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggResidpanel)
## Warning: package 'ggResidpanel' was built under R version 3.6.3
library(pdp)
## Warning: package 'pdp' was built under R version 3.6.3
housing2_raw <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/Final%20Project/Data/train.csv",
header = TRUE, row.names = 1)
dim(housing2_raw)
## [1] 1460 80
colnames(housing2_raw)
## [1] "MSSubClass" "MSZoning" "LotFrontage" "LotArea"
## [5] "Street" "Alley" "LotShape" "LandContour"
## [9] "Utilities" "LotConfig" "LandSlope" "Neighborhood"
## [13] "Condition1" "Condition2" "BldgType" "HouseStyle"
## [17] "OverallQual" "OverallCond" "YearBuilt" "YearRemodAdd"
## [21] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
## [25] "MasVnrType" "MasVnrArea" "ExterQual" "ExterCond"
## [29] "Foundation" "BsmtQual" "BsmtCond" "BsmtExposure"
## [33] "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2" "BsmtFinSF2"
## [37] "BsmtUnfSF" "TotalBsmtSF" "Heating" "HeatingQC"
## [41] "CentralAir" "Electrical" "X1stFlrSF" "X2ndFlrSF"
## [45] "LowQualFinSF" "GrLivArea" "BsmtFullBath" "BsmtHalfBath"
## [49] "FullBath" "HalfBath" "BedroomAbvGr" "KitchenAbvGr"
## [53] "KitchenQual" "TotRmsAbvGrd" "Functional" "Fireplaces"
## [57] "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish"
## [61] "GarageCars" "GarageArea" "GarageQual" "GarageCond"
## [65] "PavedDrive" "WoodDeckSF" "OpenPorchSF" "EnclosedPorch"
## [69] "X3SsnPorch" "ScreenPorch" "PoolArea" "PoolQC"
## [73] "Fence" "MiscFeature" "MiscVal" "MoSold"
## [77] "YrSold" "SaleType" "SaleCondition" "SalePrice"
summary(housing2_raw)
## MSSubClass MSZoning LotFrontage LotArea
## Min. : 20.0 C (all): 10 Min. : 21.00 Min. : 1300
## 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00 1st Qu.: 7554
## Median : 50.0 RH : 16 Median : 69.00 Median : 9478
## Mean : 56.9 RL :1151 Mean : 70.05 Mean : 10517
## 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00 3rd Qu.: 11602
## Max. :190.0 Max. :313.00 Max. :215245
## NA's :259
## Street Alley LotShape LandContour Utilities
## Grvl: 6 Grvl: 50 IR1:484 Bnk: 63 AllPub:1459
## Pave:1454 Pave: 41 IR2: 41 HLS: 50 NoSeWa: 1
## NA's:1369 IR3: 10 Low: 36
## Reg:925 Lvl:1311
##
##
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 263 Gtl:1382 NAmes :225 Norm :1260 Norm :1445
## CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81 Feedr : 6
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48 Artery : 2
## FR3 : 4 Edwards:100 RRAn : 26 PosN : 2
## Inside :1052 Somerst: 86 PosN : 19 RRNn : 2
## Gilbert: 79 RRAe : 11 PosA : 1
## (Other):707 (Other): 15 (Other): 2
## BldgType HouseStyle OverallQual OverallCond
## 1Fam :1220 1Story :726 Min. : 1.000 Min. :1.000
## 2fmCon: 31 2Story :445 1st Qu.: 5.000 1st Qu.:5.000
## Duplex: 52 1.5Fin :154 Median : 6.000 Median :5.000
## Twnhs : 43 SLvl : 65 Mean : 6.099 Mean :5.575
## TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000
## 1.5Unf : 14 Max. :10.000 Max. :9.000
## (Other): 19
## YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1872 Min. :1950 Flat : 13 CompShg:1434 VinylSd:515
## 1st Qu.:1954 1st Qu.:1967 Gable :1141 Tar&Grv: 11 HdBoard:222
## Median :1973 Median :1994 Gambrel: 11 WdShngl: 6 MetalSd:220
## Mean :1971 Mean :1985 Hip : 286 WdShake: 5 Wd Sdng:206
## 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7 ClyTile: 1 Plywood:108
## Max. :2010 Max. :2010 Shed : 2 Membran: 1 CemntBd: 61
## (Other): 2 (Other):128
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## VinylSd:504 BrkCmn : 15 Min. : 0.0 Ex: 52 Ex: 3
## MetalSd:214 BrkFace:445 1st Qu.: 0.0 Fa: 14 Fa: 28
## HdBoard:207 None :864 Median : 0.0 Gd:488 Gd: 146
## Wd Sdng:197 Stone :128 Mean : 103.7 TA:906 Po: 1
## Plywood:142 NA's : 8 3rd Qu.: 166.0 TA:1282
## CmentBd: 60 Max. :1600.0
## (Other):136 NA's :8
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## BrkTil:146 Ex :121 Fa : 45 Av :221 ALQ :220
## CBlock:634 Fa : 35 Gd : 65 Gd :134 BLQ :148
## PConc :647 Gd :618 Po : 2 Mn :114 GLQ :418
## Slab : 24 TA :649 TA :1311 No :953 LwQ : 74
## Stone : 6 NA's: 37 NA's: 37 NA's: 38 Rec :133
## Wood : 3 Unf :430
## NA's: 37
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ : 19 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00 1st Qu.: 223.0
## Median : 383.5 GLQ : 14 Median : 0.00 Median : 477.5
## Mean : 443.6 LwQ : 46 Mean : 46.55 Mean : 567.2
## 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00 3rd Qu.: 808.0
## Max. :5644.0 Unf :1256 Max. :1474.00 Max. :2336.0
## NA's: 38
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0.0 Floor: 1 Ex:741 N: 95 FuseA: 94
## 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365 FuseF: 27
## Median : 991.5 GasW : 18 Gd:241 FuseP: 3
## Mean :1057.4 Grav : 7 Po: 1 Mix : 1
## 3rd Qu.:1298.2 OthW : 2 TA:428 SBrkr:1334
## Max. :6110.0 Wall : 4 NA's : 1
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0 Min. : 0.000 Min. : 334
## 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130
## Median :1087 Median : 0 Median : 0.000 Median :1464
## Mean :1163 Mean : 347 Mean : 5.845 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :572.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4253 Mean :0.05753 Mean :1.565 Mean :0.3829
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :3.000 Max. :2.0000
##
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Min. :0.000 Ex:100 Min. : 2.000 Maj1: 14
## 1st Qu.:2.000 1st Qu.:1.000 Fa: 39 1st Qu.: 5.000 Maj2: 5
## Median :3.000 Median :1.000 Gd:586 Median : 6.000 Min1: 31
## Mean :2.866 Mean :1.047 TA:735 Mean : 6.518 Min2: 34
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000 Mod : 15
## Max. :8.000 Max. :3.000 Max. :14.000 Sev : 1
## Typ :1360
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## Min. :0.000 Ex : 24 2Types : 6 Min. :1900 Fin :352
## 1st Qu.:0.000 Fa : 33 Attchd :870 1st Qu.:1961 RFn :422
## Median :1.000 Gd :380 Basment: 19 Median :1980 Unf :605
## Mean :0.613 Po : 20 BuiltIn: 88 Mean :1979 NA's: 81
## 3rd Qu.:1.000 TA :313 CarPort: 9 3rd Qu.:2002
## Max. :3.000 NA's:690 Detchd :387 Max. :2010
## NA's : 81 NA's :81
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## Min. :0.000 Min. : 0.0 Ex : 3 Ex : 2 N: 90
## 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48 Fa : 35 P: 30
## Median :2.000 Median : 480.0 Gd : 14 Gd : 9 Y:1340
## Mean :1.767 Mean : 473.0 Po : 3 Po : 7
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311 TA :1326
## Max. :4.000 Max. :1418.0 NA's: 81 NA's: 81
##
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 0.00 Median : 25.00 Median : 0.00 Median : 0.00
## Mean : 94.24 Mean : 46.66 Mean : 21.95 Mean : 3.41
## 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :857.00 Max. :547.00 Max. :552.00 Max. :508.00
##
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## Min. : 0.00 Min. : 0.000 Ex : 2 GdPrv: 59 Gar2: 2
## 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2 GdWo : 54 Othr: 2
## Median : 0.00 Median : 0.000 Gd : 3 MnPrv: 157 Shed: 49
## Mean : 15.06 Mean : 2.759 NA's:1453 MnWw : 11 TenC: 1
## 3rd Qu.: 0.00 3rd Qu.: 0.000 NA's :1179 NA's:1406
## Max. :480.00 Max. :738.000
##
## MiscVal MoSold YrSold SaleType
## Min. : 0.00 Min. : 1.000 Min. :2006 WD :1267
## 1st Qu.: 0.00 1st Qu.: 5.000 1st Qu.:2007 New : 122
## Median : 0.00 Median : 6.000 Median :2008 COD : 43
## Mean : 43.49 Mean : 6.322 Mean :2008 ConLD : 9
## 3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009 ConLI : 5
## Max. :15500.00 Max. :12.000 Max. :2010 ConLw : 5
## (Other): 9
## SaleCondition SalePrice
## Abnorml: 101 Min. : 34900
## AdjLand: 4 1st Qu.:129975
## Alloca : 12 Median :163000
## Family : 20 Mean :180921
## Normal :1198 3rd Qu.:214000
## Partial: 125 Max. :755000
##
head(housing2_raw)
## MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 60 RL 65 8450 Pave <NA> Reg
## 2 20 RL 80 9600 Pave <NA> Reg
## 3 60 RL 68 11250 Pave <NA> IR1
## 4 70 RL 60 9550 Pave <NA> IR1
## 5 60 RL 84 14260 Pave <NA> IR1
## 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
Let us first check if we have any columns with blank data
train_missing_df1 <- data.frame(apply(housing2_raw, 2, function(x) length(which(x == ''))))
colnames(train_missing_df1) <- "counts"
train_missing_df1$column_name <- rownames(train_missing_df1)
train_missing_df1 <- data.frame(train_missing_df1[train_missing_df1$counts > 0,])
print(train_missing_df1)
## [1] counts column_name
## <0 rows> (or 0-length row.names)
So there is no data which is blank.
Let us check NA data now
train_na_df1 <- data.frame(apply(housing2_raw, 2, function(x) length(which(is.na(x)))))
colnames(train_na_df1) <- "counts"
train_na_df1$column_name <- rownames(train_na_df1)
train_na_df1 <- data.frame(train_na_df1[train_na_df1$counts > 0,])
print(train_na_df1)
## counts column_name
## LotFrontage 259 LotFrontage
## Alley 1369 Alley
## MasVnrType 8 MasVnrType
## MasVnrArea 8 MasVnrArea
## BsmtQual 37 BsmtQual
## BsmtCond 37 BsmtCond
## BsmtExposure 38 BsmtExposure
## BsmtFinType1 37 BsmtFinType1
## BsmtFinType2 38 BsmtFinType2
## Electrical 1 Electrical
## FireplaceQu 690 FireplaceQu
## GarageType 81 GarageType
## GarageYrBlt 81 GarageYrBlt
## GarageFinish 81 GarageFinish
## GarageQual 81 GarageQual
## GarageCond 81 GarageCond
## PoolQC 1453 PoolQC
## Fence 1179 Fence
## MiscFeature 1406 MiscFeature
So, we have NA data in both numeric as well as categorical data. We will deal with NA data ahead.
Let us now impute the missing / NA numeric features
#### Impute Numeric data
for (i in nrow(housing2_raw)) {
housing2_raw$LotFrontage <- ifelse(is.na(housing2_raw$LotFrontage),
median(housing2_raw$LotFrontage, na.rm = TRUE),
housing2_raw$LotFrontage)
}
for (i in nrow(housing2_raw)) {
housing2_raw$MasVnrArea <- ifelse(is.na(housing2_raw$MasVnrArea),
median(housing2_raw$MasVnrArea, na.rm = TRUE),
housing2_raw$MasVnrArea)
}
head(housing2_raw$GarageYrBlt,50)
## [1] 2003 1976 2001 1998 2000 1993 2004 1973 1931 1939 1965 2005 1962 2006
## [15] 1960 1991 1970 1967 2004 1958 2005 1930 2002 1976 1968 2007 2005 2008
## [29] 1957 1920 1920 1966 2007 1959 2005 2004 1995 1954 1953 NA 1965 1959
## [43] 1983 1977 1959 2005 2003 2006 NA 1966
median(housing2_raw$GarageYrBlt, na.rm = TRUE)
## [1] 1980
for (i in nrow(housing2_raw)) {
housing2_raw$GarageYrBlt <- ifelse(is.na(housing2_raw$GarageYrBlt),
median(housing2_raw$GarageYrBlt, na.rm = TRUE),
housing2_raw$GarageYrBlt)
}
housing2_raw$MoSold <- as.factor(housing2_raw$MoSold)
Let us plot the numeric data now.
numeric_housing2_raw <- Filter(is.numeric, housing2_raw)
hist_plots_list <- list()
for (i in 1:length(colnames(numeric_housing2_raw))){
print(i)
p1 <- eval(substitute(
ggplot(numeric_housing2_raw, aes(numeric_housing2_raw[, i])) + geom_histogram() +
xlab(colnames(numeric_housing2_raw)[i])
,list(i = i)))
hist_plots_list[[i]] <- p1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
hist_plots_list[[1]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[3]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[4]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[5]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[6]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[7]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[8]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[9]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[10]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[11]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[12]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[13]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[14]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[15]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[16]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[17]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[18]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[19]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[20]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[21]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[22]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[23]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[24]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[25]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[26]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[27]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[28]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[29]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[30]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[31]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[32]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[33]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[34]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[35]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
hist_plots_list[[36]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Plotting the plots for the factors now - to see how levels for each categorical feature impact the SalePrice
factor_housing2_raw <- Filter(is.factor, housing2_raw)
factor_housing2_raw <- cbind(factor_housing2_raw, housing2_raw$SalePrice)
dim(factor_housing2_raw)
## [1] 1460 45
colnames(factor_housing2_raw)[colnames(factor_housing2_raw) == 'housing2_raw$SalePrice'] <-
"SalePrice"
box_plots_list <- list()
for (i in 1:(length(colnames(factor_housing2_raw)) - 1)){
print(i)
p1 <- eval(substitute(
ggplot(factor_housing2_raw, aes(x = factor_housing2_raw[, i], y = SalePrice)) +
geom_boxplot() +
xlab(colnames(factor_housing2_raw)[i])
,list(i = i)))
box_plots_list[[i]] <- p1
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
## [1] 13
## [1] 14
## [1] 15
## [1] 16
## [1] 17
## [1] 18
## [1] 19
## [1] 20
## [1] 21
## [1] 22
## [1] 23
## [1] 24
## [1] 25
## [1] 26
## [1] 27
## [1] 28
## [1] 29
## [1] 30
## [1] 31
## [1] 32
## [1] 33
## [1] 34
## [1] 35
## [1] 36
## [1] 37
## [1] 38
## [1] 39
## [1] 40
## [1] 41
## [1] 42
## [1] 43
## [1] 44
box_plots_list[[1]]
box_plots_list[[2]]
box_plots_list[[3]]
box_plots_list[[4]]
box_plots_list[[5]]
box_plots_list[[6]]
box_plots_list[[7]]
box_plots_list[[8]]
box_plots_list[[9]]
box_plots_list[[10]]
box_plots_list[[11]]
box_plots_list[[12]]
box_plots_list[[13]]
box_plots_list[[14]]
box_plots_list[[15]]
box_plots_list[[16]]
box_plots_list[[17]]
box_plots_list[[18]]
box_plots_list[[19]]
box_plots_list[[20]]
box_plots_list[[21]]
box_plots_list[[22]]
box_plots_list[[23]]
box_plots_list[[24]]
box_plots_list[[25]]
box_plots_list[[26]]
box_plots_list[[27]]
box_plots_list[[28]]
box_plots_list[[29]]
box_plots_list[[30]]
box_plots_list[[31]]
box_plots_list[[32]]
box_plots_list[[33]]
box_plots_list[[34]]
box_plots_list[[35]]
box_plots_list[[36]]
box_plots_list[[37]]
box_plots_list[[38]]
box_plots_list[[39]]
box_plots_list[[40]]
box_plots_list[[41]]
box_plots_list[[42]]
box_plots_list[[43]]
box_plots_list[[44]]
We will check now the correlation among the numerical variables
#### Correlation
corrMatrix <- round(cor(numeric_housing2_raw),4)
corrMatrix %>% corrplot(., method = "color", outline = T, addgrid.col = "darkgray",
order="hclust", addrect = 4, rect.col = "black",
rect.lwd = 5,cl.pos = "b", tl.col = "indianred4",
tl.cex = 1.0, cl.cex = 1.0, addCoef.col = "white",
number.digits = 2, number.cex = 0.8,
col =
colorRampPalette(c("darkred","white","dodgerblue4"))(100))
cor_numVar <- cor(numeric_housing2_raw, use="pairwise.complete.obs") #correlations of all numeric variables
#sort on decreasing correlations with SalePrice
cor_sorted <- as.matrix(sort(cor_numVar[,'SalePrice'], decreasing = TRUE))
#select only high corelations
CorHigh <- names(which(apply(cor_sorted, 1, function(x) abs(x)>0.5)))
cor_numVar <- cor_numVar[CorHigh, CorHigh]
corrplot.mixed(cor_numVar, tl.col="black", tl.pos = "lt")
Checking the relationship between OverallQual and SalePrice
### SalePrice as compared with OverallQual
ggplot(data = numeric_housing2_raw, aes(x = OverallQual, y = SalePrice)) +
geom_point()
ggplot(data = numeric_housing2_raw, aes(x = as.factor(OverallQual), y = SalePrice)) +
geom_boxplot()
### SalePrice as compared with GrLivArea
ggplot(data = numeric_housing2_raw, aes(x = GrLivArea, y = SalePrice)) +
geom_point()
### SalePrice as compared with GarageCars
ggplot(data = numeric_housing2_raw, aes(x = as.factor(GarageCars), y = SalePrice)) +
geom_boxplot()
These plots show clear relationship of SalePrice with these 3 numeric features: Overall Quality - OverallQual GrLivArea - Above grade (ground) living area square feet GarageCars - Garage location
We will now work on recoding the categorical features so that they are ready to be fed into the regression models.
##MSZoning
summary(housing2_raw$MSZoning)
## C (all) FV RH RL RM
## 10 65 16 1151 218
housing2_raw$Zone[housing2_raw$MSZoning == "FV"] <- 4
housing2_raw$Zone[housing2_raw$MSZoning == "RL"] <- 3
housing2_raw$Zone[housing2_raw$MSZoning == "RH" | housing2_raw$MSZoning == "RM"] <- 2
housing2_raw$Zone[housing2_raw$MSZoning == "C (all)"] <- 1
table(housing2_raw$Zone)
##
## 1 2 3 4
## 10 234 1151 65
housing2_raw$MSZoning <- NULL
##Street
summary(housing2_raw$Street)
## Grvl Pave
## 6 1454
housing2_raw$street_paved[housing2_raw$Street == "Pave"] <- 1
housing2_raw$street_paved[housing2_raw$Street == "Grvl"] <- 0
table(housing2_raw$street_paved)
##
## 0 1
## 6 1454
housing2_raw$Street <- NULL
##3 Alley
summary(housing2_raw$Alley)
## Grvl Pave NA's
## 50 41 1369
levels(housing2_raw$Alley) <- c(levels(housing2_raw$Alley), "None")
housing2_raw$Alley[is.na(housing2_raw$Alley)] <- "None"
housing2_raw$Paved_alley[housing2_raw$Alley == "Pave"] <- 1
housing2_raw$Paved_alley[!housing2_raw$Alley == "Pave"] <- 0
table(housing2_raw$Paved_alley)
##
## 0 1
## 1419 41
housing2_raw$Alley <- NULL
##4 LotShape
summary(housing2_raw$LotShape)
## IR1 IR2 IR3 Reg
## 484 41 10 925
housing2_raw$Lot_regular[housing2_raw$LotShape == "Reg"] <- 1
housing2_raw$Lot_regular[!housing2_raw$LotShape == "Reg"] <- 0
table(housing2_raw$Lot_regular)
##
## 0 1
## 535 925
housing2_raw$LotShape <- NULL
##5 LandContour
summary(housing2_raw$LandContour)
## Bnk HLS Low Lvl
## 63 50 36 1311
housing2_raw$LandContour_level[housing2_raw$LandContour == "Lvl"] <- 1
housing2_raw$LandContour_level[!housing2_raw$LandContour == "Lvl"] <- 0
table(housing2_raw$LandContour_level)
##
## 0 1
## 149 1311
housing2_raw$LandContour <- NULL
##6 Utilities
summary(housing2_raw$Utilities)
## AllPub NoSeWa
## 1459 1
### We can drop this column
housing2_raw$Utilities <- NULL
##7 LotConfig
summary(housing2_raw$LotConfig)
## Corner CulDSac FR2 FR3 Inside
## 263 94 47 4 1052
housing2_raw$LotConfig_culdsac_fr3[housing2_raw$LotConfig == "CulDSac" | housing2_raw$LotConfig == "FR3"] <- 1
housing2_raw$LotConfig_culdsac_fr3[!(housing2_raw$LotConfig == "CulDSac" | housing2_raw$LotConfig == "FR3")] <- 0
table(housing2_raw$LotConfig_culdsac_fr3)
##
## 0 1
## 1362 98
housing2_raw$LotConfig <- NULL
##8 LandSlope
summary(housing2_raw$LandSlope)
## Gtl Mod Sev
## 1382 65 13
### We don't see much difference with this. So we will drop this feature.
housing2_raw$LandSlope <- NULL
##9 Neighborhood
summary(housing2_raw$Neighborhood)
## Blmngtn Blueste BrDale BrkSide ClearCr CollgCr Crawfor Edwards Gilbert
## 17 2 16 58 28 150 51 100 79
## IDOTRR MeadowV Mitchel NAmes NoRidge NPkVill NridgHt NWAmes OldTown
## 37 17 49 225 41 9 77 73 113
## Sawyer SawyerW Somerst StoneBr SWISU Timber Veenker
## 74 59 86 25 25 38 11
housing2_raw$Neighborhood_highprice <- ifelse(housing2_raw$Neighborhood %in%
(which(with(housing2_raw,
tapply(SalePrice, Neighborhood, median)) > 200000) %>% names()),
1, 0)
table(housing2_raw$Neighborhood_highprice)
##
## 0 1
## 1103 357
housing2_raw$Neighborhood <- NULL
##10 Condition1
summary(housing2_raw$Condition1)
## Artery Feedr Norm PosA PosN RRAe RRAn RRNe RRNn
## 48 81 1260 8 19 11 26 2 5
housing2_raw$Condition1_good[housing2_raw$Condition1 == "PosN" | housing2_raw$Condition1 == "PosA"] <- 1
housing2_raw$Condition1_good[!(housing2_raw$Condition1 == "PosN" | housing2_raw$Condition1 == "PosA")] <- 0
table(housing2_raw$Condition1_good)
##
## 0 1
## 1433 27
housing2_raw$Condition1 <- NULL
##11 Condition2
summary(housing2_raw$Condition2)
## Artery Feedr Norm PosA PosN RRAe RRAn RRNn
## 2 6 1445 1 2 1 1 2
housing2_raw$Condition2_good[housing2_raw$Condition2 == "PosN" | housing2_raw$Condition2 == "PosA"] <- 1
housing2_raw$Condition2_good[!(housing2_raw$Condition2 == "PosN" | housing2_raw$Condition2 == "PosA")] <- 0
table(housing2_raw$Condition2_good)
##
## 0 1
## 1457 3
housing2_raw$Condition2 <- NULL
##12 BldgType
summary(housing2_raw$BldgType)
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 1220 31 52 43 114
housing2_raw$BldgType_singlefam_endtwnhse[housing2_raw$BldgType == "1Fam" | housing2_raw$BldgType == "TwnhsE"] <- 1
housing2_raw$BldgType_singlefam_endtwnhse[!(housing2_raw$BldgType == "1Fam" | housing2_raw$BldgType == "TwnhsE")] <- 0
table(housing2_raw$BldgType_singlefam_endtwnhse)
##
## 0 1
## 126 1334
housing2_raw$BldgType <- NULL
##13 HouseStyle
summary(housing2_raw$HouseStyle)
## 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer SLvl
## 154 14 726 8 11 445 37 65
housing2_raw$HouseStyle_level[housing2_raw$HouseStyle == "2Story" | housing2_raw$HouseStyle == "2.5Fin"] <- 1
housing2_raw$HouseStyle_level[!(housing2_raw$HouseStyle == "2Story" | housing2_raw$HouseStyle == "2.5Fin")] <- 0
table(housing2_raw$HouseStyle_level)
##
## 0 1
## 1007 453
housing2_raw$HouseStyle <- NULL
##14 RoofStyle
summary(housing2_raw$RoofStyle)
## Flat Gable Gambrel Hip Mansard Shed
## 13 1141 11 286 7 2
housing2_raw$RoofStyle_level[housing2_raw$RoofStyle == "Hip" | housing2_raw$RoofStyle == "Flat" | housing2_raw$RoofStyle == "Shed"] <- 1
housing2_raw$RoofStyle_level[!(housing2_raw$RoofStyle == "Hip" | housing2_raw$RoofStyle == "Flat" | housing2_raw$RoofStyle == "Shed")] <- 0
table(housing2_raw$RoofStyle_level)
##
## 0 1
## 1159 301
housing2_raw$RoofStyle <- NULL
##15 RoofMatl
summary(housing2_raw$RoofMatl)
## ClyTile CompShg Membran Metal Roll Tar&Grv WdShake WdShngl
## 1 1434 1 1 1 11 5 6
housing2_raw$RoofMatl_level[housing2_raw$RoofMatl == "WdShake" | housing2_raw$RoofMatl == "WdShngl" | housing2_raw$RoofMatl == "Membran"] <- 1
housing2_raw$RoofMatl_level[!(housing2_raw$RoofMatl == "WdShake" | housing2_raw$RoofMatl == "WdShngl" | housing2_raw$RoofMatl == "Membran")] <- 0
table(housing2_raw$RoofMatl_level)
##
## 0 1
## 1448 12
housing2_raw$RoofMatl <- NULL
##16 and 17Exterior1st and Exterior2nd
summary(housing2_raw$Exterior1st)
## AsbShng AsphShn BrkComm BrkFace CBlock CemntBd HdBoard ImStucc MetalSd
## 20 1 2 50 1 61 222 1 220
## Plywood Stone Stucco VinylSd Wd Sdng WdShing
## 108 2 25 515 206 26
summary(housing2_raw$Exterior2nd)
## AsbShng AsphShn Brk Cmn BrkFace CBlock CmentBd HdBoard ImStucc MetalSd
## 20 3 7 25 1 60 207 10 214
## Other Plywood Stone Stucco VinylSd Wd Sdng Wd Shng
## 1 142 5 26 504 197 38
## Not using these as there is not much difference in the SalePrice
housing2_raw$Exterior1st <- NULL
housing2_raw$Exterior2nd <- NULL
#18 MasVnrType
summary(housing2_raw$MasVnrType)
## BrkCmn BrkFace None Stone NA's
## 15 445 864 128 8
housing2_raw$MasVnrType_Stone[housing2_raw$MasVnrType == "Stone" & !is.na(housing2_raw$MasVnrType)] <- 1
housing2_raw$MasVnrType_Stone[!(housing2_raw$MasVnrType == "Stone" & !is.na(housing2_raw$MasVnrType))] <- 0
table(housing2_raw$MasVnrType_Stone)
##
## 0 1
## 1332 128
housing2_raw$MasVnrType <- NULL
#19 ExterQual
summary(housing2_raw$ExterQual)
## Ex Fa Gd TA
## 52 14 488 906
housing2_raw$ExterQual_level[housing2_raw$ExterQual == "Fa"] <- 1
housing2_raw$ExterQual_level[housing2_raw$ExterQual == "TA"] <- 2
housing2_raw$ExterQual_level[housing2_raw$ExterQual == "Gd"] <- 3
housing2_raw$ExterQual_level[housing2_raw$ExterQual == "Ex"] <- 4
table(housing2_raw$ExterQual_level)
##
## 1 2 3 4
## 14 906 488 52
housing2_raw$ExterQual <- NULL
#20 ExterCond
summary(housing2_raw$ExterCond)
## Ex Fa Gd Po TA
## 3 28 146 1 1282
housing2_raw$ExterCond_level[housing2_raw$ExterCond == "Po"] <- 1
housing2_raw$ExterCond_level[housing2_raw$ExterCond == "Fa"] <- 2
housing2_raw$ExterCond_level[housing2_raw$ExterCond == "TA"] <- 3
housing2_raw$ExterCond_level[housing2_raw$ExterCond == "Gd"] <- 4
housing2_raw$ExterCond_level[housing2_raw$ExterCond == "Ex"] <- 5
table(housing2_raw$ExterCond_level)
##
## 1 2 3 4 5
## 1 28 1282 146 3
housing2_raw$ExterCond <- NULL
#21 Foundation
summary(housing2_raw$Foundation)
## BrkTil CBlock PConc Slab Stone Wood
## 146 634 647 24 6 3
housing2_raw$Foundation_concrete[housing2_raw$Foundation == "PConc"] <- 1
housing2_raw$Foundation_concrete[!(housing2_raw$Foundation == "PConc")] <- 0
table(housing2_raw$Foundation_concrete)
##
## 0 1
## 813 647
housing2_raw$Foundation <- NULL
#22 BsmtQual
summary(housing2_raw$BsmtQual)
## Ex Fa Gd TA NA's
## 121 35 618 649 37
housing2_raw$BsmtQual_level[housing2_raw$BsmtQual == "Fa" & !is.na(housing2_raw$BsmtQual)] <- 2
housing2_raw$BsmtQual_level[housing2_raw$BsmtQual == "TA" & !is.na(housing2_raw$BsmtQual)] <- 3
housing2_raw$BsmtQual_level[housing2_raw$BsmtQual == "Gd" & !is.na(housing2_raw$BsmtQual)] <- 4
housing2_raw$BsmtQual_level[housing2_raw$BsmtQual == "Ex" & !is.na(housing2_raw$BsmtQual)] <- 5
housing2_raw$BsmtQual_level[is.na(housing2_raw$BsmtQual)] <- 1
housing2_raw$BsmtQual <- NULL
#23 BsmtCond
summary(housing2_raw$BsmtCond)
## Fa Gd Po TA NA's
## 45 65 2 1311 37
housing2_raw$BsmtCond_level[housing2_raw$BsmtCond == "Po" | is.na(housing2_raw$BsmtCond)] <- 1
housing2_raw$BsmtCond_level[housing2_raw$BsmtCond == "Fa" & !is.na(housing2_raw$BsmtCond)] <- 2
housing2_raw$BsmtCond_level[housing2_raw$BsmtCond == "TA" & !is.na(housing2_raw$BsmtCond)] <- 3
housing2_raw$BsmtCond_level[housing2_raw$BsmtCond == "Gd" & !is.na(housing2_raw$BsmtCond)] <- 4
housing2_raw$BsmtCond_level[housing2_raw$BsmtCond == "Ex" & !is.na(housing2_raw$BsmtCond)] <- 5
housing2_raw$BsmtCond <- NULL
#24 BsmtExposure
summary(housing2_raw$BsmtExposure)
## Av Gd Mn No NA's
## 221 134 114 953 38
housing2_raw$BsmtExposure_level[housing2_raw$BsmtExposure == "No" | is.na(housing2_raw$BsmtExposure)] <- 1
housing2_raw$BsmtExposure_level[housing2_raw$BsmtExposure == "Mn" & !is.na(housing2_raw$BsmtExposure)] <- 2
housing2_raw$BsmtExposure_level[housing2_raw$BsmtExposure == "Av" & !is.na(housing2_raw$BsmtExposure)] <- 3
housing2_raw$BsmtExposure_level[housing2_raw$BsmtExposure == "Gd" & !is.na(housing2_raw$BsmtExposure)] <- 4
table(housing2_raw$BsmtExposure_level)
##
## 1 2 3 4
## 991 114 221 134
housing2_raw$BsmtExposure <- NULL
## 25 and 26 - BsmtFinType1 and BsmtFinType2
### Not using these features as we have already considered the most important Basement
### features
housing2_raw$BsmtFinType1 <- NULL
housing2_raw$BsmtFinType2 <- NULL
## 27 - Heating
summary(housing2_raw$Heating)
## Floor GasA GasW Grav OthW Wall
## 1 1428 18 7 2 4
housing2_raw$Heating_type[housing2_raw$Heating == "GasW" | housing2_raw$Heating == "GasA"] <- 1
housing2_raw$Heating_type[!(housing2_raw$Heating == "GasW" | housing2_raw$Heating == "GasA")] <- 0
table(housing2_raw$Heating_type)
##
## 0 1
## 14 1446
housing2_raw$Heating <- NULL
## 28 - hEATINGqc
summary(housing2_raw$HeatingQC)
## Ex Fa Gd Po TA
## 741 49 241 1 428
housing2_raw$HeatingQC_level[housing2_raw$HeatingQC == "Po"] <- 1
housing2_raw$HeatingQC_level[housing2_raw$HeatingQC == "Fa"] <- 2
housing2_raw$HeatingQC_level[housing2_raw$HeatingQC == "TA"] <- 3
housing2_raw$HeatingQC_level[housing2_raw$HeatingQC == "Gd"] <- 4
housing2_raw$HeatingQC_level[housing2_raw$HeatingQC == "Ex"] <- 5
table(housing2_raw$HeatingQC_level)
##
## 1 2 3 4 5
## 1 49 428 241 741
housing2_raw$HeatingQC <- NULL
##29 - CentralAir
summary(housing2_raw$CentralAir)
## N Y
## 95 1365
housing2_raw$CentralAir_flag[housing2_raw$CentralAir == "Y"] <- 1
housing2_raw$CentralAir_flag[housing2_raw$CentralAir == "N"] <- 0
table(housing2_raw$CentralAir_flag)
##
## 0 1
## 95 1365
housing2_raw$CentralAir <- NULL
##30 - Electrical
summary(housing2_raw$Electrical)
## FuseA FuseF FuseP Mix SBrkr NA's
## 94 27 3 1 1334 1
housing2_raw$Electrical_SBrkr[housing2_raw$Electrical == "SBrkr" & !is.na(housing2_raw$Electrical)] <- 1
housing2_raw$Electrical_SBrkr[!(housing2_raw$Electrical == "SBrkr" & !is.na(housing2_raw$Electrical))] <- 0
table(housing2_raw$Electrical_SBrkr)
##
## 0 1
## 126 1334
housing2_raw$Electrical <- NULL
##31 - KitchenQual
summary(housing2_raw$KitchenQual)
## Ex Fa Gd TA
## 100 39 586 735
housing2_raw$KitchenQual_level[housing2_raw$KitchenQual == "Fa"] <- 1
housing2_raw$KitchenQual_level[housing2_raw$KitchenQual == "TA"] <- 2
housing2_raw$KitchenQual_level[housing2_raw$KitchenQual == "Gd"] <- 3
housing2_raw$KitchenQual_level[housing2_raw$KitchenQual == "Ex"] <- 4
table(housing2_raw$KitchenQual_level)
##
## 1 2 3 4
## 39 735 586 100
housing2_raw$KitchenQual <- NULL
##32 - fUNCTIONAL
## As seen in the box plots, we don't see much differences in the various category levels
## So we won't be using this feature
housing2_raw$Functional <- NULL
##33 - FireplaceQu
summary(housing2_raw$FireplaceQu)
## Ex Fa Gd Po TA NA's
## 24 33 380 20 313 690
housing2_raw$FireplaceQu_level[housing2_raw$FireplaceQu == "Po" | is.na(housing2_raw$FireplaceQu)] <- 1
housing2_raw$FireplaceQu_level[housing2_raw$FireplaceQu == "Fa" & !is.na(housing2_raw$FireplaceQu)] <- 2
housing2_raw$FireplaceQu_level[housing2_raw$FireplaceQu == "TA" & !is.na(housing2_raw$FireplaceQu)] <- 3
housing2_raw$FireplaceQu_level[housing2_raw$FireplaceQu == "Gd" & !is.na(housing2_raw$FireplaceQu)] <- 4
housing2_raw$FireplaceQu_level[housing2_raw$FireplaceQu == "Ex" & !is.na(housing2_raw$FireplaceQu)] <- 5
table(housing2_raw$FireplaceQu_level)
##
## 1 2 3 4 5
## 710 33 313 380 24
housing2_raw$FireplaceQu <- NULL
##34 - GarageType
summary(housing2_raw$GarageType)
## 2Types Attchd Basment BuiltIn CarPort Detchd NA's
## 6 870 19 88 9 387 81
housing2_raw$GarageType_within[(housing2_raw$GarageType == "Attchd" | housing2_raw$GarageType == "BuiltIn") & !is.na(housing2_raw$GarageType)] <- 1
housing2_raw$GarageType_within[!((housing2_raw$GarageType == "Attchd" | housing2_raw$GarageType == "BuiltIn") & !is.na(housing2_raw$GarageType))] <- 0
table(housing2_raw$GarageType_within)
##
## 0 1
## 502 958
housing2_raw$GarageType <- NULL
##35 - gARAGEfINISH
summary(housing2_raw$GarageFinish)
## Fin RFn Unf NA's
## 352 422 605 81
housing2_raw$GarageFinish_status[(housing2_raw$GarageFinish == "Fin" | housing2_raw$GarageFinish == "RFn") & !is.na(housing2_raw$GarageFinish)] <- 1
housing2_raw$GarageFinish_status[!((housing2_raw$GarageFinish == "Fin" | housing2_raw$GarageFinish == "RFn") & !is.na(housing2_raw$GarageFinish))] <- 0
table(housing2_raw$GarageFinish_status)
##
## 0 1
## 686 774
housing2_raw$GarageFinish <- NULL
##36 - GarageQual
summary(housing2_raw$GarageQual)
## Ex Fa Gd Po TA NA's
## 3 48 14 3 1311 81
housing2_raw$GarageQual_level[housing2_raw$GarageQual == "Po" | is.na(housing2_raw$GarageQual)] <- 1
housing2_raw$GarageQual_level[housing2_raw$GarageQual == "Fa" & !is.na(housing2_raw$GarageQual)] <- 2
housing2_raw$GarageQual_level[housing2_raw$GarageQual == "TA" & !is.na(housing2_raw$GarageQual)] <- 3
housing2_raw$GarageQual_level[housing2_raw$GarageQual == "Gd" & !is.na(housing2_raw$GarageQual)] <- 4
housing2_raw$GarageQual_level[housing2_raw$GarageQual == "Ex" & !is.na(housing2_raw$GarageQual)] <- 5
table(housing2_raw$GarageQual_level)
##
## 1 2 3 4 5
## 84 48 1311 14 3
housing2_raw$GarageQual <- NULL
##37 - GarageCond
summary(housing2_raw$GarageCond)
## Ex Fa Gd Po TA NA's
## 2 35 9 7 1326 81
housing2_raw$GarageCond_status[housing2_raw$GarageCond == "Po" | is.na(housing2_raw$GarageCond)] <- 1
housing2_raw$GarageCond_status[housing2_raw$GarageCond == "Fa" & !is.na(housing2_raw$GarageCond)] <- 2
housing2_raw$GarageCond_status[housing2_raw$GarageCond == "TA" & !is.na(housing2_raw$GarageCond)] <- 3
housing2_raw$GarageCond_status[housing2_raw$GarageCond == "Gd" & !is.na(housing2_raw$GarageCond)] <- 4
housing2_raw$GarageCond_status[housing2_raw$GarageCond == "Ex" & !is.na(housing2_raw$GarageCond)] <- 5
table(housing2_raw$GarageCond_status)
##
## 1 2 3 4 5
## 88 35 1326 9 2
housing2_raw$GarageCond <- NULL
##38 - PavedDrive
summary(housing2_raw$PavedDrive)
## N P Y
## 90 30 1340
housing2_raw$PavedDrive_flag[housing2_raw$PavedDrive == "Y" | housing2_raw$PavedDrive == "P"] <- 1
housing2_raw$PavedDrive_flag[housing2_raw$PavedDrive == "N"] <- 0
table(housing2_raw$PavedDrive_flag)
##
## 0 1
## 90 1370
housing2_raw$PavedDrive <- NULL
##39 - PoolQC
summary(housing2_raw$PoolQC)
## Ex Fa Gd NA's
## 2 2 3 1453
housing2_raw$PoolQC_level[housing2_raw$PoolQC == "Fa" | is.na(housing2_raw$PoolQC)] <- 1
housing2_raw$PoolQC_level[housing2_raw$PoolQC == "Gd" & !is.na(housing2_raw$PoolQC)] <- 2
housing2_raw$PoolQC_level[housing2_raw$PoolQC == "Ex" & !is.na(housing2_raw$PoolQC)] <- 3
table(housing2_raw$PoolQC_level)
##
## 1 2 3
## 1455 3 2
housing2_raw$PoolQC <- NULL
##40 - Fence
summary(housing2_raw$Fence)
## GdPrv GdWo MnPrv MnWw NA's
## 59 54 157 11 1179
housing2_raw$Fence_good[housing2_raw$Fence == "GdPrv" & !is.na(housing2_raw$Fence)] <- 1
housing2_raw$Fence_good[!(housing2_raw$Fence == "GdPrv" & !is.na(housing2_raw$Fence))] <- 0
table(housing2_raw$Fence_good)
##
## 0 1
## 1401 59
housing2_raw$Fence <- NULL
##41 - MiscFeature
### nOT MUCH difference in the saleprice in the levels, hence we won't use it
housing2_raw$MiscFeature <- NULL
##42 - MoSold
### nOT MUCH difference in the saleprice in the levels, hence we won't use it
housing2_raw$MoSold <- NULL
##43 - sALEtYPE
summary(housing2_raw$SaleType)
## COD Con ConLD ConLI ConLw CWD New Oth WD
## 43 2 9 5 5 4 122 3 1267
housing2_raw$SaleType_value[housing2_raw$SaleType == "New" | housing2_raw$SaleType == "Con"] <- 3
housing2_raw$SaleType_value[housing2_raw$SaleType == "ConLw" | housing2_raw$SaleType == "CWD" | housing2_raw$SaleType == "WD"] <- 2
housing2_raw$SaleType_value[housing2_raw$SaleType == "COD" | housing2_raw$SaleType == "ConLD" |
housing2_raw$SaleType == "ConLI" | housing2_raw$SaleType == "Oth"] <- 1
table(housing2_raw$SaleType_value)
##
## 1 2 3
## 60 1276 124
housing2_raw$SaleType <- NULL
## 44 - SaleCondition
### Dropping this feature and not using it
housing2_raw$SaleCondition <- NULL
########## All features are now ready
dim(housing2_raw)
## [1] 1460 70
summary(housing2_raw)
## MSSubClass LotFrontage LotArea OverallQual
## Min. : 20.0 Min. : 21.00 Min. : 1300 Min. : 1.000
## 1st Qu.: 20.0 1st Qu.: 60.00 1st Qu.: 7554 1st Qu.: 5.000
## Median : 50.0 Median : 69.00 Median : 9478 Median : 6.000
## Mean : 56.9 Mean : 69.86 Mean : 10517 Mean : 6.099
## 3rd Qu.: 70.0 3rd Qu.: 79.00 3rd Qu.: 11602 3rd Qu.: 7.000
## Max. :190.0 Max. :313.00 Max. :215245 Max. :10.000
## OverallCond YearBuilt YearRemodAdd MasVnrArea
## Min. :1.000 Min. :1872 Min. :1950 Min. : 0.0
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 1st Qu.: 0.0
## Median :5.000 Median :1973 Median :1994 Median : 0.0
## Mean :5.575 Mean :1971 Mean :1985 Mean : 103.1
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 3rd Qu.: 164.2
## Max. :9.000 Max. :2010 Max. :2010 Max. :1600.0
## BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## Min. : 0.0 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 0.0 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8
## Median : 383.5 Median : 0.00 Median : 477.5 Median : 991.5
## Mean : 443.6 Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 712.2 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :5644.0 Max. :1474.00 Max. :2336.0 Max. :6110.0
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0 Min. : 0.000 Min. : 334
## 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130
## Median :1087 Median : 0 Median : 0.000 Median :1464
## Mean :1163 Mean : 347 Mean : 5.845 Mean :1515
## 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777
## Max. :4692 Max. :2065 Max. :572.000 Max. :5642
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4253 Mean :0.05753 Mean :1.565 Mean :0.3829
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :3.000 Max. :2.0000
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces
## Min. :0.000 Min. :0.000 Min. : 2.000 Min. :0.000
## 1st Qu.:2.000 1st Qu.:1.000 1st Qu.: 5.000 1st Qu.:0.000
## Median :3.000 Median :1.000 Median : 6.000 Median :1.000
## Mean :2.866 Mean :1.047 Mean : 6.518 Mean :0.613
## 3rd Qu.:3.000 3rd Qu.:1.000 3rd Qu.: 7.000 3rd Qu.:1.000
## Max. :8.000 Max. :3.000 Max. :14.000 Max. :3.000
## GarageYrBlt GarageCars GarageArea WoodDeckSF
## Min. :1900 Min. :0.000 Min. : 0.0 Min. : 0.00
## 1st Qu.:1962 1st Qu.:1.000 1st Qu.: 334.5 1st Qu.: 0.00
## Median :1980 Median :2.000 Median : 480.0 Median : 0.00
## Mean :1979 Mean :1.767 Mean : 473.0 Mean : 94.24
## 3rd Qu.:2001 3rd Qu.:2.000 3rd Qu.: 576.0 3rd Qu.:168.00
## Max. :2010 Max. :4.000 Max. :1418.0 Max. :857.00
## OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Median : 25.00 Median : 0.00 Median : 0.00 Median : 0.00
## Mean : 46.66 Mean : 21.95 Mean : 3.41 Mean : 15.06
## 3rd Qu.: 68.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00
## Max. :547.00 Max. :552.00 Max. :508.00 Max. :480.00
## PoolArea MiscVal YrSold SalePrice
## Min. : 0.000 Min. : 0.00 Min. :2006 Min. : 34900
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.:2007 1st Qu.:129975
## Median : 0.000 Median : 0.00 Median :2008 Median :163000
## Mean : 2.759 Mean : 43.49 Mean :2008 Mean :180921
## 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.:2009 3rd Qu.:214000
## Max. :738.000 Max. :15500.00 Max. :2010 Max. :755000
## Zone street_paved Paved_alley Lot_regular
## Min. :1.000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :3.000 Median :1.0000 Median :0.00000 Median :1.0000
## Mean :2.871 Mean :0.9959 Mean :0.02808 Mean :0.6336
## 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :4.000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## LandContour_level LotConfig_culdsac_fr3 Neighborhood_highprice
## Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.8979 Mean :0.06712 Mean :0.2445
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.0000
## Condition1_good Condition2_good BldgType_singlefam_endtwnhse
## Min. :0.00000 Min. :0.000000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000000 1st Qu.:1.0000
## Median :0.00000 Median :0.000000 Median :1.0000
## Mean :0.01849 Mean :0.002055 Mean :0.9137
## 3rd Qu.:0.00000 3rd Qu.:0.000000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.000000 Max. :1.0000
## HouseStyle_level RoofStyle_level RoofMatl_level MasVnrType_Stone
## Min. :0.0000 Min. :0.0000 Min. :0.000000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.000000 Median :0.00000
## Mean :0.3103 Mean :0.2062 Mean :0.008219 Mean :0.08767
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.000000 Max. :1.00000
## ExterQual_level ExterCond_level Foundation_concrete BsmtQual_level
## Min. :1.000 Min. :1.000 Min. :0.0000 Min. :1.000
## 1st Qu.:2.000 1st Qu.:3.000 1st Qu.:0.0000 1st Qu.:3.000
## Median :2.000 Median :3.000 Median :0.0000 Median :4.000
## Mean :2.396 Mean :3.084 Mean :0.4432 Mean :3.514
## 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.:1.0000 3rd Qu.:4.000
## Max. :4.000 Max. :5.000 Max. :1.0000 Max. :5.000
## BsmtCond_level BsmtExposure_level Heating_type HeatingQC_level
## Min. :1.00 Min. :1.000 Min. :0.0000 Min. :1.000
## 1st Qu.:3.00 1st Qu.:1.000 1st Qu.:1.0000 1st Qu.:3.000
## Median :3.00 Median :1.000 Median :1.0000 Median :5.000
## Mean :2.96 Mean :1.656 Mean :0.9904 Mean :4.145
## 3rd Qu.:3.00 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:5.000
## Max. :4.00 Max. :4.000 Max. :1.0000 Max. :5.000
## CentralAir_flag Electrical_SBrkr KitchenQual_level FireplaceQu_level
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:2.000 1st Qu.:1.000
## Median :1.0000 Median :1.0000 Median :2.000 Median :2.000
## Mean :0.9349 Mean :0.9137 Mean :2.512 Mean :2.298
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:4.000
## Max. :1.0000 Max. :1.0000 Max. :4.000 Max. :5.000
## GarageType_within GarageFinish_status GarageQual_level GarageCond_status
## Min. :0.0000 Min. :0.0000 Min. :1.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:3.000
## Median :1.0000 Median :1.0000 Median :3.000 Median :3.000
## Mean :0.6562 Mean :0.5301 Mean :2.866 Mean :2.864
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :1.0000 Max. :1.0000 Max. :5.000 Max. :5.000
## PavedDrive_flag PoolQC_level Fence_good SaleType_value
## Min. :0.0000 Min. :1.000 Min. :0.00000 Min. :1.000
## 1st Qu.:1.0000 1st Qu.:1.000 1st Qu.:0.00000 1st Qu.:2.000
## Median :1.0000 Median :1.000 Median :0.00000 Median :2.000
## Mean :0.9384 Mean :1.005 Mean :0.04041 Mean :2.044
## 3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :1.0000 Max. :3.000 Max. :1.00000 Max. :3.000
Checking Missing data again - to ensure all good, and there is no missing data
### Blank data
train_missing_df2 <- data.frame(apply(housing2_raw, 2, function(x) length(which(x == ''))))
colnames(train_missing_df2) <- "counts"
train_missing_df2$column_name <- rownames(train_missing_df2)
train_missing_df2 <- data.frame(train_missing_df2[train_missing_df2$counts > 0,])
print(train_missing_df2)
## [1] counts column_name
## <0 rows> (or 0-length row.names)
### NA data
train_na_df2 <- data.frame(apply(housing2_raw, 2, function(x) length(which(is.na(x)))))
colnames(train_na_df2) <- "counts"
train_na_df2$column_name <- rownames(train_na_df2)
train_na_df2 <- data.frame(train_na_df2[train_na_df2$counts > 0,])
print(train_na_df2)
## [1] counts column_name
## <0 rows> (or 0-length row.names)
Now there is no missing data or NA’s.
Taking the SalePrice - the target variable to the front of the dataframe by reordering the features in the dataframe.
housing2_raw <- housing2_raw[,c(36,1:35,37:70)]
colnames(housing2_raw)
## [1] "SalePrice" "MSSubClass"
## [3] "LotFrontage" "LotArea"
## [5] "OverallQual" "OverallCond"
## [7] "YearBuilt" "YearRemodAdd"
## [9] "MasVnrArea" "BsmtFinSF1"
## [11] "BsmtFinSF2" "BsmtUnfSF"
## [13] "TotalBsmtSF" "X1stFlrSF"
## [15] "X2ndFlrSF" "LowQualFinSF"
## [17] "GrLivArea" "BsmtFullBath"
## [19] "BsmtHalfBath" "FullBath"
## [21] "HalfBath" "BedroomAbvGr"
## [23] "KitchenAbvGr" "TotRmsAbvGrd"
## [25] "Fireplaces" "GarageYrBlt"
## [27] "GarageCars" "GarageArea"
## [29] "WoodDeckSF" "OpenPorchSF"
## [31] "EnclosedPorch" "X3SsnPorch"
## [33] "ScreenPorch" "PoolArea"
## [35] "MiscVal" "YrSold"
## [37] "Zone" "street_paved"
## [39] "Paved_alley" "Lot_regular"
## [41] "LandContour_level" "LotConfig_culdsac_fr3"
## [43] "Neighborhood_highprice" "Condition1_good"
## [45] "Condition2_good" "BldgType_singlefam_endtwnhse"
## [47] "HouseStyle_level" "RoofStyle_level"
## [49] "RoofMatl_level" "MasVnrType_Stone"
## [51] "ExterQual_level" "ExterCond_level"
## [53] "Foundation_concrete" "BsmtQual_level"
## [55] "BsmtCond_level" "BsmtExposure_level"
## [57] "Heating_type" "HeatingQC_level"
## [59] "CentralAir_flag" "Electrical_SBrkr"
## [61] "KitchenQual_level" "FireplaceQu_level"
## [63] "GarageType_within" "GarageFinish_status"
## [65] "GarageQual_level" "GarageCond_status"
## [67] "PavedDrive_flag" "PoolQC_level"
## [69] "Fence_good" "SaleType_value"
Splitting the data into test and train datasets
n <- nrow(housing2_raw)
set.seed(123)
housing2_random_index <- housing2_raw[sample(n), ]
housing2.train.df <- housing2_raw[1:as.integer(0.8*n),]
housing2.test.df <- housing2_raw[as.integer(0.8*n +1):n, ]
nrow(housing2.train.df)
## [1] 1168
nrow(housing2.test.df)
## [1] 292
Building first model - taking all the features.
#### Linear Model 1
g1 <- lm(SalePrice ~ .,
data = housing2.train.df)
summary(g1)
##
## Call:
## lm(formula = SalePrice ~ ., data = housing2.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -241143 -14048 48 12556 202160
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.698e+05 1.267e+06 -0.687 0.492509
## MSSubClass -1.334e+02 3.071e+01 -4.343 1.53e-05 ***
## LotFrontage 1.753e+02 5.054e+01 3.468 0.000545 ***
## LotArea 3.280e-01 8.923e-02 3.676 0.000248 ***
## OverallQual 9.679e+03 1.173e+03 8.251 4.44e-16 ***
## OverallCond 6.158e+03 1.022e+03 6.025 2.30e-09 ***
## YearBuilt 2.028e+02 7.296e+01 2.780 0.005529 **
## YearRemodAdd -2.672e+01 6.754e+01 -0.396 0.692474
## MasVnrArea 2.480e+01 5.418e+00 4.577 5.26e-06 ***
## BsmtFinSF1 3.710e+01 4.906e+00 7.563 8.32e-14 ***
## BsmtFinSF2 2.481e+01 6.687e+00 3.711 0.000217 ***
## BsmtUnfSF 1.782e+01 4.495e+00 3.963 7.88e-05 ***
## TotalBsmtSF NA NA NA NA
## X1stFlrSF 4.301e+01 5.698e+00 7.547 9.29e-14 ***
## X2ndFlrSF 5.245e+01 5.261e+00 9.968 < 2e-16 ***
## LowQualFinSF 2.918e+01 1.731e+01 1.685 0.092236 .
## GrLivArea NA NA NA NA
## BsmtFullBath 1.023e+03 2.357e+03 0.434 0.664252
## BsmtHalfBath -1.960e+03 3.514e+03 -0.558 0.577221
## FullBath 4.049e+03 2.577e+03 1.571 0.116456
## HalfBath 3.604e+03 2.485e+03 1.450 0.147213
## BedroomAbvGr -8.567e+03 1.573e+03 -5.445 6.39e-08 ***
## KitchenAbvGr -2.122e+04 5.220e+03 -4.064 5.16e-05 ***
## TotRmsAbvGrd 5.732e+03 1.146e+03 5.003 6.57e-07 ***
## Fireplaces 7.928e+02 2.418e+03 0.328 0.743099
## GarageYrBlt 4.473e+01 6.616e+01 0.676 0.499122
## GarageCars -5.776e+02 2.661e+03 -0.217 0.828190
## GarageArea 2.678e+01 9.143e+00 2.929 0.003472 **
## WoodDeckSF 1.640e+01 7.330e+00 2.237 0.025504 *
## OpenPorchSF -6.594e+00 1.398e+01 -0.472 0.637241
## EnclosedPorch 7.004e+00 1.536e+01 0.456 0.648510
## X3SsnPorch -7.119e+00 2.672e+01 -0.266 0.789933
## ScreenPorch 4.480e+01 1.535e+01 2.918 0.003592 **
## PoolArea 3.365e+01 4.271e+01 0.788 0.430871
## MiscVal 6.723e-01 1.656e+00 0.406 0.684844
## YrSold 1.816e+02 6.296e+02 0.288 0.773078
## Zone 7.598e+02 2.284e+03 0.333 0.739400
## street_paved 1.788e+04 1.307e+04 1.368 0.171639
## Paved_alley -8.342e+03 5.435e+03 -1.535 0.125109
## Lot_regular 1.237e+03 1.890e+03 0.655 0.512903
## LandContour_level 2.228e+03 2.946e+03 0.756 0.449607
## LotConfig_culdsac_fr3 7.186e+03 3.561e+03 2.018 0.043833 *
## Neighborhood_highprice 1.869e+04 2.588e+03 7.221 9.61e-13 ***
## Condition1_good -1.900e+04 6.099e+03 -3.115 0.001888 **
## Condition2_good -1.276e+05 1.745e+04 -7.315 4.96e-13 ***
## BldgType_singlefam_endtwnhse -4.714e+03 4.641e+03 -1.016 0.309993
## HouseStyle_level -6.071e+03 3.638e+03 -1.669 0.095428 .
## RoofStyle_level 1.192e+03 2.220e+03 0.537 0.591362
## RoofMatl_level 3.031e+04 8.915e+03 3.400 0.000699 ***
## MasVnrType_Stone 3.873e+03 3.352e+03 1.155 0.248212
## ExterQual_level 8.281e+03 2.516e+03 3.291 0.001030 **
## ExterCond_level -1.670e+03 2.521e+03 -0.662 0.507977
## Foundation_concrete 3.905e+03 2.614e+03 1.494 0.135499
## BsmtQual_level 4.280e+03 2.016e+03 2.122 0.034023 *
## BsmtCond_level -7.749e+03 2.576e+03 -3.009 0.002685 **
## BsmtExposure_level 4.026e+03 9.390e+02 4.288 1.96e-05 ***
## Heating_type -6.979e+02 1.044e+04 -0.067 0.946715
## HeatingQC_level 3.259e+02 1.138e+03 0.286 0.774722
## CentralAir_flag -3.157e+03 4.121e+03 -0.766 0.443729
## Electrical_SBrkr -3.031e+03 3.252e+03 -0.932 0.351433
## KitchenQual_level 7.493e+03 1.965e+03 3.813 0.000145 ***
## FireplaceQu_level 1.163e+03 1.201e+03 0.969 0.332803
## GarageType_within -4.687e+03 2.582e+03 -1.815 0.069762 .
## GarageFinish_status -2.132e+03 2.428e+03 -0.878 0.380029
## GarageQual_level 6.342e+03 4.083e+03 1.553 0.120694
## GarageCond_status -9.683e+03 4.183e+03 -2.315 0.020821 *
## PavedDrive_flag 1.209e+03 4.286e+03 0.282 0.777997
## PoolQC_level -3.623e+04 1.874e+04 -1.934 0.053412 .
## Fence_good -5.646e+03 4.261e+03 -1.325 0.185371
## SaleType_value 1.687e+04 2.539e+03 6.644 4.79e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26850 on 1100 degrees of freedom
## Multiple R-squared: 0.8906, Adjusted R-squared: 0.8839
## F-statistic: 133.6 on 67 and 1100 DF, p-value: < 2.2e-16
As we see above, many of the variables are not statistically significant. So we are only taking the significant features.
g2 <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond +
YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + X1stFlrSF +
X2ndFlrSF + LowQualFinSF + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
GarageArea + WoodDeckSF + ScreenPorch + Paved_alley + LotConfig_culdsac_fr3 +
Neighborhood_highprice + Condition1_good + Condition2_good + RoofMatl_level +
ExterQual_level + BsmtQual_level + BsmtCond_level + BsmtExposure_level +
KitchenQual_level + PoolQC_level + SaleType_value + GarageCond_status,
data = housing2.train.df)
summary(g2)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 +
## BsmtFinSF2 + BsmtUnfSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF +
## BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageArea +
## WoodDeckSF + ScreenPorch + Paved_alley + LotConfig_culdsac_fr3 +
## Neighborhood_highprice + Condition1_good + Condition2_good +
## RoofMatl_level + ExterQual_level + BsmtQual_level + BsmtCond_level +
## BsmtExposure_level + KitchenQual_level + PoolQC_level + SaleType_value +
## GarageCond_status, data = housing2.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -245534 -13523 11 12976 202924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.523e+05 8.912e+04 -5.076 4.51e-07 ***
## MSSubClass -1.214e+02 2.490e+01 -4.876 1.24e-06 ***
## LotFrontage 1.424e+02 4.767e+01 2.988 0.002872 **
## LotArea 2.955e-01 8.371e-02 3.530 0.000432 ***
## OverallQual 9.920e+03 1.116e+03 8.885 < 2e-16 ***
## OverallCond 5.274e+03 8.434e+02 6.253 5.70e-10 ***
## YearBuilt 1.978e+02 4.562e+01 4.336 1.58e-05 ***
## MasVnrArea 2.498e+01 5.100e+00 4.899 1.10e-06 ***
## BsmtFinSF1 3.710e+01 4.515e+00 8.217 5.66e-16 ***
## BsmtFinSF2 2.311e+01 6.443e+00 3.586 0.000350 ***
## BsmtUnfSF 1.747e+01 4.387e+00 3.981 7.31e-05 ***
## X1stFlrSF 4.630e+01 5.203e+00 8.900 < 2e-16 ***
## X2ndFlrSF 5.102e+01 3.787e+00 13.473 < 2e-16 ***
## LowQualFinSF 3.242e+01 1.670e+01 1.941 0.052532 .
## BedroomAbvGr -8.500e+03 1.477e+03 -5.754 1.12e-08 ***
## KitchenAbvGr -1.749e+04 4.497e+03 -3.889 0.000106 ***
## TotRmsAbvGrd 5.842e+03 1.111e+03 5.259 1.73e-07 ***
## GarageArea 2.841e+01 5.889e+00 4.825 1.59e-06 ***
## WoodDeckSF 1.325e+01 7.085e+00 1.871 0.061651 .
## ScreenPorch 4.749e+01 1.494e+01 3.180 0.001515 **
## Paved_alley -7.458e+03 5.118e+03 -1.457 0.145294
## LotConfig_culdsac_fr3 5.427e+03 3.348e+03 1.621 0.105280
## Neighborhood_highprice 1.980e+04 2.376e+03 8.334 2.23e-16 ***
## Condition1_good -1.949e+04 5.972e+03 -3.263 0.001134 **
## Condition2_good -1.244e+05 1.677e+04 -7.419 2.31e-13 ***
## RoofMatl_level 2.878e+04 8.687e+03 3.312 0.000954 ***
## ExterQual_level 9.867e+03 2.397e+03 4.117 4.12e-05 ***
## BsmtQual_level 4.747e+03 1.925e+03 2.466 0.013822 *
## BsmtCond_level -8.703e+03 2.451e+03 -3.550 0.000400 ***
## BsmtExposure_level 3.800e+03 9.034e+02 4.206 2.80e-05 ***
## KitchenQual_level 7.559e+03 1.872e+03 4.038 5.76e-05 ***
## PoolQC_level -2.865e+04 1.421e+04 -2.015 0.044100 *
## SaleType_value 1.786e+04 2.445e+03 7.304 5.25e-13 ***
## GarageCond_status -5.917e+03 1.956e+03 -3.025 0.002540 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26800 on 1134 degrees of freedom
## Multiple R-squared: 0.8876, Adjusted R-squared: 0.8844
## F-statistic: 271.5 on 33 and 1134 DF, p-value: < 2.2e-16
Model diagnostic plots
#resid_panel(g2, plots='default', smoother = TRUE)
plot(g2)
## Warning: not plotting observations with leverage one:
## 198
## Warning: not plotting observations with leverage one:
## 198
Removing statistically insignificant features further.
g3 <- lm(SalePrice ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond +
YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + X1stFlrSF +
X2ndFlrSF + LowQualFinSF + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
GarageArea + WoodDeckSF + ScreenPorch +
Neighborhood_highprice + Condition1_good + Condition2_good + RoofMatl_level +
ExterQual_level + BsmtQual_level + BsmtCond_level + BsmtExposure_level +
KitchenQual_level + PoolQC_level + SaleType_value + GarageCond_status,
data = housing2.train.df)
summary(g3)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 +
## BsmtFinSF2 + BsmtUnfSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF +
## BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageArea +
## WoodDeckSF + ScreenPorch + Neighborhood_highprice + Condition1_good +
## Condition2_good + RoofMatl_level + ExterQual_level + BsmtQual_level +
## BsmtCond_level + BsmtExposure_level + KitchenQual_level +
## PoolQC_level + SaleType_value + GarageCond_status, data = housing2.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -246143 -13827 -62 12817 203286
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.696e+05 8.888e+04 -5.283 1.52e-07 ***
## MSSubClass -1.276e+02 2.477e+01 -5.153 3.01e-07 ***
## LotFrontage 1.365e+02 4.669e+01 2.923 0.003531 **
## LotArea 3.218e-01 8.264e-02 3.894 0.000105 ***
## OverallQual 9.933e+03 1.118e+03 8.887 < 2e-16 ***
## OverallCond 5.380e+03 8.431e+02 6.381 2.55e-10 ***
## YearBuilt 2.063e+02 4.550e+01 4.534 6.40e-06 ***
## MasVnrArea 2.439e+01 5.099e+00 4.784 1.95e-06 ***
## BsmtFinSF1 3.675e+01 4.518e+00 8.133 1.09e-15 ***
## BsmtFinSF2 2.296e+01 6.451e+00 3.559 0.000387 ***
## BsmtUnfSF 1.687e+01 4.385e+00 3.847 0.000126 ***
## X1stFlrSF 4.659e+01 5.198e+00 8.963 < 2e-16 ***
## X2ndFlrSF 5.084e+01 3.770e+00 13.486 < 2e-16 ***
## LowQualFinSF 3.089e+01 1.669e+01 1.851 0.064413 .
## BedroomAbvGr -8.620e+03 1.477e+03 -5.837 6.93e-09 ***
## KitchenAbvGr -1.753e+04 4.503e+03 -3.892 0.000105 ***
## TotRmsAbvGrd 6.020e+03 1.103e+03 5.459 5.88e-08 ***
## GarageArea 2.868e+01 5.895e+00 4.866 1.30e-06 ***
## WoodDeckSF 1.392e+01 7.060e+00 1.972 0.048852 *
## ScreenPorch 4.917e+01 1.494e+01 3.291 0.001027 **
## Neighborhood_highprice 1.938e+04 2.355e+03 8.228 5.18e-16 ***
## Condition1_good -1.952e+04 5.963e+03 -3.273 0.001095 **
## Condition2_good -1.248e+05 1.678e+04 -7.439 1.99e-13 ***
## RoofMatl_level 2.828e+04 8.692e+03 3.253 0.001174 **
## ExterQual_level 1.001e+04 2.399e+03 4.171 3.26e-05 ***
## BsmtQual_level 4.837e+03 1.927e+03 2.510 0.012207 *
## BsmtCond_level -8.732e+03 2.455e+03 -3.558 0.000390 ***
## BsmtExposure_level 3.941e+03 9.021e+02 4.369 1.36e-05 ***
## KitchenQual_level 7.324e+03 1.871e+03 3.914 9.64e-05 ***
## PoolQC_level -2.797e+04 1.423e+04 -1.966 0.049577 *
## SaleType_value 1.755e+04 2.443e+03 7.182 1.24e-12 ***
## GarageCond_status -6.044e+03 1.957e+03 -3.088 0.002062 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26840 on 1136 degrees of freedom
## Multiple R-squared: 0.8872, Adjusted R-squared: 0.8841
## F-statistic: 288.1 on 31 and 1136 DF, p-value: < 2.2e-16
Model diagnostic plots
#resid_panel(g3, plots='default', smoother = TRUE)
plot(g3)
## Warning: not plotting observations with leverage one:
## 198
## Warning: not plotting observations with leverage one:
## 198
As residuals are a curve shape, we will try taking the log of the SalePrice, which is the target variable.
### Taking log
g4 <- lm(log(SalePrice) ~ MSSubClass + LotFrontage + LotArea + OverallQual +
OverallCond +
YearBuilt + MasVnrArea + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + X1stFlrSF +
X2ndFlrSF + LowQualFinSF + BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd +
GarageArea + WoodDeckSF + ScreenPorch +
Neighborhood_highprice + Condition1_good + Condition2_good + RoofMatl_level
+ ExterQual_level + BsmtQual_level + BsmtCond_level + BsmtExposure_level +
KitchenQual_level + PoolQC_level + SaleType_value + GarageCond_status,
data = housing2.train.df)
summary(g4)
##
## Call:
## lm(formula = log(SalePrice) ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + MasVnrArea + BsmtFinSF1 +
## BsmtFinSF2 + BsmtUnfSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF +
## BedroomAbvGr + KitchenAbvGr + TotRmsAbvGrd + GarageArea +
## WoodDeckSF + ScreenPorch + Neighborhood_highprice + Condition1_good +
## Condition2_good + RoofMatl_level + ExterQual_level + BsmtQual_level +
## BsmtCond_level + BsmtExposure_level + KitchenQual_level +
## PoolQC_level + SaleType_value + GarageCond_status, data = housing2.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.09067 -0.05671 0.00656 0.06939 0.66094
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.720e+00 4.255e-01 11.092 < 2e-16 ***
## MSSubClass -4.172e-04 1.186e-04 -3.518 0.000452 ***
## LotFrontage 9.495e-04 2.236e-04 4.247 2.34e-05 ***
## LotArea 1.699e-06 3.957e-07 4.292 1.92e-05 ***
## OverallQual 6.713e-02 5.352e-03 12.544 < 2e-16 ***
## OverallCond 5.145e-02 4.037e-03 12.745 < 2e-16 ***
## YearBuilt 2.860e-03 2.179e-04 13.126 < 2e-16 ***
## MasVnrArea -2.654e-05 2.441e-05 -1.087 0.277288
## BsmtFinSF1 1.484e-04 2.164e-05 6.857 1.15e-11 ***
## BsmtFinSF2 9.940e-05 3.089e-05 3.218 0.001327 **
## BsmtUnfSF 5.842e-05 2.100e-05 2.782 0.005484 **
## X1stFlrSF 2.829e-04 2.489e-05 11.367 < 2e-16 ***
## X2ndFlrSF 2.552e-04 1.805e-05 14.135 < 2e-16 ***
## LowQualFinSF 1.642e-04 7.991e-05 2.054 0.040163 *
## BedroomAbvGr -5.822e-03 7.071e-03 -0.823 0.410502
## KitchenAbvGr -5.071e-02 2.156e-02 -2.352 0.018858 *
## TotRmsAbvGrd 1.434e-02 5.280e-03 2.716 0.006705 **
## GarageArea 1.105e-04 2.822e-05 3.915 9.56e-05 ***
## WoodDeckSF 8.657e-05 3.380e-05 2.561 0.010564 *
## ScreenPorch 3.126e-04 7.152e-05 4.371 1.35e-05 ***
## Neighborhood_highprice 8.910e-02 1.128e-02 7.902 6.47e-15 ***
## Condition1_good -4.246e-02 2.855e-02 -1.487 0.137279
## Condition2_good -5.994e-01 8.035e-02 -7.459 1.72e-13 ***
## RoofMatl_level -1.443e-02 4.162e-02 -0.347 0.728939
## ExterQual_level 1.642e-02 1.149e-02 1.430 0.153015
## BsmtQual_level 2.012e-02 9.227e-03 2.181 0.029425 *
## BsmtCond_level -4.056e-03 1.175e-02 -0.345 0.730067
## BsmtExposure_level 7.582e-03 4.320e-03 1.755 0.079467 .
## KitchenQual_level 2.543e-02 8.961e-03 2.838 0.004622 **
## PoolQC_level -7.778e-02 6.813e-02 -1.142 0.253859
## SaleType_value 5.495e-02 1.170e-02 4.697 2.96e-06 ***
## GarageCond_status 2.645e-02 9.371e-03 2.822 0.004852 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1285 on 1136 degrees of freedom
## Multiple R-squared: 0.9003, Adjusted R-squared: 0.8976
## F-statistic: 330.9 on 31 and 1136 DF, p-value: < 2.2e-16
Model diagnostic plots
plot(g4)
## Warning: not plotting observations with leverage one:
## 198
## Warning: not plotting observations with leverage one:
## 198
Removing further the features which are statistically insignificant
## Removing further
g6 <- lm(log(SalePrice) ~ MSSubClass + LotFrontage + LotArea + OverallQual + OverallCond +
YearBuilt + BsmtFinSF1 + BsmtFinSF2 + BsmtUnfSF + X1stFlrSF +
X2ndFlrSF + LowQualFinSF + KitchenAbvGr + TotRmsAbvGrd +
GarageArea + WoodDeckSF + ScreenPorch +
Neighborhood_highprice + Condition2_good +
BsmtQual_level + BsmtExposure_level +
KitchenQual_level + SaleType_value + GarageCond_status,
data = housing2.train.df)
summary(g6)
##
## Call:
## lm(formula = log(SalePrice) ~ MSSubClass + LotFrontage + LotArea +
## OverallQual + OverallCond + YearBuilt + BsmtFinSF1 + BsmtFinSF2 +
## BsmtUnfSF + X1stFlrSF + X2ndFlrSF + LowQualFinSF + KitchenAbvGr +
## TotRmsAbvGrd + GarageArea + WoodDeckSF + ScreenPorch + Neighborhood_highprice +
## Condition2_good + BsmtQual_level + BsmtExposure_level + KitchenQual_level +
## SaleType_value + GarageCond_status, data = housing2.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.09737 -0.05546 0.00846 0.07017 0.70361
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.587e+00 4.021e-01 11.406 < 2e-16 ***
## MSSubClass -4.306e-04 1.181e-04 -3.645 0.000280 ***
## LotFrontage 8.575e-04 2.185e-04 3.924 9.22e-05 ***
## LotArea 1.692e-06 3.938e-07 4.296 1.88e-05 ***
## OverallQual 6.884e-02 5.162e-03 13.335 < 2e-16 ***
## OverallCond 5.109e-02 3.909e-03 13.072 < 2e-16 ***
## YearBuilt 2.887e-03 2.104e-04 13.722 < 2e-16 ***
## BsmtFinSF1 1.421e-04 2.024e-05 7.021 3.76e-12 ***
## BsmtFinSF2 9.506e-05 2.938e-05 3.236 0.001248 **
## BsmtUnfSF 5.479e-05 1.961e-05 2.794 0.005287 **
## X1stFlrSF 2.784e-04 2.368e-05 11.753 < 2e-16 ***
## X2ndFlrSF 2.472e-04 1.758e-05 14.064 < 2e-16 ***
## LowQualFinSF 1.494e-04 7.749e-05 1.929 0.054020 .
## KitchenAbvGr -4.621e-02 2.141e-02 -2.158 0.031108 *
## TotRmsAbvGrd 1.319e-02 4.674e-03 2.821 0.004864 **
## GarageArea 1.115e-04 2.803e-05 3.977 7.42e-05 ***
## WoodDeckSF 8.091e-05 3.363e-05 2.406 0.016276 *
## ScreenPorch 3.091e-04 7.130e-05 4.336 1.58e-05 ***
## Neighborhood_highprice 9.351e-02 1.097e-02 8.524 < 2e-16 ***
## Condition2_good -6.052e-01 7.775e-02 -7.784 1.57e-14 ***
## BsmtQual_level 2.151e-02 8.507e-03 2.529 0.011568 *
## BsmtExposure_level 8.058e-03 4.298e-03 1.875 0.061032 .
## KitchenQual_level 3.169e-02 8.297e-03 3.819 0.000141 ***
## SaleType_value 5.696e-02 1.162e-02 4.903 1.08e-06 ***
## GarageCond_status 2.582e-02 9.331e-03 2.767 0.005741 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1286 on 1143 degrees of freedom
## Multiple R-squared: 0.8996, Adjusted R-squared: 0.8975
## F-statistic: 426.6 on 24 and 1143 DF, p-value: < 2.2e-16
Model diagnostic plots
plot(g6)
This is the best one as it has the biggest R-squared, and the plots also suggest that the assumptions of the linear model are all true. ###### Prediction of test dataset
Predicted_test_raw2 <- predict(g6, newdata = housing2.test.df)
Predicted_test_final2 <- exp(Predicted_test_raw2)
Test_predicted_values2 <- cbind(data.frame(housing2.test.df$SalePrice),
data.frame(Predicted_test_final2))
class(Test_predicted_values2)
## [1] "data.frame"
names(Test_predicted_values2) <- c("Actual", "Predicted")
head(Test_predicted_values2, 20)
## Actual Predicted
## 1169 235000 200374.97
## 1170 625000 615441.66
## 1171 171000 148732.42
## 1172 163000 163658.03
## 1173 171900 164138.15
## 1174 200500 248483.10
## 1175 239000 223886.00
## 1176 285000 344108.44
## 1177 119500 122545.35
## 1178 115000 132778.52
## 1179 154900 123132.49
## 1180 93000 95165.37
## 1181 250000 277633.88
## 1182 392500 279760.24
## 1183 745000 855342.81
## 1184 120000 124221.98
## 1185 186700 194288.37
## 1186 104900 122309.98
## 1187 95000 92904.08
## 1188 262000 264006.88
Plots for the most relevant features - one by one:
summary(housing2_raw$OverallQual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.099 7.000 10.000
ggplot(data = housing2_raw, aes(x = as.factor(OverallQual), y = SalePrice)) +
geom_boxplot()
summary(housing2_raw$OverallCond)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 5.000 5.575 6.000 9.000
ggplot(data = housing2_raw, aes(x = as.factor(OverallCond), y = SalePrice)) +
geom_boxplot()
summary(housing2_raw$YearBuilt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1872 1954 1973 1971 2000 2010
ggplot(data = housing2_raw, aes(x = YearBuilt, y = SalePrice)) +
geom_point() + geom_smooth(method=lm , color="blue", se=FALSE)
summary(housing2_raw$BsmtFinSF1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 383.5 443.6 712.2 5644.0
ggplot(data = housing2_raw, aes(x = BsmtFinSF1, y = SalePrice)) +
geom_point() + geom_smooth(method=lm , color="blue", se=FALSE)
summary(housing2_raw$X1stFlrSF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 882 1087 1163 1391 4692
ggplot(data = housing2_raw, aes(x = X1stFlrSF, y = SalePrice)) +
geom_point() + geom_smooth(method=lm , color="blue", se=FALSE)
summary(housing2_raw$X2ndFlrSF)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 0 347 728 2065
ggplot(data = housing2_raw, aes(x = X2ndFlrSF, y = SalePrice)) +
geom_point() + geom_smooth(method=lm , color="blue", se=FALSE)
summary(housing2_raw$Neighborhood_highprice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.2445 0.0000 1.0000
ggplot(data = housing2_raw, aes(x = as.factor(Neighborhood_highprice),
y = SalePrice)) +
geom_boxplot()
summary(housing2_raw$Condition2_good)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000000 0.000000 0.000000 0.002055 0.000000 1.000000
ggplot(data = housing2_raw, aes(x = as.factor(Condition2_good),
y = SalePrice)) +
geom_boxplot()
MSSubClass
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
X1stFlrSF
X2ndFlrSF
LowQualFinSF
KitchenAbvGr
TotRmsAbvGrd
GarageArea
WoodDeckSF
ScreenPorch
Neighborhood_highprice
Condition2_good
BsmtQual_level
BsmtExposure_level
KitchenQual_level
SaleType_value
GarageCond_status
OverallQual
OverallCond
YearBuilt
BsmtFinSF1
X1stFlrSF
X2ndFlrSF
Condition2_good