We will need to load the raw data and needed libraries (Plus, some of the functions we defined earlier.)
## Load my personal function libraries
source('~/Dropbox/Utility/source_self.R', echo=TRUE)
##
## > source("~/Dropbox/Utility/char_to_factor.R", echo = TRUE)
##
## > char_to_factor = function(data) {
## + library(magrittr)
## + data = as.data.frame(data)
## + sapply(data, class) %>% table() %>% print
## + cha .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/correlation.R", echo = TRUE)
##
## > correlation = function(data) {
## + select_num = function(data) {
## + library(magrittr)
## + data = as.data.frame(data)
## + int_in .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/miss_name.R", echo = TRUE)
##
## > miss_name = function(data) {
## + miss_pct = function(data) {
## + pct = sum(is.na(data))/length(data)
## + }
## + missing = data %>% sapply .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/miss_pct_plot.R", echo = TRUE)
##
## > miss_pct_plot = function(data) {
## + miss_pct = function(data) {
## + pct = sum(is.na(data))/length(data)
## + }
## + missing = data %>% sa .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/na_to_fac.R", echo = TRUE)
##
## > na_to_fac = function(data, names) {
## + library(magrittr)
## + library(dplyr)
## + data_na = data %>% select(names)
## + data_nna = data %>% se .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/select_comp.R", echo = TRUE)
##
## > select_comp = function(data, p = 0.5) {
## + miss_pct = function(data) {
## + pct = sum(is.na(data))/length(data)
## + }
## + missing = sapp .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/select_fac.R", echo = TRUE)
##
## > select_fac = function(data) {
## + library(magrittr)
## + data = as.data.frame(data)
## + data %>% sapply(class) %>% table() %>% print
## + fac_ .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/select_num.R", echo = TRUE)
##
## > select_num = function(data) {
## + library(magrittr)
## + data = as.data.frame(data)
## + data %>% sapply(class) %>% table() %>% print
## + int_ .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/timeRecord_functions.R",
## + echo = TRUE)
##
## > timeRecordA = function() {
## + file_name = paste(format(Sys.time(), "%F_%T"), ".log", sep = "")
## + file.create(file_name)
## + extremely_long_ .... [TRUNCATED]
##
## > timeRecordB = function() {
## + if (file.exists(extremely_long_name_i_do_not_think_anyone_would_be_sanely_to_use)) {
## + file_name = extremel .... [TRUNCATED]
##
## > timeRecordR = function(output = "job", unit = "s") {
## + if (file.exists(extremely_long_name_i_do_not_think_anyone_would_be_sanely_to_use)) {
## + .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/miss_pct_df.R", echo = TRUE)
##
## > miss_pct_df = function(data) {
## + miss_pct = function(data) {
## + pct = sum(is.na(data))/length(data)
## + }
## + missing = data %>% sapp .... [TRUNCATED]
##
## > source("~/Dropbox/Utility/int_to_num.R", echo = TRUE)
##
## > int_to_num = function(data) {
## + data = as.data.frame(data)
## + library(magrittr)
## + data = as.data.frame(data)
## + sapply(data, class) %> .... [TRUNCATED]
## Load R packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(magrittr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
boston_raw=fread("train.csv",header=TRUE)
#boston_raw=boston_raw%>%char_to_factor()
## Several versions of data cleaning
## For dataset that are relatively large, it is a good practice to save the different version of the cleaned data.
The next step is to understand the missing value and impute it.
There are basically two types of missing value: 1. Systematic missing value 2. Missing value at random
Let us first look at the features that have missing values.
miss_name=function(data){
library(magrittr)
# Define a function to calculate percentage of missing value in each feature
miss_pct=function(data){
pct=sum(is.na(data))/length(data)
return(pct)
}
# Apply the function define above to every columns of the data set
missing=data%>%sapply(miss_pct)%>%as.data.frame()
colnames(missing)="miss_pct"
missing=data.frame(names=row.names(missing)%>%as.character(),missing_pct=missing)
# Select only the features with missing values and their missing value percentage
missing=missing%>%filter(miss_pct>0)%>%arrange(desc(miss_pct))
return(missing)
}
miss_name(boston_raw)
## names miss_pct
## 1 PoolQC 0.9952054795
## 2 MiscFeature 0.9630136986
## 3 Alley 0.9376712329
## 4 Fence 0.8075342466
## 5 FireplaceQu 0.4726027397
## 6 LotFrontage 0.1773972603
## 7 GarageType 0.0554794521
## 8 GarageYrBlt 0.0554794521
## 9 GarageFinish 0.0554794521
## 10 GarageQual 0.0554794521
## 11 GarageCond 0.0554794521
## 12 BsmtExposure 0.0260273973
## 13 BsmtFinType2 0.0260273973
## 14 BsmtQual 0.0253424658
## 15 BsmtCond 0.0253424658
## 16 BsmtFinType1 0.0253424658
## 17 MasVnrType 0.0054794521
## 18 MasVnrArea 0.0054794521
## 19 Electrical 0.0006849315
Let us look at the data dictionary to find out if NA value actually means something else.
Alley:NA No alley access BsmtQual: NA No Basement BsmtCond: NA No Basement BsmtExposure:NA No Basement BsmtFinType1:NA No Basement BsmtFinType2: NA No Basement
FireplaceQu:NA No Fireplace
GarageType: NA No Garage GarageFinish:NA No Garage GarageQual:NA No Garage GarageCond:NA No Garage
PoolQC:NA No Poolcaret
Fence:NA No FenceGarageYrBlt MiscFeature: NA None
It seems that in all these features, NA does not mean missing value, it simply means there is no such property. Thus, we need to convert NA in these features into a factor value like “None”.
## Change NA values of the features into "None", a new factor value
na_to_fac=function(data,names){
# libraries (change it into installing it if not installed later)
library(magrittr)
library(dplyr)
# Split the data into data with NA value and data with no NA value
data_na=data%>%select(names)
data_nna=data%>%select(-one_of(names))
data_na[is.na(data_na)]="None"
data=cbind(data_na,data_nna)
return(data)
}
boston_clean2=boston_raw%>%na_to_fac(names=c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"))%>%char_to_factor()
## .
## character integer
## 43 38
## .
## factor integer
## 43 38
miss_pct_plot(boston_clean2)
## [1] "There are 5 features with missing values."
# boston_clean2%>%sapply(class)
Then, after this cleaning we are left with 5 features with missing values.
Let us look further into those variables.
LotFrontage is the Linear feet of street connected to property. GarageYrBlt is the Year garage was built.
MasVnrArea is the Masonry veneer area in square feet. MasVnrType is the Masonry veneer type
Electrical is Electrical system with various factors.
boston_raw%>%filter(is.na(Electrical))
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1380 80 RL 73 9735 Pave <NA> Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl Timber Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam SLvl 5 5 2006
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2007 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 Unf 0 Unf 0 384 384
## Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1 GasA Gd Y <NA> 754 640 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1394 0 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 7 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 BuiltIn 2007 Fin 2 400 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1 TA Y 100 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 5 2008
## SaleType SaleCondition SalePrice
## 1 WD Normal 167500
There is only one missing value. We could definitely impute it with some methods.
2.Look at the missing value at MasVnrArea and Masonry veneer type.
boston_raw%>%filter(is.na(MasVnrArea))
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 235 60 RL NA 7851 Pave <NA> Reg
## 2 530 20 RL NA 32668 Pave <NA> IR1
## 3 651 60 FV 65 8125 Pave <NA> Reg
## 4 937 20 RL 67 10083 Pave <NA> Reg
## 5 974 20 FV 95 11639 Pave <NA> Reg
## 6 978 120 FV 35 4274 Pave Pave IR1
## 7 1244 20 RL 107 13891 Pave <NA> Reg
## 8 1279 60 RL 75 9473 Pave <NA> Reg
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl Gilbert Norm
## 2 Lvl AllPub CulDSac Gtl Crawfor Norm
## 3 Lvl AllPub Inside Gtl Somerst Norm
## 4 Lvl AllPub Inside Gtl SawyerW Norm
## 5 Lvl AllPub Corner Gtl Somerst Norm
## 6 Lvl AllPub Inside Gtl Somerst Norm
## 7 Lvl AllPub Inside Gtl NridgHt Norm
## 8 Lvl AllPub Inside Gtl CollgCr Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 6 5 2002
## 2 Norm 1Fam 1Story 6 3 1957
## 3 Norm 1Fam 2Story 7 6 2007
## 4 Norm 1Fam 1Story 7 5 2003
## 5 Norm 1Fam 1Story 7 5 2007
## 6 Norm TwnhsE 1Story 7 5 2006
## 7 Norm 1Fam 1Story 10 5 2006
## 8 Norm 1Fam 2Story 8 5 2002
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2002 Gable CompShg VinylSd VinylSd <NA>
## 2 1975 Hip CompShg Wd Sdng Stone <NA>
## 3 2007 Gable CompShg CemntBd CmentBd <NA>
## 4 2003 Gable CompShg VinylSd VinylSd <NA>
## 5 2008 Gable CompShg CemntBd CmentBd <NA>
## 6 2007 Gable CompShg VinylSd VinylSd <NA>
## 7 2006 Gable CompShg VinylSd VinylSd <NA>
## 8 2002 Gable CompShg VinylSd VinylSd <NA>
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 NA Gd TA PConc Gd TA No
## 2 NA Gd TA PConc TA TA No
## 3 NA Gd TA PConc Gd TA No
## 4 NA Gd TA PConc Gd TA No
## 5 NA Gd TA PConc Gd TA No
## 6 NA Gd TA PConc Gd TA No
## 7 NA Ex TA PConc Ex Gd Gd
## 8 NA Gd TA PConc Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 625 Unf 0 235 860
## 2 Rec 1219 Unf 0 816 2035
## 3 Unf 0 Unf 0 813 813
## 4 GLQ 833 Unf 0 343 1176
## 5 Unf 0 Unf 0 1428 1428
## 6 GLQ 1106 Unf 0 135 1241
## 7 GLQ 1386 Unf 0 690 2076
## 8 GLQ 804 Unf 0 324 1128
## Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 860 1100 0
## 2 GasA TA Y SBrkr 2515 0 0
## 3 GasA Ex Y SBrkr 822 843 0
## 4 GasA Ex Y SBrkr 1200 0 0
## 5 GasA Ex Y SBrkr 1428 0 0
## 6 GasA Ex Y SBrkr 1241 0 0
## 7 GasA Ex Y SBrkr 2076 0 0
## 8 GasA Ex Y SBrkr 1128 903 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1960 1 0 2 1 4
## 2 2515 1 0 3 0 4
## 3 1665 0 0 2 1 3
## 4 1200 1 0 2 0 2
## 5 1428 0 0 2 0 3
## 6 1241 1 0 1 1 1
## 7 2076 1 0 2 1 2
## 8 2031 1 0 2 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 2 TA
## 2 2 TA 9 Maj1 2 TA
## 3 1 Gd 7 Typ 0 <NA>
## 4 1 Gd 5 Typ 0 <NA>
## 5 1 Gd 6 Typ 0 <NA>
## 6 1 Gd 4 Typ 0 <NA>
## 7 1 Ex 7 Typ 1 Gd
## 8 1 Gd 7 Typ 1 Gd
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 BuiltIn 2002 Fin 2 440 TA
## 2 Attchd 1975 RFn 2 484 TA
## 3 Attchd 2007 RFn 2 562 TA
## 4 Attchd 2003 RFn 2 555 TA
## 5 Attchd 2007 Fin 2 480 TA
## 6 Attchd 2007 Fin 2 569 TA
## 7 Attchd 2006 Fin 3 850 TA
## 8 Attchd 2002 RFn 2 577 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1 TA Y 288 48 0 0
## 2 TA Y 0 0 200 0
## 3 TA Y 0 0 0 0
## 4 TA Y 0 41 0 0
## 5 TA Y 0 120 0 0
## 6 TA Y 0 116 0 0
## 7 TA Y 216 229 0 0
## 8 TA Y 0 211 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 5 2010
## 2 0 0 <NA> <NA> <NA> 0 3 2007
## 3 0 0 <NA> <NA> <NA> 0 5 2008
## 4 0 0 <NA> <NA> <NA> 0 8 2009
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> <NA> <NA> 0 11 2007
## 7 0 0 <NA> <NA> <NA> 0 9 2006
## 8 0 0 <NA> <NA> <NA> 0 3 2008
## SaleType SaleCondition SalePrice
## 1 WD Normal 216500
## 2 WD Alloca 200624
## 3 WD Normal 205950
## 4 WD Normal 184900
## 5 New Partial 182000
## 6 New Partial 199900
## 7 New Partial 465000
## 8 WD Normal 237000
boston_raw%>%filter(is.na(MasVnrType))%>%select(Id)==boston_raw%>%filter(is.na(MasVnrArea))%>%select(Id)
## Id
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
## [7,] TRUE
## [8,] TRUE
The house that has a missing value in MasVnrType also has a missing value in MasVnrArea.
I would consider two solutions: 1) use mice package to do the imputation 2) Assign missing values in MasVnrType to None and MasVnrArea to 0.
boston_raw%>%filter(is.na(GarageYrBlt))%>%head()
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 40 90 RL 65 6040 Pave <NA> Reg
## 2 49 190 RM 33 4456 Pave <NA> Reg
## 3 79 90 RL 72 10778 Pave <NA> Reg
## 4 89 50 C (all) 105 8470 Pave <NA> IR1
## 5 90 20 RL 60 8070 Pave <NA> Reg
## 6 100 20 RL 77 9320 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl Edwards Norm
## 2 Lvl AllPub Inside Gtl OldTown Norm
## 3 Lvl AllPub Inside Gtl Sawyer Norm
## 4 Lvl AllPub Corner Gtl IDOTRR Feedr
## 5 Lvl AllPub Inside Gtl CollgCr Norm
## 6 Lvl AllPub Inside Gtl NAmes Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm Duplex 1Story 4 5 1955
## 2 Norm 2fmCon 2Story 4 5 1920
## 3 Norm Duplex 1Story 4 5 1968
## 4 Feedr 1Fam 1.5Fin 3 2 1915
## 5 Norm 1Fam 1Story 4 5 1994
## 6 Norm 1Fam 1Story 4 5 1959
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1955 Gable CompShg AsbShng Plywood None
## 2 2008 Gable CompShg MetalSd MetalSd None
## 3 1968 Hip CompShg HdBoard HdBoard None
## 4 1982 Hip CompShg Plywood Plywood None
## 5 1995 Gable CompShg VinylSd VinylSd None
## 6 1959 Gable CompShg Plywood Plywood None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 0 TA TA PConc <NA> <NA> <NA>
## 2 0 TA TA BrkTil TA TA No
## 3 0 TA TA CBlock TA TA No
## 4 0 Fa Fa CBlock TA Fa No
## 5 0 TA TA PConc Gd TA No
## 6 0 TA TA CBlock TA TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 <NA> 0 <NA> 0 0 0
## 2 Unf 0 Unf 0 736 736
## 3 Unf 0 Unf 0 1768 1768
## 4 Unf 0 Unf 0 1013 1013
## 5 GLQ 588 Unf 0 402 990
## 6 ALQ 569 Unf 0 381 950
## Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1 GasA TA N FuseP 1152 0 0
## 2 GasA Gd Y SBrkr 736 716 0
## 3 GasA TA N SBrkr 1768 0 0
## 4 GasA TA N SBrkr 1013 0 513
## 5 GasA Ex Y SBrkr 990 0 0
## 6 GasA Fa Y SBrkr 1225 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1152 0 0 2 0 2
## 2 1452 0 0 2 0 2
## 3 1768 0 0 2 0 4
## 4 1526 0 0 1 0 2
## 5 990 1 0 1 0 3
## 6 1225 1 0 1 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 2 Fa 6 Typ 0 <NA>
## 2 3 TA 8 Typ 0 <NA>
## 3 2 TA 8 Typ 0 <NA>
## 4 1 Fa 6 Typ 0 <NA>
## 5 1 TA 5 Typ 0 <NA>
## 6 1 TA 6 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 <NA> NA <NA> 0 0 <NA>
## 2 <NA> NA <NA> 0 0 <NA>
## 3 <NA> NA <NA> 0 0 <NA>
## 4 <NA> NA <NA> 0 0 <NA>
## 5 <NA> NA <NA> 0 0 <NA>
## 6 <NA> NA <NA> 0 0 <NA>
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1 <NA> N 0 0 0 0
## 2 <NA> N 0 0 102 0
## 3 <NA> Y 0 0 0 0
## 4 <NA> N 0 0 156 0
## 5 <NA> Y 0 0 0 0
## 6 <NA> Y 352 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 6 2008
## 2 0 0 <NA> <NA> <NA> 0 6 2009
## 3 0 0 <NA> <NA> <NA> 0 4 2010
## 4 0 0 <NA> MnPrv <NA> 0 10 2009
## 5 0 0 <NA> <NA> <NA> 0 8 2007
## 6 0 0 <NA> <NA> Shed 400 1 2010
## SaleType SaleCondition SalePrice
## 1 WD AdjLand 82000
## 2 New Partial 113000
## 3 WD Normal 136500
## 4 ConLD Abnorml 85000
## 5 WD Normal 123600
## 6 WD Normal 128950
It seems that other garage features are also missing when GarageYrBlt is missing.
Let us check.
boston_raw%>%filter(is.na(GarageYrBlt))%>%select(Id)==boston_raw%>%filter(is.na(GarageQual))%>%select(Id)
## Id
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
## [7,] TRUE
## [8,] TRUE
## [9,] TRUE
## [10,] TRUE
## [11,] TRUE
## [12,] TRUE
## [13,] TRUE
## [14,] TRUE
## [15,] TRUE
## [16,] TRUE
## [17,] TRUE
## [18,] TRUE
## [19,] TRUE
## [20,] TRUE
## [21,] TRUE
## [22,] TRUE
## [23,] TRUE
## [24,] TRUE
## [25,] TRUE
## [26,] TRUE
## [27,] TRUE
## [28,] TRUE
## [29,] TRUE
## [30,] TRUE
## [31,] TRUE
## [32,] TRUE
## [33,] TRUE
## [34,] TRUE
## [35,] TRUE
## [36,] TRUE
## [37,] TRUE
## [38,] TRUE
## [39,] TRUE
## [40,] TRUE
## [41,] TRUE
## [42,] TRUE
## [43,] TRUE
## [44,] TRUE
## [45,] TRUE
## [46,] TRUE
## [47,] TRUE
## [48,] TRUE
## [49,] TRUE
## [50,] TRUE
## [51,] TRUE
## [52,] TRUE
## [53,] TRUE
## [54,] TRUE
## [55,] TRUE
## [56,] TRUE
## [57,] TRUE
## [58,] TRUE
## [59,] TRUE
## [60,] TRUE
## [61,] TRUE
## [62,] TRUE
## [63,] TRUE
## [64,] TRUE
## [65,] TRUE
## [66,] TRUE
## [67,] TRUE
## [68,] TRUE
## [69,] TRUE
## [70,] TRUE
## [71,] TRUE
## [72,] TRUE
## [73,] TRUE
## [74,] TRUE
## [75,] TRUE
## [76,] TRUE
## [77,] TRUE
## [78,] TRUE
## [79,] TRUE
## [80,] TRUE
## [81,] TRUE
Let us look at the relationship between GarageYrBlt and SalePrice. Plus, we will color the points with different OverallQual.
library(ggplot2)
boston_raw$OverallQual=as.factor(boston_raw$OverallQual)
boston_raw%>%ggplot(aes(x=GarageYrBlt,y=log1p(SalePrice)))+geom_point(aes(color=OverallQual),alpha=0.6)
## Warning: Removed 81 rows containing missing values (geom_point).
There are no perfect solutions to this NA value.
There are three solutions available: 1) assign it a median value 2) assign it a zero value 3) assign it a value using CART imputation.
boston_raw%>%filter(is.na(LotFrontage))%>%head()
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 8 60 RL NA 10382 Pave <NA> IR1
## 2 13 20 RL NA 12968 Pave <NA> IR2
## 3 15 20 RL NA 10920 Pave <NA> IR1
## 4 17 20 RL NA 11241 Pave <NA> IR1
## 5 25 20 RL NA 8246 Pave <NA> IR1
## 6 32 20 RL NA 8544 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Corner Gtl NWAmes PosN
## 2 Lvl AllPub Inside Gtl Sawyer Norm
## 3 Lvl AllPub Corner Gtl NAmes Norm
## 4 Lvl AllPub CulDSac Gtl NAmes Norm
## 5 Lvl AllPub Inside Gtl Sawyer Norm
## 6 Lvl AllPub CulDSac Gtl Sawyer Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 6 1973
## 2 Norm 1Fam 1Story 5 6 1962
## 3 Norm 1Fam 1Story 6 5 1960
## 4 Norm 1Fam 1Story 6 7 1970
## 5 Norm 1Fam 1Story 5 8 1968
## 6 Norm 1Fam 1Story 5 6 1966
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 1973 Gable CompShg HdBoard HdBoard Stone
## 2 1962 Hip CompShg HdBoard Plywood None
## 3 1960 Hip CompShg MetalSd MetalSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Sdng BrkFace
## 5 2001 Gable CompShg Plywood Plywood None
## 6 2006 Gable CompShg HdBoard HdBoard None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 240 TA TA CBlock Gd TA Mn
## 2 0 TA TA CBlock TA TA No
## 3 212 TA TA CBlock TA TA No
## 4 180 TA TA CBlock TA TA No
## 5 0 TA Gd CBlock TA TA Mn
## 6 0 TA TA CBlock TA TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 ALQ 859 BLQ 32 216 1107
## 2 ALQ 737 Unf 0 175 912
## 3 BLQ 733 Unf 0 520 1253
## 4 ALQ 578 Unf 0 426 1004
## 5 Rec 188 ALQ 668 204 1060
## 6 Unf 0 Unf 0 1228 1228
## Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 1107 983 0
## 2 GasA TA Y SBrkr 912 0 0
## 3 GasA TA Y SBrkr 1253 0 0
## 4 GasA Ex Y SBrkr 1004 0 0
## 5 GasA Ex Y SBrkr 1060 0 0
## 6 GasA Gd Y SBrkr 1228 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 2090 1 0 2 1 3
## 2 912 1 0 1 0 2
## 3 1253 1 0 1 1 2
## 4 1004 1 0 1 0 2
## 5 1060 1 0 1 0 3
## 6 1228 0 0 1 1 3
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 TA 7 Typ 2 TA
## 2 1 TA 4 Typ 0 <NA>
## 3 1 TA 5 Typ 1 Fa
## 4 1 TA 5 Typ 1 TA
## 5 1 Gd 6 Typ 1 TA
## 6 1 Gd 6 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 1973 RFn 2 484 TA
## 2 Detchd 1962 Unf 1 352 TA
## 3 Attchd 1960 RFn 1 352 TA
## 4 Attchd 1970 Fin 2 480 TA
## 5 Attchd 1968 Unf 1 270 TA
## 6 Attchd 1966 Unf 1 271 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1 TA Y 235 204 228 0
## 2 TA Y 140 0 0 0
## 3 TA Y 0 213 176 0
## 4 TA Y 0 0 0 0
## 5 TA Y 406 90 0 0
## 6 TA Y 0 65 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> Shed 350 11 2009
## 2 176 0 <NA> <NA> <NA> 0 9 2008
## 3 0 0 <NA> GdWo <NA> 0 5 2008
## 4 0 0 <NA> <NA> Shed 700 3 2010
## 5 0 0 <NA> MnPrv <NA> 0 5 2010
## 6 0 0 <NA> MnPrv <NA> 0 6 2008
## SaleType SaleCondition SalePrice
## 1 WD Normal 200000
## 2 WD Normal 144000
## 3 WD Normal 157000
## 4 WD Normal 149000
## 5 WD Normal 154000
## 6 WD Normal 149350
As per usual, the two solutions we could compare is: 1) assign 0 to the NA value in LotFrontage 2) use CART to impute the value.
After analyzing the missing values, let us actually do clean the data with various method and compare them with baseline models.
The details of the data cleaning code is as follows.
Use a function called timeRecordA to record the time used in the calculation. (details of timeRecord functions will be discussed in later blogs.)
The first method of data cleaning is done as in the function data_clean1().
1.get rid of features with more than 50% of missing value 2.Use mice pacakge and apply random forest method to impute the missing value in features (make one copy instead of 5).
It is important to exclude target variable, SalePrice, in the process.
# record the time
timeRecordA()
## [1] "Fun time log has been created"
##########################################################################################
data_clean1=function(data){
library(mice)
# get rid of features with more than 50% of missing value.
data1=data%>%select_comp(p=0.5)%>%char_to_factor()
# Never impute with target variable
data1_impute=data1%>%select(-SalePrice)
data1_impute=data1_impute%>%mice(method="rf",m=1,seed=7) # just make one imputation instead of the default 5 results.
data1_impute=complete(data1_impute,1)
data1=cbind(data1_impute,SalePrice=data1$SalePrice)
return(data1)
}
boston_clean1=boston_raw%>%data_clean1()
## .
## character factor integer
## 39 1 37
## .
## factor integer
## 40 37
##
## iter imp variable
## 1 1 LotFrontage MasVnrArea GarageYrBlt MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond
## 2 1 LotFrontage MasVnrArea GarageYrBlt MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond
## 3 1 LotFrontage MasVnrArea GarageYrBlt MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond
## 4 1 LotFrontage MasVnrArea GarageYrBlt MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond
## 5 1 LotFrontage MasVnrArea GarageYrBlt MasVnrType BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 Electrical FireplaceQu GarageType GarageFinish GarageQual GarageCond
timeRecordB()
##########################################################################################
The details of how mice package works will be too long to be included here, and we will do that in future blogs.
The second method of data cleaning is done as in the function data_clean2()
1.Fit the missing value in Electrical with the majority type 2.Fit the missing value in MasVnrType and MasVnrArea with “None”, a new factor value. 3.Fit the missing value in GarageYrBlt with 0. (I know there will be many other ways to impute year, but I believe it will not affect prediction too much. You could try to verify that yourself.) 4.Fit the missing value in LotFrontage with 0, too.
data_clean2=function(data){
features=c("Alley","BsmtQual","BsmtCond",
"BsmtExposure","BsmtFinType1","BsmtFinType2",
"FireplaceQu","GarageType","GarageFinish","GarageQual",
"GarageCond","PoolQC","Fence","MiscFeature")
data2=data%>%na_to_fac(names=features)
# Fit the missing value in Electrical with the majority type
data2%>%select(Electrical)%>%table(useNA="always")
data2[is.na(Electrical),"Electrical"]= "SBrkr"
## Fit the missing value in MasVnrType and MasVnrArea
data2[is.na(MasVnrType),"MasVnrType"]="None"
data2[is.na(MasVnrArea),"MasVnrArea"]=0
data2%>%miss_pct_df()
## Fit the missing value in GarageYrBlt
data2[is.na(GarageYrBlt),"GarageYrBlt"]=0
## Fit the missing value in LotFrontage
data2[is.na(LotFrontage),"LotFrontage"]=0
data2=data2%>%char_to_factor()
return(data2)
}
boston_clean2=boston_raw%>%data_clean2()
## [1] "There are 2 features with missing values."
## .
## character factor integer
## 43 1 37
## .
## factor integer
## 44 37
timeRecordB()
########################################################################
The third method of data cleaning.
1.Fit the missing value in Electrical with the majority type 2.Fit the missing value in MasVnrType with “None”, a new factor value. 2.Impute the rest with mice package. (Random forest, or CART, or sample method, which are corresponding to boston_clean3, boston_clean4,boston_clean5)
data_clean3=function(data,method){
library(mice)
features=c("Alley","BsmtQual","BsmtCond",
"BsmtExposure","BsmtFinType1","BsmtFinType2",
"FireplaceQu","GarageType","GarageFinish","GarageQual",
"GarageCond","PoolQC","Fence","MiscFeature")
data3=data%>%na_to_fac(names=features)
data3[is.na(MasVnrType),"MasVnrType"]="None"
data3[is.na(Electrical),"Electrical"]= "SBrkr"
# Remove targe variable from imputation
data3_impute=data3%>%select(-SalePrice)
data3_impute=data3_impute%>%mice(method=method,m=1,seed=7)
data3_impute=complete(data3_impute,1)
data3=cbind(data3_impute,SalePrice=data3$SalePrice)
data3=data3%>%char_to_factor()
data3%>%miss_pct_df()
return(data3)
}
# Random forest imputations
boston_clean3=boston_raw%>%data_clean3(method="rf")
##
## iter imp variable
## 1 1 LotFrontage MasVnrArea GarageYrBlt
## 2 1 LotFrontage MasVnrArea GarageYrBlt
## 3 1 LotFrontage MasVnrArea GarageYrBlt
## 4 1 LotFrontage MasVnrArea GarageYrBlt
## 5 1 LotFrontage MasVnrArea GarageYrBlt
## .
## character factor integer
## 43 1 37
## .
## factor integer
## 44 37
## [1] "There are 0 features with missing values."
timeRecordB()
###################################################################################################
# Classification and regression trees
boston_clean4=boston_raw%>%data_clean3(method="cart")
##
## iter imp variable
## 1 1 LotFrontage MasVnrArea GarageYrBlt
## 2 1 LotFrontage MasVnrArea GarageYrBlt
## 3 1 LotFrontage MasVnrArea GarageYrBlt
## 4 1 LotFrontage MasVnrArea GarageYrBlt
## 5 1 LotFrontage MasVnrArea GarageYrBlt
## .
## character factor integer
## 43 1 37
## .
## factor integer
## 44 37
## [1] "There are 0 features with missing values."
timeRecordB()
###################################################################################################
# Random sample from the observed values
boston_clean5=boston_raw%>%data_clean3(method="sample")
##
## iter imp variable
## 1 1 LotFrontage MasVnrArea GarageYrBlt
## 2 1 LotFrontage MasVnrArea GarageYrBlt
## 3 1 LotFrontage MasVnrArea GarageYrBlt
## 4 1 LotFrontage MasVnrArea GarageYrBlt
## 5 1 LotFrontage MasVnrArea GarageYrBlt
## .
## character factor integer
## 43 1 37
## .
## factor integer
## 44 37
## [1] "There are 0 features with missing values."
timeRecordB()
###################################################################################################
The fourth method of data cleaning.
1.Fit the missing value in Electrical with the majority type 2.Fit the missing value in MasVnrType with “None”, a new factor value. 3.Impute rest of the features with preProcess function in the caret package by applying knn or median impute.(knnImpute and medianImpute correspond to boston_clean6 and boston_clean7 )
# We will try to use the preProcess function in caret package and apply knnImpute and medianImpute to impute for the missing data
data_clean5=function(data,method){
library(caret)
features=c("Alley","BsmtQual","BsmtCond",
"BsmtExposure","BsmtFinType1","BsmtFinType2",
"FireplaceQu","GarageType","GarageFinish","GarageQual",
"GarageCond","PoolQC","Fence","MiscFeature")
data5=data%>%na_to_fac(names=features)
data5[is.na(MasVnrType),"MasVnrType"]="None"
data5[is.na(Electrical),"Electrical"]= "SBrkr"
# Remove targe variable from imputation
data5_impute=data5%>%select(-SalePrice)%>%char_to_factor()
data5_impute_f=data5_impute%>%preProcess(method=method)
data5_impute=data5_impute_f%>%predict(data5_impute)
data5=cbind(data5_impute,SalePrice=data5$SalePrice)
return(data5)
}
# use knnImpute method
boston_clean6=boston_raw%>%data_clean5(method="knnImpute")
## Loading required package: lattice
## .
## character factor integer
## 43 1 36
## .
## factor integer
## 44 36
timeRecordB()
###################################################################################################
# use medianImpute method
boston_clean7=boston_raw%>%data_clean5(method="medianImpute")
## .
## character factor integer
## 43 1 36
## .
## factor integer
## 44 36
timeRecordB()
###################################################################################################
timeRecordR()
## user.self sys.self elapsed user.child sys.child run_time
## 1 3.108 0.052 3.158 0 0 0.000
## 2 73.008 0.628 74.522 0 0 71.364
## 3 73.084 0.628 74.599 0 0 0.077
## 4 76.000 0.632 77.521 0 0 2.922
## 5 77.892 0.636 79.422 0 0 1.901
## 6 78.168 0.636 79.698 0 0 0.276
## 7 79.020 0.640 80.555 0 0 0.857
## 8 79.380 0.640 80.917 0 0 0.362
boston_raw=boston_raw%>%char_to_factor()
## .
## character factor integer
## 43 1 37
## .
## factor integer
## 44 37
miss_pct_df(boston_clean1)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean2)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean3)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean4)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean5)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean6)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean7)
## [1] "There are 0 features with missing values."
## [1] miss_pct names
## <0 rows> (or 0-length row.names)
The last few lines of output show that all dataset are imputed with no more missing values in them.
Next, let us look at the skewness of each feature. The skewness of a feature could affect the predictions of some machine learning algorithms, if the features have a really big value for skewness, then we need to do a Box-Cox transformation.
# First define a function to select integer or numeric value in features.
select_num=function(data){
library(magrittr)
data=as.data.frame(data)
data%>%sapply(class)%>%table()%>%print
int_index=sapply(data,class)== "integer"
int_index=int_index%>%as.logical()
num_index=sapply(data,class)== "numeric"
num_index=num_index%>%as.logical()
# data[,!int_index]%>%sapply(class)%>%table
data1=data[,int_index]
data2=data[,num_index]
data=cbind(data1,data2)
data%>%sapply(class)%>%table%>%print
return(data)
}
library(e1071)
library(dplyr)
# Select only integer or numeric data types for calculating skewness.
boston_clean1%>%select_num()%>%sapply(skewness)
## .
## factor integer
## 40 37
## .
## integer
## 37
## Id MSSubClass LotFrontage LotArea OverallCond
## 0.00000000 1.40476562 2.78263224 12.18261502 0.69164401
## YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2
## -0.61220121 -0.50252776 2.65780112 1.68204129 4.24652141
## BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF
## 0.91837835 1.52112395 1.37392896 0.81135997 8.99283329
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1.36375364 0.59484237 4.09497490 0.03648647 0.67450925
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt
## 0.21135511 4.47917826 0.67495173 0.64823107 -0.59602452
## GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch
## -0.34184538 0.17961125 1.53820999 2.35948572 3.08352575
## 3SsnPorch ScreenPorch PoolArea MiscVal MoSold
## 10.28317840 4.11374731 14.79791829 24.42652237 0.21161746
## YrSold SalePrice
## 0.09607079 1.87900860
boston_clean2%>%select_num()%>%sapply(skewness)
## .
## factor integer
## 44 37
## .
## integer
## 37
## Id MSSubClass LotFrontage LotArea OverallCond
## 0.00000000 1.40476562 0.26727232 12.18261502 0.69164401
## YearBuilt YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2
## -0.61220121 -0.50252776 2.67211701 1.68204129 4.24652141
## BsmtUnfSF TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF
## 0.91837835 1.52112395 1.37392896 0.81135997 8.99283329
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1.36375364 0.59484237 4.09497490 0.03648647 0.67450925
## BedroomAbvGr KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt
## 0.21135511 4.47917826 0.67495173 0.64823107 -3.86155342
## GarageCars GarageArea WoodDeckSF OpenPorchSF EnclosedPorch
## -0.34184538 0.17961125 1.53820999 2.35948572 3.08352575
## 3SsnPorch ScreenPorch PoolArea MiscVal MoSold
## 10.28317840 4.11374731 14.79791829 24.42652237 0.21161746
## YrSold SalePrice
## 0.09607079 1.87900860
It is obvious that some features have very high skewness. Thus, a Box-Cox transformation would seem necessary for better performance for some of the models. Is it really true? It is for you to try it out.