Data cleaning

We will need to load the raw data and needed libraries (Plus, some of the functions we defined earlier.)

## Load my personal function libraries 
source('~/Dropbox/Utility/source_self.R', echo=TRUE)
## 
## > source("~/Dropbox/Utility/char_to_factor.R", echo = TRUE)
## 
## > char_to_factor = function(data) {
## +     library(magrittr)
## +     data = as.data.frame(data)
## +     sapply(data, class) %>% table() %>% print
## +     cha .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/correlation.R", echo = TRUE)
## 
## > correlation = function(data) {
## +     select_num = function(data) {
## +         library(magrittr)
## +         data = as.data.frame(data)
## +         int_in .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/miss_name.R", echo = TRUE)
## 
## > miss_name = function(data) {
## +     miss_pct = function(data) {
## +         pct = sum(is.na(data))/length(data)
## +     }
## +     missing = data %>% sapply .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/miss_pct_plot.R", echo = TRUE)
## 
## > miss_pct_plot = function(data) {
## +     miss_pct = function(data) {
## +         pct = sum(is.na(data))/length(data)
## +     }
## +     missing = data %>% sa .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/na_to_fac.R", echo = TRUE)
## 
## > na_to_fac = function(data, names) {
## +     library(magrittr)
## +     library(dplyr)
## +     data_na = data %>% select(names)
## +     data_nna = data %>% se .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/select_comp.R", echo = TRUE)
## 
## > select_comp = function(data, p = 0.5) {
## +     miss_pct = function(data) {
## +         pct = sum(is.na(data))/length(data)
## +     }
## +     missing = sapp .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/select_fac.R", echo = TRUE)
## 
## > select_fac = function(data) {
## +     library(magrittr)
## +     data = as.data.frame(data)
## +     data %>% sapply(class) %>% table() %>% print
## +     fac_ .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/select_num.R", echo = TRUE)
## 
## > select_num = function(data) {
## +     library(magrittr)
## +     data = as.data.frame(data)
## +     data %>% sapply(class) %>% table() %>% print
## +     int_ .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/timeRecord_functions.R", 
## +     echo = TRUE)
## 
## > timeRecordA = function() {
## +     file_name = paste(format(Sys.time(), "%F_%T"), ".log", sep = "")
## +     file.create(file_name)
## +     extremely_long_ .... [TRUNCATED] 
## 
## > timeRecordB = function() {
## +     if (file.exists(extremely_long_name_i_do_not_think_anyone_would_be_sanely_to_use)) {
## +         file_name = extremel .... [TRUNCATED] 
## 
## > timeRecordR = function(output = "job", unit = "s") {
## +     if (file.exists(extremely_long_name_i_do_not_think_anyone_would_be_sanely_to_use)) {
## +    .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/miss_pct_df.R", echo = TRUE)
## 
## > miss_pct_df = function(data) {
## +     miss_pct = function(data) {
## +         pct = sum(is.na(data))/length(data)
## +     }
## +     missing = data %>% sapp .... [TRUNCATED] 
## 
## > source("~/Dropbox/Utility/int_to_num.R", echo = TRUE)
## 
## > int_to_num = function(data) {
## +     data = as.data.frame(data)
## +     library(magrittr)
## +     data = as.data.frame(data)
## +     sapply(data, class) %> .... [TRUNCATED]
## Load R packages 
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(magrittr)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
boston_raw=fread("train.csv",header=TRUE)

#boston_raw=boston_raw%>%char_to_factor()

## Several versions of data cleaning 
## For dataset that are relatively large, it is a good practice to save the different version of the cleaned data.  

Missing value and unreasonable value

The next step is to understand the missing value and impute it.

There are basically two types of missing value: 1. Systematic missing value 2. Missing value at random

Let us first look at the features that have missing values.

miss_name=function(data){
  library(magrittr)
  # Define a function to calculate percentage of missing value in each feature
  miss_pct=function(data){
    pct=sum(is.na(data))/length(data)
    return(pct)
  }
  
  # Apply the function define above to every columns of the data set 
  missing=data%>%sapply(miss_pct)%>%as.data.frame()
  
  
  colnames(missing)="miss_pct"
  missing=data.frame(names=row.names(missing)%>%as.character(),missing_pct=missing)
  
  # Select only the features with missing values and their missing value percentage
  missing=missing%>%filter(miss_pct>0)%>%arrange(desc(miss_pct))
  
  
  return(missing)
}

miss_name(boston_raw)
##           names     miss_pct
## 1        PoolQC 0.9952054795
## 2   MiscFeature 0.9630136986
## 3         Alley 0.9376712329
## 4         Fence 0.8075342466
## 5   FireplaceQu 0.4726027397
## 6   LotFrontage 0.1773972603
## 7    GarageType 0.0554794521
## 8   GarageYrBlt 0.0554794521
## 9  GarageFinish 0.0554794521
## 10   GarageQual 0.0554794521
## 11   GarageCond 0.0554794521
## 12 BsmtExposure 0.0260273973
## 13 BsmtFinType2 0.0260273973
## 14     BsmtQual 0.0253424658
## 15     BsmtCond 0.0253424658
## 16 BsmtFinType1 0.0253424658
## 17   MasVnrType 0.0054794521
## 18   MasVnrArea 0.0054794521
## 19   Electrical 0.0006849315

Let us look at the data dictionary to find out if NA value actually means something else.

Alley:NA No alley access BsmtQual: NA No Basement BsmtCond: NA No Basement BsmtExposure:NA No Basement BsmtFinType1:NA No Basement BsmtFinType2: NA No Basement

FireplaceQu:NA No Fireplace

GarageType: NA No Garage GarageFinish:NA No Garage GarageQual:NA No Garage GarageCond:NA No Garage

PoolQC:NA No Poolcaret

Fence:NA No FenceGarageYrBlt MiscFeature: NA None

It seems that in all these features, NA does not mean missing value, it simply means there is no such property. Thus, we need to convert NA in these features into a factor value like “None”.

## Change NA values of the features into "None", a new factor value 

na_to_fac=function(data,names){
  # libraries (change it into installing it if not installed later)
  library(magrittr)
  library(dplyr)
  # Split the data into data with NA value and data with no NA value 
  data_na=data%>%select(names)
  data_nna=data%>%select(-one_of(names))
  data_na[is.na(data_na)]="None"
  
  data=cbind(data_na,data_nna)
  return(data)
  
}

boston_clean2=boston_raw%>%na_to_fac(names=c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature"))%>%char_to_factor()
## .
## character   integer 
##        43        38 
## .
##  factor integer 
##      43      38
miss_pct_plot(boston_clean2)
## [1] "There are 5 features with missing values."

# boston_clean2%>%sapply(class)

Then, after this cleaning we are left with 5 features with missing values.

Let us look further into those variables.

LotFrontage is the Linear feet of street connected to property. GarageYrBlt is the Year garage was built.

MasVnrArea is the Masonry veneer area in square feet. MasVnrType is the Masonry veneer type

Electrical is Electrical system with various factors.

  1. Look at the missing value at Electrical
boston_raw%>%filter(is.na(Electrical))
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 1380         80       RL          73    9735   Pave  <NA>      Reg
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl       Timber       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam       SLvl           5           5      2006
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2007     Gable  CompShg     VinylSd     VinylSd       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA      PConc       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          Unf          0          Unf          0       384         384
##   Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1    GasA        Gd          Y       <NA>      754      640            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1394            0            0        2        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            7        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1    BuiltIn        2007          Fin          2        400         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1         TA          Y        100           0             0         0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      5   2008
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    167500

There is only one missing value. We could definitely impute it with some methods.

2.Look at the missing value at MasVnrArea and Masonry veneer type.

boston_raw%>%filter(is.na(MasVnrArea))
##     Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  235         60       RL          NA    7851   Pave  <NA>      Reg
## 2  530         20       RL          NA   32668   Pave  <NA>      IR1
## 3  651         60       FV          65    8125   Pave  <NA>      Reg
## 4  937         20       RL          67   10083   Pave  <NA>      Reg
## 5  974         20       FV          95   11639   Pave  <NA>      Reg
## 6  978        120       FV          35    4274   Pave  Pave      IR1
## 7 1244         20       RL         107   13891   Pave  <NA>      Reg
## 8 1279         60       RL          75    9473   Pave  <NA>      Reg
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      Gilbert       Norm
## 2         Lvl    AllPub   CulDSac       Gtl      Crawfor       Norm
## 3         Lvl    AllPub    Inside       Gtl      Somerst       Norm
## 4         Lvl    AllPub    Inside       Gtl      SawyerW       Norm
## 5         Lvl    AllPub    Corner       Gtl      Somerst       Norm
## 6         Lvl    AllPub    Inside       Gtl      Somerst       Norm
## 7         Lvl    AllPub    Inside       Gtl      NridgHt       Norm
## 8         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           6           5      2002
## 2       Norm     1Fam     1Story           6           3      1957
## 3       Norm     1Fam     2Story           7           6      2007
## 4       Norm     1Fam     1Story           7           5      2003
## 5       Norm     1Fam     1Story           7           5      2007
## 6       Norm   TwnhsE     1Story           7           5      2006
## 7       Norm     1Fam     1Story          10           5      2006
## 8       Norm     1Fam     2Story           8           5      2002
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         2002     Gable  CompShg     VinylSd     VinylSd       <NA>
## 2         1975       Hip  CompShg     Wd Sdng       Stone       <NA>
## 3         2007     Gable  CompShg     CemntBd     CmentBd       <NA>
## 4         2003     Gable  CompShg     VinylSd     VinylSd       <NA>
## 5         2008     Gable  CompShg     CemntBd     CmentBd       <NA>
## 6         2007     Gable  CompShg     VinylSd     VinylSd       <NA>
## 7         2006     Gable  CompShg     VinylSd     VinylSd       <NA>
## 8         2002     Gable  CompShg     VinylSd     VinylSd       <NA>
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1         NA        Gd        TA      PConc       Gd       TA           No
## 2         NA        Gd        TA      PConc       TA       TA           No
## 3         NA        Gd        TA      PConc       Gd       TA           No
## 4         NA        Gd        TA      PConc       Gd       TA           No
## 5         NA        Gd        TA      PConc       Gd       TA           No
## 6         NA        Gd        TA      PConc       Gd       TA           No
## 7         NA        Ex        TA      PConc       Ex       Gd           Gd
## 8         NA        Gd        TA      PConc       Gd       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          GLQ        625          Unf          0       235         860
## 2          Rec       1219          Unf          0       816        2035
## 3          Unf          0          Unf          0       813         813
## 4          GLQ        833          Unf          0       343        1176
## 5          Unf          0          Unf          0      1428        1428
## 6          GLQ       1106          Unf          0       135        1241
## 7          GLQ       1386          Unf          0       690        2076
## 8          GLQ        804          Unf          0       324        1128
##   Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr      860     1100            0
## 2    GasA        TA          Y      SBrkr     2515        0            0
## 3    GasA        Ex          Y      SBrkr      822      843            0
## 4    GasA        Ex          Y      SBrkr     1200        0            0
## 5    GasA        Ex          Y      SBrkr     1428        0            0
## 6    GasA        Ex          Y      SBrkr     1241        0            0
## 7    GasA        Ex          Y      SBrkr     2076        0            0
## 8    GasA        Ex          Y      SBrkr     1128      903            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1960            1            0        2        1            4
## 2      2515            1            0        3        0            4
## 3      1665            0            0        2        1            3
## 4      1200            1            0        2        0            2
## 5      1428            0            0        2        0            3
## 6      1241            1            0        1        1            1
## 7      2076            1            0        2        1            2
## 8      2031            1            0        2        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          2          TA
## 2            2          TA            9       Maj1          2          TA
## 3            1          Gd            7        Typ          0        <NA>
## 4            1          Gd            5        Typ          0        <NA>
## 5            1          Gd            6        Typ          0        <NA>
## 6            1          Gd            4        Typ          0        <NA>
## 7            1          Ex            7        Typ          1          Gd
## 8            1          Gd            7        Typ          1          Gd
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1    BuiltIn        2002          Fin          2        440         TA
## 2     Attchd        1975          RFn          2        484         TA
## 3     Attchd        2007          RFn          2        562         TA
## 4     Attchd        2003          RFn          2        555         TA
## 5     Attchd        2007          Fin          2        480         TA
## 6     Attchd        2007          Fin          2        569         TA
## 7     Attchd        2006          Fin          3        850         TA
## 8     Attchd        2002          RFn          2        577         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1         TA          Y        288          48             0         0
## 2         TA          Y          0           0           200         0
## 3         TA          Y          0           0             0         0
## 4         TA          Y          0          41             0         0
## 5         TA          Y          0         120             0         0
## 6         TA          Y          0         116             0         0
## 7         TA          Y        216         229             0         0
## 8         TA          Y          0         211             0         0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      5   2010
## 2           0        0   <NA>  <NA>        <NA>       0      3   2007
## 3           0        0   <NA>  <NA>        <NA>       0      5   2008
## 4           0        0   <NA>  <NA>        <NA>       0      8   2009
## 5           0        0   <NA>  <NA>        <NA>       0     12   2008
## 6           0        0   <NA>  <NA>        <NA>       0     11   2007
## 7           0        0   <NA>  <NA>        <NA>       0      9   2006
## 8           0        0   <NA>  <NA>        <NA>       0      3   2008
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    216500
## 2       WD        Alloca    200624
## 3       WD        Normal    205950
## 4       WD        Normal    184900
## 5      New       Partial    182000
## 6      New       Partial    199900
## 7      New       Partial    465000
## 8       WD        Normal    237000
boston_raw%>%filter(is.na(MasVnrType))%>%select(Id)==boston_raw%>%filter(is.na(MasVnrArea))%>%select(Id)
##        Id
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
## [7,] TRUE
## [8,] TRUE

The house that has a missing value in MasVnrType also has a missing value in MasVnrArea.

I would consider two solutions: 1) use mice package to do the imputation 2) Assign missing values in MasVnrType to None and MasVnrArea to 0.

  1. Look at the missing values at GarageYrBlt.
boston_raw%>%filter(is.na(GarageYrBlt))%>%head()
##    Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  40         90       RL          65    6040   Pave  <NA>      Reg
## 2  49        190       RM          33    4456   Pave  <NA>      Reg
## 3  79         90       RL          72   10778   Pave  <NA>      Reg
## 4  89         50  C (all)         105    8470   Pave  <NA>      IR1
## 5  90         20       RL          60    8070   Pave  <NA>      Reg
## 6 100         20       RL          77    9320   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Inside       Gtl      Edwards       Norm
## 2         Lvl    AllPub    Inside       Gtl      OldTown       Norm
## 3         Lvl    AllPub    Inside       Gtl       Sawyer       Norm
## 4         Lvl    AllPub    Corner       Gtl       IDOTRR      Feedr
## 5         Lvl    AllPub    Inside       Gtl      CollgCr       Norm
## 6         Lvl    AllPub    Inside       Gtl        NAmes       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm   Duplex     1Story           4           5      1955
## 2       Norm   2fmCon     2Story           4           5      1920
## 3       Norm   Duplex     1Story           4           5      1968
## 4      Feedr     1Fam     1.5Fin           3           2      1915
## 5       Norm     1Fam     1Story           4           5      1994
## 6       Norm     1Fam     1Story           4           5      1959
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1955     Gable  CompShg     AsbShng     Plywood       None
## 2         2008     Gable  CompShg     MetalSd     MetalSd       None
## 3         1968       Hip  CompShg     HdBoard     HdBoard       None
## 4         1982       Hip  CompShg     Plywood     Plywood       None
## 5         1995     Gable  CompShg     VinylSd     VinylSd       None
## 6         1959     Gable  CompShg     Plywood     Plywood       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1          0        TA        TA      PConc     <NA>     <NA>         <NA>
## 2          0        TA        TA     BrkTil       TA       TA           No
## 3          0        TA        TA     CBlock       TA       TA           No
## 4          0        Fa        Fa     CBlock       TA       Fa           No
## 5          0        TA        TA      PConc       Gd       TA           No
## 6          0        TA        TA     CBlock       TA       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1         <NA>          0         <NA>          0         0           0
## 2          Unf          0          Unf          0       736         736
## 3          Unf          0          Unf          0      1768        1768
## 4          Unf          0          Unf          0      1013        1013
## 5          GLQ        588          Unf          0       402         990
## 6          ALQ        569          Unf          0       381         950
##   Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1    GasA        TA          N      FuseP     1152        0            0
## 2    GasA        Gd          Y      SBrkr      736      716            0
## 3    GasA        TA          N      SBrkr     1768        0            0
## 4    GasA        TA          N      SBrkr     1013        0          513
## 5    GasA        Ex          Y      SBrkr      990        0            0
## 6    GasA        Fa          Y      SBrkr     1225        0            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      1152            0            0        2        0            2
## 2      1452            0            0        2        0            2
## 3      1768            0            0        2        0            4
## 4      1526            0            0        1        0            2
## 5       990            1            0        1        0            3
## 6      1225            1            0        1        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            2          Fa            6        Typ          0        <NA>
## 2            3          TA            8        Typ          0        <NA>
## 3            2          TA            8        Typ          0        <NA>
## 4            1          Fa            6        Typ          0        <NA>
## 5            1          TA            5        Typ          0        <NA>
## 6            1          TA            6        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1       <NA>          NA         <NA>          0          0       <NA>
## 2       <NA>          NA         <NA>          0          0       <NA>
## 3       <NA>          NA         <NA>          0          0       <NA>
## 4       <NA>          NA         <NA>          0          0       <NA>
## 5       <NA>          NA         <NA>          0          0       <NA>
## 6       <NA>          NA         <NA>          0          0       <NA>
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1       <NA>          N          0           0             0         0
## 2       <NA>          N          0           0           102         0
## 3       <NA>          Y          0           0             0         0
## 4       <NA>          N          0           0           156         0
## 5       <NA>          Y          0           0             0         0
## 6       <NA>          Y        352           0             0         0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        <NA>       0      6   2008
## 2           0        0   <NA>  <NA>        <NA>       0      6   2009
## 3           0        0   <NA>  <NA>        <NA>       0      4   2010
## 4           0        0   <NA> MnPrv        <NA>       0     10   2009
## 5           0        0   <NA>  <NA>        <NA>       0      8   2007
## 6           0        0   <NA>  <NA>        Shed     400      1   2010
##   SaleType SaleCondition SalePrice
## 1       WD       AdjLand     82000
## 2      New       Partial    113000
## 3       WD        Normal    136500
## 4    ConLD       Abnorml     85000
## 5       WD        Normal    123600
## 6       WD        Normal    128950

It seems that other garage features are also missing when GarageYrBlt is missing.

Let us check.

boston_raw%>%filter(is.na(GarageYrBlt))%>%select(Id)==boston_raw%>%filter(is.na(GarageQual))%>%select(Id)
##         Id
##  [1,] TRUE
##  [2,] TRUE
##  [3,] TRUE
##  [4,] TRUE
##  [5,] TRUE
##  [6,] TRUE
##  [7,] TRUE
##  [8,] TRUE
##  [9,] TRUE
## [10,] TRUE
## [11,] TRUE
## [12,] TRUE
## [13,] TRUE
## [14,] TRUE
## [15,] TRUE
## [16,] TRUE
## [17,] TRUE
## [18,] TRUE
## [19,] TRUE
## [20,] TRUE
## [21,] TRUE
## [22,] TRUE
## [23,] TRUE
## [24,] TRUE
## [25,] TRUE
## [26,] TRUE
## [27,] TRUE
## [28,] TRUE
## [29,] TRUE
## [30,] TRUE
## [31,] TRUE
## [32,] TRUE
## [33,] TRUE
## [34,] TRUE
## [35,] TRUE
## [36,] TRUE
## [37,] TRUE
## [38,] TRUE
## [39,] TRUE
## [40,] TRUE
## [41,] TRUE
## [42,] TRUE
## [43,] TRUE
## [44,] TRUE
## [45,] TRUE
## [46,] TRUE
## [47,] TRUE
## [48,] TRUE
## [49,] TRUE
## [50,] TRUE
## [51,] TRUE
## [52,] TRUE
## [53,] TRUE
## [54,] TRUE
## [55,] TRUE
## [56,] TRUE
## [57,] TRUE
## [58,] TRUE
## [59,] TRUE
## [60,] TRUE
## [61,] TRUE
## [62,] TRUE
## [63,] TRUE
## [64,] TRUE
## [65,] TRUE
## [66,] TRUE
## [67,] TRUE
## [68,] TRUE
## [69,] TRUE
## [70,] TRUE
## [71,] TRUE
## [72,] TRUE
## [73,] TRUE
## [74,] TRUE
## [75,] TRUE
## [76,] TRUE
## [77,] TRUE
## [78,] TRUE
## [79,] TRUE
## [80,] TRUE
## [81,] TRUE

Let us look at the relationship between GarageYrBlt and SalePrice. Plus, we will color the points with different OverallQual.

library(ggplot2)
boston_raw$OverallQual=as.factor(boston_raw$OverallQual)
boston_raw%>%ggplot(aes(x=GarageYrBlt,y=log1p(SalePrice)))+geom_point(aes(color=OverallQual),alpha=0.6)
## Warning: Removed 81 rows containing missing values (geom_point).

There are no perfect solutions to this NA value.

There are three solutions available: 1) assign it a median value 2) assign it a zero value 3) assign it a value using CART imputation.

  1. Last but not the least LotFrontage:the Linear feet of street connected to property.
boston_raw%>%filter(is.na(LotFrontage))%>%head()
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1  8         60       RL          NA   10382   Pave  <NA>      IR1
## 2 13         20       RL          NA   12968   Pave  <NA>      IR2
## 3 15         20       RL          NA   10920   Pave  <NA>      IR1
## 4 17         20       RL          NA   11241   Pave  <NA>      IR1
## 5 25         20       RL          NA    8246   Pave  <NA>      IR1
## 6 32         20       RL          NA    8544   Pave  <NA>      IR1
##   LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1         Lvl    AllPub    Corner       Gtl       NWAmes       PosN
## 2         Lvl    AllPub    Inside       Gtl       Sawyer       Norm
## 3         Lvl    AllPub    Corner       Gtl        NAmes       Norm
## 4         Lvl    AllPub   CulDSac       Gtl        NAmes       Norm
## 5         Lvl    AllPub    Inside       Gtl       Sawyer       Norm
## 6         Lvl    AllPub   CulDSac       Gtl       Sawyer       Norm
##   Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1       Norm     1Fam     2Story           7           6      1973
## 2       Norm     1Fam     1Story           5           6      1962
## 3       Norm     1Fam     1Story           6           5      1960
## 4       Norm     1Fam     1Story           6           7      1970
## 5       Norm     1Fam     1Story           5           8      1968
## 6       Norm     1Fam     1Story           5           6      1966
##   YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1         1973     Gable  CompShg     HdBoard     HdBoard      Stone
## 2         1962       Hip  CompShg     HdBoard     Plywood       None
## 3         1960       Hip  CompShg     MetalSd     MetalSd    BrkFace
## 4         1970     Gable  CompShg     Wd Sdng     Wd Sdng    BrkFace
## 5         2001     Gable  CompShg     Plywood     Plywood       None
## 6         2006     Gable  CompShg     HdBoard     HdBoard       None
##   MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1        240        TA        TA     CBlock       Gd       TA           Mn
## 2          0        TA        TA     CBlock       TA       TA           No
## 3        212        TA        TA     CBlock       TA       TA           No
## 4        180        TA        TA     CBlock       TA       TA           No
## 5          0        TA        Gd     CBlock       TA       TA           Mn
## 6          0        TA        TA     CBlock       TA       TA           No
##   BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1          ALQ        859          BLQ         32       216        1107
## 2          ALQ        737          Unf          0       175         912
## 3          BLQ        733          Unf          0       520        1253
## 4          ALQ        578          Unf          0       426        1004
## 5          Rec        188          ALQ        668       204        1060
## 6          Unf          0          Unf          0      1228        1228
##   Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1    GasA        Ex          Y      SBrkr     1107      983            0
## 2    GasA        TA          Y      SBrkr      912        0            0
## 3    GasA        TA          Y      SBrkr     1253        0            0
## 4    GasA        Ex          Y      SBrkr     1004        0            0
## 5    GasA        Ex          Y      SBrkr     1060        0            0
## 6    GasA        Gd          Y      SBrkr     1228        0            0
##   GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1      2090            1            0        2        1            3
## 2       912            1            0        1        0            2
## 3      1253            1            0        1        1            2
## 4      1004            1            0        1        0            2
## 5      1060            1            0        1        0            3
## 6      1228            0            0        1        1            3
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          TA            7        Typ          2          TA
## 2            1          TA            4        Typ          0        <NA>
## 3            1          TA            5        Typ          1          Fa
## 4            1          TA            5        Typ          1          TA
## 5            1          Gd            6        Typ          1          TA
## 6            1          Gd            6        Typ          0        <NA>
##   GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1     Attchd        1973          RFn          2        484         TA
## 2     Detchd        1962          Unf          1        352         TA
## 3     Attchd        1960          RFn          1        352         TA
## 4     Attchd        1970          Fin          2        480         TA
## 5     Attchd        1968          Unf          1        270         TA
## 6     Attchd        1966          Unf          1        271         TA
##   GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1         TA          Y        235         204           228         0
## 2         TA          Y        140           0             0         0
## 3         TA          Y          0         213           176         0
## 4         TA          Y          0           0             0         0
## 5         TA          Y        406          90             0         0
## 6         TA          Y          0          65             0         0
##   ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1           0        0   <NA>  <NA>        Shed     350     11   2009
## 2         176        0   <NA>  <NA>        <NA>       0      9   2008
## 3           0        0   <NA>  GdWo        <NA>       0      5   2008
## 4           0        0   <NA>  <NA>        Shed     700      3   2010
## 5           0        0   <NA> MnPrv        <NA>       0      5   2010
## 6           0        0   <NA> MnPrv        <NA>       0      6   2008
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    200000
## 2       WD        Normal    144000
## 3       WD        Normal    157000
## 4       WD        Normal    149000
## 5       WD        Normal    154000
## 6       WD        Normal    149350

As per usual, the two solutions we could compare is: 1) assign 0 to the NA value in LotFrontage 2) use CART to impute the value.

After analyzing the missing values, let us actually do clean the data with various method and compare them with baseline models.

The details of the data cleaning code is as follows.

Use a function called timeRecordA to record the time used in the calculation. (details of timeRecord functions will be discussed in later blogs.)

The first method of data cleaning is done as in the function data_clean1().

1.get rid of features with more than 50% of missing value 2.Use mice pacakge and apply random forest method to impute the missing value in features (make one copy instead of 5).

It is important to exclude target variable, SalePrice, in the process.

# record the time 
timeRecordA()
## [1] "Fun time log has been created"
##########################################################################################
data_clean1=function(data){
  
  library(mice)
  # get rid of features with more than 50% of missing value. 
  data1=data%>%select_comp(p=0.5)%>%char_to_factor()
  # Never impute with target variable 
  data1_impute=data1%>%select(-SalePrice)
  
  data1_impute=data1_impute%>%mice(method="rf",m=1,seed=7) # just make one imputation instead of the default 5 results. 
  
  data1_impute=complete(data1_impute,1)
  data1=cbind(data1_impute,SalePrice=data1$SalePrice)
  return(data1)
}

boston_clean1=boston_raw%>%data_clean1()
## .
## character    factor   integer 
##        39         1        37 
## .
##  factor integer 
##      40      37 
## 
##  iter imp variable
##   1   1  LotFrontage  MasVnrArea  GarageYrBlt  MasVnrType  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinType2  Electrical  FireplaceQu  GarageType  GarageFinish  GarageQual  GarageCond
##   2   1  LotFrontage  MasVnrArea  GarageYrBlt  MasVnrType  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinType2  Electrical  FireplaceQu  GarageType  GarageFinish  GarageQual  GarageCond
##   3   1  LotFrontage  MasVnrArea  GarageYrBlt  MasVnrType  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinType2  Electrical  FireplaceQu  GarageType  GarageFinish  GarageQual  GarageCond
##   4   1  LotFrontage  MasVnrArea  GarageYrBlt  MasVnrType  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinType2  Electrical  FireplaceQu  GarageType  GarageFinish  GarageQual  GarageCond
##   5   1  LotFrontage  MasVnrArea  GarageYrBlt  MasVnrType  BsmtQual  BsmtCond  BsmtExposure  BsmtFinType1  BsmtFinType2  Electrical  FireplaceQu  GarageType  GarageFinish  GarageQual  GarageCond
timeRecordB()
##########################################################################################

The details of how mice package works will be too long to be included here, and we will do that in future blogs.

The second method of data cleaning is done as in the function data_clean2()

1.Fit the missing value in Electrical with the majority type 2.Fit the missing value in MasVnrType and MasVnrArea with “None”, a new factor value. 3.Fit the missing value in GarageYrBlt with 0. (I know there will be many other ways to impute year, but I believe it will not affect prediction too much. You could try to verify that yourself.) 4.Fit the missing value in LotFrontage with 0, too.

data_clean2=function(data){
  features=c("Alley","BsmtQual","BsmtCond",
             "BsmtExposure","BsmtFinType1","BsmtFinType2",
             "FireplaceQu","GarageType","GarageFinish","GarageQual",
             "GarageCond","PoolQC","Fence","MiscFeature")
  data2=data%>%na_to_fac(names=features)
  
  
  # Fit the missing value in Electrical with the majority type 
  data2%>%select(Electrical)%>%table(useNA="always")
  data2[is.na(Electrical),"Electrical"]= "SBrkr"
  
  ## Fit the missing value in MasVnrType and MasVnrArea
  
  data2[is.na(MasVnrType),"MasVnrType"]="None"
  data2[is.na(MasVnrArea),"MasVnrArea"]=0
  data2%>%miss_pct_df()
  ## Fit the missing value in GarageYrBlt
  
  data2[is.na(GarageYrBlt),"GarageYrBlt"]=0
  
  ## Fit the missing value in LotFrontage
  
  data2[is.na(LotFrontage),"LotFrontage"]=0
  data2=data2%>%char_to_factor()
  return(data2)
  
}

boston_clean2=boston_raw%>%data_clean2()
## [1] "There are 2 features with missing values."
## .
## character    factor   integer 
##        43         1        37 
## .
##  factor integer 
##      44      37
timeRecordB()
########################################################################

The third method of data cleaning.

1.Fit the missing value in Electrical with the majority type 2.Fit the missing value in MasVnrType with “None”, a new factor value. 2.Impute the rest with mice package. (Random forest, or CART, or sample method, which are corresponding to boston_clean3, boston_clean4,boston_clean5)

data_clean3=function(data,method){
  
  library(mice)
  features=c("Alley","BsmtQual","BsmtCond",
             "BsmtExposure","BsmtFinType1","BsmtFinType2",
             "FireplaceQu","GarageType","GarageFinish","GarageQual",
             "GarageCond","PoolQC","Fence","MiscFeature")
  data3=data%>%na_to_fac(names=features)
  data3[is.na(MasVnrType),"MasVnrType"]="None"
  data3[is.na(Electrical),"Electrical"]= "SBrkr"
  
  # Remove targe variable from imputation 
  data3_impute=data3%>%select(-SalePrice)
  data3_impute=data3_impute%>%mice(method=method,m=1,seed=7)
  data3_impute=complete(data3_impute,1)
  data3=cbind(data3_impute,SalePrice=data3$SalePrice)
  data3=data3%>%char_to_factor()
  data3%>%miss_pct_df()
  
  return(data3)
  
}


# Random forest imputations
boston_clean3=boston_raw%>%data_clean3(method="rf")
## 
##  iter imp variable
##   1   1  LotFrontage  MasVnrArea  GarageYrBlt
##   2   1  LotFrontage  MasVnrArea  GarageYrBlt
##   3   1  LotFrontage  MasVnrArea  GarageYrBlt
##   4   1  LotFrontage  MasVnrArea  GarageYrBlt
##   5   1  LotFrontage  MasVnrArea  GarageYrBlt
## .
## character    factor   integer 
##        43         1        37 
## .
##  factor integer 
##      44      37 
## [1] "There are 0 features with missing values."
timeRecordB()
###################################################################################################
# Classification and regression trees 
boston_clean4=boston_raw%>%data_clean3(method="cart")
## 
##  iter imp variable
##   1   1  LotFrontage  MasVnrArea  GarageYrBlt
##   2   1  LotFrontage  MasVnrArea  GarageYrBlt
##   3   1  LotFrontage  MasVnrArea  GarageYrBlt
##   4   1  LotFrontage  MasVnrArea  GarageYrBlt
##   5   1  LotFrontage  MasVnrArea  GarageYrBlt
## .
## character    factor   integer 
##        43         1        37 
## .
##  factor integer 
##      44      37 
## [1] "There are 0 features with missing values."
timeRecordB()
###################################################################################################
# Random sample from the observed values
boston_clean5=boston_raw%>%data_clean3(method="sample")
## 
##  iter imp variable
##   1   1  LotFrontage  MasVnrArea  GarageYrBlt
##   2   1  LotFrontage  MasVnrArea  GarageYrBlt
##   3   1  LotFrontage  MasVnrArea  GarageYrBlt
##   4   1  LotFrontage  MasVnrArea  GarageYrBlt
##   5   1  LotFrontage  MasVnrArea  GarageYrBlt
## .
## character    factor   integer 
##        43         1        37 
## .
##  factor integer 
##      44      37 
## [1] "There are 0 features with missing values."
timeRecordB()
###################################################################################################

The fourth method of data cleaning.

1.Fit the missing value in Electrical with the majority type 2.Fit the missing value in MasVnrType with “None”, a new factor value. 3.Impute rest of the features with preProcess function in the caret package by applying knn or median impute.(knnImpute and medianImpute correspond to boston_clean6 and boston_clean7 )

# We will try to use the preProcess function in caret package and apply knnImpute and medianImpute to impute for the missing data
data_clean5=function(data,method){
  
  library(caret)
  features=c("Alley","BsmtQual","BsmtCond",
             "BsmtExposure","BsmtFinType1","BsmtFinType2",
             "FireplaceQu","GarageType","GarageFinish","GarageQual",
             "GarageCond","PoolQC","Fence","MiscFeature")
  data5=data%>%na_to_fac(names=features)
  
  
  
  data5[is.na(MasVnrType),"MasVnrType"]="None"
  data5[is.na(Electrical),"Electrical"]= "SBrkr"
  # Remove targe variable from imputation 
  data5_impute=data5%>%select(-SalePrice)%>%char_to_factor()
  
  data5_impute_f=data5_impute%>%preProcess(method=method)
  
  data5_impute=data5_impute_f%>%predict(data5_impute)
  
  
  
  data5=cbind(data5_impute,SalePrice=data5$SalePrice)
  
  
  return(data5)
  
}

# use knnImpute method

boston_clean6=boston_raw%>%data_clean5(method="knnImpute")
## Loading required package: lattice
## .
## character    factor   integer 
##        43         1        36 
## .
##  factor integer 
##      44      36
timeRecordB()
###################################################################################################
# use medianImpute method 


boston_clean7=boston_raw%>%data_clean5(method="medianImpute")
## .
## character    factor   integer 
##        43         1        36 
## .
##  factor integer 
##      44      36
timeRecordB()
###################################################################################################








timeRecordR()
##   user.self sys.self elapsed user.child sys.child run_time
## 1     3.108    0.052   3.158          0         0    0.000
## 2    73.008    0.628  74.522          0         0   71.364
## 3    73.084    0.628  74.599          0         0    0.077
## 4    76.000    0.632  77.521          0         0    2.922
## 5    77.892    0.636  79.422          0         0    1.901
## 6    78.168    0.636  79.698          0         0    0.276
## 7    79.020    0.640  80.555          0         0    0.857
## 8    79.380    0.640  80.917          0         0    0.362
boston_raw=boston_raw%>%char_to_factor()
## .
## character    factor   integer 
##        43         1        37 
## .
##  factor integer 
##      44      37
miss_pct_df(boston_clean1)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean2)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean3)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean4)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean5)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean6)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)
miss_pct_df(boston_clean7)
## [1] "There are 0 features with missing values."
## [1] miss_pct names   
## <0 rows> (or 0-length row.names)

The last few lines of output show that all dataset are imputed with no more missing values in them.

Next, let us look at the skewness of each feature. The skewness of a feature could affect the predictions of some machine learning algorithms, if the features have a really big value for skewness, then we need to do a Box-Cox transformation.

# First define a function to select integer or numeric value in features. 
select_num=function(data){
  library(magrittr)
  data=as.data.frame(data)
  
  data%>%sapply(class)%>%table()%>%print
  int_index=sapply(data,class)== "integer"
  int_index=int_index%>%as.logical()
  num_index=sapply(data,class)== "numeric"
  num_index=num_index%>%as.logical()
  # data[,!int_index]%>%sapply(class)%>%table
  
  data1=data[,int_index]
  data2=data[,num_index]
  data=cbind(data1,data2)
  data%>%sapply(class)%>%table%>%print
  return(data)
}
library(e1071)
library(dplyr)
# Select only integer or numeric data types for calculating skewness. 
boston_clean1%>%select_num()%>%sapply(skewness)
## .
##  factor integer 
##      40      37 
## .
## integer 
##      37
##            Id    MSSubClass   LotFrontage       LotArea   OverallCond 
##    0.00000000    1.40476562    2.78263224   12.18261502    0.69164401 
##     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1    BsmtFinSF2 
##   -0.61220121   -0.50252776    2.65780112    1.68204129    4.24652141 
##     BsmtUnfSF   TotalBsmtSF      1stFlrSF      2ndFlrSF  LowQualFinSF 
##    0.91837835    1.52112395    1.37392896    0.81135997    8.99283329 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##    1.36375364    0.59484237    4.09497490    0.03648647    0.67450925 
##  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd    Fireplaces   GarageYrBlt 
##    0.21135511    4.47917826    0.67495173    0.64823107   -0.59602452 
##    GarageCars    GarageArea    WoodDeckSF   OpenPorchSF EnclosedPorch 
##   -0.34184538    0.17961125    1.53820999    2.35948572    3.08352575 
##     3SsnPorch   ScreenPorch      PoolArea       MiscVal        MoSold 
##   10.28317840    4.11374731   14.79791829   24.42652237    0.21161746 
##        YrSold     SalePrice 
##    0.09607079    1.87900860
boston_clean2%>%select_num()%>%sapply(skewness)
## .
##  factor integer 
##      44      37 
## .
## integer 
##      37
##            Id    MSSubClass   LotFrontage       LotArea   OverallCond 
##    0.00000000    1.40476562    0.26727232   12.18261502    0.69164401 
##     YearBuilt  YearRemodAdd    MasVnrArea    BsmtFinSF1    BsmtFinSF2 
##   -0.61220121   -0.50252776    2.67211701    1.68204129    4.24652141 
##     BsmtUnfSF   TotalBsmtSF      1stFlrSF      2ndFlrSF  LowQualFinSF 
##    0.91837835    1.52112395    1.37392896    0.81135997    8.99283329 
##     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath      HalfBath 
##    1.36375364    0.59484237    4.09497490    0.03648647    0.67450925 
##  BedroomAbvGr  KitchenAbvGr  TotRmsAbvGrd    Fireplaces   GarageYrBlt 
##    0.21135511    4.47917826    0.67495173    0.64823107   -3.86155342 
##    GarageCars    GarageArea    WoodDeckSF   OpenPorchSF EnclosedPorch 
##   -0.34184538    0.17961125    1.53820999    2.35948572    3.08352575 
##     3SsnPorch   ScreenPorch      PoolArea       MiscVal        MoSold 
##   10.28317840    4.11374731   14.79791829   24.42652237    0.21161746 
##        YrSold     SalePrice 
##    0.09607079    1.87900860

It is obvious that some features have very high skewness. Thus, a Box-Cox transformation would seem necessary for better performance for some of the models. Is it really true? It is for you to try it out.