The target of this notebook is to show a way of dealing with missing values on the Housing Price dataset from kaggle.
The first step before dealing with the missing values is to have a quick look into the dataset.
First, we load the tidyverse package:
## Importing packages
library(tidyverse)
And our datasets:
#load data
test = read_csv("house_test.csv")
train = read_csv("house_train.csv")
For personal preference, I convert all variables to lowercase (helps typing it quicker!):
# View the column names
names(train)<-tolower(names(train))
names(test)<-tolower(names(test))
And we combine both train and test datasets to analyse the missing values all at once, excluding our dependant variable:
train_variables <- train %>%
select(-saleprice)
all_data <- rbind(train_variables, test)
Additionally, it is good to have a perspective of the dimension of the dataset:
dim(all_data)
[1] 2919 80
We have 80 variables and 2919 observations. Now, it is time to check the structure of the data:
# Look at the structure using dplyr's glimpse()
glimpse(all_data)
Rows: 2,919
Columns: 80
$ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, …
$ mssubclass <dbl> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20, 20, 45, 20, 90…
$ mszoning <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "RL", "RL", "RL",…
$ lotfrontage <dbl> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, NA, 51, NA, 72,…
$ lotarea <dbl> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 6120, 7420, 11200,…
$ street <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave",…
$ alley <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ lotshape <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", "Reg", "Reg", "…
$ landcontour <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "…
$ utilities <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "…
$ lotconfig <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "Inside", "Corner…
$ landslope <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "…
$ neighborhood <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "Mitchel", "Some…
$ condition1 <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm", "PosN", "Arter…
$ condition2 <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm",…
$ bldgtype <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam",…
$ housestyle <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fin", "1Story", "…
$ overallqual <dbl> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5, 5, 8, 7, 8, 5,…
$ overallcond <dbl> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5, 6, 5, 7, 5, 7,…
$ yearbuilt <dbl> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 1939, 1965, 2005,…
$ yearremodadd <dbl> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 1950, 1965, 2006,…
$ roofstyle <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "Gable",…
$ roofmatl <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "Comp…
$ exterior1st <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "VinylSd", "Viny…
$ exterior2nd <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "VinylSd", "Viny…
$ masvnrtype <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None", "Stone", "Ston…
$ masvnrarea <dbl> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, 212, 0, 180, 0,…
$ exterqual <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "TA", "TA", "Ex",…
$ extercond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA",…
$ foundation <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "PConc", "CBlock…
$ bsmtqual <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "TA", "TA", "Ex",…
$ bsmtcond <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA",…
$ bsmtexposure <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "No", "No", "No",…
$ bsmtfintype1 <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", "Unf", "GLQ", "…
$ bsmtfinsf1 <dbl> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 998, 737, 0, 733,…
$ bsmtfintype2 <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", "Unf", "Unf", "…
$ bsmtfinsf2 <dbl> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
$ bsmtunfsf <dbl> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 177, 175, 1494, 5…
$ totalbsmtsf <dbl> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 1040, 1175, 912, …
$ heating <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA",…
$ heatingqc <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex",…
$ centralair <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
$ electrical <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr",…
$ `1stflrsf` <dbl> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, 1040, 1182, 912…
$ `2ndflrsf` <dbl> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0, 0, 0, 0, 0, 0,…
$ lowqualfinsf <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ grlivarea <dbl> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 1077, 1040, 2324,…
$ bsmtfullbath <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1,…
$ bsmthalfbath <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ fullbath <dbl> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1, 1, 3, 1, 2, 1,…
$ halfbath <dbl> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,…
$ bedroomabvgr <dbl> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3, 3, 4, 3, 3, 3,…
$ kitchenabvgr <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,…
$ kitchenqual <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "TA", "TA", "Ex",…
$ totrmsabvgrd <dbl> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6, 6, 9, 6, 7, 6…
$ functional <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Min1", "Typ", …
$ fireplaces <dbl> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,…
$ fireplacequ <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", NA, "Gd", NA, "…
$ garagetype <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attchd", "Attchd", "…
$ garageyrblt <dbl> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 1939, 1965, 2005,…
$ garagefinish <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", "Unf", "RFn", "…
$ garagecars <dbl> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2, 1, 3, 1, 2, 2,…
$ garagearea <dbl> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 736, 352, 840, 3…
$ garagequal <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "Gd", "TA", "TA",…
$ garagecond <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA",…
$ paveddrive <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "…
$ wooddecksf <dbl> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160, 0, 48, 0, 0, …
$ openporchsf <dbl> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213, 112, 0, 0, 102…
$ enclosedporch <dbl> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, 0, …
$ `3ssnporch` <dbl> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ screenporch <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ poolarea <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ poolqc <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ fence <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA, NA, "GdWo", "G…
$ miscfeature <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, NA, NA, NA, NA,…
$ miscval <dbl> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700, 500, 0, 0, 0, …
$ mosold <dbl> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10, 6, 5, 11, 6, …
$ yrsold <dbl> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 2008, 2008, 2006,…
$ saletype <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "New"…
$ salecondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Normal", "Normal", …
It seems we have some numerical data that is not numerical really (like “mssubclass”) but we won’t be analysing it in this notebook.
Next step is to get a summary of all the data, to see if we find any inconsistency:
summary(all_data)
id mssubclass mszoning lotfrontage lotarea
Min. : 1.0 Min. : 20.00 Length:2919 Min. : 21.00 Min. : 1300
1st Qu.: 730.5 1st Qu.: 20.00 Class :character 1st Qu.: 59.00 1st Qu.: 7478
Median :1460.0 Median : 50.00 Mode :character Median : 68.00 Median : 9453
Mean :1460.0 Mean : 57.14 Mean : 69.31 Mean : 10168
3rd Qu.:2189.5 3rd Qu.: 70.00 3rd Qu.: 80.00 3rd Qu.: 11570
Max. :2919.0 Max. :190.00 Max. :313.00 Max. :215245
NA's :486
street alley lotshape landcontour utilities
Length:2919 Length:2919 Length:2919 Length:2919 Length:2919
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
lotconfig landslope neighborhood condition1 condition2
Length:2919 Length:2919 Length:2919 Length:2919 Length:2919
Class :character Class :character Class :character Class :character Class :character
Mode :character Mode :character Mode :character Mode :character Mode :character
bldgtype housestyle overallqual overallcond yearbuilt
Length:2919 Length:2919 Min. : 1.000 Min. :1.000 Min. :1872
Class :character Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
Mode :character Mode :character Median : 6.000 Median :5.000 Median :1973
Mean : 6.089 Mean :5.565 Mean :1971
3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
Max. :10.000 Max. :9.000 Max. :2010
yearremodadd roofstyle roofmatl exterior1st exterior2nd
Min. :1950 Length:2919 Length:2919 Length:2919 Length:2919
1st Qu.:1965 Class :character Class :character Class :character Class :character
Median :1993 Mode :character Mode :character Mode :character Mode :character
Mean :1984
3rd Qu.:2004
Max. :2010
masvnrtype masvnrarea exterqual extercond foundation
Length:2919 Min. : 0.0 Length:2919 Length:2919 Length:2919
Class :character 1st Qu.: 0.0 Class :character Class :character Class :character
Mode :character Median : 0.0 Mode :character Mode :character Mode :character
Mean : 102.2
3rd Qu.: 164.0
Max. :1600.0
NA's :23
bsmtqual bsmtcond bsmtexposure bsmtfintype1 bsmtfinsf1
Length:2919 Length:2919 Length:2919 Length:2919 Min. : 0.0
Class :character Class :character Class :character Class :character 1st Qu.: 0.0
Mode :character Mode :character Mode :character Mode :character Median : 368.5
Mean : 441.4
3rd Qu.: 733.0
Max. :5644.0
NA's :1
bsmtfintype2 bsmtfinsf2 bsmtunfsf totalbsmtsf heating
Length:2919 Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:2919
Class :character 1st Qu.: 0.00 1st Qu.: 220.0 1st Qu.: 793.0 Class :character
Mode :character Median : 0.00 Median : 467.0 Median : 989.5 Mode :character
Mean : 49.58 Mean : 560.8 Mean :1051.8
3rd Qu.: 0.00 3rd Qu.: 805.5 3rd Qu.:1302.0
Max. :1526.00 Max. :2336.0 Max. :6110.0
NA's :1 NA's :1 NA's :1
heatingqc centralair electrical 1stflrsf 2ndflrsf
Length:2919 Length:2919 Length:2919 Min. : 334 Min. : 0.0
Class :character Class :character Class :character 1st Qu.: 876 1st Qu.: 0.0
Mode :character Mode :character Mode :character Median :1082 Median : 0.0
Mean :1160 Mean : 336.5
3rd Qu.:1388 3rd Qu.: 704.0
Max. :5095 Max. :2065.0
lowqualfinsf grlivarea bsmtfullbath bsmthalfbath fullbath
Min. : 0.000 Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
1st Qu.: 0.000 1st Qu.:1126 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
Median : 0.000 Median :1444 Median :0.0000 Median :0.00000 Median :2.000
Mean : 4.694 Mean :1501 Mean :0.4299 Mean :0.06136 Mean :1.568
3rd Qu.: 0.000 3rd Qu.:1744 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
Max. :1064.000 Max. :5642 Max. :3.0000 Max. :2.00000 Max. :4.000
NA's :2 NA's :2
halfbath bedroomabvgr kitchenabvgr kitchenqual totrmsabvgrd
Min. :0.0000 Min. :0.00 Min. :0.000 Length:2919 Min. : 2.000
1st Qu.:0.0000 1st Qu.:2.00 1st Qu.:1.000 Class :character 1st Qu.: 5.000
Median :0.0000 Median :3.00 Median :1.000 Mode :character Median : 6.000
Mean :0.3803 Mean :2.86 Mean :1.045 Mean : 6.452
3rd Qu.:1.0000 3rd Qu.:3.00 3rd Qu.:1.000 3rd Qu.: 7.000
Max. :2.0000 Max. :8.00 Max. :3.000 Max. :15.000
functional fireplaces fireplacequ garagetype garageyrblt
Length:2919 Min. :0.0000 Length:2919 Length:2919 Min. :1895
Class :character 1st Qu.:0.0000 Class :character Class :character 1st Qu.:1960
Mode :character Median :1.0000 Mode :character Mode :character Median :1979
Mean :0.5971 Mean :1978
3rd Qu.:1.0000 3rd Qu.:2002
Max. :4.0000 Max. :2207
NA's :159
garagefinish garagecars garagearea garagequal garagecond
Length:2919 Min. :0.000 Min. : 0.0 Length:2919 Length:2919
Class :character 1st Qu.:1.000 1st Qu.: 320.0 Class :character Class :character
Mode :character Median :2.000 Median : 480.0 Mode :character Mode :character
Mean :1.767 Mean : 472.9
3rd Qu.:2.000 3rd Qu.: 576.0
Max. :5.000 Max. :1488.0
NA's :1 NA's :1
paveddrive wooddecksf openporchsf enclosedporch 3ssnporch
Length:2919 Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.000
Class :character 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.000
Mode :character Median : 0.00 Median : 26.00 Median : 0.0 Median : 0.000
Mean : 93.71 Mean : 47.49 Mean : 23.1 Mean : 2.602
3rd Qu.: 168.00 3rd Qu.: 70.00 3rd Qu.: 0.0 3rd Qu.: 0.000
Max. :1424.00 Max. :742.00 Max. :1012.0 Max. :508.000
screenporch poolarea poolqc fence miscfeature
Min. : 0.00 Min. : 0.000 Length:2919 Length:2919 Length:2919
1st Qu.: 0.00 1st Qu.: 0.000 Class :character Class :character Class :character
Median : 0.00 Median : 0.000 Mode :character Mode :character Mode :character
Mean : 16.06 Mean : 2.252
3rd Qu.: 0.00 3rd Qu.: 0.000
Max. :576.00 Max. :800.000
miscval mosold yrsold saletype salecondition
Min. : 0.00 Min. : 1.000 Min. :2006 Length:2919 Length:2919
1st Qu.: 0.00 1st Qu.: 4.000 1st Qu.:2007 Class :character Class :character
Median : 0.00 Median : 6.000 Median :2008 Mode :character Mode :character
Mean : 50.83 Mean : 6.213 Mean :2008
3rd Qu.: 0.00 3rd Qu.: 8.000 3rd Qu.:2009
Max. :17000.00 Max. :12.000 Max. :2010
If we have a closer look, it seems there is a typo in one observation, regarding “garageyrblt”:
all_data %>%
select(garageyrblt) %>%
summary()
garageyrblt
Min. :1895
1st Qu.:1960
Median :1979
Mean :1978
3rd Qu.:2002
Max. :2207
NA's :159
This variable is telling us the year where the garage was build. As we can see, the maximun value is 2207, which it is obviusly a typo.
As we saw, there is a typo in the data. In order to find out where the typo is, we can just filter the data for any year after 2020 (I will also compare it versus the year that the property was built to get more perspective):
all_data %>%
filter(garageyrblt>2020) %>%
select(yearbuilt, garageyrblt)
We can see that the year when the property was built was 2006 and so the garage probably was built in 2007. We will amend the value to 2007.
all_data <- all_data %>%
mutate(garageyrblt = ifelse(garageyrblt > 2020, 2007, garageyrblt))
Now, it is our turn to get more deeply into the missing values. A good management of them, should help our model to predict the sales price more accurate (in theory!).
We can visualise the missing values:
all_data %>%
gather(key = "key", value = "val") %>%
mutate(is.missing = is.na(val)) %>%
group_by(key, is.missing) %>%
summarise(num_missing = n())%>%
filter(is.missing == TRUE, num_missing > 1) %>%
select(-is.missing) %>%
arrange(desc(num_missing)) %>%
ggplot(aes(x = reorder(key, num_missing), y = num_missing, fill = key)) +
geom_col() +
coord_flip() +
xlab("Variable") +
ylab("Missing values")+
theme(legend.position='none')
`summarise()` regrouping output by 'key' (override with `.groups` argument)
It seems we have a good quantity of variables with missing values, specially the top 6.
Let’s see the actual amount of missing values per variable:
NAcol <- which(colSums(is.na(all_data)) > 0)
sort(colSums(sapply(all_data[NAcol], is.na)), decreasing = TRUE)
poolqc miscfeature alley fence fireplacequ lotfrontage garageyrblt
2909 2814 2721 2348 1420 486 159
garagefinish garagequal garagecond garagetype bsmtcond bsmtexposure bsmtqual
159 159 159 157 82 82 81
bsmtfintype2 bsmtfintype1 masvnrtype masvnrarea mszoning utilities bsmtfullbath
80 79 24 23 4 2 2
bsmthalfbath functional exterior1st exterior2nd bsmtfinsf1 bsmtfinsf2 bsmtunfsf
2 2 1 1 1 1 1
totalbsmtsf electrical kitchenqual garagecars garagearea saletype
1 1 1 1 1 1
We are going to work from the top to the bottom.
I have created the following funciton to have a visual look of most common observation, so that will help us when filling the missing values.
view_group_data <- function(data, col){
data %>%
count({{col}}) %>%
arrange(desc(n)) %>%
top_n(5)
}
Does the property have a swimming pool?
view_group_data(all_data, poolqc)
Selecting by n
The majority of the observations (2909) are telling us that the properties don’t have a swimming pool. However, if we compare the pool quality versus the area, we see some surprises:
pool.cols <- c('poolarea', 'poolqc')
all_data %>%
subset(select = pool.cols) %>%
filter(poolarea > 0, is.na(poolqc))
It seems that 3 rows have a pool area bigger that 0, which means that 3 missing values seem to be incorrect.
For simplicity, we will fill those 3 values with “Gd” as seems to be the most frequent value. We will change the other missing values to “No”
all_data <- all_data %>%
mutate(poolqc = case_when(
poolarea > 0 & is.na(poolqc) ~ "Gd",
is.na(poolqc) ~ "No",
TRUE ~ poolqc
)
)
Here are included those features not covered in other categories.
view_group_data(all_data, miscfeature)
Selecting by n
For now, we can fill the missing values with “No” for those observations with no extra features:
all_data <- all_data %>%
mutate(miscfeature = ifelse(is.na(miscfeature), "No", miscfeature))
As per the documentation, those NA mean that they dont have alley access. We can fill them with “No”
all_data <- all_data %>%
mutate(alley = ifelse(is.na(alley), "No", alley))
Similar to the previous variable, NA means “No” fence. We will fill the missing values with “No”.
all_data <- all_data %>%
mutate(fence = ifelse(is.na(fence), "No", fence))
view_group_data(all_data, fence)
Selecting by n
For fireplace quality missing values, a good thing to check is to see if any missing value is aligned with a fireplace:
all_data %>%
select(fireplaces, fireplacequ) %>%
filter(fireplaces > 0, is.na(fireplacequ))
Seems all to be correct, so we are sure that missing value means “No” fireplace:
all_data <- all_data %>%
mutate(fireplacequ = ifelse(is.na(fireplacequ), "No", fireplacequ))
For the linear feet of street connected to the property, we can assume that missing values means 0 linear feet:
all_data <- all_data %>%
mutate(lotfrontage = ifelse(is.na(lotfrontage), 0, lotfrontage))
For Garage variables, we will filter our data set with all the related variables:
garage.cols <- c('id', 'garageyrblt', 'garagearea', 'garagecars', 'garagequal', 'garagefinish', 'garagecond', 'garagetype')
It seems some columns have 159 missing values (“no garage”) and garagetype have 157 missing values.
A good chech is to filter the data to see if any observation has garage area and also missing values on other attributes:
all_data %>%
subset(select = garage.cols) %>%
filter(garagearea > 0, is.na(garagequal))
We can see that in this case the data seems to be incomplete, as the area is 360 and the garage have space for 1 car.
We can check for detached garages with similar area in order to fill the missing values:
all_data %>%
filter(garagecars == 1,garagetype == "Detchd") %>%
select(garagequal, garagefinish, garagecond) %>%
group_by(garagequal, garagefinish, garagecond) %>%
summarise(freq = n()) %>%
arrange(desc(freq))
`summarise()` regrouping output by 'garagequal', 'garagefinish' (override with `.groups` argument)
I will use the most frequent set to fill the missing values of the observation number 2127:
all_data <- all_data %>%
mutate(garagequal = ifelse(id == 2127, "TA", garagequal),
garagefinish = ifelse(id == 2127, "Unf", garagefinish),
garagecond = ifelse(id == 2127, "TA", garagecond),
garageyrblt = ifelse(id == 2127, yearbuilt, garageyrblt))
all_data %>%
subset(select = garage.cols) %>%
filter(id == 2127)
Regarding observation 2577, we have a detached garage but no more information, it sounds like no garage at all
all_data %>%
filter(garagetype == "Detchd", is.na(garagecars)) %>%
subset(select = garage.cols)
all_data <- all_data %>%
mutate(garagetype = ifelse(id == 2577, "No", garagetype))
Now we can fill all the rest of missing values. For garageyrblt we will use yearbuilt. For categorical data we will use “No” and for numerical 0. Also, we will add a column “hasgarage” that should be helpful when modeling the data.
all_data <- all_data %>%
mutate(hasgarage = ifelse(is.na(garagequal), "No", "Yes"),
garageyrblt = ifelse(is.na(garageyrblt), yearbuilt, garageyrblt),
garagearea = ifelse(is.na(garagearea), 0, garagearea),
garagecars = ifelse(is.na(garagecars), 0, garagecars),
garagequal = ifelse(is.na(garagequal), "No", garagequal),
garagefinish = ifelse(is.na(garagefinish), "No", garagefinish),
garagecond = ifelse(is.na(garagecond), "No", garagecond),
garagetype = ifelse(is.na(garagetype), "No", garagetype)
)
We will use a similar approach to fill the missing values on the basement variables. So, first we get all the columns from the data.
bsmt.cols <- c('id', 'bsmtqual', 'bsmtexposure', 'bsmtcond', 'bsmtfintype1', 'bsmtfinsf1', 'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'bsmtfullbath', 'bsmthalfbath')
We can have a quick look to see the missing values:
all_data %>%
subset(select = bsmt.cols) %>%
summarise_all(funs(sum(is.na(.)))) %>%
arrange()
`funs()` is deprecated as of dplyr 0.8.0.
Please use a list of either functions or lambdas:
# Simple named list:
list(mean = mean, median = median)
# Auto named with `tibble::lst()`:
tibble::lst(mean, median)
# Using lambdas
list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
This warning is displayed once every 8 hours.
Call `lifecycle::last_warnings()` to see where this warning was generated.
Comparing some basement variables with the total basement area greater than zero, we found the following variables where their missing values should not mean “No basement”:
For those with no basement, we will mark them as “NB” for categorical variables and 0 for numerical.
all_data <- all_data %>%
mutate(bsmtexposure = ifelse(is.na(bsmtexposure) & totalbsmtsf > 0, "No", bsmtexposure),
bsmtexposure = ifelse(is.na(bsmtexposure), "NB", bsmtexposure),
bsmtcond= ifelse(is.na(bsmtcond) & totalbsmtsf > 0, bsmtqual, bsmtcond),
bsmtcond= ifelse(is.na(bsmtcond), "NB", bsmtcond),
bsmtqual = ifelse(is.na(bsmtqual)& totalbsmtsf > 0, "TA", bsmtqual),
bsmtqual = ifelse(is.na(bsmtqual), "NB", bsmtqual),
bsmtfintype2 = ifelse(is.na(bsmtfintype2)& totalbsmtsf > 0, bsmtfintype1, bsmtfintype2),
bsmtfintype2 = ifelse(is.na(bsmtfintype2), "NB", bsmtfintype2),
bsmtfintype1 = ifelse(is.na(bsmtfintype1), "NB", bsmtfintype1),
bsmtfinsf1 = ifelse(is.na(bsmtfinsf1), 0, bsmtfinsf1),
bsmtunfsf = ifelse(is.na(bsmtunfsf), 0, bsmtunfsf),
bsmtfinsf2 = ifelse(is.na(bsmtfinsf2), 0, bsmtfinsf2),
totalbsmtsf = ifelse(is.na(totalbsmtsf), 0, totalbsmtsf),
bsmtfullbath = ifelse(is.na(bsmtfullbath), 0, bsmtfullbath),
bsmthalfbath = ifelse(is.na(bsmthalfbath), 0, bsmthalfbath))
One of the missing values for masvnrtype seems to have an area greater than zero. We can fill this missing value with the foundation value. For the rest of the missing values, we will fill them with “none” and 0.
all_data %>%
filter(is.na(masvnrtype), masvnrarea > 0)
all_data <- all_data %>%
mutate(masvnrtype = ifelse(is.na(masvnrtype) & masvnrarea > 0, foundation, masvnrtype),
masvnrtype = ifelse(is.na(masvnrtype), "None", masvnrtype),
masvnrarea = ifelse(is.na(masvnrarea), 0, masvnrarea))
We will fill the missing values with the most common value, “RL” in this case.
view_group_data(all_data, mszoning)
Selecting by n
#updated missing value with "RL"
all_data <- all_data %>%
mutate(mszoning = ifelse(is.na(mszoning), "RL", mszoning))
We will fill the missing values with the most common value, “AllPub” in this case.
view_group_data(all_data, utilities)
Selecting by n
#updated missing value with "AllPub"
all_data <- all_data %>%
mutate(utilities = ifelse(is.na(utilities), "AllPub", utilities))
As per the data description, we will assume “Typ” (Typical) for missing values:
all_data <- all_data %>%
mutate(functional = ifelse(is.na(functional), "Typ", functional))
We will fill the missing values with the most common value, “VinylSd” in this case.
view_group_data(all_data, exterior2nd)
Selecting by n
#updated missing values with "VinylSd"
all_data <- all_data %>%
mutate(exterior1st = ifelse(is.na(exterior1st), "VinylSd", exterior1st),
exterior2nd = ifelse(is.na(exterior2nd), "VinylSd", exterior2nd))
This must be an error, so we will use the most common value as well, “SBrkr”:
view_group_data(all_data, electrical)
Selecting by n
#updated missing value with "SBrkr"
all_data <- all_data %>%
mutate(electrical = ifelse(is.na(electrical), "SBrkr", electrical))
We will fill the missing values with the most common value, “TA” in this case.
view_group_data(all_data, kitchenqual)
Selecting by n
#updated missing value with "TA"
all_data <- all_data %>%
mutate(kitchenqual = ifelse(is.na(kitchenqual), "TA", kitchenqual))
We will fill the missing values with the most common value, “WD” in this case.
view_group_data(all_data, saletype)
Selecting by n
#updated missing value with "WD"
all_data <- all_data %>%
mutate(saletype = ifelse(is.na(saletype), "WD", saletype))