This data is from Kaggle, specifically the House Prices: Advanced Regression Techniques competition.
This data is available here:
https://www.kaggle.com/c/house-prices-advanced-regression-techniques
Load libraries.
library(ggplot2)
library(tidyr)
library(dplyr)
library(gridExtra)
library(MASS)
library(corrplot)
Read in training data.
Look at shape and first few rows.
train <- read.csv("train.csv",
header=TRUE,
row.names=1,
check.names=FALSE,
stringsAsFactors=FALSE)
dim(train)
## [1] 1460 80
head(train)
## MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape
## 1 60 RL 65 8450 Pave <NA> Reg
## 2 20 RL 80 9600 Pave <NA> Reg
## 3 60 RL 68 11250 Pave <NA> IR1
## 4 70 RL 60 9550 Pave <NA> IR1
## 5 60 RL 84 14260 Pave <NA> IR1
## 6 50 RL 85 14115 Pave <NA> IR1
## LandContour Utilities LotConfig LandSlope Neighborhood Condition1
## 1 Lvl AllPub Inside Gtl CollgCr Norm
## 2 Lvl AllPub FR2 Gtl Veenker Feedr
## 3 Lvl AllPub Inside Gtl CollgCr Norm
## 4 Lvl AllPub Corner Gtl Crawfor Norm
## 5 Lvl AllPub FR2 Gtl NoRidge Norm
## 6 Lvl AllPub Inside Gtl Mitchel Norm
## Condition2 BldgType HouseStyle OverallQual OverallCond YearBuilt
## 1 Norm 1Fam 2Story 7 5 2003
## 2 Norm 1Fam 1Story 6 8 1976
## 3 Norm 1Fam 2Story 7 5 2001
## 4 Norm 1Fam 2Story 7 5 1915
## 5 Norm 1Fam 2Story 8 5 2000
## 6 Norm 1Fam 1.5Fin 5 5 1993
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 1 2003 Gable CompShg VinylSd VinylSd BrkFace
## 2 1976 Gable CompShg MetalSd MetalSd None
## 3 2002 Gable CompShg VinylSd VinylSd BrkFace
## 4 1970 Gable CompShg Wd Sdng Wd Shng None
## 5 2000 Gable CompShg VinylSd VinylSd BrkFace
## 6 1995 Gable CompShg VinylSd VinylSd None
## MasVnrArea ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 1 196 Gd TA PConc Gd TA No
## 2 0 TA TA CBlock Gd TA Gd
## 3 162 Gd TA PConc Gd TA Mn
## 4 0 TA TA BrkTil TA Gd No
## 5 350 Gd TA PConc Gd TA Av
## 6 0 TA TA Wood Gd TA No
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 1 GLQ 706 Unf 0 150 856
## 2 ALQ 978 Unf 0 284 1262
## 3 GLQ 486 Unf 0 434 920
## 4 ALQ 216 Unf 0 540 756
## 5 GLQ 655 Unf 0 490 1145
## 6 GLQ 732 Unf 0 64 796
## Heating HeatingQC CentralAir Electrical 1stFlrSF 2ndFlrSF LowQualFinSF
## 1 GasA Ex Y SBrkr 856 854 0
## 2 GasA Ex Y SBrkr 1262 0 0
## 3 GasA Ex Y SBrkr 920 866 0
## 4 GasA Gd Y SBrkr 961 756 0
## 5 GasA Ex Y SBrkr 1145 1053 0
## 6 GasA Ex Y SBrkr 796 566 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 1 1710 1 0 2 1 3
## 2 1262 0 1 2 0 3
## 3 1786 1 0 2 1 3
## 4 1717 1 0 1 0 3
## 5 2198 1 0 2 1 4
## 6 1362 1 0 1 1 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## 1 Attchd 2003 RFn 2 548 TA
## 2 Attchd 1976 RFn 2 460 TA
## 3 Attchd 2001 RFn 2 608 TA
## 4 Detchd 1998 Unf 3 642 TA
## 5 Attchd 2000 RFn 3 836 TA
## 6 Attchd 1993 Unf 2 480 TA
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 1 TA Y 0 61 0 0
## 2 TA Y 298 0 0 0
## 3 TA Y 0 42 0 0
## 4 TA Y 0 35 272 0
## 5 TA Y 192 84 0 0
## 6 TA Y 40 30 0 320
## ScreenPorch PoolArea PoolQC Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 <NA> <NA> <NA> 0 2 2008
## 2 0 0 <NA> <NA> <NA> 0 5 2007
## 3 0 0 <NA> <NA> <NA> 0 9 2008
## 4 0 0 <NA> <NA> <NA> 0 2 2006
## 5 0 0 <NA> <NA> <NA> 0 12 2008
## 6 0 0 <NA> MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
Next, use the data descriptions to convert MSSubClass from numeric to character.
train$MSSubClass <- plyr::mapvalues(train$MSSubClass,
from=c(20,30,40,45,50,60,70,75,80,85,90,120,150,160,180,190),
to=c("1-STORY 1946 & NEWER ALL STYLES","1-STORY 1945 & OLDER","1-STORY W/FINISHED ATTIC ALL AGES","1-1/2 STORY - UNFINISHED ALL AGES","1-1/2 STORY FINISHED ALL AGES","2-STORY 1946 & NEWER","2-STORY 1945 & OLDER","2-1/2 STORY ALL AGES","SPLIT OR MULTI-LEVEL","SPLIT FOYER","DUPLEX - ALL STYLES AND AGES","1-STORY PUD - 1946 & NEWER","1-1/2 STORY PUD - ALL AGES","2-STORY PUD - 1946 & NEWER","PUD - MULTILEVEL - INCL SPLIT LEV/FOYER","2 FAMILY CONVERSION - ALL STYLES AND AGES"))
Read in test data. Look at shape.
test <- read.csv("test.csv",
header=TRUE,
row.names=1,
check.names=FALSE,
stringsAsFactors=FALSE)
dim(test)
## [1] 1459 79
Test data has a similar number of rows, and one less column (because it does not have the target). Makes sense.
Also perform the same operation on MSSubClass.
test$MSSubClass <- plyr::mapvalues(test$MSSubClass,
from=c(20,30,40,45,50,60,70,75,80,85,90,120,150,160,180,190),
to=c("1-STORY 1946 & NEWER ALL STYLES","1-STORY 1945 & OLDER","1-STORY W/FINISHED ATTIC ALL AGES","1-1/2 STORY - UNFINISHED ALL AGES","1-1/2 STORY FINISHED ALL AGES","2-STORY 1946 & NEWER","2-STORY 1945 & OLDER","2-1/2 STORY ALL AGES","SPLIT OR MULTI-LEVEL","SPLIT FOYER","DUPLEX - ALL STYLES AND AGES","1-STORY PUD - 1946 & NEWER","1-1/2 STORY PUD - ALL AGES","2-STORY PUD - 1946 & NEWER","PUD - MULTILEVEL - INCL SPLIT LEV/FOYER","2 FAMILY CONVERSION - ALL STYLES AND AGES"))
Check type of each variable. How many variables are categorical vs. numeric?
class_per_variable <- sapply(train,class)
table(class_per_variable)
## class_per_variable
## character integer
## 44 36
Check number of unique values per variable for the numeric variables.
numeric_variables_train_data <- train[,class_per_variable == "integer"]
apply(numeric_variables_train_data,2,function(x)length(unique(x)))
## LotFrontage LotArea OverallQual OverallCond YearBuilt
## 111 1073 10 9 112
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 61 328 637 144 780
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea
## 721 753 417 24 861
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 4 3 4 3 8
## KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 4 12 4 98 5
## GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 441 274 202 120 20
## ScreenPorch PoolArea MiscVal MoSold YrSold
## 76 8 21 12 5
## SalePrice
## 663
Some of these have only a few possible values. This makes sense. For example, FullBath counts the number of above-ground full bathrooms.
Let’s also count the number of 0’s per numeric variable.
apply(numeric_variables_train_data,2,function(x)length(which(x == 0)))
## LotFrontage LotArea OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF
## 0 861 467 1293 118
## TotalBsmtSF 1stFlrSF 2ndFlrSF LowQualFinSF GrLivArea
## 37 0 829 1434 0
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 856 1378 9 913 6
## KitchenAbvGr TotRmsAbvGrd Fireplaces GarageYrBlt GarageCars
## 1 0 690 0 81
## GarageArea WoodDeckSF OpenPorchSF EnclosedPorch 3SsnPorch
## 81 761 656 1252 1436
## ScreenPorch PoolArea MiscVal MoSold YrSold
## 1344 1453 1408 0 0
## SalePrice
## 0
We find a number of variables have a large proportion of 0’s. Most of these seem to make sense. For example, if MasVnrType = None, it makes sense that MasVnrArea would equal 0.
Let’s also check number of levels for each of the categorical variables.
categorical_variables_train_data <- train[,class_per_variable == "character"]
apply(categorical_variables_train_data,2,function(x)length(unique(x)))
## MSSubClass MSZoning Street Alley LotShape
## 15 5 2 3 4
## LandContour Utilities LotConfig LandSlope Neighborhood
## 4 2 5 3 25
## Condition1 Condition2 BldgType HouseStyle RoofStyle
## 9 8 5 8 6
## RoofMatl Exterior1st Exterior2nd MasVnrType ExterQual
## 8 15 16 5 4
## ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## 5 6 5 5 5
## BsmtFinType1 BsmtFinType2 Heating HeatingQC CentralAir
## 7 7 6 5 2
## Electrical KitchenQual Functional FireplaceQu GarageType
## 6 4 7 6 7
## GarageFinish GarageQual GarageCond PavedDrive PoolQC
## 4 6 6 3 4
## Fence MiscFeature SaleType SaleCondition
## 5 5 9 6
Max number of levels for a categorical variable is 25 (for neighborhood).
Note the following variables have NAs by design:
Would be good to also look at the number of NAs for these, and to see if there are any other variables with NAs.
First, get the table of possible values (incl. NA) for variables with known NAs.
vars_with_known_NAs <- c("Alley","BsmtQual","BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2","FireplaceQu","GarageType","GarageFinish","GarageQual","GarageCond","PoolQC","Fence","MiscFeature","GarageYrBlt")
apply(train[,vars_with_known_NAs],2,function(x)table(x,useNA="ifany"))
## $Alley
## x
## Grvl Pave <NA>
## 50 41 1369
##
## $BsmtQual
## x
## Ex Fa Gd TA <NA>
## 121 35 618 649 37
##
## $BsmtCond
## x
## Fa Gd Po TA <NA>
## 45 65 2 1311 37
##
## $BsmtExposure
## x
## Av Gd Mn No <NA>
## 221 134 114 953 38
##
## $BsmtFinType1
## x
## ALQ BLQ GLQ LwQ Rec Unf <NA>
## 220 148 418 74 133 430 37
##
## $BsmtFinType2
## x
## ALQ BLQ GLQ LwQ Rec Unf <NA>
## 19 33 14 46 54 1256 38
##
## $FireplaceQu
## x
## Ex Fa Gd Po TA <NA>
## 24 33 380 20 313 690
##
## $GarageType
## x
## 2Types Attchd Basment BuiltIn CarPort Detchd <NA>
## 6 870 19 88 9 387 81
##
## $GarageFinish
## x
## Fin RFn Unf <NA>
## 352 422 605 81
##
## $GarageQual
## x
## Ex Fa Gd Po TA <NA>
## 3 48 14 3 1311 81
##
## $GarageCond
## x
## Ex Fa Gd Po TA <NA>
## 2 35 9 7 1326 81
##
## $PoolQC
## x
## Ex Fa Gd <NA>
## 2 2 3 1453
##
## $Fence
## x
## GdPrv GdWo MnPrv MnWw <NA>
## 59 54 157 11 1179
##
## $MiscFeature
## x
## Gar2 Othr Shed TenC <NA>
## 2 2 49 1 1406
##
## $GarageYrBlt
## x
## 1900 1906 1908 1910 1914 1915 1916 1918 1920 1921 1922 1923 1924 1925 1926
## 1 1 1 3 2 2 5 2 14 3 5 3 3 10 6
## 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941
## 1 4 2 8 4 3 1 2 4 5 2 3 9 14 10
## 1942 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958
## 2 4 4 2 11 8 24 6 3 12 19 13 16 20 21
## 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
## 17 19 13 21 16 18 21 21 15 26 15 20 13 14 14
## 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988
## 18 9 29 35 19 15 15 10 4 7 8 10 6 11 14
## 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003
## 10 16 9 13 22 18 18 20 19 31 30 27 20 26 50
## 2004 2005 2006 2007 2008 2009 2010 <NA>
## 53 65 59 49 29 21 3 81
Next, check other variables for NAs.
num_NAs_per_column_minus_vars_with_known_NAs <- apply(train[,setdiff(colnames(train),vars_with_known_NAs)],2,function(x)length(which(is.na(x) == TRUE)))
print("Number of NAs for other variables:")
## [1] "Number of NAs for other variables:"
num_NAs_per_column_minus_vars_with_known_NAs[num_NAs_per_column_minus_vars_with_known_NAs > 0]
## LotFrontage MasVnrType MasVnrArea Electrical
## 259 8 8 1
Examine these variables in more detail.
print("Distribution of values for MasVnrType and Electrical:")
## [1] "Distribution of values for MasVnrType and Electrical:"
apply(train[,c("MasVnrType","Electrical")],2,function(x)table(x,useNA="ifany"))
## $MasVnrType
## x
## BrkCmn BrkFace None Stone <NA>
## 15 445 864 128 8
##
## $Electrical
## x
## FuseA FuseF FuseP Mix SBrkr <NA>
## 94 27 3 1 1334 1
print("Number of observations where both MasVnrType and MasVnrArea are NA:")
## [1] "Number of observations where both MasVnrType and MasVnrArea are NA:"
length(which(is.na(train$MasVnrType) == TRUE & is.na(train$MasVnrArea) == TRUE))
## [1] 8
print("Table of MasVnrArea when MasVnrType == None")
## [1] "Table of MasVnrArea when MasVnrType == None"
table(train$MasVnrArea[train$MasVnrType == "None"])
##
## 0 1 288 312 344
## 859 2 1 1 1
We find that most values for “Alley” are NA. Only 1460 - 1369 = 91 homes, or a bit over 6% of homes, have alley access. Then half and half gravel vs. paved. This variable seems like it may be best as a three-level variable.
Similar idea for “PoolQC”. Looks like just 7 homes in this data set (1460 - 1453) have a pool.
Most homes have basements. The fact that the number of NAs for basement-related variables is similar suggests that there are 37 homes with no basement, and then 1 home with a basement but where info on BsmtExposure specifically is missing.
Similar idea for all the garage-related variables. Assuming that there are 81 homes with no garage.
A bit under half of homes have an NA for FireplaceQu, suggesting that around half of homes have a fireplace. Of the remainder, the majority have either a “good” or “average” fireplace. This suggests that we may want to convert this variable to a simple binary of whether or not a home has a fireplace.
Finally, (1460 - 1179)/1460, or around 19%, of homes have a fence. Then, a “minimum privacy” fence seems to be the most common even when homes do have a fence. Then can probably combine “MnWw” (minimum wood/wire) into minimum privacy, and “GdWo” (good wood) into “GdPrv” (good privacy). So this variable can probably be converted to three levels (no fence, minimal fence, good fence) if not a binary fence/no fence.
Lastly, 54 homes have some kind of miscellaneous feature. Of these, a shed is the most common.
We also find missing values for some other variables (LotFrontage, MasVnrType, MasVnrArea, and Electrical).
LotFrontage is “linear feet of street connected to property”. This seems like an important variable, and it is missing for quite a few homes. We’ll need to figure out a strategy for filling this in.
For MasVnrType and MasVnrArea, it seems most logical to just assume that type=“None” and area=0 to fill in those 8 NA’s. For Electrical, it seems logical to just assume the NA is equal to “SBrkr”.
Let’s run some transformations to either convert NAs to their own factor level or fill them in, as appropriate.
Also switch some variables with intentional NAs to binary/tertiary variables.
These transformations will include:
train$Alley <- ifelse(is.na(train$Alley) == TRUE,"None",train$Alley)
train$BsmtQual <- ifelse(is.na(train$BsmtQual) == TRUE,"None",train$BsmtQual)
for(var in c("BsmtCond","BsmtExposure","BsmtFinType1","BsmtFinType2"))
{
train[,var] <- ifelse(train$BsmtQual == "None","None",train[,var])
}
train$BsmtExposure[is.na(train$BsmtExposure) == TRUE] <- "No"
train$BsmtFinType2[is.na(train$BsmtFinType2) == TRUE] <- "Rec"
train$FireplaceQu <- ifelse(is.na(train$FireplaceQu) == TRUE,"N","Y")
for(var in c("GarageType","GarageFinish","GarageQual","GarageCond"))
{
train[,var] <- ifelse(is.na(train[,var]) == TRUE,"None",train[,var])
}
train$GarageYrBlt[is.na(train$GarageYrBlt) == TRUE] <- median(train$GarageYrBlt,na.rm=TRUE)
train$PoolQC <- ifelse(is.na(train$PoolQC) == TRUE,"N","Y")
train$Fence <- ifelse(is.na(train$Fence) == TRUE,"None",train$Fence)
train$Fence <- plyr::mapvalues(train$Fence,from=c("MnPrv","MnWw","GdPrv","GdWo"),to=c("Minimal","Minimal","Good","Good"))
train$MiscFeature <- ifelse(is.na(train$MiscFeature) == TRUE,"N","Y")
train$MasVnrType <- ifelse(is.na(train$MasVnrType) == TRUE,"None",train$MasVnrType)
train$MasVnrArea <- ifelse(is.na(train$MasVnrArea) == TRUE,0,train$MasVnrArea)
train$Electrical[is.na(train$Electrical) == TRUE] <- "SBrkr"
Now only NAs left to fill in are for LotFrontage.
Let’s come back to this question later on, after we’ve done some additional exploratory analysis.
A very similar procedure to resolve missingness should be applied to the test data as well.
However, there are going to be some minor differences. For example, the test data has a few observations where BasementQual is NA, but the home actually has a basement.
Summary of strategy to remove NAs in the test data:
SaleType - Use mode (“WD”).
MiscFeature - Same strategy as before.
test_before_remove_NAs <- test
vars_to_replace_NA <- c("MSZoning","Utilities","Exterior1st","Exterior2nd","MasVnrArea","BsmtFinSF1","BsmtFinType2","BsmtFinSF2","BsmtUnfSF","TotalBsmtSF","BsmtFullBath","BsmtHalfBath","KitchenQual","Functional","SaleType","Alley","GarageType","BsmtFinType1")
values_to_use <- c("RL","AllPub","VinylSd","VinylSd","0","0","None",rep(as.character(0),times=5),"TA","Typ","WD","None","None","None")
for(i in 1:length(vars_to_replace_NA))
{
var = vars_to_replace_NA[i]
replacement = values_to_use[i]
if(replacement == "0"){replacement = 0}
test[,var] <- ifelse(is.na(test[,var]) == TRUE,replacement,test[,var])
}
test$MasVnrType[is.na(test$MasVnrType) == TRUE & test$MasVnrArea == 0] <- "None"
test$MasVnrType[is.na(test$MasVnrType) == TRUE & test$MasVnrArea > 0] <- "BrkFace"
for(var in c("BsmtQual","BsmtCond","BsmtExposure"))
{
test[test$BsmtFinType1 == "None",var] <- "None"
}
vars_to_replace_NA <- c("BsmtQual","BsmtCond","BsmtExposure")
values_to_use <- c("TA","TA","No")
for(i in 1:length(vars_to_replace_NA))
{
var = vars_to_replace_NA[i]
replacement = values_to_use[i]
if(replacement == "0"){replacement = 0}
test[,var] <- ifelse(is.na(test[,var]) == TRUE,replacement,test[,var])
}
for(var in c("GarageFinish","GarageQual","GarageCond"))
{
test[,var] <- ifelse(test$GarageType == "None","None",test[,var])
}
test$GarageYrBlt <- ifelse(is.na(test$GarageYrBlt) == TRUE,median(train$GarageYrBlt,na.rm=TRUE),test$GarageYrBlt)
vars_to_replace_NA <- c("GarageFinish","GarageQual","GarageCond")
values_to_use <- c("Unf","TA","TA")
for(i in 1:length(vars_to_replace_NA))
{
var = vars_to_replace_NA[i]
replacement = values_to_use[i]
if(replacement == "0"){replacement = 0}
test[,var] <- ifelse(is.na(test[,var]) == TRUE,replacement,test[,var])
}
test$GarageArea <- ifelse(is.na(test$GarageArea) == TRUE,median(train$GarageArea[train$GarageType == "Detchd"],na.rm=TRUE),test$GarageArea)
test$GarageCars <- ifelse(is.na(test$GarageCars) == TRUE,2,test$GarageCars)
test$PoolQC <- ifelse(is.na(test$PoolQC) == TRUE,"N","Y")
test$FireplaceQu <- ifelse(is.na(test$FireplaceQu) == TRUE,"N","Y")
test$Fence <- ifelse(is.na(test$Fence) == TRUE,"None",test$Fence)
test$Fence <- plyr::mapvalues(test$Fence,from=c("MnPrv","MnWw","GdPrv","GdWo"),to=c("Minimal","Minimal","Good","Good"))
test$MiscFeature <- ifelse(is.na(test$MiscFeature) == TRUE,"N","Y")
Start with Neighborhood since this has a lot more levels than the others.
Then do MSSubClass and Exterior1st, which each have 15-16 levels.
Exterior2nd is nearly always the same as Exterior1st, especially after correct a few typos (“Wd Shng” to “WdShing”, “CmentBd” to “CemntBd”, “BrkCmn” to “BrkComm”). So skip this variable.
Other variables we can remove include Condition2 (nearly always either identical to Condition1, or “Normal”) and BsmtFinType2 (nearly always either Unfinished, identical to BsmtFinType1, or a bit lower than BsmtFinType1).
mycol <- c("#004949","#009292","#FF6DB6","#FFB677","#490092","#006DDB","#B66DFF","#6DB6FF","#B6DBFF","#920000","#924900","#DBD100","#24FF24","#FFFF6D","#000000")
barplot_single_var <- function(var,horizontal=FALSE){
count_per_level <- data.frame(table(train[,var]))
colnames(count_per_level) <- c("Level","Num.homes")
count_per_level$Level <- as.vector(count_per_level$Level)
count_per_level <- count_per_level[order(count_per_level$Num.homes,decreasing=TRUE),]
count_per_level$Level <- factor(count_per_level$Level,levels=count_per_level$Level)
my_barplot <- ggplot(count_per_level,
aes(x=Level,y=Num.homes,fill=Level)) +
geom_bar(stat="identity") +
scale_fill_manual(values=c(mycol,mycol)) +
ylab("Number of homes") +
guides(fill=FALSE) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=Num.homes),vjust=0) +
xlab(var)
if(horizontal == TRUE){my_barplot <- my_barplot + coord_flip()}
return(my_barplot)
}
print(barplot_single_var("Neighborhood"))
print(barplot_single_var("MSSubClass",horizontal=TRUE))
print(barplot_single_var("Exterior1st"))
Next, we find a number of variables with most or all of the same levels indicating various quality levels. These include:
quality_vars <- c("ExterQual","ExterCond","BsmtQual","BsmtCond","HeatingQC","KitchenQual","GarageQual","GarageCond")
quality_var_counts <- train[,quality_vars] %>% gather() %>% count(key,value)
quality_var_counts <- data.frame(quality_var_counts)
quality_var_counts$value <- factor(quality_var_counts$value,levels=c("None","Po","Fa","TA","Gd","Ex"))
ggplot(quality_var_counts,
aes(x=key,y=n,fill=value)) +
geom_bar(stat="identity",position="dodge") +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2")) +
ylab("Number of homes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=n),vjust=-0.25,position=position_dodge(width=0.9)) +
xlab("")
Plot Street and Alley.
street_vs_alley_counts <- train[,c("Street","Alley")] %>% gather() %>% count(key,value)
street_vs_alley_counts <- data.frame(street_vs_alley_counts)
street_vs_alley_counts$value <- factor(street_vs_alley_counts$value,levels=c("None","Grvl","Pave"))
ggplot(street_vs_alley_counts,
aes(x=key,y=n,fill=value)) +
geom_bar(stat="identity",position="dodge") +
scale_fill_manual(values=c("lightgrey","#E69F00","darkgrey")) +
ylab("Number of homes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=n),vjust=-0.25,position=position_dodge(width=0.9)) +
xlab("")
Plot a number of yes/no variables.
yes_no_vars_data <- train[,c("CentralAir","FireplaceQu","PavedDrive","PoolQC","Utilities","MiscFeature")]
colnames(yes_no_vars_data) <- c("CentralAir","Fireplace","PavedDrive","Pool","Utilities.AllPub","MiscFeature")
yes_no_vars_data$Utilities.AllPub[yes_no_vars_data$Utilities.AllPub == "AllPub"] <- "Y"
yes_no_vars_data$Utilities.AllPub[yes_no_vars_data$Utilities.AllPub == "NoSeWa"] <- "N"
yes_no_vars_data$PavedDrive[yes_no_vars_data$PavedDrive == "P"] <- "Partial"
yes_no_vars_data <- yes_no_vars_data %>% gather() %>% count(key,value)
yes_no_vars_data <- data.frame(yes_no_vars_data)
yes_no_vars_data$value <- factor(yes_no_vars_data$value,levels=c("N","Y","Partial"))
ggplot(yes_no_vars_data,
aes(x=key,y=n,fill=value)) +
geom_bar(stat="identity",position="dodge") +
scale_fill_manual(values=c("lightgrey","black","darkgrey")) +
ylab("Number of homes") +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=n),vjust=-0.25,position=position_dodge(width=0.9)) +
xlab("")
Plot remaining assorted other variables.
pairs_of_vars <- data.frame(Var1 = c("MSZoning","Condition1","LotShape","LandContour","BldgType","BsmtExposure","Heating","GarageType","SaleType","RoofStyle","MasVnrType"),
Var2 = c("Fence","Functional","LotConfig","LandSlope","HouseStyle","BsmtFinType1","Electrical","GarageFinish","SaleCondition","RoofMatl","Foundation"),
stringsAsFactors=FALSE)
for(i in 1:nrow(pairs_of_vars))
{
plot1 <- barplot_single_var(pairs_of_vars$Var1[i])
plot2 <- barplot_single_var(pairs_of_vars$Var2[i])
grid.arrange(plot1,plot2,ncol=2)
}
Some variables are numeric, but are things like counts or years that are not continuous in this data set.
Let’s also make barplots of these.
barplot_single_var_from_numeric <- function(var,new_levels=unique(train[,var])){
count_per_level <- data.frame(table(train[,var]))
colnames(count_per_level) <- c("Level","Num.homes")
count_per_level$Level <- as.vector(count_per_level$Level)
count_per_level <- count_per_level[order(count_per_level$Num.homes,decreasing=TRUE),]
count_per_level$Level <- factor(count_per_level$Level,levels=new_levels)
my_barplot <- ggplot(count_per_level,
aes(x=Level,y=Num.homes,fill=Level)) +
geom_bar(stat="identity") +
scale_fill_manual(values=c(mycol,mycol)) +
ylab("Number of homes") +
guides(fill=FALSE) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
geom_text(aes(label=Num.homes),vjust=0) +
xlab(var)
return(my_barplot)
}
barplot_single_var_from_numeric("KitchenAbvGr",new_levels=0:3)
plot1 <- barplot_single_var_from_numeric("BedroomAbvGr",new_levels=c(0:6,8))
plot2 <- barplot_single_var_from_numeric("TotRmsAbvGrd",new_levels=c(2:12,14))
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("Fireplaces",new_levels=0:3)
plot2 <- barplot_single_var_from_numeric("GarageCars",new_levels=0:4)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("MoSold",new_levels=1:12)
plot2 <- barplot_single_var_from_numeric("YrSold",new_levels=2006:2010)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("OverallQual",new_levels=1:10)
plot2 <- barplot_single_var_from_numeric("OverallCond",new_levels=1:9)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("BsmtFullBath",new_levels=0:3)
plot2 <- barplot_single_var_from_numeric("BsmtHalfBath",new_levels=0:3)
grid.arrange(plot1,plot2,ncol=2)
plot1 <- barplot_single_var_from_numeric("FullBath",new_levels=0:3)
plot2 <- barplot_single_var_from_numeric("HalfBath",new_levels=0:3)
grid.arrange(plot1,plot2,ncol=2)
Make a histogram for all remaining numeric variables.
One thing we still need to deal with though, is large number of zeroes in many variables.
Get the list again of number of zeros per variable, minus variables where we already made barplots.
numeric_vars <- colnames(train)[class_per_variable == "integer"]
numeric_vars <- setdiff(numeric_vars,c("OverallQual","OverallCond","BsmtFullBath","BsmtHalfBath","FullBath","HalfBath","BedroomAbvGr","KitchenAbvGr","TotRmsAbvGrd","Fireplaces","GarageCars","MoSold","YrSold"))
num_zeroes <- apply(train[,numeric_vars],2,function(x)length(which(x == 0)))
num_zeroes[num_zeroes > 0]
## MasVnrArea BsmtFinSF1 BsmtFinSF2 BsmtUnfSF TotalBsmtSF
## 869 467 1293 118 37
## 2ndFlrSF LowQualFinSF GarageArea WoodDeckSF OpenPorchSF
## 829 1434 81 761 656
## EnclosedPorch 3SsnPorch ScreenPorch PoolArea MiscVal
## 1252 1436 1344 1453 1408
Let’s remove zeroes for all histograms.
Let’s also remove BsmtFinSF2, since we are not including corresponding categorical variable BsmtFinType2.
Let’s make histograms for the following variables:
numeric_vars <- setdiff(numeric_vars,"BsmtFinSF2")
numeric_vars
## [1] "LotFrontage" "LotArea" "YearBuilt" "YearRemodAdd"
## [5] "MasVnrArea" "BsmtFinSF1" "BsmtUnfSF" "TotalBsmtSF"
## [9] "1stFlrSF" "2ndFlrSF" "LowQualFinSF" "GrLivArea"
## [13] "GarageYrBlt" "GarageArea" "WoodDeckSF" "OpenPorchSF"
## [17] "EnclosedPorch" "3SsnPorch" "ScreenPorch" "PoolArea"
## [21] "MiscVal" "SalePrice"
par(mfrow=c(2,2))
for(var in numeric_vars[1:20])
{
num_zeroes = length(which(train[,var] == 0))
hist(train[train[,var] > 0,var],labels=TRUE,xlab="",ylab="Number of homes",main=ifelse(num_zeroes == 0,var,paste0(var," >0")))
}
par(mfrow=c(1,2))
var = numeric_vars[21]
num_zeroes = length(which(train[,var] == 0))
hist(train[train[,var] > 0,var],labels=TRUE,xlab="",ylab="Number of homes",main=ifelse(num_zeroes == 0,var,paste0(var," >0")))
var = numeric_vars[22]
num_zeroes = length(which(train[,var] == 0))
hist(train[train[,var] > 0,var],labels=TRUE,xlab="",ylab="Number of homes",main=ifelse(num_zeroes == 0,var,paste0(var," >0")))
I checked and the two very high-value items for MiscVal are both “Gar2”. Surprisingly, the tennis court is only valued at $2,000, comparable to some of the fancier sheds.
Anyway, we see that many variables have various degrees of skew. With right skew being especially common. This makes sense for measurements of area.
Let’s remove or simplify the following variables already known to be redundant with other variables in the data set, or that do not have sufficient variety.
Later on, we may also want to somehow combine BsmtFinType1, BsmtFinSF1, BsmtFinType2, and BsmtFinSF2 into one variable that gives an idea of the total finished basement area, adjusted for quality of the finished area(s). But we can figure that out later on.
Utilities - There is only one home in the training data that does not have all public utilities.
OpenPorchSF, EnclosedPorchSF, 3SsnPorch, ScreenPorch - Most homes have only one of these > 0 at most, though some have two. Create variables PorchType1, PorchSF1, PorchType2, and PorchSF2. If all of these are 0, set all equal to 0/None. If one of these is non-zero, set PorchType2 and PorchSF2 to None. If two are non-zero, then can set all variables. Set first equal to the higher of Enclosed/3-season/Screen/Open.
Caveat - Test data has a few homes with a mix of enclosed, open, and screen porch. Of these, there are three with any substantial open porch space. For these observations, change PorchType2 to open if open area > screened. Then make PorchSF2 equal to the sum of open and screened porch area.
Remove or modify from both train and test.
train <- train[,setdiff(colnames(train),c("Condition2","TotalBsmtSF","Exterior2nd","GrLivArea","Utilities","PoolArea","MiscVal","MiscFeature","MSSubClass","Heating","RoofMatl","Fireplaces"))]
test <- test[,setdiff(colnames(test),c("Condition2","TotalBsmtSF","Exterior2nd","GrLivArea","Utilities","PoolArea","MiscVal","MiscFeature","MSSubClass","Heating","RoofMatl","Fireplaces"))]
porch_binary_train <- apply(train[,grep('Porch',colnames(train))],2,function(x)ifelse(x > 0,1,0))
porch_binary_test <- apply(test[,grep('Porch',colnames(test))],2,function(x)ifelse(x > 0,1,0))
num_porch_types_train <- rowSums(porch_binary_train)
num_porch_types_test <- rowSums(porch_binary_test)
train <- data.frame(train,PorchType1 = NA,PorchType2 = NA,PorchSF1 = NA,PorchSF2 = NA,stringsAsFactors=FALSE,check.names=FALSE)
test <- data.frame(test,PorchType1 = NA,PorchType2 = NA,PorchSF1 = NA,PorchSF2 = NA,stringsAsFactors=FALSE,check.names=FALSE)
train$PorchType1[num_porch_types_train == 0] <- "None"
train$PorchType2[num_porch_types_train == 0] <- "None"
train$PorchSF1[num_porch_types_train == 0] <- 0
train$PorchSF2[num_porch_types_train == 0] <- 0
test$PorchType1[num_porch_types_test == 0] <- "None"
test$PorchType2[num_porch_types_test == 0] <- "None"
test$PorchSF1[num_porch_types_test == 0] <- 0
test$PorchSF2[num_porch_types_test == 0] <- 0
for(var in c("OpenPorchSF","ScreenPorch","3SsnPorch","EnclosedPorch"))
{
train$PorchType1[train[,var] > 0] <- var
test$PorchType1[test[,var] > 0] <- var
}
for(var in c("OpenPorchSF","ScreenPorch","3SsnPorch","EnclosedPorch"))
{
train$PorchSF1[train[,var] > 0] <- train[train[,var] > 0,var]
test$PorchSF1[test[,var] > 0] <- test[test[,var] > 0,var]
}
for(var in c("OpenPorchSF","ScreenPorch","3SsnPorch","EnclosedPorch"))
{
indices_porchtype1_equal_var <- which(train$PorchType1 == var)
indices_nonzero <- which(train[,var] > 0)
train$PorchType2[setdiff(indices_nonzero,indices_porchtype1_equal_var)] <- var
train$PorchSF2[setdiff(indices_nonzero,indices_porchtype1_equal_var)] <- train[setdiff(indices_nonzero,indices_porchtype1_equal_var),var]
indices_porchtype1_equal_var <- which(test$PorchType1 == var)
indices_nonzero <- which(test[,var] > 0)
test$PorchType2[setdiff(indices_nonzero,indices_porchtype1_equal_var)] <- var
test$PorchSF2[setdiff(indices_nonzero,indices_porchtype1_equal_var)] <- test[setdiff(indices_nonzero,indices_porchtype1_equal_var),var]
}
train$PorchType2[is.na(train$PorchType2) == TRUE] <- "None"
test$PorchType2[is.na(test$PorchType2) == TRUE] <- "None"
train$PorchSF2[is.na(train$PorchSF2) == TRUE] <- 0
test$PorchSF2[is.na(test$PorchSF2) == TRUE] <- 0
test[test$OpenPorchSF > 100 & test$PorchType1 != "OpenPorchSF" & test$PorchType2 != "OpenPorchSF","PorchSF2"] <- test[test$OpenPorchSF > 100 & test$PorchType1 != "OpenPorchSF" & test$PorchType2 != "OpenPorchSF","OpenPorchSF"] + test[test$OpenPorchSF > 100 & test$PorchType1 != "OpenPorchSF" & test$PorchType2 != "OpenPorchSF","ScreenPorch"]
test[test$OpenPorchSF > 100 & test$OpenPorchSF > test$ScreenPorch & test$PorchType1 != "OpenPorchSF" & test$PorchType2 != "OpenPorchSF","PorchType2"] <- "OpenPorchSF"
train <- train[,setdiff(colnames(train),c("OpenPorchSF","ScreenPorch","3SsnPorch","EnclosedPorch"))]
test <- test[,setdiff(colnames(test),c("OpenPorchSF","ScreenPorch","3SsnPorch","EnclosedPorch"))]
Let’s simplify certain variables by reducing levels.
Also just make sure certain variables are ordered.
train$BsmtExposure <- ifelse(train$BsmtExposure == "None" | train$BsmtExposure == "No","N","Y")
train$BsmtFinType1 <- plyr::mapvalues(train$BsmtFinType1,
from=c("LwQ","Rec","BLQ","ALQ"),
to=c("LowtoAvg","LowtoAvg","LowtoAvg","LowtoAvg"))
train$BsmtFinType2 <- plyr::mapvalues(train$BsmtFinType2,
from=c("LwQ","Rec","BLQ","ALQ"),
to=c("LowtoAvg","LowtoAvg","LowtoAvg","LowtoAvg"))
train$GarageFinish <- factor(train$GarageFinish,levels=c("None","Unf","RFn","Fin"))
train$BsmtFinType1 <- factor(train$BsmtFinType1,levels=c("None","Unf","LowtoAvg","GLQ"))
train$BsmtFinType2 <- factor(train$BsmtFinType2,levels=c("None","Unf","LowtoAvg","GLQ"))
train$Exterior1st <- plyr::mapvalues(train$Exterior1st,
from=c("BrkComm","BrkFace","Stone","AsphShn","CBlock","ImStucc"),
to=c("Brk","Brk",rep("Other",times=4)))
condition_vars <- c("ExterCond","BsmtCond","GarageCond")
for(var in condition_vars)
{
train[,var] <- plyr::mapvalues(train[,var],
from=c("Po","Fa","TA","Gd","Ex"),
to=c("Po/Fa","Po/Fa","TA/Gd/Ex","TA/Gd/Ex","TA/Gd/Ex"))
train[,var] <- factor(train[,var],levels=c("None","Po/Fa","TA/Gd/Ex"))
}
quality_vars <- c("ExterQual","BsmtQual","KitchenQual","GarageQual","HeatingQC")
for(var in quality_vars)
{
train[,var] <- plyr::mapvalues(train[,var],
from=c("Po","Fa"),
to=c("Po/Fa","Po/Fa"))
if(var == "GarageQual"){train[train[,var] == "Ex",var] <- "Gd"}
train[,var] <- factor(train[,var],levels=c("None","Po/Fa","TA","Gd","Ex"))
}
train$Condition1 <- plyr::mapvalues(train$Condition1,
from=c("Artery","Feedr","PosA","PosN",grep('RR',unique(train$Condition1),value=TRUE)),
to=c("Artery/Feedr","Artery/Feedr","PosAorN","PosAorN",rep("RRAorN",times=4)))
train$Functional <- plyr::mapvalues(train$Functional,
from=c("Min1","Min2","Maj1","Maj2","Sev"),
to=c("Min","Min","Maj/Sev","Maj/Sev","Maj/Sev"))
train$Functional <- factor(train$Functional,levels=c("Typ","Min","Mod","Maj/Sev"))
train$LotConfig <- plyr::mapvalues(train$LotConfig,
from=c("FR2","FR3"),
to=c("FR2or3","FR2or3"))
train$Electrical <- ifelse(train$Electrical == "SBrkr","SBrkr","Other")
train$GarageType[train$GarageType == "CarPort" | train$GarageType == "2Types"] <- "Detchd"
train$GarageType[train$GarageType == "Basment"] <- "Attchd"
train$SaleType <- plyr::mapvalues(train$SaleType,
from=c("WD","CWD","Oth","Con"),
to=c("WD/Oth","WD/Oth","WD/Oth","WD/Oth"))
train$SaleType <- plyr::mapvalues(train$SaleType,
from=c("ConLD","ConLI"),
to=c("ConLw","ConLw"))
train <- data.frame(train,MultiKitchen = ifelse(train$KitchenAbvGr > 1,1,0),stringsAsFactors=FALSE,check.names=FALSE)
train <- train[,setdiff(colnames(train),"KitchenAbvGr")]
train$MoSold <- plyr::mapvalues(train$MoSold,
from=1:12,
to=c("Winter","Winter","Spring","Spring","Summer","Summer","Summer",rep("Fall",times=4),"Winter"))
train$MoSold <- factor(train$MoSold,levels=c("Winter","Fall","Spring","Summer"))
train$BsmtFullBath <- ifelse(train$BsmtHalfBath > 0,train$BsmtFullBath + 0.5,train$BsmtFullBath)
train$FullBath <- ifelse(train$HalfBath > 0,train$FullBath + 0.5,train$FullBath)
train <- train[,setdiff(colnames(train),c("BsmtHalfBath","HalfBath"))]
test$BsmtExposure <- ifelse(test$BsmtExposure == "None" | test$BsmtExposure == "No","N","Y")
test$BsmtFinType1 <- plyr::mapvalues(test$BsmtFinType1,
from=c("LwQ","Rec","BLQ","ALQ"),
to=c("LowtoAvg","LowtoAvg","LowtoAvg","LowtoAvg"))
test$BsmtFinType2 <- plyr::mapvalues(test$BsmtFinType2,
from=c("LwQ","Rec","BLQ","ALQ"),
to=c("LowtoAvg","LowtoAvg","LowtoAvg","LowtoAvg"))
test$GarageFinish <- factor(test$GarageFinish,levels=c("None","Unf","RFn","Fin"))
test$BsmtFinType1 <- factor(test$BsmtFinType1,levels=c("None","Unf","LowtoAvg","GLQ"))
test$BsmtFinType2 <- factor(test$BsmtFinType2,levels=c("None","Unf","LowtoAvg","GLQ"))
test$Exterior1st <- plyr::mapvalues(test$Exterior1st,
from=c("BrkComm","BrkFace","Stone","AsphShn","CBlock","ImStucc"),
to=c("Brk","Brk",rep("Other",times=4)))
condition_vars <- c("ExterCond","BsmtCond","GarageCond")
for(var in condition_vars)
{
test[,var] <- plyr::mapvalues(test[,var],
from=c("Po","Fa","TA","Gd","Ex"),
to=c("Po/Fa","Po/Fa","TA/Gd/Ex","TA/Gd/Ex","TA/Gd/Ex"))
test[,var] <- factor(test[,var],levels=c("None","Po/Fa","TA/Gd/Ex"))
}
quality_vars <- c("ExterQual","BsmtQual","KitchenQual","GarageQual","HeatingQC")
for(var in quality_vars)
{
test[,var] <- plyr::mapvalues(test[,var],
from=c("Po","Fa"),
to=c("Po/Fa","Po/Fa"))
if(var == "GarageQual"){test[test[,var] == "Ex",var] <- "Gd"}
test[,var] <- factor(test[,var],levels=c("None","Po/Fa","TA","Gd","Ex"))
}
var="Condition1"
test[,var] <- plyr::mapvalues(test[,var],
from=c("Artery","Feedr","PosA","PosN",grep('RR',unique(test$Condition1),value=TRUE)),
to=c("Artery/Feedr","Artery/Feedr","PosAorN","PosAorN",rep("RRAorN",times=4)))
var="Functional"
test[,var] <- plyr::mapvalues(test[,var],
from=c("Min1","Min2","Maj1","Maj2","Sev"),
to=c("Min","Min","Maj/Sev","Maj/Sev","Maj/Sev"))
test[,var] <- factor(test[,var],levels=c("Typ","Min","Mod","Maj/Sev"))
var="LotConfig"
test[,var] <- plyr::mapvalues(test[,var],
from=c("FR2","FR3"),
to=c("FR2or3","FR2or3"))
test$Electrical <- ifelse(test$Electrical == "SBrkr","SBrkr","Other")
var = "GarageType"
test[,var] <- plyr::mapvalues(test[,var],
from=c("CarPort","2Types"),
to=c("Detchd","Detchd"))
test$GarageType[test$GarageType == "Basment"] <- "Attchd"
var="SaleType"
test[,var] <- plyr::mapvalues(test[,var],
from=c("WD","CWD","Oth","Con"),
to=c("WD/Oth","WD/Oth","WD/Oth","WD/Oth"))
test[,var] <- plyr::mapvalues(test[,var],
from=c("ConLD","ConLI"),
to=c("ConLw","ConLw"))
var="MoSold"
test[,var] <- plyr::mapvalues(test[,var],
from=1:12,
to=c("Winter","Winter","Spring","Spring","Summer","Summer","Summer",rep("Fall",times=4),"Winter"))
test[,var] <- factor(test[,var],levels=c("Winter","Fall","Spring","Summer"))
test <- data.frame(test,MultiKitchen = ifelse(test$KitchenAbvGr > 1,1,0),stringsAsFactors=FALSE,check.names=FALSE)
test$BsmtFullBath <- ifelse(test$BsmtHalfBath > 0,test$BsmtFullBath + 0.5,test$BsmtFullBath)
test$FullBath <- ifelse(test$HalfBath > 0,test$FullBath + 0.5,test$FullBath)
test <- test[,setdiff(colnames(test),c("KitchenAbvGr","BsmtHalfBath","HalfBath"))]
First, let’s review which variables we may want to create dummy vars for.
class_per_var <- sapply(train,class)
print("Class per variable")
## [1] "Class per variable"
table(class_per_var)
## class_per_var
## character factor integer numeric
## 27 13 20 6
print("Levels per var, character:")
## [1] "Levels per var, character:"
apply(train[,class_per_var == "character"],2,function(x)length(unique(x)))
## MSZoning Street Alley LotShape LandContour
## 5 2 3 4 4
## LotConfig LandSlope Neighborhood Condition1 BldgType
## 4 3 25 4 5
## HouseStyle RoofStyle Exterior1st MasVnrType Foundation
## 8 6 11 4 6
## BsmtExposure CentralAir Electrical FireplaceQu GarageType
## 2 2 2 2 4
## PavedDrive PoolQC Fence SaleType SaleCondition
## 3 2 3 4 6
## PorchType1 PorchType2
## 5 4
print("Numeric:")
## [1] "Numeric:"
apply(train[,class_per_var == "numeric"],2,function(x)length(unique(x)))
## MasVnrArea BsmtFullBath FullBath PorchSF1 PorchSF2
## 327 6 8 254 95
## MultiKitchen
## 2
print("Integer:")
## [1] "Integer:"
apply(train[,class_per_var == "integer"],2,function(x)length(unique(x)))
## LotFrontage LotArea OverallQual OverallCond YearBuilt
## 111 1073 10 9 112
## YearRemodAdd BsmtFinSF1 BsmtFinSF2 BsmtUnfSF 1stFlrSF
## 61 637 144 780 753
## 2ndFlrSF LowQualFinSF BedroomAbvGr TotRmsAbvGrd GarageYrBlt
## 417 24 8 12 97
## GarageCars GarageArea WoodDeckSF YrSold SalePrice
## 5 441 274 5 663
print("Factor:")
## [1] "Factor:"
apply(train[,class_per_var == "factor"],2,function(x)length(unique(as.vector(x))))
## ExterQual ExterCond BsmtQual BsmtCond BsmtFinType1
## 4 2 5 3 4
## BsmtFinType2 HeatingQC KitchenQual Functional GarageFinish
## 4 4 4 4 4
## GarageQual GarageCond MoSold
## 4 3 4
Looks good! Think we are ready to create dummy vars for character variables.
class_per_var <- sapply(train,class)
for(var in colnames(train)[class_per_var == "character"])
{
train[,var] <- factor(train[,var])
var_as_factor = factor(paste0(var,make.names(train[,var])),levels=paste0(var,make.names(levels(train[,var]))))
model_matrix = model.matrix(~var_as_factor)
ncol_model_matrix = ncol(model_matrix)
if(ncol_model_matrix > 2)
{
model_matrix = model_matrix[,2:ncol(model_matrix)]
colnames(model_matrix) <- unlist(lapply(strsplit(colnames(model_matrix),"var_as_factor"),"[[",2))
train <- data.frame(train,model_matrix,check.names=FALSE,stringsAsFactors=FALSE)
}
if(ncol_model_matrix == 2)
{
train <- data.frame(train,Newcol = as.numeric(as.vector(model_matrix[,2])),check.names=FALSE,stringsAsFactors=FALSE)
colnames(train)[colnames(train) == "Newcol"] <- strsplit(colnames(model_matrix)[2],"var_as_factor")[[1]][2]
}
train <- train[,setdiff(colnames(train),var)]
}
class_per_var <- sapply(test,class)
for(var in colnames(test)[class_per_var == "character"])
{
test[,var] <- factor(test[,var])
var_as_factor = factor(paste0(var,make.names(test[,var])),levels=paste0(var,make.names(levels(test[,var]))))
model_matrix = model.matrix(~var_as_factor)
ncol_model_matrix = ncol(model_matrix)
if(ncol_model_matrix > 2)
{
model_matrix = model_matrix[,2:ncol(model_matrix)]
colnames(model_matrix) <- unlist(lapply(strsplit(colnames(model_matrix),"var_as_factor"),"[[",2))
test <- data.frame(test,model_matrix,check.names=FALSE,stringsAsFactors=FALSE)
}
if(ncol_model_matrix == 2)
{
test <- data.frame(test,Newcol = as.numeric(as.vector(model_matrix[,2])),check.names=FALSE,stringsAsFactors=FALSE)
colnames(test)[colnames(test) == "Newcol"] <- strsplit(colnames(model_matrix)[2],"var_as_factor")[[1]][2]
}
test <- test[,setdiff(colnames(test),var)]
}
Now we need to decide how to deal with factor variables. Should we convert to numeric? Or are we actually interested in the individual factor levels?
There is an interesting discussion of this issue here:
For now, let’s make dummy variables (treated as nominal), but also leave the original variables in case we decide to use numeric later on.
class_per_var <- sapply(train,class)
for(var in colnames(train)[class_per_var == "factor"])
{
var_as_factor = factor(paste0(var,make.names(train[,var])),levels=paste0(var,make.names(levels(train[,var]))))
model_matrix = model.matrix(~var_as_factor)
ncol_model_matrix = ncol(model_matrix)
if(ncol_model_matrix > 2)
{
model_matrix = model_matrix[,2:ncol(model_matrix)]
colnames(model_matrix) <- unlist(lapply(strsplit(colnames(model_matrix),"var_as_factor"),"[[",2))
train <- data.frame(train,model_matrix,check.names=FALSE,stringsAsFactors=FALSE)
}
if(ncol_model_matrix == 2)
{
train <- data.frame(train,Newcol = as.numeric(as.vector(model_matrix[,2])),check.names=FALSE,stringsAsFactors=FALSE)
colnames(train)[colnames(train) == "Newcol"] <- strsplit(colnames(model_matrix)[2],"var_as_factor")[[1]][2]
}
}
class_per_var <- sapply(test,class)
for(var in colnames(test)[class_per_var == "factor"])
{
var_as_factor = factor(paste0(var,make.names(test[,var])),levels=paste0(var,make.names(levels(test[,var]))))
model_matrix = model.matrix(~var_as_factor)
ncol_model_matrix = ncol(model_matrix)
if(ncol_model_matrix > 2)
{
model_matrix = model_matrix[,2:ncol(model_matrix)]
colnames(model_matrix) <- unlist(lapply(strsplit(colnames(model_matrix),"var_as_factor"),"[[",2))
test <- data.frame(test,model_matrix,check.names=FALSE,stringsAsFactors=FALSE)
}
if(ncol_model_matrix == 2)
{
test <- data.frame(test,Newcol = as.numeric(as.vector(model_matrix[,2])),check.names=FALSE,stringsAsFactors=FALSE)
colnames(test)[colnames(test) == "Newcol"] <- strsplit(colnames(model_matrix)[2],"var_as_factor")[[1]][2]
}
}
Also just before we move on, noticed an error where one test GarageYrBlt is 2207 instead of 2007. Let’s fix that now.
test[test$GarageYrBlt == 2207 & is.na(test$GarageYrBlt) == FALSE,"GarageYrBlt"] <- 2007
Set aside the columns with factor variables.
train_factor_vars <- train[,as.vector(sapply(train,class)) == "factor"]
train <- train[,as.vector(sapply(train,class)) != "factor"]
test_factor_vars <- test[,as.vector(sapply(test,class)) == "factor"]
test <- test[,as.vector(sapply(test,class)) != "factor"]
Now, let’s start with a simple correlation matrix.
For now, skip LotFrontage since that has NAs. Also skip SalePrice to focus on independent variables.
Oh and also skip GarageQualEx, as there is no GarageQual with this level after transform.
For now, we are looking for correlation clusters. Let’s start with the correlations >= 0.6, and require that correlation with at least 3 other variables.
correlation_matrix <- cor(train[,setdiff(colnames(train),c("LotFrontage","SalePrice","GarageQualEx"))])
correlation_matrix_NA_diagonal <- correlation_matrix
for(i in 1:ncol(correlation_matrix_NA_diagonal))
{
correlation_matrix_NA_diagonal[i,i] <- NA
}
#corrplot(correlation_matrix[apply(abs(correlation_matrix_NA_diagonal),2,function(x)max(x,na.rm=TRUE)) >= 0.4,],tl.cex=0.3,order="hclust")
variables_to_plot <- which(apply(abs(correlation_matrix_NA_diagonal),2,function(x)quantile(x,probs=((ncol(correlation_matrix) - 3)/ncol(correlation_matrix)),type=2,na.rm=TRUE)) >= 0.6)
corrplot(correlation_matrix[variables_to_plot,variables_to_plot],order="hclust",type="upper",diag=FALSE)
We see some correlations that make sense, including:
Let’s also look at variables with at least one correlation >= 0.5 to see if we can spot any interesting one-off high correlations.
variables_to_plot <- which(apply(abs(correlation_matrix_NA_diagonal),2,function(x)max(x,na.rm=TRUE)) >= 0.5)
print("Variables to plot:")
## [1] "Variables to plot:"
names(variables_to_plot)
## [1] "LotArea" "OverallQual"
## [3] "YearBuilt" "YearRemodAdd"
## [5] "MasVnrArea" "BsmtFinSF1"
## [7] "BsmtFinSF2" "BsmtUnfSF"
## [9] "2ndFlrSF" "LowQualFinSF"
## [11] "BsmtFullBath" "FullBath"
## [13] "BedroomAbvGr" "TotRmsAbvGrd"
## [15] "GarageYrBlt" "GarageCars"
## [17] "GarageArea" "PorchSF1"
## [19] "PorchSF2" "MultiKitchen"
## [21] "MSZoningFV" "MSZoningRL"
## [23] "MSZoningRM" "AlleyNone"
## [25] "AlleyPave" "LandContourHLS"
## [27] "LandContourLvl" "LandSlopeSev"
## [29] "NeighborhoodMeadowV" "NeighborhoodOldTown"
## [31] "NeighborhoodSomerst" "BldgTypeDuplex"
## [33] "HouseStyleX1Story" "HouseStyleX2.5Fin"
## [35] "HouseStyleX2Story" "RoofStyleGable"
## [37] "RoofStyleHip" "Exterior1stCemntBd"
## [39] "Exterior1stVinylSd" "MasVnrTypeBrkFace"
## [41] "MasVnrTypeNone" "FoundationCBlock"
## [43] "FoundationPConc" "FoundationSlab"
## [45] "GarageTypeDetchd" "GarageTypeNone"
## [47] "FenceMinimal" "FenceNone"
## [49] "SaleTypeNew" "SaleTypeWD.Oth"
## [51] "SaleConditionNormal" "SaleConditionPartial"
## [53] "PorchType1None" "PorchType1OpenPorchSF"
## [55] "PorchType2None" "PorchType2OpenPorchSF"
## [57] "ExterQualTA" "ExterQualGd"
## [59] "ExterQualEx" "ExterCondPo.Fa"
## [61] "ExterCondTA.Gd.Ex" "BsmtQualTA"
## [63] "BsmtQualGd" "BsmtQualEx"
## [65] "BsmtCondPo.Fa" "BsmtCondTA.Gd.Ex"
## [67] "BsmtFinType1Unf" "BsmtFinType1LowtoAvg"
## [69] "BsmtFinType1GLQ" "BsmtFinType2Unf"
## [71] "BsmtFinType2LowtoAvg" "HeatingQCTA"
## [73] "HeatingQCEx" "KitchenQualTA"
## [75] "KitchenQualGd" "KitchenQualEx"
## [77] "GarageFinishUnf" "GarageFinishRFn"
## [79] "GarageQualPo.Fa" "GarageQualTA"
## [81] "GarageCondPo.Fa" "GarageCondTA.Gd.Ex"
## [83] "MoSoldFall" "MoSoldSummer"
corrplot(correlation_matrix[variables_to_plot,variables_to_plot],order="hclust",type="upper",diag=FALSE,tl.cex=0.5)
print("Correlation between LotArea and LandSlopeSev")
## [1] "Correlation between LotArea and LandSlopeSev"
cor(train$LotArea,train$LandSlopeSev)
## [1] 0.5403798
Now we start to see some other interesting less expected patterns. For example, homes in Meadow Village are much more likely to have a cement board exterior. Homes on land with a severe slope are more likely to be on larger lots.
There are also some correlations that just come about because of the nature of dummy variables. For example, RoofStyle is nearly always either Hip or Gable, so RoofStyleHip and RoofStyleGable dummy variables are nearly always mutually exclusive. Same idea for LandContour, etc.
Let’s also get some measurements of collinearity, variance inflation factors, by taking the inverse of the correlation matrix.
Then, display variables with VIF > 5 or 10.
variance_inflation_factors <- c()
precision_matrix <- solve(correlation_matrix,tol=1e-20)
for(i in 1:ncol(correlation_matrix))
{
variance_inflation_factors <- c(variance_inflation_factors,precision_matrix[i,i])
}
print("Variables with VIF > 5, < 10:")
## [1] "Variables with VIF > 5, < 10:"
colnames(correlation_matrix)[variance_inflation_factors > 5 & variance_inflation_factors < 10]
## [1] "TotRmsAbvGrd" "GarageYrBlt"
## [3] "GarageCars" "GarageArea"
## [5] "NeighborhoodBrkSide" "NeighborhoodCrawfor"
## [7] "NeighborhoodGilbert" "NeighborhoodIDOTRR"
## [9] "NeighborhoodMitchel" "NeighborhoodNoRidge"
## [11] "NeighborhoodNridgHt" "NeighborhoodNWAmes"
## [13] "NeighborhoodSawyer" "NeighborhoodSawyerW"
## [15] "HouseStyleX1Story" "HouseStyleX2Story"
## [17] "Exterior1stCemntBd" "Exterior1stPlywood"
## [19] "FoundationCBlock" "FoundationPConc"
## [21] "SaleTypeWD.Oth" "PorchType1ScreenPorch"
## [23] "PorchType2ScreenPorch"
print("Variables with VIF >= 10:")
## [1] "Variables with VIF >= 10:"
colnames(correlation_matrix)[variance_inflation_factors >= 10]
## [1] "YearBuilt" "BsmtFinSF1"
## [3] "BsmtUnfSF" "1stFlrSF"
## [5] "2ndFlrSF" "MSZoningFV"
## [7] "MSZoningRL" "MSZoningRM"
## [9] "NeighborhoodCollgCr" "NeighborhoodEdwards"
## [11] "NeighborhoodNAmes" "NeighborhoodOldTown"
## [13] "NeighborhoodSomerst" "RoofStyleGable"
## [15] "RoofStyleHip" "Exterior1stHdBoard"
## [17] "Exterior1stMetalSd" "Exterior1stVinylSd"
## [19] "Exterior1stWd.Sdng" "MasVnrTypeBrkFace"
## [21] "MasVnrTypeNone" "MasVnrTypeStone"
## [23] "GarageTypeNone" "SaleTypeNew"
## [25] "SaleConditionPartial" "PorchType1EnclosedPorch"
## [27] "PorchType1None" "PorchType1OpenPorchSF"
## [29] "PorchType2None" "PorchType2OpenPorchSF"
## [31] "ExterCondPo.Fa" "ExterCondTA.Gd.Ex"
## [33] "BsmtCondPo.Fa" "BsmtCondTA.Gd.Ex"
## [35] "BsmtFinType1Unf" "BsmtFinType1LowtoAvg"
## [37] "BsmtFinType1GLQ" "BsmtFinType2Unf"
## [39] "BsmtFinType2LowtoAvg" "BsmtFinType2GLQ"
## [41] "KitchenQualPo.Fa" "KitchenQualTA"
## [43] "KitchenQualGd" "KitchenQualEx"
## [45] "GarageFinishUnf" "GarageFinishRFn"
## [47] "GarageFinishFin" "GarageQualPo.Fa"
## [49] "GarageQualTA" "GarageQualGd"
## [51] "GarageCondPo.Fa" "GarageCondTA.Gd.Ex"
We see a lot of the same variables we saw from the correlation matrix also have high variance inflation factors.
Many variables have all levels with high VIFs. So, we’ll have to choose one or two levels to remove.
Other variables it’s a bit more complicated. For example, TotRmsAbvGrd and 2ndFlrSF are correlated, but a model of 2ndFlrSF as a function of TotRmsAbvGrd still has a lot of variance unexplained. For cases like these, we will have to leave both in.
For the variables with VIF >= 10 that are not neighborhoods, print the top 5 variables they are correlated with and the absolute value of the correlations.
for(var in grep('Neighborhood',colnames(correlation_matrix)[variance_inflation_factors >= 10],value=TRUE,invert=TRUE))
{
print(var)
correlations = abs(as.numeric(as.vector(correlation_matrix[var,])))
print(colnames(correlation_matrix)[order(correlations,decreasing=TRUE)[1:5]])
print(correlations[order(correlations,decreasing=TRUE)[1:5]])
}
## [1] "YearBuilt"
## [1] "YearBuilt" "GarageYrBlt" "FoundationPConc" "BsmtQualTA"
## [5] "YearRemodAdd"
## [1] 1.0000000 0.7771818 0.6511993 0.6317345 0.5928550
## [1] "BsmtFinSF1"
## [1] "BsmtFinSF1" "BsmtFullBath" "BsmtFinType1Unf" "BsmtFinType1GLQ"
## [5] "BsmtUnfSF"
## [1] 1.0000000 0.6702309 0.6286900 0.4974357 0.4952515
## [1] "BsmtUnfSF"
## [1] "BsmtUnfSF" "BsmtFinType1Unf" "BsmtFinSF1"
## [4] "BsmtFullBath" "BsmtFinType1LowtoAvg"
## [1] 1.0000000 0.6027066 0.4952515 0.4488708 0.3478936
## [1] "1stFlrSF"
## [1] "1stFlrSF" "GarageArea" "OverallQual" "BsmtFinSF1" "GarageCars"
## [1] 1.0000000 0.4897817 0.4762238 0.4458627 0.4393168
## [1] "2ndFlrSF"
## [1] "2ndFlrSF" "HouseStyleX2Story" "HouseStyleX1Story"
## [4] "TotRmsAbvGrd" "FullBath"
## [1] 1.0000000 0.8091498 0.7887493 0.6164226 0.6013969
## [1] "MSZoningFV"
## [1] "MSZoningFV" "NeighborhoodSomerst" "AlleyPave"
## [4] "MSZoningRL" "AlleyNone"
## [1] 1.0000000 0.8628071 0.4457413 0.4166084 0.2740310
## [1] "MSZoningRL"
## [1] "MSZoningRL" "MSZoningRM" "NeighborhoodOldTown"
## [4] "MSZoningFV" "AlleyNone"
## [1] 1.0000000 0.8085854 0.4523278 0.4166084 0.3588752
## [1] "MSZoningRM"
## [1] "MSZoningRM" "MSZoningRL" "NeighborhoodOldTown"
## [4] "YearBuilt" "GarageTypeDetchd"
## [1] 1.0000000 0.8085854 0.5618807 0.3886825 0.3182694
## [1] "RoofStyleGable"
## [1] "RoofStyleGable" "RoofStyleHip" "1stFlrSF" "ExterQualEx"
## [5] "MasVnrArea"
## [1] 1.0000000 0.9334616 0.3141314 0.2382424 0.2341087
## [1] "RoofStyleHip"
## [1] "RoofStyleHip" "RoofStyleGable" "1stFlrSF" "MasVnrArea"
## [5] "ExterQualEx"
## [1] 1.0000000 0.9334616 0.3239945 0.2631265 0.2589951
## [1] "Exterior1stHdBoard"
## [1] "Exterior1stHdBoard" "Exterior1stVinylSd" "FoundationCBlock"
## [4] "HeatingQCEx" "HeatingQCTA"
## [1] 1.0000000 0.3126107 0.2716872 0.2238607 0.2133769
## [1] "Exterior1stMetalSd"
## [1] "Exterior1stMetalSd" "Exterior1stVinylSd" "GarageTypeDetchd"
## [4] "YearBuilt" "GarageYrBlt"
## [1] 1.0000000 0.3109483 0.2504211 0.2096380 0.1926338
## [1] "Exterior1stVinylSd"
## [1] "Exterior1stVinylSd" "FoundationPConc" "GarageYrBlt"
## [4] "YearBuilt" "YearRemodAdd"
## [1] 1.0000000 0.5620494 0.5232504 0.5187342 0.4845302
## [1] "Exterior1stWd.Sdng"
## [1] "Exterior1stWd.Sdng" "YearBuilt" "GarageYrBlt"
## [4] "Exterior1stVinylSd" "BsmtQualTA"
## [1] 1.0000000 0.4086638 0.3157973 0.2992076 0.2392662
## [1] "MasVnrTypeBrkFace"
## [1] "MasVnrTypeBrkFace" "MasVnrTypeNone" "MasVnrArea"
## [4] "YearBuilt" "GarageTypeDetchd"
## [1] 1.0000000 0.8063370 0.5713282 0.2752798 0.2282697
## [1] "MasVnrTypeNone"
## [1] "MasVnrTypeNone" "MasVnrTypeBrkFace" "MasVnrArea"
## [4] "YearBuilt" "OverallQual"
## [1] 1.0000000 0.8063370 0.6877369 0.4171468 0.3895120
## [1] "MasVnrTypeStone"
## [1] "MasVnrTypeStone" "MasVnrTypeNone" "NeighborhoodNridgHt"
## [4] "BsmtQualEx" "OverallQual"
## [1] 1.0000000 0.3775048 0.3711002 0.3460337 0.3315835
## [1] "GarageTypeNone"
## [1] "GarageTypeNone" "GarageCondTA.Gd.Ex" "GarageQualTA"
## [4] "GarageCars" "GarageArea"
## [1] 1.0000000 0.7990492 0.7189002 0.5732873 0.5363330
## [1] "SaleTypeNew"
## [1] "SaleTypeNew" "SaleConditionPartial" "SaleTypeWD.Oth"
## [4] "SaleConditionNormal" "BsmtQualEx"
## [1] 1.0000000 0.9868190 0.7951852 0.6456983 0.3850421
## [1] "SaleConditionPartial"
## [1] "SaleConditionPartial" "SaleTypeNew" "SaleTypeWD.Oth"
## [4] "SaleConditionNormal" "BsmtQualEx"
## [1] 1.0000000 0.9868190 0.7910545 0.6543229 0.3786131
## [1] "PorchType1EnclosedPorch"
## [1] "PorchType1EnclosedPorch" "YearBuilt"
## [3] "PorchType1OpenPorchSF" "PorchSF1"
## [5] "GarageYrBlt"
## [1] 1.0000000 0.4550822 0.3707292 0.3636168 0.3294080
## [1] "PorchType1None"
## [1] "PorchType1None" "PorchType1OpenPorchSF" "PorchSF1"
## [4] "OverallQual" "FoundationCBlock"
## [1] 1.0000000 0.6149308 0.6102121 0.3432603 0.3041175
## [1] "PorchType1OpenPorchSF"
## [1] "PorchType1OpenPorchSF" "PorchType1None" "YearBuilt"
## [4] "FoundationPConc" "GarageYrBlt"
## [1] 1.0000000 0.6149308 0.4614907 0.4544960 0.4322527
## [1] "PorchType2None"
## [1] "PorchType2None" "PorchType2OpenPorchSF" "PorchSF2"
## [4] "PorchType1ScreenPorch" "PorchSF1"
## [1] 1.0000000 0.9737896 0.7284309 0.4418627 0.4058290
## [1] "PorchType2OpenPorchSF"
## [1] "PorchType2OpenPorchSF" "PorchType2None" "PorchSF2"
## [4] "PorchType1ScreenPorch" "PorchSF1"
## [1] 1.0000000 0.9737896 0.6968974 0.4559484 0.4106862
## [1] "ExterCondPo.Fa"
## [1] "ExterCondPo.Fa" "ExterCondTA.Gd.Ex" "ExterQualPo.Fa"
## [4] "KitchenQualPo.Fa" "GarageCondTA.Gd.Ex"
## [1] 1.0000000 1.0000000 0.2882222 0.2808583 0.2395932
## [1] "ExterCondTA.Gd.Ex"
## [1] "ExterCondPo.Fa" "ExterCondTA.Gd.Ex" "ExterQualPo.Fa"
## [4] "KitchenQualPo.Fa" "GarageCondTA.Gd.Ex"
## [1] 1.0000000 1.0000000 0.2882222 0.2808583 0.2395932
## [1] "BsmtCondPo.Fa"
## [1] "BsmtCondPo.Fa" "BsmtCondTA.Gd.Ex" "YearBuilt"
## [4] "CentralAirY" "BsmtQualPo.Fa"
## [1] 1.0000000 0.7381548 0.2557072 0.2350753 0.2251002
## [1] "BsmtCondTA.Gd.Ex"
## [1] "BsmtCondTA.Gd.Ex" "BsmtCondPo.Fa" "FoundationSlab"
## [4] "CentralAirY" "OverallQual"
## [1] 1.0000000 0.7381548 0.5232364 0.3164336 0.2539065
## [1] "BsmtFinType1Unf"
## [1] "BsmtFinType1Unf" "BsmtFinSF1" "BsmtUnfSF"
## [4] "BsmtFinType1LowtoAvg" "BsmtFullBath"
## [1] 1.0000000 0.6286900 0.6027066 0.5208083 0.5140292
## [1] "BsmtFinType1LowtoAvg"
## [1] "BsmtFinType1LowtoAvg" "FoundationCBlock" "BsmtFinType1Unf"
## [4] "BsmtFinType1GLQ" "FoundationPConc"
## [1] 1.0000000 0.5297482 0.5208083 0.5105245 0.4933104
## [1] "BsmtFinType1GLQ"
## [1] "BsmtFinType1GLQ" "BsmtFinType1LowtoAvg" "BsmtFinSF1"
## [4] "YearBuilt" "FoundationPConc"
## [1] 1.0000000 0.5105245 0.4974357 0.4794247 0.4537623
## [1] "BsmtFinType2Unf"
## [1] "BsmtFinType2Unf" "BsmtFinType2LowtoAvg" "BsmtFinSF2"
## [4] "FoundationSlab" "BsmtUnfSF"
## [1] 1.0000000 0.8489608 0.7162350 0.3207806 0.3120757
## [1] "BsmtFinType2LowtoAvg"
## [1] "BsmtFinType2LowtoAvg" "BsmtFinType2Unf" "BsmtFinSF2"
## [4] "BsmtFinType1LowtoAvg" "FoundationCBlock"
## [1] 1.0000000 0.8489608 0.7159191 0.3054714 0.2642030
## [1] "BsmtFinType2GLQ"
## [1] "BsmtFinType2GLQ" "BsmtFinSF2" "BsmtFinType2Unf"
## [4] "NeighborhoodBlueste" "BsmtFinType1LowtoAvg"
## [1] 1.0000000 0.3730816 0.2441516 0.1863808 0.1220724
## [1] "KitchenQualPo.Fa"
## [1] "KitchenQualPo.Fa" "ExterQualPo.Fa" "CentralAirY"
## [4] "ExterCondPo.Fa" "ExterCondTA.Gd.Ex"
## [1] 1.0000000 0.3324097 0.3007444 0.2808583 0.2808583
## [1] "KitchenQualTA"
## [1] "KitchenQualTA" "KitchenQualGd" "ExterQualTA" "ExterQualGd"
## [5] "YearRemodAdd"
## [1] 1.0000000 0.8244565 0.6715999 0.6175980 0.5769641
## [1] "KitchenQualGd"
## [1] "KitchenQualGd" "KitchenQualTA" "ExterQualGd" "ExterQualTA"
## [5] "YearRemodAdd"
## [1] 1.0000000 0.8244565 0.6283628 0.5576053 0.5336285
## [1] "KitchenQualEx"
## [1] "KitchenQualEx" "ExterQualEx" "BsmtQualEx"
## [4] "OverallQual" "NeighborhoodNridgHt"
## [1] 1.0000000 0.5623999 0.5184569 0.4257504 0.4091590
## [1] "GarageFinishUnf"
## [1] "GarageFinishUnf" "GarageTypeDetchd" "GarageFinishRFn"
## [4] "GarageYrBlt" "YearBuilt"
## [1] 1.0000000 0.6207264 0.5363548 0.5337779 0.5232329
## [1] "GarageFinishRFn"
## [1] "GarageFinishRFn" "GarageFinishUnf" "GarageFinishFin"
## [4] "GarageTypeDetchd" "YearBuilt"
## [1] 1.0000000 0.5363548 0.3593842 0.3219986 0.3142206
## [1] "GarageFinishFin"
## [1] "GarageFinishFin" "GarageFinishUnf" "OverallQual" "YearBuilt"
## [5] "GarageFinishRFn"
## [1] 1.0000000 0.4741285 0.3949964 0.3932369 0.3593842
## [1] "GarageQualPo.Fa"
## [1] "GarageQualPo.Fa" "GarageCondPo.Fa" "GarageQualTA"
## [4] "GarageYrBlt" "NeighborhoodOldTown"
## [1] 1.0000000 0.5698352 0.5643361 0.3631160 0.3078590
## [1] "GarageQualTA"
## [1] "GarageQualTA" "GarageCondTA.Gd.Ex" "GarageTypeNone"
## [4] "GarageQualPo.Fa" "GarageCars"
## [1] 1.0000000 0.7775057 0.7189002 0.5643361 0.4976131
## [1] "GarageQualGd"
## [1] "GarageQualGd" "GarageQualTA" "OverallCond"
## [4] "PoolQCY" "HouseStyleX2.5Fin"
## [1] 1.00000000 0.32195821 0.11030853 0.08489641 0.07843358
## [1] "GarageCondPo.Fa"
## [1] "GarageCondPo.Fa" "GarageQualPo.Fa" "GarageCondTA.Gd.Ex"
## [4] "GarageQualTA" "GarageYrBlt"
## [1] 1.0000000 0.5698352 0.5674135 0.3074564 0.3026416
## [1] "GarageCondTA.Gd.Ex"
## [1] "GarageCondTA.Gd.Ex" "GarageTypeNone" "GarageQualTA"
## [4] "GarageCondPo.Fa" "GarageCars"
## [1] 1.0000000 0.7990492 0.7775057 0.5674135 0.5227112
Let’s also make a table for some of the factor variables or combinations of factor variables.
apply(train[,grep('HouseStyle',colnames(train),value=TRUE)],2,table)
## HouseStyleX1.5Unf HouseStyleX1Story HouseStyleX2.5Fin HouseStyleX2.5Unf
## 0 1446 734 1452 1449
## 1 14 726 8 11
## HouseStyleX2Story HouseStyleSFoyer HouseStyleSLvl
## 0 1015 1423 1395
## 1 445 37 65
table(train[,c("Exterior1stVinylSd","FoundationPConc")])
## FoundationPConc
## Exterior1stVinylSd 0 1
## 0 721 224
## 1 92 423
table(train[,c("MasVnrTypeBrkFace","MasVnrTypeNone")])
## MasVnrTypeNone
## MasVnrTypeBrkFace 0 1
## 0 143 872
## 1 445 0
Some of these are unavoidable. For example, BsmtFinSF1 is highly correlated with BsmtFinType1Unf, which makes sense becaues it is 0 whenever the basement is unfinished.
But we should be able to remove a lot of collinearity by simply removing some variables.
Let’s remove the following variables:
train <- train[,setdiff(colnames(train),c("GarageYrBlt","HouseStyleX1Story","HouseStyleX2Story","RoofStyleGable","SaleConditionPartial","SaleConditionNormal","ExterCondTA.Gd.Ex","ExterQualTA","BsmtQualTA","KitchenQualTA","GarageQualTA","HeatingQCTA"))]
test <- test[,setdiff(colnames(test),c("GarageYrBlt","HouseStyleX1Story","HouseStyleX2Story","RoofStyleGable","SaleConditionPartial","SaleConditionNormal","ExterCondTA.Gd.Ex","ExterQualTA","BsmtQualTA","KitchenQualTA","GarageQualTA","HeatingQCTA"))]
Now we just need to run one last data transformation to resolve the NAs in LotFrontage.
Let’s use the step function to create a linear model of LotFrontage.
Found some other models online that take the square root of LotArea. Try with and without this transformation.
train_non_NA <- train[is.na(train$LotFrontage) == FALSE,]
model_lot_frontage <- step(lm(LotFrontage ~ .,data=train_non_NA),steps = 100,trace=FALSE)
summary(model_lot_frontage)
##
## Call:
## lm(formula = LotFrontage ~ LotArea + `1stFlrSF` + `2ndFlrSF` +
## LowQualFinSF + BedroomAbvGr + GarageCars + GarageArea + WoodDeckSF +
## SalePrice + PorchSF1 + MultiKitchen + MSZoningRM + AlleyNone +
## LotShapeIR2 + LotShapeIR3 + LotShapeReg + LandContourHLS +
## LotConfigCulDSac + LotConfigFR2or3 + LotConfigInside + LandSlopeMod +
## LandSlopeSev + NeighborhoodBrkSide + NeighborhoodGilbert +
## NeighborhoodNAmes + NeighborhoodNoRidge + NeighborhoodNPkVill +
## NeighborhoodNridgHt + NeighborhoodSawyer + NeighborhoodSawyerW +
## NeighborhoodSomerst + NeighborhoodStoneBr + NeighborhoodSWISU +
## NeighborhoodTimber + Condition1PosAorN + Condition1RRAorN +
## BldgTypeTwnhs + BldgTypeTwnhsE + HouseStyleSFoyer + Exterior1stBrk +
## Exterior1stCemntBd + Exterior1stMetalSd + Exterior1stVinylSd +
## Exterior1stWdShing + FoundationCBlock + FoundationWood +
## BsmtExposureY + GarageTypeDetchd + PavedDriveP + PoolQCY +
## SaleConditionAlloca + PorchType1ScreenPorch + ExterQualPo.Fa +
## BsmtQualEx + BsmtCondPo.Fa + HeatingQCEx + KitchenQualPo.Fa +
## FunctionalMod + FunctionalMaj.Sev + GarageFinishUnf, data = train_non_NA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.813 -6.945 -0.707 5.642 172.558
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.993e+01 3.809e+00 13.108 < 2e-16 ***
## LotArea 5.290e-04 7.692e-05 6.878 9.99e-12 ***
## `1stFlrSF` 1.583e-02 2.156e-03 7.342 4.01e-13 ***
## `2ndFlrSF` 5.142e-03 1.731e-03 2.971 0.003035 **
## LowQualFinSF 1.549e-02 9.192e-03 1.685 0.092174 .
## BedroomAbvGr 1.109e+00 7.492e-01 1.481 0.138933
## GarageCars 1.850e+00 1.367e+00 1.354 0.175969
## GarageArea 9.763e-03 4.738e-03 2.061 0.039545 *
## WoodDeckSF -1.357e-02 4.031e-03 -3.366 0.000788 ***
## SalePrice -3.690e-05 1.277e-05 -2.889 0.003939 **
## PorchSF1 9.689e-03 6.079e-03 1.594 0.111247
## MultiKitchen -5.360e+00 2.267e+00 -2.364 0.018261 *
## MSZoningRM -3.967e+00 1.519e+00 -2.612 0.009125 **
## AlleyNone 5.062e+00 1.936e+00 2.615 0.009051 **
## LotShapeIR2 4.335e+00 3.224e+00 1.344 0.179087
## LotShapeIR3 3.007e+01 6.477e+00 4.643 3.83e-06 ***
## LotShapeReg -2.275e+00 1.130e+00 -2.013 0.044338 *
## LandContourHLS 6.606e+00 2.532e+00 2.609 0.009196 **
## LotConfigCulDSac -3.324e+01 2.673e+00 -12.435 < 2e-16 ***
## LotConfigFR2or3 -1.485e+01 2.732e+00 -5.436 6.64e-08 ***
## LotConfigInside -1.149e+01 1.192e+00 -9.639 < 2e-16 ***
## LandSlopeMod 4.426e+00 2.423e+00 1.827 0.067982 .
## LandSlopeSev -2.218e+01 7.591e+00 -2.922 0.003552 **
## NeighborhoodBrkSide -3.462e+00 2.397e+00 -1.444 0.148892
## NeighborhoodGilbert 4.639e+00 2.541e+00 1.825 0.068197 .
## NeighborhoodNAmes 1.874e+00 1.491e+00 1.257 0.209150
## NeighborhoodNoRidge 1.021e+01 3.100e+00 3.294 0.001019 **
## NeighborhoodNPkVill -8.011e+00 6.022e+00 -1.330 0.183647
## NeighborhoodNridgHt 7.217e+00 2.429e+00 2.971 0.003027 **
## NeighborhoodSawyer 2.833e+00 2.375e+00 1.193 0.233232
## NeighborhoodSawyerW 3.033e+00 2.345e+00 1.294 0.196051
## NeighborhoodSomerst 3.731e+00 2.189e+00 1.704 0.088607 .
## NeighborhoodStoneBr -5.622e+00 3.829e+00 -1.468 0.142276
## NeighborhoodSWISU -9.133e+00 3.547e+00 -2.575 0.010148 *
## NeighborhoodTimber -7.541e+00 3.065e+00 -2.460 0.014028 *
## Condition1PosAorN 1.913e+01 4.335e+00 4.412 1.12e-05 ***
## Condition1RRAorN -6.742e+00 2.584e+00 -2.609 0.009203 **
## BldgTypeTwnhs -3.278e+01 2.796e+00 -11.723 < 2e-16 ***
## BldgTypeTwnhsE -2.478e+01 1.962e+00 -12.633 < 2e-16 ***
## HouseStyleSFoyer 3.513e+00 3.040e+00 1.156 0.248076
## Exterior1stBrk -5.123e+00 2.514e+00 -2.038 0.041779 *
## Exterior1stCemntBd -5.363e+00 2.468e+00 -2.173 0.030006 *
## Exterior1stMetalSd -2.598e+00 1.366e+00 -1.902 0.057443 .
## Exterior1stVinylSd -2.787e+00 1.306e+00 -2.135 0.032981 *
## Exterior1stWdShing -5.901e+00 3.130e+00 -1.886 0.059601 .
## FoundationCBlock 5.567e+00 1.255e+00 4.436 1.00e-05 ***
## FoundationWood 4.132e+01 1.072e+01 3.856 0.000122 ***
## BsmtExposureY 1.330e+00 1.089e+00 1.221 0.222369
## GarageTypeDetchd -5.436e+00 1.386e+00 -3.923 9.26e-05 ***
## PavedDriveP 5.037e+00 3.128e+00 1.610 0.107668
## PoolQCY 3.292e+01 6.521e+00 5.048 5.20e-07 ***
## SaleConditionAlloca -8.785e+00 5.094e+00 -1.725 0.084865 .
## PorchType1ScreenPorch -2.829e+00 1.874e+00 -1.510 0.131386
## ExterQualPo.Fa 4.554e+00 4.597e+00 0.991 0.322010
## BsmtQualEx 6.064e+00 2.009e+00 3.018 0.002598 **
## BsmtCondPo.Fa -4.796e+00 2.439e+00 -1.966 0.049508 *
## HeatingQCEx 2.152e+00 1.145e+00 1.879 0.060475 .
## KitchenQualPo.Fa 9.431e+00 2.834e+00 3.328 0.000904 ***
## FunctionalMod -4.875e+00 4.180e+00 -1.166 0.243752
## FunctionalMaj.Sev 4.415e+00 3.793e+00 1.164 0.244650
## GarageFinishUnf -1.739e+00 1.271e+00 -1.369 0.171377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.84 on 1140 degrees of freedom
## Multiple R-squared: 0.6452, Adjusted R-squared: 0.6265
## F-statistic: 34.55 on 60 and 1140 DF, p-value: < 2.2e-16
train_non_NA$LotArea <- sqrt(train_non_NA$LotArea)
model_lot_frontage_sqrt_lot_area <- step(lm(LotFrontage ~ .,data=train_non_NA),steps = 100,trace=FALSE)
summary(model_lot_frontage)
##
## Call:
## lm(formula = LotFrontage ~ LotArea + `1stFlrSF` + `2ndFlrSF` +
## LowQualFinSF + BedroomAbvGr + GarageCars + GarageArea + WoodDeckSF +
## YrSold + SalePrice + PorchSF1 + MultiKitchen + MSZoningFV +
## MSZoningRL + AlleyNone + LotShapeIR3 + LotShapeReg + LandContourHLS +
## LandContourLow + LotConfigCulDSac + LotConfigFR2or3 + LotConfigInside +
## LandSlopeMod + LandSlopeSev + NeighborhoodBrkSide + NeighborhoodCollgCr +
## NeighborhoodEdwards + NeighborhoodMitchel + NeighborhoodNoRidge +
## NeighborhoodNridgHt + NeighborhoodStoneBr + NeighborhoodSWISU +
## NeighborhoodTimber + Condition1PosAorN + Condition1RRAorN +
## BldgTypeTwnhs + BldgTypeTwnhsE + HouseStyleSFoyer + Exterior1stBrk +
## Exterior1stCemntBd + Exterior1stMetalSd + Exterior1stPlywood +
## Exterior1stVinylSd + Exterior1stWdShing + FoundationCBlock +
## FoundationWood + BsmtExposureY + GarageTypeDetchd + PavedDriveP +
## PoolQCY + SaleConditionAlloca + PorchType1ScreenPorch + ExterQualPo.Fa +
## BsmtQualEx + BsmtCondPo.Fa + HeatingQCEx + KitchenQualPo.Fa +
## FunctionalMod + FunctionalMaj.Sev + GarageFinishUnf, data = train_non_NA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.991 -6.500 -0.839 5.239 168.575
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.499e+02 6.526e+02 -1.149 0.250773
## LotArea 2.986e-01 2.776e-02 10.756 < 2e-16 ***
## `1stFlrSF` 1.304e-02 2.138e-03 6.100 1.45e-09 ***
## `2ndFlrSF` 3.331e-03 1.689e-03 1.972 0.048859 *
## LowQualFinSF 1.233e-02 8.993e-03 1.371 0.170545
## BedroomAbvGr 1.396e+00 7.314e-01 1.909 0.056483 .
## GarageCars 2.081e+00 1.321e+00 1.575 0.115546
## GarageArea 8.023e-03 4.582e-03 1.751 0.080230 .
## WoodDeckSF -1.528e-02 3.922e-03 -3.896 0.000104 ***
## YrSold 3.876e-01 3.250e-01 1.193 0.233282
## SalePrice -4.223e-05 1.255e-05 -3.364 0.000793 ***
## PorchSF1 6.418e-03 5.935e-03 1.081 0.279748
## MultiKitchen -4.377e+00 2.241e+00 -1.953 0.051014 .
## MSZoningFV 6.495e+00 2.506e+00 2.592 0.009670 **
## MSZoningRL 3.601e+00 1.444e+00 2.493 0.012790 *
## AlleyNone 5.518e+00 1.906e+00 2.896 0.003854 **
## LotShapeIR3 2.872e+01 6.042e+00 4.753 2.26e-06 ***
## LotShapeReg -2.137e+00 1.085e+00 -1.971 0.049013 *
## LandContourHLS 5.964e+00 2.486e+00 2.399 0.016587 *
## LandContourLow -5.924e+00 4.236e+00 -1.398 0.162291
## LotConfigCulDSac -3.386e+01 2.584e+00 -13.103 < 2e-16 ***
## LotConfigFR2or3 -1.540e+01 2.653e+00 -5.804 8.37e-09 ***
## LotConfigInside -1.122e+01 1.164e+00 -9.643 < 2e-16 ***
## LandSlopeMod 4.471e+00 2.449e+00 1.826 0.068151 .
## LandSlopeSev -2.060e+01 7.366e+00 -2.797 0.005244 **
## NeighborhoodBrkSide -3.416e+00 2.337e+00 -1.462 0.144077
## NeighborhoodCollgCr -3.039e+00 1.693e+00 -1.795 0.072929 .
## NeighborhoodEdwards -3.306e+00 1.751e+00 -1.888 0.059308 .
## NeighborhoodMitchel -4.013e+00 2.647e+00 -1.516 0.129814
## NeighborhoodNoRidge 8.209e+00 3.042e+00 2.699 0.007065 **
## NeighborhoodNridgHt 3.611e+00 2.396e+00 1.507 0.131995
## NeighborhoodStoneBr -7.548e+00 3.729e+00 -2.024 0.043220 *
## NeighborhoodSWISU -8.575e+00 3.483e+00 -2.462 0.013974 *
## NeighborhoodTimber -1.165e+01 2.967e+00 -3.925 9.20e-05 ***
## Condition1PosAorN 1.853e+01 4.200e+00 4.413 1.11e-05 ***
## Condition1RRAorN -7.923e+00 2.527e+00 -3.135 0.001762 **
## BldgTypeTwnhs -2.463e+01 2.850e+00 -8.641 < 2e-16 ***
## BldgTypeTwnhsE -1.942e+01 2.012e+00 -9.649 < 2e-16 ***
## HouseStyleSFoyer 4.026e+00 2.971e+00 1.355 0.175729
## Exterior1stBrk -5.433e+00 2.448e+00 -2.219 0.026671 *
## Exterior1stCemntBd -4.804e+00 2.408e+00 -1.995 0.046263 *
## Exterior1stMetalSd -2.170e+00 1.346e+00 -1.612 0.107135
## Exterior1stPlywood -2.480e+00 2.049e+00 -1.210 0.226555
## Exterior1stVinylSd -1.764e+00 1.316e+00 -1.341 0.180237
## Exterior1stWdShing -6.072e+00 3.063e+00 -1.983 0.047648 *
## FoundationCBlock 5.109e+00 1.176e+00 4.343 1.53e-05 ***
## FoundationWood 3.988e+01 1.052e+01 3.792 0.000157 ***
## BsmtExposureY 1.538e+00 1.074e+00 1.432 0.152373
## GarageTypeDetchd -4.911e+00 1.355e+00 -3.625 0.000302 ***
## PavedDriveP 4.285e+00 3.078e+00 1.392 0.164117
## PoolQCY 3.324e+01 6.381e+00 5.210 2.24e-07 ***
## SaleConditionAlloca -6.636e+00 4.959e+00 -1.338 0.181127
## PorchType1ScreenPorch -3.623e+00 1.825e+00 -1.985 0.047349 *
## ExterQualPo.Fa 5.824e+00 4.534e+00 1.285 0.199198
## BsmtQualEx 6.571e+00 1.993e+00 3.297 0.001006 **
## BsmtCondPo.Fa -4.738e+00 2.382e+00 -1.989 0.046902 *
## HeatingQCEx 2.394e+00 1.122e+00 2.134 0.033039 *
## KitchenQualPo.Fa 8.797e+00 2.771e+00 3.175 0.001537 **
## FunctionalMod -5.894e+00 4.086e+00 -1.443 0.149415
## FunctionalMaj.Sev 4.142e+00 3.706e+00 1.118 0.263976
## GarageFinishUnf -2.609e+00 1.232e+00 -2.119 0.034345 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.47 on 1140 degrees of freedom
## Multiple R-squared: 0.6629, Adjusted R-squared: 0.6452
## F-statistic: 37.36 on 60 and 1140 DF, p-value: < 2.2e-16
Let’s use the model with taking the square root of lot area. And choose only variables that were significant after 100 steps.
select_significant <- function(linear_model){
names_coefficients = rownames(summary(linear_model)$coef)[2:nrow(summary(linear_model)$coef)]
pvalues_coefficients = summary(linear_model)$coef[2:nrow(summary(linear_model)$coef),"Pr(>|t|)"]
significant = names_coefficients[pvalues_coefficients < .05]
return(gsub(significant,pattern='`',replace=''))
}
train_non_NA <- train[is.na(train$LotFrontage) == FALSE,]
train_non_NA$LotArea <- sqrt(train_non_NA$LotArea)
significant_predictors_lot_frontage <- setdiff(select_significant(model_lot_frontage_sqrt_lot_area),"SalePrice")
model_lot_frontage_sqrt_lot_area <- lm(LotFrontage ~ .,data=train_non_NA[,c("LotFrontage",significant_predictors_lot_frontage)])
summary(model_lot_frontage_sqrt_lot_area)
##
## Call:
## lm(formula = LotFrontage ~ ., data = train_non_NA[, c("LotFrontage",
## significant_predictors_lot_frontage)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.583 -6.777 -0.749 5.592 172.287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.269406 3.651351 8.564 < 2e-16 ***
## LotArea 0.306821 0.026370 11.635 < 2e-16 ***
## `1stFlrSF` 0.013739 0.001581 8.691 < 2e-16 ***
## `2ndFlrSF` 0.003740 0.001186 3.155 0.001646 **
## WoodDeckSF -0.014693 0.003908 -3.760 0.000178 ***
## MSZoningFV 6.080020 2.365726 2.570 0.010292 *
## MSZoningRL 3.204782 1.367193 2.344 0.019242 *
## AlleyNone 4.463529 1.891310 2.360 0.018438 *
## LotShapeIR3 28.126015 6.060126 4.641 3.86e-06 ***
## LotShapeReg -2.247413 1.089804 -2.062 0.039408 *
## LandContourHLS 7.405485 2.374155 3.119 0.001858 **
## LotConfigCulDSac -36.354535 2.581695 -14.082 < 2e-16 ***
## LotConfigFR2or3 -15.525597 2.686266 -5.780 9.60e-09 ***
## LotConfigInside -11.809492 1.171145 -10.084 < 2e-16 ***
## LandSlopeSev -26.825764 7.114970 -3.770 0.000171 ***
## NeighborhoodNoRidge 7.436349 2.870556 2.591 0.009702 **
## NeighborhoodStoneBr -10.899832 3.577470 -3.047 0.002365 **
## NeighborhoodSWISU -5.534724 3.269265 -1.693 0.090731 .
## NeighborhoodTimber -10.405938 2.893765 -3.596 0.000337 ***
## Condition1PosAorN 19.570407 4.238988 4.617 4.33e-06 ***
## Condition1RRAorN -6.485429 2.492157 -2.602 0.009377 **
## BldgTypeTwnhs -23.988517 2.738832 -8.759 < 2e-16 ***
## BldgTypeTwnhsE -18.913701 1.898951 -9.960 < 2e-16 ***
## Exterior1stBrk -5.700947 2.373745 -2.402 0.016476 *
## Exterior1stCemntBd -3.336319 2.263271 -1.474 0.140721
## Exterior1stWdShing -5.843798 3.028114 -1.930 0.053868 .
## FoundationCBlock 5.407356 1.069219 5.057 4.94e-07 ***
## FoundationWood 44.193899 10.541422 4.192 2.97e-05 ***
## GarageTypeDetchd -3.817935 1.305291 -2.925 0.003512 **
## PoolQCY 32.263011 6.332091 5.095 4.06e-07 ***
## PorchType1ScreenPorch -2.780510 1.686227 -1.649 0.099427 .
## BsmtQualEx 6.404911 1.718104 3.728 0.000202 ***
## BsmtCondPo.Fa -4.812681 2.374843 -2.027 0.042938 *
## HeatingQCEx 1.756294 1.064998 1.649 0.099396 .
## KitchenQualPo.Fa 6.952762 2.634203 2.639 0.008416 **
## GarageFinishUnf -2.692916 1.219872 -2.208 0.027471 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.78 on 1165 degrees of freedom
## Multiple R-squared: 0.6402, Adjusted R-squared: 0.6294
## F-statistic: 59.23 on 35 and 1165 DF, p-value: < 2.2e-16
Use this to predict LotFrontage in train and test.
train_NA_LotFrontage <- train[is.na(train$LotFrontage) == TRUE,]
test_NA_LotFrontage <- test[is.na(test$LotFrontage) == TRUE,]
train_NA_LotFrontage$LotArea <- sqrt(train_NA_LotFrontage$LotArea)
test_NA_LotFrontage$LotArea <- sqrt(test_NA_LotFrontage$LotArea)
train[is.na(train$LotFrontage) == TRUE,"LotFrontage"] <- as.numeric(as.vector(predict(model_lot_frontage_sqrt_lot_area,train_NA_LotFrontage)))
test[is.na(test$LotFrontage) == TRUE,"LotFrontage"] <- as.numeric(as.vector(predict(model_lot_frontage_sqrt_lot_area,test_NA_LotFrontage)))
Use the step function again to create a linear model of SalePrice.
initial_model_saleprice <- step(lm(SalePrice ~ .,data=train),steps=100,trace=FALSE)
summary(initial_model_saleprice)
##
## Call:
## lm(formula = SalePrice ~ LotFrontage + LotArea + OverallQual +
## OverallCond + YearBuilt + MasVnrArea + `1stFlrSF` + `2ndFlrSF` +
## BsmtFullBath + FullBath + BedroomAbvGr + TotRmsAbvGrd + GarageCars +
## WoodDeckSF + PorchSF2 + MultiKitchen + MSZoningFV + MSZoningRH +
## MSZoningRL + MSZoningRM + StreetPave + LotShapeIR3 + LandContourHLS +
## LandContourLow + LandContourLvl + LotConfigCulDSac + LotConfigFR2or3 +
## LandSlopeMod + LandSlopeSev + NeighborhoodCrawfor + NeighborhoodEdwards +
## NeighborhoodMitchel + NeighborhoodNAmes + NeighborhoodNoRidge +
## NeighborhoodNridgHt + NeighborhoodNWAmes + NeighborhoodOldTown +
## NeighborhoodSawyer + NeighborhoodSomerst + NeighborhoodStoneBr +
## NeighborhoodVeenker + Condition1Norm + BldgTypeTwnhs + BldgTypeTwnhsE +
## RoofStyleShed + Exterior1stBrk + Exterior1stOther + Exterior1stStucco +
## MasVnrTypeBrkFace + MasVnrTypeNone + MasVnrTypeStone + FoundationCBlock +
## FoundationPConc + FoundationStone + BsmtExposureY + GarageTypeNone +
## SaleTypeConLw + SaleTypeNew + PorchType1EnclosedPorch + PorchType1None +
## PorchType1OpenPorchSF + PorchType2None + PorchType2OpenPorchSF +
## PorchType2ScreenPorch + ExterQualPo.Fa + ExterQualEx + BsmtQualEx +
## BsmtCondTA.Gd.Ex + BsmtFinType1Unf + HeatingQCEx + KitchenQualEx +
## FunctionalMin + FunctionalMod + FunctionalMaj.Sev + GarageFinishRFn +
## GarageQualPo.Fa + GarageQualGd + MoSoldSpring + MoSoldSummer,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -328162 -11736 -368 11016 240074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.237e+05 1.209e+05 -4.332 1.59e-05 ***
## LotFrontage -1.376e+02 4.818e+01 -2.855 0.004368 **
## LotArea 6.599e-01 1.102e-01 5.990 2.68e-09 ***
## OverallQual 9.424e+03 1.045e+03 9.017 < 2e-16 ***
## OverallCond 4.654e+03 8.205e+02 5.672 1.72e-08 ***
## YearBuilt 1.621e+02 6.016e+01 2.695 0.007129 **
## MasVnrArea 1.410e+01 6.434e+00 2.191 0.028589 *
## `1stFlrSF` 5.595e+01 4.011e+00 13.947 < 2e-16 ***
## `2ndFlrSF` 3.873e+01 3.872e+00 10.002 < 2e-16 ***
## BsmtFullBath 8.232e+03 1.850e+03 4.451 9.24e-06 ***
## FullBath 8.223e+03 2.366e+03 3.475 0.000527 ***
## BedroomAbvGr -4.004e+03 1.427e+03 -2.807 0.005075 **
## TotRmsAbvGrd 2.331e+03 1.011e+03 2.305 0.021304 *
## GarageCars 1.170e+04 1.728e+03 6.771 1.88e-11 ***
## WoodDeckSF 1.873e+01 6.559e+00 2.856 0.004354 **
## PorchSF2 5.306e+01 2.834e+01 1.872 0.061395 .
## MultiKitchen -2.629e+04 4.315e+03 -6.092 1.44e-09 ***
## MSZoningFV 4.009e+04 1.255e+04 3.195 0.001428 **
## MSZoningRH 3.436e+04 1.203e+04 2.857 0.004337 **
## MSZoningRL 3.774e+04 9.986e+03 3.779 0.000164 ***
## MSZoningRM 3.410e+04 9.996e+03 3.411 0.000666 ***
## StreetPave 3.720e+04 1.298e+04 2.866 0.004216 **
## LotShapeIR3 -2.481e+04 9.532e+03 -2.603 0.009352 **
## LandContourHLS 2.674e+04 5.640e+03 4.741 2.35e-06 ***
## LandContourLow 1.868e+04 6.686e+03 2.794 0.005271 **
## LandContourLvl 1.931e+04 3.973e+03 4.860 1.31e-06 ***
## LotConfigCulDSac 5.527e+03 3.335e+03 1.657 0.097659 .
## LotConfigFR2or3 -1.045e+04 4.121e+03 -2.535 0.011355 *
## LandSlopeMod 1.051e+04 4.403e+03 2.386 0.017165 *
## LandSlopeSev -2.007e+04 1.053e+04 -1.906 0.056793 .
## NeighborhoodCrawfor 1.843e+04 4.708e+03 3.914 9.53e-05 ***
## NeighborhoodEdwards -1.814e+04 3.394e+03 -5.345 1.06e-07 ***
## NeighborhoodMitchel -1.595e+04 4.396e+03 -3.629 0.000295 ***
## NeighborhoodNAmes -7.131e+03 2.965e+03 -2.405 0.016306 *
## NeighborhoodNoRidge 5.287e+04 5.247e+03 10.076 < 2e-16 ***
## NeighborhoodNridgHt 3.362e+04 4.439e+03 7.574 6.58e-14 ***
## NeighborhoodNWAmes -9.209e+03 4.026e+03 -2.287 0.022338 *
## NeighborhoodOldTown -7.689e+03 3.961e+03 -1.941 0.052485 .
## NeighborhoodSawyer -5.467e+03 3.939e+03 -1.388 0.165317
## NeighborhoodSomerst 1.425e+04 6.836e+03 2.085 0.037290 *
## NeighborhoodStoneBr 4.720e+04 6.348e+03 7.435 1.83e-13 ***
## NeighborhoodVeenker 1.979e+04 8.838e+03 2.239 0.025342 *
## Condition1Norm 1.053e+04 2.293e+03 4.594 4.74e-06 ***
## BldgTypeTwnhs -3.319e+04 5.353e+03 -6.200 7.46e-10 ***
## BldgTypeTwnhsE -2.499e+04 3.590e+03 -6.960 5.23e-12 ***
## RoofStyleShed 3.128e+04 2.059e+04 1.519 0.128958
## Exterior1stBrk 1.686e+04 4.270e+03 3.950 8.23e-05 ***
## Exterior1stOther -2.072e+04 1.299e+04 -1.595 0.110884
## Exterior1stStucco -1.202e+04 5.910e+03 -2.034 0.042136 *
## MasVnrTypeBrkFace 1.150e+04 7.554e+03 1.523 0.128083
## MasVnrTypeNone 1.461e+04 7.620e+03 1.917 0.055411 .
## MasVnrTypeStone 1.423e+04 8.033e+03 1.771 0.076763 .
## FoundationCBlock 4.602e+03 3.101e+03 1.484 0.137957
## FoundationPConc 5.136e+03 3.532e+03 1.454 0.146164
## FoundationStone 1.737e+04 1.188e+04 1.462 0.143863
## BsmtExposureY 7.670e+03 1.824e+03 4.205 2.78e-05 ***
## GarageTypeNone 1.231e+04 4.391e+03 2.804 0.005114 **
## SaleTypeConLw 1.198e+04 6.678e+03 1.794 0.073006 .
## SaleTypeNew 9.085e+03 3.228e+03 2.815 0.004951 **
## PorchType1EnclosedPorch -9.900e+03 3.449e+03 -2.870 0.004166 **
## PorchType1None -9.286e+03 3.427e+03 -2.710 0.006821 **
## PorchType1OpenPorchSF -9.320e+03 3.436e+03 -2.712 0.006761 **
## PorchType2None 5.528e+04 2.924e+04 1.891 0.058883 .
## PorchType2OpenPorchSF 5.177e+04 2.892e+04 1.790 0.073684 .
## PorchType2ScreenPorch 5.263e+04 3.103e+04 1.696 0.090068 .
## ExterQualPo.Fa 2.601e+04 8.849e+03 2.939 0.003351 **
## ExterQualEx 1.139e+04 5.346e+03 2.130 0.033348 *
## BsmtQualEx 2.841e+04 3.735e+03 7.606 5.20e-14 ***
## BsmtCondTA.Gd.Ex 5.926e+03 3.670e+03 1.615 0.106587
## BsmtFinType1Unf -7.471e+03 2.043e+03 -3.656 0.000266 ***
## HeatingQCEx 4.830e+03 1.882e+03 2.567 0.010364 *
## KitchenQualEx 2.624e+04 3.912e+03 6.708 2.86e-11 ***
## FunctionalMin -1.353e+04 3.771e+03 -3.587 0.000346 ***
## FunctionalMod -1.484e+04 7.564e+03 -1.962 0.049973 *
## FunctionalMaj.Sev -1.677e+04 6.667e+03 -2.516 0.011986 *
## GarageFinishRFn -3.388e+03 1.817e+03 -1.865 0.062415 .
## GarageQualPo.Fa -6.514e+03 4.465e+03 -1.459 0.144816
## GarageQualGd 1.752e+04 6.965e+03 2.516 0.011988 *
## MoSoldSpring 3.313e+03 2.207e+03 1.501 0.133530
## MoSoldSummer 5.353e+03 1.678e+03 3.191 0.001451 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27610 on 1380 degrees of freedom
## Multiple R-squared: 0.8857, Adjusted R-squared: 0.8792
## F-statistic: 135.4 on 79 and 1380 DF, p-value: < 2.2e-16
significant_predictors_saleprice = select_significant(initial_model_saleprice)
model_saleprice <- lm(SalePrice ~ .,data=train[,c("SalePrice",significant_predictors_saleprice)])
summary(model_saleprice)
##
## Call:
## lm(formula = SalePrice ~ ., data = train[, c("SalePrice", significant_predictors_saleprice)])
##
## Residuals:
## Min 1Q Median 3Q Max
## -326456 -12049 -290 11427 240226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.081e+05 1.009e+05 -6.024 2.17e-09 ***
## LotFrontage -1.535e+02 4.483e+01 -3.424 0.000635 ***
## LotArea 5.956e-01 9.507e-02 6.265 4.95e-10 ***
## OverallQual 9.481e+03 1.025e+03 9.249 < 2e-16 ***
## OverallCond 4.977e+03 7.991e+02 6.229 6.20e-10 ***
## YearBuilt 2.493e+02 5.171e+01 4.822 1.58e-06 ***
## MasVnrArea 8.710e+00 5.129e+00 1.698 0.089680 .
## `1stFlrSF` 5.599e+01 3.965e+00 14.121 < 2e-16 ***
## `2ndFlrSF` 4.028e+01 3.815e+00 10.560 < 2e-16 ***
## BsmtFullBath 8.560e+03 1.828e+03 4.682 3.11e-06 ***
## FullBath 6.802e+03 2.328e+03 2.922 0.003534 **
## BedroomAbvGr -3.648e+03 1.410e+03 -2.587 0.009785 **
## TotRmsAbvGrd 2.678e+03 1.006e+03 2.661 0.007883 **
## GarageCars 1.184e+04 1.712e+03 6.917 7.00e-12 ***
## WoodDeckSF 1.851e+01 6.570e+00 2.818 0.004900 **
## MultiKitchen -2.876e+04 4.201e+03 -6.847 1.13e-11 ***
## MSZoningFV 2.973e+04 1.218e+04 2.441 0.014762 *
## MSZoningRH 2.591e+04 1.186e+04 2.184 0.029094 *
## MSZoningRL 2.878e+04 9.784e+03 2.942 0.003319 **
## MSZoningRM 2.376e+04 9.705e+03 2.448 0.014487 *
## StreetPave 3.452e+04 1.297e+04 2.661 0.007872 **
## LotShapeIR3 -2.253e+04 9.508e+03 -2.370 0.017920 *
## LandContourHLS 2.491e+04 5.597e+03 4.451 9.22e-06 ***
## LandContourLow 1.677e+04 6.497e+03 2.582 0.009938 **
## LandContourLvl 1.931e+04 3.939e+03 4.903 1.05e-06 ***
## LotConfigFR2or3 -1.022e+04 4.110e+03 -2.486 0.013021 *
## LandSlopeMod 1.269e+04 4.289e+03 2.959 0.003134 **
## NeighborhoodCrawfor 2.089e+04 4.594e+03 4.547 5.91e-06 ***
## NeighborhoodEdwards -1.619e+04 3.224e+03 -5.022 5.77e-07 ***
## NeighborhoodMitchel -1.347e+04 4.324e+03 -3.115 0.001877 **
## NeighborhoodNAmes -4.930e+03 2.596e+03 -1.899 0.057724 .
## NeighborhoodNoRidge 5.373e+04 5.210e+03 10.313 < 2e-16 ***
## NeighborhoodNridgHt 3.266e+04 4.325e+03 7.552 7.72e-14 ***
## NeighborhoodNWAmes -7.740e+03 3.796e+03 -2.039 0.041648 *
## NeighborhoodSomerst 1.568e+04 6.622e+03 2.368 0.018010 *
## NeighborhoodStoneBr 4.666e+04 6.249e+03 7.467 1.44e-13 ***
## NeighborhoodVeenker 2.210e+04 8.795e+03 2.513 0.012088 *
## Condition1Norm 1.032e+04 2.281e+03 4.522 6.64e-06 ***
## BldgTypeTwnhs -3.182e+04 5.183e+03 -6.138 1.08e-09 ***
## BldgTypeTwnhsE -2.457e+04 3.510e+03 -7.001 3.93e-12 ***
## Exterior1stBrk 1.761e+04 4.247e+03 4.147 3.57e-05 ***
## Exterior1stStucco -1.263e+04 5.883e+03 -2.147 0.031946 *
## BsmtExposureY 8.236e+03 1.814e+03 4.539 6.13e-06 ***
## GarageTypeNone 1.538e+04 4.309e+03 3.569 0.000371 ***
## SaleTypeNew 8.788e+03 3.214e+03 2.734 0.006338 **
## PorchType1EnclosedPorch -1.007e+04 3.344e+03 -3.012 0.002643 **
## PorchType1None -9.275e+03 2.910e+03 -3.187 0.001468 **
## PorchType1OpenPorchSF -9.753e+03 2.816e+03 -3.464 0.000549 ***
## ExterQualPo.Fa 2.288e+04 8.653e+03 2.645 0.008271 **
## ExterQualEx 1.413e+04 5.320e+03 2.656 0.007991 **
## BsmtQualEx 2.972e+04 3.725e+03 7.977 3.08e-15 ***
## BsmtFinType1Unf -7.295e+03 2.011e+03 -3.628 0.000296 ***
## HeatingQCEx 4.787e+03 1.839e+03 2.603 0.009338 **
## KitchenQualEx 2.544e+04 3.901e+03 6.521 9.74e-11 ***
## FunctionalMin -1.463e+04 3.742e+03 -3.910 9.68e-05 ***
## FunctionalMod -1.372e+04 7.491e+03 -1.832 0.067161 .
## FunctionalMaj.Sev -1.766e+04 6.592e+03 -2.680 0.007456 **
## GarageQualGd 1.915e+04 6.965e+03 2.750 0.006040 **
## MoSoldSummer 4.210e+03 1.504e+03 2.800 0.005186 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27830 on 1401 degrees of freedom
## Multiple R-squared: 0.8821, Adjusted R-squared: 0.8773
## F-statistic: 180.8 on 58 and 1401 DF, p-value: < 2.2e-16
Looks great! Let’s just run some quick model diagnostics (residual plots).
plot(resid(model_saleprice),ylab="Residuals",xlab="Index",main="Model sale price")
plot(density(resid(model_saleprice)),xlab="Residuals",ylab="Density",main="Model sale price")
Seems OK enough. Let’s run prediction on test and output in CSV format.
predictions <- as.numeric(as.vector(predict(model_saleprice,test)))
write.table(data.frame(Id = rownames(test),SalePrice = predictions),
file="HGeiger_kaggle_houseprices_submission.csv",
row.names=FALSE,col.names=TRUE,quote=FALSE,sep=",")
Got a Kaggle score of 0.62307, which does not seem great (lower = better).
Kaggle username = hmgeiger.
Perhaps I went a bit overboard trying to simplify variables.
This submission was for a class, so I did not have time to try additional testing and possible tweaks to the model by also dividing the training data into train/test. May do that in the future on my own.