Ames Housing Authority is a public housing agency that serves the city of Ames, Iowa, US. It helps provide decent and safe rental housing for eligible low-income families, the elderly, and persons with disabilities. The housing authority has collected 79 assessment parameters that describe every aspect of residential homes in Ames. These variables focus on the quality and quantity of the physical attributes of a property; most are exactly the type of information a typical home buyer would want to know about a potential property.
Problem Statement: Predict Home Sale Price for the Test Data Set with the lowest possible error.
# Load the package that contains the full dataset and necessary libraries for our work.
options(warn = -1) # Suppress Warnings
library(ggplot2) # for some amazing looking graphs
library(MASS) # Library for the Box-Cox transform toward the end
library(corrplot) # Plotting a nice correlation matrix
library(dplyr) # Library for splitting train & test dataset
library(caret)
library(caTools) # loading caTools library
library(e1071)
library(rpart)
library(randomForest)
library(ranger)
library(xgboost)
library(plyr)
library(mlr)
library(rJava)
library(FSelector)
library(randomForestSRC)
library(stringr)
library(mboost)
library(ipred)
Load the train and test datasets. Let’s look at how many observations and how many features each dataset contains.
# Load the Ames Housing dataset - this is the training dataset through which we will build the model
setwd("C:/Users/ajana/Desktop/Imarticus Project/R Project/Data Set/") # setting dataset directory
trainData = read.csv("Group Project on R-Data Set-4.csv")
testData = read.csv("Group Project on R-Data Set-3.csv")
target <- 'SalePrice'
predictors <- setdiff(colnames(trainData), target)
# Saving train data index for splitting later
trainingRowIndex <- nrow(trainData)
trainingSample <- nrow(trainData)
testingRowIndex <- trainingRowIndex + 1
testingSample <- nrow(testData)
totalFeatures <- length(predictors)
totalSample <- trainingSample + testingSample
stmt <- paste("Total number of observations in Train Dataset is ", trainingSample, " and Test Dataset is ", testingSample)
print(stmt, na.print = NULL)
## [1] "Total number of observations in Train Dataset is 1460 and Test Dataset is 1459"
stmt <- paste("Total number of relevant Features in Train Dataset ", totalFeatures)
print(stmt, na.print = NULL)
## [1] "Total number of relevant Features in Train Dataset 80"
# Removing Id (unique identifier), which has no significance for the model
trainData_wrk <- select(trainData, -Id, -SalePrice)
testData_wrk <- select(testData, -Id)
# Combining training and testing datasets for joint analysis, feature cleaning and changes
completeData <- rbind(trainData_wrk, testData_wrk)
num_vars<-colnames(completeData[sapply(completeData, is.numeric) == TRUE])
cat_vars<-colnames(completeData[sapply(completeData, is.factor) == TRUE])
stmt <- paste("There are", length(num_vars), "numerical features and ", length(cat_vars), "categorical features" )
print(stmt, na.print = NULL)
## [1] "There are 36 numerical features and 43 categorical features"
Let’s gather null / missing value statistics and identify each feature’s percentage of missing values against the overall population.
summary(completeData) # Summarize the data of Housing Prices dataset before null values treatment
## MSSubClass MSZoning LotFrontage LotArea
## Min. : 20.00 C (all): 25 Min. : 21.00 Min. : 1300
## 1st Qu.: 20.00 FV : 139 1st Qu.: 59.00 1st Qu.: 7478
## Median : 50.00 RH : 26 Median : 68.00 Median : 9453
## Mean : 57.14 RL :2265 Mean : 69.31 Mean : 10168
## 3rd Qu.: 70.00 RM : 460 3rd Qu.: 80.00 3rd Qu.: 11570
## Max. :190.00 NA's : 4 Max. :313.00 Max. :215245
## NA's :486
## Street Alley LotShape LandContour Utilities
## Grvl: 12 Grvl: 120 IR1: 968 Bnk: 117 AllPub:2916
## Pave:2907 Pave: 78 IR2: 76 HLS: 120 NoSeWa: 1
## NA's:2721 IR3: 16 Low: 60 NA's : 2
## Reg:1859 Lvl:2622
##
##
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 511 Gtl:2778 NAmes : 443 Norm :2511 Norm :2889
## CulDSac: 176 Mod: 125 CollgCr: 267 Feedr : 164 Feedr : 13
## FR2 : 85 Sev: 16 OldTown: 239 Artery : 92 Artery : 5
## FR3 : 14 Edwards: 194 RRAn : 50 PosA : 4
## Inside :2133 Somerst: 182 PosN : 39 PosN : 4
## NridgHt: 166 RRAe : 28 RRNn : 2
## (Other):1428 (Other): 35 (Other): 2
## BldgType HouseStyle OverallQual OverallCond
## 1Fam :2425 1Story :1471 Min. : 1.000 Min. :1.000
## 2fmCon: 62 2Story : 872 1st Qu.: 5.000 1st Qu.:5.000
## Duplex: 109 1.5Fin : 314 Median : 6.000 Median :5.000
## Twnhs : 96 SLvl : 128 Mean : 6.089 Mean :5.565
## TwnhsE: 227 SFoyer : 83 3rd Qu.: 7.000 3rd Qu.:6.000
## 2.5Unf : 24 Max. :10.000 Max. :9.000
## (Other): 27
## YearBuilt YearRemodAdd RoofStyle RoofMatl
## Min. :1872 Min. :1950 Flat : 20 CompShg:2876
## 1st Qu.:1954 1st Qu.:1965 Gable :2310 Tar&Grv: 23
## Median :1973 Median :1993 Gambrel: 22 WdShake: 9
## Mean :1971 Mean :1984 Hip : 551 WdShngl: 7
## 3rd Qu.:2001 3rd Qu.:2004 Mansard: 11 ClyTile: 1
## Max. :2010 Max. :2010 Shed : 5 Membran: 1
## (Other): 2
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual
## VinylSd:1025 VinylSd:1014 BrkCmn : 25 Min. : 0.0 Ex: 107
## MetalSd: 450 MetalSd: 447 BrkFace: 879 1st Qu.: 0.0 Fa: 35
## HdBoard: 442 HdBoard: 406 None :1742 Median : 0.0 Gd: 979
## Wd Sdng: 411 Wd Sdng: 391 Stone : 249 Mean : 102.2 TA:1798
## Plywood: 221 Plywood: 270 NA's : 24 3rd Qu.: 164.0
## (Other): 369 (Other): 390 Max. :1600.0
## NA's : 1 NA's : 1 NA's :23
## ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## Ex: 12 BrkTil: 311 Ex : 258 Fa : 104 Av : 418 ALQ :429
## Fa: 67 CBlock:1235 Fa : 88 Gd : 122 Gd : 276 BLQ :269
## Gd: 299 PConc :1308 Gd :1209 Po : 5 Mn : 239 GLQ :849
## Po: 3 Slab : 49 TA :1283 TA :2606 No :1904 LwQ :154
## TA:2538 Stone : 11 NA's: 81 NA's: 82 NA's: 82 Rec :288
## Wood : 5 Unf :851
## NA's: 79
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ : 52 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ : 68 1st Qu.: 0.00 1st Qu.: 220.0
## Median : 368.5 GLQ : 34 Median : 0.00 Median : 467.0
## Mean : 441.4 LwQ : 87 Mean : 49.58 Mean : 560.8
## 3rd Qu.: 733.0 Rec : 105 3rd Qu.: 0.00 3rd Qu.: 805.5
## Max. :5644.0 Unf :2493 Max. :1526.00 Max. :2336.0
## NA's :1 NA's: 80 NA's :1 NA's :1
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0.0 Floor: 1 Ex:1493 N: 196 FuseA: 188
## 1st Qu.: 793.0 GasA :2874 Fa: 92 Y:2723 FuseF: 50
## Median : 989.5 GasW : 27 Gd: 474 FuseP: 8
## Mean :1051.8 Grav : 9 Po: 3 Mix : 1
## 3rd Qu.:1302.0 OthW : 2 TA: 857 SBrkr:2671
## Max. :6110.0 Wall : 6 NA's : 1
## NA's :1
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0.0 Min. : 0.000 Min. : 334
## 1st Qu.: 876 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1126
## Median :1082 Median : 0.0 Median : 0.000 Median :1444
## Mean :1160 Mean : 336.5 Mean : 4.694 Mean :1501
## 3rd Qu.:1388 3rd Qu.: 704.0 3rd Qu.: 0.000 3rd Qu.:1744
## Max. :5095 Max. :2065.0 Max. :1064.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4299 Mean :0.06136 Mean :1.568 Mean :0.3803
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :4.000 Max. :2.0000
## NA's :2 NA's :2
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.00 Min. :0.000 Ex : 205 Min. : 2.000
## 1st Qu.:2.00 1st Qu.:1.000 Fa : 70 1st Qu.: 5.000
## Median :3.00 Median :1.000 Gd :1151 Median : 6.000
## Mean :2.86 Mean :1.045 TA :1492 Mean : 6.452
## 3rd Qu.:3.00 3rd Qu.:1.000 NA's: 1 3rd Qu.: 7.000
## Max. :8.00 Max. :3.000 Max. :15.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Typ :2717 Min. :0.0000 Ex : 43 2Types : 23 Min. :1895
## Min2 : 70 1st Qu.:0.0000 Fa : 74 Attchd :1723 1st Qu.:1960
## Min1 : 65 Median :1.0000 Gd : 744 Basment: 36 Median :1979
## Mod : 35 Mean :0.5971 Po : 46 BuiltIn: 186 Mean :1978
## Maj1 : 19 3rd Qu.:1.0000 TA : 592 CarPort: 15 3rd Qu.:2002
## (Other): 11 Max. :4.0000 NA's:1420 Detchd : 779 Max. :2207
## NA's : 2 NA's : 157 NA's :159
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin : 719 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 3
## RFn : 811 1st Qu.:1.000 1st Qu.: 320.0 Fa : 124 Fa : 74
## Unf :1230 Median :2.000 Median : 480.0 Gd : 24 Gd : 15
## NA's: 159 Mean :1.767 Mean : 472.9 Po : 5 Po : 14
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :2604 TA :2654
## Max. :5.000 Max. :1488.0 NA's: 159 NA's: 159
## NA's :1 NA's :1
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## N: 216 Min. : 0.00 Min. : 0.00 Min. : 0.0
## P: 62 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0
## Y:2641 Median : 0.00 Median : 26.00 Median : 0.0
## Mean : 93.71 Mean : 47.49 Mean : 23.1
## 3rd Qu.: 168.00 3rd Qu.: 70.00 3rd Qu.: 0.0
## Max. :1424.00 Max. :742.00 Max. :1012.0
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Ex : 4
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.000 Median : 0.00 Median : 0.000 Gd : 4
## Mean : 2.602 Mean : 16.06 Mean : 2.252 NA's:2909
## 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.000 Max. :576.00 Max. :800.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv: 118 Gar2: 5 Min. : 0.00 Min. : 1.000
## GdWo : 112 Othr: 4 1st Qu.: 0.00 1st Qu.: 4.000
## MnPrv: 329 Shed: 95 Median : 0.00 Median : 6.000
## MnWw : 12 TenC: 1 Mean : 50.83 Mean : 6.213
## NA's :2348 NA's:2814 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :17000.00 Max. :12.000
##
## YrSold SaleType SaleCondition
## Min. :2006 WD :2525 Abnorml: 190
## 1st Qu.:2007 New : 239 AdjLand: 12
## Median :2008 COD : 87 Alloca : 24
## Mean :2008 ConLD : 26 Family : 46
## 3rd Qu.:2009 CWD : 12 Normal :2402
## Max. :2010 (Other): 29 Partial: 245
## NA's : 1
## Let's check for any missing values in the data
colSums(is.na(completeData))
## MSSubClass MSZoning LotFrontage LotArea Street
## 0 4 486 0 0
## Alley LotShape LandContour Utilities LotConfig
## 2721 0 0 2 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 1 1 24
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 23 0 0 0 81
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 82 82 79 1 80
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 1 1 1 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 1 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 2 2 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 0 1 0 2
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 0 1420 157 159 159
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## 1 1 159 159 0
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 0 0 0 0 0
## PoolArea PoolQC Fence MiscFeature MiscVal
## 0 2909 2348 2814 0
## MoSold YrSold SaleType SaleCondition
## 0 0 1 0
# create table to count null values and percentage against overall population
null_count_df <- sapply(completeData, function(y) sum(length(which(is.na(y)))))
null_count_df <- data.frame(sort(null_count_df[null_count_df>0], decreasing = TRUE))
colnames(null_count_df)[1] <- "NullCount"
null_count_df$PctNull <- round(null_count_df$NullCount / (trainingSample+testingSample), 2)
null_count_df
## NullCount PctNull
## PoolQC 2909 1.00
## MiscFeature 2814 0.96
## Alley 2721 0.93
## Fence 2348 0.80
## FireplaceQu 1420 0.49
## LotFrontage 486 0.17
## GarageYrBlt 159 0.05
## GarageFinish 159 0.05
## GarageQual 159 0.05
## GarageCond 159 0.05
## GarageType 157 0.05
## BsmtCond 82 0.03
## BsmtExposure 82 0.03
## BsmtQual 81 0.03
## BsmtFinType2 80 0.03
## BsmtFinType1 79 0.03
## MasVnrType 24 0.01
## MasVnrArea 23 0.01
## MSZoning 4 0.00
## Utilities 2 0.00
## BsmtFullBath 2 0.00
## BsmtHalfBath 2 0.00
## Functional 2 0.00
## Exterior1st 1 0.00
## Exterior2nd 1 0.00
## BsmtFinSF1 1 0.00
## BsmtFinSF2 1 0.00
## BsmtUnfSF 1 0.00
## TotalBsmtSF 1 0.00
## Electrical 1 0.00
## KitchenQual 1 0.00
## GarageCars 1 0.00
## GarageArea 1 0.00
## SaleType 1 0.00
Let’s remove the features where more than 40% of the data is either null or missing.
predictors <- colnames(completeData)
# Remove features with more than 40% null / missing values
null_col_name <- rownames(null_count_df[null_count_df$PctNull>=as.numeric(0.40), ])
null_col_name
## [1] "PoolQC" "MiscFeature" "Alley" "Fence" "FireplaceQu"
# keep only the features that were not dropped and subset the dataset accordingly
predictors <- setdiff(predictors, null_col_name)
completeData <- completeData[ , predictors]
print(paste0("Total number of features with more than 40% null values are ", length(null_col_name)))
## [1] "Total number of features with more than 40% null values are 5"
For continuous features where fewer than 40% of the values are null / missing, the nulls are replaced with the mean value of the feature.
## Identify numeric variables and impute any missing / null values with the mean
for (col_name in colnames(completeData[sapply(completeData, is.numeric) == TRUE])) {
if (sum(is.na(completeData[[col_name]])) > 0) {
completeData[col_name][is.na(completeData[col_name])] <- mean(completeData[[col_name]], na.rm = T)
stmt <- paste('Null values of', col_name, ' feature has been replaced by mean value ', mean(completeData[[col_name]], na.rm = T))
print(stmt, na.print = NULL)
}
}
## [1] "Null values of LotFrontage feature has been replaced by mean value 69.3057953144266"
## [1] "Null values of MasVnrArea feature has been replaced by mean value 102.201312154696"
## [1] "Null values of BsmtFinSF1 feature has been replaced by mean value 441.423235092529"
## [1] "Null values of BsmtFinSF2 feature has been replaced by mean value 49.5822481151474"
## [1] "Null values of BsmtUnfSF feature has been replaced by mean value 560.772104180946"
## [1] "Null values of TotalBsmtSF feature has been replaced by mean value 1051.77758738862"
## [1] "Null values of BsmtFullBath feature has been replaced by mean value 0.429893726431265"
## [1] "Null values of BsmtHalfBath feature has been replaced by mean value 0.061364415495372"
## [1] "Null values of GarageYrBlt feature has been replaced by mean value 1978.1134057971"
## [1] "Null values of GarageCars feature has been replaced by mean value 1.76662097326936"
## [1] "Null values of GarageArea feature has been replaced by mean value 472.8745716244"
For categorical features where fewer than 40% of the values are null / missing, the nulls are replaced with the mode, i.e. the value whose frequency is maximum.
# Helper returning the most frequent value(s) of a vector
Mode = function(x){
  ta = table(x)
  tam = max(ta)
  mod = names(ta)[ta == tam] # note: returns all values tied for the maximum
  return(mod)
}
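A quick sanity check of the helper (note that with tied frequencies it returns all modes, so the imputation below assumes a unique maximum):
Mode(c("a", "b", "b", "c")) # "b"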
## Identify categorical variables and impute any missing / null values with the mode
for (col_name in colnames(completeData[sapply(completeData, is.factor) == TRUE])) {
if (sum(is.na(completeData[[col_name]])) > 0) {
completeData[col_name][is.na(completeData[col_name])] <- Mode(completeData[[col_name]])
stmt <- paste('Null values of', col_name, ' feature has been replaced by mode value ', Mode(completeData[[col_name]]))
print(stmt, na.print = NULL)
}
}
## [1] "Null values of MSZoning feature has been replaced by mode value RL"
## [1] "Null values of Utilities feature has been replaced by mode value AllPub"
## [1] "Null values of Exterior1st feature has been replaced by mode value VinylSd"
## [1] "Null values of Exterior2nd feature has been replaced by mode value VinylSd"
## [1] "Null values of MasVnrType feature has been replaced by mode value None"
## [1] "Null values of BsmtQual feature has been replaced by mode value TA"
## [1] "Null values of BsmtCond feature has been replaced by mode value TA"
## [1] "Null values of BsmtExposure feature has been replaced by mode value No"
## [1] "Null values of BsmtFinType1 feature has been replaced by mode value Unf"
## [1] "Null values of BsmtFinType2 feature has been replaced by mode value Unf"
## [1] "Null values of Electrical feature has been replaced by mode value SBrkr"
## [1] "Null values of KitchenQual feature has been replaced by mode value TA"
## [1] "Null values of Functional feature has been replaced by mode value Typ"
## [1] "Null values of GarageType feature has been replaced by mode value Attchd"
## [1] "Null values of GarageFinish feature has been replaced by mode value Unf"
## [1] "Null values of GarageQual feature has been replaced by mode value TA"
## [1] "Null values of GarageCond feature has been replaced by mode value TA"
## [1] "Null values of SaleType feature has been replaced by mode value WD"
## Let's check for any missing values in the data
colSums(is.na(completeData))
## MSSubClass MSZoning LotFrontage LotArea Street
## 0 0 0 0 0
## LotShape LandContour Utilities LotConfig LandSlope
## 0 0 0 0 0
## Neighborhood Condition1 Condition2 BldgType HouseStyle
## 0 0 0 0 0
## OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 0 0 0 0 0
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## 0 0 0 0 0
## ExterQual ExterCond Foundation BsmtQual BsmtCond
## 0 0 0 0 0
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 0 0 0 0 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## 0 0 0 0 0
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 0 0 0 0 0
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 0 0 0 0 0
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 0 0 0 0 0
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## 0 0 0 0 0
## GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 0 0 0 0 0
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
## 0 0 0 0 0
## MoSold YrSold SaleType SaleCondition
## 0 0 0 0
summary(completeData) # Summarize the data of Housing Prices dataset after null values treatment
## MSSubClass MSZoning LotFrontage LotArea
## Min. : 20.00 C (all): 25 Min. : 21.00 Min. : 1300
## 1st Qu.: 20.00 FV : 139 1st Qu.: 60.00 1st Qu.: 7478
## Median : 50.00 RH : 26 Median : 69.31 Median : 9453
## Mean : 57.14 RL :2269 Mean : 69.31 Mean : 10168
## 3rd Qu.: 70.00 RM : 460 3rd Qu.: 78.00 3rd Qu.: 11570
## Max. :190.00 Max. :313.00 Max. :215245
##
## Street LotShape LandContour Utilities LotConfig
## Grvl: 12 IR1: 968 Bnk: 117 AllPub:2918 Corner : 511
## Pave:2907 IR2: 76 HLS: 120 NoSeWa: 1 CulDSac: 176
## IR3: 16 Low: 60 FR2 : 85
## Reg:1859 Lvl:2622 FR3 : 14
## Inside :2133
##
##
## LandSlope Neighborhood Condition1 Condition2 BldgType
## Gtl:2778 NAmes : 443 Norm :2511 Norm :2889 1Fam :2425
## Mod: 125 CollgCr: 267 Feedr : 164 Feedr : 13 2fmCon: 62
## Sev: 16 OldTown: 239 Artery : 92 Artery : 5 Duplex: 109
## Edwards: 194 RRAn : 50 PosA : 4 Twnhs : 96
## Somerst: 182 PosN : 39 PosN : 4 TwnhsE: 227
## NridgHt: 166 RRAe : 28 RRNn : 2
## (Other):1428 (Other): 35 (Other): 2
## HouseStyle OverallQual OverallCond YearBuilt
## 1Story :1471 Min. : 1.000 Min. :1.000 Min. :1872
## 2Story : 872 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## 1.5Fin : 314 Median : 6.000 Median :5.000 Median :1973
## SLvl : 128 Mean : 6.089 Mean :5.565 Mean :1971
## SFoyer : 83 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## 2.5Unf : 24 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 27
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Flat : 20 CompShg:2876 VinylSd:1026
## 1st Qu.:1965 Gable :2310 Tar&Grv: 23 MetalSd: 450
## Median :1993 Gambrel: 22 WdShake: 9 HdBoard: 442
## Mean :1984 Hip : 551 WdShngl: 7 Wd Sdng: 411
## 3rd Qu.:2004 Mansard: 11 ClyTile: 1 Plywood: 221
## Max. :2010 Shed : 5 Membran: 1 CemntBd: 126
## (Other): 2 (Other): 243
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## VinylSd:1015 BrkCmn : 25 Min. : 0.0 Ex: 107 Ex: 12
## MetalSd: 447 BrkFace: 879 1st Qu.: 0.0 Fa: 35 Fa: 67
## HdBoard: 406 None :1766 Median : 0.0 Gd: 979 Gd: 299
## Wd Sdng: 391 Stone : 249 Mean : 102.2 TA:1798 Po: 3
## Plywood: 270 3rd Qu.: 163.5 TA:2538
## CmentBd: 126 Max. :1600.0
## (Other): 264
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## BrkTil: 311 Ex: 258 Fa: 104 Av: 418 ALQ:429
## CBlock:1235 Fa: 88 Gd: 122 Gd: 276 BLQ:269
## PConc :1308 Gd:1209 Po: 5 Mn: 239 GLQ:849
## Slab : 49 TA:1364 TA:2688 No:1986 LwQ:154
## Stone : 11 Rec:288
## Wood : 5 Unf:930
##
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ: 52 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ: 68 1st Qu.: 0.00 1st Qu.: 220.0
## Median : 369.0 GLQ: 34 Median : 0.00 Median : 467.0
## Mean : 441.4 LwQ: 87 Mean : 49.58 Mean : 560.8
## 3rd Qu.: 733.0 Rec: 105 3rd Qu.: 0.00 3rd Qu.: 805.0
## Max. :5644.0 Unf:2573 Max. :1526.00 Max. :2336.0
##
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0 Floor: 1 Ex:1493 N: 196 FuseA: 188
## 1st Qu.: 793 GasA :2874 Fa: 92 Y:2723 FuseF: 50
## Median : 990 GasW : 27 Gd: 474 FuseP: 8
## Mean :1052 Grav : 9 Po: 3 Mix : 1
## 3rd Qu.:1302 OthW : 2 TA: 857 SBrkr:2672
## Max. :6110 Wall : 6
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0.0 Min. : 0.000 Min. : 334
## 1st Qu.: 876 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1126
## Median :1082 Median : 0.0 Median : 0.000 Median :1444
## Mean :1160 Mean : 336.5 Mean : 4.694 Mean :1501
## 3rd Qu.:1388 3rd Qu.: 704.0 3rd Qu.: 0.000 3rd Qu.:1744
## Max. :5095 Max. :2065.0 Max. :1064.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4299 Mean :0.06136 Mean :1.568 Mean :0.3803
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :4.000 Max. :2.0000
##
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.00 Min. :0.000 Ex: 205 Min. : 2.000 Maj1: 19
## 1st Qu.:2.00 1st Qu.:1.000 Fa: 70 1st Qu.: 5.000 Maj2: 9
## Median :3.00 Median :1.000 Gd:1151 Median : 6.000 Min1: 65
## Mean :2.86 Mean :1.045 TA:1493 Mean : 6.452 Min2: 70
## 3rd Qu.:3.00 3rd Qu.:1.000 3rd Qu.: 7.000 Mod : 35
## Max. :8.00 Max. :3.000 Max. :15.000 Sev : 2
## Typ :2719
## Fireplaces GarageType GarageYrBlt GarageFinish
## Min. :0.0000 2Types : 23 Min. :1895 Fin: 719
## 1st Qu.:0.0000 Attchd :1880 1st Qu.:1962 RFn: 811
## Median :1.0000 Basment: 36 Median :1978 Unf:1389
## Mean :0.5971 BuiltIn: 186 Mean :1978
## 3rd Qu.:1.0000 CarPort: 15 3rd Qu.:2001
## Max. :4.0000 Detchd : 779 Max. :2207
##
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## Min. :0.000 Min. : 0.0 Ex: 3 Ex: 3 N: 216
## 1st Qu.:1.000 1st Qu.: 320.0 Fa: 124 Fa: 74 P: 62
## Median :2.000 Median : 480.0 Gd: 24 Gd: 15 Y:2641
## Mean :1.767 Mean : 472.9 Po: 5 Po: 14
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA:2763 TA:2813
## Max. :5.000 Max. :1488.0
##
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.000
## Median : 0.00 Median : 26.00 Median : 0.0 Median : 0.000
## Mean : 93.71 Mean : 47.49 Mean : 23.1 Mean : 2.602
## 3rd Qu.: 168.00 3rd Qu.: 70.00 3rd Qu.: 0.0 3rd Qu.: 0.000
## Max. :1424.00 Max. :742.00 Max. :1012.0 Max. :508.000
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 4.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 16.06 Mean : 2.252 Mean : 50.83 Mean : 6.213
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :576.00 Max. :800.000 Max. :17000.00 Max. :12.000
##
## YrSold SaleType SaleCondition
## Min. :2006 WD :2526 Abnorml: 190
## 1st Qu.:2007 New : 239 AdjLand: 12
## Median :2008 COD : 87 Alloca : 24
## Mean :2008 ConLD : 26 Family : 46
## 3rd Qu.:2009 CWD : 12 Normal :2402
## Max. :2010 ConLI : 9 Partial: 245
## (Other): 20
We can observe that features such as Street and Utilities don’t have much variance; hence, we can remove these features from the dataset.
# helper function for plotting categoric data for easier data visualization
plot.categoric <- function(cols, df){
  for (col in cols) {
    # order the bars by descending frequency (use df, the data actually being plotted)
    order.cols <- names(sort(table(df[,col]), decreasing = TRUE))
    num.plot <- qplot(df[,col]) +
      geom_bar(fill = 'cornflowerblue') +
      geom_text(aes(label = ..count..), stat='count', vjust=-0.5) +
      theme_minimal() +
      scale_y_continuous(limits = c(0,max(table(df[,col]))*1.1)) +
      scale_x_discrete(limits = order.cols) +
      xlab(col) +
      theme(axis.text.x = element_text(angle = 30, size=12))
    print(num.plot)
  }
}
# function that groups a column by its levels and returns the mean SalePrice (and mean OverallQual) for each unique level.
group.df <- completeData[1:trainingRowIndex,]
group.df$SalePrice <- trainData$SalePrice
group.prices <- function(col) {
group.table <- group.df[,c(col, 'SalePrice', 'OverallQual')] %>%
group_by_(col) %>%
dplyr::summarise(mean.Quality = round(mean(OverallQual),2),
mean.Price = mean(SalePrice), n = n()) %>%
arrange(mean.Quality)
print(qplot(x=reorder(group.table[[col]], -group.table[['mean.Price']]), y=group.table[['mean.Price']]) +
geom_bar(stat='identity', fill='cornflowerblue') +
theme_minimal() +
labs(x=col, y='Mean SalePrice') +
theme(axis.text.x = element_text(angle = 45)))
return(data.frame(group.table))
}
## function to compute the mean overall quality for each level
quality.mean <- function(col) {
group.table <- group.df[,c(col, 'OverallQual')] %>%
group_by_(col) %>%
dplyr::summarise(mean.qual = mean(OverallQual)) %>%
arrange(mean.qual)
return(data.frame(group.table))
}
# function that maps categoric values to their corresponding numeric values and returns the updated data frame
map.values <- function(cols, map.list, df){
for (col in cols){
df[[col]] <- as.numeric(map.list[df[,col]])
}
return(df)
}
# Year Bin - min year is 1872, max year is 2010, every 20 years create a new bin, 7 total bins
# Maybe for later versions :)
Let’s look into the distribution of some numerical features. If a distribution is skewed, we log-transform the variable so that the data is closer to normally distributed, which typically improves model performance.
ggplot(trainData, aes(SalePrice)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(trainData$SalePrice), sd = sd(trainData$SalePrice))) +
ggtitle("Plot of SalePrice Distribution") + theme(plot.title = element_text(size = 11))
ggplot(trainData, aes(log(SalePrice))) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(log(trainData$SalePrice)),
sd = sd(log(trainData$SalePrice)))) +
ggtitle("Plot of log(SalePrice) Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(LotFrontage)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(completeData$LotFrontage),
sd = sd(completeData$LotFrontage))) +
ggtitle("Plot of LotFrontage Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(log(LotFrontage))) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(log(completeData$LotFrontage)),
sd = sd(log(completeData$LotFrontage)))) +
ggtitle("Plot of log(LotFrontage) Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(LotArea)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(completeData$LotArea),
sd = sd(completeData$LotArea))) +
ggtitle("Plot of LotArea Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(log(LotArea))) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(log(completeData$LotArea)),
sd = sd(log(completeData$LotArea)))) +
ggtitle("Plot of log(LotArea) Distribution") + theme(plot.title = element_text(size = 11))
As we observe above, log transformation makes the distributions noticeably closer to normal. Hence, it is best to use log-transformed variables for building our model, which should give better performance.
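As a quick numeric check of what the plots show, the sketch below uses the skewness function from the e1071 package loaded earlier; values closer to 0 indicate a more symmetric distribution.
# Sample skewness before and after the log transform (closer to 0 is more symmetric)
sapply(list(SalePrice = trainData$SalePrice,
            logSalePrice = log(trainData$SalePrice),
            LotArea = completeData$LotArea,
            logLotArea = log(completeData$LotArea)),
       skewness)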
Based on categorical feature values and their corresponding SalePrice, we now create custom features, starting with some EDA on Utilities.
#trainScaled = completeData[1:trainingRowIndex, ]
#testScaled = completeData[testingRowIndex:totalSample, ]
#trainScaled$SalePrice <- trainData$SalePrice
plot.categoric('Utilities', completeData)
Utilities has only 1 value of NoSeWa and the rest are AllPub. We can drop this feature from our dataset: the house with ‘NoSeWa’ is in our training set, so the level won’t help with any predictive modelling, and the feature has hardly any variance.
plot.categoric('Street', completeData)
Street has only 12 values of Grvl and the rest are Pave. We can drop this feature from our dataset as well, since it has hardly any variance.
plot.categoric('PavedDrive', completeData)
completeData$HasPavedDrive <- (completeData$PavedDrive == 'Y') * 1
Most of the values for the PavedDrive feature are Y, so we create a binary feature indicating whether a house has a paved drive.
plot.categoric('GarageType', completeData)
completeData$GarageAttchd <- ifelse(completeData$GarageType == "Attchd" | completeData$GarageType == "BuiltIn", 1, 0)
completeData$GarageAttchd <- as.integer(completeData$GarageAttchd)
completeData$GarageDetchd <- (completeData$GarageType == 'Detchd') * 1
Most of the values for the GarageType feature are Attchd, Detchd and BuiltIn, so we create one binary feature for an attached garage (Attchd or BuiltIn) and another for a detached garage.
plot.categoric('LandContour', completeData)
completeData$LandLeveled <- (completeData$LandContour == 'Lvl') * 1
Most of the values for the LandContour feature are Lvl, so we create a binary feature indicating whether the land contour is leveled.
plot.categoric('LandSlope', completeData)
completeData$LandSlopeGentle <- (completeData$LandSlope == 'Gtl') * 1
Most of the values for the LandSlope feature are Gtl, meaning gentle, so we create a binary feature indicating whether the land slope is gentle.
plot.categoric('Electrical', completeData)
completeData$ElectricalSB <- (completeData$Electrical == 'SBrkr') * 1
completeData$ElectricalFuse <- (completeData$Electrical == 'FuseA' | completeData$Electrical == 'FuseF' | completeData$Electrical == 'FuseP') * 1
Most of the values for the Electrical feature are SBrkr or one of the fuse types, so we create one binary feature for SBrkr and another that combines all the fuse types.
group.prices('GarageQual')
## GarageQual mean.Quality mean.Price n
## 1 Fa 5.17 123573.4 48
## 2 Po 5.33 100166.7 3
## 3 TA 6.12 182591.9 1392
## 4 Gd 6.71 215860.7 14
## 5 Ex 7.00 241000.0 3
group.prices('BsmtCond')
## BsmtCond mean.Quality mean.Price n
## 1 Po 3.00 64000.0 2
## 2 Fa 5.07 121809.5 45
## 3 TA 6.11 181492.2 1348
## 4 Gd 6.72 213599.9 65
qual.cols <- c('ExterQual', 'ExterCond', 'GarageQual', 'GarageCond', 'KitchenQual', 'HeatingQC', 'BsmtQual', 'BsmtCond')
qual.list <- c('Po' = 1, 'Fa' = 2, 'TA' = 3, 'Gd' = 4, 'Ex' = 5)
completeData <- map.values(qual.cols, qual.list, completeData)
From the mean sale prices of a few of the quality and condition features we can infer that the abbreviations mean poor, fair, typical/average, good and excellent. We map these categoric values to the numeric values 1-5 and write them back into our dataframe.
group.prices('BsmtFinType1')
## BsmtFinType1 mean.Quality mean.Price n
## 1 BLQ 5.35 149493.7 148
## 2 Rec 5.35 146889.2 133
## 3 LwQ 5.54 151852.7 74
## 4 ALQ 5.55 161573.1 220
## 5 Unf 6.05 165519.3 467
## 6 GLQ 7.05 235413.7 418
Here the mean sale prices might not be as helpful as the median basement areas of both columns for determining which quality level is better than another.
# visualization for BsmtFinType1 against BsmtFinSF1
completeData[,c('BsmtFinType1', 'BsmtFinSF1')] %>%
group_by(BsmtFinType1) %>% dplyr::summarise(medianArea = median(BsmtFinSF1)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(BsmtFinType1,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='BsmtFinType1', y='Median of BsmtFinSF1') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) +
scale_y_continuous(limits = c(0,850)) + theme_minimal()
# visualization for BsmtFinType2 against BsmtFinSF2
completeData[,c('BsmtFinType2', 'BsmtFinSF2')] %>%
group_by(BsmtFinType2) %>% dplyr::summarise(medianArea = median(BsmtFinSF2)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(BsmtFinType2,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='BsmtFinType2', y='Median of BsmtFinSF2') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) +
scale_y_continuous(limits = c(0,850)) + theme_minimal()
bsmt.fin.list <- c('Unf' = 1, 'LwQ' = 2,'Rec'= 3, 'BLQ' = 4, 'ALQ' = 5, 'GLQ' = 6)
completeData <- map.values(c('BsmtFinType1','BsmtFinType2'), bsmt.fin.list, completeData)
Through investigating the relationships between basement quality and area we can see the true order of basement qualities to be ‘Unf’ < ‘LwQ’ < ‘Rec’ < ‘BLQ’ < ‘ALQ’ < ‘GLQ’.
group.prices('BsmtExposure')
## BsmtExposure mean.Quality mean.Price n
## 1 No 5.86 163439.2 991
## 2 Mn 6.25 192789.7 114
## 3 Av 6.57 206643.4 221
## 4 Gd 6.96 257689.8 134
bsmt.list <- c('No' = 1, 'Mn' = 2, 'Av' = 3, 'Gd' = 4)
completeData = map.values(c('BsmtExposure'), bsmt.list, completeData)
group.prices('Functional')
## Functional mean.Quality mean.Price n
## 1 Min2 4.97 144240.6 34
## 2 Maj2 5.00 85800.0 5
## 3 Min1 5.26 146385.5 31
## 4 Mod 5.40 168393.3 15
## 5 Maj1 5.50 153948.1 14
## 6 Sev 6.00 129000.0 1
## 7 Typ 6.16 183429.1 1360
functional.list <- c('Maj2' = 1, 'Sev' = 2, 'Min2' = 3, 'Min1' = 4, 'Maj1' = 5, 'Mod' = 6, 'Typ' = 7)
completeData = map.values(c('Functional'), functional.list, completeData)
group.prices('GarageFinish')
## GarageFinish mean.Quality mean.Price n
## 1 Unf 5.31 137570.5 686
## 2 RFn 6.57 202068.9 422
## 3 Fin 7.07 240052.7 352
garage.fin.list <- c('Unf' = 1, 'RFn' = 1, 'Fin' = 2)
completeData = map.values(c('GarageFinish'), garage.fin.list, completeData)
group.prices('Condition1')
## Condition1 mean.Quality mean.Price n
## 1 Feedr 5.31 142475.5 81
## 2 RRAe 5.45 138400.0 11
## 3 Artery 5.58 135091.7 48
## 4 Norm 6.15 184495.5 1260
## 5 RRAn 6.35 184396.6 26
## 6 RRNe 6.50 190750.0 2
## 7 PosA 6.62 225875.0 8
## 8 PosN 6.68 215184.2 19
## 9 RRNn 7.00 212400.0 5
group.prices('Condition2')
## Condition2 mean.Quality mean.Price n
## 1 Artery 4.0 106500.0 2
## 2 RRNn 4.5 96750.0 2
## 3 RRAe 5.0 190000.0 1
## 4 RRAn 5.0 136905.0 1
## 5 Feedr 5.5 121166.7 6
## 6 Norm 6.1 181169.4 1445
## 7 PosA 10.0 325000.0 1
## 8 PosN 10.0 284875.0 2
condition.list <- c('Feedr' = 1, 'Artery' = 1,'RRAe'= 2, 'RRAn' = 2, 'RRNe' = 2, 'RRNn' = 2, 'Norm' = 3, 'PosA' = 4, 'PosN' = 4)
completeData <- map.values(c('Condition1','Condition2'), condition.list, completeData)
# visualization for LotShape against LotArea
completeData[,c('LotShape', 'LotArea')] %>%
group_by(LotShape) %>% dplyr::summarise(medianArea = median(LotArea)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(LotShape,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='LotShape', y='Median of LotArea') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) + theme_minimal()
# visualization for LotConfig against LotArea
completeData[,c('LotConfig', 'LotArea')] %>%
group_by(LotConfig) %>% dplyr::summarise(medianArea = median(LotArea)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(LotConfig,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='LotConfig', y='Median of LotArea') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) + theme_minimal()
lot.shape.list <- c('Reg' = 1,'IR1' = 2, 'IR2' = 3, 'IR3' = 4)
completeData = map.values(c('LotShape'), lot.shape.list, completeData)
lot.config.list <- c('Inside' = 1,'FR3' = 2, 'FR2' = 2, 'Corner' = 3, 'CulDSac' = 4)
completeData = map.values(c('LotConfig'), lot.config.list, completeData)
# visualization for MasVnrType against MasVnrArea
completeData[,c('MasVnrType', 'MasVnrArea')] %>%
group_by(MasVnrType) %>% dplyr::summarise(medianArea = median(MasVnrArea)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(MasVnrType,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='MasVnrType', y='Median of MasVnrArea') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) + theme_minimal()
MasVnrType.list <- c('None' = 0,'BrkCmn' = 1, 'Stone' = 2, 'BrkFace' = 3)
completeData = map.values(c('MasVnrType'), MasVnrType.list, completeData)
group.prices('Neighborhood')
## Neighborhood mean.Quality mean.Price n
## 1 MeadowV 4.47 98576.47 17
## 2 IDOTRR 4.76 100123.78 37
## 3 Sawyer 5.03 136793.14 74
## 4 BrkSide 5.05 124834.05 58
## 5 Edwards 5.08 128219.70 100
## 6 NAmes 5.36 145847.08 225
## 7 OldTown 5.39 128225.30 113
## 8 SWISU 5.44 142591.36 25
## 9 Mitchel 5.59 156270.12 49
## 10 BrDale 5.69 104493.75 16
## 11 ClearCr 5.89 212565.43 28
## 12 Blueste 6.00 137500.00 2
## 13 NPkVill 6.00 142694.44 9
## 14 Crawfor 6.27 210624.73 51
## 15 SawyerW 6.32 186555.80 59
## 16 NWAmes 6.33 189050.07 73
## 17 Gilbert 6.56 192854.51 79
## 18 CollgCr 6.64 197965.77 150
## 19 Veenker 6.73 238772.73 11
## 20 Timber 7.16 242247.45 38
## 21 Blmngtn 7.18 194870.88 17
## 22 Somerst 7.34 225379.84 86
## 23 NoRidge 7.93 335295.32 41
## 24 StoneBr 8.16 310499.00 25
## 25 NridgHt 8.26 316270.62 77
nbrh.map <- c('MeadowV' = 0, 'IDOTRR' = 1, 'Sawyer' = 1, 'BrDale' = 1, 'OldTown' = 1, 'Edwards' = 1,
'BrkSide' = 1, 'Blueste' = 2, 'SWISU' = 2, 'NAmes' = 2, 'NPkVill' = 2, 'Mitchel' = 2,
'SawyerW' = 2, 'Gilbert' = 3, 'NWAmes' = 3, 'Blmngtn' = 3, 'CollgCr' = 3, 'Crawfor' = 3,
'ClearCr' = 4, 'Veenker' = 4, 'Somerst' = 4, 'Timber' = 4, 'StoneBr' = 5, 'NoRidge' = 5,
'NridgHt' = 5)
completeData$Neighborhood_bin <- as.numeric(nbrh.map[completeData$Neighborhood])
We can observe that features such as YearBuilt, YearRemodAdd and YrSold cannot be used directly as numeric features, since the model cannot distinguish a year from an ordinary number. Hence, we convert the year features into ages by comparing them against 2010, the last year in the dataset. We also add other features derived from existing ones.
# handling Year features
completeData$yrs_since_built <- 2010 - completeData$YearBuilt
completeData$yrs_since_sold <- 2010 - completeData$YrSold
completeData$yrs_since_remod <- completeData$YrSold - completeData$YearRemodAdd
# Total Living Area
completeData$TotalArea <- completeData$GrLivArea + completeData$TotalBsmtSF
# Total Porch
completeData$TotalPorch <- completeData$OpenPorchSF + completeData$X3SsnPorch + completeData$EnclosedPorch + completeData$ScreenPorch
# TotalBath
completeData$TotalBath <- completeData$BsmtFullBath + 0.5 * completeData$BsmtHalfBath + completeData$FullBath + 0.5 * completeData$HalfBath
# TotalFlrSF
completeData$TotalFlrSF <- completeData$X1stFlrSF + completeData$X2ndFlrSF
# OverallGrade
completeData$OverallGrade <- completeData$OverallQual * completeData$OverallCond
completeData$BsmtGrade <- completeData$BsmtQual * completeData$BsmtCond
completeData$GarageGrade <- completeData$GarageQual * completeData$GarageCond
completeData$KitchenScore <- completeData$KitchenAbvGr * completeData$KitchenQual
completeData$ExterGrade <- completeData$ExterQual * completeData$ExterCond
completeData$CentralAir=ifelse(completeData$CentralAir=="Y",1,0)
completeData$CentralAir <- as.integer(completeData$CentralAir)
Next we handle houses whose area-based features equal 0. A house with 0 square footage in a column simply does not have that feature at all. We add a one-hot encoded column that returns 1 for any house with an area greater than 0 (the house has the feature) and 0 otherwise.
cols.binary <- c('X2ndFlrSF', 'MasVnrArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'X3SsnPorch', 'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal')
for (col in cols.binary){
completeData[str_c('Has',col)] <- (completeData[,col] != 0) * 1
}
As mentioned above, Street and Utilities have near-zero variance, and we have created new features to replace the year variables. Hence, we can remove these features from the dataset.
predictors <- colnames(completeData)
remove_feat <- c('Street', 'Utilities', 'YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt',
'GarageType', 'PavedDrive', 'LandContour', 'LandSlope', 'Neighborhood', 'Exterior1st',
'Exterior2nd', 'Electrical', 'X2ndFlrSF', 'MasVnrArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', 'X3SsnPorch', 'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal',
'Foundation', 'Heating')
# Removing features from predictors
predictors <- setdiff(predictors, remove_feat)
completeData <- completeData[ , predictors]
Let’s handle the remaining categorical values. One approach is to create dummy variables for each value, where each new dummy column holds a 1 or a 0 based on the value of the categorical column. This technique is also called One-Hot Encoding - a representation of categorical variables as binary vectors; once the dummy variables are created, the original categorical feature is removed from the dataset. The manual one-hot implementation is kept commented out below; instead we use a simpler label encoding that converts each remaining factor to integer codes.
# One-Hot Encoding for all categorical variables (kept commented out; label encoding is applied below instead)
#for (cat_colname in colnames(completeData[sapply(completeData, is.factor) == TRUE])) {
#
# for(unique_value in unique(completeData[[cat_colname]])) {
#
# unq_val <- trimws(unique_value)
# unq_val <- sub(".", "", unq_val,fixed=TRUE)
# unq_val <- sub(")", "", unq_val,fixed=TRUE)
# unq_val <- sub("(", "", unq_val,fixed=TRUE)
# unq_val <- sub(" ", "", unq_val,fixed=TRUE)
# unq_val <- sub("&", "", unq_val,fixed=TRUE)
# completeData[paste(cat_colname, unq_val, sep="_")] <- ifelse(completeData[[cat_colname]]==unique_value, 1, 0)
# }
#
# completeData[(cat_colname)] <- NULL
#}
completeData <- data.frame(lapply(completeData, function(x) as.numeric(as.factor(x))))
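For reference, here is a minimal sketch of true one-hot encoding using caret’s dummyVars on a toy factor column (hypothetical data; this project keeps the simpler integer coding applied above):
toy <- data.frame(Heating = factor(c("GasA", "GasW", "GasA", "Wall")))
dv <- dummyVars(~ Heating, data = toy)
predict(dv, newdata = toy) # one indicator column per Heating level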
Before we perform any feature selection techniques, we need to normalize the train data by applying log transformations and split the combined dataset back into the original train and test datasets.
predictors <- colnames(completeData)
# transform into log variables
completeLog <- completeData
completeLog$LotFrontage <- log(completeData$LotFrontage)
completeLog$LotArea <- log(completeData$LotArea)
trainScaled = completeLog[1:trainingRowIndex, ]
testScaled = completeLog[testingRowIndex:totalSample, ]
trainScaled$SalePrice <- log(trainData$SalePrice)
#str(trainScaled)
#colSums(is.na(completeData))
For the same reason that we remove highly correlated features, we also need to remove linearly dependent features.
# Check if there are linear dependencies among the predictors
trainCleaned <- trainScaled
predictors <- setdiff(names(trainCleaned), target)
linear_dep_v <- findLinearCombos(trainCleaned[ ,predictors])
if (length(linear_dep_v$remove)>0) {
train_no_ld <- trainCleaned[, -c(linear_dep_v$remove)]
trainCleaned <- train_no_ld
rm(train_no_ld)
}
trainCleaned$SalePrice <- log(trainData$SalePrice) # re-attach the target on the log scale
print(paste0("Number of linearly dependent features : ", length(linear_dep_v$remove)))
## [1] "Number of linearly dependent features : 0"
Similarly, we remove near-zero-variance predictors, since features that are almost constant carry little information for the model.
nzv.data <- nearZeroVar(trainCleaned, saveMetrics = TRUE)
# drop any of the near-zero-variance predictors
drop.cols <- rownames(nzv.data)[nzv.data$nzv == TRUE]
trainCleaned <- trainCleaned[,!names(trainCleaned) %in% drop.cols]
print(paste0("Number of linearly dependent features : ", length(drop.cols)))
## [1] "Number of linearly dependent features : 12"
Entropy characterizes the impurity of an arbitrary collection of samples. Information gain is the expected reduction in entropy caused by partitioning the samples according to a given feature, which makes it a way of measuring the association between inputs and outputs; thus, the higher the information gain, the better. Features with an information gain above 0.03 are selected for further analysis.
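A minimal sketch (toy data, not project code) of the entropy that information gain builds on, H(X) = -sum(p * log2(p)):
entropy <- function(x) {
  p <- table(x) / length(x)
  -sum(p * log2(p))
}
entropy(c("a", "a", "b", "b")) # 1 bit: maximally impure two-class sample
entropy(c("a", "a", "a", "a")) # 0 bits: perfectly pure sample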
set.seed(123)
trainCleaned <- trainScaled
predictors <- setdiff(names(trainCleaned), target)
train.task <- makeRegrTask(data=trainCleaned, target="SalePrice")
imp_var1 <- generateFilterValuesData(train.task, method=c("information.gain"))
var1 <- imp_var1$data[imp_var1$data$information.gain > 0.03, c('name')]
plot_infogain <- plotFilterValues(imp_var1, feat.type.cols=TRUE)
plot_infogain
print(paste0("Number of significant features through Information Gain : ", length(var1)))
## [1] "Number of significant features through Information Gain : 51"
Pearson’s chi-squared test is a statistical test applied to sets of categorical data to evaluate how likely it is that any observed difference between the sets arose by chance; in other words, it tests the independence of two events. [8] Here it is used to test whether the occurrence of a specific feature value and the occurrence of a specific target value are independent. The test statistic is estimated for each feature and the features are ranked by the score; higher scores indicate that the null hypothesis (H0) of independence should be rejected, i.e. that the feature and the target are dependent, and such features are selected. Features with a chi-squared score above 0.30 are selected for further analysis.
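As a toy illustration of the underlying independence test (hypothetical counts, not project data); a large statistic and small p-value suggest the feature and target bins are dependent.
tbl <- matrix(c(30, 10, 10, 30), nrow = 2, byrow = TRUE,
              dimnames = list(Feature = c("low", "high"),
                              Target = c("cheap", "expensive")))
chisq.test(tbl) # X-squared = 18.05 (with Yates correction), p < 0.001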
imp_var2 <- generateFilterValuesData(train.task, method=c("chi.squared"))
var2 <- imp_var2$data[imp_var2$data$chi.squared > 0.30, c('name')]
plot_chi <- plotFilterValues(imp_var2, feat.type.cols=TRUE)
plot_chi
print(paste0("Number of significant features through Chi-Squared : ", length(var2)))
## [1] "Number of significant features through Chi-Squared : 39"
The mean decrease in Gini coefficient is a measure of how much each variable contributes to the homogeneity of the nodes and leaves in the resulting random forest. [9] Each time a particular variable is used to split a node, the Gini coefficient of the child nodes is calculated and compared to that of the original node; the Gini coefficient is a measure of homogeneity from 0 (homogeneous) to 1 (heterogeneous). The changes in Gini are summed for each variable and normalized at the end of the calculation, so variables that produce nodes with higher purity have a higher mean decrease in Gini. Features with a mean decrease in Gini above 0.001 are selected for further analysis.
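A minimal sketch (using the randomForest package loaded earlier, on the built-in mtcars data rather than the project data) of the importance measure this filter ranks by; for regression forests, IncNodePurity plays the role of the Gini decrease described above.
set.seed(123)
rf.toy <- randomForest(mpg ~ ., data = mtcars, ntree = 100, importance = TRUE)
importance(rf.toy) # columns %IncMSE and IncNodePurity, one row per variable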
imp_var3 <- generateFilterValuesData(train.task, method=c("rf.importance"))
var3 <- imp_var3$data[imp_var3$data$rf.importance > 0.001, c('name')]
plot_rf <- plotFilterValues(imp_var3, feat.type.cols=TRUE)
plot_rf
print(paste0("Number of significant features through Mean Decrease Gini : ", length(var3)))
## [1] "Number of significant features through Mean Decrease Gini : 18"
Combine all the significant features extracted above into a single set of important features.
imp_cols <- c(var1, var2, var3)
imp_cols <- unique(imp_cols)
length(imp_cols)
## [1] 51
imp_cols
## [1] "MSSubClass" "MSZoning" "LotFrontage"
## [4] "LotArea" "LotShape" "HouseStyle"
## [7] "OverallQual" "OverallCond" "MasVnrType"
## [10] "ExterQual" "BsmtQual" "BsmtExposure"
## [13] "BsmtFinType1" "BsmtFinSF1" "BsmtUnfSF"
## [16] "TotalBsmtSF" "HeatingQC" "CentralAir"
## [19] "X1stFlrSF" "GrLivArea" "BsmtFullBath"
## [22] "FullBath" "HalfBath" "BedroomAbvGr"
## [25] "KitchenQual" "TotRmsAbvGrd" "Fireplaces"
## [28] "GarageFinish" "GarageCars" "GarageArea"
## [31] "SaleType" "SaleCondition" "HasPavedDrive"
## [34] "GarageAttchd" "GarageDetchd" "ElectricalSB"
## [37] "ElectricalFuse" "Neighborhood_bin" "yrs_since_built"
## [40] "yrs_since_remod" "TotalArea" "TotalPorch"
## [43] "TotalBath" "TotalFlrSF" "OverallGrade"
## [46] "BsmtGrade" "KitchenScore" "ExterGrade"
## [49] "HasMasVnrArea" "HasWoodDeckSF" "HasOpenPorchSF"
final_trainData <- trainCleaned[, imp_cols]
final_trainData$SalePrice <- trainCleaned$SalePrice
final_testData <- testScaled[, imp_cols]
set.seed(123) # set seed to ensure you always have same random numbers generated
# splits the data in the ratio given by SplitRatio: roughly 70% of the rows are marked logical TRUE and the remaining are marked logical FALSE
sample = sample.split(final_trainData$SalePrice, SplitRatio = 0.70)
# creates the training dataset from the rows marked as TRUE
ames_trainData = subset(final_trainData, sample ==TRUE)
ames_validationData = subset(final_trainData, sample==FALSE)
ames_testData = final_testData
myControl = trainControl(method = "cv", number = 5, verboseIter = FALSE) # five-fold cross-validation
model_lm = caret::train(SalePrice ~ ., data=ames_trainData, method="lm", trControl=myControl, metric="RMSE")
# Extract Predictions
model_lm_pred <- predict(model_lm, ames_validationData)
# Extract Residuals
model_lm_resid <- ames_validationData$SalePrice - model_lm_pred
# Calculate RMSE
rmse.test.model.lm <- sqrt(mean(model_lm_resid^2, na.rm = TRUE))
valid.model.lm <- data.frame(cbind(model="Linear", rmse=rmse.test.model.lm))
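The same predict / residual / RMSE pattern repeats for each model below; a small helper (a sketch, not part of the original pipeline) would remove the duplication:
# Hypothetical helper: RMSE of a fitted caret model on new data
rmse_on <- function(model, newdata, actual) {
  sqrt(mean((actual - predict(model, newdata))^2, na.rm = TRUE))
}
# e.g. rmse_on(model_lm, ames_validationData, ames_validationData$SalePrice)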
model_rpart = caret::train(SalePrice ~ ., data=ames_trainData, method="rpart", tuneLength=10, trControl=myControl, metric="RMSE")
# Extract Predictions
model_rpart_pred <- predict(model_rpart, ames_validationData)
# Extract Residuals
model_rpart_resid <- ames_validationData$SalePrice - model_rpart_pred
# Calculate RMSE
rmse.test.model.rpart <- sqrt(mean(model_rpart_resid^2, na.rm = TRUE))
valid.model.rpart <- data.frame(cbind(model="Decision Tree", rmse=rmse.test.model.rpart))
mtry <- ncol(ames_trainData) - 1 # number of predictors (SalePrice excluded)
tunegrid <- expand.grid(.mtry=mtry)
model_rf = caret::train(SalePrice ~ ., data=ames_trainData, method="rf", tuneGrid=tunegrid, ntree=100, trControl=myControl, metric="RMSE")
# Extract Predictions
model_rf_pred <- predict(model_rf, ames_validationData)
# Extract Residuals
model_rf_resid <- ames_validationData$SalePrice - model_rf_pred
# Calculate RMSE
rmse.test.model.rf <- sqrt(mean(model_rf_resid^2, na.rm = TRUE))
valid.model.rf <- data.frame(cbind(model="Random Forest", rmse=rmse.test.model.rf))
tunegrid <- expand.grid(nrounds=750, max_depth=8, eta=0.01, colsample_bytree=1, min_child_weight=2, subsample=0.6, gamma=0.01)
model_xgb = caret::train(SalePrice ~ ., data=ames_trainData, method="xgbTree", tuneGrid=tunegrid, trControl=myControl, metric="RMSE", verbose=T, nthread =3)
#model_xgb = caret::train(SalePrice ~ ., data=ames_trainData, method="xgbDART", trControl=myControl, metric="RMSE")
# Extract Predictions
model_xgb_pred <- predict(model_xgb, ames_validationData)
# Extract Residuals
model_xgb_resid <- ames_validationData$SalePrice - model_xgb_pred
# Calculate RMSE
rmse.test.model.xgb <- sqrt(mean(model_xgb_resid^2, na.rm = TRUE))
valid.model.xgb <- data.frame(cbind(model="eXtreme Gradient Boosting", rmse=rmse.test.model.xgb))
model_treebag = caret::train(SalePrice ~ ., data=ames_trainData, method="treebag", trControl=myControl, metric="RMSE")
# Extract Predictions
model_treebag_pred <- predict(model_treebag, ames_validationData)
# Extract Residuals
model_treebag_resid <- ames_validationData$SalePrice - model_treebag_pred
# Calculate RMSE
rmse.test.model.treebag <- sqrt(mean(model_treebag_resid^2, na.rm = TRUE))
valid.model.treebag <- data.frame(cbind(model="Bagged Tree", rmse=rmse.test.model.treebag))
model_bglm = caret::train(SalePrice ~ ., data=ames_trainData, method="glmboost", trControl=myControl, metric="RMSE")
# Extract Predictions
model_bglm_pred <- predict(model_bglm, ames_validationData)
# Extract Residuals
model_bglm_resid <- ames_validationData$SalePrice - model_bglm_pred
# Calculate RMSE
rmse.test.model.bglm <- sqrt(mean(model_bglm_resid^2, na.rm = TRUE))
valid.model.bglm <- data.frame(cbind(model="Boosted GLM", rmse=rmse.test.model.bglm))
validation_model <- rbind(valid.model.lm, valid.model.rf, valid.model.rpart, valid.model.xgb, valid.model.treebag, valid.model.bglm)
validation_model
## model rmse
## 1 Linear 0.131537110952662
## 2 Random Forest 0.145441879390337
## 3 Decision Tree 0.221395305550647
## 4 eXtreme Gradient Boosting 0.121532667903922
## 5 Bagged Tree 0.186662063078336
## 6 Boosted GLM 0.134744883658696
model_list = list(lm=model_lm, rf=model_rf, rpart=model_rpart, xgboost=model_xgb, treebag=model_treebag, boostglm=model_bglm)
resamples = resamples(model_list)
summary(resamples)
##
## Call:
## summary.resamples(object = resamples)
##
## Models: lm, rf, rpart, xgboost, treebag, boostglm
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## lm 0.08591649 0.08897851 0.09322397 0.09199129 0.09322433 0.09861315
## rf 0.09458390 0.09478176 0.09911968 0.09942623 0.10406481 0.10458103
## rpart 0.14793635 0.15105955 0.15390938 0.15457795 0.15561791 0.16436659
## xgboost 0.07900268 0.07928686 0.09056147 0.08770904 0.09325510 0.09643910
## treebag 0.12827950 0.13144696 0.13453027 0.13531570 0.13643086 0.14589090
## boostglm 0.09126963 0.09790826 0.09840963 0.09941597 0.10145881 0.10803352
## NA's
## lm 0
## rf 0
## rpart 0
## xgboost 0
## treebag 0
## boostglm 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lm 0.1153665 0.1245304 0.1314734 0.1353970 0.1517409 0.1538738 0
## rf 0.1311499 0.1419406 0.1431966 0.1459131 0.1497471 0.1635315 0
## rpart 0.1913057 0.2087365 0.2096506 0.2103783 0.2177584 0.2244403 0
## xgboost 0.1113762 0.1157546 0.1305864 0.1301929 0.1412667 0.1519807 0
## treebag 0.1713135 0.1754272 0.1883920 0.1853247 0.1897662 0.2017244 0
## boostglm 0.1196438 0.1295988 0.1335208 0.1405935 0.1531422 0.1670618 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lm 0.8632114 0.8845830 0.8916825 0.8884827 0.8952200 0.9077167 0
## rf 0.8310874 0.8613636 0.8694012 0.8712425 0.8880499 0.9063103 0
## rpart 0.6835327 0.7171496 0.7436434 0.7332021 0.7444074 0.7772773 0
## xgboost 0.8611971 0.8946406 0.9006626 0.8991990 0.9151309 0.9243636 0
## treebag 0.7832584 0.7896091 0.7943940 0.7936107 0.7950630 0.8057292 0
## boostglm 0.8351873 0.8651526 0.8869678 0.8792133 0.8953109 0.9134477 0
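For a visual comparison of the resampling distributions, caret’s resamples object supports lattice plots (a quick sketch; lattice is attached along with caret):
bwplot(resamples, metric = "RMSE") # box-and-whisker of cross-validated RMSE per model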
set.seed(123)
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 1, number = 4, allowParallel=T)
#xgb.grid <- expand.grid(nrounds = 750,
# eta = c(0.01,0.005,0.001),
# max_depth = c(4,6,8),
# colsample_bytree=c(1,5,10),
# min_child_weight = 2,
# subsample=c(0.2,0.4,0.6),
# gamma=0.01)
#xgb_tune <- caret::train(SalePrice ~ .,
# data=ames_trainData,
# method="xgbTree",
# trControl=cv.ctrl,
# tuneGrid=xgb.grid,
# verbose=T,
# metric="RMSE",
# nthread =3)
#xgb_tune
# After hyper-parameter tuning through the above code, the below parameters are selected for building the final prediction model
xgb.bst.params <- expand.grid(nrounds = 1000,
colsample_bytree=1,
eta=0.010,
max_depth=8,
min_child_weight=3,
gamma=0.01, # less overfit
subsample=0.4)
final.xgb.model <- caret::train(SalePrice ~ ., data=ames_trainData, method="xgbTree",
trControl=cv.ctrl, tuneGrid=xgb.bst.params, verbose=T,
metric="RMSE", nthread =3)
final.xgb.model
## eXtreme Gradient Boosting
##
## 1010 samples
## 51 predictor
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 1 times)
## Summary of sample sizes: 757, 759, 757, 757
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1294419 0.8980078 0.08742543
##
## Tuning parameter 'nrounds' was held constant at a value of 1000
## Tuning parameter 'colsample_bytree' was held constant at a value of 1
## Tuning parameter 'min_child_weight' was held constant at a value of 3
## Tuning parameter 'subsample' was held constant at a value of 0.4
actual.pred.train <- data.frame(cbind(SalePrice=exp(ames_trainData$SalePrice),
                                      PredSalePrice=exp(predict(final.xgb.model)),
                                      Residuals=exp(residuals(final.xgb.model)))) # exp of a log-scale residual equals the actual/predicted ratio
summary(actual.pred.train)
## SalePrice PredSalePrice Residuals
## Min. : 34900 Min. : 36471 Min. :0.5559
## 1st Qu.:129000 1st Qu.:128966 1st Qu.:0.9708
## Median :161875 Median :159844 Median :1.0020
## Mean :179879 Mean :178878 Mean :1.0024
## 3rd Qu.:213438 3rd Qu.:210609 3rd Qu.:1.0317
## Max. :745000 Max. :578701 Max. :1.4297
final.rmse.train.xgb <- sqrt(mean((exp(ames_trainData$SalePrice) - exp(predict(final.xgb.model)))^2, na.rm = TRUE))
final.rmse.train.xgb
## [1] 12830.52
# Extract Predictions for Validation Dataset
final.valid.pred <- predict(final.xgb.model, ames_validationData)
# Extract Residuals for Validation Dataset
final.valid.resid <- ames_validationData$SalePrice - final.valid.pred
actual.pred.valid <- data.frame(cbind(SalePrice=exp(ames_validationData$SalePrice),
PredSalePrice=exp(final.valid.pred),
Residuals=exp(final.valid.resid)))
summary(actual.pred.valid)
## SalePrice PredSalePrice Residuals
## Min. : 40000 Min. : 68563 Min. :0.3412
## 1st Qu.:130750 1st Qu.:131472 1st Qu.:0.9515
## Median :165000 Median :163492 Median :1.0127
## Mean :183261 Mean :181207 Mean :1.0113
## 3rd Qu.:214750 3rd Qu.:212853 3rd Qu.:1.0659
## Max. :755000 Max. :541505 Max. :1.5850
# Calculate RMSE for validation Dataset
final.rmse.valid.xgb <- sqrt(mean((exp(ames_validationData$SalePrice) - exp(final.valid.pred))^2, na.rm = TRUE))
final.rmse.valid.xgb
## [1] 29728.3
# Extract Predictions for Test Dataset
final.test.pred <- predict(final.xgb.model, ames_testData)
test.predictions <- data.frame(cbind(Id=testData$Id,
SalePrice=exp(final.test.pred)))
# Write Predictions of Test Dataset into csv file
write.csv(test.predictions, "C:/Users/ajana/Desktop/Imarticus Project/R Project/Data Set/test_predict.csv", row.names = FALSE) # row.names = FALSE keeps only the Id and SalePrice columns