Ames Housing Authority is a public housing agency that serves the city of Ames, Iowa, US. It helps provide decent and safe rental housing for eligible low-income families, the elderly, and persons with disabilities. The housing authority has collected 79 assessment parameters that describe every aspect of residential homes in Ames. These variables focus on the quality and quantity of the physical attributes of a property; most are exactly the type of information a typical home buyer would want to know about a potential property.
Problem Statement: Predict Home Sale Price for the Test Data Set with the lowest possible error.
# Load the package that contains the full dataset and necessary libraries for our work.
options(warn = -1) # Suppress Warnings
library(ggplot2) # for some amazing looking graphs
library(MASS) # Library for the Box-Cox transform toward the end
library(corrplot) # Plotting a nice correlation matrix
library(dplyr) # Library for splitting train & test dataset
library(caret)
library(caTools) # loading caTools library
library(e1071)
library(rpart)
library(randomForest)
library(ranger)
library(xgboost)
library(plyr)
library(mlr)
library(rJava)
library(FSelector)
library(randomForestSRC)
library(stringr)
library(mboost)
library(ipred)
Load the train and test datasets. Let’s look at how many observations and how many features each dataset contains.
# Load the Ames Housing dataset - this is the training dataset through which we will build the model
setwd("C:/Users/ajana/Desktop/Imarticus Project/R Project/Data Set/") # setting dataset directory
trainData = read.csv("Group Project on R-Data Set-4.csv")
testData = read.csv("Group Project on R-Data Set-3.csv")
target <- 'SalePrice'
predictors <- setdiff(colnames(trainData), target)
# Saving train data index for splitting later
trainingRowIndex <- nrow(trainData)
trainingSample <- nrow(trainData)
testingRowIndex <- trainingRowIndex + 1
testingSample <- nrow(testData)
totalFeatures <- length(predictors)
totalSample <- trainingSample + testingSample
stmt <- paste("Total number of observations in Train Dataset is ", trainingSample, " and Test Dataset is ", testingSample)
print(stmt, na.print = NULL)
## [1] "Total number of observations in Train Dataset is 1460 and Test Dataset is 1459"
stmt <- paste("Total number of relevant Features in Train Dataset ", totalFeatures)
print(stmt, na.print = NULL)
## [1] "Total number of relevant Features in Train Dataset 80"
# Removing Id (unique identifier), which has no significance for the model
trainData_wrk <- select(trainData, -Id, -SalePrice)
testData_wrk <- select(testData, -Id)
# Combining training and testing datasets for joint analysis, feature cleaning and changes
completeData <- rbind(trainData_wrk, testData_wrk)
num_vars<-colnames(completeData[sapply(completeData, is.numeric) == TRUE])
cat_vars<-colnames(completeData[sapply(completeData, is.factor) == TRUE])
stmt <- paste("There are", length(num_vars), "numerical features and ", length(cat_vars), "categorical features" )
print(stmt, na.print = NULL)
## [1] "There are 36 numerical features and 43 categorical features"
Let’s gather null / missing value statistics and identify each feature’s percentage of missing values against the overall population.
summary(completeData) # Summarize the data of Housing Prices dataset before null values treatment
## MSSubClass MSZoning LotFrontage LotArea
## Min. : 20.00 C (all): 25 Min. : 21.00 Min. : 1300
## 1st Qu.: 20.00 FV : 139 1st Qu.: 59.00 1st Qu.: 7478
## Median : 50.00 RH : 26 Median : 68.00 Median : 9453
## Mean : 57.14 RL :2265 Mean : 69.31 Mean : 10168
## 3rd Qu.: 70.00 RM : 460 3rd Qu.: 80.00 3rd Qu.: 11570
## Max. :190.00 NA's : 4 Max. :313.00 Max. :215245
## NA's :486
## Street Alley LotShape LandContour Utilities
## Grvl: 12 Grvl: 120 IR1: 968 Bnk: 117 AllPub:2916
## Pave:2907 Pave: 78 IR2: 76 HLS: 120 NoSeWa: 1
## NA's:2721 IR3: 16 Low: 60 NA's : 2
## Reg:1859 Lvl:2622
##
##
##
## LotConfig LandSlope Neighborhood Condition1 Condition2
## Corner : 511 Gtl:2778 NAmes : 443 Norm :2511 Norm :2889
## CulDSac: 176 Mod: 125 CollgCr: 267 Feedr : 164 Feedr : 13
## FR2 : 85 Sev: 16 OldTown: 239 Artery : 92 Artery : 5
## FR3 : 14 Edwards: 194 RRAn : 50 PosA : 4
## Inside :2133 Somerst: 182 PosN : 39 PosN : 4
## NridgHt: 166 RRAe : 28 RRNn : 2
## (Other):1428 (Other): 35 (Other): 2
## BldgType HouseStyle OverallQual OverallCond
## 1Fam :2425 1Story :1471 Min. : 1.000 Min. :1.000
## 2fmCon: 62 2Story : 872 1st Qu.: 5.000 1st Qu.:5.000
## Duplex: 109 1.5Fin : 314 Median : 6.000 Median :5.000
## Twnhs : 96 SLvl : 128 Mean : 6.089 Mean :5.565
## TwnhsE: 227 SFoyer : 83 3rd Qu.: 7.000 3rd Qu.:6.000
## 2.5Unf : 24 Max. :10.000 Max. :9.000
## (Other): 27
## YearBuilt YearRemodAdd RoofStyle RoofMatl
## Min. :1872 Min. :1950 Flat : 20 CompShg:2876
## 1st Qu.:1954 1st Qu.:1965 Gable :2310 Tar&Grv: 23
## Median :1973 Median :1993 Gambrel: 22 WdShake: 9
## Mean :1971 Mean :1984 Hip : 551 WdShngl: 7
## 3rd Qu.:2001 3rd Qu.:2004 Mansard: 11 ClyTile: 1
## Max. :2010 Max. :2010 Shed : 5 Membran: 1
## (Other): 2
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual
## VinylSd:1025 VinylSd:1014 BrkCmn : 25 Min. : 0.0 Ex: 107
## MetalSd: 450 MetalSd: 447 BrkFace: 879 1st Qu.: 0.0 Fa: 35
## HdBoard: 442 HdBoard: 406 None :1742 Median : 0.0 Gd: 979
## Wd Sdng: 411 Wd Sdng: 391 Stone : 249 Mean : 102.2 TA:1798
## Plywood: 221 Plywood: 270 NA's : 24 3rd Qu.: 164.0
## (Other): 369 (Other): 390 Max. :1600.0
## NA's : 1 NA's : 1 NA's :23
## ExterCond Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## Ex: 12 BrkTil: 311 Ex : 258 Fa : 104 Av : 418 ALQ :429
## Fa: 67 CBlock:1235 Fa : 88 Gd : 122 Gd : 276 BLQ :269
## Gd: 299 PConc :1308 Gd :1209 Po : 5 Mn : 239 GLQ :849
## Po: 3 Slab : 49 TA :1283 TA :2606 No :1904 LwQ :154
## TA:2538 Stone : 11 NA's: 81 NA's: 82 NA's: 82 Rec :288
## Wood : 5 Unf :851
## NA's: 79
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ : 52 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ : 68 1st Qu.: 0.00 1st Qu.: 220.0
## Median : 368.5 GLQ : 34 Median : 0.00 Median : 467.0
## Mean : 441.4 LwQ : 87 Mean : 49.58 Mean : 560.8
## 3rd Qu.: 733.0 Rec : 105 3rd Qu.: 0.00 3rd Qu.: 805.5
## Max. :5644.0 Unf :2493 Max. :1526.00 Max. :2336.0
## NA's :1 NA's: 80 NA's :1 NA's :1
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0.0 Floor: 1 Ex:1493 N: 196 FuseA: 188
## 1st Qu.: 793.0 GasA :2874 Fa: 92 Y:2723 FuseF: 50
## Median : 989.5 GasW : 27 Gd: 474 FuseP: 8
## Mean :1051.8 Grav : 9 Po: 3 Mix : 1
## 3rd Qu.:1302.0 OthW : 2 TA: 857 SBrkr:2671
## Max. :6110.0 Wall : 6 NA's : 1
## NA's :1
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0.0 Min. : 0.000 Min. : 334
## 1st Qu.: 876 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1126
## Median :1082 Median : 0.0 Median : 0.000 Median :1444
## Mean :1160 Mean : 336.5 Mean : 4.694 Mean :1501
## 3rd Qu.:1388 3rd Qu.: 704.0 3rd Qu.: 0.000 3rd Qu.:1744
## Max. :5095 Max. :2065.0 Max. :1064.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4299 Mean :0.06136 Mean :1.568 Mean :0.3803
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :4.000 Max. :2.0000
## NA's :2 NA's :2
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## Min. :0.00 Min. :0.000 Ex : 205 Min. : 2.000
## 1st Qu.:2.00 1st Qu.:1.000 Fa : 70 1st Qu.: 5.000
## Median :3.00 Median :1.000 Gd :1151 Median : 6.000
## Mean :2.86 Mean :1.045 TA :1492 Mean : 6.452
## 3rd Qu.:3.00 3rd Qu.:1.000 NA's: 1 3rd Qu.: 7.000
## Max. :8.00 Max. :3.000 Max. :15.000
##
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## Typ :2717 Min. :0.0000 Ex : 43 2Types : 23 Min. :1895
## Min2 : 70 1st Qu.:0.0000 Fa : 74 Attchd :1723 1st Qu.:1960
## Min1 : 65 Median :1.0000 Gd : 744 Basment: 36 Median :1979
## Mod : 35 Mean :0.5971 Po : 46 BuiltIn: 186 Mean :1978
## Maj1 : 19 3rd Qu.:1.0000 TA : 592 CarPort: 15 3rd Qu.:2002
## (Other): 11 Max. :4.0000 NA's:1420 Detchd : 779 Max. :2207
## NA's : 2 NA's : 157 NA's :159
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## Fin : 719 Min. :0.000 Min. : 0.0 Ex : 3 Ex : 3
## RFn : 811 1st Qu.:1.000 1st Qu.: 320.0 Fa : 124 Fa : 74
## Unf :1230 Median :2.000 Median : 480.0 Gd : 24 Gd : 15
## NA's: 159 Mean :1.767 Mean : 472.9 Po : 5 Po : 14
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA :2604 TA :2654
## Max. :5.000 Max. :1488.0 NA's: 159 NA's: 159
## NA's :1 NA's :1
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## N: 216 Min. : 0.00 Min. : 0.00 Min. : 0.0
## P: 62 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0
## Y:2641 Median : 0.00 Median : 26.00 Median : 0.0
## Mean : 93.71 Mean : 47.49 Mean : 23.1
## 3rd Qu.: 168.00 3rd Qu.: 70.00 3rd Qu.: 0.0
## Max. :1424.00 Max. :742.00 Max. :1012.0
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.000 Min. : 0.00 Min. : 0.000 Ex : 4
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.000 Median : 0.00 Median : 0.000 Gd : 4
## Mean : 2.602 Mean : 16.06 Mean : 2.252 NA's:2909
## 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.000 Max. :576.00 Max. :800.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv: 118 Gar2: 5 Min. : 0.00 Min. : 1.000
## GdWo : 112 Othr: 4 1st Qu.: 0.00 1st Qu.: 4.000
## MnPrv: 329 Shed: 95 Median : 0.00 Median : 6.000
## MnWw : 12 TenC: 1 Mean : 50.83 Mean : 6.213
## NA's :2348 NA's:2814 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :17000.00 Max. :12.000
##
## YrSold SaleType SaleCondition
## Min. :2006 WD :2525 Abnorml: 190
## 1st Qu.:2007 New : 239 AdjLand: 12
## Median :2008 COD : 87 Alloca : 24
## Mean :2008 ConLD : 26 Family : 46
## 3rd Qu.:2009 CWD : 12 Normal :2402
## Max. :2010 (Other): 29 Partial: 245
## NA's : 1
## Let's check for any missing values in the data
colSums(is.na(completeData))
## MSSubClass MSZoning LotFrontage LotArea Street
## 0 4 486 0 0
## Alley LotShape LandContour Utilities LotConfig
## 2721 0 0 2 0
## LandSlope Neighborhood Condition1 Condition2 BldgType
## 0 0 0 0 0
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd
## 0 0 0 0 0
## RoofStyle RoofMatl Exterior1st Exterior2nd MasVnrType
## 0 0 1 1 24
## MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 23 0 0 0 81
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 82 82 79 1 80
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC
## 1 1 1 0 0
## CentralAir Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## 0 1 0 0 0
## GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 0 2 2 0 0
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 0 0 1 0 2
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish
## 0 1420 157 159 159
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## 1 1 159 159 0
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch ScreenPorch
## 0 0 0 0 0
## PoolArea PoolQC Fence MiscFeature MiscVal
## 0 2909 2348 2814 0
## MoSold YrSold SaleType SaleCondition
## 0 0 1 0
# create table to count null values and percentage against overall population
null_count_df <- sapply(completeData, function(y) sum(length(which(is.na(y)))))
null_count_df <- data.frame(sort(null_count_df[null_count_df>0], decreasing = TRUE))
colnames(null_count_df)[1] <- "NullCount"
null_count_df$PctNull <- round(null_count_df$NullCount / (trainingSample+testingSample), 2)
null_count_df
## NullCount PctNull
## PoolQC 2909 1.00
## MiscFeature 2814 0.96
## Alley 2721 0.93
## Fence 2348 0.80
## FireplaceQu 1420 0.49
## LotFrontage 486 0.17
## GarageYrBlt 159 0.05
## GarageFinish 159 0.05
## GarageQual 159 0.05
## GarageCond 159 0.05
## GarageType 157 0.05
## BsmtCond 82 0.03
## BsmtExposure 82 0.03
## BsmtQual 81 0.03
## BsmtFinType2 80 0.03
## BsmtFinType1 79 0.03
## MasVnrType 24 0.01
## MasVnrArea 23 0.01
## MSZoning 4 0.00
## Utilities 2 0.00
## BsmtFullBath 2 0.00
## BsmtHalfBath 2 0.00
## Functional 2 0.00
## Exterior1st 1 0.00
## Exterior2nd 1 0.00
## BsmtFinSF1 1 0.00
## BsmtFinSF2 1 0.00
## BsmtUnfSF 1 0.00
## TotalBsmtSF 1 0.00
## Electrical 1 0.00
## KitchenQual 1 0.00
## GarageCars 1 0.00
## GarageArea 1 0.00
## SaleType 1 0.00
Let’s remove the features where more than 40% of the data is either null or missing.
predictors <- colnames(completeData)
# Remove features with more than 40% null / missing values
null_col_name <- rownames(null_count_df[null_count_df$PctNull>=as.numeric(0.40), ])
null_col_name
## [1] "PoolQC" "MiscFeature" "Alley" "Fence" "FireplaceQu"
# keep only the features that were not dropped and subset the dataset accordingly
predictors <- setdiff(predictors, null_col_name)
completeData <- completeData[ , predictors]
print(paste0("Total number of features with more than 40% null values are ", length(null_col_name)))
## [1] "Total number of features with more than 40% null values are 5"
For continuous features where fewer than 40% of the values are null / missing, the nulls are replaced with the mean value of the feature.
## Identify numeric variables and impute any missing / null values with the mean
for (col_name in colnames(completeData[sapply(completeData, is.numeric) == TRUE])) {
if (sum(is.na(completeData[[col_name]])) > 0) {
completeData[col_name][is.na(completeData[col_name])] <- mean(completeData[[col_name]], na.rm = T)
stmt <- paste('Null values of', col_name, ' feature has been replaced by mean value ', mean(completeData[[col_name]], na.rm = T))
print(stmt, na.print = NULL)
}
}
## [1] "Null values of LotFrontage feature has been replaced by mean value 69.3057953144266"
## [1] "Null values of MasVnrArea feature has been replaced by mean value 102.201312154696"
## [1] "Null values of BsmtFinSF1 feature has been replaced by mean value 441.423235092529"
## [1] "Null values of BsmtFinSF2 feature has been replaced by mean value 49.5822481151474"
## [1] "Null values of BsmtUnfSF feature has been replaced by mean value 560.772104180946"
## [1] "Null values of TotalBsmtSF feature has been replaced by mean value 1051.77758738862"
## [1] "Null values of BsmtFullBath feature has been replaced by mean value 0.429893726431265"
## [1] "Null values of BsmtHalfBath feature has been replaced by mean value 0.061364415495372"
## [1] "Null values of GarageYrBlt feature has been replaced by mean value 1978.1134057971"
## [1] "Null values of GarageCars feature has been replaced by mean value 1.76662097326936"
## [1] "Null values of GarageArea feature has been replaced by mean value 472.8745716244"
For categorical features where fewer than 40% of the values are null / missing, the nulls are replaced with the mode, i.e. the value whose frequency is maximum.
# Helper returning the most frequent value(s) of a vector
Mode = function(x){
  ta = table(x)
  tam = max(ta)
  mod = names(ta)[ta == tam] # note: returns all values tied for the maximum
  return(mod)
}
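A quick sanity check of the helper (note that with tied frequencies it returns all modes, so the imputation below assumes a unique maximum):
Mode(c("a", "b", "b", "c")) # "b"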
## Identify categorical variables and impute any missing / null values with the mode
for (col_name in colnames(completeData[sapply(completeData, is.factor) == TRUE])) {
if (sum(is.na(completeData[[col_name]])) > 0) {
completeData[col_name][is.na(completeData[col_name])] <- Mode(completeData[[col_name]])
stmt <- paste('Null values of', col_name, ' feature has been replaced by mode value ', Mode(completeData[[col_name]]))
print(stmt, na.print = NULL)
}
}
## [1] "Null values of MSZoning feature has been replaced by mode value RL"
## [1] "Null values of Utilities feature has been replaced by mode value AllPub"
## [1] "Null values of Exterior1st feature has been replaced by mode value VinylSd"
## [1] "Null values of Exterior2nd feature has been replaced by mode value VinylSd"
## [1] "Null values of MasVnrType feature has been replaced by mode value None"
## [1] "Null values of BsmtQual feature has been replaced by mode value TA"
## [1] "Null values of BsmtCond feature has been replaced by mode value TA"
## [1] "Null values of BsmtExposure feature has been replaced by mode value No"
## [1] "Null values of BsmtFinType1 feature has been replaced by mode value Unf"
## [1] "Null values of BsmtFinType2 feature has been replaced by mode value Unf"
## [1] "Null values of Electrical feature has been replaced by mode value SBrkr"
## [1] "Null values of KitchenQual feature has been replaced by mode value TA"
## [1] "Null values of Functional feature has been replaced by mode value Typ"
## [1] "Null values of GarageType feature has been replaced by mode value Attchd"
## [1] "Null values of GarageFinish feature has been replaced by mode value Unf"
## [1] "Null values of GarageQual feature has been replaced by mode value TA"
## [1] "Null values of GarageCond feature has been replaced by mode value TA"
## [1] "Null values of SaleType feature has been replaced by mode value WD"
## Let's check for any missing values in the data
colSums(is.na(completeData))
## MSSubClass MSZoning LotFrontage LotArea Street
## 0 0 0 0 0
## LotShape LandContour Utilities LotConfig LandSlope
## 0 0 0 0 0
## Neighborhood Condition1 Condition2 BldgType HouseStyle
## 0 0 0 0 0
## OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle
## 0 0 0 0 0
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## 0 0 0 0 0
## ExterQual ExterCond Foundation BsmtQual BsmtCond
## 0 0 0 0 0
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 0 0 0 0 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## 0 0 0 0 0
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## 0 0 0 0 0
## BsmtFullBath BsmtHalfBath FullBath HalfBath BedroomAbvGr
## 0 0 0 0 0
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 0 0 0 0 0
## GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## 0 0 0 0 0
## GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 0 0 0 0 0
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea MiscVal
## 0 0 0 0 0
## MoSold YrSold SaleType SaleCondition
## 0 0 0 0
summary(completeData) # Summarize the data of Housing Prices dataset after null values treatment
## MSSubClass MSZoning LotFrontage LotArea
## Min. : 20.00 C (all): 25 Min. : 21.00 Min. : 1300
## 1st Qu.: 20.00 FV : 139 1st Qu.: 60.00 1st Qu.: 7478
## Median : 50.00 RH : 26 Median : 69.31 Median : 9453
## Mean : 57.14 RL :2269 Mean : 69.31 Mean : 10168
## 3rd Qu.: 70.00 RM : 460 3rd Qu.: 78.00 3rd Qu.: 11570
## Max. :190.00 Max. :313.00 Max. :215245
##
## Street LotShape LandContour Utilities LotConfig
## Grvl: 12 IR1: 968 Bnk: 117 AllPub:2918 Corner : 511
## Pave:2907 IR2: 76 HLS: 120 NoSeWa: 1 CulDSac: 176
## IR3: 16 Low: 60 FR2 : 85
## Reg:1859 Lvl:2622 FR3 : 14
## Inside :2133
##
##
## LandSlope Neighborhood Condition1 Condition2 BldgType
## Gtl:2778 NAmes : 443 Norm :2511 Norm :2889 1Fam :2425
## Mod: 125 CollgCr: 267 Feedr : 164 Feedr : 13 2fmCon: 62
## Sev: 16 OldTown: 239 Artery : 92 Artery : 5 Duplex: 109
## Edwards: 194 RRAn : 50 PosA : 4 Twnhs : 96
## Somerst: 182 PosN : 39 PosN : 4 TwnhsE: 227
## NridgHt: 166 RRAe : 28 RRNn : 2
## (Other):1428 (Other): 35 (Other): 2
## HouseStyle OverallQual OverallCond YearBuilt
## 1Story :1471 Min. : 1.000 Min. :1.000 Min. :1872
## 2Story : 872 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## 1.5Fin : 314 Median : 6.000 Median :5.000 Median :1973
## SLvl : 128 Mean : 6.089 Mean :5.565 Mean :1971
## SFoyer : 83 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2001
## 2.5Unf : 24 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 27
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Flat : 20 CompShg:2876 VinylSd:1026
## 1st Qu.:1965 Gable :2310 Tar&Grv: 23 MetalSd: 450
## Median :1993 Gambrel: 22 WdShake: 9 HdBoard: 442
## Mean :1984 Hip : 551 WdShngl: 7 Wd Sdng: 411
## 3rd Qu.:2004 Mansard: 11 ClyTile: 1 Plywood: 221
## Max. :2010 Shed : 5 Membran: 1 CemntBd: 126
## (Other): 2 (Other): 243
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond
## VinylSd:1015 BrkCmn : 25 Min. : 0.0 Ex: 107 Ex: 12
## MetalSd: 447 BrkFace: 879 1st Qu.: 0.0 Fa: 35 Fa: 67
## HdBoard: 406 None :1766 Median : 0.0 Gd: 979 Gd: 299
## Wd Sdng: 391 Stone : 249 Mean : 102.2 TA:1798 Po: 3
## Plywood: 270 3rd Qu.: 163.5 TA:2538
## CmentBd: 126 Max. :1600.0
## (Other): 264
## Foundation BsmtQual BsmtCond BsmtExposure BsmtFinType1
## BrkTil: 311 Ex: 258 Fa: 104 Av: 418 ALQ:429
## CBlock:1235 Fa: 88 Gd: 122 Gd: 276 BLQ:269
## PConc :1308 Gd:1209 Po: 5 Mn: 239 GLQ:849
## Slab : 49 TA:1364 TA:2688 No:1986 LwQ:154
## Stone : 11 Rec:288
## Wood : 5 Unf:930
##
## BsmtFinSF1 BsmtFinType2 BsmtFinSF2 BsmtUnfSF
## Min. : 0.0 ALQ: 52 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.0 BLQ: 68 1st Qu.: 0.00 1st Qu.: 220.0
## Median : 369.0 GLQ: 34 Median : 0.00 Median : 467.0
## Mean : 441.4 LwQ: 87 Mean : 49.58 Mean : 560.8
## 3rd Qu.: 733.0 Rec: 105 3rd Qu.: 0.00 3rd Qu.: 805.0
## Max. :5644.0 Unf:2573 Max. :1526.00 Max. :2336.0
##
## TotalBsmtSF Heating HeatingQC CentralAir Electrical
## Min. : 0 Floor: 1 Ex:1493 N: 196 FuseA: 188
## 1st Qu.: 793 GasA :2874 Fa: 92 Y:2723 FuseF: 50
## Median : 990 GasW : 27 Gd: 474 FuseP: 8
## Mean :1052 Grav : 9 Po: 3 Mix : 1
## 3rd Qu.:1302 OthW : 2 TA: 857 SBrkr:2672
## Max. :6110 Wall : 6
##
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea
## Min. : 334 Min. : 0.0 Min. : 0.000 Min. : 334
## 1st Qu.: 876 1st Qu.: 0.0 1st Qu.: 0.000 1st Qu.:1126
## Median :1082 Median : 0.0 Median : 0.000 Median :1444
## Mean :1160 Mean : 336.5 Mean : 4.694 Mean :1501
## 3rd Qu.:1388 3rd Qu.: 704.0 3rd Qu.: 0.000 3rd Qu.:1744
## Max. :5095 Max. :2065.0 Max. :1064.000 Max. :5642
##
## BsmtFullBath BsmtHalfBath FullBath HalfBath
## Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :2.000 Median :0.0000
## Mean :0.4299 Mean :0.06136 Mean :1.568 Mean :0.3803
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :3.0000 Max. :2.00000 Max. :4.000 Max. :2.0000
##
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.00 Min. :0.000 Ex: 205 Min. : 2.000 Maj1: 19
## 1st Qu.:2.00 1st Qu.:1.000 Fa: 70 1st Qu.: 5.000 Maj2: 9
## Median :3.00 Median :1.000 Gd:1151 Median : 6.000 Min1: 65
## Mean :2.86 Mean :1.045 TA:1493 Mean : 6.452 Min2: 70
## 3rd Qu.:3.00 3rd Qu.:1.000 3rd Qu.: 7.000 Mod : 35
## Max. :8.00 Max. :3.000 Max. :15.000 Sev : 2
## Typ :2719
## Fireplaces GarageType GarageYrBlt GarageFinish
## Min. :0.0000 2Types : 23 Min. :1895 Fin: 719
## 1st Qu.:0.0000 Attchd :1880 1st Qu.:1962 RFn: 811
## Median :1.0000 Basment: 36 Median :1978 Unf:1389
## Mean :0.5971 BuiltIn: 186 Mean :1978
## 3rd Qu.:1.0000 CarPort: 15 3rd Qu.:2001
## Max. :4.0000 Detchd : 779 Max. :2207
##
## GarageCars GarageArea GarageQual GarageCond PavedDrive
## Min. :0.000 Min. : 0.0 Ex: 3 Ex: 3 N: 216
## 1st Qu.:1.000 1st Qu.: 320.0 Fa: 124 Fa: 74 P: 62
## Median :2.000 Median : 480.0 Gd: 24 Gd: 15 Y:2641
## Mean :1.767 Mean : 472.9 Po: 5 Po: 14
## 3rd Qu.:2.000 3rd Qu.: 576.0 TA:2763 TA:2813
## Max. :5.000 Max. :1488.0
##
## WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.0 1st Qu.: 0.000
## Median : 0.00 Median : 26.00 Median : 0.0 Median : 0.000
## Mean : 93.71 Mean : 47.49 Mean : 23.1 Mean : 2.602
## 3rd Qu.: 168.00 3rd Qu.: 70.00 3rd Qu.: 0.0 3rd Qu.: 0.000
## Max. :1424.00 Max. :742.00 Max. :1012.0 Max. :508.000
##
## ScreenPorch PoolArea MiscVal MoSold
## Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 1.000
## 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 4.000
## Median : 0.00 Median : 0.000 Median : 0.00 Median : 6.000
## Mean : 16.06 Mean : 2.252 Mean : 50.83 Mean : 6.213
## 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :576.00 Max. :800.000 Max. :17000.00 Max. :12.000
##
## YrSold SaleType SaleCondition
## Min. :2006 WD :2526 Abnorml: 190
## 1st Qu.:2007 New : 239 AdjLand: 12
## Median :2008 COD : 87 Alloca : 24
## Mean :2008 ConLD : 26 Family : 46
## 3rd Qu.:2009 CWD : 12 Normal :2402
## Max. :2010 ConLI : 9 Partial: 245
## (Other): 20
We can observe that features such as Street and Utilities don’t have much variance; hence, we can remove these features from the dataset.
# helper function for plotting categoric data for easier data visualization
plot.categoric <- function(cols, df){
  for (col in cols) {
    # order the bars by descending frequency (use df, the data actually being plotted)
    order.cols <- names(sort(table(df[,col]), decreasing = TRUE))
    num.plot <- qplot(df[,col]) +
      geom_bar(fill = 'cornflowerblue') +
      geom_text(aes(label = ..count..), stat='count', vjust=-0.5) +
      theme_minimal() +
      scale_y_continuous(limits = c(0,max(table(df[,col]))*1.1)) +
      scale_x_discrete(limits = order.cols) +
      xlab(col) +
      theme(axis.text.x = element_text(angle = 30, size=12))
    print(num.plot)
  }
}
# function that groups a column by its levels and returns the mean SalePrice (and mean OverallQual) for each unique level.
group.df <- completeData[1:trainingRowIndex,]
group.df$SalePrice <- trainData$SalePrice
group.prices <- function(col) {
group.table <- group.df[,c(col, 'SalePrice', 'OverallQual')] %>%
group_by_(col) %>%
dplyr::summarise(mean.Quality = round(mean(OverallQual),2),
mean.Price = mean(SalePrice), n = n()) %>%
arrange(mean.Quality)
print(qplot(x=reorder(group.table[[col]], -group.table[['mean.Price']]), y=group.table[['mean.Price']]) +
geom_bar(stat='identity', fill='cornflowerblue') +
theme_minimal() +
labs(x=col, y='Mean SalePrice') +
theme(axis.text.x = element_text(angle = 45)))
return(data.frame(group.table))
}
## function to compute the mean overall quality for each level
quality.mean <- function(col) {
group.table <- group.df[,c(col, 'OverallQual')] %>%
group_by_(col) %>%
dplyr::summarise(mean.qual = mean(OverallQual)) %>%
arrange(mean.qual)
return(data.frame(group.table))
}
# function that maps categoric values to their corresponding numeric values and returns the updated data frame
map.values <- function(cols, map.list, df){
for (col in cols){
df[[col]] <- as.numeric(map.list[df[,col]])
}
return(df)
}
# Year Bin - min year is 1872, max year is 2010, every 20 years create a new bin, 7 total bins
# Maybe for later versions :)
Let’s look into the distribution of some numerical features. If a distribution is skewed, we log-transform the variable so that the data is closer to normally distributed, which typically improves model performance.
ggplot(trainData, aes(SalePrice)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(trainData$SalePrice), sd = sd(trainData$SalePrice))) +
ggtitle("Plot of SalePrice Distribution") + theme(plot.title = element_text(size = 11))
ggplot(trainData, aes(log(SalePrice))) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(log(trainData$SalePrice)),
sd = sd(log(trainData$SalePrice)))) +
ggtitle("Plot of log(SalePrice) Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(LotFrontage)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(completeData$LotFrontage),
sd = sd(completeData$LotFrontage))) +
ggtitle("Plot of LotFrontage Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(log(LotFrontage))) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(log(completeData$LotFrontage)),
sd = sd(log(completeData$LotFrontage)))) +
ggtitle("Plot of log(LotFrontage) Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(LotArea)) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(completeData$LotArea),
sd = sd(completeData$LotArea))) +
ggtitle("Plot of LotArea Distribution") + theme(plot.title = element_text(size = 11))
ggplot(completeData, aes(log(LotArea))) + geom_histogram(aes(y = ..density..)) +
stat_function(fun = dnorm, color = "red", args = list(mean = mean(log(completeData$LotArea)),
sd = sd(log(completeData$LotArea)))) +
ggtitle("Plot of log(LotArea) Distribution") + theme(plot.title = element_text(size = 11))
As we observe above, log transformation makes the distributions noticeably closer to normal. Hence, it is best to use log-transformed variables for building our model, which should give better performance.
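As a quick numeric check of what the plots show, the sketch below uses the skewness function from the e1071 package loaded earlier; values closer to 0 indicate a more symmetric distribution.
# Sample skewness before and after the log transform (closer to 0 is more symmetric)
sapply(list(SalePrice = trainData$SalePrice,
            logSalePrice = log(trainData$SalePrice),
            LotArea = completeData$LotArea,
            logLotArea = log(completeData$LotArea)),
       skewness)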
Based on categorical feature values and their corresponding SalePrice, we now create custom features, starting with some EDA on Utilities.
#trainScaled = completeData[1:trainingRowIndex, ]
#testScaled = completeData[testingRowIndex:totalSample, ]
#trainScaled$SalePrice <- trainData$SalePrice
plot.categoric('Utilities', completeData)
Utilities has only 1 value of NoSeWa and the rest are AllPub. We can drop this feature from our dataset: the house with ‘NoSeWa’ is in our training set, so the level won’t help with any predictive modelling, and the feature has hardly any variance.
plot.categoric('Street', completeData)
Street has only 12 values of Grvl and the rest are Pave. We can drop this feature from our dataset as well, since it has hardly any variance.
plot.categoric('PavedDrive', completeData)
completeData$HasPavedDrive <- (completeData$PavedDrive == 'Y') * 1
Most of the values for the PavedDrive feature are Y, so we create a binary feature indicating whether a house has a paved drive.
plot.categoric('GarageType', completeData)
completeData$GarageAttchd <- ifelse(completeData$GarageType == "Attchd" | completeData$GarageType == "BuiltIn", 1, 0)
completeData$GarageAttchd <- as.integer(completeData$GarageAttchd)
completeData$GarageDetchd <- (completeData$GarageType == 'Detchd') * 1
Most of the values for the GarageType feature are Attchd, Detchd and BuiltIn, so we create one binary feature for an attached garage (Attchd or BuiltIn) and another for a detached garage.
plot.categoric('LandContour', completeData)
completeData$LandLeveled <- (completeData$LandContour == 'Lvl') * 1
Most of the values for the LandContour feature are Lvl, so we create a binary feature indicating whether the land contour is leveled.
plot.categoric('LandSlope', completeData)
completeData$LandSlopeGentle <- (completeData$LandSlope == 'Gtl') * 1
Most of the values for the LandSlope feature are Gtl, meaning gentle, so we create a binary feature indicating whether the land slope is gentle.
plot.categoric('Electrical', completeData)
completeData$ElectricalSB <- (completeData$Electrical == 'SBrkr') * 1
completeData$ElectricalFuse <- (completeData$Electrical == 'FuseA' | completeData$Electrical == 'FuseF' | completeData$Electrical == 'FuseP') * 1
Most of the values for the Electrical feature are SBrkr or one of the fuse types, so we create one binary feature for SBrkr and another that combines all the fuse types.
group.prices('GarageQual')
## GarageQual mean.Quality mean.Price n
## 1 Fa 5.17 123573.4 48
## 2 Po 5.33 100166.7 3
## 3 TA 6.12 182591.9 1392
## 4 Gd 6.71 215860.7 14
## 5 Ex 7.00 241000.0 3
group.prices('BsmtCond')
## BsmtCond mean.Quality mean.Price n
## 1 Po 3.00 64000.0 2
## 2 Fa 5.07 121809.5 45
## 3 TA 6.11 181492.2 1348
## 4 Gd 6.72 213599.9 65
qual.cols <- c('ExterQual', 'ExterCond', 'GarageQual', 'GarageCond', 'KitchenQual', 'HeatingQC', 'BsmtQual', 'BsmtCond')
qual.list <- c('Po' = 1, 'Fa' = 2, 'TA' = 3, 'Gd' = 4, 'Ex' = 5)
completeData <- map.values(qual.cols, qual.list, completeData)
From the mean sale prices of a few of the quality and condition features we can infer that the abbreviations mean poor, fair, typical/average, good and excellent. We map these categoric values to the numeric values 1-5 and write them back into our dataframe.
group.prices('BsmtFinType1')
## BsmtFinType1 mean.Quality mean.Price n
## 1 BLQ 5.35 149493.7 148
## 2 Rec 5.35 146889.2 133
## 3 LwQ 5.54 151852.7 74
## 4 ALQ 5.55 161573.1 220
## 5 Unf 6.05 165519.3 467
## 6 GLQ 7.05 235413.7 418
Here the mean sale prices might not be as helpful as the median basement areas of both columns for determining which quality level is better than another.
# visualization for BsmtFinType1 against BsmtFinSF1
completeData[,c('BsmtFinType1', 'BsmtFinSF1')] %>%
group_by(BsmtFinType1) %>% dplyr::summarise(medianArea = median(BsmtFinSF1)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(BsmtFinType1,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='BsmtFinType1', y='Median of BsmtFinSF1') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) +
scale_y_continuous(limits = c(0,850)) + theme_minimal()
# visualization for BsmtFinType2 against BsmtFinSF2
completeData[,c('BsmtFinType2', 'BsmtFinSF2')] %>%
group_by(BsmtFinType2) %>% dplyr::summarise(medianArea = median(BsmtFinSF2)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(BsmtFinType2,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='BsmtFinType2', y='Median of BsmtFinSF2') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) +
scale_y_continuous(limits = c(0,850)) + theme_minimal()
bsmt.fin.list <- c('Unf' = 1, 'LwQ' = 2,'Rec'= 3, 'BLQ' = 4, 'ALQ' = 5, 'GLQ' = 6)
completeData <- map.values(c('BsmtFinType1','BsmtFinType2'), bsmt.fin.list, completeData)
Through investigating the relationships between basement quality and area we can see the true order of basement qualities to be ‘Unf’ < ‘LwQ’ < ‘Rec’ < ‘BLQ’ < ‘ALQ’ < ‘GLQ’.
group.prices('BsmtExposure')
## BsmtExposure mean.Quality mean.Price n
## 1 No 5.86 163439.2 991
## 2 Mn 6.25 192789.7 114
## 3 Av 6.57 206643.4 221
## 4 Gd 6.96 257689.8 134
bsmt.list <- c('No' = 1, 'Mn' = 2, 'Av' = 3, 'Gd' = 4)
completeData = map.values(c('BsmtExposure'), bsmt.list, completeData)
group.prices('Functional')
## Functional mean.Quality mean.Price n
## 1 Min2 4.97 144240.6 34
## 2 Maj2 5.00 85800.0 5
## 3 Min1 5.26 146385.5 31
## 4 Mod 5.40 168393.3 15
## 5 Maj1 5.50 153948.1 14
## 6 Sev 6.00 129000.0 1
## 7 Typ 6.16 183429.1 1360
functional.list <- c('Maj2' = 1, 'Sev' = 2, 'Min2' = 3, 'Min1' = 4, 'Maj1' = 5, 'Mod' = 6, 'Typ' = 7)
completeData = map.values(c('Functional'), functional.list, completeData)
group.prices('GarageFinish')
## GarageFinish mean.Quality mean.Price n
## 1 Unf 5.31 137570.5 686
## 2 RFn 6.57 202068.9 422
## 3 Fin 7.07 240052.7 352
garage.fin.list <- c('Unf' = 1, 'RFn' = 1, 'Fin' = 2)
completeData = map.values(c('GarageFinish'), garage.fin.list, completeData)
group.prices('Condition1')
## Condition1 mean.Quality mean.Price n
## 1 Feedr 5.31 142475.5 81
## 2 RRAe 5.45 138400.0 11
## 3 Artery 5.58 135091.7 48
## 4 Norm 6.15 184495.5 1260
## 5 RRAn 6.35 184396.6 26
## 6 RRNe 6.50 190750.0 2
## 7 PosA 6.62 225875.0 8
## 8 PosN 6.68 215184.2 19
## 9 RRNn 7.00 212400.0 5
group.prices('Condition2')
## Condition2 mean.Quality mean.Price n
## 1 Artery 4.0 106500.0 2
## 2 RRNn 4.5 96750.0 2
## 3 RRAe 5.0 190000.0 1
## 4 RRAn 5.0 136905.0 1
## 5 Feedr 5.5 121166.7 6
## 6 Norm 6.1 181169.4 1445
## 7 PosA 10.0 325000.0 1
## 8 PosN 10.0 284875.0 2
condition.list <- c('Feedr' = 1, 'Artery' = 1,'RRAe'= 2, 'RRAn' = 2, 'RRNe' = 2, 'RRNn' = 2, 'Norm' = 3, 'PosA' = 4, 'PosN' = 4)
completeData <- map.values(c('Condition1','Condition2'), condition.list, completeData)
# visualization for LotShape against LotArea
completeData[,c('LotShape', 'LotArea')] %>%
group_by(LotShape) %>% dplyr::summarise(medianArea = median(LotArea)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(LotShape,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='LotShape', y='Median of LotArea') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) + theme_minimal()
# visualization for LotConfig against LotArea
completeData[,c('LotConfig', 'LotArea')] %>%
group_by(LotConfig) %>% dplyr::summarise(medianArea = median(LotArea)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(LotConfig,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='LotConfig', y='Median of LotArea') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) + theme_minimal()
lot.shape.list <- c('Reg' = 1,'IR1' = 2, 'IR2' = 3, 'IR3' = 4)
completeData = map.values(c('LotShape'), lot.shape.list, completeData)
lot.config.list <- c('Inside' = 1,'FR3' = 2, 'FR2' = 2, 'Corner' = 3, 'CulDSac' = 4)
completeData = map.values(c('LotConfig'), lot.config.list, completeData)
# visualization for MasVnrType against MasVnrArea
completeData[,c('MasVnrType', 'MasVnrArea')] %>%
group_by(MasVnrType) %>% dplyr::summarise(medianArea = median(MasVnrArea)) %>%
arrange(medianArea) %>%
ggplot(aes(x=reorder(MasVnrType,-medianArea), y=medianArea)) +
geom_bar(stat = 'identity', fill='cornflowerblue') +
labs(x='MasVnrType', y='Median of MasVnrArea') +
geom_text(aes(label = sort(medianArea)), vjust = -0.5) + theme_minimal()
MasVnrType.list <- c('None' = 0,'BrkCmn' = 1, 'Stone' = 2, 'BrkFace' = 3)
completeData = map.values(c('MasVnrType'), MasVnrType.list, completeData)
group.prices('Neighborhood')
## Neighborhood mean.Quality mean.Price n
## 1 MeadowV 4.47 98576.47 17
## 2 IDOTRR 4.76 100123.78 37
## 3 Sawyer 5.03 136793.14 74
## 4 BrkSide 5.05 124834.05 58
## 5 Edwards 5.08 128219.70 100
## 6 NAmes 5.36 145847.08 225
## 7 OldTown 5.39 128225.30 113
## 8 SWISU 5.44 142591.36 25
## 9 Mitchel 5.59 156270.12 49
## 10 BrDale 5.69 104493.75 16
## 11 ClearCr 5.89 212565.43 28
## 12 Blueste 6.00 137500.00 2
## 13 NPkVill 6.00 142694.44 9
## 14 Crawfor 6.27 210624.73 51
## 15 SawyerW 6.32 186555.80 59
## 16 NWAmes 6.33 189050.07 73
## 17 Gilbert 6.56 192854.51 79
## 18 CollgCr 6.64 197965.77 150
## 19 Veenker 6.73 238772.73 11
## 20 Timber 7.16 242247.45 38
## 21 Blmngtn 7.18 194870.88 17
## 22 Somerst 7.34 225379.84 86
## 23 NoRidge 7.93 335295.32 41
## 24 StoneBr 8.16 310499.00 25
## 25 NridgHt 8.26 316270.62 77
nbrh.map <- c('MeadowV' = 0, 'IDOTRR' = 1, 'Sawyer' = 1, 'BrDale' = 1, 'OldTown' = 1, 'Edwards' = 1,
'BrkSide' = 1, 'Blueste' = 2, 'SWISU' = 2, 'NAmes' = 2, 'NPkVill' = 2, 'Mitchel' = 2,
'SawyerW' = 2, 'Gilbert' = 3, 'NWAmes' = 3, 'Blmngtn' = 3, 'CollgCr' = 3, 'Crawfor' = 3,
'ClearCr' = 4, 'Veenker' = 4, 'Somerst' = 4, 'Timber' = 4, 'StoneBr' = 5, 'NoRidge' = 5,
'NridgHt' = 5)
completeData$Neighborhood_bin <- as.numeric(nbrh.map[completeData$Neighborhood])
We can observe that features such as YearBuilt, YearRemodAdd and YrSold cannot be used directly as numeric features, since the model cannot distinguish a year from an ordinary number. Hence, we convert the year features into ages by comparing them against 2010, the last year in the dataset. We also add other features derived from existing ones.
# handling Year features
completeData$yrs_since_built <- 2010 - completeData$YearBuilt
completeData$yrs_since_sold <- 2010 - completeData$YrSold
completeData$yrs_since_remod <- completeData$YrSold - completeData$YearRemodAdd
# Total Living Area
completeData$TotalArea <- completeData$GrLivArea + completeData$TotalBsmtSF
# Total Porch
completeData$TotalPorch <- completeData$OpenPorchSF + completeData$X3SsnPorch + completeData$EnclosedPorch + completeData$ScreenPorch
# TotalBath
completeData$TotalBath <- completeData$BsmtFullBath + 0.5 * completeData$BsmtHalfBath + completeData$FullBath + 0.5 * completeData$HalfBath
# TotalFlrSF
completeData$TotalFlrSF <- completeData$X1stFlrSF + completeData$X2ndFlrSF
# OverallGrade
completeData$OverallGrade <- completeData$OverallQual * completeData$OverallCond
completeData$BsmtGrade <- completeData$BsmtQual * completeData$BsmtCond
completeData$GarageGrade <- completeData$GarageQual * completeData$GarageCond
completeData$KitchenScore <- completeData$KitchenAbvGr * completeData$KitchenQual
completeData$ExterGrade <- completeData$ExterQual * completeData$ExterCond
completeData$CentralAir=ifelse(completeData$CentralAir=="Y",1,0)
completeData$CentralAir <- as.integer(completeData$CentralAir)
Next we handle houses whose area-based features equal 0. A house with 0 square footage in a column simply does not have that feature at all. We add a one-hot encoded column that returns 1 for any house with an area greater than 0 (the house has the feature) and 0 otherwise.
cols.binary <- c('X2ndFlrSF', 'MasVnrArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', 'X3SsnPorch', 'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal')
for (col in cols.binary){
completeData[str_c('Has',col)] <- (completeData[,col] != 0) * 1
}
As mentioned above, Street and Utilities have near-zero variance, and we have created new features to replace the year variables. Hence, we can remove these features from the dataset.
predictors <- colnames(completeData)
remove_feat <- c('Street', 'Utilities', 'YearBuilt', 'YearRemodAdd', 'YrSold', 'GarageYrBlt',
'GarageType', 'PavedDrive', 'LandContour', 'LandSlope', 'Neighborhood', 'Exterior1st',
'Exterior2nd', 'Electrical', 'X2ndFlrSF', 'MasVnrArea', 'WoodDeckSF', 'OpenPorchSF',
'EnclosedPorch', 'X3SsnPorch', 'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal',
'Foundation', 'Heating')
# Removing features from predictors
predictors <- setdiff(predictors, remove_feat)
completeData <- completeData[ , predictors]
Let’s handle the remaining categorical values. One approach is to create dummy variables for each value, where each new dummy column holds a 1 or a 0 based on the value of the categorical column. This technique is also called One-Hot Encoding - a representation of categorical variables as binary vectors; once the dummy variables are created, the original categorical feature is removed from the dataset. The manual one-hot implementation is kept commented out below; instead we use a simpler label encoding that converts each remaining factor to integer codes.
# One-Hot Encoding for all categorical variables (kept commented out; label encoding is applied below instead)
#for (cat_colname in colnames(completeData[sapply(completeData, is.factor) == TRUE])) {
#
# for(unique_value in unique(completeData[[cat_colname]])) {
#
# unq_val <- trimws(unique_value)
# unq_val <- sub(".", "", unq_val,fixed=TRUE)
# unq_val <- sub(")", "", unq_val,fixed=TRUE)
# unq_val <- sub("(", "", unq_val,fixed=TRUE)
# unq_val <- sub(" ", "", unq_val,fixed=TRUE)
# unq_val <- sub("&", "", unq_val,fixed=TRUE)
# completeData[paste(cat_colname, unq_val, sep="_")] <- ifelse(completeData[[cat_colname]]==unique_value, 1, 0)
# }
#
# completeData[(cat_colname)] <- NULL
#}
completeData <- data.frame(lapply(completeData, function(x) as.numeric(as.factor(x))))
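For reference, here is a minimal sketch of true one-hot encoding using caret’s dummyVars on a toy factor column (hypothetical data; this project keeps the simpler integer coding applied above):
toy <- data.frame(Heating = factor(c("GasA", "GasW", "GasA", "Wall")))
dv <- dummyVars(~ Heating, data = toy)
predict(dv, newdata = toy) # one indicator column per Heating level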
Before we perform any feature selection techniques, we need to normalize the train data by applying log transformations and split the combined dataset back into the original train and test datasets.
predictors <- colnames(completeData)
# transform into log variables
completeLog <- completeData
completeLog$LotFrontage <- log(completeData$LotFrontage)
completeLog$LotArea <- log(completeData$LotArea)
trainScaled = completeLog[1:trainingRowIndex, ]
testScaled = completeLog[testingRowIndex:totalSample, ]
trainScaled$SalePrice <- log(trainData$SalePrice)
#str(trainScaled)
#colSums(is.na(completeData))
For the same reason that we remove highly correlated features, we also need to remove linearly dependent features.
# Check if there are linear dependencies among the predictors
trainCleaned <- trainScaled
predictors <- setdiff(names(trainCleaned), target)
linear_dep_v <- findLinearCombos(trainCleaned[ ,predictors])
if (length(linear_dep_v$remove)>0) {
train_no_ld <- trainCleaned[, -c(linear_dep_v$remove)]
trainCleaned <- train_no_ld
rm(train_no_ld)
}
trainCleaned$SalePrice <- log(trainData$SalePrice) # re-attach the target on the log scale
print(paste0("Number of linearly dependent features : ", length(linear_dep_v$remove)))
## [1] "Number of linearly dependent features : 0"
Similarly, we remove near-zero-variance predictors, since features that are almost constant carry little information for the model.
nzv.data <- nearZeroVar(trainCleaned, saveMetrics = TRUE)
# drop any of the near-zero-variance predictors
drop.cols <- rownames(nzv.data)[nzv.data$nzv == TRUE]
trainCleaned <- trainCleaned[,!names(trainCleaned) %in% drop.cols]
print(paste0("Number of linearly dependent features : ", length(drop.cols)))
## [1] "Number of linearly dependent features : 12"
Entropy characterizes the impurity of an arbitrary collection of samples. Information gain is the expected reduction in entropy caused by partitioning the samples according to a given feature, which makes it a way of measuring the association between inputs and outputs; thus, the higher the information gain, the better. Features with an information gain above 0.03 are selected for further analysis.
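A minimal sketch (toy data, not project code) of the entropy that information gain builds on, H(X) = -sum(p * log2(p)):
entropy <- function(x) {
  p <- table(x) / length(x)
  -sum(p * log2(p))
}
entropy(c("a", "a", "b", "b")) # 1 bit: maximally impure two-class sample
entropy(c("a", "a", "a", "a")) # 0 bits: perfectly pure sample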
set.seed(123)
trainCleaned <- trainScaled
predictors <- setdiff(names(trainCleaned), target)
train.task <- makeRegrTask(data=trainCleaned, target="SalePrice")
imp_var1 <- generateFilterValuesData(train.task, method=c("information.gain"))
var1 <- imp_var1$data[imp_var1$data$information.gain > 0.03, c('name')]
plot_infogain <- plotFilterValues(imp_var1, feat.type.cols=TRUE)
plot_infogain
print(paste0("Number of significant features through Information Gain : ", length(var1)))
## [1] "Number of significant features through Information Gain : 51"
Pearson’s chi-squared test is a statistical test applied to sets of categorical data to evaluate how likely it is that any observed difference between the sets arose by chance; in other words, it tests the independence of two events. [8] Here it is used to test whether the occurrence of a specific feature value and the occurrence of a specific target value are independent. The test statistic is estimated for each feature and the features are ranked by the score; higher scores indicate that the null hypothesis (H0) of independence should be rejected, i.e. that the feature and the target are dependent, and such features are selected. Features with a chi-squared score above 0.30 are selected for further analysis.
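As a toy illustration of the underlying independence test (hypothetical counts, not project data); a large statistic and small p-value suggest the feature and target bins are dependent.
tbl <- matrix(c(30, 10, 10, 30), nrow = 2, byrow = TRUE,
              dimnames = list(Feature = c("low", "high"),
                              Target = c("cheap", "expensive")))
chisq.test(tbl) # X-squared = 18.05 (with Yates correction), p < 0.001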
imp_var2 <- generateFilterValuesData(train.task, method=c("chi.squared"))
var2 <- imp_var2$data[imp_var2$data$chi.squared > 0.30, c('name')]
plot_chi <- plotFilterValues(imp_var2, feat.type.cols=TRUE)
plot_chi
print(paste0("Number of significant features through Chi-Squared : ", length(var2)))
## [1] "Number of significant features through Chi-Squared : 39"
The mean decrease in Gini coefficient is a measure of how much each variable contributes to the homogeneity of the nodes and leaves in the resulting random forest. [9] Each time a particular variable is used to split a node, the Gini coefficient of the child nodes is calculated and compared to that of the original node; the Gini coefficient is a measure of homogeneity from 0 (homogeneous) to 1 (heterogeneous). The changes in Gini are summed for each variable and normalized at the end of the calculation, so variables that produce nodes with higher purity have a higher mean decrease in Gini. Features with a mean decrease in Gini above 0.001 are selected for further analysis.
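A minimal sketch (using the randomForest package loaded earlier, on the built-in mtcars data rather than the project data) of the importance measure this filter ranks by; for regression forests, IncNodePurity plays the role of the Gini decrease described above.
set.seed(123)
rf.toy <- randomForest(mpg ~ ., data = mtcars, ntree = 100, importance = TRUE)
importance(rf.toy) # columns %IncMSE and IncNodePurity, one row per variable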
imp_var3 <- generateFilterValuesData(train.task, method=c("rf.importance"))
var3 <- imp_var3$data[imp_var3$data$rf.importance > 0.001, c('name')]
plot_rf <- plotFilterValues(imp_var3, feat.type.cols=TRUE)
plot_rf
print(paste0("Number of significant features through Mean Decrease Gini : ", length(var3)))
## [1] "Number of significant features through Mean Decrease Gini : 18"
Combine all the significant features extracted above into a single set of important features.
imp_cols <- c(var1, var2, var3)
imp_cols <- unique(imp_cols)
length(imp_cols)
## [1] 51
imp_cols
## [1] "MSSubClass" "MSZoning" "LotFrontage"
## [4] "LotArea" "LotShape" "HouseStyle"
## [7] "OverallQual" "OverallCond" "MasVnrType"
## [10] "ExterQual" "BsmtQual" "BsmtExposure"
## [13] "BsmtFinType1" "BsmtFinSF1" "BsmtUnfSF"
## [16] "TotalBsmtSF" "HeatingQC" "CentralAir"
## [19] "X1stFlrSF" "GrLivArea" "BsmtFullBath"
## [22] "FullBath" "HalfBath" "BedroomAbvGr"
## [25] "KitchenQual" "TotRmsAbvGrd" "Fireplaces"
## [28] "GarageFinish" "GarageCars" "GarageArea"
## [31] "SaleType" "SaleCondition" "HasPavedDrive"
## [34] "GarageAttchd" "GarageDetchd" "ElectricalSB"
## [37] "ElectricalFuse" "Neighborhood_bin" "yrs_since_built"
## [40] "yrs_since_remod" "TotalArea" "TotalPorch"
## [43] "TotalBath" "TotalFlrSF" "OverallGrade"
## [46] "BsmtGrade" "KitchenScore" "ExterGrade"
## [49] "HasMasVnrArea" "HasWoodDeckSF" "HasOpenPorchSF"
final_trainData <- trainCleaned[, imp_cols]
final_trainData$SalePrice <- trainCleaned$SalePrice
final_testData <- testScaled[, imp_cols]
set.seed(123) # set seed to ensure you always have same random numbers generated
# splits the data in the ratio given by SplitRatio: roughly 70% of the rows are marked logical TRUE and the remaining are marked logical FALSE
sample = sample.split(final_trainData$SalePrice, SplitRatio = 0.70)
# creates the training dataset from the rows marked as TRUE
ames_trainData = subset(final_trainData, sample ==TRUE)
ames_validationData = subset(final_trainData, sample==FALSE)
ames_testData = final_testData
myControl = trainControl(method = "cv", number = 5, verboseIter = FALSE) # five-fold cross-validation
model_lm = caret::train(SalePrice ~ ., data=ames_trainData, method="lm", trControl=myControl, metric="RMSE")
# Extract Predictions
model_lm_pred <- predict(model_lm, ames_validationData)
# Extract Residuals
model_lm_resid <- ames_validationData$SalePrice - model_lm_pred
# Calculate RMSE
rmse.test.model.lm <- sqrt(mean(model_lm_resid^2, na.rm = TRUE))
valid.model.lm <- data.frame(cbind(model="Linear", rmse=rmse.test.model.lm))
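The same predict / residual / RMSE pattern repeats for each model below; a small helper (a sketch, not part of the original pipeline) would remove the duplication:
# Hypothetical helper: RMSE of a fitted caret model on new data
rmse_on <- function(model, newdata, actual) {
  sqrt(mean((actual - predict(model, newdata))^2, na.rm = TRUE))
}
# e.g. rmse_on(model_lm, ames_validationData, ames_validationData$SalePrice)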
model_rpart = caret::train(SalePrice ~ ., data=ames_trainData, method="rpart", tuneLength=10, trControl=myControl, metric="RMSE")
# Extract Predictions
model_rpart_pred <- predict(model_rpart, ames_validationData)
# Extract Residuals
model_rpart_resid <- ames_validationData$SalePrice - model_rpart_pred
# Calculate RMSE
rmse.test.model.rpart <- sqrt(mean(model_rpart_resid^2, na.rm = TRUE))
valid.model.rpart <- data.frame(cbind(model="Decision Tree", rmse=rmse.test.model.rpart))
mtry <- ncol(ames_trainData) - 1 # number of predictors (SalePrice excluded)
tunegrid <- expand.grid(.mtry=mtry)
model_rf = caret::train(SalePrice ~ ., data=ames_trainData, method="rf", tuneGrid=tunegrid, ntree=100, trControl=myControl, metric="RMSE")
# Extract Predictions
model_rf_pred <- predict(model_rf, ames_validationData)
# Extract Residuals
model_rf_resid <- ames_validationData$SalePrice - model_rf_pred
# Calculate RMSE
rmse.test.model.rf <- sqrt(mean(model_rf_resid^2, na.rm = TRUE))
valid.model.rf <- data.frame(cbind(model="Random Forest", rmse=rmse.test.model.rf))
tunegrid <- expand.grid(nrounds=750, max_depth=8, eta=0.01, colsample_bytree=1, min_child_weight=2, subsample=0.6, gamma=0.01)
model_xgb = caret::train(SalePrice ~ ., data=ames_trainData, method="xgbTree", tuneGrid=tunegrid, trControl=myControl, metric="RMSE", verbose=T, nthread =3)
#model_xgb = caret::train(SalePrice ~ ., data=ames_trainData, method="xgbDART", trControl=myControl, metric="RMSE")
# Extract Predictions
model_xgb_pred <- predict(model_xgb, ames_validationData)
# Extract Residuals
model_xgb_resid <- ames_validationData$SalePrice - model_xgb_pred
# Calculate RMSE
rmse.test.model.xgb <- sqrt(mean(model_xgb_resid^2, na.rm = TRUE))
valid.model.xgb <- data.frame(cbind(model="eXtreme Gradient Boosting", rmse=rmse.test.model.xgb))
model_treebag = caret::train(SalePrice ~ ., data=ames_trainData, method="treebag", trControl=myControl, metric="RMSE")
# Extract Predictions
model_treebag_pred <- predict(model_treebag, ames_validationData)
# Extract Residuals
model_treebag_resid <- ames_validationData$SalePrice - model_treebag_pred
# Calculate RMSE
rmse.test.model.treebag <- sqrt(mean(model_treebag_resid^2, na.rm = TRUE))
valid.model.treebag <- data.frame(cbind(model="Bagged Tree", rmse=rmse.test.model.treebag))
model_bglm = caret::train(SalePrice ~ ., data=ames_trainData, method="glmboost", trControl=myControl, metric="RMSE")
# Extract Predictions
model_bglm_pred <- predict(model_bglm, ames_validationData)
# Extract Residuals
model_bglm_resid <- ames_validationData$SalePrice - model_bglm_pred
# Calculate RMSE
rmse.test.model.bglm <- sqrt(mean(model_bglm_resid^2, na.rm = TRUE))
valid.model.bglm <- data.frame(cbind(model="Boosted GLM", rmse=rmse.test.model.bglm))
validation_model <- rbind(valid.model.lm, valid.model.rf, valid.model.rpart, valid.model.xgb, valid.model.treebag, valid.model.bglm)
validation_model
## model rmse
## 1 Linear 0.131537110952662
## 2 Random Forest 0.145441879390337
## 3 Decision Tree 0.221395305550647
## 4 eXtreme Gradient Boosting 0.121532667903922
## 5 Bagged Tree 0.186662063078336
## 6 Boosted GLM 0.134744883658696
model_list = list(lm=model_lm, rf=model_rf, rpart=model_rpart, xgboost=model_xgb, treebag=model_treebag, boostglm=model_bglm)
resamples = resamples(model_list)
summary(resamples)
##
## Call:
## summary.resamples(object = resamples)
##
## Models: lm, rf, rpart, xgboost, treebag, boostglm
## Number of resamples: 5
##
## MAE
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## lm 0.08591649 0.08897851 0.09322397 0.09199129 0.09322433 0.09861315
## rf 0.09458390 0.09478176 0.09911968 0.09942623 0.10406481 0.10458103
## rpart 0.14793635 0.15105955 0.15390938 0.15457795 0.15561791 0.16436659
## xgboost 0.07900268 0.07928686 0.09056147 0.08770904 0.09325510 0.09643910
## treebag 0.12827950 0.13144696 0.13453027 0.13531570 0.13643086 0.14589090
## boostglm 0.09126963 0.09790826 0.09840963 0.09941597 0.10145881 0.10803352
## NA's
## lm 0
## rf 0
## rpart 0
## xgboost 0
## treebag 0
## boostglm 0
##
## RMSE
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lm 0.1153665 0.1245304 0.1314734 0.1353970 0.1517409 0.1538738 0
## rf 0.1311499 0.1419406 0.1431966 0.1459131 0.1497471 0.1635315 0
## rpart 0.1913057 0.2087365 0.2096506 0.2103783 0.2177584 0.2244403 0
## xgboost 0.1113762 0.1157546 0.1305864 0.1301929 0.1412667 0.1519807 0
## treebag 0.1713135 0.1754272 0.1883920 0.1853247 0.1897662 0.2017244 0
## boostglm 0.1196438 0.1295988 0.1335208 0.1405935 0.1531422 0.1670618 0
##
## Rsquared
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lm 0.8632114 0.8845830 0.8916825 0.8884827 0.8952200 0.9077167 0
## rf 0.8310874 0.8613636 0.8694012 0.8712425 0.8880499 0.9063103 0
## rpart 0.6835327 0.7171496 0.7436434 0.7332021 0.7444074 0.7772773 0
## xgboost 0.8611971 0.8946406 0.9006626 0.8991990 0.9151309 0.9243636 0
## treebag 0.7832584 0.7896091 0.7943940 0.7936107 0.7950630 0.8057292 0
## boostglm 0.8351873 0.8651526 0.8869678 0.8792133 0.8953109 0.9134477 0
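For a visual comparison of the resampling distributions, caret’s resamples object supports lattice plots (a quick sketch; lattice is attached along with caret):
bwplot(resamples, metric = "RMSE") # box-and-whisker of cross-validated RMSE per model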
set.seed(123)
cv.ctrl <- trainControl(method = "repeatedcv", repeats = 1, number = 4, allowParallel=T)
#xgb.grid <- expand.grid(nrounds = 750,
# eta = c(0.01,0.005,0.001),
# max_depth = c(4,6,8),
# colsample_bytree=c(1,5,10),
# min_child_weight = 2,
# subsample=c(0.2,0.4,0.6),
# gamma=0.01)
#xgb_tune <- caret::train(SalePrice ~ .,
# data=ames_trainData,
# method="xgbTree",
# trControl=cv.ctrl,
# tuneGrid=xgb.grid,
# verbose=T,
# metric="RMSE",
# nthread =3)
#xgb_tune
# After hyper-parameter tuning through the above code, the below parameters are selected for building the final prediction model
xgb.bst.params <- expand.grid(nrounds = 1000,
colsample_bytree=1,
eta=0.010,
max_depth=8,
min_child_weight=3,
gamma=0.01, # less overfit
subsample=0.4)
final.xgb.model <- caret::train(SalePrice ~ ., data=ames_trainData, method="xgbTree",
trControl=cv.ctrl, tuneGrid=xgb.bst.params, verbose=T,
metric="RMSE", nthread =3)
final.xgb.model
## eXtreme Gradient Boosting
##
## 1010 samples
## 51 predictor
##
## No pre-processing
## Resampling: Cross-Validated (4 fold, repeated 1 times)
## Summary of sample sizes: 757, 759, 757, 757
## Resampling results:
##
## RMSE Rsquared MAE
## 0.1294419 0.8980078 0.08742543
##
## Tuning parameter 'nrounds' was held constant at a value of 1000
## Tuning parameter 'colsample_bytree' was held constant at a value of 1
## Tuning parameter 'min_child_weight' was held constant at a value of 3
## Tuning parameter 'subsample' was held constant at a value of 0.4
actual.pred.train <- data.frame(cbind(SalePrice=exp(ames_trainData$SalePrice),
                                      PredSalePrice=exp(predict(final.xgb.model)),
                                      Residuals=exp(residuals(final.xgb.model)))) # exp of a log-scale residual equals the actual/predicted ratio
summary(actual.pred.train)
## SalePrice PredSalePrice Residuals
## Min. : 34900 Min. : 36471 Min. :0.5559
## 1st Qu.:129000 1st Qu.:128966 1st Qu.:0.9708
## Median :161875 Median :159844 Median :1.0020
## Mean :179879 Mean :178878 Mean :1.0024
## 3rd Qu.:213438 3rd Qu.:210609 3rd Qu.:1.0317
## Max. :745000 Max. :578701 Max. :1.4297
final.rmse.train.xgb <- sqrt(mean((exp(ames_trainData$SalePrice) - exp(predict(final.xgb.model)))^2, na.rm = TRUE))
final.rmse.train.xgb
## [1] 12830.52
# Extract Predictions for Validation Dataset
final.valid.pred <- predict(final.xgb.model, ames_validationData)
# Extract Residuals for Validation Dataset
final.valid.resid <- ames_validationData$SalePrice - final.valid.pred
actual.pred.valid <- data.frame(cbind(SalePrice=exp(ames_validationData$SalePrice),
PredSalePrice=exp(final.valid.pred),
Residuals=exp(final.valid.resid)))
summary(actual.pred.valid)
## SalePrice PredSalePrice Residuals
## Min. : 40000 Min. : 68563 Min. :0.3412
## 1st Qu.:130750 1st Qu.:131472 1st Qu.:0.9515
## Median :165000 Median :163492 Median :1.0127
## Mean :183261 Mean :181207 Mean :1.0113
## 3rd Qu.:214750 3rd Qu.:212853 3rd Qu.:1.0659
## Max. :755000 Max. :541505 Max. :1.5850
# Calculate RMSE for validation Dataset
final.rmse.valid.xgb <- sqrt(mean((exp(ames_validationData$SalePrice) - exp(final.valid.pred))^2, na.rm = TRUE))
final.rmse.valid.xgb
## [1] 29728.3
# Extract Predictions for Test Dataset
final.test.pred <- predict(final.xgb.model, ames_testData)
test.predictions <- data.frame(cbind(Id=testData$Id,
SalePrice=exp(final.test.pred)))
# Write Predictions of Test Dataset into csv file
write.csv(test.predictions, "C:/Users/ajana/Desktop/Imarticus Project/R Project/Data Set/test_predict.csv", row.names = FALSE) # row.names = FALSE keeps only the Id and SalePrice columns