require(caTools) # for splitting datasets
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.3.3
require(ggplot2) # for data visualization
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
require(magrittr) #for pipe operator
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.3.3
# require(stringr) #extracting string patterns
# require(Matrix) # matrix transformations
require(lmtest) # for the Breusch-Pagan test (heteroskedasticity)
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
require(glmnet) # ridge, lasso & elastinet
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 3.3.3
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.3.3
## Loaded glmnet 2.0-5
require(xgboost) # gbm
## Loading required package: xgboost
## Warning: package 'xgboost' was built under R version 3.3.3
require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
require(rpart)
## Loading required package: rpart
# require(Metrics) # rmse
require(dplyr) # load this last so that plyr doesn't mask its functions
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(caret) # one hot encoding
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
require(gbm)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.3.3
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
# require(scales) # plotting $$
# require(e1071) # skewness
require(corrplot) # correlation plot
## Loading required package: corrplot
## Warning: package 'corrplot' was built under R version 3.3.3
require(Metrics) # for the rmse() function (note the capitalised package name)
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# library(shiny)
#Loading the training dataset
house_train <- read.csv("D:/Documents/Data Mining Project/Housing Prices Competetion/train.csv", header = T,stringsAsFactors = FALSE )
## Data preprocessing
# The data preprocessing step proved to be the most crucial in this project.
#Count and percentage of missing ('NA') values per column
na_count <- sort(sapply(house_train, function(x) sum(is.na(x))),decreasing = T)
percent_na_count <-sort(apply(house_train, 2, function(col)sum(is.na(col))/length(col)),decreasing = T)
dt_MissingData <- data.frame(percent_na_count,na_count)
head(dt_MissingData,20)
## percent_na_count na_count
## PoolQC 0.9952054795 1453
## MiscFeature 0.9630136986 1406
## Alley 0.9376712329 1369
## Fence 0.8075342466 1179
## FireplaceQu 0.4726027397 690
## LotFrontage 0.1773972603 259
## GarageType 0.0554794521 81
## GarageYrBlt 0.0554794521 81
## GarageFinish 0.0554794521 81
## GarageQual 0.0554794521 81
## GarageCond 0.0554794521 81
## BsmtExposure 0.0260273973 38
## BsmtFinType2 0.0260273973 38
## BsmtQual 0.0253424658 37
## BsmtCond 0.0253424658 37
## BsmtFinType1 0.0253424658 37
## MasVnrType 0.0054794521 8
## MasVnrArea 0.0054794521 8
## Electrical 0.0006849315 1
## MSSubClass 0.0000000000 0
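# The missingness table can also be visualised at a glance. A minimal sketch using ggplot2 (already loaded);
# the object name miss_plot_data is just illustrative:
miss_plot_data <- dt_MissingData[dt_MissingData$na_count > 0, ]
miss_plot_data$variable <- rownames(miss_plot_data)
ggplot(miss_plot_data, aes(x = reorder(variable, na_count), y = na_count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = NULL, y = "Number of missing values")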
#Variables PoolQC, MiscFeature and Alley have more than 90% missing data. Even if we imputed these columns with their mode or median, they would add little information to the model because of their near-zero variance, so we can drop them from the dataset. We will do this later, after imputing the missing values of all columns.
## Missing Values Imputation
###PoolQC: Pool quality. The data description states that NA in PoolQC means the house has no pool, so we replace NA with 'None'.
house_train$PoolQC[is.na(house_train$PoolQC)] <- 'None'
#Similarly, missing values in MiscFeature, Fence, Alley and FireplaceQu are replaced with 'None', since those properties simply do not have the feature.
house_train$MiscFeature[is.na(house_train$MiscFeature)] <- 'None'
house_train$Fence[is.na(house_train$Fence)] <- 'None'
house_train$Alley[is.na(house_train$Alley)] <- 'None'
house_train$FireplaceQu[is.na(house_train$FireplaceQu)] <- 'None'
###LotFrontage: Linear feet of street connected to property.
#There are many missing values for LotFrontage, too many to simply replace with 0. A house's lot frontage is likely to be similar to that of the other houses in its neighborhood, so we group by Neighborhood, compute the median LotFrontage of each group, and fill each missing value with the median of the neighborhood the house belongs to.
lot.by.nbrh <- house_train[, c('Neighborhood','LotFrontage')] %>%
  group_by(Neighborhood) %>%
  summarise(median = median(LotFrontage, na.rm = TRUE))
lot.by.nbrh
## # A tibble: 25 × 2
## Neighborhood median
## <chr> <dbl>
## 1 Blmngtn 43.0
## 2 Blueste 24.0
## 3 BrDale 21.0
## 4 BrkSide 52.0
## 5 ClearCr 80.0
## 6 CollgCr 70.0
## 7 Crawfor 74.0
## 8 Edwards 65.5
## 9 Gilbert 65.0
## 10 IDOTRR 60.0
## # ... with 15 more rows
lotNA <- which(is.na(house_train$LotFrontage))
#For loop to replace each missing LotFrontage value with the median LotFrontage of the respective neighbourhood
for (i in lotNA) {
  nbrhd <- house_train$Neighborhood[i]
  lot.median <- lot.by.nbrh[lot.by.nbrh$Neighborhood == nbrhd, 'median']
  house_train[i, 'LotFrontage'] <- lot.median[[1]]
}
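# An equivalent, more concise group-wise imputation with dplyr is sketched below (not run here, since the
# loop above has already filled the missing values; it assumes the same house_train data frame):
# house_train <- house_train %>%
#   group_by(Neighborhood) %>%
#   mutate(LotFrontage = ifelse(is.na(LotFrontage), median(LotFrontage, na.rm = TRUE), LotFrontage)) %>%
#   ungroup() %>%
#   as.data.frame()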
###Garage Features
# GarageType: Garage location
# GarageYrBlt: Year garage was built
# GarageFinish: Interior finish of the garage
# GarageCars: Size of garage in car capacity
# GarageArea: Size of garage in square feet
# GarageQual: Garage quality
# GarageCond: Garage condition
# It seems reasonable that most garages were built in the same year as the house itself. We can check this by counting how many houses were built in the same year as their garage.
length(which(house_train$GarageYrBlt == house_train$YearBuilt))
## [1] 1089
# 1089 of the 1460 houses have the same year for GarageYrBlt and YearBuilt. Let's replace any NA in GarageYrBlt with the value of YearBuilt.
gyr.na = which(is.na(house_train$GarageYrBlt))
house_train[gyr.na, 'GarageYrBlt'] = house_train[gyr.na, 'YearBuilt']
# The data description states that NA in the garage columns means the house has no garage at all, so we replace NA with 'None'.
house_train$GarageQual[is.na(house_train$GarageQual)] <- 'None'
house_train$GarageFinish[is.na(house_train$GarageFinish)] <- 'None'
house_train$GarageCond[is.na(house_train$GarageCond)] <- 'None'
house_train$GarageType[is.na(house_train$GarageType)] <- 'None'
###Basement Features
# BsmtExposure
# BsmtFinType2
# BsmtQual
# BsmtCond
# BsmtFinType1
length(which(house_train$TotalBsmtSF ==0 ))
## [1] 37
# 37 houses have TotalBsmtSF equal to 0, which implies they have no basement.
#Almost all of the missing values for the categorical basement features come from houses with 0 square feet of basement area. We can fill in these values with 'None' since these houses certainly don't have basements.
bsmt.cols <- c('BsmtExposure','BsmtFinType2','BsmtQual','BsmtCond','BsmtFinType1' )
for (col in bsmt.cols) {
  house_train[sapply(house_train[col], is.na), col] <- 'None'
}
# MasVnrType: Masonry veneer type
# MasVnrArea: Masonry veneer area in square feet
# Missing areas are replaced with 0 and missing types with 'None'.
house_train$MasVnrType[is.na(house_train$MasVnrType)] <- 'None'
house_train$MasVnrArea[is.na(house_train$MasVnrArea)] <- 0
#Electrical
#There is only one missing value in the Electrical feature, so we replace it with the most frequent value (the mode) of that column.
p <- plot_ly(x = house_train$Electrical, type = "histogram")
p
## Warning: Ignoring 1 observations
house_train$Electrical[is.na(house_train$Electrical)] <- 'SBrkr'
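# The hard-coded 'SBrkr' is simply the most frequent Electrical value; it could also be derived
# programmatically. A minimal sketch (elec_mode is an illustrative name):
elec_mode <- names(which.max(table(house_train$Electrical)))
elec_mode # expected to be "SBrkr"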
##Variable Selection - Feature Selection
#We reduce the number of variables by deleting columns with near-zero variance. For example, look at the Street variable histogram: almost all houses share the same value, 'Pave'. Because such a variable has almost no variance, it conveys no information to the model and only reduces its stability.
p <- plot_ly(x = house_train$Street, type = "histogram")
p
###The nearZeroVar() function in the caret package flags variables with zero or near-zero variance.
nzv.data = nearZeroVar(house_train, saveMetrics = TRUE)
nzv.data <- nzv.data[nzv.data[,"zeroVar"] + nzv.data[,"nzv"] > 0, ]
nzv.data
## freqRatio percentUnique zeroVar nzv
## Street 242.33333 0.1369863 FALSE TRUE
## Alley 27.38000 0.2054795 FALSE TRUE
## LandContour 20.80952 0.2739726 FALSE TRUE
## Utilities 1459.00000 0.1369863 FALSE TRUE
## LandSlope 21.26154 0.2054795 FALSE TRUE
## Condition2 240.83333 0.5479452 FALSE TRUE
## RoofMatl 130.36364 0.5479452 FALSE TRUE
## BsmtCond 20.16923 0.3424658 FALSE TRUE
## BsmtFinType2 23.25926 0.4794521 FALSE TRUE
## BsmtFinSF2 258.60000 9.8630137 FALSE TRUE
## Heating 79.33333 0.4109589 FALSE TRUE
## LowQualFinSF 478.00000 1.6438356 FALSE TRUE
## KitchenAbvGr 21.41538 0.2739726 FALSE TRUE
## Functional 40.00000 0.4794521 FALSE TRUE
## EnclosedPorch 83.46667 8.2191781 FALSE TRUE
## X3SsnPorch 478.66667 1.3698630 FALSE TRUE
## ScreenPorch 224.00000 5.2054795 FALSE TRUE
## PoolArea 1453.00000 0.5479452 FALSE TRUE
## PoolQC 484.33333 0.2739726 FALSE TRUE
## MiscFeature 28.69388 0.3424658 FALSE TRUE
## MiscVal 128.00000 1.4383562 FALSE TRUE
nzv.columns <- rownames(nzv.data)[nzv.data$nzv == TRUE]
nzv.columns
## [1] "Street" "Alley" "LandContour" "Utilities"
## [5] "LandSlope" "Condition2" "RoofMatl" "BsmtCond"
## [9] "BsmtFinType2" "BsmtFinSF2" "Heating" "LowQualFinSF"
## [13] "KitchenAbvGr" "Functional" "EnclosedPorch" "X3SsnPorch"
## [17] "ScreenPorch" "PoolArea" "PoolQC" "MiscFeature"
## [21] "MiscVal"
#Now we remove the variables with near-zero variance
house_train <- house_train[, !names(house_train) %in% nzv.columns]
dim(house_train)[2]
## [1] 59
#The number of variables has been reduced to 59.
##Correlation
#Correlation between independent variables leads to multicollinearity, which inflates the standard errors of the coefficients and makes individual estimates unreliable. From the correlation plot we can identify highly correlated pairs of variables and remove one of each, and we can also see how each variable relates to the target variable SalePrice.
cont <- sapply(house_train, is.numeric)
df_cont<-data.frame(house_train[, cont])
correlations <- cor(df_cont)
# correlations
row_indic <- apply(correlations, 1, function(x) sum(x > 0.3 | x < -0.3) > 1)
correlations<- correlations[row_indic ,row_indic ]
corrplot(correlations, method="square")

# From the correlation plot we can identify the features most strongly correlated with SalePrice.
# OverallQual: Overall material and finish quality
# GrLivArea: Above grade (ground) living area square feet
# These 2 features have the highest correlation with SalePrice, which is very reasonable.
#
# As expected, GarageYrBlt and YearBuilt are highly correlated, so we drop GarageYrBlt and keep YearBuilt.
house_train$GarageYrBlt <- NULL
# GarageCars: Size of garage in car capacity
# GarageArea: Size of garage in square feet
# These 2 features are the next most strongly correlated with SalePrice. The correlation between the two is also extremely high, 0.88, which makes sense as the area of the garage constrains how many cars can fit, so we keep only GarageArea.
house_train$GarageCars <- NULL
# TotalBsmtSF: Total square feet of basement area
# 1stFlrSF: First Floor square feet
#These 2 features also have a strong linear relationship with one another, which makes sense as the size of the basement typically follows the size of the house's 1st floor.
# GrLivArea: Above grade (ground) living area square feet
# TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
# YearBuilt: Original construction date
# YearRemodAdd: Remodel date
# A positive correlation for both of these is not surprising: newer and recently remodelled homes tend to list at higher prices than older ones.
house_train$TotRmsAbvGrd <- NULL
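# Beyond reading the plot by eye, caret's findCorrelation() can flag highly correlated numeric columns
# programmatically. A minimal sketch (the 0.8 cutoff is an arbitrary choice, not one used elsewhere in this analysis):
high_corr <- findCorrelation(cor(house_train[, sapply(house_train, is.numeric)]), cutoff = 0.8, names = TRUE)
high_corr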
##Converting Categorical Data to Numeric Data
# The models below need numeric inputs, so the categorical variables must be encoded as numbers.
# First we transform the variables whose categories can be ordered into numeric values. We group each feature by its possible values, look at the mean SalePrice of each unique value, and map higher-priced/better-quality categories to larger numbers and lower-priced/lower-quality categories to smaller numbers. The remaining nominal variables are converted to integer codes at the end via as.factor() and as.numeric().
# get names of categorical features
cat_features = names(which(sapply(house_train, is.character)))
cat_features
## [1] "MSZoning" "LotShape" "LotConfig" "Neighborhood"
## [5] "Condition1" "BldgType" "HouseStyle" "RoofStyle"
## [9] "Exterior1st" "Exterior2nd" "MasVnrType" "ExterQual"
## [13] "ExterCond" "Foundation" "BsmtQual" "BsmtExposure"
## [17] "BsmtFinType1" "HeatingQC" "CentralAir" "Electrical"
## [21] "KitchenQual" "FireplaceQu" "GarageType" "GarageFinish"
## [25] "GarageQual" "GarageCond" "PavedDrive" "Fence"
## [29] "SaleType" "SaleCondition"
#There are 30 Categorical variables
#Neighbourhood
nbhdprice <- summarize(group_by(house_train, Neighborhood),
                       mean(SalePrice, na.rm=T))
#nbhdprice[order(nbhdprice$`mean(SalePrice, na.rm = T)`),]
nbhdprice_lo <- filter(nbhdprice, nbhdprice$`mean(SalePrice, na.rm = T)` < 140000)
nbhdprice_med <- filter(nbhdprice, nbhdprice$`mean(SalePrice, na.rm = T)` < 200000 &
                          nbhdprice$`mean(SalePrice, na.rm = T)` >= 140000)
nbhdprice_hi <- filter(nbhdprice, nbhdprice$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$nbhd_price_level[house_train$Neighborhood %in% nbhdprice_lo$Neighborhood] <- 1
house_train$nbhd_price_level[house_train$Neighborhood %in% nbhdprice_med$Neighborhood] <- 2
house_train$nbhd_price_level[house_train$Neighborhood %in% nbhdprice_hi$Neighborhood] <- 3
#housestyle
housestyle_price <- summarize(group_by(house_train, HouseStyle),
                              mean(SalePrice, na.rm=T))
housestyle_lo <- filter(housestyle_price, housestyle_price$`mean(SalePrice, na.rm = T)` < 140000)
housestyle_med <- filter(housestyle_price, housestyle_price$`mean(SalePrice, na.rm = T)` < 200000 &
                           housestyle_price$`mean(SalePrice, na.rm = T)` >= 140000)
housestyle_hi <- filter(housestyle_price, housestyle_price$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$house_style_level[house_train$HouseStyle %in% housestyle_lo$HouseStyle] <- 1
house_train$house_style_level[house_train$HouseStyle %in% housestyle_med$HouseStyle] <- 2
house_train$house_style_level[house_train$HouseStyle %in% housestyle_hi$HouseStyle] <- 3
#Exterior1st
price <- summarize(group_by(house_train, Exterior1st),
                   mean(SalePrice, na.rm=T))
matl_lo_1 <- filter(price, price$`mean(SalePrice, na.rm = T)` < 140000)
matl_med_1 <- filter(price, price$`mean(SalePrice, na.rm = T)` < 200000 &
                       price$`mean(SalePrice, na.rm = T)` >= 140000)
matl_hi_1 <- filter(price, price$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$exterior_1[house_train$Exterior1st %in% matl_lo_1$Exterior1st] <- 1
house_train$exterior_1[house_train$Exterior1st %in% matl_med_1$Exterior1st] <- 2
house_train$exterior_1[house_train$Exterior1st %in% matl_hi_1$Exterior1st] <- 3
#exterior2
price <- summarize(group_by(house_train, Exterior2nd),
                   mean(SalePrice, na.rm=T))
matl_lo <- filter(price, price$`mean(SalePrice, na.rm = T)` < 140000)
matl_med <- filter(price, price$`mean(SalePrice, na.rm = T)` < 200000 &
                     price$`mean(SalePrice, na.rm = T)` >= 140000)
matl_hi <- filter(price, price$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$exterior_2[house_train$Exterior2nd %in% matl_lo$Exterior2nd] <- 1
house_train$exterior_2[house_train$Exterior2nd %in% matl_med$Exterior2nd] <- 2
house_train$exterior_2[house_train$Exterior2nd %in% matl_hi$Exterior2nd] <- 3
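# The same three-level bucketing is repeated above for Neighborhood, HouseStyle, Exterior1st and Exterior2nd.
# A small helper could remove that duplication. A minimal sketch (the function name price_level is
# hypothetical, and the handling of values exactly equal to 140000/200000 may differ slightly from the filters above):
price_level <- function(df, col, target = "SalePrice", breaks = c(-Inf, 140000, 200000, Inf)) {
  grp_means <- tapply(df[[target]], df[[col]], mean, na.rm = TRUE)
  cut(grp_means[as.character(df[[col]])], breaks = breaks, labels = FALSE)
}
# e.g. house_train$exterior_1 could equivalently be built as price_level(house_train, "Exterior1st")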
#ExterQual
price <- summarize(group_by(house_train, ExterQual),
                   mean(SalePrice, na.rm=T))
house_train$exterior_cond[house_train$ExterQual == "Ex"] <- 4
house_train$exterior_cond[house_train$ExterQual == "Gd"] <- 3
house_train$exterior_cond[house_train$ExterQual == "TA"] <- 2
house_train$exterior_cond[house_train$ExterQual == "Fa"] <- 1
#ExterCond
price <- summarize(group_by(house_train, ExterCond),
                   mean(SalePrice, na.rm=T))
house_train$exterior_cond2[house_train$ExterCond == "Ex"] <- 5
house_train$exterior_cond2[house_train$ExterCond == "Gd"] <- 4
house_train$exterior_cond2[house_train$ExterCond == "TA"] <- 3
house_train$exterior_cond2[house_train$ExterCond == "Fa"] <- 2
house_train$exterior_cond2[house_train$ExterCond == "Po"] <- 1
#HeatingQC
price <- summarize(group_by(house_train, HeatingQC),
                   mean(SalePrice, na.rm=T))
house_train$heatqual[house_train$HeatingQC == "Ex"] <- 5
house_train$heatqual[house_train$HeatingQC == "Gd"] <- 4
house_train$heatqual[house_train$HeatingQC == "TA"] <- 3
house_train$heatqual[house_train$HeatingQC == "Fa"] <- 2
house_train$heatqual[house_train$HeatingQC == "Po"] <- 1
# Now we drop the original categorical columns that have been converted to numeric and are no longer needed.
house_train$Neighborhood <- NULL
house_train$Exterior1st <- NULL
house_train$Exterior2nd <- NULL
house_train$ExterQual <- NULL
house_train$HeatingQC <- NULL
house_train$HouseStyle <- NULL
house_train$ExterCond <- NULL
# We have converted the ordinal variables to numeric by assigning levels based on mean sale price. Now we convert the remaining nominal variables to numeric with as.numeric(): first to factors, then to their integer codes.
for (i in colnames(house_train[, sapply(house_train, is.character)])) {
  house_train[, i] <- as.factor(house_train[, i])
}
for (i in colnames(house_train[, sapply(house_train, is.factor)])) {
  house_train[, i] <- as.numeric(house_train[, i])
}
str(house_train)
## 'data.frame': 1460 obs. of 56 variables:
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : num 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : num 65 80 68 60 84 85 75 80 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ LotShape : num 4 4 1 1 1 1 4 1 4 4 ...
## $ LotConfig : num 5 3 5 1 3 5 5 1 5 1 ...
## $ Condition1 : num 3 2 3 3 3 3 3 5 1 1 ...
## $ BldgType : num 1 1 1 1 1 1 1 1 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : num 2 2 2 2 2 2 2 2 2 2 ...
## $ MasVnrType : num 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : num 196 0 162 0 350 0 186 240 0 0 ...
## $ Foundation : num 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : num 3 3 3 5 3 3 1 3 5 5 ...
## $ BsmtExposure : num 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : num 3 1 3 1 3 3 3 1 7 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ CentralAir : num 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : num 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenQual : num 3 4 3 3 3 4 3 4 4 4 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : num 4 6 6 3 6 4 3 6 6 6 ...
## $ GarageType : num 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageFinish : num 3 3 3 4 3 4 3 3 4 3 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : num 6 6 6 6 6 6 6 6 2 3 ...
## $ GarageCond : num 6 6 6 6 6 6 6 6 6 6 ...
## $ PavedDrive : num 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ Fence : num 5 5 5 5 5 3 5 5 5 5 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : num 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition : num 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## $ nbhd_price_level : num 2 3 2 3 3 2 3 2 1 1 ...
## $ house_style_level: num 3 2 3 3 3 2 2 3 2 1 ...
## $ exterior_1 : num 3 2 3 2 3 3 3 2 2 2 ...
## $ exterior_2 : num 3 2 3 2 3 3 3 2 2 2 ...
## $ exterior_cond : num 3 2 3 2 3 2 3 2 2 2 ...
## $ exterior_cond2 : num 3 3 3 3 3 3 3 3 3 3 ...
## $ heatqual : num 5 5 5 4 5 5 5 5 4 5 ...
# Now that the data is ready, we start modelling.
# A Linear Model
# Since our outcome is a continuous numeric variable, a linear model is a natural baseline. First, let's simply include every predictor.
# Linear Regression Model
model.mlr <- lm(SalePrice ~ ., data = house_train)
summary(model.mlr)
##
## Call:
## lm(formula = SalePrice ~ ., data = house_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -433080 -14550 -1590 12372 271751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.238e+06 1.310e+06 0.945 0.344970
## MSSubClass -1.504e+02 4.045e+01 -3.718 0.000209 ***
## MSZoning 3.382e+03 1.596e+03 2.119 0.034297 *
## LotFrontage -1.695e+02 4.980e+01 -3.405 0.000681 ***
## LotArea 3.339e-01 9.620e-02 3.471 0.000535 ***
## LotShape -3.825e+02 6.574e+02 -0.582 0.560793
## LotConfig -4.205e+02 5.439e+02 -0.773 0.439617
## Condition1 -1.482e+02 1.002e+03 -0.148 0.882417
## BldgType -2.629e+03 1.429e+03 -1.840 0.066051 .
## OverallQual 1.115e+04 1.179e+03 9.457 < 2e-16 ***
## OverallCond 4.944e+03 1.042e+03 4.743 2.32e-06 ***
## YearBuilt 1.138e+02 7.073e+01 1.609 0.107920
## YearRemodAdd -2.325e+01 6.634e+01 -0.350 0.726055
## RoofStyle 3.373e+03 1.095e+03 3.079 0.002114 **
## MasVnrType 4.684e+03 1.591e+03 2.944 0.003292 **
## MasVnrArea 3.124e+01 6.024e+00 5.186 2.46e-07 ***
## Foundation 1.224e+03 1.638e+03 0.747 0.455209
## BsmtQual -4.383e+03 1.018e+03 -4.306 1.77e-05 ***
## BsmtExposure -3.175e+03 8.605e+02 -3.690 0.000233 ***
## BsmtFinType1 -1.192e+03 5.144e+02 -2.317 0.020665 *
## BsmtFinSF1 1.624e+00 5.631e+00 0.288 0.773143
## BsmtUnfSF -4.337e+00 5.875e+00 -0.738 0.460501
## TotalBsmtSF 9.739e+00 6.593e+00 1.477 0.139841
## CentralAir 1.631e+03 4.244e+03 0.384 0.700780
## Electrical -7.450e+01 9.060e+02 -0.082 0.934476
## X1stFlrSF 2.152e+01 1.865e+01 1.154 0.248763
## X2ndFlrSF 2.592e+01 1.836e+01 1.412 0.158217
## GrLivArea 2.698e+01 1.803e+01 1.496 0.134769
## BsmtFullBath 6.352e+03 2.438e+03 2.605 0.009282 **
## BsmtHalfBath 1.354e+03 3.792e+03 0.357 0.721123
## FullBath 4.890e+03 2.644e+03 1.849 0.064630 .
## HalfBath 3.833e+03 2.536e+03 1.512 0.130871
## BedroomAbvGr -2.067e+03 1.481e+03 -1.396 0.162942
## KitchenQual -9.511e+03 1.398e+03 -6.802 1.52e-11 ***
## Fireplaces 4.388e+03 1.657e+03 2.648 0.008186 **
## FireplaceQu -1.597e+03 7.831e+02 -2.039 0.041627 *
## GarageType 8.648e+02 6.002e+02 1.441 0.149892
## GarageFinish -2.237e+03 8.833e+02 -2.533 0.011414 *
## GarageArea 2.740e+01 5.778e+00 4.742 2.33e-06 ***
## GarageQual -5.686e+02 1.277e+03 -0.445 0.656086
## GarageCond 6.978e+02 1.429e+03 0.488 0.625523
## PavedDrive 7.473e+02 2.048e+03 0.365 0.715215
## WoodDeckSF 1.792e+01 7.402e+00 2.421 0.015599 *
## OpenPorchSF -2.801e+00 1.417e+01 -0.198 0.843351
## Fence -2.723e+02 8.351e+02 -0.326 0.744389
## MoSold -2.143e+02 3.167e+02 -0.677 0.498739
## YrSold -7.084e+02 6.483e+02 -1.093 0.274753
## SaleType -6.456e+02 5.655e+02 -1.142 0.253782
## SaleCondition 2.366e+03 8.202e+02 2.885 0.003975 **
## nbhd_price_level 1.569e+04 1.745e+03 8.991 < 2e-16 ***
## house_style_level -6.549e+03 2.894e+03 -2.263 0.023765 *
## exterior_1 -7.849e+03 5.284e+03 -1.485 0.137643
## exterior_2 9.194e+03 5.199e+03 1.768 0.077210 .
## exterior_cond 7.917e+03 2.558e+03 3.095 0.002007 **
## exterior_cond2 -2.530e+03 2.675e+03 -0.946 0.344414
## heatqual 6.446e+02 1.159e+03 0.556 0.578111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31810 on 1404 degrees of freedom
## Multiple R-squared: 0.8457, Adjusted R-squared: 0.8397
## F-statistic: 139.9 on 55 and 1404 DF, p-value: < 2.2e-16
#The adjusted R-squared is 0.8397, which implies that about 84% of the variation in SalePrice is explained by these independent variables.
# Prediction by linear regression model
predict.model.mlr <- predict(model.mlr, newdata = house_train)
#RMSE
RMSE <- sqrt(mean((house_train$SalePrice-predict.model.mlr)^2))
RMSE
## [1] 31194.71
# Computing the RMSE on the log scale for easy comparison with the other models.
RMSELOG.lm <- sqrt(mean((log(house_train$SalePrice) - log(predict.model.mlr))^2))
RMSELOG.lm
## [1] 0.1526588
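# Since this log-scale RMSE is computed several more times below, a small helper could be defined once.
# A minimal sketch (the name rmse_log is hypothetical; the later code keeps the explicit form):
rmse_log <- function(actual, predicted) sqrt(mean((log(actual) - log(predicted))^2))
rmse_log(house_train$SalePrice, predict.model.mlr) # identical to RMSELOG.lm above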
# Next we refit the model after removing the insignificant variables identified above.
reg1_Modified_2 <- lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + BldgType +
                        OverallQual + OverallCond + RoofStyle + MasVnrType + MasVnrArea +
                        BsmtQual + BsmtExposure + BsmtFinType1 + BsmtFullBath + BedroomAbvGr +
                        KitchenQual + Fireplaces + FireplaceQu + GarageFinish + WoodDeckSF +
                        GarageArea + SaleCondition + nbhd_price_level + house_style_level +
                        exterior_cond,
                      data = house_train)
summary(reg1_Modified_2)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## BldgType + OverallQual + OverallCond + RoofStyle + MasVnrType +
## MasVnrArea + BsmtQual + BsmtExposure + BsmtFinType1 + BsmtFullBath +
## BedroomAbvGr + KitchenQual + Fireplaces + FireplaceQu + GarageFinish +
## WoodDeckSF + GarageArea + SaleCondition + nbhd_price_level +
## house_style_level + exterior_cond, data = house_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -320590 -17644 -2816 14504 365856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.550e+04 1.602e+04 -0.968 0.333331
## MSSubClass -6.515e+01 3.855e+01 -1.690 0.091268 .
## LotFrontage -2.152e+01 5.216e+01 -0.413 0.679939
## LotArea 5.629e-01 1.033e-01 5.448 5.99e-08 ***
## BldgType -3.625e+03 1.398e+03 -2.593 0.009607 **
## OverallQual 1.668e+04 1.198e+03 13.921 < 2e-16 ***
## OverallCond 3.883e+03 9.008e+02 4.310 1.74e-05 ***
## RoofStyle 4.224e+03 1.170e+03 3.610 0.000316 ***
## MasVnrType 5.604e+03 1.699e+03 3.298 0.000998 ***
## MasVnrArea 4.754e+01 6.401e+00 7.427 1.90e-13 ***
## BsmtQual -6.472e+03 1.053e+03 -6.144 1.04e-09 ***
## BsmtExposure -2.537e+03 9.178e+02 -2.764 0.005785 **
## BsmtFinType1 -1.275e+03 4.846e+02 -2.630 0.008628 **
## BsmtFullBath 9.075e+03 2.168e+03 4.187 3.00e-05 ***
## BedroomAbvGr 1.006e+04 1.369e+03 7.349 3.35e-13 ***
## KitchenQual -1.214e+04 1.486e+03 -8.168 6.79e-16 ***
## Fireplaces 1.220e+04 1.669e+03 7.309 4.44e-13 ***
## FireplaceQu -1.010e+03 8.418e+02 -1.200 0.230386
## GarageFinish -3.265e+03 9.204e+02 -3.548 0.000401 ***
## WoodDeckSF 3.044e+01 7.968e+00 3.820 0.000139 ***
## GarageArea 4.832e+01 5.808e+00 8.319 < 2e-16 ***
## SaleCondition 1.344e+03 8.685e+02 1.548 0.121933
## nbhd_price_level 1.334e+04 1.698e+03 7.859 7.56e-15 ***
## house_style_level 5.370e+03 2.296e+03 2.339 0.019498 *
## exterior_cond 1.184e+04 2.666e+03 4.441 9.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35200 on 1435 degrees of freedom
## Multiple R-squared: 0.807, Adjusted R-squared: 0.8037
## F-statistic: 249.9 on 24 and 1435 DF, p-value: < 2.2e-16
#The adjusted R-squared of 0.8037 is only slightly lower than that of the full model, and this reduced model retains the important features.
par(mfrow=c(2,2)) # 4 charts in 1 panel
plot(reg1_Modified_2)

##Heteroskedasticity
# The word "heteroscedasticity" comes from the Greek, and quite literally means data with a different (hetero) dispersion (skedasis). In simple terms, heteroscedasticity is any set of data that isn't homoscedastic. More technically, it refers to data with unequal variability (scatter) across a set of second, predictor variables.
bptest(reg1_Modified_2) # Breusch-Pagan test
##
## studentized Breusch-Pagan test
##
## data: reg1_Modified_2
## BP = 272.65, df = 24, p-value < 2.2e-16
#The null hypothesis of the Breusch-Pagan test is homoskedasticity (constant residual variance);
#a p-value < 0.05 means the null hypothesis is rejected and heteroskedasticity is present.
# Here p < 2.2e-16, so we reject the null hypothesis: the residuals of this model are heteroskedastic. A common remedy, used for the regularised models below, is to model the log of SalePrice.
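# A quick check of that remedy: refit the same model with log(SalePrice) as the response and re-run the test.
# A minimal sketch (the object name reg_log is hypothetical; whether the heteroskedasticity disappears depends on the data):
reg_log <- update(reg1_Modified_2, log(SalePrice) ~ .)
bptest(reg_log)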
# Prediction by linear regression model
predict.reg1_Modified_2 <- predict(reg1_Modified_2, newdata = house_train)
#RMSE
RMSE <- sqrt(mean((house_train$SalePrice-predict.reg1_Modified_2)^2))
RMSE
## [1] 34892.61
RMSELOG.lm2 <- sqrt(mean((log(house_train$SalePrice) - log(predict.reg1_Modified_2))^2))
RMSELOG.lm2
## [1] 0.1798438
#With this many variables there is still a chance of correlation between predictors, which leads to multicollinearity and hurts model performance; regularised regression helps with this.
#Lasso Regression
#To guard against overfitting, we divide the dataset into training and validation sets.
# split the data into 70% train and 30% validate
split<-sample.split(house_train$SalePrice, SplitRatio = 0.7)
train_data<-subset(house_train, split==T)
validate_data<-subset(house_train, split==F)
# create data for training and test
X_train1<-as.matrix(train_data[, c(1:48,50:56)]) #Since 49th is the target Saleprice Variable
Y_train1<-log(train_data$SalePrice + 1)
X_test1<-as.matrix(validate_data[, c(1:48,50:56)])
Y_test1<-log(validate_data$SalePrice + 1)
#lasso
# set up caret model training parameters
CARET.TRAIN.CTRL <- trainControl(method = "repeatedcv",
                                 number = 5,
                                 repeats = 5,
                                 verboseIter = FALSE)
set.seed(123) # for reproducibility
model_lasso <- train(x = X_train1, y = Y_train1,
                     method = "glmnet",
                     metric = "RMSE",
                     maximize = FALSE,
                     trControl = CARET.TRAIN.CTRL,
                     tuneGrid = expand.grid(alpha = 1, # alpha = 1 is the lasso penalty
                                            lambda = c(1, 0.1, 0.05, 0.01, seq(0.009, 0.001, -0.001),
                                                       0.00075, 0.0005, 0.0001)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
model_lasso
## glmnet
##
## 1117 samples
## 55 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 894, 893, 895, 893, 893, 894, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared
## 0.00010 0.1548463 0.8631561
## 0.00050 0.1545157 0.8636847
## 0.00075 0.1542358 0.8641277
## 0.00100 0.1539988 0.8644989
## 0.00200 0.1534382 0.8653761
## 0.00300 0.1533044 0.8655919
## 0.00400 0.1533202 0.8655985
## 0.00500 0.1534929 0.8653776
## 0.00600 0.1537562 0.8650237
## 0.00700 0.1541201 0.8645302
## 0.00800 0.1545822 0.8639098
## 0.00900 0.1550865 0.8632390
## 0.01000 0.1556202 0.8625333
## 0.05000 0.1842950 0.8281603
## 0.10000 0.2216712 0.7957915
## 1.00000 0.4166745 NaN
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.003.
model_lasso$resample$RMSE
## [1] 0.2106952 0.2100043 0.1341817 0.1332913 0.1377757 0.1259775 0.1662398
## [8] 0.1241500 0.1391062 0.2438342 0.1259210 0.1674068 0.1379432 0.1320743
## [15] 0.1341333 0.1617226 0.1363260 0.1148617 0.1402917 0.1529127 0.1401305
## [22] 0.1245191 0.1405407 0.1993267 0.1992432
mean(model_lasso$resample$RMSE)
## [1] 0.1533044
coef <- data.frame(coef.name = dimnames(coef(model_lasso$finalModel, s = model_lasso$bestTune$lambda))[[1]],
                   coef.value = matrix(coef(model_lasso$finalModel, s = model_lasso$bestTune$lambda)))
coef <- coef[-1, ] # drop the intercept
picked_features <- nrow(filter(coef, coef.value != 0))
not_picked_features <- nrow(filter(coef, coef.value == 0))
cat("Lasso picked", picked_features, "variables and eliminated the other",
    not_picked_features, "variables\n")
## Lasso picked 42 variables and eliminated the other 13 variables
coef <- arrange(coef, -coef.value)
imp_coef <- rbind(head(coef, 10),
                  tail(coef, 10))
ggplot(imp_coef) +
  geom_bar(aes(x = reorder(coef.name, coef.value), y = coef.value),
           stat = "identity") +
  ylim(-0.5, 0.5) +
  coord_flip() +
  ggtitle("Coefficients in the Lasso Model") +
  theme(axis.title = element_blank())

### Ridge Regression
CARET.TRAIN.CTRL <- trainControl(method = "repeatedcv",
                                 number = 5,
                                 repeats = 5,
                                 verboseIter = FALSE)
lambdas <- seq(1,0,-0.001)
# train model
set.seed(123) # for reproducibility
model_ridge <- train(x = X_train1, y = Y_train1,
                     method = "glmnet",
                     metric = "RMSE",
                     maximize = FALSE,
                     trControl = CARET.TRAIN.CTRL,
                     tuneGrid = expand.grid(alpha = 0, # alpha = 0 is the ridge penalty
                                            lambda = lambdas))
ggplot(data = filter(model_ridge$results, RMSE < 0.17)) +
  geom_line(aes(x = lambda, y = RMSE))

mean(model_ridge$resample$RMSE)
## [1] 0.1532867
model_ridge2 <- train(x = X_train1, y = Y_train1,
                      method = "glmnet",
                      metric = "RMSE",
                      maximize = FALSE,
                      trControl = CARET.TRAIN.CTRL,
                      tuneGrid = expand.grid(alpha = 0, # ridge regression
                                             lambda = 0.06))
mean(model_ridge2$resample$RMSE)
## [1] 0.1547717
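# For a like-for-like comparison with the tree-based models below, the lasso fit can also be scored on the
# held-out validation set. A minimal sketch (pred_lasso and RMSE.lasso.validate are illustrative names;
# predictions and Y_test1 are already on the log scale):
pred_lasso <- predict(model_lasso, newdata = X_test1)
RMSE.lasso.validate <- sqrt(mean((Y_test1 - pred_lasso)^2))
RMSE.lasso.validate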
# Regression Tree: rpart() with method = "anova"
model.regtree<- rpart(SalePrice ~., data = train_data, method = "anova")
predict.regtree <- predict(model.regtree, validate_data)
# RMSE
RMSE.regressiontree <- sqrt(mean((log(validate_data$SalePrice) - log(predict.regtree))^2))
RMSE.regressiontree
## [1] 0.2123953
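# The fitted tree itself can be inspected with base plotting. A minimal sketch (the rpart.plot package would
# give nicer output if installed, but it is not loaded here):
par(mfrow = c(1, 1))
plot(model.regtree, uniform = TRUE, margin = 0.1)
text(model.regtree, cex = 0.7)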
# Random Forest
rf.model_1 <- randomForest(SalePrice ~ ., data = train_data)
rf.model_1
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train_data)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 18
##
## Mean of squared residuals: 918621356
## % Var explained: 87.21
predict_1<-predict(rf.model_1, validate_data)
# RMSE
RMSE.rfmodel <- sqrt(mean((log(validate_data$SalePrice) - log(predict_1))^2))
RMSE.rfmodel
## [1] 0.1091713
# Error Rate of Random Forest
plot(rf.model_1)

# Now we will tune the RF model
#From the error plot we see that the error is roughly constant beyond ntree = 100, so we retrain with ntree = 100 (and a larger mtry), which reduces the complexity of the forest.
rf.model_2 <- randomForest(SalePrice ~ ., data = train_data, ntree = 100, mtry = 20, importance = T, proximity = T)
rf.model_2
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train_data, ntree = 100, mtry = 20, importance = T, proximity = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 20
##
## Mean of squared residuals: 1029154928
## % Var explained: 85.67
#Thus 85.67% of the variation in SalePrice is explained by this tuned random forest model.
predict_tune <- predict(rf.model_2, validate_data)
RMSE.tune_rfmodel <- sqrt(mean((log(validate_data$SalePrice) - log(predict_tune))^2))
RMSE.tune_rfmodel
## [1] 0.1115868
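# mtry = 20 above was picked by hand; randomForest's tuneRF() can instead search mtry values by OOB error.
# A minimal sketch (its result is not used below; tune_res is an illustrative name):
set.seed(123)
tune_res <- tuneRF(x = train_data[, setdiff(names(train_data), "SalePrice")],
                   y = train_data$SalePrice,
                   ntreeTry = 100, stepFactor = 1.5, improve = 0.01, trace = FALSE)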
# No of nodes for the trees
hist(treesize(rf.model_2), main = 'No of nodes for the trees', col = 'blue')

varImpPlot(rf.model_2, col = 'blue', sort = T, n.var = 15, main = 'Top 15 important predictors')

# The variable importance plot above shows the most influential predictors in the random forest.
# Partial Dependence plot on GrLivArea.
partialPlot(rf.model_2, train_data, GrLivArea, '0')

#Gradient Boosting Model (package gbm)
gbm.model <- gbm(SalePrice ~ ., data = train_data, distribution = "laplace",
                 shrinkage = 0.05,
                 interaction.depth = 5,
                 bag.fraction = 0.66,
                 n.minobsinnode = 1,
                 cv.folds = 100,
                 keep.data = F,
                 verbose = F,
                 n.trees = 300)
predict.gbm <- predict(gbm.model, validate_data, n.trees = 300)
# RMSE
RMSE.gbm <- sqrt(mean((log(validate_data$SalePrice) - log(predict.gbm))^2))
RMSE.gbm
## [1] 0.1017775
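# gbm can also estimate the optimal number of trees from its CV folds, which may differ from the fixed 300 used above.
# A minimal sketch (best.iter is an illustrative name):
best.iter <- gbm.perf(gbm.model, method = "cv")
predict.gbm.best <- predict(gbm.model, validate_data, n.trees = best.iter)
sqrt(mean((log(validate_data$SalePrice) - log(predict.gbm.best))^2))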
#The RMSE of the Gradient Boosting Model is the lowest of all the models compared here.
#Conclusion:
#The Gradient Boosting model performs best, with the lowest RMSE, so we choose it as the final model.