require(caTools) # for splitting datasets
## Loading required package: caTools
## Warning: package 'caTools' was built under R version 3.3.3
require(ggplot2) # for data visualization
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
require(magrittr) #for pipe operator
## Loading required package: magrittr
## Warning: package 'magrittr' was built under R version 3.3.3
# require(stringr) #extracting string patterns
# require(Matrix) # matrix transformations
require(lmtest) # for the Breusch-Pagan test (heteroskedasticity)
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
require(glmnet) # ridge, lasso & elastinet
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 3.3.3
## Loading required package: Matrix
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.3.3
## Loaded glmnet 2.0-5
require(xgboost) # gbm
## Loading required package: xgboost
## Warning: package 'xgboost' was built under R version 3.3.3
require(randomForest)
## Loading required package: randomForest
## Warning: package 'randomForest' was built under R version 3.3.3
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
require(rpart)
## Loading required package: rpart
# require(Metrics) # rmse
require(dplyr) # load this last so that plyr doesn't mask its functions
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(caret) # one hot encoding
## Loading required package: caret
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
require(gbm)
## Loading required package: gbm
## Warning: package 'gbm' was built under R version 3.3.3
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
# require(scales) # plotting $$
# require(e1071) # skewness
require(corrplot) # correlation plot
## Loading required package: corrplot
## Warning: package 'corrplot' was built under R version 3.3.3
require(Metrics) # for the rmse() function (note the capitalised package name)
library(plotly)
## Warning: package 'plotly' was built under R version 3.3.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:xgboost':
##
## slice
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# library(shiny)
#Loading the training dataset
house_train <- read.csv("D:/Documents/Data Mining Project/Housing Prices Competetion/train.csv", header = T,stringsAsFactors = FALSE )
## Data preprocessing
# The data preprocessing step proved to be the most crucial in this project.
#Count and percentage of missing ('NA') values per column
na_count <- sort(sapply(house_train, function(x) sum(is.na(x))),decreasing = T)
percent_na_count <-sort(apply(house_train, 2, function(col)sum(is.na(col))/length(col)),decreasing = T)
dt_MissingData <- data.frame(percent_na_count,na_count)
head(dt_MissingData,20)
## percent_na_count na_count
## PoolQC 0.9952054795 1453
## MiscFeature 0.9630136986 1406
## Alley 0.9376712329 1369
## Fence 0.8075342466 1179
## FireplaceQu 0.4726027397 690
## LotFrontage 0.1773972603 259
## GarageType 0.0554794521 81
## GarageYrBlt 0.0554794521 81
## GarageFinish 0.0554794521 81
## GarageQual 0.0554794521 81
## GarageCond 0.0554794521 81
## BsmtExposure 0.0260273973 38
## BsmtFinType2 0.0260273973 38
## BsmtQual 0.0253424658 37
## BsmtCond 0.0253424658 37
## BsmtFinType1 0.0253424658 37
## MasVnrType 0.0054794521 8
## MasVnrArea 0.0054794521 8
## Electrical 0.0006849315 1
## MSSubClass 0.0000000000 0
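# The missingness table can also be visualised at a glance. A minimal sketch using ggplot2 (already loaded);
# the object name miss_plot_data is just illustrative:
miss_plot_data <- dt_MissingData[dt_MissingData$na_count > 0, ]
miss_plot_data$variable <- rownames(miss_plot_data)
ggplot(miss_plot_data, aes(x = reorder(variable, na_count), y = na_count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(x = NULL, y = "Number of missing values")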
#Variables PoolQC, MiscFeature and Alley have more than 90% missing data. Even if we imputed these columns with their mode or median, they would add little information to the model because of their near-zero variance, so we can drop them from the dataset. We will do this later, after imputing the missing values of all columns.
## Missing Values Imputation
###PoolQC: Pool quality. The data description states that NA in PoolQC means the house has no pool, so we replace NA with 'None'.
house_train$PoolQC[is.na(house_train$PoolQC)] <- 'None'
#Similarly, missing values in MiscFeature, Fence, Alley and FireplaceQu are replaced with 'None', since those properties simply do not have the feature.
house_train$MiscFeature[is.na(house_train$MiscFeature)] <- 'None'
house_train$Fence[is.na(house_train$Fence)] <- 'None'
house_train$Alley[is.na(house_train$Alley)] <- 'None'
house_train$FireplaceQu[is.na(house_train$FireplaceQu)] <- 'None'
###LotFrontage: Linear feet of street connected to property.
#There are many missing values for LotFrontage, too many to simply replace with 0. A house's lot frontage is likely to be similar to that of the other houses in its neighborhood, so we group by Neighborhood, compute the median LotFrontage of each group, and fill each missing value with the median of the neighborhood the house belongs to.
lot.by.nbrh <- house_train[, c('Neighborhood','LotFrontage')] %>%
  group_by(Neighborhood) %>%
  summarise(median = median(LotFrontage, na.rm = TRUE))
lot.by.nbrh
## # A tibble: 25 × 2
## Neighborhood median
## <chr> <dbl>
## 1 Blmngtn 43.0
## 2 Blueste 24.0
## 3 BrDale 21.0
## 4 BrkSide 52.0
## 5 ClearCr 80.0
## 6 CollgCr 70.0
## 7 Crawfor 74.0
## 8 Edwards 65.5
## 9 Gilbert 65.0
## 10 IDOTRR 60.0
## # ... with 15 more rows
lotNA <- which(is.na(house_train$LotFrontage))
#For loop to replace each missing LotFrontage value with the median LotFrontage of the respective neighbourhood
for (i in lotNA) {
  nbrhd <- house_train$Neighborhood[i]
  lot.median <- lot.by.nbrh[lot.by.nbrh$Neighborhood == nbrhd, 'median']
  house_train[i, 'LotFrontage'] <- lot.median[[1]]
}
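# An equivalent, more concise group-wise imputation with dplyr is sketched below (not run here, since the
# loop above has already filled the missing values; it assumes the same house_train data frame):
# house_train <- house_train %>%
#   group_by(Neighborhood) %>%
#   mutate(LotFrontage = ifelse(is.na(LotFrontage), median(LotFrontage, na.rm = TRUE), LotFrontage)) %>%
#   ungroup() %>%
#   as.data.frame()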
###Garage Features
# GarageType: Garage location
# GarageYrBlt: Year garage was built
# GarageFinish: Interior finish of the garage
# GarageCars: Size of garage in car capacity
# GarageArea: Size of garage in square feet
# GarageQual: Garage quality
# GarageCond: Garage condition
# It seems reasonable that most garages were built in the same year as the house itself. We can check this by counting how many houses were built in the same year as their garage.
length(which(house_train$GarageYrBlt == house_train$YearBuilt))
## [1] 1089
# 1089 of the 1460 houses have the same year for GarageYrBlt and YearBuilt. Let's replace any NA in GarageYrBlt with the value of YearBuilt.
gyr.na = which(is.na(house_train$GarageYrBlt))
house_train[gyr.na, 'GarageYrBlt'] = house_train[gyr.na, 'YearBuilt']
# The data description states that NA in the garage columns means the house has no garage at all, so we replace NA with 'None'.
house_train$GarageQual[is.na(house_train$GarageQual)] <- 'None'
house_train$GarageFinish[is.na(house_train$GarageFinish)] <- 'None'
house_train$GarageCond[is.na(house_train$GarageCond)] <- 'None'
house_train$GarageType[is.na(house_train$GarageType)] <- 'None'
###Basement Features
# BsmtExposure
# BsmtFinType2
# BsmtQual
# BsmtCond
# BsmtFinType1
length(which(house_train$TotalBsmtSF ==0 ))
## [1] 37
# 37 houses have TotalBsmtSF equal to 0, which implies they have no basement.
#Almost all of the missing values for the categorical basement features come from houses with 0 square feet of basement area. We can fill in these values with 'None' since these houses certainly don't have basements.
bsmt.cols <- c('BsmtExposure','BsmtFinType2','BsmtQual','BsmtCond','BsmtFinType1' )
for (col in bsmt.cols) {
  house_train[sapply(house_train[col], is.na), col] <- 'None'
}
# MasVnrType: Masonry veneer type
# MasVnrArea: Masonry veneer area in square feet
# Missing areas are replaced with 0 and missing types with 'None'.
house_train$MasVnrType[is.na(house_train$MasVnrType)] <- 'None'
house_train$MasVnrArea[is.na(house_train$MasVnrArea)] <- 0
#Electrical
#There is only one missing value in the Electrical feature, so we replace it with the most frequent value (the mode) of that column.
p <- plot_ly(x = house_train$Electrical, type = "histogram")
p
## Warning: Ignoring 1 observations
house_train$Electrical[is.na(house_train$Electrical)] <- 'SBrkr'
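# The hard-coded 'SBrkr' is simply the most frequent Electrical value; it could also be derived
# programmatically. A minimal sketch (elec_mode is an illustrative name):
elec_mode <- names(which.max(table(house_train$Electrical)))
elec_mode # expected to be "SBrkr"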
##Variable Selection - Feature Selection
#We reduce the number of variables by deleting columns with near-zero variance. For example, look at the Street variable histogram: almost all houses share the same value, 'Pave'. Because such a variable has almost no variance, it conveys no information to the model and only reduces its stability.
p <- plot_ly(x = house_train$Street, type = "histogram")
p
###The nearZeroVar() function in the caret package flags variables with zero or near-zero variance.
nzv.data = nearZeroVar(house_train, saveMetrics = TRUE)
nzv.data <- nzv.data[nzv.data[,"zeroVar"] + nzv.data[,"nzv"] > 0, ]
nzv.data
## freqRatio percentUnique zeroVar nzv
## Street 242.33333 0.1369863 FALSE TRUE
## Alley 27.38000 0.2054795 FALSE TRUE
## LandContour 20.80952 0.2739726 FALSE TRUE
## Utilities 1459.00000 0.1369863 FALSE TRUE
## LandSlope 21.26154 0.2054795 FALSE TRUE
## Condition2 240.83333 0.5479452 FALSE TRUE
## RoofMatl 130.36364 0.5479452 FALSE TRUE
## BsmtCond 20.16923 0.3424658 FALSE TRUE
## BsmtFinType2 23.25926 0.4794521 FALSE TRUE
## BsmtFinSF2 258.60000 9.8630137 FALSE TRUE
## Heating 79.33333 0.4109589 FALSE TRUE
## LowQualFinSF 478.00000 1.6438356 FALSE TRUE
## KitchenAbvGr 21.41538 0.2739726 FALSE TRUE
## Functional 40.00000 0.4794521 FALSE TRUE
## EnclosedPorch 83.46667 8.2191781 FALSE TRUE
## X3SsnPorch 478.66667 1.3698630 FALSE TRUE
## ScreenPorch 224.00000 5.2054795 FALSE TRUE
## PoolArea 1453.00000 0.5479452 FALSE TRUE
## PoolQC 484.33333 0.2739726 FALSE TRUE
## MiscFeature 28.69388 0.3424658 FALSE TRUE
## MiscVal 128.00000 1.4383562 FALSE TRUE
nzv.columns <- rownames(nzv.data)[nzv.data$nzv == TRUE]
nzv.columns
## [1] "Street" "Alley" "LandContour" "Utilities"
## [5] "LandSlope" "Condition2" "RoofMatl" "BsmtCond"
## [9] "BsmtFinType2" "BsmtFinSF2" "Heating" "LowQualFinSF"
## [13] "KitchenAbvGr" "Functional" "EnclosedPorch" "X3SsnPorch"
## [17] "ScreenPorch" "PoolArea" "PoolQC" "MiscFeature"
## [21] "MiscVal"
#Now we remove the variables with near-zero variance
house_train <- house_train[, !names(house_train) %in% nzv.columns]
dim(house_train)[2]
## [1] 59
#The number of variables has been reduced to 59.
##Correlation
#Correlation between independent variables leads to multicollinearity, which inflates the standard errors of the coefficients and makes individual estimates unreliable. From the correlation plot we can identify highly correlated pairs of variables and remove one of each, and we can also see how each variable relates to the target variable SalePrice.
cont <- sapply(house_train, is.numeric)
df_cont<-data.frame(house_train[, cont])
correlations <- cor(df_cont)
# correlations
row_indic <- apply(correlations, 1, function(x) sum(x > 0.3 | x < -0.3) > 1)
correlations<- correlations[row_indic ,row_indic ]
corrplot(correlations, method="square")

# From the correlation plot we can identify the features most strongly correlated with SalePrice.
# OverallQual: Overall material and finish quality
# GrLivArea: Above grade (ground) living area square feet
# These 2 features have the highest correlation with SalePrice, which is very reasonable.
#
# As expected, GarageYrBlt and YearBuilt are highly correlated, so we drop GarageYrBlt and keep YearBuilt.
house_train$GarageYrBlt <- NULL
# GarageCars: Size of garage in car capacity
# GarageArea: Size of garage in square feet
# These 2 features are the next most strongly correlated with SalePrice. The correlation between the two is also extremely high, 0.88, which makes sense as the area of the garage constrains how many cars can fit, so we keep only GarageArea.
house_train$GarageCars <- NULL
# TotalBsmtSF: Total square feet of basement area
# 1stFlrSF: First Floor square feet
#These 2 features also have a strong linear relationship with one another, which makes sense as the size of the basement typically follows the size of the house's 1st floor.
# GrLivArea: Above grade (ground) living area square feet
# TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
# YearBuilt: Original construction date
# YearRemodAdd: Remodel date
# A positive correlation for both of these is not surprising: newer and recently remodelled homes tend to list at higher prices than older ones.
house_train$TotRmsAbvGrd <- NULL
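# Beyond reading the plot by eye, caret's findCorrelation() can flag highly correlated numeric columns
# programmatically. A minimal sketch (the 0.8 cutoff is an arbitrary choice, not one used elsewhere in this analysis):
high_corr <- findCorrelation(cor(house_train[, sapply(house_train, is.numeric)]), cutoff = 0.8, names = TRUE)
high_corr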
##Converting Categorical Data to Numeric Data
# The models below need numeric inputs, so the categorical variables must be encoded as numbers.
# First we transform the variables whose categories can be ordered into numeric values. We group each feature by its possible values, look at the mean SalePrice of each unique value, and map higher-priced/better-quality categories to larger numbers and lower-priced/lower-quality categories to smaller numbers. The remaining nominal variables are converted to integer codes at the end via as.factor() and as.numeric().
# get names of categorical features
cat_features = names(which(sapply(house_train, is.character)))
cat_features
## [1] "MSZoning" "LotShape" "LotConfig" "Neighborhood"
## [5] "Condition1" "BldgType" "HouseStyle" "RoofStyle"
## [9] "Exterior1st" "Exterior2nd" "MasVnrType" "ExterQual"
## [13] "ExterCond" "Foundation" "BsmtQual" "BsmtExposure"
## [17] "BsmtFinType1" "HeatingQC" "CentralAir" "Electrical"
## [21] "KitchenQual" "FireplaceQu" "GarageType" "GarageFinish"
## [25] "GarageQual" "GarageCond" "PavedDrive" "Fence"
## [29] "SaleType" "SaleCondition"
#There are 30 Categorical variables
#Neighbourhood
nbhdprice <- summarize(group_by(house_train, Neighborhood),
                       mean(SalePrice, na.rm=T))
#nbhdprice[order(nbhdprice$`mean(SalePrice, na.rm = T)`),]
nbhdprice_lo <- filter(nbhdprice, nbhdprice$`mean(SalePrice, na.rm = T)` < 140000)
nbhdprice_med <- filter(nbhdprice, nbhdprice$`mean(SalePrice, na.rm = T)` < 200000 &
                          nbhdprice$`mean(SalePrice, na.rm = T)` >= 140000)
nbhdprice_hi <- filter(nbhdprice, nbhdprice$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$nbhd_price_level[house_train$Neighborhood %in% nbhdprice_lo$Neighborhood] <- 1
house_train$nbhd_price_level[house_train$Neighborhood %in% nbhdprice_med$Neighborhood] <- 2
house_train$nbhd_price_level[house_train$Neighborhood %in% nbhdprice_hi$Neighborhood] <- 3
#housestyle
housestyle_price <- summarize(group_by(house_train, HouseStyle),
                              mean(SalePrice, na.rm=T))
housestyle_lo <- filter(housestyle_price, housestyle_price$`mean(SalePrice, na.rm = T)` < 140000)
housestyle_med <- filter(housestyle_price, housestyle_price$`mean(SalePrice, na.rm = T)` < 200000 &
                           housestyle_price$`mean(SalePrice, na.rm = T)` >= 140000)
housestyle_hi <- filter(housestyle_price, housestyle_price$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$house_style_level[house_train$HouseStyle %in% housestyle_lo$HouseStyle] <- 1
house_train$house_style_level[house_train$HouseStyle %in% housestyle_med$HouseStyle] <- 2
house_train$house_style_level[house_train$HouseStyle %in% housestyle_hi$HouseStyle] <- 3
#Exterior1st
price <- summarize(group_by(house_train, Exterior1st),
                   mean(SalePrice, na.rm=T))
matl_lo_1 <- filter(price, price$`mean(SalePrice, na.rm = T)` < 140000)
matl_med_1 <- filter(price, price$`mean(SalePrice, na.rm = T)` < 200000 &
                       price$`mean(SalePrice, na.rm = T)` >= 140000)
matl_hi_1 <- filter(price, price$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$exterior_1[house_train$Exterior1st %in% matl_lo_1$Exterior1st] <- 1
house_train$exterior_1[house_train$Exterior1st %in% matl_med_1$Exterior1st] <- 2
house_train$exterior_1[house_train$Exterior1st %in% matl_hi_1$Exterior1st] <- 3
#exterior2
price <- summarize(group_by(house_train, Exterior2nd),
                   mean(SalePrice, na.rm=T))
matl_lo <- filter(price, price$`mean(SalePrice, na.rm = T)` < 140000)
matl_med <- filter(price, price$`mean(SalePrice, na.rm = T)` < 200000 &
                     price$`mean(SalePrice, na.rm = T)` >= 140000)
matl_hi <- filter(price, price$`mean(SalePrice, na.rm = T)` >= 200000)
house_train$exterior_2[house_train$Exterior2nd %in% matl_lo$Exterior2nd] <- 1
house_train$exterior_2[house_train$Exterior2nd %in% matl_med$Exterior2nd] <- 2
house_train$exterior_2[house_train$Exterior2nd %in% matl_hi$Exterior2nd] <- 3
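# The same three-level bucketing is repeated above for Neighborhood, HouseStyle, Exterior1st and Exterior2nd.
# A small helper could remove that duplication. A minimal sketch (the function name price_level is
# hypothetical, and the handling of values exactly equal to 140000/200000 may differ slightly from the filters above):
price_level <- function(df, col, target = "SalePrice", breaks = c(-Inf, 140000, 200000, Inf)) {
  grp_means <- tapply(df[[target]], df[[col]], mean, na.rm = TRUE)
  cut(grp_means[as.character(df[[col]])], breaks = breaks, labels = FALSE)
}
# e.g. house_train$exterior_1 could equivalently be built as price_level(house_train, "Exterior1st")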
#ExterQual
price <- summarize(group_by(house_train, ExterQual),
                   mean(SalePrice, na.rm=T))
house_train$exterior_cond[house_train$ExterQual == "Ex"] <- 4
house_train$exterior_cond[house_train$ExterQual == "Gd"] <- 3
house_train$exterior_cond[house_train$ExterQual == "TA"] <- 2
house_train$exterior_cond[house_train$ExterQual == "Fa"] <- 1
#ExterCond
price <- summarize(group_by(house_train, ExterCond),
                   mean(SalePrice, na.rm=T))
house_train$exterior_cond2[house_train$ExterCond == "Ex"] <- 5
house_train$exterior_cond2[house_train$ExterCond == "Gd"] <- 4
house_train$exterior_cond2[house_train$ExterCond == "TA"] <- 3
house_train$exterior_cond2[house_train$ExterCond == "Fa"] <- 2
house_train$exterior_cond2[house_train$ExterCond == "Po"] <- 1
#HeatingQC
price <- summarize(group_by(house_train, HeatingQC),
                   mean(SalePrice, na.rm=T))
house_train$heatqual[house_train$HeatingQC == "Ex"] <- 5
house_train$heatqual[house_train$HeatingQC == "Gd"] <- 4
house_train$heatqual[house_train$HeatingQC == "TA"] <- 3
house_train$heatqual[house_train$HeatingQC == "Fa"] <- 2
house_train$heatqual[house_train$HeatingQC == "Po"] <- 1
# Now we drop the original categorical columns that have been converted to numeric and are no longer needed.
house_train$Neighborhood <- NULL
house_train$Exterior1st <- NULL
house_train$Exterior2nd <- NULL
house_train$ExterQual <- NULL
house_train$HeatingQC <- NULL
house_train$HouseStyle <- NULL
house_train$ExterCond <- NULL
# We have converted the ordinal variables to numeric by assigning levels based on mean sale price. Now we convert the remaining nominal variables to numeric with as.numeric(): first to factors, then to their integer codes.
for (i in colnames(house_train[, sapply(house_train, is.character)])) {
  house_train[, i] <- as.factor(house_train[, i])
}
for (i in colnames(house_train[, sapply(house_train, is.factor)])) {
  house_train[, i] <- as.numeric(house_train[, i])
}
str(house_train)
## 'data.frame': 1460 obs. of 56 variables:
## $ MSSubClass : int 60 20 60 70 60 50 20 60 50 190 ...
## $ MSZoning : num 4 4 4 4 4 4 4 4 5 4 ...
## $ LotFrontage : num 65 80 68 60 84 85 75 80 51 50 ...
## $ LotArea : int 8450 9600 11250 9550 14260 14115 10084 10382 6120 7420 ...
## $ LotShape : num 4 4 1 1 1 1 4 1 4 4 ...
## $ LotConfig : num 5 3 5 1 3 5 5 1 5 1 ...
## $ Condition1 : num 3 2 3 3 3 3 3 5 1 1 ...
## $ BldgType : num 1 1 1 1 1 1 1 1 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ YearRemodAdd : int 2003 1976 2002 1970 2000 1995 2005 1973 1950 1950 ...
## $ RoofStyle : num 2 2 2 2 2 2 2 2 2 2 ...
## $ MasVnrType : num 2 3 2 3 2 3 4 4 3 3 ...
## $ MasVnrArea : num 196 0 162 0 350 0 186 240 0 0 ...
## $ Foundation : num 3 2 3 1 3 6 3 2 1 1 ...
## $ BsmtQual : num 3 3 3 5 3 3 1 3 5 5 ...
## $ BsmtExposure : num 4 2 3 4 1 4 1 3 4 4 ...
## $ BsmtFinType1 : num 3 1 3 1 3 3 3 1 7 3 ...
## $ BsmtFinSF1 : int 706 978 486 216 655 732 1369 859 0 851 ...
## $ BsmtUnfSF : int 150 284 434 540 490 64 317 216 952 140 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ CentralAir : num 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : num 5 5 5 5 5 5 5 5 2 5 ...
## $ X1stFlrSF : int 856 1262 920 961 1145 796 1694 1107 1022 1077 ...
## $ X2ndFlrSF : int 854 0 866 756 1053 566 0 983 752 0 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BsmtFullBath : int 1 0 1 1 1 1 1 1 0 1 ...
## $ BsmtHalfBath : int 0 1 0 0 0 0 0 0 0 0 ...
## $ FullBath : int 2 2 2 1 2 1 2 2 2 1 ...
## $ HalfBath : int 1 0 1 0 1 1 0 1 0 0 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenQual : num 3 4 3 3 3 4 3 4 4 4 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : num 4 6 6 3 6 4 3 6 6 6 ...
## $ GarageType : num 2 2 2 6 2 2 2 2 6 2 ...
## $ GarageFinish : num 3 3 3 4 3 4 3 3 4 3 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : num 6 6 6 6 6 6 6 6 2 3 ...
## $ GarageCond : num 6 6 6 6 6 6 6 6 6 6 ...
## $ PavedDrive : num 3 3 3 3 3 3 3 3 3 3 ...
## $ WoodDeckSF : int 0 298 0 0 192 40 255 235 90 0 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ Fence : num 5 5 5 5 5 3 5 5 5 5 ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : num 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition : num 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
## $ nbhd_price_level : num 2 3 2 3 3 2 3 2 1 1 ...
## $ house_style_level: num 3 2 3 3 3 2 2 3 2 1 ...
## $ exterior_1 : num 3 2 3 2 3 3 3 2 2 2 ...
## $ exterior_2 : num 3 2 3 2 3 3 3 2 2 2 ...
## $ exterior_cond : num 3 2 3 2 3 2 3 2 2 2 ...
## $ exterior_cond2 : num 3 3 3 3 3 3 3 3 3 3 ...
## $ heatqual : num 5 5 5 4 5 5 5 5 4 5 ...
# Now that the data is ready, we start modelling.
# A Linear Model
# Since our outcome is a continuous numeric variable, a linear model is a natural baseline. First, let's simply include every predictor.
# Linear Regression Model
model.mlr <- lm(SalePrice ~ ., data = house_train)
summary(model.mlr)
##
## Call:
## lm(formula = SalePrice ~ ., data = house_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -433080 -14550 -1590 12372 271751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.238e+06 1.310e+06 0.945 0.344970
## MSSubClass -1.504e+02 4.045e+01 -3.718 0.000209 ***
## MSZoning 3.382e+03 1.596e+03 2.119 0.034297 *
## LotFrontage -1.695e+02 4.980e+01 -3.405 0.000681 ***
## LotArea 3.339e-01 9.620e-02 3.471 0.000535 ***
## LotShape -3.825e+02 6.574e+02 -0.582 0.560793
## LotConfig -4.205e+02 5.439e+02 -0.773 0.439617
## Condition1 -1.482e+02 1.002e+03 -0.148 0.882417
## BldgType -2.629e+03 1.429e+03 -1.840 0.066051 .
## OverallQual 1.115e+04 1.179e+03 9.457 < 2e-16 ***
## OverallCond 4.944e+03 1.042e+03 4.743 2.32e-06 ***
## YearBuilt 1.138e+02 7.073e+01 1.609 0.107920
## YearRemodAdd -2.325e+01 6.634e+01 -0.350 0.726055
## RoofStyle 3.373e+03 1.095e+03 3.079 0.002114 **
## MasVnrType 4.684e+03 1.591e+03 2.944 0.003292 **
## MasVnrArea 3.124e+01 6.024e+00 5.186 2.46e-07 ***
## Foundation 1.224e+03 1.638e+03 0.747 0.455209
## BsmtQual -4.383e+03 1.018e+03 -4.306 1.77e-05 ***
## BsmtExposure -3.175e+03 8.605e+02 -3.690 0.000233 ***
## BsmtFinType1 -1.192e+03 5.144e+02 -2.317 0.020665 *
## BsmtFinSF1 1.624e+00 5.631e+00 0.288 0.773143
## BsmtUnfSF -4.337e+00 5.875e+00 -0.738 0.460501
## TotalBsmtSF 9.739e+00 6.593e+00 1.477 0.139841
## CentralAir 1.631e+03 4.244e+03 0.384 0.700780
## Electrical -7.450e+01 9.060e+02 -0.082 0.934476
## X1stFlrSF 2.152e+01 1.865e+01 1.154 0.248763
## X2ndFlrSF 2.592e+01 1.836e+01 1.412 0.158217
## GrLivArea 2.698e+01 1.803e+01 1.496 0.134769
## BsmtFullBath 6.352e+03 2.438e+03 2.605 0.009282 **
## BsmtHalfBath 1.354e+03 3.792e+03 0.357 0.721123
## FullBath 4.890e+03 2.644e+03 1.849 0.064630 .
## HalfBath 3.833e+03 2.536e+03 1.512 0.130871
## BedroomAbvGr -2.067e+03 1.481e+03 -1.396 0.162942
## KitchenQual -9.511e+03 1.398e+03 -6.802 1.52e-11 ***
## Fireplaces 4.388e+03 1.657e+03 2.648 0.008186 **
## FireplaceQu -1.597e+03 7.831e+02 -2.039 0.041627 *
## GarageType 8.648e+02 6.002e+02 1.441 0.149892
## GarageFinish -2.237e+03 8.833e+02 -2.533 0.011414 *
## GarageArea 2.740e+01 5.778e+00 4.742 2.33e-06 ***
## GarageQual -5.686e+02 1.277e+03 -0.445 0.656086
## GarageCond 6.978e+02 1.429e+03 0.488 0.625523
## PavedDrive 7.473e+02 2.048e+03 0.365 0.715215
## WoodDeckSF 1.792e+01 7.402e+00 2.421 0.015599 *
## OpenPorchSF -2.801e+00 1.417e+01 -0.198 0.843351
## Fence -2.723e+02 8.351e+02 -0.326 0.744389
## MoSold -2.143e+02 3.167e+02 -0.677 0.498739
## YrSold -7.084e+02 6.483e+02 -1.093 0.274753
## SaleType -6.456e+02 5.655e+02 -1.142 0.253782
## SaleCondition 2.366e+03 8.202e+02 2.885 0.003975 **
## nbhd_price_level 1.569e+04 1.745e+03 8.991 < 2e-16 ***
## house_style_level -6.549e+03 2.894e+03 -2.263 0.023765 *
## exterior_1 -7.849e+03 5.284e+03 -1.485 0.137643
## exterior_2 9.194e+03 5.199e+03 1.768 0.077210 .
## exterior_cond 7.917e+03 2.558e+03 3.095 0.002007 **
## exterior_cond2 -2.530e+03 2.675e+03 -0.946 0.344414
## heatqual 6.446e+02 1.159e+03 0.556 0.578111
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31810 on 1404 degrees of freedom
## Multiple R-squared: 0.8457, Adjusted R-squared: 0.8397
## F-statistic: 139.9 on 55 and 1404 DF, p-value: < 2.2e-16
#The adjusted R-squared is 0.8397, which implies that about 84% of the variation in SalePrice is explained by these independent variables.
# Prediction by linear regression model
predict.model.mlr <- predict(model.mlr, newdata = house_train)
#RMSE
RMSE <- sqrt(mean((house_train$SalePrice-predict.model.mlr)^2))
RMSE
## [1] 31194.71
# Computing the RMSE on the log scale for easy comparison with the other models.
RMSELOG.lm <- sqrt(mean((log(house_train$SalePrice) - log(predict.model.mlr))^2))
RMSELOG.lm
## [1] 0.1526588
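# Since this log-scale RMSE is computed several more times below, a small helper could be defined once.
# A minimal sketch (the name rmse_log is hypothetical; the later code keeps the explicit form):
rmse_log <- function(actual, predicted) sqrt(mean((log(actual) - log(predicted))^2))
rmse_log(house_train$SalePrice, predict.model.mlr) # identical to RMSELOG.lm above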
# Next we refit the model after removing the insignificant variables identified above.
reg1_Modified_2 <- lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea + BldgType +
                        OverallQual + OverallCond + RoofStyle + MasVnrType + MasVnrArea +
                        BsmtQual + BsmtExposure + BsmtFinType1 + BsmtFullBath + BedroomAbvGr +
                        KitchenQual + Fireplaces + FireplaceQu + GarageFinish + WoodDeckSF +
                        GarageArea + SaleCondition + nbhd_price_level + house_style_level +
                        exterior_cond,
                      data = house_train)
summary(reg1_Modified_2)
##
## Call:
## lm(formula = SalePrice ~ MSSubClass + LotFrontage + LotArea +
## BldgType + OverallQual + OverallCond + RoofStyle + MasVnrType +
## MasVnrArea + BsmtQual + BsmtExposure + BsmtFinType1 + BsmtFullBath +
## BedroomAbvGr + KitchenQual + Fireplaces + FireplaceQu + GarageFinish +
## WoodDeckSF + GarageArea + SaleCondition + nbhd_price_level +
## house_style_level + exterior_cond, data = house_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -320590 -17644 -2816 14504 365856
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.550e+04 1.602e+04 -0.968 0.333331
## MSSubClass -6.515e+01 3.855e+01 -1.690 0.091268 .
## LotFrontage -2.152e+01 5.216e+01 -0.413 0.679939
## LotArea 5.629e-01 1.033e-01 5.448 5.99e-08 ***
## BldgType -3.625e+03 1.398e+03 -2.593 0.009607 **
## OverallQual 1.668e+04 1.198e+03 13.921 < 2e-16 ***
## OverallCond 3.883e+03 9.008e+02 4.310 1.74e-05 ***
## RoofStyle 4.224e+03 1.170e+03 3.610 0.000316 ***
## MasVnrType 5.604e+03 1.699e+03 3.298 0.000998 ***
## MasVnrArea 4.754e+01 6.401e+00 7.427 1.90e-13 ***
## BsmtQual -6.472e+03 1.053e+03 -6.144 1.04e-09 ***
## BsmtExposure -2.537e+03 9.178e+02 -2.764 0.005785 **
## BsmtFinType1 -1.275e+03 4.846e+02 -2.630 0.008628 **
## BsmtFullBath 9.075e+03 2.168e+03 4.187 3.00e-05 ***
## BedroomAbvGr 1.006e+04 1.369e+03 7.349 3.35e-13 ***
## KitchenQual -1.214e+04 1.486e+03 -8.168 6.79e-16 ***
## Fireplaces 1.220e+04 1.669e+03 7.309 4.44e-13 ***
## FireplaceQu -1.010e+03 8.418e+02 -1.200 0.230386
## GarageFinish -3.265e+03 9.204e+02 -3.548 0.000401 ***
## WoodDeckSF 3.044e+01 7.968e+00 3.820 0.000139 ***
## GarageArea 4.832e+01 5.808e+00 8.319 < 2e-16 ***
## SaleCondition 1.344e+03 8.685e+02 1.548 0.121933
## nbhd_price_level 1.334e+04 1.698e+03 7.859 7.56e-15 ***
## house_style_level 5.370e+03 2.296e+03 2.339 0.019498 *
## exterior_cond 1.184e+04 2.666e+03 4.441 9.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35200 on 1435 degrees of freedom
## Multiple R-squared: 0.807, Adjusted R-squared: 0.8037
## F-statistic: 249.9 on 24 and 1435 DF, p-value: < 2.2e-16
#The adjusted R-squared of 0.8037 is only slightly lower than that of the full model, and this reduced model retains the important features.
par(mfrow=c(2,2)) # 4 charts in 1 panel
plot(reg1_Modified_2)

##Heteroskedasticity
# The word "heteroscedasticity" comes from the Greek, and quite literally means data with a different (hetero) dispersion (skedasis). In simple terms, heteroscedasticity is any set of data that isn't homoscedastic. More technically, it refers to data with unequal variability (scatter) across a set of second, predictor variables.
bptest(reg1_Modified_2) # Breusch-Pagan test
##
## studentized Breusch-Pagan test
##
## data: reg1_Modified_2
## BP = 272.65, df = 24, p-value < 2.2e-16
#The null hypothesis of the Breusch-Pagan test is homoskedasticity (constant residual variance);
#a p-value < 0.05 means the null hypothesis is rejected and heteroskedasticity is present.
# Here p < 2.2e-16, so we reject the null hypothesis: the residuals of this model are heteroskedastic. A common remedy, used for the regularised models below, is to model the log of SalePrice.
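# A quick check of that remedy: refit the same model with log(SalePrice) as the response and re-run the test.
# A minimal sketch (the object name reg_log is hypothetical; whether the heteroskedasticity disappears depends on the data):
reg_log <- update(reg1_Modified_2, log(SalePrice) ~ .)
bptest(reg_log)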
# Prediction by linear regression model
predict.reg1_Modified_2 <- predict(reg1_Modified_2, newdata = house_train)
#RMSE
RMSE <- sqrt(mean((house_train$SalePrice-predict.reg1_Modified_2)^2))
RMSE
## [1] 34892.61
RMSELOG.lm2 <- sqrt(mean((log(house_train$SalePrice) - log(predict.reg1_Modified_2))^2))
RMSELOG.lm2
## [1] 0.1798438
#With this many variables there is still a chance of correlation between predictors, which leads to multicollinearity and hurts model performance; regularised regression helps with this.
#Lasso Regression
#To guard against overfitting, we divide the dataset into training and validation sets.
# split the data into 70% train and 30% validate
split<-sample.split(house_train$SalePrice, SplitRatio = 0.7)
train_data<-subset(house_train, split==T)
validate_data<-subset(house_train, split==F)
# create data for training and test
X_train1<-as.matrix(train_data[, c(1:48,50:56)]) #Since 49th is the target Saleprice Variable
Y_train1<-log(train_data$SalePrice + 1)
X_test1<-as.matrix(validate_data[, c(1:48,50:56)])
Y_test1<-log(validate_data$SalePrice + 1)
#lasso
# set up caret model training parameters
CARET.TRAIN.CTRL <- trainControl(method = "repeatedcv",
                                 number = 5,
                                 repeats = 5,
                                 verboseIter = FALSE)
set.seed(123) # for reproducibility
model_lasso <- train(x = X_train1, y = Y_train1,
                     method = "glmnet",
                     metric = "RMSE",
                     maximize = FALSE,
                     trControl = CARET.TRAIN.CTRL,
                     tuneGrid = expand.grid(alpha = 1, # alpha = 1 is the lasso penalty
                                            lambda = c(1, 0.1, 0.05, 0.01, seq(0.009, 0.001, -0.001),
                                                       0.00075, 0.0005, 0.0001)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info =
## trainInfo, : There were missing values in resampled performance measures.
model_lasso
## glmnet
##
## 1117 samples
## 55 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold, repeated 5 times)
## Summary of sample sizes: 894, 893, 895, 893, 893, 894, ...
## Resampling results across tuning parameters:
##
## lambda RMSE Rsquared
## 0.00010 0.1548463 0.8631561
## 0.00050 0.1545157 0.8636847
## 0.00075 0.1542358 0.8641277
## 0.00100 0.1539988 0.8644989
## 0.00200 0.1534382 0.8653761
## 0.00300 0.1533044 0.8655919
## 0.00400 0.1533202 0.8655985
## 0.00500 0.1534929 0.8653776
## 0.00600 0.1537562 0.8650237
## 0.00700 0.1541201 0.8645302
## 0.00800 0.1545822 0.8639098
## 0.00900 0.1550865 0.8632390
## 0.01000 0.1556202 0.8625333
## 0.05000 0.1842950 0.8281603
## 0.10000 0.2216712 0.7957915
## 1.00000 0.4166745 NaN
##
## Tuning parameter 'alpha' was held constant at a value of 1
## RMSE was used to select the optimal model using the smallest value.
## The final values used for the model were alpha = 1 and lambda = 0.003.
model_lasso$resample$RMSE
## [1] 0.2106952 0.2100043 0.1341817 0.1332913 0.1377757 0.1259775 0.1662398
## [8] 0.1241500 0.1391062 0.2438342 0.1259210 0.1674068 0.1379432 0.1320743
## [15] 0.1341333 0.1617226 0.1363260 0.1148617 0.1402917 0.1529127 0.1401305
## [22] 0.1245191 0.1405407 0.1993267 0.1992432
mean(model_lasso$resample$RMSE)
## [1] 0.1533044
coef <- data.frame(coef.name = dimnames(coef(model_lasso$finalModel, s = model_lasso$bestTune$lambda))[[1]],
                   coef.value = matrix(coef(model_lasso$finalModel, s = model_lasso$bestTune$lambda)))
coef <- coef[-1, ] # drop the intercept
picked_features <- nrow(filter(coef, coef.value != 0))
not_picked_features <- nrow(filter(coef, coef.value == 0))
cat("Lasso picked", picked_features, "variables and eliminated the other",
    not_picked_features, "variables\n")
## Lasso picked 42 variables and eliminated the other 13 variables
coef <- arrange(coef, -coef.value)
imp_coef <- rbind(head(coef, 10),
                  tail(coef, 10))
ggplot(imp_coef) +
  geom_bar(aes(x = reorder(coef.name, coef.value), y = coef.value),
           stat = "identity") +
  ylim(-0.5, 0.5) +
  coord_flip() +
  ggtitle("Coefficients in the Lasso Model") +
  theme(axis.title = element_blank())

### Ridge Regression
CARET.TRAIN.CTRL <- trainControl(method = "repeatedcv",
                                 number = 5,
                                 repeats = 5,
                                 verboseIter = FALSE)
lambdas <- seq(1,0,-0.001)
# train model
set.seed(123) # for reproducibility
model_ridge <- train(x = X_train1, y = Y_train1,
                     method = "glmnet",
                     metric = "RMSE",
                     maximize = FALSE,
                     trControl = CARET.TRAIN.CTRL,
                     tuneGrid = expand.grid(alpha = 0, # alpha = 0 is the ridge penalty
                                            lambda = lambdas))
ggplot(data = filter(model_ridge$results, RMSE < 0.17)) +
  geom_line(aes(x = lambda, y = RMSE))

mean(model_ridge$resample$RMSE)
## [1] 0.1532867
model_ridge2 <- train(x = X_train1, y = Y_train1,
                      method = "glmnet",
                      metric = "RMSE",
                      maximize = FALSE,
                      trControl = CARET.TRAIN.CTRL,
                      tuneGrid = expand.grid(alpha = 0, # ridge regression
                                             lambda = 0.06))
mean(model_ridge2$resample$RMSE)
## [1] 0.1547717
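# For a like-for-like comparison with the tree-based models below, the lasso fit can also be scored on the
# held-out validation set. A minimal sketch (pred_lasso and RMSE.lasso.validate are illustrative names;
# predictions and Y_test1 are already on the log scale):
pred_lasso <- predict(model_lasso, newdata = X_test1)
RMSE.lasso.validate <- sqrt(mean((Y_test1 - pred_lasso)^2))
RMSE.lasso.validate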
# Regression Tree: rpart() with method = "anova"
model.regtree<- rpart(SalePrice ~., data = train_data, method = "anova")
predict.regtree <- predict(model.regtree, validate_data)
# RMSE
RMSE.regressiontree <- sqrt(mean((log(validate_data$SalePrice) - log(predict.regtree))^2))
RMSE.regressiontree
## [1] 0.2123953
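# The fitted tree itself can be inspected with base plotting. A minimal sketch (the rpart.plot package would
# give nicer output if installed, but it is not loaded here):
par(mfrow = c(1, 1))
plot(model.regtree, uniform = TRUE, margin = 0.1)
text(model.regtree, cex = 0.7)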
# Random Forest
rf.model_1 <- randomForest(SalePrice ~ ., data = train_data)
rf.model_1
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train_data)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 18
##
## Mean of squared residuals: 918621356
## % Var explained: 87.21
predict_1<-predict(rf.model_1, validate_data)
# RMSE
RMSE.rfmodel <- sqrt(mean((log(validate_data$SalePrice) - log(predict_1))^2))
RMSE.rfmodel
## [1] 0.1091713
# Error Rate of Random Forest
plot(rf.model_1)

# Now we will tune the RF model
#From the error plot we see that the error is roughly constant beyond ntree = 100, so we retrain with ntree = 100 (and a larger mtry), which reduces the complexity of the forest.
rf.model_2 <- randomForest(SalePrice ~ ., data = train_data, ntree = 100, mtry = 20, importance = T, proximity = T)
rf.model_2
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train_data, ntree = 100, mtry = 20, importance = T, proximity = T)
## Type of random forest: regression
## Number of trees: 100
## No. of variables tried at each split: 20
##
## Mean of squared residuals: 1029154928
## % Var explained: 85.67
#Thus 85.67% of the variation in SalePrice is explained by this tuned random forest model.
predict_tune <- predict(rf.model_2, validate_data)
RMSE.tune_rfmodel <- sqrt(mean((log(validate_data$SalePrice) - log(predict_tune))^2))
RMSE.tune_rfmodel
## [1] 0.1115868
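# mtry = 20 above was picked by hand; randomForest's tuneRF() can instead search mtry values by OOB error.
# A minimal sketch (its result is not used below; tune_res is an illustrative name):
set.seed(123)
tune_res <- tuneRF(x = train_data[, setdiff(names(train_data), "SalePrice")],
                   y = train_data$SalePrice,
                   ntreeTry = 100, stepFactor = 1.5, improve = 0.01, trace = FALSE)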
# No of nodes for the trees
hist(treesize(rf.model_2), main = 'No of nodes for the trees', col = 'blue')

varImpPlot(rf.model_2, col = 'blue', sort = T, n.var = 15, main = 'Top 15 important predictors')

# The variable importance plot above shows the most influential predictors in the random forest.
# Partial Dependence plot on GrLivArea.
partialPlot(rf.model_2, train_data, GrLivArea, '0')

#Gradient Boosting Model (package gbm)
gbm.model <- gbm(SalePrice ~ ., data = train_data, distribution = "laplace",
                 shrinkage = 0.05,
                 interaction.depth = 5,
                 bag.fraction = 0.66,
                 n.minobsinnode = 1,
                 cv.folds = 100,
                 keep.data = F,
                 verbose = F,
                 n.trees = 300)
predict.gbm <- predict(gbm.model, validate_data, n.trees = 300)
# RMSE
RMSE.gbm <- sqrt(mean((log(validate_data$SalePrice) - log(predict.gbm))^2))
RMSE.gbm
## [1] 0.1017775
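# gbm can also estimate the optimal number of trees from its CV folds, which may differ from the fixed 300 used above.
# A minimal sketch (best.iter is an illustrative name):
best.iter <- gbm.perf(gbm.model, method = "cv")
predict.gbm.best <- predict(gbm.model, validate_data, n.trees = best.iter)
sqrt(mean((log(validate_data$SalePrice) - log(predict.gbm.best))^2))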
#The RMSE of the Gradient Boosting Model is the lowest of all the models compared here.
#Conclusion:
#The Gradient Boosting model performs best, with the lowest RMSE, so we choose it as the final model.