# Load all required libraries and the data
library(ggplot2)
library(readr)
library(gplots)
library(repr)
library(plyr)
library(corrplot)
library(ggplot2)
library(reshape2)
library(forecast)
library(rpart)
library(rpart.plot)
library(randomForest)
library(lattice)
library(Rmisc)
library(pROC)
library(ROCR)
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
# Load the training data.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, while the steps below rely on factors
# (level counts from summary(), integer level codes from as.numeric()).
data <- read.csv("~/Downloads/train_545.csv", stringsAsFactors = TRUE)
# data pre processing
# discard unwanted columns
colnames(data)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Columns retained for the analysis; every other column is dropped.
cols <- c(
  "Id", "MSZoning", "Utilities", "Neighborhood", "BldgType", "HouseStyle",
  "OverallQual", "OverallCond", "YearBuilt", "ExterQual", "ExterCond",
  "BsmtQual", "BsmtCond", "TotalBsmtSF", "Heating", "HeatingQC",
  "CentralAir", "Electrical", "GrLivArea", "BedroomAbvGr", "KitchenAbvGr",
  "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu",
  "GarageArea", "GarageQual", "GarageCond", "OpenPorchSF", "PoolArea",
  "Fence", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"
)
data <- data[cols]
# Preview the first rows of the reduced data frame.
head(data)
## Id MSZoning Utilities Neighborhood BldgType HouseStyle OverallQual
## 1 1 RL AllPub CollgCr 1Fam 2Story 7
## 2 2 RL AllPub Veenker 1Fam 1Story 6
## 3 3 RL AllPub CollgCr 1Fam 2Story 7
## 4 4 RL AllPub Crawfor 1Fam 2Story 7
## 5 5 RL AllPub NoRidge 1Fam 2Story 8
## 6 6 RL AllPub Mitchel 1Fam 1.5Fin 5
## OverallCond YearBuilt ExterQual ExterCond BsmtQual BsmtCond TotalBsmtSF
## 1 5 2003 Gd TA Gd TA 856
## 2 8 1976 TA TA Gd TA 1262
## 3 5 2001 Gd TA Gd TA 920
## 4 5 1915 TA TA TA Gd 756
## 5 5 2000 Gd TA Gd TA 1145
## 6 5 1993 TA TA Gd TA 796
## Heating HeatingQC CentralAir Electrical GrLivArea BedroomAbvGr
## 1 GasA Ex Y SBrkr 1710 3
## 2 GasA Ex Y SBrkr 1262 3
## 3 GasA Ex Y SBrkr 1786 3
## 4 GasA Gd Y SBrkr 1717 3
## 5 GasA Ex Y SBrkr 2198 4
## 6 GasA Ex Y SBrkr 1362 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageArea GarageQual GarageCond OpenPorchSF PoolArea Fence MoSold
## 1 548 TA TA 61 0 <NA> 2
## 2 460 TA TA 0 0 <NA> 5
## 3 608 TA TA 42 0 <NA> 9
## 4 642 TA TA 35 0 <NA> 2
## 5 836 TA TA 84 0 <NA> 12
## 6 480 TA TA 30 0 MnPrv 10
## YrSold SaleType SaleCondition SalePrice
## 1 2008 WD Normal 208500
## 2 2007 WD Normal 181500
## 3 2008 WD Normal 223500
## 4 2006 WD Abnorml 140000
## 5 2008 WD Normal 250000
## 6 2009 WD Normal 143000
# Show the type and first few values of every retained column.
str(data)
## 'data.frame': 1460 obs. of 37 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ MSZoning : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
## $ Utilities : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
## $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
## $ BldgType : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ HouseStyle : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
## $ OverallQual : int 7 6 7 7 8 5 8 7 7 5 ...
## $ OverallCond : int 5 8 5 5 5 5 5 6 5 6 ...
## $ YearBuilt : int 2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
## $ ExterQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
## $ ExterCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ BsmtQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
## $ BsmtCond : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
## $ TotalBsmtSF : int 856 1262 920 756 1145 796 1686 1107 952 991 ...
## $ Heating : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ HeatingQC : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
## $ CentralAir : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
## $ Electrical : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
## $ GrLivArea : int 1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
## $ BedroomAbvGr : int 3 3 3 3 4 1 3 3 2 2 ...
## $ KitchenAbvGr : int 1 1 1 1 1 1 1 1 2 2 ...
## $ KitchenQual : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
## $ TotRmsAbvGrd : int 8 6 6 7 9 5 7 7 8 5 ...
## $ Functional : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
## $ Fireplaces : int 0 1 1 1 1 0 1 2 2 2 ...
## $ FireplaceQu : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
## $ GarageArea : int 548 460 608 642 836 480 636 484 468 205 ...
## $ GarageQual : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
## $ GarageCond : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
## $ OpenPorchSF : int 61 0 42 35 84 30 57 204 0 4 ...
## $ PoolArea : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Fence : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
## $ MoSold : int 2 5 9 2 12 10 8 11 4 1 ...
## $ YrSold : int 2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
## $ SaleType : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
## $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
## $ SalePrice : int 208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Per-column summary: quantiles for numeric columns, level and NA counts for factors.
summary(data)
## Id MSZoning Utilities Neighborhood BldgType
## Min. : 1.0 C (all): 10 AllPub:1459 NAmes :225 1Fam :1220
## 1st Qu.: 365.8 FV : 65 NoSeWa: 1 CollgCr:150 2fmCon: 31
## Median : 730.5 RH : 16 OldTown:113 Duplex: 52
## Mean : 730.5 RL :1151 Edwards:100 Twnhs : 43
## 3rd Qu.:1095.2 RM : 218 Somerst: 86 TwnhsE: 114
## Max. :1460.0 Gilbert: 79
## (Other):707
## HouseStyle OverallQual OverallCond YearBuilt ExterQual
## 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872 Ex: 52
## 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 Fa: 14
## 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973 Gd:488
## SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971 TA:906
## SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## ExterCond BsmtQual BsmtCond TotalBsmtSF Heating HeatingQC
## Ex: 3 Ex :121 Fa : 45 Min. : 0.0 Floor: 1 Ex:741
## Fa: 28 Fa : 35 Gd : 65 1st Qu.: 795.8 GasA :1428 Fa: 49
## Gd: 146 Gd :618 Po : 2 Median : 991.5 GasW : 18 Gd:241
## Po: 1 TA :649 TA :1311 Mean :1057.4 Grav : 7 Po: 1
## TA:1282 NA's: 37 NA's: 37 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :6110.0 Wall : 4
##
## CentralAir Electrical GrLivArea BedroomAbvGr KitchenAbvGr
## N: 95 FuseA: 94 Min. : 334 Min. :0.000 Min. :0.000
## Y:1365 FuseF: 27 1st Qu.:1130 1st Qu.:2.000 1st Qu.:1.000
## FuseP: 3 Median :1464 Median :3.000 Median :1.000
## Mix : 1 Mean :1515 Mean :2.866 Mean :1.047
## SBrkr:1334 3rd Qu.:1777 3rd Qu.:3.000 3rd Qu.:1.000
## NA's : 1 Max. :5642 Max. :8.000 Max. :3.000
##
## KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Ex:100 Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24
## Fa: 39 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33
## Gd:586 Median : 6.000 Min1: 31 Median :1.000 Gd :380
## TA:735 Mean : 6.518 Min2: 34 Mean :0.613 Po : 20
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690
## Typ :1360
## GarageArea GarageQual GarageCond OpenPorchSF
## Min. : 0.0 Ex : 3 Ex : 2 Min. : 0.00
## 1st Qu.: 334.5 Fa : 48 Fa : 35 1st Qu.: 0.00
## Median : 480.0 Gd : 14 Gd : 9 Median : 25.00
## Mean : 473.0 Po : 3 Po : 7 Mean : 46.66
## 3rd Qu.: 576.0 TA :1311 TA :1326 3rd Qu.: 68.00
## Max. :1418.0 NA's: 81 NA's: 81 Max. :547.00
##
## PoolArea Fence MoSold YrSold
## Min. : 0.000 GdPrv: 59 Min. : 1.000 Min. :2006
## 1st Qu.: 0.000 GdWo : 54 1st Qu.: 5.000 1st Qu.:2007
## Median : 0.000 MnPrv: 157 Median : 6.000 Median :2008
## Mean : 2.759 MnWw : 11 Mean : 6.322 Mean :2008
## 3rd Qu.: 0.000 NA's :1179 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :738.000 Max. :12.000 Max. :2010
##
## SaleType SaleCondition SalePrice
## WD :1267 Abnorml: 101 Min. : 34900
## New : 122 AdjLand: 4 1st Qu.:129975
## COD : 43 Alloca : 12 Median :163000
## ConLD : 9 Family : 20 Mean :180921
## ConLI : 5 Normal :1198 3rd Qu.:214000
## ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
# Check the distribution of SalePrice
# Histogram of the raw sale price.
ggplot(data, aes(x = SalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 5000) +
  ggtitle("Histogram of SalePrice") + ylab("Count of houses") + xlab("Housing Price")
# Log of SalePrice, stored as a new column and plotted the same way.
data$lSalePrice <- log(data$SalePrice)
# The x axis shows log(SalePrice), so it is labelled as such (the original
# reused the "Housing Price" label of the raw-price histogram).
ggplot(data, aes(x = lSalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 0.05) +
  ggtitle("Histogram of Log SalePrice") + ylab("Count of houses") +
  xlab("Log of Housing Price")
# correlation plot:
# Each column is listed exactly once. A repeated name is returned by `[` as an
# extra "GrLivArea.1" column, which would appear as a duplicate variable in
# the correlation matrix.
corr_cols <- c(
  "GarageArea", "TotRmsAbvGrd", "TotalBsmtSF", "GrLivArea", "BldgType",
  "YearBuilt", "OverallQual", "Neighborhood", "MSZoning", "SalePrice"
)
data_corr <- data[, corr_cols]
# cor() needs numeric input, so the categorical columns are replaced by their
# integer level codes. as.factor() makes this work whether the column was read
# as a factor or as character.
# NOTE(review): level codes of nominal variables follow level order, so their
# correlations should be read with care.
data_corr$BldgType <- as.numeric(as.factor(data_corr$BldgType))
data_corr$MSZoning <- as.numeric(as.factor(data_corr$MSZoning))
data_corr$Neighborhood <- as.numeric(as.factor(data_corr$Neighborhood))
corrplot(cor(data_corr))
# Number of houses in each zoning class.
summary(data$MSZoning)
## C (all) FV RH RL RM
## 10 65 16 1151 218
# plot(data$MSZoning)
ggplot(data = data, mapping = aes(MSZoning, fill = MSZoning)) +
  geom_bar() +
  labs(title = "Distribution of MSZoning")
# MSZoning ~ SalePrice
# boxplot(SalePrice ~ MSZoning, data = data)
ggplot(data = data, mapping = aes(MSZoning, SalePrice, fill = MSZoning)) +
  geom_boxplot(alpha = 0.3) +
  labs(title = "Boxplot of SalePrice by MSZoning")
# GrLivArea
# Five-number summary plus mean, then a histogram in bins of 100.
summary(data$GrLivArea)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 1130 1464 1515 1777 5642
# hist(data$GrLivArea, breaks = 30, col = "grey")
ggplot(data = data, mapping = aes(GrLivArea, fill = ..count..)) +
  geom_histogram(binwidth = 100) +
  labs(
    title = "Histogram of GrLivArea",
    y = "Count of houses",
    x = "Living Area"
  )
#plot(ddply(data, .(MSZoning), summarize, size=mean(GrLivArea)))
# OverallQual
# 10 Very Excellent
# 9 Excellent
# 8 Very Good
# 7 Good
# 6 Above Average
# 5 Average
# 4 Below Average
# 3 Fair
# 2 Poor
# 1 Very Poor
# Distribution of the overall quality rating.
summary(data$OverallQual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.000 6.000 6.099 7.000 10.000
ggplot(data = data, mapping = aes(OverallQual, fill = ..count..)) +
  geom_histogram(binwidth = 1)
# A factor copy of the rating gives one box per rating level.
data$OverallQualFact <- factor(data$OverallQual)
ggplot(
  data = data,
  mapping = aes(OverallQualFact, SalePrice, fill = OverallQualFact)
) +
  geom_boxplot(alpha = 0.3) +
  labs(title = "Boxplot of SalePrice by OverallQual")
# BldgType
# 1Fam Single-family Detached
# 2FmCon Two-family Conversion; originally built as one-family dwelling
# Duplx Duplex
# TwnhsE Townhouse End Unit
# TwnhsI Townhouse Inside Unit
# Number of houses per building type, then price spread per type.
summary(data$BldgType)
## 1Fam 2fmCon Duplex Twnhs TwnhsE
## 1220 31 52 43 114
ggplot(data = data, mapping = aes(BldgType, fill = ..count..)) +
  geom_bar()
ggplot(data = data, mapping = aes(BldgType, SalePrice, fill = BldgType)) +
  geom_boxplot(alpha = 0.3) +
  labs(title = "Boxplot of SalePrice by BldgType")
# other variables
# Scatter plots of SalePrice against four numeric columns, each with a red
# straight-line fit and no confidence band, arranged in a two-column grid.
p1 <- ggplot(data = data, mapping = aes(GrLivArea, SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs GrLivArea")
p2 <- ggplot(data = data, mapping = aes(TotalBsmtSF, SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs TotalBsmtSF")
p3 <- ggplot(data = data, mapping = aes(TotRmsAbvGrd, SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs TotRmsAbvGrd")
p4 <- ggplot(data = data, mapping = aes(GarageArea, SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs GarageArea")
multiplot(p1, p2, p3, p4, cols = 2)
# Reduce the data to the modelling columns: seven predictors plus SalePrice.
# Each predictor is listed exactly once. Listing 'GrLivArea' twice made `[`
# return a second, identical "GrLivArea.1" column, which lm() then reported as
# "not defined because of singularities".
corr_cols <- c(
  "GarageArea", "TotRmsAbvGrd", "TotalBsmtSF", "GrLivArea", "BldgType",
  "OverallQual", "MSZoning", "SalePrice"
)
data <- data[, corr_cols]
# Replace the categorical predictors by their integer level codes.
# as.factor() keeps this working whether the column is a factor or character.
# NOTE(review): this treats nominal categories as an ordered numeric scale;
# keeping them as factors may be a better encoding.
data$BldgType <- as.numeric(as.factor(data$BldgType))
data$MSZoning <- as.numeric(as.factor(data$MSZoning))
# Response used by all models below: natural log of the sale price.
data$LSalePrice <- log(data$SalePrice)
# Partition the data into training and test sets.
# Training share: 75% of the rows, rounded down.
smp_size <- floor(nrow(data) * 0.75)
# Fixed seed so the same rows are drawn on every run.
set.seed(123)
train_ind <- sample.int(nrow(data), size = smp_size)
# Sampled rows form the training set; all remaining rows form the test set.
train <- data[train_ind, ]
test <- data[-train_ind, ]
# Linear Regression Model
# Log price regressed on every column of `train` except the raw SalePrice.
lin_reg_model <- lm(LSalePrice ~ . - SalePrice, data = train)
summary(lin_reg_model)
##
## Call:
## lm(formula = LSalePrice ~ . - SalePrice, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.41513 -0.07292 0.01766 0.10526 0.62555
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.072e+01 5.223e-02 205.248 < 2e-16 ***
## GarageArea 3.391e-04 3.380e-05 10.034 < 2e-16 ***
## TotRmsAbvGrd -5.199e-04 6.233e-03 -0.083 0.933545
## TotalBsmtSF 1.344e-04 1.603e-05 8.386 < 2e-16 ***
## GrLivArea 2.000e-04 2.190e-05 9.135 < 2e-16 ***
## BldgType -1.464e-02 4.752e-03 -3.080 0.002119 **
## OverallQual 1.398e-01 5.754e-03 24.288 < 2e-16 ***
## GrLivArea.1 NA NA NA NA
## MSZoning -3.208e-02 9.087e-03 -3.530 0.000432 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1857 on 1087 degrees of freedom
## Multiple R-squared: 0.7885, Adjusted R-squared: 0.7872
## F-statistic: 579 on 7 and 1087 DF, p-value: < 2.2e-16
# test the model
# Predict log prices for the held-out rows and compare with the actual values.
lin_reg_predictions <- predict(lin_reg_model, test, type = "response")
lin_reg_residuals <- test$LSalePrice - lin_reg_predictions
accuracy(lin_reg_predictions, test$LSalePrice)
## ME RMSE MAE MPE MAPE
## Test set 0.005606172 0.177485 0.1195952 0.02872382 1.001581
# The title is supplied through `main`. The original passed a title() call as
# plot()'s third positional argument, which is not a title argument.
plot(
  lin_reg_predictions, test$LSalePrice,
  main = "Linear Regression- Predicted vs Actual Price"
)
# y = x reference line; abline() spans the whole plot instead of a
# hard-coded 11-13 segment.
abline(a = 0, b = 1, col = "red")
# Regression tree model
# Same formula as the linear model; complexity parameter fixed at 0.01.
reg_tree_model <- rpart(
  LSalePrice ~ . - SalePrice,
  data = train,
  control = rpart.control(cp = 0.01)
)
# Print the complexity-parameter table of the fitted tree.
printcp(reg_tree_model)
##
## Regression tree:
## rpart(formula = LSalePrice ~ . - SalePrice, data = train, control = rpart.control(cp = 0.01))
##
## Variables actually used in tree construction:
## [1] GarageArea GrLivArea OverallQual TotalBsmtSF
##
## Root node error: 177.33/1095 = 0.16195
##
## n= 1095
##
## CP nsplit rel error xerror xstd
## 1 0.453668 0 1.00000 1.00069 0.051474
## 2 0.085673 1 0.54633 0.54749 0.031100
## 3 0.073347 2 0.46066 0.46436 0.027958
## 4 0.040919 3 0.38731 0.39919 0.024450
## 5 0.025618 4 0.34639 0.36278 0.022486
## 6 0.022020 5 0.32077 0.34905 0.022134
## 7 0.015239 6 0.29875 0.33740 0.021351
## 8 0.014309 7 0.28351 0.32822 0.020998
## 9 0.010000 8 0.26921 0.31284 0.020442
# Plot the complexity-parameter table and the fitted tree.
plotcp(reg_tree_model)
rpart.plot(reg_tree_model)
# test
reg_tree_predictions <- predict(reg_tree_model, newdata = test, type = "vector")
# Residuals of the tree's own predictions (the original subtracted the linear
# model's predictions here by mistake).
reg_tree_residuals <- test$LSalePrice - reg_tree_predictions
accuracy(reg_tree_predictions, test$LSalePrice)
## ME RMSE MAE MPE MAPE
## Test set 0.0014339 0.2079031 0.1552894 -0.01489638 1.300271
plot(reg_tree_predictions, test$LSalePrice)
# y = x reference line across the whole plot.
abline(a = 0, b = 1, col = "red")
# Random Forest
# 500 trees, minimum terminal node size 7, variable importance recorded;
# na.roughfix is the handler for missing values.
rand_forest_model <- randomForest(
  LSalePrice ~ . - SalePrice,
  data = train,
  importance = TRUE,
  ntree = 500,
  nodesize = 7,
  na.action = na.roughfix
)
# Lists the components stored in the fitted object.
summary(rand_forest_model)
## Length Class Mode
## call 7 -none- call
## type 1 -none- character
## predicted 1095 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 1095 -none- numeric
## importance 16 -none- numeric
## importanceSD 8 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 1095 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
# Plot the fitted forest, then the type-1 variable importance.
plot(rand_forest_model)
varImpPlot(rand_forest_model, type = 1)
# Lasso regression
# Lasso Regression
# NOTE: the `##` output kept in this section was recorded before the response
# columns were removed from the predictors; re-run to refresh it.
# The test sample only has 10 variables
#data = read.csv("~/Downloads/train_545.csv")
#data$lSalePrice <- log(data$SalePrice)
set.seed(27042018)
# 5-fold cross-validation over a grid of lambda values; alpha = 1 is the lasso.
my_control <- trainControl(method = "cv", number = 5)
lassoGrid <- expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.005))
# The predictor set must not contain the response. The original passed all of
# `train` as x, including LSalePrice (the target itself) and SalePrice (its
# exponential), so the model could simply copy the answer.
lasso_predictors <- setdiff(names(train), c("SalePrice", "LSalePrice"))
lasso_mod <- train(
  x = train[, lasso_predictors, drop = FALSE], y = train$LSalePrice,
  method = "glmnet", trControl = my_control, tuneGrid = lassoGrid
)
lasso_mod$bestTune
## alpha lambda
## 3 1 0.011
min(lasso_mod$results$RMSE)
## [1] 0.01172239
# Count the predictors with non-zero and with zero importance.
lassoVarImp <- varImp(lasso_mod, scale = FALSE)
lassoImportance <- lassoVarImp$importance
varsSelected <- sum(lassoImportance$Overall != 0)
varsNotSelected <- sum(lassoImportance$Overall == 0)
cat('Lasso uses', varsSelected, 'variables in its model, and did not select', varsNotSelected, 'variables.')
## Lasso uses 1 variables in its model, and did not select 9 variables.
# Predict on the same predictor columns the model was trained on.
LassoPred <- predict(lasso_mod, test[, lasso_predictors, drop = FALSE])
predictions_lasso <- exp(LassoPred) #need to reverse the log to the real values
head(predictions_lasso)
## 2 5 7 21 22 25
## 181063.1 247081.1 301604.4 319043.8 140138.4 154366.9
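# In the recorded run the lasso kept just 1 variable and reported a near-zero cross-validated RMSE. That is a symptom of target leakage (the response column was among the predictors passed to train()), not of ordinary overfitting or of a useful model.
# The random forest model appears to be more accurate than the linear regression and regression tree models.
# Predicting prices on the test subset using the random forest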
# test
# Predict log prices for the held-out rows with the random forest.
rand_forest_predictions <- predict(rand_forest_model, newdata = test, type = "response")
# Residuals of the forest's own predictions (the original subtracted the
# linear model's predictions here by mistake).
rand_forest_residuals <- test$LSalePrice - rand_forest_predictions
accuracy(rand_forest_predictions, test$LSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -0.003473407 0.1631106 0.1078001 -0.04431492 0.9039332
plot(rand_forest_predictions, test$LSalePrice)
# y = x reference line across the whole plot.
abline(a = 0, b = 1, col = "red")