Loading all Required Libraries and Loading the data

library(ggplot2)
library(readr)
library(gplots)
library(repr)
library(plyr)
library(corrplot)
library(ggplot2)
library(reshape2)
library(forecast)
library(rpart)
library(rpart.plot)
library(randomForest)
library(lattice)
library(Rmisc)
library(pROC)
library(ROCR)
library(knitr)
library(ggplot2)
library(plyr)
library(dplyr)
library(corrplot)
library(caret)
library(gridExtra)
library(scales)
library(Rmisc)
library(ggrepel)
library(randomForest)
library(psych)
library(xgboost)
# Load the training data.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, but the steps below rely on factor
# columns (level counts in summary(), as.numeric() on level codes).
data <- read.csv("~/Downloads/train_545.csv", stringsAsFactors = TRUE)
# data pre processing

# discard unwanted columns: first list every column that is available
colnames(data)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
# Columns retained for the analysis; every other column is dropped.
cols <- c(
  "Id", "MSZoning", "Utilities", "Neighborhood", "BldgType", "HouseStyle",
  "OverallQual", "OverallCond", "YearBuilt", "ExterQual", "ExterCond",
  "BsmtQual", "BsmtCond", "TotalBsmtSF", "Heating", "HeatingQC",
  "CentralAir", "Electrical", "GrLivArea", "BedroomAbvGr", "KitchenAbvGr",
  "KitchenQual", "TotRmsAbvGrd", "Functional", "Fireplaces", "FireplaceQu",
  "GarageArea", "GarageQual", "GarageCond", "OpenPorchSF", "PoolArea",
  "Fence", "MoSold", "YrSold", "SaleType", "SaleCondition", "SalePrice"
)
data <- data[, cols]

# Preview the first rows of the reduced data frame
head(data)
##   Id MSZoning Utilities Neighborhood BldgType HouseStyle OverallQual
## 1  1       RL    AllPub      CollgCr     1Fam     2Story           7
## 2  2       RL    AllPub      Veenker     1Fam     1Story           6
## 3  3       RL    AllPub      CollgCr     1Fam     2Story           7
## 4  4       RL    AllPub      Crawfor     1Fam     2Story           7
## 5  5       RL    AllPub      NoRidge     1Fam     2Story           8
## 6  6       RL    AllPub      Mitchel     1Fam     1.5Fin           5
##   OverallCond YearBuilt ExterQual ExterCond BsmtQual BsmtCond TotalBsmtSF
## 1           5      2003        Gd        TA       Gd       TA         856
## 2           8      1976        TA        TA       Gd       TA        1262
## 3           5      2001        Gd        TA       Gd       TA         920
## 4           5      1915        TA        TA       TA       Gd         756
## 5           5      2000        Gd        TA       Gd       TA        1145
## 6           5      1993        TA        TA       Gd       TA         796
##   Heating HeatingQC CentralAir Electrical GrLivArea BedroomAbvGr
## 1    GasA        Ex          Y      SBrkr      1710            3
## 2    GasA        Ex          Y      SBrkr      1262            3
## 3    GasA        Ex          Y      SBrkr      1786            3
## 4    GasA        Gd          Y      SBrkr      1717            3
## 5    GasA        Ex          Y      SBrkr      2198            4
## 6    GasA        Ex          Y      SBrkr      1362            1
##   KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1            1          Gd            8        Typ          0        <NA>
## 2            1          TA            6        Typ          1          TA
## 3            1          Gd            6        Typ          1          TA
## 4            1          Gd            7        Typ          1          Gd
## 5            1          Gd            9        Typ          1          TA
## 6            1          TA            5        Typ          0        <NA>
##   GarageArea GarageQual GarageCond OpenPorchSF PoolArea Fence MoSold
## 1        548         TA         TA          61        0  <NA>      2
## 2        460         TA         TA           0        0  <NA>      5
## 3        608         TA         TA          42        0  <NA>      9
## 4        642         TA         TA          35        0  <NA>      2
## 5        836         TA         TA          84        0  <NA>     12
## 6        480         TA         TA          30        0 MnPrv     10
##   YrSold SaleType SaleCondition SalePrice
## 1   2008       WD        Normal    208500
## 2   2007       WD        Normal    181500
## 3   2008       WD        Normal    223500
## 4   2006       WD       Abnorml    140000
## 5   2008       WD        Normal    250000
## 6   2009       WD        Normal    143000
# Show the structure of the reduced data frame (one line per column)
str(data)
## 'data.frame':    1460 obs. of  37 variables:
##  $ Id           : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ MSZoning     : Factor w/ 5 levels "C (all)","FV",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ Utilities    : Factor w/ 2 levels "AllPub","NoSeWa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Neighborhood : Factor w/ 25 levels "Blmngtn","Blueste",..: 6 25 6 7 14 12 21 17 18 4 ...
##  $ BldgType     : Factor w/ 5 levels "1Fam","2fmCon",..: 1 1 1 1 1 1 1 1 1 2 ...
##  $ HouseStyle   : Factor w/ 8 levels "1.5Fin","1.5Unf",..: 6 3 6 6 6 1 3 6 1 2 ...
##  $ OverallQual  : int  7 6 7 7 8 5 8 7 7 5 ...
##  $ OverallCond  : int  5 8 5 5 5 5 5 6 5 6 ...
##  $ YearBuilt    : int  2003 1976 2001 1915 2000 1993 2004 1973 1931 1939 ...
##  $ ExterQual    : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 4 3 4 3 4 4 4 ...
##  $ ExterCond    : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ BsmtQual     : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 3 3 4 3 3 1 3 4 4 ...
##  $ BsmtCond     : Factor w/ 4 levels "Fa","Gd","Po",..: 4 4 4 2 4 4 4 4 4 4 ...
##  $ TotalBsmtSF  : int  856 1262 920 756 1145 796 1686 1107 952 991 ...
##  $ Heating      : Factor w/ 6 levels "Floor","GasA",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ HeatingQC    : Factor w/ 5 levels "Ex","Fa","Gd",..: 1 1 1 3 1 1 1 1 3 1 ...
##  $ CentralAir   : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Electrical   : Factor w/ 5 levels "FuseA","FuseF",..: 5 5 5 5 5 5 5 5 2 5 ...
##  $ GrLivArea    : int  1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 ...
##  $ BedroomAbvGr : int  3 3 3 3 4 1 3 3 2 2 ...
##  $ KitchenAbvGr : int  1 1 1 1 1 1 1 1 2 2 ...
##  $ KitchenQual  : Factor w/ 4 levels "Ex","Fa","Gd",..: 3 4 3 3 3 4 3 4 4 4 ...
##  $ TotRmsAbvGrd : int  8 6 6 7 9 5 7 7 8 5 ...
##  $ Functional   : Factor w/ 7 levels "Maj1","Maj2",..: 7 7 7 7 7 7 7 7 3 7 ...
##  $ Fireplaces   : int  0 1 1 1 1 0 1 2 2 2 ...
##  $ FireplaceQu  : Factor w/ 5 levels "Ex","Fa","Gd",..: NA 5 5 3 5 NA 3 5 5 5 ...
##  $ GarageArea   : int  548 460 608 642 836 480 636 484 468 205 ...
##  $ GarageQual   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 2 3 ...
##  $ GarageCond   : Factor w/ 5 levels "Ex","Fa","Gd",..: 5 5 5 5 5 5 5 5 5 5 ...
##  $ OpenPorchSF  : int  61 0 42 35 84 30 57 204 0 4 ...
##  $ PoolArea     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Fence        : Factor w/ 4 levels "GdPrv","GdWo",..: NA NA NA NA NA 3 NA NA NA NA ...
##  $ MoSold       : int  2 5 9 2 12 10 8 11 4 1 ...
##  $ YrSold       : int  2008 2007 2008 2006 2008 2009 2007 2009 2008 2008 ...
##  $ SaleType     : Factor w/ 9 levels "COD","Con","ConLD",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ SaleCondition: Factor w/ 6 levels "Abnorml","AdjLand",..: 5 5 5 1 5 5 5 5 1 5 ...
##  $ SalePrice    : int  208500 181500 223500 140000 250000 143000 307000 200000 129900 118000 ...
# Print a per-column summary of the reduced data frame
summary(data)
##        Id            MSZoning     Utilities     Neighborhood   BldgType   
##  Min.   :   1.0   C (all):  10   AllPub:1459   NAmes  :225   1Fam  :1220  
##  1st Qu.: 365.8   FV     :  65   NoSeWa:   1   CollgCr:150   2fmCon:  31  
##  Median : 730.5   RH     :  16                 OldTown:113   Duplex:  52  
##  Mean   : 730.5   RL     :1151                 Edwards:100   Twnhs :  43  
##  3rd Qu.:1095.2   RM     : 218                 Somerst: 86   TwnhsE: 114  
##  Max.   :1460.0                                Gilbert: 79                
##                                                (Other):707                
##    HouseStyle   OverallQual      OverallCond      YearBuilt    ExterQual
##  1Story :726   Min.   : 1.000   Min.   :1.000   Min.   :1872   Ex: 52   
##  2Story :445   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954   Fa: 14   
##  1.5Fin :154   Median : 6.000   Median :5.000   Median :1973   Gd:488   
##  SLvl   : 65   Mean   : 6.099   Mean   :5.575   Mean   :1971   TA:906   
##  SFoyer : 37   3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000            
##  1.5Unf : 14   Max.   :10.000   Max.   :9.000   Max.   :2010            
##  (Other): 19                                                            
##  ExterCond BsmtQual   BsmtCond     TotalBsmtSF      Heating     HeatingQC
##  Ex:   3   Ex  :121   Fa  :  45   Min.   :   0.0   Floor:   1   Ex:741   
##  Fa:  28   Fa  : 35   Gd  :  65   1st Qu.: 795.8   GasA :1428   Fa: 49   
##  Gd: 146   Gd  :618   Po  :   2   Median : 991.5   GasW :  18   Gd:241   
##  Po:   1   TA  :649   TA  :1311   Mean   :1057.4   Grav :   7   Po:  1   
##  TA:1282   NA's: 37   NA's:  37   3rd Qu.:1298.2   OthW :   2   TA:428   
##                                   Max.   :6110.0   Wall :   4            
##                                                                          
##  CentralAir Electrical     GrLivArea     BedroomAbvGr    KitchenAbvGr  
##  N:  95     FuseA:  94   Min.   : 334   Min.   :0.000   Min.   :0.000  
##  Y:1365     FuseF:  27   1st Qu.:1130   1st Qu.:2.000   1st Qu.:1.000  
##             FuseP:   3   Median :1464   Median :3.000   Median :1.000  
##             Mix  :   1   Mean   :1515   Mean   :2.866   Mean   :1.047  
##             SBrkr:1334   3rd Qu.:1777   3rd Qu.:3.000   3rd Qu.:1.000  
##             NA's :   1   Max.   :5642   Max.   :8.000   Max.   :3.000  
##                                                                        
##  KitchenQual  TotRmsAbvGrd    Functional    Fireplaces    FireplaceQu
##  Ex:100      Min.   : 2.000   Maj1:  14   Min.   :0.000   Ex  : 24   
##  Fa: 39      1st Qu.: 5.000   Maj2:   5   1st Qu.:0.000   Fa  : 33   
##  Gd:586      Median : 6.000   Min1:  31   Median :1.000   Gd  :380   
##  TA:735      Mean   : 6.518   Min2:  34   Mean   :0.613   Po  : 20   
##              3rd Qu.: 7.000   Mod :  15   3rd Qu.:1.000   TA  :313   
##              Max.   :14.000   Sev :   1   Max.   :3.000   NA's:690   
##                               Typ :1360                              
##    GarageArea     GarageQual  GarageCond   OpenPorchSF    
##  Min.   :   0.0   Ex  :   3   Ex  :   2   Min.   :  0.00  
##  1st Qu.: 334.5   Fa  :  48   Fa  :  35   1st Qu.:  0.00  
##  Median : 480.0   Gd  :  14   Gd  :   9   Median : 25.00  
##  Mean   : 473.0   Po  :   3   Po  :   7   Mean   : 46.66  
##  3rd Qu.: 576.0   TA  :1311   TA  :1326   3rd Qu.: 68.00  
##  Max.   :1418.0   NA's:  81   NA's:  81   Max.   :547.00  
##                                                           
##     PoolArea         Fence          MoSold           YrSold    
##  Min.   :  0.000   GdPrv:  59   Min.   : 1.000   Min.   :2006  
##  1st Qu.:  0.000   GdWo :  54   1st Qu.: 5.000   1st Qu.:2007  
##  Median :  0.000   MnPrv: 157   Median : 6.000   Median :2008  
##  Mean   :  2.759   MnWw :  11   Mean   : 6.322   Mean   :2008  
##  3rd Qu.:  0.000   NA's :1179   3rd Qu.: 8.000   3rd Qu.:2009  
##  Max.   :738.000                Max.   :12.000   Max.   :2010  
##                                                                
##     SaleType    SaleCondition    SalePrice     
##  WD     :1267   Abnorml: 101   Min.   : 34900  
##  New    : 122   AdjLand:   4   1st Qu.:129975  
##  COD    :  43   Alloca :  12   Median :163000  
##  ConLD  :   9   Family :  20   Mean   :180921  
##  ConLI  :   5   Normal :1198   3rd Qu.:214000  
##  ConLw  :   5   Partial: 125   Max.   :755000  
##  (Other):   9

Checking the distribution of Sale Price

# Distribution of the raw sale price, in bins of width 5000
ggplot(data, aes(x = SalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 5000) +
  ggtitle("Histogram of SalePrice") +
  ylab("Count of houses") +
  xlab("Housing Price")

# log term of SalePrice
data$lSalePrice <- log(data$SalePrice)

# Distribution of the log-transformed price. The x axis is labelled as a log
# value because it no longer shows prices in their original units.
ggplot(data, aes(x = lSalePrice, fill = ..count..)) +
  geom_histogram(binwidth = 0.05) +
  ggtitle("Histogram of Log SalePrice") +
  ylab("Count of houses") +
  xlab("Log of Housing Price")

# correlation plot:

# Each column is listed exactly once. Selecting a name twice makes the data
# frame carry a renamed copy (`GrLivArea.1`), which would appear in the plot as
# an extra row/column perfectly correlated with GrLivArea.
corr_cols <- c("GarageArea", "TotRmsAbvGrd", "TotalBsmtSF", "GrLivArea",
               "BldgType", "YearBuilt", "OverallQual", "Neighborhood",
               "MSZoning", "SalePrice")

data_corr <- data[, corr_cols]

# cor() needs numeric input, so the factor columns are replaced by their
# integer level codes.
# NOTE(review): the codes only reflect level order, so the correlations shown
# for these nominal variables are indicative at best.
data_corr$BldgType <- as.numeric(data_corr$BldgType)
data_corr$MSZoning <- as.numeric(data_corr$MSZoning)
data_corr$Neighborhood <- as.numeric(data_corr$Neighborhood)

corrplot(cor(data_corr))

# Number of houses in each zoning class
summary(data[["MSZoning"]])
## C (all)      FV      RH      RL      RM 
##      10      65      16    1151     218
# plot(data$MSZoning)

ggplot(data, aes(x = MSZoning, fill = MSZoning)) +
  geom_bar() +
  labs(title = "Distribution of MSZoning")

# MSZoning ~ SalePrice

# boxplot(SalePrice ~ MSZoning, data = data)

# Sale price spread within each zoning class
ggplot(data, aes(x = MSZoning, y = SalePrice, fill = MSZoning)) +
  geom_boxplot(alpha = 0.3) +
  labs(title = "Boxplot of SalePrice by MSZoning")

# GrLivArea

# Five-number summary (plus mean) of the living area
summary(data[["GrLivArea"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334    1130    1464    1515    1777    5642
# hist(data$GrLivArea, breaks = 30, col = "grey")

# Histogram of living area, in bins of width 100
ggplot(data, aes(x = GrLivArea, fill = ..count..)) +
  geom_histogram(binwidth = 100) +
  labs(
    title = "Histogram of GrLivArea",
    x = "Living Area",
    y = "Count of houses"
  )

#plot(ddply(data, .(MSZoning), summarize,  size=mean(GrLivArea)))
# OverallQual

# Rating scale:
# 10   Very Excellent
# 9    Excellent
# 8    Very Good
# 7    Good
# 6    Above Average
# 5    Average
# 4    Below Average
# 3    Fair
# 2    Poor
# 1    Very Poor

summary(data[["OverallQual"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.000   6.000   6.099   7.000  10.000
# One bar per rating value
ggplot(data, aes(x = OverallQual, fill = ..count..)) +
  geom_histogram(binwidth = 1)

# A factor copy of the rating gives one box per rating value in the boxplot
data$OverallQualFact <- factor(data$OverallQual)
ggplot(data, aes(x = OverallQualFact, y = SalePrice, fill = OverallQualFact)) +
  geom_boxplot(alpha = 0.3) +
  labs(title = "Boxplot of SalePrice by OverallQual")

# BldgType

# Category legend:
# 1Fam     Single-family Detached  
# 2FmCon   Two-family Conversion; originally built as one-family dwelling
# Duplx    Duplex
# TwnhsE   Townhouse End Unit
# TwnhsI   Townhouse Inside Unit

summary(data[["BldgType"]])
##   1Fam 2fmCon Duplex  Twnhs TwnhsE 
##   1220     31     52     43    114
# Count of houses per building type (geom_bar counts rows by default)
ggplot(data, aes(x = BldgType, fill = ..count..)) +
  geom_bar()

# Sale price spread within each building type
ggplot(data, aes(x = BldgType, y = SalePrice, fill = BldgType)) +
  geom_boxplot(alpha = 0.3) +
  labs(title = "Boxplot of SalePrice by BldgType")

# other variables
# Four scatter plots of SalePrice against a numeric feature, each with a red
# least-squares line and no confidence band.
p1 <- ggplot(data, aes(x = GrLivArea, y = SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs GrLivArea")

p2 <- ggplot(data, aes(x = TotalBsmtSF, y = SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs TotalBsmtSF")

p3 <- ggplot(data, aes(x = TotRmsAbvGrd, y = SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs TotRmsAbvGrd")

p4 <- ggplot(data, aes(x = GarageArea, y = SalePrice)) +
  geom_point(shape = 1) +
  geom_smooth(method = lm, color = "red", se = FALSE) +
  labs(title = "SalePrice vs GarageArea")

# Arrange the four plots in a two-column grid
multiplot(plotlist = list(p1, p2, p3, p4), cols = 2)

# Keep only the modelling features plus the response.
# GrLivArea is listed once: selecting it twice creates a duplicated
# `GrLivArea.1` column that is perfectly collinear with GrLivArea, so lm()
# cannot estimate a coefficient for it and the other models see the same
# feature twice.
corr_cols <- c("GarageArea", "TotRmsAbvGrd", "TotalBsmtSF", "GrLivArea",
               "BldgType", "OverallQual", "MSZoning", "SalePrice")
data <- data[, corr_cols]

# Replace the factor columns by their integer level codes.
# NOTE(review): this imposes an arbitrary ordering on nominal categories;
# consider keeping them as factors or one-hot encoding them.
data$BldgType <- as.numeric(data$BldgType)
data$MSZoning <- as.numeric(data$MSZoning)

# Log-transformed response used as the modelling target
data$LSalePrice <- log(data$SalePrice)

Modeling the Data

# partition data
## fix the seed first so the random split is reproducible
set.seed(123)

## 75% of the rows go to training; the remaining rows form the test set
smp_size <- floor(nrow(data) * 0.75)
train_ind <- sample(seq_len(nrow(data)), size = smp_size)

test <- data[-train_ind, ]
train <- data[train_ind, ]
# Linear Regression Model

# Regress log sale price on every column of `train` except the raw SalePrice
lin_reg_model <- lm(LSalePrice ~ . - SalePrice, data = train)
summary(lin_reg_model)
## 
## Call:
## lm(formula = LSalePrice ~ . - SalePrice, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.41513 -0.07292  0.01766  0.10526  0.62555 
## 
## Coefficients: (1 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.072e+01  5.223e-02 205.248  < 2e-16 ***
## GarageArea    3.391e-04  3.380e-05  10.034  < 2e-16 ***
## TotRmsAbvGrd -5.199e-04  6.233e-03  -0.083 0.933545    
## TotalBsmtSF   1.344e-04  1.603e-05   8.386  < 2e-16 ***
## GrLivArea     2.000e-04  2.190e-05   9.135  < 2e-16 ***
## BldgType     -1.464e-02  4.752e-03  -3.080 0.002119 ** 
## OverallQual   1.398e-01  5.754e-03  24.288  < 2e-16 ***
## GrLivArea.1          NA         NA      NA       NA    
## MSZoning     -3.208e-02  9.087e-03  -3.530 0.000432 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1857 on 1087 degrees of freedom
## Multiple R-squared:  0.7885, Adjusted R-squared:  0.7872 
## F-statistic:   579 on 7 and 1087 DF,  p-value: < 2.2e-16
# test the model

# Predict log sale price for the held-out rows and compare with the actuals
lin_reg_predictions <- predict(lin_reg_model, test, type = "response")
lin_reg_residuals <- test$LSalePrice - lin_reg_predictions
accuracy(lin_reg_predictions, test$LSalePrice)
##                   ME     RMSE       MAE        MPE     MAPE
## Test set 0.005606172 0.177485 0.1195952 0.02872382 1.001581
# The plot title is supplied through `main`. Passing title(...) as the third
# positional argument hands its result to plot()'s `type` argument instead of
# setting the title.
plot(lin_reg_predictions, test$LSalePrice,
     main = "Linear Regression- Predicted vs Actual Price")
# Reference line y = x: points on it are predicted exactly
lines(c(11, 13), c(11, 13), col = 'red')

# Regression tree model

# Same response and predictors as the linear model, with the complexity
# parameter set to 0.01
reg_tree_model <- rpart(
  LSalePrice ~ . - SalePrice,
  data = train,
  control = rpart.control(cp = 0.01)
)

# Print the complexity-parameter table of the fitted tree
printcp(reg_tree_model)
## 
## Regression tree:
## rpart(formula = LSalePrice ~ . - SalePrice, data = train, control = rpart.control(cp = 0.01))
## 
## Variables actually used in tree construction:
## [1] GarageArea  GrLivArea   OverallQual TotalBsmtSF
## 
## Root node error: 177.33/1095 = 0.16195
## 
## n= 1095 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.453668      0   1.00000 1.00069 0.051474
## 2 0.085673      1   0.54633 0.54749 0.031100
## 3 0.073347      2   0.46066 0.46436 0.027958
## 4 0.040919      3   0.38731 0.39919 0.024450
## 5 0.025618      4   0.34639 0.36278 0.022486
## 6 0.022020      5   0.32077 0.34905 0.022134
## 7 0.015239      6   0.29875 0.33740 0.021351
## 8 0.014309      7   0.28351 0.32822 0.020998
## 9 0.010000      8   0.26921 0.31284 0.020442
# Graphical counterpart of the complexity-parameter table for the fitted tree
plotcp(reg_tree_model)

# Draw the fitted regression tree itself
rpart.plot(reg_tree_model)

# test 

# Predict log sale price for the held-out rows with the regression tree
reg_tree_predictions <- predict(reg_tree_model, newdata = test, type = "vector")
# Residuals use the tree's own predictions (not the linear model's)
reg_tree_residuals <- test$LSalePrice - reg_tree_predictions
accuracy(reg_tree_predictions, test$LSalePrice)
##                 ME      RMSE       MAE         MPE     MAPE
## Test set 0.0014339 0.2079031 0.1552894 -0.01489638 1.300271
plot(reg_tree_predictions, test$LSalePrice)
# Reference line y = x: points on it are predicted exactly
lines(c(11, 13), c(11, 13), col = 'red')

# Random Forest
# 500 trees, minimum terminal node size 7, variable importance recorded;
# same response and predictors as the other models.
rand_forest_model <- randomForest(
  LSalePrice ~ . - SalePrice,
  data = train,
  importance = TRUE,
  ntree = 500,
  nodesize = 7,
  na.action = na.roughfix
)
# Lists the components stored in the fitted object
summary(rand_forest_model)
##                 Length Class  Mode     
## call               7   -none- call     
## type               1   -none- character
## predicted       1095   -none- numeric  
## mse              500   -none- numeric  
## rsq              500   -none- numeric  
## oob.times       1095   -none- numeric  
## importance        16   -none- numeric  
## importanceSD       8   -none- numeric  
## localImportance    0   -none- NULL     
## proximity          0   -none- NULL     
## ntree              1   -none- numeric  
## mtry               1   -none- numeric  
## forest            11   -none- list     
## coefs              0   -none- NULL     
## y               1095   -none- numeric  
## test               0   -none- NULL     
## inbag              0   -none- NULL     
## terms              3   terms  call
# Default plot method for the fitted forest
plot(rand_forest_model)

# Variable importance plot for the fitted forest
# NOTE(review): type = 1 should select the permutation-based importance
# measure - confirm against ?randomForest::importance
varImpPlot(rand_forest_model, type=1)

Lasso Regression

#Lasso Regression 
# The test sample only has 10 variables 
#
# NOTE: the `##` output lines kept in this section were captured before the
# target-leakage fix below (they show a single selected variable and an RMSE
# of about 0.0117). Re-run the analysis to refresh them.


#data = read.csv("~/Downloads/train_545.csv")

#data$lSalePrice <- log(data$SalePrice)

set.seed(27042018)

# 5-fold cross-validation over a grid of lasso penalties (alpha = 1 is lasso)
my_control <- trainControl(method = "cv", number = 5)

lassoGrid <- expand.grid(alpha = 1, lambda = seq(0.001, 0.1, by = 0.005))

# Predictors only. The response (LSalePrice) and its untransformed version
# (SalePrice) must not be part of `x`: with them included the lasso simply
# selects the response itself and reports a meaningless near-zero error.
lasso_predictors <- setdiff(names(train), c("SalePrice", "LSalePrice"))

lasso_mod <- train(
  x = train[, lasso_predictors, drop = FALSE],
  y = train$LSalePrice,
  method = "glmnet",
  trControl = my_control,
  tuneGrid = lassoGrid
)

lasso_mod$bestTune
##   alpha lambda
## 3     1  0.011
# Best cross-validated RMSE over the penalty grid (log-price scale)
min(lasso_mod$results$RMSE)
## [1] 0.01172239
lassoVarImp <- varImp(lasso_mod, scale = FALSE)
lassoImportance <- lassoVarImp$importance

# Count the variables with non-zero / zero importance in the final model
varsSelected <- length(which(lassoImportance$Overall != 0))
varsNotSelected <- length(which(lassoImportance$Overall == 0))
cat('Lasso uses', varsSelected, 'variables in its model, and did not select', varsNotSelected, 'variables.')
## Lasso uses 1 variables in its model, and did not select 9 variables.
# Predict on the held-out rows using the same predictor columns as in training
LassoPred <- predict(lasso_mod, test[, lasso_predictors, drop = FALSE])
predictions_lasso <- exp(LassoPred) #need to reverse the log to the real values
head(predictions_lasso)
##        2        5        7       21       22       25 
## 181063.1 247081.1 301604.4 319043.8 140138.4 154366.9

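The output above shows the lasso model using just 1 variable with a near-zero cross-validated RMSE. This is the signature of target leakage rather than ordinary overfitting: it happens when the response (LSalePrice, and the SalePrice it is computed from) is included among the predictors passed to train(), so the model simply selects the response itself. Such a result should not be compared with the other models.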

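The random forest model appears to be the most accurate of the models evaluated on the held-out test set (RMSE 0.163 on the log-price scale, versus 0.177 for linear regression and 0.208 for the regression tree).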

Predicting price in the test subset using random forest

# test

# Predict log sale price for the held-out rows with the random forest
rand_forest_predictions <- predict(rand_forest_model, newdata = test, type = "response")
# Residuals use the forest's own predictions (not the linear model's)
rand_forest_residuals <- test$LSalePrice - rand_forest_predictions
accuracy(rand_forest_predictions, test$LSalePrice)
##                    ME      RMSE       MAE         MPE      MAPE
## Test set -0.003473407 0.1631106 0.1078001 -0.04431492 0.9039332
plot(rand_forest_predictions, test$LSalePrice)
# Reference line y = x: points on it are predicted exactly
lines(c(11, 13), c(11, 13), col = 'red')