House Data Set

The purpose of this project it is to create a model that predicts the price that houses will be sold in the illinois region.
There are two data sets, one contains the train data and the test data. Uploading those data set in here.

library(readr)
housing_train <- read.csv("/Users/jusimioni/Desktop/train.csv")
housing_test <- read.csv("/Users/jusimioni/Desktop/test.csv")
colnames(housing_train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
head(housing_train)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000
cbind(c("train", "test"),
      rbind(dim(housing_train), dim(housing_test)))
##      [,1]    [,2]   [,3]
## [1,] "train" "1460" "81"
## [2,] "test"  "1459" "80"

Data Preparation

Looking at TrainHouse many columns need to be changed, some need to replace NA to 0 and some columns are going to be changed from category to numerical so they can be use when doing analysis. You can check for the explanation for each column here with the changes made < Input Columnn guide here > .

Data Cleaning

Transforming the data to factor.

housing_train$MSZoning = as.factor(housing_train$MSZoning)
levels(housing_train$MSZoning)
## [1] "C (all)" "FV"      "RH"      "RL"      "RM"
# MSZoning column of train dataset has following levels: "C (all)", "FV", "RH", "RL", "RM"
housing_test$MSZoning = as.factor(housing_test$MSZoning)
levels(housing_test$MSZoning)
## [1] "C (all)" "FV"      "RH"      "RL"      "RM"
# Change of factors to numeric in train dataset
MSZoning=as.numeric(housing_train$MSZoning,"C "=1, "FV"=2, "RH"=3, "RL"=4, "RM"=5)
housing_train$MSZoning <-MSZoning
# Change of factors to numeric in test dataset
MSZoning=as.numeric(housing_test$MSZoning,"C "=1, "FV"=2, "RH"=3, "RL"=4, "RM"=5)
housing_test$MSZoning <-MSZoning
Street = as.factor(housing_train$Street)
Street = as.numeric(Street, "Pave"= 1,"Grvl"= 2)
housing_train$Street <-Street
# Pave got replaced with 1 and Grvl type of rode got replaced with 2
Street = as.factor(housing_test$Street)
Street = as.numeric(Street, "Pave"= 1,"Grvl"= 2)
housing_test$Street <-Street
# Pave got replaced with 1 and Grvl type of rode got replaced with 2
# Transforming Alley column to numeric in train dataset
Alley<-as.factor(housing_train$Alley)
levels(Alley)
## [1] "Grvl" "Pave"
Alley = as.numeric(Alley, "Pave"= 1,"Grvl"= 2, "NA"=0)
housing_train$Alley <- Alley
# Transforming Alley column to numeric in test dataset
Alley<-as.factor(housing_test$Alley)
levels(Alley)
## [1] "Grvl" "Pave"
Alley = as.numeric(Alley, "Pave"= 1,"Grvl"= 2)
housing_test$Alley <- Alley
# Transforming LotShape column to numeric in train dataset
LotShape <-as.factor(housing_train$LotShape)
levels(LotShape) # 4 levels: "IR1", "IR2", "IR3", "Reg"
## [1] "IR1" "IR2" "IR3" "Reg"
LotShape=as.numeric(LotShape,"IR1"=1, "IR2"=2, "IR3"=3, "Reg"=4)
housing_train$LotShape <- LotShape
# Transforming LotShape column to numeric in test dataset
LotShape <-as.factor(housing_test$LotShape)
levels(LotShape) # 4 levels: "IR1", "IR2", "IR3", "Reg"
## [1] "IR1" "IR2" "IR3" "Reg"
LotShape=as.numeric(LotShape,"IR1"=1, "IR2"=2, "IR3"=3, "Reg"=4)
housing_test$LotShape <- LotShape
# Transforming LandContour column to numeric in train dataset
LandContour <-as.factor(housing_train$LandContour)
levels(LandContour) # 4 levels: "Bnk", "HLS", "Low", "Lvl"
## [1] "Bnk" "HLS" "Low" "Lvl"
LandContour=as.numeric(LandContour,"Bnk"=1, "HLS"=2, "Low"=3, "Lvl"=4)
housing_train$LandContour <- LandContour
# Transforming LandContour column to numeric in test dataset
LandContour <-as.factor(housing_test$LandContour)
levels(LandContour) # 4 levels: "Bnk", "HLS", "Low", "Lvl"
## [1] "Bnk" "HLS" "Low" "Lvl"
LandContour=as.numeric(LandContour,"Bnk"=1, "HLS"=2, "Low"=3, "Lvl"=4)
housing_test$LandContour <- LandContour
# Transforming Utilities column to numeric in train dataset
Utilities <-as.factor(housing_train$Utilities)
levels(Utilities) # 2 levels: "AllPub", "NoSeWa"
## [1] "AllPub" "NoSeWa"
Utilities=as.numeric(Utilities,"AllPub"=1, "NoSeWa"=2, "NA"=0)
housing_train$Utilities <- Utilities

# Transforming Utilities column to numeric in test dataset
Utilities <-as.factor(housing_test$Utilities)
levels(Utilities) # 2 levels: "AllPub", "NoSeWa"
## [1] "AllPub"
Utilities=as.numeric(Utilities,"AllPub"=1, "NA"=0)
housing_test$Utilities <- Utilities
# Transforming LotConfig column to numeric in train dataset
LotConfig <-as.factor(housing_train$LotConfig)
levels(LotConfig)
## [1] "Corner"  "CulDSac" "FR2"     "FR3"     "Inside"
LotConfig=as.numeric(LotConfig,"Corner"=1, "CulDSac"=2, "FR2"=3, "FR3"=4, "Inside"=5, "NA"= 0)
housing_train$LotConfig <- LotConfig

# Transforming LotConfig column to numeric in test dataset
LotConfig <-as.factor(housing_test$LotConfig)
levels(LotConfig)
## [1] "Corner"  "CulDSac" "FR2"     "FR3"     "Inside"
LotConfig=as.numeric(LotConfig,"Corner"=1, "CulDSac"=2, "FR2"=3, "FR3"=4, "Inside"=5, "NA"=0)
housing_test$LotConfig <- LotConfig

# Transforming LandSlope column to numeric in train dataset
LandSlope <-as.factor(housing_train$LandSlope)
levels(LandSlope)
## [1] "Gtl" "Mod" "Sev"
LandSlope=as.numeric(LandSlope,"Gtl"=1, "Mod"=2, "Sev"=3)
housing_train$LandSlope <- LandSlope

# Transforming LandSlope column to numeric in test dataset
LandSlope <-as.factor(housing_test$LandSlope)
levels(LandSlope)
## [1] "Gtl" "Mod" "Sev"
LandSlope=as.numeric(LandSlope,"Blmngtn"=1, "Blueste"=2, "Sev"=3)
housing_test$LandSlope <- LandSlope

# Transforming Neighborhood column to numeric in train dataset
Neighborhood <-as.factor(housing_train$Neighborhood)
levels(Neighborhood)
##  [1] "Blmngtn" "Blueste" "BrDale"  "BrkSide" "ClearCr" "CollgCr" "Crawfor"
##  [8] "Edwards" "Gilbert" "IDOTRR"  "MeadowV" "Mitchel" "NAmes"   "NoRidge"
## [15] "NPkVill" "NridgHt" "NWAmes"  "OldTown" "Sawyer"  "SawyerW" "Somerst"
## [22] "StoneBr" "SWISU"   "Timber"  "Veenker"
Neighborhood=as.numeric(Neighborhood,"Blmngtn"=1, "Blueste"=2, "BrDale"=3, "BrkSide"=4, "ClearCr"=5, "CollgCr"=6, "Crawfor"=7, "Edwards"=8, "Gilbert"=9, "IDOTRR"=10, "MeadowV"=11, "Mitchel"=12, "NAmes"=13, "NoRidge"=14, "NPkVill"=15, "NridgHt"=16, "NWAmes"=17, "OldTown"=18, "SWISU"=19, "Sawyer"=20, "SawyerW"=21, "Somerst"=22, "StoneBr"=23, "Timber"=24, "Veenker"=25)
housing_train$Neighborhood <- Neighborhood

# Transforming Neighborhood column to numeric in test dataset
Neighborhood <-as.factor(housing_test$Neighborhood)
levels(Neighborhood)
##  [1] "Blmngtn" "Blueste" "BrDale"  "BrkSide" "ClearCr" "CollgCr" "Crawfor"
##  [8] "Edwards" "Gilbert" "IDOTRR"  "MeadowV" "Mitchel" "NAmes"   "NoRidge"
## [15] "NPkVill" "NridgHt" "NWAmes"  "OldTown" "Sawyer"  "SawyerW" "Somerst"
## [22] "StoneBr" "SWISU"   "Timber"  "Veenker"
Neighborhood=as.numeric(Neighborhood,"Blmngtn"=1, "Blueste"=2, "BrDale"=3, "BrkSide"=4, "ClearCr"=5, "CollgCr"=6, "Crawfor"=7, "Edwards"=8, "Gilbert"=9, "IDOTRR"=10, "MeadowV"=11, "Mitchel"=12, "NAmes"=13, "NoRidge"=14, "NPkVill"=15, "NridgHt"=16, "NWAmes"=17, "OldTown"=18, "SWISU"=19, "Sawyer"=20, "SawyerW"=21, "Somerst"=22, "StoneBr"=23, "Timber"=24, "Veenker"=25)
housing_test$Neighborhood <- Neighborhood
# Transforming Condition1 column to numeric in train dataset
Condition1 <-as.factor(housing_train$Condition1)
levels(Condition1)
## [1] "Artery" "Feedr"  "Norm"   "PosA"   "PosN"   "RRAe"   "RRAn"   "RRNe"  
## [9] "RRNn"
Condition1=as.numeric(Condition1,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_train$Condition1 <- Condition1
# Transforming Condition1 column to numeric in test dataset
Condition1 <-as.factor(housing_test$Condition1)
levels(Condition1) 
## [1] "Artery" "Feedr"  "Norm"   "PosA"   "PosN"   "RRAe"   "RRAn"   "RRNe"  
## [9] "RRNn"
Condition1=as.numeric(Condition1,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_test$Condition1 <- Condition1
# Transforming Condition2 column to numeric in train dataset
Condition2 <-as.factor(housing_train$Condition2)
levels(Condition2)
## [1] "Artery" "Feedr"  "Norm"   "PosA"   "PosN"   "RRAe"   "RRAn"   "RRNn"
Condition2=as.numeric(Condition2,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_train$Condition2 <- Condition2
# Transforming Condition2 column to numeric in test dataset
Condition2 <-as.factor(housing_test$Condition2)
levels(Condition2) #values
## [1] "Artery" "Feedr"  "Norm"   "PosA"   "PosN"
Condition2=as.numeric(Condition2,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_test$Condition2 <- Condition2
# Transforming BldgType column to numeric in train dataset
BldgType <-as.factor(housing_train$BldgType)
levels(BldgType)
## [1] "1Fam"   "2fmCon" "Duplex" "Twnhs"  "TwnhsE"
BldgType=as.numeric(BldgType,"1Fam"=1, "2FmCon"=2, "Duplx"=3, "TwnhsE"=4, "TwnhsI"=5)
housing_train$BldgType <- BldgType
# Transforming BldgType column to numeric in test dataset
BldgType <-as.factor(housing_test$BldgType)
levels(BldgType) 
## [1] "1Fam"   "2fmCon" "Duplex" "Twnhs"  "TwnhsE"
BldgType=as.numeric(BldgType,"1Fam"=1, "2FmCon"=2, "Duplx"=3, "TwnhsE"=4, "TwnhsI"=5)
housing_test$BldgType <- BldgType
# Transforming HouseStyle column to numeric in train dataset
HouseStyle <-as.factor(housing_train$HouseStyle)
levels(HouseStyle)
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Fin" "2.5Unf" "2Story" "SFoyer" "SLvl"
HouseStyle=as.numeric(HouseStyle,"1Story"=1, "1.5Fin"=2, "1.5Unf"=3, "2Story"=4, "2.5Fin"=5, "2.5Unf"=6, "SFoyer"=7, "SLvl"=8)
housing_train$HouseStyle <- HouseStyle
# Transforming HouseStyle column to numeric in test dataset
HouseStyle <-as.factor(housing_test$HouseStyle)
levels(HouseStyle) 
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Unf" "2Story" "SFoyer" "SLvl"
HouseStyle=as.numeric(HouseStyle,"1Story"=1, "1.5Fin"=2, "1.5Unf"=3, "2Story"=4, "2.5Fin"=5, "2.5Unf"=6, "SFoyer"=7, "SLvl"=8)
housing_test$HouseStyle <- HouseStyle
RoofStyle <-as.factor(housing_train$RoofStyle)
levels(RoofStyle)
## [1] "Flat"    "Gable"   "Gambrel" "Hip"     "Mansard" "Shed"
RoofStyle=as.numeric(RoofStyle,"Flat"=1, "Gable"=2, "Gambrel"=3, "Hip"=4, "Mansard"=5, "Shed"=6)
housing_train$RoofStyle <- RoofStyle
# Transforming RoofStyle column to numeric in test dataset
RoofStyle <-as.factor(housing_test$RoofStyle)
levels(RoofStyle) 
## [1] "Flat"    "Gable"   "Gambrel" "Hip"     "Mansard" "Shed"
RoofStyle=as.numeric(RoofStyle,"Flat"=1, "Gable"=2, "Gambrel"=3, "Hip"=4, "Mansard"=5, "Shed"=6)
housing_test$RoofStyle <- RoofStyle
# Transforming RoofMatl column to numeric in train dataset
RoofMatl <-as.factor(housing_train$RoofMatl)
levels(RoofMatl)
## [1] "ClyTile" "CompShg" "Membran" "Metal"   "Roll"    "Tar&Grv" "WdShake"
## [8] "WdShngl"
RoofMatl=as.numeric(RoofMatl,"ClyTile"=1, "CompShg"=2, "Membran"=3, "Metal"=4, "Roll"=5, "Tar&Grv"=6, "WdShake"=7, "WdShngl"=8)
housing_train$RoofMatl <- RoofMatl
# Transforming RoofMatl column to numeric in test dataset
RoofMatl <-as.factor(housing_test$RoofMatl)
levels(RoofMatl)
## [1] "CompShg" "Tar&Grv" "WdShake" "WdShngl"
RoofMatl=as.numeric(RoofMatl,"ClyTile"=1, "CompShg"=2, "Membran"=3, "Metal"=4, "Roll"=5, "Tar&Grv"=6, "WdShake"=7, "WdShngl"=8)
housing_test$RoofMatl <- RoofMatl
# Transforming Exterior1st column to numeric in train dataset
Exterior1st <-as.factor(housing_train$Exterior1st)
levels(Exterior1st)
##  [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock"  "CemntBd" "HdBoard"
##  [8] "ImStucc" "MetalSd" "Plywood" "Stone"   "Stucco"  "VinylSd" "Wd Sdng"
## [15] "WdShing"
Exterior1st=as.numeric(Exterior1st,"AsbShng"=1, "AsphShn"=2, "BrkComm"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10, "Stone"=11, "Stucco"=12,"VinylSd"=13, "Wd Sdng"=14, "WdShing"=15)
housing_train$Exterior1st <- Exterior1st
# Transforming Exterior1st column to numeric in test dataset
Exterior1st <-as.factor(housing_test$Exterior1st)
levels(Exterior1st)
##  [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock"  "CemntBd" "HdBoard"
##  [8] "MetalSd" "Plywood" "Stucco"  "VinylSd" "Wd Sdng" "WdShing"
Exterior1st=as.numeric(Exterior1st,"AsbShng"=1, "AsphShn"=2, "BrkComm"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10, "Stone"=11, "Stucco"=12,"VinylSd"=13, "Wd Sdng"=14, "WdShing"=15)
housing_test$Exterior1st <- Exterior1st
# Transforming Exterior2nd column to numeric in train dataset
Exterior2nd <-as.factor(housing_train$Exterior2nd)
levels(Exterior2nd)
##  [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock"  "CmentBd" "HdBoard"
##  [8] "ImStucc" "MetalSd" "Other"   "Plywood" "Stone"   "Stucco"  "VinylSd"
## [15] "Wd Sdng" "Wd Shng"
Exterior2nd = as.numeric(Exterior2nd,"AsbShng"=1, "AsphShn"=2, "BrkComm"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10, "Stone"=11, "Stucco"=12,"VinylSd"=13, "Wd Sdng"=14, "WdShing"=15)
housing_train$Exterior2nd <- Exterior2nd
# Transforming Exterior2nd column to numeric in test dataset
Exterior2nd <-as.factor(housing_test$Exterior2nd)
levels(Exterior2nd)
##  [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock"  "CmentBd" "HdBoard"
##  [8] "ImStucc" "MetalSd" "Plywood" "Stone"   "Stucco"  "VinylSd" "Wd Sdng"
## [15] "Wd Shng"
Exterior2nd=as.numeric(Exterior2nd,"AsbShng"=1, "AsphShn"=2, "Brk Cmn"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10,"PreCast"=11, "Stone"=12, "Stucco"=13,"VinylSd"=14, "Wd Sdng"=15, "Wd Shing"=16)
housing_test$Exterior2nd <- Exterior2nd
# Transforming MasVnrType column to numeric in train dataset
MasVnrType <-as.factor(housing_train$MasVnrType)
levels(MasVnrType)
## [1] "BrkCmn"  "BrkFace" "None"    "Stone"
MasVnrType=as.numeric(MasVnrType,"BrkCmn"=1, "BrkFace"=2, "None"=0, "Stone"=4)
housing_train$MasVnrType <- MasVnrType
# Transforming MasVnrType column to numeric in test dataset
MasVnrType <-as.factor(housing_test$MasVnrType)
levels(MasVnrType)
## [1] "BrkCmn"  "BrkFace" "None"    "Stone"
MasVnrType=as.numeric(MasVnrType,"BrkCmn"=1, "BrkFace"=2, "Stone"=3, "NA"=0) 
housing_test$MasVnrType <- MasVnrType
# Transforming ExterQual column to numeric in train dataset
ExterQual <-as.factor(housing_train$ExterQual)
levels(ExterQual)
## [1] "Ex" "Fa" "Gd" "TA"
ExterQual=as.numeric(ExterQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 ) 
housing_train$ExterQual <- ExterQual
# Transforming ExterQual column to numeric in test dataset
ExterQual <-as.factor(housing_test$ExterQual)
levels(ExterQual)
## [1] "Ex" "Fa" "Gd" "TA"
ExterQual=as.numeric(ExterQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 ) 
housing_test$ExterQual <- ExterQual
# Transforming ExterCond column to numeric in train dataset
ExterCond <-as.factor(housing_train$ExterCond)
levels(ExterCond)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
ExterCond=as.numeric(ExterCond,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 ) 
housing_train$ExterCond <- ExterCond
# Transforming ExterCond column to numeric in test dataset
ExterCond <-as.factor(housing_test$ExterCond)
levels(ExterCond)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
ExterCond=as.numeric(ExterCond,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 ) 
housing_test$ExterCond <- ExterCond
# Transforming Foundation column to numeric in train dataset
Foundation <-as.factor(housing_train$Foundation)
levels(Foundation)
## [1] "BrkTil" "CBlock" "PConc"  "Slab"   "Stone"  "Wood"
Foundation=as.numeric(Foundation,"BrkTil"=1, "CBlock"=2, "PConc"=3, "Slab"=4, "Stone"=5, "Wood" = 6 ) 
housing_train$Foundation <- Foundation
# Transforming Foundation column to numeric in test dataset
Foundation <-as.factor(housing_test$Foundation)
levels(Foundation)
## [1] "BrkTil" "CBlock" "PConc"  "Slab"   "Stone"  "Wood"
Foundation=as.numeric(Foundation,"BrkTil"=1, "CBlock"=2, "PConc"=3, "Slab"=4, "Stone"=5, "Wood" = 6 ) 
housing_test$Foundation<- Foundation
# Transforming BsmtQual column to numeric in train dataset
BsmtQual <-as.factor(housing_train$BsmtQual)
levels(BsmtQual)
## [1] "Ex" "Fa" "Gd" "TA"
BsmtQual=as.numeric(BsmtQual,"Ex"=1, "Fa"=2, "Gd"=3, "TA"=4, "Fa"=5, "Po" = 6, "NA"=0) 
housing_train$BsmtQual <- BsmtQual
# Transforming BsmtQual column to numeric in test dataset
BsmtQual <-as.factor(housing_test$BsmtQual)
levels(BsmtQual)
## [1] "Ex" "Fa" "Gd" "TA"
BsmtQual=as.numeric(BsmtQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5, "NA"=0 ) 
housing_test$BsmtQual<- BsmtQual
# Transforming BsmtCond column to numeric in train dataset
BsmtCond <-as.factor(housing_train$BsmtCond)
levels(BsmtCond)
## [1] "Fa" "Gd" "Po" "TA"
BsmtCond=as.numeric(BsmtCond,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5, "NA"=0) 
housing_train$BsmtCond <- BsmtCond
# Transforming BsmtCond column to numeric in test dataset
BsmtCond <-as.factor(housing_test$BsmtCond)
levels(BsmtCond)
## [1] "Fa" "Gd" "Po" "TA"
BsmtCond=as.numeric(BsmtCond,"Ex"=1, "Fa"=4, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5, "NA"=0 ) 
housing_test$BsmtCond<- BsmtCond
# Transforming BsmtExposure column to numeric in train dataset
BsmtExposure <-as.factor(housing_train$BsmtExposure)
levels(BsmtExposure)
## [1] "Av" "Gd" "Mn" "No"
BsmtExposure=as.numeric(BsmtExposure,"Av"=1, "Gd"=2, "Mn"=3, "No"=4, "NA"=0) 
housing_train$BsmtExposure <- BsmtExposure
# Transforming BsmtExposure column to numeric in test dataset
BsmtExposure <-as.factor(housing_test$BsmtExposure)
levels(BsmtExposure)
## [1] "Av" "Gd" "Mn" "No"
BsmtExposure=as.numeric(BsmtExposure,"Av"=1, "Gd"=2, "Mn"=3, "No"=4, "NA"=0) 
housing_test$BsmtExposure<- BsmtExposure
# Transforming BsmtFinType1 column to numeric in train dataset
BsmtFinType1 <-as.factor(housing_train$BsmtFinType1)
levels(BsmtFinType1)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType1=as.numeric(BsmtFinType1,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6, "NA"=0) 
housing_train$BsmtFinType1 <- BsmtFinType1
# Transforming BsmtFinType1 column to numeric in test dataset
BsmtFinType1 <-as.factor(housing_test$BsmtFinType1)
levels(BsmtFinType1)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType1=as.numeric(BsmtFinType1,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6) 
housing_test$BsmtFinType1<- BsmtFinType1
housing_train$BsmtFinType1[is.na(housing_train$BsmtFinType1)] <- 0
sum(is.na(housing_train$BsmtFinType1))
## [1] 0
housing_test$BsmtFinType1[is.na(housing_test$BsmtFinType1)] <- 0
sum(is.na(housing_test$BsmtFinType1))
## [1] 0
# Transforming BsmtFinType2 column to numeric in train dataset
BsmtFinType2 <-as.factor(housing_train$BsmtFinType2)
levels(BsmtFinType2)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType2=as.numeric(BsmtFinType2,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6, "NA"=0) 
housing_train$BsmtFinType2 <- BsmtFinType2
# Transforming BsmtFinType2 column to numeric in test dataset
BsmtFinType2 <-as.factor(housing_test$BsmtFinType2)
levels(BsmtFinType2)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType2=as.numeric(BsmtFinType2,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6, "NA"=0) 
housing_test$BsmtFinType2<- BsmtFinType2
housing_train$BsmtFinType2[is.na(housing_train$BsmtFinType2)] <- 0
sum(is.na(housing_train$BsmtFinType2))
## [1] 0
housing_test$BsmtFinType2[is.na(housing_test$BsmtFinType2)] <- 0
sum(is.na(housing_test$BsmtFinType2))
## [1] 0
# Transforming Heating column to numeric in train dataset
Heating <-as.factor(housing_train$Heating)
levels(Heating)
## [1] "Floor" "GasA"  "GasW"  "Grav"  "OthW"  "Wall"
Heating=as.numeric(Heating,"Floor"=1, "GasA"=2, "GasW"=3, "Grav"=4, "OthW"=5, "Wall"=6, "NA"=0) 
housing_train$Heating <- Heating
# Transforming Heating column to numeric in test dataset
Heating <-as.factor(housing_test$Heating)
levels(Heating)
## [1] "GasA" "GasW" "Grav" "Wall"
Heating=as.numeric(Heating,"Floor"=1, "GasA"=2, "GasW"=3, "Grav"=4, "OthW"=5, "Wall"=6, "NA"=0) 
housing_test$Heating<- Heating
# Transforming HeatingQC column to numeric in train dataset
HeatingQC <-as.factor(housing_train$HeatingQC)
levels(HeatingQC)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
HeatingQC=as.numeric(HeatingQC,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5) 
housing_train$HeatingQC <- HeatingQC
# Transforming HHeatingQC column to numeric in test dataset
HeatingQC <-as.factor(housing_test$HeatingQC)
levels(HeatingQC)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
HeatingQC=as.numeric(HeatingQC,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5) 
housing_test$HeatingQC<- HeatingQC
# Transforming CentralAir column to numeric in train dataset
CentralAir <-as.factor(housing_train$CentralAir)
levels(CentralAir)
## [1] "N" "Y"
CentralAir=as.numeric(CentralAir,"N"=0, "Y"=1) 
housing_train$CentralAir <- CentralAir
# Transforming CentralAir column to numeric in test dataset
CentralAir <-as.factor(housing_test$CentralAir)
levels(CentralAir)
## [1] "N" "Y"
CentralAir=as.numeric(CentralAir,"N"=0, "Y"=1) 
housing_test$CentralAir<- CentralAir
# Transforming Electrical column to numeric in train dataset
Electrical <-as.factor(housing_train$Electrical)
levels(Electrical)
## [1] "FuseA" "FuseF" "FuseP" "Mix"   "SBrkr"
Electrical=as.numeric(Electrical,"SBrkr"=1, "FuseA"=2, "FuseF"=3, "FuseP"=4, "Mix"=5, "NA"=0) 
housing_train$Electrical <- Electrical
# Transforming Electrical column to numeric in test dataset
Electrical <-as.factor(housing_test$Electrical)
levels(Electrical)
## [1] "FuseA" "FuseF" "FuseP" "SBrkr"
Electrical=as.numeric(Electrical,"SBrkr"=1, "FuseA"=2, "FuseF"=3, "FuseP"=4, "Mix"=5 ) 
housing_test$Electrical<- Electrical
housing_train$Electrical[is.na(housing_train$Electrical)] <- 0
sum(is.na(housing_train$Electrical))
## [1] 0
housing_test$Electrical[is.na(housing_test$Electrical)] <- 0
sum(is.na(housing_test$Electrical))
## [1] 0
# Transforming KitchenQual column to numeric in train dataset
KitchenQual <-as.factor(housing_train$KitchenQual)
levels(KitchenQual)
## [1] "Ex" "Fa" "Gd" "TA"
KitchenQual=as.numeric(KitchenQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5) 
housing_train$KitchenQual <- KitchenQual
# Transforming KitchenQual column to numeric in test dataset
KitchenQual <-as.factor(housing_test$KitchenQual)
levels(KitchenQual)
## [1] "Ex" "Fa" "Gd" "TA"
KitchenQual=as.numeric(KitchenQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 ) 
housing_test$KitchenQual<- KitchenQual
#Transforming Functional column to numeric in train dataset
Functional=as.factor(housing_train$Functional)
levels(Functional) #"Maj1" "Maj2" "Min1" "Min2" "Mod"  "Sev"  "Typ"
## [1] "Maj1" "Maj2" "Min1" "Min2" "Mod"  "Sev"  "Typ"
Functional=as.numeric(Functional,  "Sev"=1, "Maj2"=2,"Maj1"=3, "Mod"=4, "Min2"=5, "Min1"=6, "Typ"=7)
Functional=Functional+1 # "Sal"=1, Sev"=2, "Maj2"=3,"Maj1"=4, "Mod"=5, "Min2"=6, "Min1"=7, "Typ"=8
housing_train$Functional <- Functional
#Transforming Functional column to numeric in test dataset
Functional=as.factor(housing_test$Functional)
levels(Functional) #"Maj1" "Maj2" "Min1" "Min2" "Mod"  "Sev"  "Typ"
## [1] "Maj1" "Maj2" "Min1" "Min2" "Mod"  "Sev"  "Typ"
Functional=as.numeric(Functional,  "Sev"=1, "Maj2"=2,"Maj1"=3, "Mod"=4, "Min2"=5, "Min1"=6, "Typ"=7, "NA"=0)
Functional=Functional +1 # "Sal"=1, Sev"=2, "Maj2"=3,"Maj1"=4, "Mod"=5, "Min2"=6, "Min1"=7, "Typ"=8
housing_test$Functional <- Functional
#Transforming FireplaceQu column to numeric in train dataset
FireplaceQu=as.factor(housing_train$FireplaceQu)
levels(FireplaceQu) #"Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(FireplaceQu)) # 690 Missing Entries
## [1] 690
FireplaceQu=as.numeric(FireplaceQu,  "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5, "NA"=0)
FireplaceQu[is.na(FireplaceQu)]<-0
housing_train$FireplaceQu <- FireplaceQu
#Transforming FireplaceQu column to numeric in test dataset
FireplaceQu=as.factor(housing_test$FireplaceQu)
levels(FireplaceQu) #"Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(FireplaceQu)) # 730 Missing Entries
## [1] 730
FireplaceQu=as.numeric(FireplaceQu,  "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5, "NA"=0)
FireplaceQu[is.na(FireplaceQu)]<-0
housing_test$FireplaceQu <- FireplaceQu
#Transforming GarageType column to numeric in train dataset
GarageType=as.factor(housing_train$GarageType)
levels(GarageType) #"2Types"  "Attchd"  "Basment" "BuiltIn" "CarPort" "Detchd"
## [1] "2Types"  "Attchd"  "Basment" "BuiltIn" "CarPort" "Detchd"
sum(is.na(GarageType)) # 81 Missing Entries
## [1] 81
GarageType=as.numeric(GarageType,  "Detchd"=1, "CarPort"=2,"BuiltIn"=3, "Basment"=4, "Attchd"=5, "2Types"=6)
GarageType[is.na(GarageType)]<-0
housing_train$GarageType <- GarageType
#Transforming GarageType column to numeric in test dataset
GarageType=as.factor(housing_test$GarageType)
levels(GarageType) #"2Types"  "Attchd"  "Basment" "BuiltIn" "CarPort" "Detchd"
## [1] "2Types"  "Attchd"  "Basment" "BuiltIn" "CarPort" "Detchd"
sum(is.na(GarageType)) # 76 Missing Entries
## [1] 76
GarageType=as.numeric(GarageType,  "Detchd"=1, "CarPort"=2,"BuiltIn"=3, "Basment"=4, "Attchd"=5, "2Types"=6)
GarageType[is.na(GarageType)]<-0
housing_test$GarageType <- GarageType
# Changing missing values of GarageYrBlt 
sum(is.na(housing_train$GarageYrBlt)) # 81 missing values
## [1] 81
sum(is.na(housing_test$GarageYrBlt)) #78 missing values
## [1] 78
housing_train$GarageYrBlt[is.na(housing_train$GarageYrBlt)] <- 0
sum(is.na(housing_train$GarageYrBlt))
## [1] 0
housing_test$GarageYrBlt[is.na(housing_test$GarageYrBlt)] <- 0
sum(is.na(housing_test$GarageYrBlt))
## [1] 0
#Transforming GarageFinish column to numeric in train dataset
GarageFinish=as.factor(housing_train$GarageFinish)
levels(GarageFinish) #"Fin" "RFn" "Unf"
## [1] "Fin" "RFn" "Unf"
sum(is.na(GarageFinish)) # 81 Missing Entries
## [1] 81
GarageFinish=as.numeric(GarageFinish,  "Unf"=1, "RFn"=2,"Fin"=3)
GarageFinish[is.na(GarageFinish)]<-0
housing_train$GarageFinish <- GarageFinish
#Transforming GarageFinish column to numeric in test dataset
GarageFinish=as.factor(housing_test$GarageFinish)
levels(GarageFinish) #"Fin" "RFn" "Unf"
## [1] "Fin" "RFn" "Unf"
sum(is.na(GarageFinish)) # 78 Missing Entries
## [1] 78
GarageFinish=as.numeric(GarageFinish,  "Unf"=1, "RFn"=2,"Fin"=3)
GarageFinish[is.na(GarageFinish)]<-0
housing_test$GarageFinish <- GarageFinish
# Changing missing values of GarageCars 
sum(is.na(housing_train$GarageCars)) # no missing values
## [1] 0
sum(is.na(housing_test$GarageCars)) #1 missing value
## [1] 1
housing_test$GarageCars[is.na(housing_test$GarageCars)]<-0
# Changing missing values of GarageArea
sum(is.na(housing_train$GarageArea)) # no missing values
## [1] 0
sum(is.na(housing_test$GarageArea)) #1 missing value
## [1] 1
housing_test$GarageArea[is.na(housing_test$GarageArea)]<-0
#Transforming GarageQual column to numeric in train dataset
GarageQual=as.factor(housing_train$GarageQual)
levels(GarageQual) # "Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(GarageQual)) # 81 Missing Entries
## [1] 81
GarageQual=as.numeric(GarageQual,  "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5)
GarageQual[is.na(GarageQual)]<-0
housing_train$GarageQual <- GarageQual
#Transforming GarageQual column to numeric in test dataset
GarageQual=as.factor(housing_test$GarageQual)
levels(GarageQual) # "Fa" "Gd" "Po" "TA"
## [1] "Fa" "Gd" "Po" "TA"
sum(is.na(GarageQual)) # 78 Missing Entries
## [1] 78
GarageQual=as.numeric(GarageQual,  "Po"=1, "Fa"=2,"TA"=3, "Gd"=4)
GarageQual[is.na(GarageQual)]<-0
housing_test$GarageQual <- GarageQual
#Transforming GarageCond column to numeric in train dataset
GarageCond=as.factor(housing_train$GarageCond)
levels(GarageCond) # "Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(GarageCond)) # 81 Missing Entries
## [1] 81
GarageCond=as.numeric(GarageCond,  "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5)
GarageCond[is.na(GarageCond)]<-0
housing_train$GarageCond <- GarageCond
#Transforming GarageCond column to numeric in test dataset
GarageCond=as.factor(housing_test$GarageCond)
levels(GarageCond) # "Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(GarageCond)) # 78 Missing Entries
## [1] 78
GarageCond=as.numeric(GarageCond,  "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5)
GarageCond[is.na(GarageCond)]<-0
housing_test$GarageCond <- GarageCond
#Transforming PavedDrive column to numeric in train dataset
PavedDrive=as.factor(housing_train$PavedDrive)
levels(PavedDrive) # "N" "P" "Y"
## [1] "N" "P" "Y"
sum(is.na(PavedDrive)) # 0 Missing Entries
## [1] 0
PavedDrive=as.numeric(PavedDrive,  "N"=1, "P"=2,"Y"=3)
housing_train$PavedDrive <- PavedDrive
#Transforming PavedDrive column to numeric in test dataset
PavedDrive=as.factor(housing_test$PavedDrive)
levels(PavedDrive) # "N" "P" "Y"
## [1] "N" "P" "Y"
sum(is.na(PavedDrive)) # 0 Missing Entries
## [1] 0
PavedDrive=as.numeric(PavedDrive,  "N"=1, "P"=2,"Y"=3)
housing_test$PavedDrive <- PavedDrive
#Transforming PoolQC column to numeric in train dataset
PoolQC=as.factor(housing_train$PoolQC)
levels(PoolQC) # "N" "P" "Y"
## [1] "Ex" "Fa" "Gd"
sum(is.na(PoolQC)) # 1453 Missing Entries
## [1] 1453
PoolQC=as.numeric(PoolQC,  "Fa"=1, "Gd"=2,"Ex"=3)
PoolQC <-ifelse(PoolQC==2|PoolQC==3,PoolQC+1,PoolQC) # No pool=0, Fa=1, TA=2, Gd=3, Ex=4
PoolQC[is.na(PoolQC)]<-0
housing_train$PoolQC <- PoolQC
#Transforming PoolQC column to numeric in test dataset
PoolQC=as.factor(housing_test$PoolQC)
levels(PoolQC) # "Ex" "Gd" 
## [1] "Ex" "Gd"
sum(is.na(PoolQC)) # 1456 Missing Entries
## [1] 1456
PoolQC=as.numeric(PoolQC,  "Gd"=1, "Ex"=2)
PoolQC=PoolQC+2
PoolQC[is.na(PoolQC)]<-0
housing_test$PoolQC <- PoolQC
#Transforming Fence column to numeric in train dataset
Fence=as.factor(housing_train$Fence)
levels(Fence) # "GdPrv" "GdWo"  "MnPrv" "MnWw" 
## [1] "GdPrv" "GdWo"  "MnPrv" "MnWw"
sum(is.na(Fence)) # 1179 Missing Entries
## [1] 1179
Fence=as.numeric(Fence,  "MnWw"=1, "GdWo"=2,"MnPrv"=3, "GdPrv"=4)
Fence[is.na(Fence)]<-0
housing_train$Fence <- Fence
#Transforming Fence column to numeric in test dataset
Fence=as.factor(housing_test$Fence)
levels(Fence) # "GdPrv" "GdWo"  "MnPrv" "MnWw" 
## [1] "GdPrv" "GdWo"  "MnPrv" "MnWw"
sum(is.na(Fence)) # 1169 Missing Entries
## [1] 1169
Fence=as.numeric(Fence,  "MnWw"=1, "GdWo"=2,"MnPrv"=3, "GdPrv"=4)
Fence[is.na(Fence)]<-0
housing_test$Fence <- Fence
#Transforming MiscFeature column to numeric in train dataset
MiscFeature=as.factor(housing_train$MiscFeature)
levels(MiscFeature) # "Gar2" "Othr" "Shed" "TenC"
## [1] "Gar2" "Othr" "Shed" "TenC"
sum(is.na(MiscFeature)) # 1406 Missing Entries
## [1] 1406
MiscFeature=as.numeric(MiscFeature,  "TenC"=1, "Shed"=2,"Othr"=3, "Gar2"=4)
MiscFeature[is.na(MiscFeature)]<-0
housing_train$MiscFeature <- MiscFeature
#Transforming MiscFeature column to numeric in test dataset
MiscFeature=as.factor(housing_test$MiscFeature)
levels(MiscFeature) # "Gar2" "Othr" "Shed" 
## [1] "Gar2" "Othr" "Shed"
sum(is.na(MiscFeature)) # 1408 Missing Entries
## [1] 1408
MiscFeature=as.numeric(MiscFeature, "Shed"=1,"Othr"=2, "Gar2"=3)
MiscFeature=MiscFeature+1
MiscFeature[is.na(MiscFeature)]<-0 #"TenC"=1, "Shed"=2,"Othr"=3, "Gar2"=4
housing_test$MiscFeature <- MiscFeature
#Transforming SaleType column to numeric in train dataset
SaleType=as.factor(housing_train$SaleType)
levels(SaleType) # "COD"   "Con"   "ConLD" "ConLI" "ConLw" "CWD"   "New"   "Oth"   "WD" 
## [1] "COD"   "Con"   "ConLD" "ConLI" "ConLw" "CWD"   "New"   "Oth"   "WD"
sum(is.na(SaleType)) # 0 Missing Entries
## [1] 0
SaleType=as.numeric(SaleType,  "Oth"=1, "ConLD"=2,"ConLI"=3, "ConLw"=4, "Con"=5, "COD"=6, "New"=7, "CWD"=8, "WD"=9)
housing_train$SaleType <- SaleType
#Transforming SaleType column to numeric in test dataset
SaleType=as.factor(housing_test$SaleType)
levels(SaleType) # "COD"   "Con"   "ConLD" "ConLI" "ConLw" "CWD"   "New"   "Oth"   "WD" 
## [1] "COD"   "Con"   "ConLD" "ConLI" "ConLw" "CWD"   "New"   "Oth"   "WD"
sum(is.na(SaleType)) # 1 Missing Entries
## [1] 1
SaleType=as.numeric(SaleType,  "Oth"=1, "ConLD"=2,"ConLI"=3, "ConLw"=4, "Con"=5, "COD"=6, "New"=7, "CWD"=8, "WD"=9)
SaleType[is.na(SaleType)]<-1
housing_test$SaleType <- SaleType
#Transforming SaleCondition column to numeric in train dataset
SaleCondition=as.factor(housing_train$SaleCondition)
levels(SaleCondition) # "Abnorml" "AdjLand" "Alloca"  "Family"  "Normal"  "Partial" 
## [1] "Abnorml" "AdjLand" "Alloca"  "Family"  "Normal"  "Partial"
sum(is.na(SaleCondition)) # 0 Missing Entries
## [1] 0
SaleCondition=as.numeric(SaleCondition,  "Partial"=1, "Family"=2,"Alloca"=3, "AdjLand"=4, "Abnorml"=5, "Normal"=6)
housing_train$SaleCondition <- SaleCondition
#Transforming SaleCondition column to numeric in test dataset
SaleCondition=as.factor(housing_test$SaleCondition)
levels(SaleCondition) # "Abnorml" "AdjLand" "Alloca"  "Family"  "Normal"  "Partial" 
## [1] "Abnorml" "AdjLand" "Alloca"  "Family"  "Normal"  "Partial"
sum(is.na(SaleCondition)) # 0 Missing Entries
## [1] 0
SaleCondition=as.numeric(SaleCondition,  "Partial"=1, "Family"=2,"Alloca"=3, "AdjLand"=4, "Abnorml"=5, "Normal"=6)
housing_test$SaleCondition <- SaleCondition

##Replacing NA with Zeros

data.frame(num_missing=colSums(is.na(housing_train)))
##               num_missing
## Id                      0
## MSSubClass              0
## MSZoning                0
## LotFrontage           259
## LotArea                 0
## Street                  0
## Alley                1369
## LotShape                0
## LandContour             0
## Utilities               0
## LotConfig               0
## LandSlope               0
## Neighborhood            0
## Condition1              0
## Condition2              0
## BldgType                0
## HouseStyle              0
## OverallQual             0
## OverallCond             0
## YearBuilt               0
## YearRemodAdd            0
## RoofStyle               0
## RoofMatl                0
## Exterior1st             0
## Exterior2nd             0
## MasVnrType              8
## MasVnrArea              8
## ExterQual               0
## ExterCond               0
## Foundation              0
## BsmtQual               37
## BsmtCond               37
## BsmtExposure           38
## BsmtFinType1            0
## BsmtFinSF1              0
## BsmtFinType2            0
## BsmtFinSF2              0
## BsmtUnfSF               0
## TotalBsmtSF             0
## Heating                 0
## HeatingQC               0
## CentralAir              0
## Electrical              0
## X1stFlrSF               0
## X2ndFlrSF               0
## LowQualFinSF            0
## GrLivArea               0
## BsmtFullBath            0
## BsmtHalfBath            0
## FullBath                0
## HalfBath                0
## BedroomAbvGr            0
## KitchenAbvGr            0
## KitchenQual             0
## TotRmsAbvGrd            0
## Functional              0
## Fireplaces              0
## FireplaceQu             0
## GarageType              0
## GarageYrBlt             0
## GarageFinish            0
## GarageCars              0
## GarageArea              0
## GarageQual              0
## GarageCond              0
## PavedDrive              0
## WoodDeckSF              0
## OpenPorchSF             0
## EnclosedPorch           0
## X3SsnPorch              0
## ScreenPorch             0
## PoolArea                0
## PoolQC                  0
## Fence                   0
## MiscFeature             0
## MiscVal                 0
## MoSold                  0
## YrSold                  0
## SaleType                0
## SaleCondition           0
## SalePrice               0
data.frame(num_missing=colSums(is.na(housing_test)))
##               num_missing
## Id                      0
## MSSubClass              0
## MSZoning                4
## LotFrontage           227
## LotArea                 0
## Street                  0
## Alley                1352
## LotShape                0
## LandContour             0
## Utilities               2
## LotConfig               0
## LandSlope               0
## Neighborhood            0
## Condition1              0
## Condition2              0
## BldgType                0
## HouseStyle              0
## OverallQual             0
## OverallCond             0
## YearBuilt               0
## YearRemodAdd            0
## RoofStyle               0
## RoofMatl                0
## Exterior1st             1
## Exterior2nd             1
## MasVnrType             16
## MasVnrArea             15
## ExterQual               0
## ExterCond               0
## Foundation              0
## BsmtQual               44
## BsmtCond               45
## BsmtExposure           44
## BsmtFinType1            0
## BsmtFinSF1              1
## BsmtFinType2            0
## BsmtFinSF2              1
## BsmtUnfSF               1
## TotalBsmtSF             1
## Heating                 0
## HeatingQC               0
## CentralAir              0
## Electrical              0
## X1stFlrSF               0
## X2ndFlrSF               0
## LowQualFinSF            0
## GrLivArea               0
## BsmtFullBath            2
## BsmtHalfBath            2
## FullBath                0
## HalfBath                0
## BedroomAbvGr            0
## KitchenAbvGr            0
## KitchenQual             1
## TotRmsAbvGrd            0
## Functional              2
## Fireplaces              0
## FireplaceQu             0
## GarageType              0
## GarageYrBlt             0
## GarageFinish            0
## GarageCars              0
## GarageArea              0
## GarageQual              0
## GarageCond              0
## PavedDrive              0
## WoodDeckSF              0
## OpenPorchSF             0
## EnclosedPorch           0
## X3SsnPorch              0
## ScreenPorch             0
## PoolArea                0
## PoolQC                  0
## Fence                   0
## MiscFeature             0
## MiscVal                 0
## MoSold                  0
## YrSold                  0
## SaleType                0
## SaleCondition           0

Input 0 for missing values.

housing_train$LotFrontage[is.na(housing_train$LotFrontage)] <- 0
sum(is.na(housing_train$LotFrontage))
## [1] 0
housing_test$LotFrontage[is.na(housing_test$LotFrontage)] <- 0
sum(is.na(housing_test$LotFrontage))
## [1] 0
housing_train$BsmtQual[is.na(housing_train$BsmtQual)] <- 0
sum(is.na(housing_train$BsmtQual))
## [1] 0
housing_test$BsmtQual[is.na(housing_test$BsmtQual)] <- 0
sum(is.na(housing_test$BsmtQual))
## [1] 0
housing_train$MasVnrType[is.na(housing_train$MasVnrType)] <- 0
sum(is.na(housing_train$MasVnrType))
## [1] 0
housing_test$MasVnrType[is.na(housing_test$MasVnrType)] <- 0
sum(is.na(housing_test$MasVnrType))
## [1] 0
housing_train$MasVnrArea[is.na(housing_train$MasVnrArea)] <- 0
sum(is.na(housing_train$MasVnrArea))
## [1] 0
housing_test$MasVnrArea[is.na(housing_test$MasVnrArea)] <- 0
sum(is.na(housing_test$MasVnrArea))
## [1] 0
housing_train$BsmtCond[is.na(housing_train$BsmtCond)] <- 0
sum(is.na(housing_train$BsmtCond))
## [1] 0
housing_test$BsmtCond[is.na(housing_test$BsmtCond)] <- 0
sum(is.na(housing_test$BsmtCond))
## [1] 0
housing_train$BsmtExposure[is.na(housing_train$BsmtExposure)] <- 0
sum(is.na(housing_train$BsmtExposure))
## [1] 0
housing_test$BsmtExposure[is.na(housing_test$BsmtExposure)] <- 0
sum(is.na(housing_test$BsmtExposure))
## [1] 0
housing_train$MiscFeature[is.na(housing_train$MiscFeature)] <- 0
sum(is.na(housing_train$MiscFeature))
## [1] 0
housing_test$GarageQual[is.na(housing_test$GarageQual)] <- 0
sum(is.na(housing_test$GarageQual))
## [1] 0
housing_test$MSZoning[is.na(housing_test$MSZoning)] <- 0
sum(is.na(housing_test$MSZoning))
## [1] 0
housing_test$Exterior1st[is.na(housing_test$Exterior1st)] <- 0
sum(is.na(housing_test$Exterior1st))
## [1] 0
housing_test$Exterior2nd[is.na(housing_test$Exterior2nd)] <- 0
sum(is.na(housing_test$Exterior2nd))
## [1] 0
housing_test$BsmtFinSF1[is.na(housing_test$BsmtFinSF1)] <- 0
sum(is.na(housing_test$BsmtFinSF1))
## [1] 0
housing_test$BsmtFinSF2[is.na(housing_test$BsmtFinSF2)] <- 0
sum(is.na(housing_test$BsmtFinSF2   ))
## [1] 0
housing_test$BsmtUnfSF[is.na(housing_test$BsmtUnfSF)] <- 0
sum(is.na(housing_test$BsmtUnfSF))
## [1] 0
housing_test$BsmtFullBath[is.na(housing_test$BsmtFullBath)] <- 0
sum(is.na(housing_test$BsmtFullBath))
## [1] 0
housing_test$BsmtHalfBath[is.na(housing_test$BsmtHalfBath)] <- 0
sum(is.na(housing_test$BsmtHalfBath))
## [1] 0
housing_test$KitchenQual[is.na(housing_test$KitchenQual)] <- 0
sum(is.na(housing_test$KitchenQual))
## [1] 0
housing_test$Functional[is.na(housing_test$Functional)] <- 0
sum(is.na(housing_test$Functional))
## [1] 0
#housing_train$Alley[is.na(housing_train$Alley)] <- 0
#sum(is.na(housing_train$Alley))
#housing_test$Alley[is.na(housing_test$Alley)] <- 0
#sum(is.na(housing_test$Alley))

Everything else will have 0 for missing values since that aren’t many values missing.
For Alley the column will be stored since most of the rows do not have a value. The model was run with Alley and without alley, it shows that alley column variable makes the model slightly worst so the variable will be excluded.

trainAlley <- housing_train$Alley
testAlley <- housing_test$Alley
housing_train$Alley <- NULL
housing_test$Alley <- NULL

Everything else will have 0 for missing values since that aren’t many values missing.

#Visualizations The next step it is to create visualizations to start the analysis.
Looking at the correlation between variables:

#Libraries for the next visualizations
library(ggcorrplot) 
## Loading required package: ggplot2
library(ggplot2)
correlations <- cor(housing_train[,c(2:15
                                    ,80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower",  sig.level = 0.01, insig = "blank")

Looking at positive correlations the variables that show the strongest correlation is MSSubClass and BldgType, followed by landSlope and LotArea. Looking a negative correlations there are greater correlations on Utilities and LandSlope, BlgdType with LotFrontage and LotArea. Now looking on SalesPrice variables MSSubClass, MSZoning, LotShape, LotConfig, BldgType have negative correlation with sales price. The variables LotFrontage, LotArea, Neighborhood, and condition have a positive correlation.

Looking at a few more variables.

correlations <- cor(housing_train[,c(16:26, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower",  sig.level = 0.01, insig = "blank")

Looking at the variables that have greater correlation to sales price. OverallQual, YearBuilt, YearRemondAdd, MasVnrArea, and RoofStyle have a positive correlation. The only variable that has a negative correlation to SalesPrice is OverallCond.

correlations <- cor(housing_train[,c(27:40, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower",  sig.level = 0.01, insig = "blank")

In this set of variables it shows that ExterQual has a negative correlation to the Sales price, as well as BsmtQual, HeatingQC, and BsmtQUal. TotalBsmtSF has a positive correlation followed by BsmtFinSF1, and foundation.

correlations <- cor(housing_train[,c(41:60, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower",  sig.level = 0.01, insig = "blank")

All the variables that are relagted to living spaces and quality have a positive correlation, the only living space that has a negative correlation is the kitchen. Something to look into to see what is influecing about kitchens.

correlations <- cor(housing_train[,c(61:79, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower",  sig.level = 0.01, insig = "blank")

At last, garage area is the last thing that has a stroger correlation to SalesPrice. Other amenities do not seem to be as important.
In this case a fireplace has a good relation between other amenities of the house.

pairs(SalePrice~YearBuilt+OverallQual+TotalBsmtSF+GrLivArea,data=housing_train,
   main="Simple Scatterplot Matrix")

Looking at the plots the data seems to be well distributed but also show how the variables correlate.
The following chart shows the sale price comparing to the year it was build.

#install.packages('carData')
library(car)
## Loading required package: carData
scatterplot(SalePrice ~ YearBuilt, data=housing_train,  xlab="Year Built", ylab="Sale Price", grid=FALSE)

We can see a correlation between the year it was build and increase of price. Overall, prices seem to have a higher increase after 1980 and after the 2000 there are more values scatter towards higher prices.

scatterplot(SalePrice ~ YrSold, data=housing_train,  xlab="Year Built", ylab="Sale Price", grid=FALSE)

The Year Built vs Sale Price shows how the 2008 market could influenced the sale price of houses, the data varies a bit more in that year as it kinda shows a small decline. There is a slight increase in 2009 for sale price but after 2009 it seems it is back to a normal linear line.

Looking at the house LotArea comparing to sales price.

scatterplot(SalePrice ~ LotArea, data=housing_train,  xlab="Lot Area", ylab="Sale Price", grid=FALSE)

The chart shows a non-linear relationship between the size of the lot and the Sales Price. This shows that house aspects have a greater weight on the price of the house.
Looking at the Sale rice and the Square foot of basement finished.

scatterplot(SalePrice ~ X1stFlrSF, data=housing_train,  xlab="1st Floor Square Foot", ylab="Sale Price", grid=FALSE)

At last, 1st floor Square Foot shows a strong relathionship to Sales price, but still having data points outside the expected.
#Modeling Preparation The data will be partition for analysis using caret function.

#Data partition using caret partition function.
#install.packages('lattice')
library(caret)
## Loading required package: lattice
#Packages for RMSE
#install.packages('Metrics')
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
#Naming the Sale Price as Outcome
outcome <- housing_train$SalePrice
#Partition the data to be 60% train and 40% test
partition <- createDataPartition(y=outcome, p=.6, list=FALSE)
train <- housing_train[partition,]
test <- housing_train[-partition,]

After testing the models I couple of times with different train and test sets, reducing the train set would improve the XGBoost but increase the error for Linear Regression. Train 60% of the data was where the best prediction was display for XGBoost, using 50% as train data made our prediction error increase by 2%. The first part of the project will use a differen percentage of train data than the XGBoost to make the model better.

#Data Analysis

##Linear Regression
The first step it is to create a linear model to see what variables share a strong relation with the Sale Price.

LM_model1 <- lm(SalePrice ~., data=train)
summary(LM_model1)
## 
## Call:
## lm(formula = SalePrice ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -132185  -12383    -473   11930  164660 
## 
## Coefficients: (2 not defined because of singularities)
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    9.402e+05  1.342e+06   0.701 0.483605    
## Id             6.066e-02  2.092e+00   0.029 0.976881    
## MSSubClass    -3.974e+01  4.697e+01  -0.846 0.397717    
## MSZoning      -1.528e+03  1.585e+03  -0.964 0.335116    
## LotFrontage    4.709e+01  2.845e+01   1.655 0.098345 .  
## LotArea        5.614e-01  9.058e-02   6.198 9.17e-10 ***
## Street         4.456e+04  1.376e+04   3.239 0.001249 ** 
## LotShape      -1.076e+03  6.742e+02  -1.596 0.110889    
## LandContour   -1.614e+03  1.335e+03  -1.208 0.227227    
## Utilities     -2.598e+04  2.644e+04  -0.983 0.326062    
## LotConfig     -6.484e+01  5.514e+02  -0.118 0.906419    
## LandSlope     -1.901e+03  3.524e+03  -0.539 0.589744    
## Neighborhood  -6.981e+01  1.552e+02  -0.450 0.653048    
## Condition1     3.708e+02  1.025e+03   0.362 0.717648    
## Condition2     1.729e+03  5.936e+03   0.291 0.770909    
## BldgType      -2.176e+03  1.525e+03  -1.427 0.154056    
## HouseStyle     4.213e+02  6.431e+02   0.655 0.512620    
## OverallQual    8.312e+03  1.211e+03   6.862 1.36e-11 ***
## OverallCond    5.817e+03  1.094e+03   5.319 1.36e-07 ***
## YearBuilt      2.998e+02  7.463e+01   4.016 6.47e-05 ***
## YearRemodAdd   1.332e+00  7.075e+01   0.019 0.984988    
## RoofStyle      1.894e+03  1.125e+03   1.683 0.092792 .  
## RoofMatl      -1.359e+03  1.599e+03  -0.850 0.395461    
## Exterior1st   -1.060e+03  5.449e+02  -1.945 0.052118 .  
## Exterior2nd    5.364e+02  4.891e+02   1.097 0.273097    
## MasVnrType     5.592e+03  1.491e+03   3.752 0.000188 ***
## MasVnrArea     2.000e+01  6.017e+00   3.323 0.000930 ***
## ExterQual     -8.431e+03  1.999e+03  -4.216 2.77e-05 ***
## ExterCond      1.442e+03  1.338e+03   1.078 0.281287    
## Foundation     8.651e+01  1.730e+03   0.050 0.960138    
## BsmtQual      -7.055e+03  1.343e+03  -5.253 1.92e-07 ***
## BsmtCond       1.849e+03  1.403e+03   1.318 0.187946    
## BsmtExposure  -1.172e+03  8.843e+02  -1.326 0.185306    
## BsmtFinType1  -3.425e+02  6.244e+02  -0.549 0.583460    
## BsmtFinSF1     3.936e+01  5.290e+00   7.441 2.59e-13 ***
## BsmtFinType2   5.009e+02  1.133e+03   0.442 0.658563    
## BsmtFinSF2     2.784e+01  7.828e+00   3.557 0.000397 ***
## BsmtUnfSF      2.169e+01  4.903e+00   4.425 1.10e-05 ***
## TotalBsmtSF           NA         NA      NA       NA    
## Heating        5.935e+02  2.983e+03   0.199 0.842349    
## HeatingQC     -4.407e+02  6.293e+02  -0.700 0.483988    
## CentralAir     9.773e+02  4.525e+03   0.216 0.829055    
## Electrical    -2.697e+02  8.963e+02  -0.301 0.763575    
## X1stFlrSF      6.627e+01  6.183e+00  10.718  < 2e-16 ***
## X2ndFlrSF      6.406e+01  5.007e+00  12.794  < 2e-16 ***
## LowQualFinSF  -1.144e+01  1.781e+01  -0.642 0.520806    
## GrLivArea             NA         NA      NA       NA    
## BsmtFullBath   1.463e+03  2.478e+03   0.591 0.554972    
## BsmtHalfBath  -5.360e+02  3.830e+03  -0.140 0.888750    
## FullBath      -3.169e+03  2.706e+03  -1.171 0.241992    
## HalfBath      -1.204e+03  2.538e+03  -0.474 0.635343    
## BedroomAbvGr  -5.880e+03  1.652e+03  -3.559 0.000394 ***
## KitchenAbvGr  -1.818e+04  5.027e+03  -3.617 0.000317 ***
## KitchenQual   -6.845e+03  1.476e+03  -4.638 4.12e-06 ***
## TotRmsAbvGrd   2.145e+02  1.193e+03   0.180 0.857389    
## Functional     4.856e+03  9.288e+02   5.229 2.18e-07 ***
## Fireplaces     3.243e+03  2.591e+03   1.252 0.211090    
## FireplaceQu   -3.065e+02  7.947e+02  -0.386 0.699845    
## GarageType     9.463e+02  6.359e+02   1.488 0.137122    
## GarageYrBlt   -4.371e+00  5.727e+00  -0.763 0.445574    
## GarageFinish  -1.344e+03  1.468e+03  -0.916 0.360171    
## GarageCars     5.116e+03  2.917e+03   1.754 0.079840 .  
## GarageArea     1.686e+01  9.489e+00   1.777 0.075905 .  
## GarageQual    -6.425e+02  1.795e+03  -0.358 0.720502    
## GarageCond    -6.917e+02  1.977e+03  -0.350 0.726510    
## PavedDrive     3.557e+03  2.096e+03   1.697 0.090037 .  
## WoodDeckSF     2.628e+01  7.643e+00   3.438 0.000615 ***
## OpenPorchSF    1.528e+01  1.522e+01   1.004 0.315832    
## EnclosedPorch  3.595e+00  1.503e+01   0.239 0.811023    
## X3SsnPorch     6.717e+00  2.712e+01   0.248 0.804440    
## ScreenPorch    2.393e+01  1.692e+01   1.415 0.157541    
## PoolArea       2.498e+02  5.242e+01   4.765 2.25e-06 ***
## PoolQC        -4.050e+04  1.037e+04  -3.905 0.000102 ***
## Fence          2.556e+02  9.312e+02   0.274 0.783816    
## MiscFeature   -2.145e+03  1.638e+03  -1.309 0.190802    
## MiscVal       -4.737e-01  1.501e+00  -0.315 0.752493    
## MoSold        -6.882e+01  3.171e+02  -0.217 0.828250    
## YrSold        -7.965e+02  6.628e+02  -1.202 0.229816    
## SaleType      -2.163e+02  6.357e+02  -0.340 0.733755    
## SaleCondition  2.620e+03  8.235e+02   3.181 0.001523 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24300 on 800 degrees of freedom
## Multiple R-squared:  0.9097, Adjusted R-squared:  0.901 
## F-statistic: 104.7 on 77 and 800 DF,  p-value: < 2.2e-16
#The competition asks to use RMSE for predicting error.
prediction_lm1 <- predict(LM_model1, test, type="response")
## Warning in predict.lm(LM_model1, test, type = "response"): prediction from a
## rank-deficient fit may be misleading
model_output <- cbind(test, prediction_lm1)

model_output$log_prediction <- log(model_output$prediction_lm1)
model_output$log_SalePrice <- log(model_output$SalePrice)

#Test with RMSE

rmse(model_output$log_SalePrice,model_output$log_prediction)
## [1] 0.1661432

Many variables can be excluded from the model, the model has a of explanation and the error is what seems really large.The new model is the following:

LM_model2 <- lm(SalePrice ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data=train)
summary(LM_model2)
## 
## Call:
## lm(formula = SalePrice ~ LotArea + Street + Neighborhood + Condition1 + 
##     BldgType + OverallCond + OverallQual + YearBuilt + RoofMatl + 
##     MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF + X1stFlrSF + 
##     X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual + TotRmsAbvGrd + 
##     Fireplaces + GarageArea + GarageQual, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -127318  -14127   -1188   13938  218389 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -9.243e+05  9.307e+04  -9.932  < 2e-16 ***
## LotArea       6.144e-01  8.327e-02   7.379 3.79e-13 ***
## Street        5.084e+04  1.393e+04   3.649 0.000280 ***
## Neighborhood -4.111e+01  1.555e+02  -0.264 0.791590    
## Condition1    5.620e+02  1.038e+03   0.541 0.588496    
## BldgType     -4.141e+03  8.585e+02  -4.823 1.67e-06 ***
## OverallCond   5.696e+03  9.201e+02   6.190 9.32e-10 ***
## OverallQual   1.038e+04  1.212e+03   8.560  < 2e-16 ***
## YearBuilt     4.443e+02  4.515e+01   9.840  < 2e-16 ***
## RoofMatl     -1.048e+03  1.605e+03  -0.653 0.514100    
## MasVnrArea    1.042e+01  5.697e+00   1.828 0.067830 .  
## ExterQual    -1.124e+04  2.017e+03  -5.575 3.32e-08 ***
## BsmtFinSF1    3.367e+01  3.741e+00   9.000  < 2e-16 ***
## BsmtUnfSF     1.000e+01  3.592e+00   2.785 0.005479 ** 
## X1stFlrSF     7.733e+01  5.194e+00  14.888  < 2e-16 ***
## X2ndFlrSF     6.394e+01  3.989e+00  16.028  < 2e-16 ***
## BedroomAbvGr -7.890e+03  1.647e+03  -4.791 1.96e-06 ***
## KitchenAbvGr -1.490e+04  4.873e+03  -3.057 0.002305 ** 
## KitchenQual  -8.417e+03  1.521e+03  -5.532 4.21e-08 ***
## TotRmsAbvGrd  1.267e+00  1.187e+03   0.001 0.999149    
## Fireplaces    3.385e+03  1.668e+03   2.029 0.042722 *  
## GarageArea    3.708e+01  6.608e+00   5.611 2.72e-08 ***
## GarageQual   -2.939e+03  8.868e+02  -3.314 0.000958 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26440 on 855 degrees of freedom
## Multiple R-squared:  0.8858, Adjusted R-squared:  0.8828 
## F-statistic: 301.4 on 22 and 855 DF,  p-value: < 2.2e-16

After excluding many of the variables the model has Residual standar Error: and R-Square:
Prediction Linear Regression:

#The competition asks to use RMSE for predicting error.
prediction_lm <- predict(LM_model2, test, type="response")
model_output <- cbind(test, prediction_lm)

model_output$log_prediction <- log(model_output$prediction_lm)
model_output$log_SalePrice <- log(model_output$SalePrice)

#Test with RMSE

rmse(model_output$log_SalePrice,model_output$log_prediction)
## [1] 0.1763702

The model error is

Visualization to see if a transformation is necessary.

plot(LM_model2$fitted.values, LM_model2$residuals, pch = 20, col = "blue")
abline(h = 0)

The model seem to be around the 0 with some scatter prices when the fitted value increases.

#Package for BoxCox 
library(MASS)
boxcox(LM_model2)

Using Log transformation may be helpful improving the linear regression.

model3 <- lm(I(log(SalePrice)) ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data=train)
summary(model3)
## 
## Call:
## lm(formula = I(log(SalePrice)) ~ LotArea + Street + Neighborhood + 
##     Condition1 + BldgType + OverallCond + OverallQual + YearBuilt + 
##     RoofMatl + MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF + 
##     X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual + 
##     TotRmsAbvGrd + Fireplaces + GarageArea + GarageQual, data = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82295 -0.05835  0.00712  0.07346  0.42917 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.065e+00  4.544e-01   6.746 2.79e-11 ***
## LotArea       2.953e-06  4.065e-07   7.264 8.47e-13 ***
## Street        3.291e-01  6.802e-02   4.839 1.55e-06 ***
## Neighborhood -6.863e-05  7.593e-04  -0.090 0.928002    
## Condition1    5.237e-03  5.069e-03   1.033 0.301901    
## BldgType     -1.588e-02  4.191e-03  -3.789 0.000162 ***
## OverallCond   5.522e-02  4.492e-03  12.293  < 2e-16 ***
## OverallQual   7.290e-02  5.919e-03  12.317  < 2e-16 ***
## YearBuilt     3.545e-03  2.204e-04  16.083  < 2e-16 ***
## RoofMatl      4.462e-03  7.838e-03   0.569 0.569327    
## MasVnrArea   -2.741e-05  2.781e-05  -0.985 0.324746    
## ExterQual    -2.301e-02  9.846e-03  -2.337 0.019681 *  
## BsmtFinSF1    1.460e-04  1.826e-05   7.992 4.28e-15 ***
## BsmtUnfSF     5.641e-05  1.754e-05   3.217 0.001345 ** 
## X1stFlrSF     3.355e-04  2.536e-05  13.232  < 2e-16 ***
## X2ndFlrSF     2.695e-04  1.948e-05  13.835  < 2e-16 ***
## BedroomAbvGr -7.734e-03  8.041e-03  -0.962 0.336365    
## KitchenAbvGr -4.920e-02  2.379e-02  -2.068 0.038964 *  
## KitchenQual  -1.491e-02  7.428e-03  -2.007 0.045083 *  
## TotRmsAbvGrd  3.429e-03  5.793e-03   0.592 0.554096    
## Fireplaces    3.509e-02  8.142e-03   4.310 1.82e-05 ***
## GarageArea    1.995e-04  3.226e-05   6.184 9.65e-10 ***
## GarageQual    4.396e-03  4.329e-03   1.015 0.310189    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1291 on 855 degrees of freedom
## Multiple R-squared:  0.8975, Adjusted R-squared:  0.8949 
## F-statistic: 340.3 on 22 and 855 DF,  p-value: < 2.2e-16

Residual standar Error: and R-Square: Now using RMSE:

prediction3 <- predict(model3, test, type="response")
model_output <- cbind(test, prediction3)

model_output$log_prediction3 <- log(model_output$prediction3)
model_output$log_SalePrice3 <- log(model_output$SalePrice)
#Test with RMSE

rmse(model_output$log_SalePrice3, model_output$log_prediction3)
## [1] 9.542621

The RMSE is higher than the previous model.
Looking at cooks distance plot to see if there are any outliers that can influence the model. This will help decide if the variable should be excluded or not.

mean(hatvalues(model3))
## [1] 0.0261959

QQ-plot looking at leverage of data points.

qqnorm(LM_model2$residuals, main = "LM_model2") 
qqline(LM_model2$residuals)
abline(h = 0, col = "grey")

Overall the model does not need to remove any datapoints.

##Bagging For the bagging model no preparation needs to be made since the data was already changed from categorical to numeric. The model will start with 500 bootstrap samples and will be reduced as see fit.

#Package for bagging
#install.packages('ipred')
library(ipred)
house_bag <- bagging(formula = SalePrice ~., data = train, nbagg = 500) 
house_bag
## 
## Bagging regression trees with 500 bootstrap replications 
## 
## Call: bagging.data.frame(formula = SalePrice ~ ., data = train, nbagg = 500)

Out_of_bag Prediction

house_bag_oob <- bagging(formula = SalePrice~., data = train, coob = T, nbagg = 500)
house_bag_oob
## 
## Bagging regression trees with 500 bootstrap replications 
## 
## Call: bagging.data.frame(formula = SalePrice ~ ., data = train, coob = T, 
##     nbagg = 500)
## 
## Out-of-bag estimate of root mean squared error:  34823.62

The obb eror also seems really high, but still smaller than the linear regression with no transformation ( linear regression = , oob = ).
The out of bag show a large error. Looking at the RMSE:

# Predict using the test set
house_bag_pred_1 <- predict(house_bag_oob, test)
model_output <- cbind(test, house_bag_pred_1)


model_output$log_prediction_bag <- log(model_output$house_bag_pred_1)
model_output$log_SalePrice_bag <- log(model_output$SalePrice)

#Test with RMSE

rmse(model_output$log_SalePrice_bag,model_output$log_prediction_bag)
## [1] 0.2000202

The prediction model is showing error.

house_bag2 <- bagging(formula = SalePrice ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data = train, nbagg = 500) 
house_bag2
## 
## Bagging regression trees with 500 bootstrap replications 
## 
## Call: bagging.data.frame(formula = SalePrice ~ LotArea + Street + Neighborhood + 
##     Condition1 + BldgType + OverallCond + OverallQual + YearBuilt + 
##     RoofMatl + MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF + 
##     X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual + 
##     TotRmsAbvGrd + Fireplaces + GarageArea + GarageQual, data = train, 
##     nbagg = 500)
# Predict using the test set
house_bag_pred_2 <- predict(house_bag2, test)
model_output2 <- cbind(test, house_bag_pred_2)


model_output2$log_prediction_bag2 <- log(model_output2$house_bag_pred_2)
model_output2$log_SalePrice_bag2 <- log(model_output2$SalePrice)

#Test with RMSE

rmse(model_output2$log_SalePrice_bag2,model_output2$log_prediction_bag2)
## [1] 0.2042415

Looking at the trees split and error.

ntree <- c(1, 3, 5, seq(20, 500, 20))
MSE_test <- rep(0, length(ntree))
for(i in 1:length(ntree)){
  bag1 <- bagging(SalePrice~., data = train, nbagg = ntree[i])
 predict <- predict(bag1, newdata = test)
 MSE_test[i] <- mean((test$SalePrice - predict)^2)
}
plot(ntree, MSE_test, type = 'l', col = 2, lwd = 2, xaxt = "n")
axis(1, at = ntree, las = 1)

The chart shows a decline on trees at around 20, but the lowest us around 200 trees.

house_bag3 <- bagging(formula = SalePrice ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data = train, nbagg = 250) 
house_bag3
## 
## Bagging regression trees with 250 bootstrap replications 
## 
## Call: bagging.data.frame(formula = SalePrice ~ LotArea + Street + Neighborhood + 
##     Condition1 + BldgType + OverallCond + OverallQual + YearBuilt + 
##     RoofMatl + MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF + 
##     X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual + 
##     TotRmsAbvGrd + Fireplaces + GarageArea + GarageQual, data = train, 
##     nbagg = 250)
# Predict using the test set
house_bag_pred_3 <- predict(house_bag3, test)
model_output3 <- cbind(test, house_bag_pred_3)


model_output3$log_prediction_bag3 <- log(model_output3$house_bag_pred_3)
model_output3$log_SalePrice_bag3 <- log(model_output3$SalePrice)

#Test with RMSE

rmse(model_output3$log_SalePrice_bag3,model_output3$log_prediction_bag3)
## [1] 0.203948

The model prediction still did not improve from the previous models.
##Random Florest

#Package for Randpom Florest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
house_rf <- randomForest(SalePrice~., data = train, importance = TRUE) 
house_rf
## 
## Call:
##  randomForest(formula = SalePrice ~ ., data = train, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 26
## 
##           Mean of squared residuals: 785295483
##                     % Var explained: 86.82

Since random Florest has it owns filter for variables, there is no need to select the variables that showed correlations previously. The obb error is greater than the bagging model, but has a explanation.

# Predict using the test set
prediction_rf <- predict(house_rf, test)
model_output_rf <- cbind(test, prediction_rf)


model_output_rf$log_prediction_rf <- log(model_output_rf$prediction_rf)
model_output_rf$log_SalePrice_rf <- log(model_output_rf$SalePrice)

#Test with RMSE

rmse(model_output_rf$log_SalePrice_rf,model_output_rf$log_prediction_rf)
## [1] 0.1557613

The prediciton model has a smaller error than bagging showing a error.
##XGBoost

#Package for XGBoost
#install.packages('xgboost')
library(xgboost)

Splitting the data again:

#Partition the data to be 60% train and 40% test
partition2 <- createDataPartition(y=outcome, p=.9, list=FALSE)
train <- housing_train[partition2,]
test <- housing_train[-partition2,]

The first step it is to transform the dataset into Sparse Matrix.

#Assemble and format the data - Using Log for Variable Sale Price
train$log_SalePrice <- log(train$SalePrice)
test$log_SalePrice <- log(test$SalePrice)

#Create matrices from the data frames
trainData<- as.matrix(train, rownames.force=NA)
testData<- as.matrix(test, rownames.force=NA)

#Turn the matrices into sparse matrices
train2 <- as(trainData, "sparseMatrix")
test2 <- as(testData, "sparseMatrix")

#colnames(train2)
#colnames(pred_data)
#Cross Validate the model
vars <- c(1:78) #Choose the variables 
trainD <- xgb.DMatrix(data = train2[,vars], label = train2[,"SalePrice"]) #Convert to xgb.DMatrix format for space and efficiency 

Creating a cross validation model

#Cross validate the model
cv.sparse <- xgb.cv(data = trainD,
                    nrounds = 500,
                    min_child_weight = 0,
                    max_depth = 10,
                    eta = 0.04,
                    subsample = .7,
                    colsample_bytree = .7,
                    booster = "gbtree",
                    eval_metric = "rmse",
                    print_every_n = 100,
                    nfold = 4,
                    nthread = 2,
                    objective="reg:linear")
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:189849.979523+1511.409729    test-rmse:189921.284886+4743.807534 
## [101]    train-rmse:9272.817965+313.818642   test-rmse:28146.745034+3265.877929 
## [201]    train-rmse:2694.760256+121.520884   test-rmse:26841.697785+2772.386454 
## [301]    train-rmse:1140.339352+35.077249    test-rmse:26742.019161+2661.761679 
## [401]    train-rmse:439.398337+25.838777 test-rmse:26702.003424+2627.430251 
## [500]    train-rmse:159.772079+22.728284 test-rmse:26698.238330+2620.314668

The model shows that a RMSE of _____ is possible when using around 500 rounds.

#Choose the parameters for the model - tunning the model
param <- list(colsample_bytree = .7, #amount of features for each tree
             subsample = .7, #fractions of observation for random samples bt .5 and 1 lower than .5 is very conservative model
             booster = "gbtree", #tree Based model for a linear model use 'gblinear' 
             max_depth = 10, #maximun dept of a tree
             eta = 0.04, #makes the model more robust by shrinking the weight of each step 
             eval_metric = "rmse",
             objective="reg:linear")
#Train the model using those parameters
bstSparse <-
  xgb.train(params = param,
            data = trainD,
            nrounds = 500,
            watchlist = list(train = trainD),
            verbose = TRUE,
            print_every_n = 100,
            nthread = 2)
## [12:20:00] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:189965.179363 
## [101]    train-rmse:8814.407182 
## [201]    train-rmse:2705.367672 
## [301]    train-rmse:1304.936278 
## [401]    train-rmse:567.416511 
## [500]    train-rmse:213.331507

After running the model we can see that a train RMSE of 0.001744 Prediction of the bstSparse Model:

testD <- xgb.DMatrix(data = test2[,vars])
#Column names must match the inputs EXACTLY
prediction <- predict(bstSparse, testD) #Make the prediction based on the half of the training data set aside

#Put testing prediction and test dataset all together
test3 <- as.data.frame(as.matrix(test2))
prediction <- as.data.frame(as.matrix(prediction))
colnames(prediction) <- "prediction"
model_output <- cbind(test3, prediction)

model_output$log_prediction <- log(model_output$prediction)
model_output$log_SalePrice <- log(model_output$SalePrice)

#Test with RMSE

rmse(model_output$log_SalePrice,model_output$log_prediction)
## [1] 0.1192127

The RMSE error is 13.88% for the first model, after running many different model with different values the best RMSE was 1.11% error (but it varies between 1% and 2%)

#Changing the parameters
param2 <- list(colsample_bytree = .6, 
             subsample = .8, 
             booster = "gbtree", 
             max_depth = 12, 
             eta = 0.05, 
             eval_metric = "rmse",
             objective="reg:linear")

Make a second model

#Train the model using those parameters
bstSparse2 <-
  xgb.train(params = param2,
            data = trainD,
            nrounds = 500,
            watchlist = list(train = trainD),
            verbose = TRUE,
            print_every_n = 100,
            nthread = 2)
## [12:20:05] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:187987.981438 
## [101]    train-rmse:4339.814771 
## [201]    train-rmse:812.060518 
## [301]    train-rmse:196.878824 
## [401]    train-rmse:37.382146 
## [500]    train-rmse:6.728579
#Column names must match the inputs EXACTLY
prediction_2 <- predict(bstSparse2, testD) #Make the prediction based on the half of the training data set aside

#Put testing prediction and test dataset all together
test3 <- as.data.frame(as.matrix(test2))
prediction2 <- as.data.frame(as.matrix(prediction_2))
colnames(prediction2) <- "prediction"
output <- cbind(test3, prediction2)

output$log_prediction_2 <- log(output$prediction)
output$log_SalePrice2 <- log(output$SalePrice)

#Test with RMSE

rmse(output$log_SalePrice2,output$log_prediction_2)
## [1] 0.1185841

The RMSE error is ____ what is slightly higher than the previous model.
Preparing the test data set

# Get the supplied test data ready #
predict <- as.data.frame(housing_test) #Get the dataset formatted as a frame for later combining

#Create matrices from the data frames
predData<- as.matrix(predict, rownames.force=NA)

#Turn the matrices into sparse matrices
predicting <- as(predData, "sparseMatrix")
#colnames(train[,c(2:79)])
vars <- c("Id", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street",       
"LotShape", "LandContour", "Utilities",     "LotConfig",     "LandSlope",     "Neighborhood", 
"Condition1",    "Condition2",    "BldgType",      "HouseStyle",    "OverallQual",   "OverallCond",  
"YearBuilt",     "YearRemodAdd",  "RoofStyle",     "RoofMatl",      "Exterior1st",   "Exterior2nd",  
"MasVnrType",    "MasVnrArea",    "ExterQual",     "ExterCond",     "Foundation",    "BsmtQual",     
"BsmtCond",      "BsmtExposure",  "BsmtFinType1",  "BsmtFinSF1",    "BsmtFinType2",  "BsmtFinSF2",   
"BsmtUnfSF",     "TotalBsmtSF",   "Heating",       "HeatingQC",     "CentralAir",    "Electrical",   
"X1stFlrSF",     "X2ndFlrSF",     "LowQualFinSF", "GrLivArea",     "BsmtFullBath",  "BsmtHalfBath", 
"FullBath",      "HalfBath",      "BedroomAbvGr",  "KitchenAbvGr",  "KitchenQual",   "TotRmsAbvGrd", 
"Functional",    "Fireplaces",    "FireplaceQu",   "GarageType",    "GarageYrBlt",   "GarageFinish", 
"GarageCars",    "GarageArea",    "GarageQual",    "GarageCond",    "PavedDrive",    "WoodDeckSF","OpenPorchSF",   "EnclosedPorch", "X3SsnPorch",    "ScreenPorch",   "PoolArea",      "PoolQC",       
"Fence",         "MiscFeature",   "MiscVal",       "MoSold",        "YrSold",  "SaleType",     
"SaleCondition")
colnames(predicting[,vars])
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "LotShape"      "LandContour"  
##  [9] "Utilities"     "LotConfig"     "LandSlope"     "Neighborhood" 
## [13] "Condition1"    "Condition2"    "BldgType"      "HouseStyle"   
## [17] "OverallQual"   "OverallCond"   "YearBuilt"     "YearRemodAdd" 
## [21] "RoofStyle"     "RoofMatl"      "Exterior1st"   "Exterior2nd"  
## [25] "MasVnrType"    "MasVnrArea"    "ExterQual"     "ExterCond"    
## [29] "Foundation"    "BsmtQual"      "BsmtCond"      "BsmtExposure" 
## [33] "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2"  "BsmtFinSF2"   
## [37] "BsmtUnfSF"     "TotalBsmtSF"   "Heating"       "HeatingQC"    
## [41] "CentralAir"    "Electrical"    "X1stFlrSF"     "X2ndFlrSF"    
## [45] "LowQualFinSF"  "GrLivArea"     "BsmtFullBath"  "BsmtHalfBath" 
## [49] "FullBath"      "HalfBath"      "BedroomAbvGr"  "KitchenAbvGr" 
## [53] "KitchenQual"   "TotRmsAbvGrd"  "Functional"    "Fireplaces"   
## [57] "FireplaceQu"   "GarageType"    "GarageYrBlt"   "GarageFinish" 
## [61] "GarageCars"    "GarageArea"    "GarageQual"    "GarageCond"   
## [65] "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"   "EnclosedPorch"
## [69] "X3SsnPorch"    "ScreenPorch"   "PoolArea"      "PoolQC"       
## [73] "Fence"         "MiscFeature"   "MiscVal"       "MoSold"       
## [77] "YrSold"        "SaleType"      "SaleCondition"
rm(bstSparse)
#Create matrices from the data frames
retrainData<- as.matrix(train, rownames.force=NA)

#Turn the matrices into sparse matrices
retrain <- as(retrainData, "sparseMatrix")

param3 <- list(colsample_bytree = .7,
             subsample = .7,
             booster = "gbtree",
             max_depth = 10,
             eta = 0.04,
             eval_metric = "rmse",
             objective="reg:linear")

retrainD <- xgb.DMatrix(data = retrain[,vars], label = retrain[,"SalePrice"])

#retrain the model using those parameters
bstSparse3 <-
 xgb.train(params = param3,
           data = retrainD,
           nrounds = 500,
           watchlist = list(train = trainD),
           verbose = TRUE,
           print_every_n = 100,
           nthread = 2)
## [12:20:11] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1]  train-rmse:189823.030525 
## [101]    train-rmse:11189.402378 
## [201]    train-rmse:5478.465614 
## [301]    train-rmse:4650.097411 
## [401]    train-rmse:4397.783603 
## [500]    train-rmse:4335.717392
#Column names must match the inputs EXACTLY
prediction <- predict(bstSparse3, predicting[,vars])

prediction <- as.data.frame(as.matrix(prediction))  #Get the dataset formatted as a frame for later combining
colnames(prediction) <- "prediction"
model_output <- cbind(predict, prediction) #Combine the prediction output with the rest of the set

results <- data.frame(Id = model_output$Id, SalePrice = model_output$prediction)
length(model_output$prediction)
## [1] 1459

#Result

write.csv(results, file = "Prediction.csv", row.names = F)
head(results$SalePrice)
## [1] 123595.1 156060.7 185800.5 189598.8 186934.7 173599.3

The file has a sales price prediction for the house_testing set.

summary <- read.csv("/Users/jusimioni/Desktop/sample_submission.csv")
head(summary)
##     Id SalePrice
## 1 1461  169277.1
## 2 1462  187758.4
## 3 1463  183583.7
## 4 1464  179317.5
## 5 1465  150730.1
## 6 1466  177151.0