The purpose of this project it is to create a model that predicts the
price that houses will be sold in the illinois region.
There are two data sets, one contains the train data and the test data.
Uploading those data set in here.
library(readr)
housing_train <- read.csv("/Users/jusimioni/Desktop/train.csv")
housing_test <- read.csv("/Users/jusimioni/Desktop/test.csv")
colnames(housing_train)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
head(housing_train)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
cbind(c("train", "test"),
rbind(dim(housing_train), dim(housing_test)))
## [,1] [,2] [,3]
## [1,] "train" "1460" "81"
## [2,] "test" "1459" "80"
Looking at TrainHouse many columns need to be changed, some need to replace NA to 0 and some columns are going to be changed from category to numerical so they can be use when doing analysis. You can check for the explanation for each column here with the changes made < Input Columnn guide here > .
Transforming the data to factor.
housing_train$MSZoning = as.factor(housing_train$MSZoning)
levels(housing_train$MSZoning)
## [1] "C (all)" "FV" "RH" "RL" "RM"
# MSZoning column of train dataset has following levels: "C (all)", "FV", "RH", "RL", "RM"
housing_test$MSZoning = as.factor(housing_test$MSZoning)
levels(housing_test$MSZoning)
## [1] "C (all)" "FV" "RH" "RL" "RM"
# Change of factors to numeric in train dataset
MSZoning=as.numeric(housing_train$MSZoning,"C "=1, "FV"=2, "RH"=3, "RL"=4, "RM"=5)
housing_train$MSZoning <-MSZoning
# Change of factors to numeric in test dataset
MSZoning=as.numeric(housing_test$MSZoning,"C "=1, "FV"=2, "RH"=3, "RL"=4, "RM"=5)
housing_test$MSZoning <-MSZoning
Street = as.factor(housing_train$Street)
Street = as.numeric(Street, "Pave"= 1,"Grvl"= 2)
housing_train$Street <-Street
# Pave got replaced with 1 and Grvl type of rode got replaced with 2
Street = as.factor(housing_test$Street)
Street = as.numeric(Street, "Pave"= 1,"Grvl"= 2)
housing_test$Street <-Street
# Pave got replaced with 1 and Grvl type of rode got replaced with 2
# Transforming Alley column to numeric in train dataset
Alley<-as.factor(housing_train$Alley)
levels(Alley)
## [1] "Grvl" "Pave"
Alley = as.numeric(Alley, "Pave"= 1,"Grvl"= 2, "NA"=0)
housing_train$Alley <- Alley
# Transforming Alley column to numeric in test dataset
Alley<-as.factor(housing_test$Alley)
levels(Alley)
## [1] "Grvl" "Pave"
Alley = as.numeric(Alley, "Pave"= 1,"Grvl"= 2)
housing_test$Alley <- Alley
# Transforming LotShape column to numeric in train dataset
LotShape <-as.factor(housing_train$LotShape)
levels(LotShape) # 4 levels: "IR1", "IR2", "IR3", "Reg"
## [1] "IR1" "IR2" "IR3" "Reg"
LotShape=as.numeric(LotShape,"IR1"=1, "IR2"=2, "IR3"=3, "Reg"=4)
housing_train$LotShape <- LotShape
# Transforming LotShape column to numeric in test dataset
LotShape <-as.factor(housing_test$LotShape)
levels(LotShape) # 4 levels: "IR1", "IR2", "IR3", "Reg"
## [1] "IR1" "IR2" "IR3" "Reg"
LotShape=as.numeric(LotShape,"IR1"=1, "IR2"=2, "IR3"=3, "Reg"=4)
housing_test$LotShape <- LotShape
# Transforming LandContour column to numeric in train dataset
LandContour <-as.factor(housing_train$LandContour)
levels(LandContour) # 4 levels: "Bnk", "HLS", "Low", "Lvl"
## [1] "Bnk" "HLS" "Low" "Lvl"
LandContour=as.numeric(LandContour,"Bnk"=1, "HLS"=2, "Low"=3, "Lvl"=4)
housing_train$LandContour <- LandContour
# Transforming LandContour column to numeric in test dataset
LandContour <-as.factor(housing_test$LandContour)
levels(LandContour) # 4 levels: "Bnk", "HLS", "Low", "Lvl"
## [1] "Bnk" "HLS" "Low" "Lvl"
LandContour=as.numeric(LandContour,"Bnk"=1, "HLS"=2, "Low"=3, "Lvl"=4)
housing_test$LandContour <- LandContour
# Transforming Utilities column to numeric in train dataset
Utilities <-as.factor(housing_train$Utilities)
levels(Utilities) # 2 levels: "AllPub", "NoSeWa"
## [1] "AllPub" "NoSeWa"
Utilities=as.numeric(Utilities,"AllPub"=1, "NoSeWa"=2, "NA"=0)
housing_train$Utilities <- Utilities
# Transforming Utilities column to numeric in test dataset
Utilities <-as.factor(housing_test$Utilities)
levels(Utilities) # 2 levels: "AllPub", "NoSeWa"
## [1] "AllPub"
Utilities=as.numeric(Utilities,"AllPub"=1, "NA"=0)
housing_test$Utilities <- Utilities
# Transforming LotConfig column to numeric in train dataset
LotConfig <-as.factor(housing_train$LotConfig)
levels(LotConfig)
## [1] "Corner" "CulDSac" "FR2" "FR3" "Inside"
LotConfig=as.numeric(LotConfig,"Corner"=1, "CulDSac"=2, "FR2"=3, "FR3"=4, "Inside"=5, "NA"= 0)
housing_train$LotConfig <- LotConfig
# Transforming LotConfig column to numeric in test dataset
LotConfig <-as.factor(housing_test$LotConfig)
levels(LotConfig)
## [1] "Corner" "CulDSac" "FR2" "FR3" "Inside"
LotConfig=as.numeric(LotConfig,"Corner"=1, "CulDSac"=2, "FR2"=3, "FR3"=4, "Inside"=5, "NA"=0)
housing_test$LotConfig <- LotConfig
# Transforming LandSlope column to numeric in train dataset
LandSlope <-as.factor(housing_train$LandSlope)
levels(LandSlope)
## [1] "Gtl" "Mod" "Sev"
LandSlope=as.numeric(LandSlope,"Gtl"=1, "Mod"=2, "Sev"=3)
housing_train$LandSlope <- LandSlope
# Transforming LandSlope column to numeric in test dataset
LandSlope <-as.factor(housing_test$LandSlope)
levels(LandSlope)
## [1] "Gtl" "Mod" "Sev"
LandSlope=as.numeric(LandSlope,"Blmngtn"=1, "Blueste"=2, "Sev"=3)
housing_test$LandSlope <- LandSlope
# Transforming Neighborhood column to numeric in train dataset
Neighborhood <-as.factor(housing_train$Neighborhood)
levels(Neighborhood)
## [1] "Blmngtn" "Blueste" "BrDale" "BrkSide" "ClearCr" "CollgCr" "Crawfor"
## [8] "Edwards" "Gilbert" "IDOTRR" "MeadowV" "Mitchel" "NAmes" "NoRidge"
## [15] "NPkVill" "NridgHt" "NWAmes" "OldTown" "Sawyer" "SawyerW" "Somerst"
## [22] "StoneBr" "SWISU" "Timber" "Veenker"
Neighborhood=as.numeric(Neighborhood,"Blmngtn"=1, "Blueste"=2, "BrDale"=3, "BrkSide"=4, "ClearCr"=5, "CollgCr"=6, "Crawfor"=7, "Edwards"=8, "Gilbert"=9, "IDOTRR"=10, "MeadowV"=11, "Mitchel"=12, "NAmes"=13, "NoRidge"=14, "NPkVill"=15, "NridgHt"=16, "NWAmes"=17, "OldTown"=18, "SWISU"=19, "Sawyer"=20, "SawyerW"=21, "Somerst"=22, "StoneBr"=23, "Timber"=24, "Veenker"=25)
housing_train$Neighborhood <- Neighborhood
# Transforming Neighborhood column to numeric in test dataset
Neighborhood <-as.factor(housing_test$Neighborhood)
levels(Neighborhood)
## [1] "Blmngtn" "Blueste" "BrDale" "BrkSide" "ClearCr" "CollgCr" "Crawfor"
## [8] "Edwards" "Gilbert" "IDOTRR" "MeadowV" "Mitchel" "NAmes" "NoRidge"
## [15] "NPkVill" "NridgHt" "NWAmes" "OldTown" "Sawyer" "SawyerW" "Somerst"
## [22] "StoneBr" "SWISU" "Timber" "Veenker"
Neighborhood=as.numeric(Neighborhood,"Blmngtn"=1, "Blueste"=2, "BrDale"=3, "BrkSide"=4, "ClearCr"=5, "CollgCr"=6, "Crawfor"=7, "Edwards"=8, "Gilbert"=9, "IDOTRR"=10, "MeadowV"=11, "Mitchel"=12, "NAmes"=13, "NoRidge"=14, "NPkVill"=15, "NridgHt"=16, "NWAmes"=17, "OldTown"=18, "SWISU"=19, "Sawyer"=20, "SawyerW"=21, "Somerst"=22, "StoneBr"=23, "Timber"=24, "Veenker"=25)
housing_test$Neighborhood <- Neighborhood
# Transforming Condition1 column to numeric in train dataset
Condition1 <-as.factor(housing_train$Condition1)
levels(Condition1)
## [1] "Artery" "Feedr" "Norm" "PosA" "PosN" "RRAe" "RRAn" "RRNe"
## [9] "RRNn"
Condition1=as.numeric(Condition1,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_train$Condition1 <- Condition1
# Transforming Condition1 column to numeric in test dataset
Condition1 <-as.factor(housing_test$Condition1)
levels(Condition1)
## [1] "Artery" "Feedr" "Norm" "PosA" "PosN" "RRAe" "RRAn" "RRNe"
## [9] "RRNn"
Condition1=as.numeric(Condition1,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_test$Condition1 <- Condition1
# Transforming Condition2 column to numeric in train dataset
Condition2 <-as.factor(housing_train$Condition2)
levels(Condition2)
## [1] "Artery" "Feedr" "Norm" "PosA" "PosN" "RRAe" "RRAn" "RRNn"
Condition2=as.numeric(Condition2,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_train$Condition2 <- Condition2
# Transforming Condition2 column to numeric in test dataset
Condition2 <-as.factor(housing_test$Condition2)
levels(Condition2) #values
## [1] "Artery" "Feedr" "Norm" "PosA" "PosN"
Condition2=as.numeric(Condition2,"Artery"=1, "Feedr"=2, "Norm"=3, "RRNn"=4, "RRAn"=5, "PosN"=6, "PosA"=7, "RRNe"=8, "RRAe"=9)
housing_test$Condition2 <- Condition2
# Transforming BldgType column to numeric in train dataset
BldgType <-as.factor(housing_train$BldgType)
levels(BldgType)
## [1] "1Fam" "2fmCon" "Duplex" "Twnhs" "TwnhsE"
BldgType=as.numeric(BldgType,"1Fam"=1, "2FmCon"=2, "Duplx"=3, "TwnhsE"=4, "TwnhsI"=5)
housing_train$BldgType <- BldgType
# Transforming BldgType column to numeric in test dataset
BldgType <-as.factor(housing_test$BldgType)
levels(BldgType)
## [1] "1Fam" "2fmCon" "Duplex" "Twnhs" "TwnhsE"
BldgType=as.numeric(BldgType,"1Fam"=1, "2FmCon"=2, "Duplx"=3, "TwnhsE"=4, "TwnhsI"=5)
housing_test$BldgType <- BldgType
# Transforming HouseStyle column to numeric in train dataset
HouseStyle <-as.factor(housing_train$HouseStyle)
levels(HouseStyle)
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Fin" "2.5Unf" "2Story" "SFoyer" "SLvl"
HouseStyle=as.numeric(HouseStyle,"1Story"=1, "1.5Fin"=2, "1.5Unf"=3, "2Story"=4, "2.5Fin"=5, "2.5Unf"=6, "SFoyer"=7, "SLvl"=8)
housing_train$HouseStyle <- HouseStyle
# Transforming HouseStyle column to numeric in test dataset
HouseStyle <-as.factor(housing_test$HouseStyle)
levels(HouseStyle)
## [1] "1.5Fin" "1.5Unf" "1Story" "2.5Unf" "2Story" "SFoyer" "SLvl"
HouseStyle=as.numeric(HouseStyle,"1Story"=1, "1.5Fin"=2, "1.5Unf"=3, "2Story"=4, "2.5Fin"=5, "2.5Unf"=6, "SFoyer"=7, "SLvl"=8)
housing_test$HouseStyle <- HouseStyle
RoofStyle <-as.factor(housing_train$RoofStyle)
levels(RoofStyle)
## [1] "Flat" "Gable" "Gambrel" "Hip" "Mansard" "Shed"
RoofStyle=as.numeric(RoofStyle,"Flat"=1, "Gable"=2, "Gambrel"=3, "Hip"=4, "Mansard"=5, "Shed"=6)
housing_train$RoofStyle <- RoofStyle
# Transforming RoofStyle column to numeric in test dataset
RoofStyle <-as.factor(housing_test$RoofStyle)
levels(RoofStyle)
## [1] "Flat" "Gable" "Gambrel" "Hip" "Mansard" "Shed"
RoofStyle=as.numeric(RoofStyle,"Flat"=1, "Gable"=2, "Gambrel"=3, "Hip"=4, "Mansard"=5, "Shed"=6)
housing_test$RoofStyle <- RoofStyle
# Transforming RoofMatl column to numeric in train dataset
RoofMatl <-as.factor(housing_train$RoofMatl)
levels(RoofMatl)
## [1] "ClyTile" "CompShg" "Membran" "Metal" "Roll" "Tar&Grv" "WdShake"
## [8] "WdShngl"
RoofMatl=as.numeric(RoofMatl,"ClyTile"=1, "CompShg"=2, "Membran"=3, "Metal"=4, "Roll"=5, "Tar&Grv"=6, "WdShake"=7, "WdShngl"=8)
housing_train$RoofMatl <- RoofMatl
# Transforming RoofMatl column to numeric in test dataset
RoofMatl <-as.factor(housing_test$RoofMatl)
levels(RoofMatl)
## [1] "CompShg" "Tar&Grv" "WdShake" "WdShngl"
RoofMatl=as.numeric(RoofMatl,"ClyTile"=1, "CompShg"=2, "Membran"=3, "Metal"=4, "Roll"=5, "Tar&Grv"=6, "WdShake"=7, "WdShngl"=8)
housing_test$RoofMatl <- RoofMatl
# Transforming Exterior1st column to numeric in train dataset
Exterior1st <-as.factor(housing_train$Exterior1st)
levels(Exterior1st)
## [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock" "CemntBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Plywood" "Stone" "Stucco" "VinylSd" "Wd Sdng"
## [15] "WdShing"
Exterior1st=as.numeric(Exterior1st,"AsbShng"=1, "AsphShn"=2, "BrkComm"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10, "Stone"=11, "Stucco"=12,"VinylSd"=13, "Wd Sdng"=14, "WdShing"=15)
housing_train$Exterior1st <- Exterior1st
# Transforming Exterior1st column to numeric in test dataset
Exterior1st <-as.factor(housing_test$Exterior1st)
levels(Exterior1st)
## [1] "AsbShng" "AsphShn" "BrkComm" "BrkFace" "CBlock" "CemntBd" "HdBoard"
## [8] "MetalSd" "Plywood" "Stucco" "VinylSd" "Wd Sdng" "WdShing"
Exterior1st=as.numeric(Exterior1st,"AsbShng"=1, "AsphShn"=2, "BrkComm"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10, "Stone"=11, "Stucco"=12,"VinylSd"=13, "Wd Sdng"=14, "WdShing"=15)
housing_test$Exterior1st <- Exterior1st
# Transforming Exterior2nd column to numeric in train dataset
Exterior2nd <-as.factor(housing_train$Exterior2nd)
levels(Exterior2nd)
## [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock" "CmentBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Other" "Plywood" "Stone" "Stucco" "VinylSd"
## [15] "Wd Sdng" "Wd Shng"
Exterior2nd = as.numeric(Exterior2nd,"AsbShng"=1, "AsphShn"=2, "BrkComm"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10, "Stone"=11, "Stucco"=12,"VinylSd"=13, "Wd Sdng"=14, "WdShing"=15)
housing_train$Exterior2nd <- Exterior2nd
# Transforming Exterior2nd column to numeric in test dataset
Exterior2nd <-as.factor(housing_test$Exterior2nd)
levels(Exterior2nd)
## [1] "AsbShng" "AsphShn" "Brk Cmn" "BrkFace" "CBlock" "CmentBd" "HdBoard"
## [8] "ImStucc" "MetalSd" "Plywood" "Stone" "Stucco" "VinylSd" "Wd Sdng"
## [15] "Wd Shng"
Exterior2nd=as.numeric(Exterior2nd,"AsbShng"=1, "AsphShn"=2, "Brk Cmn"=3, "BrkFace"=4, "CBlock"=5, "CemntBd"=6, "HdBoard"=7, "ImStucc"=8, "MetalSd"=9, "Plywood"=10,"PreCast"=11, "Stone"=12, "Stucco"=13,"VinylSd"=14, "Wd Sdng"=15, "Wd Shing"=16)
housing_test$Exterior2nd <- Exterior2nd
# Transforming MasVnrType column to numeric in train dataset
MasVnrType <-as.factor(housing_train$MasVnrType)
levels(MasVnrType)
## [1] "BrkCmn" "BrkFace" "None" "Stone"
MasVnrType=as.numeric(MasVnrType,"BrkCmn"=1, "BrkFace"=2, "None"=0, "Stone"=4)
housing_train$MasVnrType <- MasVnrType
# Transforming MasVnrType column to numeric in test dataset
MasVnrType <-as.factor(housing_test$MasVnrType)
levels(MasVnrType)
## [1] "BrkCmn" "BrkFace" "None" "Stone"
MasVnrType=as.numeric(MasVnrType,"BrkCmn"=1, "BrkFace"=2, "Stone"=3, "NA"=0)
housing_test$MasVnrType <- MasVnrType
# Transforming ExterQual column to numeric in train dataset
ExterQual <-as.factor(housing_train$ExterQual)
levels(ExterQual)
## [1] "Ex" "Fa" "Gd" "TA"
ExterQual=as.numeric(ExterQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 )
housing_train$ExterQual <- ExterQual
# Transforming ExterQual column to numeric in test dataset
ExterQual <-as.factor(housing_test$ExterQual)
levels(ExterQual)
## [1] "Ex" "Fa" "Gd" "TA"
ExterQual=as.numeric(ExterQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 )
housing_test$ExterQual <- ExterQual
# Transforming ExterCond column to numeric in train dataset
ExterCond <-as.factor(housing_train$ExterCond)
levels(ExterCond)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
ExterCond=as.numeric(ExterCond,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 )
housing_train$ExterCond <- ExterCond
# Transforming ExterCond column to numeric in test dataset
ExterCond <-as.factor(housing_test$ExterCond)
levels(ExterCond)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
ExterCond=as.numeric(ExterCond,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 )
housing_test$ExterCond <- ExterCond
# Transforming Foundation column to numeric in train dataset
Foundation <-as.factor(housing_train$Foundation)
levels(Foundation)
## [1] "BrkTil" "CBlock" "PConc" "Slab" "Stone" "Wood"
Foundation=as.numeric(Foundation,"BrkTil"=1, "CBlock"=2, "PConc"=3, "Slab"=4, "Stone"=5, "Wood" = 6 )
housing_train$Foundation <- Foundation
# Transforming Foundation column to numeric in test dataset
Foundation <-as.factor(housing_test$Foundation)
levels(Foundation)
## [1] "BrkTil" "CBlock" "PConc" "Slab" "Stone" "Wood"
Foundation=as.numeric(Foundation,"BrkTil"=1, "CBlock"=2, "PConc"=3, "Slab"=4, "Stone"=5, "Wood" = 6 )
housing_test$Foundation<- Foundation
# Transforming BsmtQual column to numeric in train dataset
BsmtQual <-as.factor(housing_train$BsmtQual)
levels(BsmtQual)
## [1] "Ex" "Fa" "Gd" "TA"
BsmtQual=as.numeric(BsmtQual,"Ex"=1, "Fa"=2, "Gd"=3, "TA"=4, "Fa"=5, "Po" = 6, "NA"=0)
housing_train$BsmtQual <- BsmtQual
# Transforming BsmtQual column to numeric in test dataset
BsmtQual <-as.factor(housing_test$BsmtQual)
levels(BsmtQual)
## [1] "Ex" "Fa" "Gd" "TA"
BsmtQual=as.numeric(BsmtQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5, "NA"=0 )
housing_test$BsmtQual<- BsmtQual
# Transforming BsmtCond column to numeric in train dataset
BsmtCond <-as.factor(housing_train$BsmtCond)
levels(BsmtCond)
## [1] "Fa" "Gd" "Po" "TA"
BsmtCond=as.numeric(BsmtCond,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5, "NA"=0)
housing_train$BsmtCond <- BsmtCond
# Transforming BsmtCond column to numeric in test dataset
BsmtCond <-as.factor(housing_test$BsmtCond)
levels(BsmtCond)
## [1] "Fa" "Gd" "Po" "TA"
BsmtCond=as.numeric(BsmtCond,"Ex"=1, "Fa"=4, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5, "NA"=0 )
housing_test$BsmtCond<- BsmtCond
# Transforming BsmtExposure column to numeric in train dataset
BsmtExposure <-as.factor(housing_train$BsmtExposure)
levels(BsmtExposure)
## [1] "Av" "Gd" "Mn" "No"
BsmtExposure=as.numeric(BsmtExposure,"Av"=1, "Gd"=2, "Mn"=3, "No"=4, "NA"=0)
housing_train$BsmtExposure <- BsmtExposure
# Transforming BsmtExposure column to numeric in test dataset
BsmtExposure <-as.factor(housing_test$BsmtExposure)
levels(BsmtExposure)
## [1] "Av" "Gd" "Mn" "No"
BsmtExposure=as.numeric(BsmtExposure,"Av"=1, "Gd"=2, "Mn"=3, "No"=4, "NA"=0)
housing_test$BsmtExposure<- BsmtExposure
# Transforming BsmtFinType1 column to numeric in train dataset
BsmtFinType1 <-as.factor(housing_train$BsmtFinType1)
levels(BsmtFinType1)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType1=as.numeric(BsmtFinType1,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6, "NA"=0)
housing_train$BsmtFinType1 <- BsmtFinType1
# Transforming BsmtFinType1 column to numeric in test dataset
BsmtFinType1 <-as.factor(housing_test$BsmtFinType1)
levels(BsmtFinType1)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType1=as.numeric(BsmtFinType1,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6)
housing_test$BsmtFinType1<- BsmtFinType1
housing_train$BsmtFinType1[is.na(housing_train$BsmtFinType1)] <- 0
sum(is.na(housing_train$BsmtFinType1))
## [1] 0
housing_test$BsmtFinType1[is.na(housing_test$BsmtFinType1)] <- 0
sum(is.na(housing_test$BsmtFinType1))
## [1] 0
# Transforming BsmtFinType2 column to numeric in train dataset
BsmtFinType2 <-as.factor(housing_train$BsmtFinType2)
levels(BsmtFinType2)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType2=as.numeric(BsmtFinType2,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6, "NA"=0)
housing_train$BsmtFinType2 <- BsmtFinType2
# Transforming BsmtFinType2 column to numeric in test dataset
BsmtFinType2 <-as.factor(housing_test$BsmtFinType2)
levels(BsmtFinType2)
## [1] "ALQ" "BLQ" "GLQ" "LwQ" "Rec" "Unf"
BsmtFinType2=as.numeric(BsmtFinType2,"ALQ"=1, "BLQ"=2, "GLQ"=3, "LwQ"=4, "Rec"=5, "Unf"=6, "NA"=0)
housing_test$BsmtFinType2<- BsmtFinType2
housing_train$BsmtFinType2[is.na(housing_train$BsmtFinType2)] <- 0
sum(is.na(housing_train$BsmtFinType2))
## [1] 0
housing_test$BsmtFinType2[is.na(housing_test$BsmtFinType2)] <- 0
sum(is.na(housing_test$BsmtFinType2))
## [1] 0
# Transforming Heating column to numeric in train dataset
Heating <-as.factor(housing_train$Heating)
levels(Heating)
## [1] "Floor" "GasA" "GasW" "Grav" "OthW" "Wall"
Heating=as.numeric(Heating,"Floor"=1, "GasA"=2, "GasW"=3, "Grav"=4, "OthW"=5, "Wall"=6, "NA"=0)
housing_train$Heating <- Heating
# Transforming Heating column to numeric in test dataset
Heating <-as.factor(housing_test$Heating)
levels(Heating)
## [1] "GasA" "GasW" "Grav" "Wall"
Heating=as.numeric(Heating,"Floor"=1, "GasA"=2, "GasW"=3, "Grav"=4, "OthW"=5, "Wall"=6, "NA"=0)
housing_test$Heating<- Heating
# Transforming HeatingQC column to numeric in train dataset
HeatingQC <-as.factor(housing_train$HeatingQC)
levels(HeatingQC)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
HeatingQC=as.numeric(HeatingQC,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5)
housing_train$HeatingQC <- HeatingQC
# Transforming HHeatingQC column to numeric in test dataset
HeatingQC <-as.factor(housing_test$HeatingQC)
levels(HeatingQC)
## [1] "Ex" "Fa" "Gd" "Po" "TA"
HeatingQC=as.numeric(HeatingQC,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5)
housing_test$HeatingQC<- HeatingQC
# Transforming CentralAir column to numeric in train dataset
CentralAir <-as.factor(housing_train$CentralAir)
levels(CentralAir)
## [1] "N" "Y"
CentralAir=as.numeric(CentralAir,"N"=0, "Y"=1)
housing_train$CentralAir <- CentralAir
# Transforming CentralAir column to numeric in test dataset
CentralAir <-as.factor(housing_test$CentralAir)
levels(CentralAir)
## [1] "N" "Y"
CentralAir=as.numeric(CentralAir,"N"=0, "Y"=1)
housing_test$CentralAir<- CentralAir
# Transforming Electrical column to numeric in train dataset
Electrical <-as.factor(housing_train$Electrical)
levels(Electrical)
## [1] "FuseA" "FuseF" "FuseP" "Mix" "SBrkr"
Electrical=as.numeric(Electrical,"SBrkr"=1, "FuseA"=2, "FuseF"=3, "FuseP"=4, "Mix"=5, "NA"=0)
housing_train$Electrical <- Electrical
# Transforming Electrical column to numeric in test dataset
Electrical <-as.factor(housing_test$Electrical)
levels(Electrical)
## [1] "FuseA" "FuseF" "FuseP" "SBrkr"
Electrical=as.numeric(Electrical,"SBrkr"=1, "FuseA"=2, "FuseF"=3, "FuseP"=4, "Mix"=5 )
housing_test$Electrical<- Electrical
housing_train$Electrical[is.na(housing_train$Electrical)] <- 0
sum(is.na(housing_train$Electrical))
## [1] 0
housing_test$Electrical[is.na(housing_test$Electrical)] <- 0
sum(is.na(housing_test$Electrical))
## [1] 0
# Transforming KitchenQual column to numeric in train dataset
KitchenQual <-as.factor(housing_train$KitchenQual)
levels(KitchenQual)
## [1] "Ex" "Fa" "Gd" "TA"
KitchenQual=as.numeric(KitchenQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5)
housing_train$KitchenQual <- KitchenQual
# Transforming KitchenQual column to numeric in test dataset
KitchenQual <-as.factor(housing_test$KitchenQual)
levels(KitchenQual)
## [1] "Ex" "Fa" "Gd" "TA"
KitchenQual=as.numeric(KitchenQual,"Ex"=1, "Gd"=2, "TA"=3, "Fa"=4, "Po"=5 )
housing_test$KitchenQual<- KitchenQual
#Transforming Functional column to numeric in train dataset
Functional=as.factor(housing_train$Functional)
levels(Functional) #"Maj1" "Maj2" "Min1" "Min2" "Mod" "Sev" "Typ"
## [1] "Maj1" "Maj2" "Min1" "Min2" "Mod" "Sev" "Typ"
Functional=as.numeric(Functional, "Sev"=1, "Maj2"=2,"Maj1"=3, "Mod"=4, "Min2"=5, "Min1"=6, "Typ"=7)
Functional=Functional+1 # "Sal"=1, Sev"=2, "Maj2"=3,"Maj1"=4, "Mod"=5, "Min2"=6, "Min1"=7, "Typ"=8
housing_train$Functional <- Functional
#Transforming Functional column to numeric in test dataset
Functional=as.factor(housing_test$Functional)
levels(Functional) #"Maj1" "Maj2" "Min1" "Min2" "Mod" "Sev" "Typ"
## [1] "Maj1" "Maj2" "Min1" "Min2" "Mod" "Sev" "Typ"
Functional=as.numeric(Functional, "Sev"=1, "Maj2"=2,"Maj1"=3, "Mod"=4, "Min2"=5, "Min1"=6, "Typ"=7, "NA"=0)
Functional=Functional +1 # "Sal"=1, Sev"=2, "Maj2"=3,"Maj1"=4, "Mod"=5, "Min2"=6, "Min1"=7, "Typ"=8
housing_test$Functional <- Functional
#Transforming FireplaceQu column to numeric in train dataset
FireplaceQu=as.factor(housing_train$FireplaceQu)
levels(FireplaceQu) #"Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(FireplaceQu)) # 690 Missing Entries
## [1] 690
FireplaceQu=as.numeric(FireplaceQu, "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5, "NA"=0)
FireplaceQu[is.na(FireplaceQu)]<-0
housing_train$FireplaceQu <- FireplaceQu
#Transforming FireplaceQu column to numeric in test dataset
FireplaceQu=as.factor(housing_test$FireplaceQu)
levels(FireplaceQu) #"Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(FireplaceQu)) # 730 Missing Entries
## [1] 730
FireplaceQu=as.numeric(FireplaceQu, "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5, "NA"=0)
FireplaceQu[is.na(FireplaceQu)]<-0
housing_test$FireplaceQu <- FireplaceQu
#Transforming GarageType column to numeric in train dataset
GarageType=as.factor(housing_train$GarageType)
levels(GarageType) #"2Types" "Attchd" "Basment" "BuiltIn" "CarPort" "Detchd"
## [1] "2Types" "Attchd" "Basment" "BuiltIn" "CarPort" "Detchd"
sum(is.na(GarageType)) # 81 Missing Entries
## [1] 81
GarageType=as.numeric(GarageType, "Detchd"=1, "CarPort"=2,"BuiltIn"=3, "Basment"=4, "Attchd"=5, "2Types"=6)
GarageType[is.na(GarageType)]<-0
housing_train$GarageType <- GarageType
#Transforming GarageType column to numeric in test dataset
GarageType=as.factor(housing_test$GarageType)
levels(GarageType) #"2Types" "Attchd" "Basment" "BuiltIn" "CarPort" "Detchd"
## [1] "2Types" "Attchd" "Basment" "BuiltIn" "CarPort" "Detchd"
sum(is.na(GarageType)) # 76 Missing Entries
## [1] 76
GarageType=as.numeric(GarageType, "Detchd"=1, "CarPort"=2,"BuiltIn"=3, "Basment"=4, "Attchd"=5, "2Types"=6)
GarageType[is.na(GarageType)]<-0
housing_test$GarageType <- GarageType
# Changing missing values of GarageYrBlt
sum(is.na(housing_train$GarageYrBlt)) # 81 missing values
## [1] 81
sum(is.na(housing_test$GarageYrBlt)) #78 missing values
## [1] 78
housing_train$GarageYrBlt[is.na(housing_train$GarageYrBlt)] <- 0
sum(is.na(housing_train$GarageYrBlt))
## [1] 0
housing_test$GarageYrBlt[is.na(housing_test$GarageYrBlt)] <- 0
sum(is.na(housing_test$GarageYrBlt))
## [1] 0
#Transforming GarageFinish column to numeric in train dataset
GarageFinish=as.factor(housing_train$GarageFinish)
levels(GarageFinish) #"Fin" "RFn" "Unf"
## [1] "Fin" "RFn" "Unf"
sum(is.na(GarageFinish)) # 81 Missing Entries
## [1] 81
GarageFinish=as.numeric(GarageFinish, "Unf"=1, "RFn"=2,"Fin"=3)
GarageFinish[is.na(GarageFinish)]<-0
housing_train$GarageFinish <- GarageFinish
#Transforming GarageFinish column to numeric in test dataset
GarageFinish=as.factor(housing_test$GarageFinish)
levels(GarageFinish) #"Fin" "RFn" "Unf"
## [1] "Fin" "RFn" "Unf"
sum(is.na(GarageFinish)) # 78 Missing Entries
## [1] 78
GarageFinish=as.numeric(GarageFinish, "Unf"=1, "RFn"=2,"Fin"=3)
GarageFinish[is.na(GarageFinish)]<-0
housing_test$GarageFinish <- GarageFinish
# Changing missing values of GarageCars
sum(is.na(housing_train$GarageCars)) # no missing values
## [1] 0
sum(is.na(housing_test$GarageCars)) #1 missing value
## [1] 1
housing_test$GarageCars[is.na(housing_test$GarageCars)]<-0
# Changing missing values of GarageArea
sum(is.na(housing_train$GarageArea)) # no missing values
## [1] 0
sum(is.na(housing_test$GarageArea)) #1 missing value
## [1] 1
housing_test$GarageArea[is.na(housing_test$GarageArea)]<-0
#Transforming GarageQual column to numeric in train dataset
GarageQual=as.factor(housing_train$GarageQual)
levels(GarageQual) # "Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(GarageQual)) # 81 Missing Entries
## [1] 81
GarageQual=as.numeric(GarageQual, "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5)
GarageQual[is.na(GarageQual)]<-0
housing_train$GarageQual <- GarageQual
#Transforming GarageQual column to numeric in test dataset
GarageQual=as.factor(housing_test$GarageQual)
levels(GarageQual) # "Fa" "Gd" "Po" "TA"
## [1] "Fa" "Gd" "Po" "TA"
sum(is.na(GarageQual)) # 78 Missing Entries
## [1] 78
GarageQual=as.numeric(GarageQual, "Po"=1, "Fa"=2,"TA"=3, "Gd"=4)
GarageQual[is.na(GarageQual)]<-0
housing_test$GarageQual <- GarageQual
#Transforming GarageCond column to numeric in train dataset
GarageCond=as.factor(housing_train$GarageCond)
levels(GarageCond) # "Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(GarageCond)) # 81 Missing Entries
## [1] 81
GarageCond=as.numeric(GarageCond, "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5)
GarageCond[is.na(GarageCond)]<-0
housing_train$GarageCond <- GarageCond
#Transforming GarageCond column to numeric in test dataset
GarageCond=as.factor(housing_test$GarageCond)
levels(GarageCond) # "Ex" "Fa" "Gd" "Po" "TA"
## [1] "Ex" "Fa" "Gd" "Po" "TA"
sum(is.na(GarageCond)) # 78 Missing Entries
## [1] 78
GarageCond=as.numeric(GarageCond, "Po"=1, "Fa"=2,"TA"=3, "Gd"=4, "Ex"=5)
GarageCond[is.na(GarageCond)]<-0
housing_test$GarageCond <- GarageCond
#Transforming PavedDrive column to numeric in train dataset
PavedDrive=as.factor(housing_train$PavedDrive)
levels(PavedDrive) # "N" "P" "Y"
## [1] "N" "P" "Y"
sum(is.na(PavedDrive)) # 0 Missing Entries
## [1] 0
PavedDrive=as.numeric(PavedDrive, "N"=1, "P"=2,"Y"=3)
housing_train$PavedDrive <- PavedDrive
#Transforming PavedDrive column to numeric in test dataset
PavedDrive=as.factor(housing_test$PavedDrive)
levels(PavedDrive) # "N" "P" "Y"
## [1] "N" "P" "Y"
sum(is.na(PavedDrive)) # 0 Missing Entries
## [1] 0
PavedDrive=as.numeric(PavedDrive, "N"=1, "P"=2,"Y"=3)
housing_test$PavedDrive <- PavedDrive
#Transforming PoolQC column to numeric in train dataset
PoolQC=as.factor(housing_train$PoolQC)
levels(PoolQC) # "N" "P" "Y"
## [1] "Ex" "Fa" "Gd"
sum(is.na(PoolQC)) # 1453 Missing Entries
## [1] 1453
PoolQC=as.numeric(PoolQC, "Fa"=1, "Gd"=2,"Ex"=3)
PoolQC <-ifelse(PoolQC==2|PoolQC==3,PoolQC+1,PoolQC) # No pool=0, Fa=1, TA=2, Gd=3, Ex=4
PoolQC[is.na(PoolQC)]<-0
housing_train$PoolQC <- PoolQC
#Transforming PoolQC column to numeric in test dataset
PoolQC=as.factor(housing_test$PoolQC)
levels(PoolQC) # "Ex" "Gd"
## [1] "Ex" "Gd"
sum(is.na(PoolQC)) # 1456 Missing Entries
## [1] 1456
PoolQC=as.numeric(PoolQC, "Gd"=1, "Ex"=2)
PoolQC=PoolQC+2
PoolQC[is.na(PoolQC)]<-0
housing_test$PoolQC <- PoolQC
#Transforming Fence column to numeric in train dataset
Fence=as.factor(housing_train$Fence)
levels(Fence) # "GdPrv" "GdWo" "MnPrv" "MnWw"
## [1] "GdPrv" "GdWo" "MnPrv" "MnWw"
sum(is.na(Fence)) # 1179 Missing Entries
## [1] 1179
Fence=as.numeric(Fence, "MnWw"=1, "GdWo"=2,"MnPrv"=3, "GdPrv"=4)
Fence[is.na(Fence)]<-0
housing_train$Fence <- Fence
#Transforming Fence column to numeric in test dataset
Fence=as.factor(housing_test$Fence)
levels(Fence) # "GdPrv" "GdWo" "MnPrv" "MnWw"
## [1] "GdPrv" "GdWo" "MnPrv" "MnWw"
sum(is.na(Fence)) # 1169 Missing Entries
## [1] 1169
Fence=as.numeric(Fence, "MnWw"=1, "GdWo"=2,"MnPrv"=3, "GdPrv"=4)
Fence[is.na(Fence)]<-0
housing_test$Fence <- Fence
#Transforming MiscFeature column to numeric in train dataset
MiscFeature=as.factor(housing_train$MiscFeature)
levels(MiscFeature) # "Gar2" "Othr" "Shed" "TenC"
## [1] "Gar2" "Othr" "Shed" "TenC"
sum(is.na(MiscFeature)) # 1406 Missing Entries
## [1] 1406
MiscFeature=as.numeric(MiscFeature, "TenC"=1, "Shed"=2,"Othr"=3, "Gar2"=4)
MiscFeature[is.na(MiscFeature)]<-0
housing_train$MiscFeature <- MiscFeature
#Transforming MiscFeature column to numeric in test dataset
MiscFeature=as.factor(housing_test$MiscFeature)
levels(MiscFeature) # "Gar2" "Othr" "Shed"
## [1] "Gar2" "Othr" "Shed"
sum(is.na(MiscFeature)) # 1408 Missing Entries
## [1] 1408
MiscFeature=as.numeric(MiscFeature, "Shed"=1,"Othr"=2, "Gar2"=3)
MiscFeature=MiscFeature+1
MiscFeature[is.na(MiscFeature)]<-0 #"TenC"=1, "Shed"=2,"Othr"=3, "Gar2"=4
housing_test$MiscFeature <- MiscFeature
#Transforming SaleType column to numeric in train dataset
SaleType=as.factor(housing_train$SaleType)
levels(SaleType) # "COD" "Con" "ConLD" "ConLI" "ConLw" "CWD" "New" "Oth" "WD"
## [1] "COD" "Con" "ConLD" "ConLI" "ConLw" "CWD" "New" "Oth" "WD"
sum(is.na(SaleType)) # 0 Missing Entries
## [1] 0
SaleType=as.numeric(SaleType, "Oth"=1, "ConLD"=2,"ConLI"=3, "ConLw"=4, "Con"=5, "COD"=6, "New"=7, "CWD"=8, "WD"=9)
housing_train$SaleType <- SaleType
#Transforming SaleType column to numeric in test dataset
SaleType=as.factor(housing_test$SaleType)
levels(SaleType) # "COD" "Con" "ConLD" "ConLI" "ConLw" "CWD" "New" "Oth" "WD"
## [1] "COD" "Con" "ConLD" "ConLI" "ConLw" "CWD" "New" "Oth" "WD"
sum(is.na(SaleType)) # 1 Missing Entries
## [1] 1
SaleType=as.numeric(SaleType, "Oth"=1, "ConLD"=2,"ConLI"=3, "ConLw"=4, "Con"=5, "COD"=6, "New"=7, "CWD"=8, "WD"=9)
SaleType[is.na(SaleType)]<-1
housing_test$SaleType <- SaleType
#Transforming SaleCondition column to numeric in train dataset
SaleCondition=as.factor(housing_train$SaleCondition)
levels(SaleCondition) # "Abnorml" "AdjLand" "Alloca" "Family" "Normal" "Partial"
## [1] "Abnorml" "AdjLand" "Alloca" "Family" "Normal" "Partial"
sum(is.na(SaleCondition)) # 0 Missing Entries
## [1] 0
SaleCondition=as.numeric(SaleCondition, "Partial"=1, "Family"=2,"Alloca"=3, "AdjLand"=4, "Abnorml"=5, "Normal"=6)
housing_train$SaleCondition <- SaleCondition
#Transforming SaleCondition column to numeric in test dataset
SaleCondition=as.factor(housing_test$SaleCondition)
levels(SaleCondition) # "Abnorml" "AdjLand" "Alloca" "Family" "Normal" "Partial"
## [1] "Abnorml" "AdjLand" "Alloca" "Family" "Normal" "Partial"
sum(is.na(SaleCondition)) # 0 Missing Entries
## [1] 0
SaleCondition=as.numeric(SaleCondition, "Partial"=1, "Family"=2,"Alloca"=3, "AdjLand"=4, "Abnorml"=5, "Normal"=6)
housing_test$SaleCondition <- SaleCondition
##Replacing NA with Zeros
data.frame(num_missing=colSums(is.na(housing_train)))
## num_missing
## Id 0
## MSSubClass 0
## MSZoning 0
## LotFrontage 259
## LotArea 0
## Street 0
## Alley 1369
## LotShape 0
## LandContour 0
## Utilities 0
## LotConfig 0
## LandSlope 0
## Neighborhood 0
## Condition1 0
## Condition2 0
## BldgType 0
## HouseStyle 0
## OverallQual 0
## OverallCond 0
## YearBuilt 0
## YearRemodAdd 0
## RoofStyle 0
## RoofMatl 0
## Exterior1st 0
## Exterior2nd 0
## MasVnrType 8
## MasVnrArea 8
## ExterQual 0
## ExterCond 0
## Foundation 0
## BsmtQual 37
## BsmtCond 37
## BsmtExposure 38
## BsmtFinType1 0
## BsmtFinSF1 0
## BsmtFinType2 0
## BsmtFinSF2 0
## BsmtUnfSF 0
## TotalBsmtSF 0
## Heating 0
## HeatingQC 0
## CentralAir 0
## Electrical 0
## X1stFlrSF 0
## X2ndFlrSF 0
## LowQualFinSF 0
## GrLivArea 0
## BsmtFullBath 0
## BsmtHalfBath 0
## FullBath 0
## HalfBath 0
## BedroomAbvGr 0
## KitchenAbvGr 0
## KitchenQual 0
## TotRmsAbvGrd 0
## Functional 0
## Fireplaces 0
## FireplaceQu 0
## GarageType 0
## GarageYrBlt 0
## GarageFinish 0
## GarageCars 0
## GarageArea 0
## GarageQual 0
## GarageCond 0
## PavedDrive 0
## WoodDeckSF 0
## OpenPorchSF 0
## EnclosedPorch 0
## X3SsnPorch 0
## ScreenPorch 0
## PoolArea 0
## PoolQC 0
## Fence 0
## MiscFeature 0
## MiscVal 0
## MoSold 0
## YrSold 0
## SaleType 0
## SaleCondition 0
## SalePrice 0
data.frame(num_missing=colSums(is.na(housing_test)))
## num_missing
## Id 0
## MSSubClass 0
## MSZoning 4
## LotFrontage 227
## LotArea 0
## Street 0
## Alley 1352
## LotShape 0
## LandContour 0
## Utilities 2
## LotConfig 0
## LandSlope 0
## Neighborhood 0
## Condition1 0
## Condition2 0
## BldgType 0
## HouseStyle 0
## OverallQual 0
## OverallCond 0
## YearBuilt 0
## YearRemodAdd 0
## RoofStyle 0
## RoofMatl 0
## Exterior1st 1
## Exterior2nd 1
## MasVnrType 16
## MasVnrArea 15
## ExterQual 0
## ExterCond 0
## Foundation 0
## BsmtQual 44
## BsmtCond 45
## BsmtExposure 44
## BsmtFinType1 0
## BsmtFinSF1 1
## BsmtFinType2 0
## BsmtFinSF2 1
## BsmtUnfSF 1
## TotalBsmtSF 1
## Heating 0
## HeatingQC 0
## CentralAir 0
## Electrical 0
## X1stFlrSF 0
## X2ndFlrSF 0
## LowQualFinSF 0
## GrLivArea 0
## BsmtFullBath 2
## BsmtHalfBath 2
## FullBath 0
## HalfBath 0
## BedroomAbvGr 0
## KitchenAbvGr 0
## KitchenQual 1
## TotRmsAbvGrd 0
## Functional 2
## Fireplaces 0
## FireplaceQu 0
## GarageType 0
## GarageYrBlt 0
## GarageFinish 0
## GarageCars 0
## GarageArea 0
## GarageQual 0
## GarageCond 0
## PavedDrive 0
## WoodDeckSF 0
## OpenPorchSF 0
## EnclosedPorch 0
## X3SsnPorch 0
## ScreenPorch 0
## PoolArea 0
## PoolQC 0
## Fence 0
## MiscFeature 0
## MiscVal 0
## MoSold 0
## YrSold 0
## SaleType 0
## SaleCondition 0
Input 0 for missing values.
housing_train$LotFrontage[is.na(housing_train$LotFrontage)] <- 0
sum(is.na(housing_train$LotFrontage))
## [1] 0
housing_test$LotFrontage[is.na(housing_test$LotFrontage)] <- 0
sum(is.na(housing_test$LotFrontage))
## [1] 0
housing_train$BsmtQual[is.na(housing_train$BsmtQual)] <- 0
sum(is.na(housing_train$BsmtQual))
## [1] 0
housing_test$BsmtQual[is.na(housing_test$BsmtQual)] <- 0
sum(is.na(housing_test$BsmtQual))
## [1] 0
housing_train$MasVnrType[is.na(housing_train$MasVnrType)] <- 0
sum(is.na(housing_train$MasVnrType))
## [1] 0
housing_test$MasVnrType[is.na(housing_test$MasVnrType)] <- 0
sum(is.na(housing_test$MasVnrType))
## [1] 0
housing_train$MasVnrArea[is.na(housing_train$MasVnrArea)] <- 0
sum(is.na(housing_train$MasVnrArea))
## [1] 0
housing_test$MasVnrArea[is.na(housing_test$MasVnrArea)] <- 0
sum(is.na(housing_test$MasVnrArea))
## [1] 0
housing_train$BsmtCond[is.na(housing_train$BsmtCond)] <- 0
sum(is.na(housing_train$BsmtCond))
## [1] 0
housing_test$BsmtCond[is.na(housing_test$BsmtCond)] <- 0
sum(is.na(housing_test$BsmtCond))
## [1] 0
housing_train$BsmtExposure[is.na(housing_train$BsmtExposure)] <- 0
sum(is.na(housing_train$BsmtExposure))
## [1] 0
housing_test$BsmtExposure[is.na(housing_test$BsmtExposure)] <- 0
sum(is.na(housing_test$BsmtExposure))
## [1] 0
housing_train$MiscFeature[is.na(housing_train$MiscFeature)] <- 0
sum(is.na(housing_train$MiscFeature))
## [1] 0
housing_test$GarageQual[is.na(housing_test$GarageQual)] <- 0
sum(is.na(housing_test$GarageQual))
## [1] 0
housing_test$MSZoning[is.na(housing_test$MSZoning)] <- 0
sum(is.na(housing_test$MSZoning))
## [1] 0
housing_test$Exterior1st[is.na(housing_test$Exterior1st)] <- 0
sum(is.na(housing_test$Exterior1st))
## [1] 0
housing_test$Exterior2nd[is.na(housing_test$Exterior2nd)] <- 0
sum(is.na(housing_test$Exterior2nd))
## [1] 0
housing_test$BsmtFinSF1[is.na(housing_test$BsmtFinSF1)] <- 0
sum(is.na(housing_test$BsmtFinSF1))
## [1] 0
housing_test$BsmtFinSF2[is.na(housing_test$BsmtFinSF2)] <- 0
sum(is.na(housing_test$BsmtFinSF2 ))
## [1] 0
housing_test$BsmtUnfSF[is.na(housing_test$BsmtUnfSF)] <- 0
sum(is.na(housing_test$BsmtUnfSF))
## [1] 0
housing_test$BsmtFullBath[is.na(housing_test$BsmtFullBath)] <- 0
sum(is.na(housing_test$BsmtFullBath))
## [1] 0
housing_test$BsmtHalfBath[is.na(housing_test$BsmtHalfBath)] <- 0
sum(is.na(housing_test$BsmtHalfBath))
## [1] 0
housing_test$KitchenQual[is.na(housing_test$KitchenQual)] <- 0
sum(is.na(housing_test$KitchenQual))
## [1] 0
housing_test$Functional[is.na(housing_test$Functional)] <- 0
sum(is.na(housing_test$Functional))
## [1] 0
#housing_train$Alley[is.na(housing_train$Alley)] <- 0
#sum(is.na(housing_train$Alley))
#housing_test$Alley[is.na(housing_test$Alley)] <- 0
#sum(is.na(housing_test$Alley))
Everything else will have 0 for missing values since that aren’t many
values missing.
For Alley the column will be stored since most of the rows do not have a
value. The model was run with Alley and without alley, it shows that
alley column variable makes the model slightly worst so the variable
will be excluded.
trainAlley <- housing_train$Alley
testAlley <- housing_test$Alley
housing_train$Alley <- NULL
housing_test$Alley <- NULL
Everything else will have 0 for missing values since that aren’t many values missing.
#Visualizations The next step it is to create visualizations to start
the analysis.
Looking at the correlation between variables:
#Libraries for the next visualizations
library(ggcorrplot)
## Loading required package: ggplot2
library(ggplot2)
correlations <- cor(housing_train[,c(2:15
,80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower", sig.level = 0.01, insig = "blank")
Looking at positive correlations the variables that show the strongest
correlation is MSSubClass and BldgType, followed by landSlope and
LotArea. Looking a negative correlations there are greater correlations
on Utilities and LandSlope, BlgdType with LotFrontage and LotArea. Now
looking on SalesPrice variables MSSubClass, MSZoning, LotShape,
LotConfig, BldgType have negative correlation with sales price. The
variables LotFrontage, LotArea, Neighborhood, and condition have a
positive correlation.
Looking at a few more variables.
correlations <- cor(housing_train[,c(16:26, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower", sig.level = 0.01, insig = "blank")
Looking at the variables that have greater correlation to sales price. OverallQual, YearBuilt, YearRemondAdd, MasVnrArea, and RoofStyle have a positive correlation. The only variable that has a negative correlation to SalesPrice is OverallCond.
correlations <- cor(housing_train[,c(27:40, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower", sig.level = 0.01, insig = "blank")
In this set of variables it shows that ExterQual has a negative correlation to the Sales price, as well as BsmtQual, HeatingQC, and BsmtQUal. TotalBsmtSF has a positive correlation followed by BsmtFinSF1, and foundation.
correlations <- cor(housing_train[,c(41:60, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower", sig.level = 0.01, insig = "blank")
All the variables that are relagted to living spaces and quality have a positive correlation, the only living space that has a negative correlation is the kitchen. Something to look into to see what is influecing about kitchens.
correlations <- cor(housing_train[,c(61:79, 80)], use="everything")
corrplot::corrplot(correlations, method="circle", type="lower", sig.level = 0.01, insig = "blank")
At last, garage area is the last thing that has a stroger correlation
to SalesPrice. Other amenities do not seem to be as important.
In this case a fireplace has a good relation between other amenities of
the house.
pairs(SalePrice~YearBuilt+OverallQual+TotalBsmtSF+GrLivArea,data=housing_train,
main="Simple Scatterplot Matrix")
Looking at the plots the data seems to be well distributed but also
show how the variables correlate.
The following chart shows the sale price comparing to the year it was
build.
#install.packages('carData')
library(car)
## Loading required package: carData
scatterplot(SalePrice ~ YearBuilt, data=housing_train, xlab="Year Built", ylab="Sale Price", grid=FALSE)
We can see a correlation between the year it was build and increase of
price. Overall, prices seem to have a higher increase after 1980 and
after the 2000 there are more values scatter towards higher prices.
scatterplot(SalePrice ~ YrSold, data=housing_train, xlab="Year Built", ylab="Sale Price", grid=FALSE)
The Year Built vs Sale Price shows how the 2008 market could influenced the sale price of houses, the data varies a bit more in that year as it kinda shows a small decline. There is a slight increase in 2009 for sale price but after 2009 it seems it is back to a normal linear line.
Looking at the house LotArea comparing to sales price.
scatterplot(SalePrice ~ LotArea, data=housing_train, xlab="Lot Area", ylab="Sale Price", grid=FALSE)
The chart shows a non-linear relationship between the size of the lot
and the Sales Price. This shows that house aspects have a greater weight
on the price of the house.
Looking at the Sale rice and the Square foot of basement finished.
scatterplot(SalePrice ~ X1stFlrSF, data=housing_train, xlab="1st Floor Square Foot", ylab="Sale Price", grid=FALSE)
At last, 1st floor Square Foot shows a strong relathionship to Sales
price, but still having data points outside the expected.
#Modeling Preparation The data will be partition for analysis using
caret function.
#Data partition using caret partition function.
#install.packages('lattice')
library(caret)
## Loading required package: lattice
#Packages for RMSE
#install.packages('Metrics')
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
#Naming the Sale Price as Outcome
outcome <- housing_train$SalePrice
#Partition the data to be 60% train and 40% test
partition <- createDataPartition(y=outcome, p=.6, list=FALSE)
train <- housing_train[partition,]
test <- housing_train[-partition,]
After testing the models I couple of times with different train and test sets, reducing the train set would improve the XGBoost but increase the error for Linear Regression. Train 60% of the data was where the best prediction was display for XGBoost, using 50% as train data made our prediction error increase by 2%. The first part of the project will use a differen percentage of train data than the XGBoost to make the model better.
#Data Analysis
##Linear Regression
The first step it is to create a linear model to see what variables
share a strong relation with the Sale Price.
LM_model1 <- lm(SalePrice ~., data=train)
summary(LM_model1)
##
## Call:
## lm(formula = SalePrice ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132185 -12383 -473 11930 164660
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.402e+05 1.342e+06 0.701 0.483605
## Id 6.066e-02 2.092e+00 0.029 0.976881
## MSSubClass -3.974e+01 4.697e+01 -0.846 0.397717
## MSZoning -1.528e+03 1.585e+03 -0.964 0.335116
## LotFrontage 4.709e+01 2.845e+01 1.655 0.098345 .
## LotArea 5.614e-01 9.058e-02 6.198 9.17e-10 ***
## Street 4.456e+04 1.376e+04 3.239 0.001249 **
## LotShape -1.076e+03 6.742e+02 -1.596 0.110889
## LandContour -1.614e+03 1.335e+03 -1.208 0.227227
## Utilities -2.598e+04 2.644e+04 -0.983 0.326062
## LotConfig -6.484e+01 5.514e+02 -0.118 0.906419
## LandSlope -1.901e+03 3.524e+03 -0.539 0.589744
## Neighborhood -6.981e+01 1.552e+02 -0.450 0.653048
## Condition1 3.708e+02 1.025e+03 0.362 0.717648
## Condition2 1.729e+03 5.936e+03 0.291 0.770909
## BldgType -2.176e+03 1.525e+03 -1.427 0.154056
## HouseStyle 4.213e+02 6.431e+02 0.655 0.512620
## OverallQual 8.312e+03 1.211e+03 6.862 1.36e-11 ***
## OverallCond 5.817e+03 1.094e+03 5.319 1.36e-07 ***
## YearBuilt 2.998e+02 7.463e+01 4.016 6.47e-05 ***
## YearRemodAdd 1.332e+00 7.075e+01 0.019 0.984988
## RoofStyle 1.894e+03 1.125e+03 1.683 0.092792 .
## RoofMatl -1.359e+03 1.599e+03 -0.850 0.395461
## Exterior1st -1.060e+03 5.449e+02 -1.945 0.052118 .
## Exterior2nd 5.364e+02 4.891e+02 1.097 0.273097
## MasVnrType 5.592e+03 1.491e+03 3.752 0.000188 ***
## MasVnrArea 2.000e+01 6.017e+00 3.323 0.000930 ***
## ExterQual -8.431e+03 1.999e+03 -4.216 2.77e-05 ***
## ExterCond 1.442e+03 1.338e+03 1.078 0.281287
## Foundation 8.651e+01 1.730e+03 0.050 0.960138
## BsmtQual -7.055e+03 1.343e+03 -5.253 1.92e-07 ***
## BsmtCond 1.849e+03 1.403e+03 1.318 0.187946
## BsmtExposure -1.172e+03 8.843e+02 -1.326 0.185306
## BsmtFinType1 -3.425e+02 6.244e+02 -0.549 0.583460
## BsmtFinSF1 3.936e+01 5.290e+00 7.441 2.59e-13 ***
## BsmtFinType2 5.009e+02 1.133e+03 0.442 0.658563
## BsmtFinSF2 2.784e+01 7.828e+00 3.557 0.000397 ***
## BsmtUnfSF 2.169e+01 4.903e+00 4.425 1.10e-05 ***
## TotalBsmtSF NA NA NA NA
## Heating 5.935e+02 2.983e+03 0.199 0.842349
## HeatingQC -4.407e+02 6.293e+02 -0.700 0.483988
## CentralAir 9.773e+02 4.525e+03 0.216 0.829055
## Electrical -2.697e+02 8.963e+02 -0.301 0.763575
## X1stFlrSF 6.627e+01 6.183e+00 10.718 < 2e-16 ***
## X2ndFlrSF 6.406e+01 5.007e+00 12.794 < 2e-16 ***
## LowQualFinSF -1.144e+01 1.781e+01 -0.642 0.520806
## GrLivArea NA NA NA NA
## BsmtFullBath 1.463e+03 2.478e+03 0.591 0.554972
## BsmtHalfBath -5.360e+02 3.830e+03 -0.140 0.888750
## FullBath -3.169e+03 2.706e+03 -1.171 0.241992
## HalfBath -1.204e+03 2.538e+03 -0.474 0.635343
## BedroomAbvGr -5.880e+03 1.652e+03 -3.559 0.000394 ***
## KitchenAbvGr -1.818e+04 5.027e+03 -3.617 0.000317 ***
## KitchenQual -6.845e+03 1.476e+03 -4.638 4.12e-06 ***
## TotRmsAbvGrd 2.145e+02 1.193e+03 0.180 0.857389
## Functional 4.856e+03 9.288e+02 5.229 2.18e-07 ***
## Fireplaces 3.243e+03 2.591e+03 1.252 0.211090
## FireplaceQu -3.065e+02 7.947e+02 -0.386 0.699845
## GarageType 9.463e+02 6.359e+02 1.488 0.137122
## GarageYrBlt -4.371e+00 5.727e+00 -0.763 0.445574
## GarageFinish -1.344e+03 1.468e+03 -0.916 0.360171
## GarageCars 5.116e+03 2.917e+03 1.754 0.079840 .
## GarageArea 1.686e+01 9.489e+00 1.777 0.075905 .
## GarageQual -6.425e+02 1.795e+03 -0.358 0.720502
## GarageCond -6.917e+02 1.977e+03 -0.350 0.726510
## PavedDrive 3.557e+03 2.096e+03 1.697 0.090037 .
## WoodDeckSF 2.628e+01 7.643e+00 3.438 0.000615 ***
## OpenPorchSF 1.528e+01 1.522e+01 1.004 0.315832
## EnclosedPorch 3.595e+00 1.503e+01 0.239 0.811023
## X3SsnPorch 6.717e+00 2.712e+01 0.248 0.804440
## ScreenPorch 2.393e+01 1.692e+01 1.415 0.157541
## PoolArea 2.498e+02 5.242e+01 4.765 2.25e-06 ***
## PoolQC -4.050e+04 1.037e+04 -3.905 0.000102 ***
## Fence 2.556e+02 9.312e+02 0.274 0.783816
## MiscFeature -2.145e+03 1.638e+03 -1.309 0.190802
## MiscVal -4.737e-01 1.501e+00 -0.315 0.752493
## MoSold -6.882e+01 3.171e+02 -0.217 0.828250
## YrSold -7.965e+02 6.628e+02 -1.202 0.229816
## SaleType -2.163e+02 6.357e+02 -0.340 0.733755
## SaleCondition 2.620e+03 8.235e+02 3.181 0.001523 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24300 on 800 degrees of freedom
## Multiple R-squared: 0.9097, Adjusted R-squared: 0.901
## F-statistic: 104.7 on 77 and 800 DF, p-value: < 2.2e-16
#The competition asks to use RMSE for predicting error.
prediction_lm1 <- predict(LM_model1, test, type="response")
## Warning in predict.lm(LM_model1, test, type = "response"): prediction from a
## rank-deficient fit may be misleading
model_output <- cbind(test, prediction_lm1)
model_output$log_prediction <- log(model_output$prediction_lm1)
model_output$log_SalePrice <- log(model_output$SalePrice)
#Test with RMSE
rmse(model_output$log_SalePrice,model_output$log_prediction)
## [1] 0.1661432
Many variables can be excluded from the model, the model has a of explanation and the error is what seems really large.The new model is the following:
LM_model2 <- lm(SalePrice ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data=train)
summary(LM_model2)
##
## Call:
## lm(formula = SalePrice ~ LotArea + Street + Neighborhood + Condition1 +
## BldgType + OverallCond + OverallQual + YearBuilt + RoofMatl +
## MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF + X1stFlrSF +
## X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual + TotRmsAbvGrd +
## Fireplaces + GarageArea + GarageQual, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -127318 -14127 -1188 13938 218389
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.243e+05 9.307e+04 -9.932 < 2e-16 ***
## LotArea 6.144e-01 8.327e-02 7.379 3.79e-13 ***
## Street 5.084e+04 1.393e+04 3.649 0.000280 ***
## Neighborhood -4.111e+01 1.555e+02 -0.264 0.791590
## Condition1 5.620e+02 1.038e+03 0.541 0.588496
## BldgType -4.141e+03 8.585e+02 -4.823 1.67e-06 ***
## OverallCond 5.696e+03 9.201e+02 6.190 9.32e-10 ***
## OverallQual 1.038e+04 1.212e+03 8.560 < 2e-16 ***
## YearBuilt 4.443e+02 4.515e+01 9.840 < 2e-16 ***
## RoofMatl -1.048e+03 1.605e+03 -0.653 0.514100
## MasVnrArea 1.042e+01 5.697e+00 1.828 0.067830 .
## ExterQual -1.124e+04 2.017e+03 -5.575 3.32e-08 ***
## BsmtFinSF1 3.367e+01 3.741e+00 9.000 < 2e-16 ***
## BsmtUnfSF 1.000e+01 3.592e+00 2.785 0.005479 **
## X1stFlrSF 7.733e+01 5.194e+00 14.888 < 2e-16 ***
## X2ndFlrSF 6.394e+01 3.989e+00 16.028 < 2e-16 ***
## BedroomAbvGr -7.890e+03 1.647e+03 -4.791 1.96e-06 ***
## KitchenAbvGr -1.490e+04 4.873e+03 -3.057 0.002305 **
## KitchenQual -8.417e+03 1.521e+03 -5.532 4.21e-08 ***
## TotRmsAbvGrd 1.267e+00 1.187e+03 0.001 0.999149
## Fireplaces 3.385e+03 1.668e+03 2.029 0.042722 *
## GarageArea 3.708e+01 6.608e+00 5.611 2.72e-08 ***
## GarageQual -2.939e+03 8.868e+02 -3.314 0.000958 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26440 on 855 degrees of freedom
## Multiple R-squared: 0.8858, Adjusted R-squared: 0.8828
## F-statistic: 301.4 on 22 and 855 DF, p-value: < 2.2e-16
After excluding many of the variables the model has Residual standar
Error: and R-Square:
Prediction Linear Regression:
#The competition asks to use RMSE for predicting error.
prediction_lm <- predict(LM_model2, test, type="response")
model_output <- cbind(test, prediction_lm)
model_output$log_prediction <- log(model_output$prediction_lm)
model_output$log_SalePrice <- log(model_output$SalePrice)
#Test with RMSE
rmse(model_output$log_SalePrice,model_output$log_prediction)
## [1] 0.1763702
The model error is
Visualization to see if a transformation is necessary.
plot(LM_model2$fitted.values, LM_model2$residuals, pch = 20, col = "blue")
abline(h = 0)
The model seem to be around the 0 with some scatter prices when the fitted value increases.
#Package for BoxCox
library(MASS)
boxcox(LM_model2)
Using Log transformation may be helpful improving the linear regression.
model3 <- lm(I(log(SalePrice)) ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data=train)
summary(model3)
##
## Call:
## lm(formula = I(log(SalePrice)) ~ LotArea + Street + Neighborhood +
## Condition1 + BldgType + OverallCond + OverallQual + YearBuilt +
## RoofMatl + MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF +
## X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual +
## TotRmsAbvGrd + Fireplaces + GarageArea + GarageQual, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82295 -0.05835 0.00712 0.07346 0.42917
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.065e+00 4.544e-01 6.746 2.79e-11 ***
## LotArea 2.953e-06 4.065e-07 7.264 8.47e-13 ***
## Street 3.291e-01 6.802e-02 4.839 1.55e-06 ***
## Neighborhood -6.863e-05 7.593e-04 -0.090 0.928002
## Condition1 5.237e-03 5.069e-03 1.033 0.301901
## BldgType -1.588e-02 4.191e-03 -3.789 0.000162 ***
## OverallCond 5.522e-02 4.492e-03 12.293 < 2e-16 ***
## OverallQual 7.290e-02 5.919e-03 12.317 < 2e-16 ***
## YearBuilt 3.545e-03 2.204e-04 16.083 < 2e-16 ***
## RoofMatl 4.462e-03 7.838e-03 0.569 0.569327
## MasVnrArea -2.741e-05 2.781e-05 -0.985 0.324746
## ExterQual -2.301e-02 9.846e-03 -2.337 0.019681 *
## BsmtFinSF1 1.460e-04 1.826e-05 7.992 4.28e-15 ***
## BsmtUnfSF 5.641e-05 1.754e-05 3.217 0.001345 **
## X1stFlrSF 3.355e-04 2.536e-05 13.232 < 2e-16 ***
## X2ndFlrSF 2.695e-04 1.948e-05 13.835 < 2e-16 ***
## BedroomAbvGr -7.734e-03 8.041e-03 -0.962 0.336365
## KitchenAbvGr -4.920e-02 2.379e-02 -2.068 0.038964 *
## KitchenQual -1.491e-02 7.428e-03 -2.007 0.045083 *
## TotRmsAbvGrd 3.429e-03 5.793e-03 0.592 0.554096
## Fireplaces 3.509e-02 8.142e-03 4.310 1.82e-05 ***
## GarageArea 1.995e-04 3.226e-05 6.184 9.65e-10 ***
## GarageQual 4.396e-03 4.329e-03 1.015 0.310189
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1291 on 855 degrees of freedom
## Multiple R-squared: 0.8975, Adjusted R-squared: 0.8949
## F-statistic: 340.3 on 22 and 855 DF, p-value: < 2.2e-16
Residual standar Error: and R-Square: Now using RMSE:
prediction3 <- predict(model3, test, type="response")
model_output <- cbind(test, prediction3)
model_output$log_prediction3 <- log(model_output$prediction3)
model_output$log_SalePrice3 <- log(model_output$SalePrice)
#Test with RMSE
rmse(model_output$log_SalePrice3, model_output$log_prediction3)
## [1] 9.542621
The RMSE is higher than the previous model.
Looking at cooks distance plot to see if there are any outliers that can
influence the model. This will help decide if the variable should be
excluded or not.
mean(hatvalues(model3))
## [1] 0.0261959
QQ-plot looking at leverage of data points.
qqnorm(LM_model2$residuals, main = "LM_model2")
qqline(LM_model2$residuals)
abline(h = 0, col = "grey")
Overall the model does not need to remove any datapoints.
##Bagging For the bagging model no preparation needs to be made since the data was already changed from categorical to numeric. The model will start with 500 bootstrap samples and will be reduced as see fit.
#Package for bagging
#install.packages('ipred')
library(ipred)
house_bag <- bagging(formula = SalePrice ~., data = train, nbagg = 500)
house_bag
##
## Bagging regression trees with 500 bootstrap replications
##
## Call: bagging.data.frame(formula = SalePrice ~ ., data = train, nbagg = 500)
Out_of_bag Prediction
house_bag_oob <- bagging(formula = SalePrice~., data = train, coob = T, nbagg = 500)
house_bag_oob
##
## Bagging regression trees with 500 bootstrap replications
##
## Call: bagging.data.frame(formula = SalePrice ~ ., data = train, coob = T,
## nbagg = 500)
##
## Out-of-bag estimate of root mean squared error: 34823.62
The obb eror also seems really high, but still smaller than the
linear regression with no transformation ( linear regression = , oob =
).
The out of bag show a large error. Looking at the RMSE:
# Predict using the test set
house_bag_pred_1 <- predict(house_bag_oob, test)
model_output <- cbind(test, house_bag_pred_1)
model_output$log_prediction_bag <- log(model_output$house_bag_pred_1)
model_output$log_SalePrice_bag <- log(model_output$SalePrice)
#Test with RMSE
rmse(model_output$log_SalePrice_bag,model_output$log_prediction_bag)
## [1] 0.2000202
The prediction model is showing error.
house_bag2 <- bagging(formula = SalePrice ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data = train, nbagg = 500)
house_bag2
##
## Bagging regression trees with 500 bootstrap replications
##
## Call: bagging.data.frame(formula = SalePrice ~ LotArea + Street + Neighborhood +
## Condition1 + BldgType + OverallCond + OverallQual + YearBuilt +
## RoofMatl + MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF +
## X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual +
## TotRmsAbvGrd + Fireplaces + GarageArea + GarageQual, data = train,
## nbagg = 500)
# Predict using the test set
house_bag_pred_2 <- predict(house_bag2, test)
model_output2 <- cbind(test, house_bag_pred_2)
model_output2$log_prediction_bag2 <- log(model_output2$house_bag_pred_2)
model_output2$log_SalePrice_bag2 <- log(model_output2$SalePrice)
#Test with RMSE
rmse(model_output2$log_SalePrice_bag2,model_output2$log_prediction_bag2)
## [1] 0.2042415
Looking at the trees split and error.
ntree <- c(1, 3, 5, seq(20, 500, 20))
MSE_test <- rep(0, length(ntree))
for(i in 1:length(ntree)){
bag1 <- bagging(SalePrice~., data = train, nbagg = ntree[i])
predict <- predict(bag1, newdata = test)
MSE_test[i] <- mean((test$SalePrice - predict)^2)
}
plot(ntree, MSE_test, type = 'l', col = 2, lwd = 2, xaxt = "n")
axis(1, at = ntree, las = 1)
The chart shows a decline on trees at around 20, but the lowest us around 200 trees.
house_bag3 <- bagging(formula = SalePrice ~LotArea+Street+Neighborhood+Condition1+BldgType+OverallCond+OverallQual+YearBuilt+RoofMatl+MasVnrArea+ExterQual+BsmtFinSF1+BsmtUnfSF+X1stFlrSF+ X2ndFlrSF+BedroomAbvGr+KitchenAbvGr+KitchenQual+TotRmsAbvGrd+Fireplaces+GarageArea+GarageQual, data = train, nbagg = 250)
house_bag3
##
## Bagging regression trees with 250 bootstrap replications
##
## Call: bagging.data.frame(formula = SalePrice ~ LotArea + Street + Neighborhood +
## Condition1 + BldgType + OverallCond + OverallQual + YearBuilt +
## RoofMatl + MasVnrArea + ExterQual + BsmtFinSF1 + BsmtUnfSF +
## X1stFlrSF + X2ndFlrSF + BedroomAbvGr + KitchenAbvGr + KitchenQual +
## TotRmsAbvGrd + Fireplaces + GarageArea + GarageQual, data = train,
## nbagg = 250)
# Predict using the test set
house_bag_pred_3 <- predict(house_bag3, test)
model_output3 <- cbind(test, house_bag_pred_3)
model_output3$log_prediction_bag3 <- log(model_output3$house_bag_pred_3)
model_output3$log_SalePrice_bag3 <- log(model_output3$SalePrice)
#Test with RMSE
rmse(model_output3$log_SalePrice_bag3,model_output3$log_prediction_bag3)
## [1] 0.203948
The model prediction still did not improve from the previous
models.
##Random Florest
#Package for Randpom Florest
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
house_rf <- randomForest(SalePrice~., data = train, importance = TRUE)
house_rf
##
## Call:
## randomForest(formula = SalePrice ~ ., data = train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 26
##
## Mean of squared residuals: 785295483
## % Var explained: 86.82
Since random Florest has it owns filter for variables, there is no need to select the variables that showed correlations previously. The obb error is greater than the bagging model, but has a explanation.
# Predict using the test set
prediction_rf <- predict(house_rf, test)
model_output_rf <- cbind(test, prediction_rf)
model_output_rf$log_prediction_rf <- log(model_output_rf$prediction_rf)
model_output_rf$log_SalePrice_rf <- log(model_output_rf$SalePrice)
#Test with RMSE
rmse(model_output_rf$log_SalePrice_rf,model_output_rf$log_prediction_rf)
## [1] 0.1557613
The prediciton model has a smaller error than bagging showing a
error.
##XGBoost
#Package for XGBoost
#install.packages('xgboost')
library(xgboost)
Splitting the data again:
#Partition the data to be 60% train and 40% test
partition2 <- createDataPartition(y=outcome, p=.9, list=FALSE)
train <- housing_train[partition2,]
test <- housing_train[-partition2,]
The first step it is to transform the dataset into Sparse Matrix.
#Assemble and format the data - Using Log for Variable Sale Price
train$log_SalePrice <- log(train$SalePrice)
test$log_SalePrice <- log(test$SalePrice)
#Create matrices from the data frames
trainData<- as.matrix(train, rownames.force=NA)
testData<- as.matrix(test, rownames.force=NA)
#Turn the matrices into sparse matrices
train2 <- as(trainData, "sparseMatrix")
test2 <- as(testData, "sparseMatrix")
#colnames(train2)
#colnames(pred_data)
#Cross Validate the model
vars <- c(1:78) #Choose the variables
trainD <- xgb.DMatrix(data = train2[,vars], label = train2[,"SalePrice"]) #Convert to xgb.DMatrix format for space and efficiency
Creating a cross validation model
#Cross validate the model
cv.sparse <- xgb.cv(data = trainD,
nrounds = 500,
min_child_weight = 0,
max_depth = 10,
eta = 0.04,
subsample = .7,
colsample_bytree = .7,
booster = "gbtree",
eval_metric = "rmse",
print_every_n = 100,
nfold = 4,
nthread = 2,
objective="reg:linear")
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [12:19:45] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:189849.979523+1511.409729 test-rmse:189921.284886+4743.807534
## [101] train-rmse:9272.817965+313.818642 test-rmse:28146.745034+3265.877929
## [201] train-rmse:2694.760256+121.520884 test-rmse:26841.697785+2772.386454
## [301] train-rmse:1140.339352+35.077249 test-rmse:26742.019161+2661.761679
## [401] train-rmse:439.398337+25.838777 test-rmse:26702.003424+2627.430251
## [500] train-rmse:159.772079+22.728284 test-rmse:26698.238330+2620.314668
The model shows that a RMSE of _____ is possible when using around 500 rounds.
#Choose the parameters for the model - tunning the model
param <- list(colsample_bytree = .7, #amount of features for each tree
subsample = .7, #fractions of observation for random samples bt .5 and 1 lower than .5 is very conservative model
booster = "gbtree", #tree Based model for a linear model use 'gblinear'
max_depth = 10, #maximun dept of a tree
eta = 0.04, #makes the model more robust by shrinking the weight of each step
eval_metric = "rmse",
objective="reg:linear")
#Train the model using those parameters
bstSparse <-
xgb.train(params = param,
data = trainD,
nrounds = 500,
watchlist = list(train = trainD),
verbose = TRUE,
print_every_n = 100,
nthread = 2)
## [12:20:00] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:189965.179363
## [101] train-rmse:8814.407182
## [201] train-rmse:2705.367672
## [301] train-rmse:1304.936278
## [401] train-rmse:567.416511
## [500] train-rmse:213.331507
After running the model we can see that a train RMSE of 0.001744 Prediction of the bstSparse Model:
testD <- xgb.DMatrix(data = test2[,vars])
#Column names must match the inputs EXACTLY
prediction <- predict(bstSparse, testD) #Make the prediction based on the half of the training data set aside
#Put testing prediction and test dataset all together
test3 <- as.data.frame(as.matrix(test2))
prediction <- as.data.frame(as.matrix(prediction))
colnames(prediction) <- "prediction"
model_output <- cbind(test3, prediction)
model_output$log_prediction <- log(model_output$prediction)
model_output$log_SalePrice <- log(model_output$SalePrice)
#Test with RMSE
rmse(model_output$log_SalePrice,model_output$log_prediction)
## [1] 0.1192127
The RMSE error is 13.88% for the first model, after running many different model with different values the best RMSE was 1.11% error (but it varies between 1% and 2%)
#Changing the parameters
param2 <- list(colsample_bytree = .6,
subsample = .8,
booster = "gbtree",
max_depth = 12,
eta = 0.05,
eval_metric = "rmse",
objective="reg:linear")
Make a second model
#Train the model using those parameters
bstSparse2 <-
xgb.train(params = param2,
data = trainD,
nrounds = 500,
watchlist = list(train = trainD),
verbose = TRUE,
print_every_n = 100,
nthread = 2)
## [12:20:05] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:187987.981438
## [101] train-rmse:4339.814771
## [201] train-rmse:812.060518
## [301] train-rmse:196.878824
## [401] train-rmse:37.382146
## [500] train-rmse:6.728579
#Column names must match the inputs EXACTLY
prediction_2 <- predict(bstSparse2, testD) #Make the prediction based on the half of the training data set aside
#Put testing prediction and test dataset all together
test3 <- as.data.frame(as.matrix(test2))
prediction2 <- as.data.frame(as.matrix(prediction_2))
colnames(prediction2) <- "prediction"
output <- cbind(test3, prediction2)
output$log_prediction_2 <- log(output$prediction)
output$log_SalePrice2 <- log(output$SalePrice)
#Test with RMSE
rmse(output$log_SalePrice2,output$log_prediction_2)
## [1] 0.1185841
The RMSE error is ____ what is slightly higher than the previous
model.
Preparing the test data set
# Get the supplied test data ready #
predict <- as.data.frame(housing_test) #Get the dataset formatted as a frame for later combining
#Create matrices from the data frames
predData<- as.matrix(predict, rownames.force=NA)
#Turn the matrices into sparse matrices
predicting <- as(predData, "sparseMatrix")
#colnames(train[,c(2:79)])
vars <- c("Id", "MSSubClass", "MSZoning", "LotFrontage", "LotArea", "Street",
"LotShape", "LandContour", "Utilities", "LotConfig", "LandSlope", "Neighborhood",
"Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual", "OverallCond",
"YearBuilt", "YearRemodAdd", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd",
"MasVnrType", "MasVnrArea", "ExterQual", "ExterCond", "Foundation", "BsmtQual",
"BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinSF1", "BsmtFinType2", "BsmtFinSF2",
"BsmtUnfSF", "TotalBsmtSF", "Heating", "HeatingQC", "CentralAir", "Electrical",
"X1stFlrSF", "X2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath",
"FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "KitchenQual", "TotRmsAbvGrd",
"Functional", "Fireplaces", "FireplaceQu", "GarageType", "GarageYrBlt", "GarageFinish",
"GarageCars", "GarageArea", "GarageQual", "GarageCond", "PavedDrive", "WoodDeckSF","OpenPorchSF", "EnclosedPorch", "X3SsnPorch", "ScreenPorch", "PoolArea", "PoolQC",
"Fence", "MiscFeature", "MiscVal", "MoSold", "YrSold", "SaleType",
"SaleCondition")
colnames(predicting[,vars])
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "LotShape" "LandContour"
## [9] "Utilities" "LotConfig" "LandSlope" "Neighborhood"
## [13] "Condition1" "Condition2" "BldgType" "HouseStyle"
## [17] "OverallQual" "OverallCond" "YearBuilt" "YearRemodAdd"
## [21] "RoofStyle" "RoofMatl" "Exterior1st" "Exterior2nd"
## [25] "MasVnrType" "MasVnrArea" "ExterQual" "ExterCond"
## [29] "Foundation" "BsmtQual" "BsmtCond" "BsmtExposure"
## [33] "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2" "BsmtFinSF2"
## [37] "BsmtUnfSF" "TotalBsmtSF" "Heating" "HeatingQC"
## [41] "CentralAir" "Electrical" "X1stFlrSF" "X2ndFlrSF"
## [45] "LowQualFinSF" "GrLivArea" "BsmtFullBath" "BsmtHalfBath"
## [49] "FullBath" "HalfBath" "BedroomAbvGr" "KitchenAbvGr"
## [53] "KitchenQual" "TotRmsAbvGrd" "Functional" "Fireplaces"
## [57] "FireplaceQu" "GarageType" "GarageYrBlt" "GarageFinish"
## [61] "GarageCars" "GarageArea" "GarageQual" "GarageCond"
## [65] "PavedDrive" "WoodDeckSF" "OpenPorchSF" "EnclosedPorch"
## [69] "X3SsnPorch" "ScreenPorch" "PoolArea" "PoolQC"
## [73] "Fence" "MiscFeature" "MiscVal" "MoSold"
## [77] "YrSold" "SaleType" "SaleCondition"
rm(bstSparse)
#Create matrices from the data frames
retrainData<- as.matrix(train, rownames.force=NA)
#Turn the matrices into sparse matrices
retrain <- as(retrainData, "sparseMatrix")
param3 <- list(colsample_bytree = .7,
subsample = .7,
booster = "gbtree",
max_depth = 10,
eta = 0.04,
eval_metric = "rmse",
objective="reg:linear")
retrainD <- xgb.DMatrix(data = retrain[,vars], label = retrain[,"SalePrice"])
#retrain the model using those parameters
bstSparse3 <-
xgb.train(params = param3,
data = retrainD,
nrounds = 500,
watchlist = list(train = trainD),
verbose = TRUE,
print_every_n = 100,
nthread = 2)
## [12:20:11] WARNING: amalgamation/../src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
## [1] train-rmse:189823.030525
## [101] train-rmse:11189.402378
## [201] train-rmse:5478.465614
## [301] train-rmse:4650.097411
## [401] train-rmse:4397.783603
## [500] train-rmse:4335.717392
#Column names must match the inputs EXACTLY
prediction <- predict(bstSparse3, predicting[,vars])
prediction <- as.data.frame(as.matrix(prediction)) #Get the dataset formatted as a frame for later combining
colnames(prediction) <- "prediction"
model_output <- cbind(predict, prediction) #Combine the prediction output with the rest of the set
results <- data.frame(Id = model_output$Id, SalePrice = model_output$prediction)
length(model_output$prediction)
## [1] 1459
#Result
write.csv(results, file = "Prediction.csv", row.names = F)
head(results$SalePrice)
## [1] 123595.1 156060.7 185800.5 189598.8 186934.7 173599.3
The file has a sales price prediction for the house_testing set.
summary <- read.csv("/Users/jusimioni/Desktop/sample_submission.csv")
head(summary)
## Id SalePrice
## 1 1461 169277.1
## 2 1462 187758.4
## 3 1463 183583.7
## 4 1464 179317.5
## 5 1465 150730.1
## 6 1466 177151.0