getwd()
## [1] "C:/Users/TANAY/Documents"
house<- read.csv("train_housing.csv")
summary(house)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 C (all): 10 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 FV : 65 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 RH : 16 Median : 69.00
## Mean : 730.5 Mean : 56.9 RL :1151 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 RM : 218 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape LandContour
## Min. : 1300 Grvl: 6 Grvl: 50 IR1:484 Bnk: 63
## 1st Qu.: 7554 Pave:1454 Pave: 41 IR2: 41 HLS: 50
## Median : 9478 NA's:1369 IR3: 10 Low: 36
## Mean : 10517 Reg:925 Lvl:1311
## 3rd Qu.: 11602
## Max. :215245
##
## Utilities LotConfig LandSlope Neighborhood Condition1
## AllPub:1459 Corner : 263 Gtl:1382 NAmes :225 Norm :1260
## NoSeWa: 1 CulDSac: 94 Mod: 65 CollgCr:150 Feedr : 81
## FR2 : 47 Sev: 13 OldTown:113 Artery : 48
## FR3 : 4 Edwards:100 RRAn : 26
## Inside :1052 Somerst: 86 PosN : 19
## Gilbert: 79 RRAe : 11
## (Other):707 (Other): 15
## Condition2 BldgType HouseStyle OverallQual
## Norm :1445 1Fam :1220 1Story :726 Min. : 1.000
## Feedr : 6 2fmCon: 31 2Story :445 1st Qu.: 5.000
## Artery : 2 Duplex: 52 1.5Fin :154 Median : 6.000
## PosN : 2 Twnhs : 43 SLvl : 65 Mean : 6.099
## RRNn : 2 TwnhsE: 114 SFoyer : 37 3rd Qu.: 7.000
## PosA : 1 1.5Unf : 14 Max. :10.000
## (Other): 2 (Other): 19
## OverallCond YearBuilt YearRemodAdd RoofStyle
## Min. :1.000 Min. :1872 Min. :1950 Flat : 13
## 1st Qu.:5.000 1st Qu.:1954 1st Qu.:1967 Gable :1141
## Median :5.000 Median :1973 Median :1994 Gambrel: 11
## Mean :5.575 Mean :1971 Mean :1985 Hip : 286
## 3rd Qu.:6.000 3rd Qu.:2000 3rd Qu.:2004 Mansard: 7
## Max. :9.000 Max. :2010 Max. :2010 Shed : 2
##
## RoofMatl Exterior1st Exterior2nd MasVnrType MasVnrArea
## CompShg:1434 VinylSd:515 VinylSd:504 BrkCmn : 15 Min. : 0.0
## Tar&Grv: 11 HdBoard:222 MetalSd:214 BrkFace:445 1st Qu.: 0.0
## WdShngl: 6 MetalSd:220 HdBoard:207 None :864 Median : 0.0
## WdShake: 5 Wd Sdng:206 Wd Sdng:197 Stone :128 Mean : 103.7
## ClyTile: 1 Plywood:108 Plywood:142 NA's : 8 3rd Qu.: 166.0
## Membran: 1 CemntBd: 61 CmentBd: 60 Max. :1600.0
## (Other): 2 (Other):128 (Other):136 NA's :8
## ExterQual ExterCond Foundation BsmtQual BsmtCond BsmtExposure
## Ex: 52 Ex: 3 BrkTil:146 Ex :121 Fa : 45 Av :221
## Fa: 14 Fa: 28 CBlock:634 Fa : 35 Gd : 65 Gd :134
## Gd:488 Gd: 146 PConc :647 Gd :618 Po : 2 Mn :114
## TA:906 Po: 1 Slab : 24 TA :649 TA :1311 No :953
## TA:1282 Stone : 6 NA's: 37 NA's: 37 NA's: 38
## Wood : 3
##
## BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## ALQ :220 Min. : 0.0 ALQ : 19 Min. : 0.00
## BLQ :148 1st Qu.: 0.0 BLQ : 33 1st Qu.: 0.00
## GLQ :418 Median : 383.5 GLQ : 14 Median : 0.00
## LwQ : 74 Mean : 443.6 LwQ : 46 Mean : 46.55
## Rec :133 3rd Qu.: 712.2 Rec : 54 3rd Qu.: 0.00
## Unf :430 Max. :5644.0 Unf :1256 Max. :1474.00
## NA's: 37 NA's: 38
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir
## Min. : 0.0 Min. : 0.0 Floor: 1 Ex:741 N: 95
## 1st Qu.: 223.0 1st Qu.: 795.8 GasA :1428 Fa: 49 Y:1365
## Median : 477.5 Median : 991.5 GasW : 18 Gd:241
## Mean : 567.2 Mean :1057.4 Grav : 7 Po: 1
## 3rd Qu.: 808.0 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :2336.0 Max. :6110.0 Wall : 4
##
## Electrical X1stFlrSF X2ndFlrSF LowQualFinSF
## FuseA: 94 Min. : 334 Min. : 0 Min. : 0.000
## FuseF: 27 1st Qu.: 882 1st Qu.: 0 1st Qu.: 0.000
## FuseP: 3 Median :1087 Median : 0 Median : 0.000
## Mix : 1 Mean :1163 Mean : 347 Mean : 5.845
## SBrkr:1334 3rd Qu.:1391 3rd Qu.: 728 3rd Qu.: 0.000
## NA's : 1 Max. :4692 Max. :2065 Max. :572.000
##
## GrLivArea BsmtFullBath BsmtHalfBath FullBath
## Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:1130 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000
## Median :1464 Median :0.0000 Median :0.00000 Median :2.000
## Mean :1515 Mean :0.4253 Mean :0.05753 Mean :1.565
## 3rd Qu.:1777 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000
## Max. :5642 Max. :3.0000 Max. :2.00000 Max. :3.000
##
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual
## Min. :0.0000 Min. :0.000 Min. :0.000 Ex:100
## 1st Qu.:0.0000 1st Qu.:2.000 1st Qu.:1.000 Fa: 39
## Median :0.0000 Median :3.000 Median :1.000 Gd:586
## Mean :0.3829 Mean :2.866 Mean :1.047 TA:735
## 3rd Qu.:1.0000 3rd Qu.:3.000 3rd Qu.:1.000
## Max. :2.0000 Max. :8.000 Max. :3.000
##
## TotRmsAbvGrd Functional Fireplaces FireplaceQu GarageType
## Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24 2Types : 6
## 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33 Attchd :870
## Median : 6.000 Min1: 31 Median :1.000 Gd :380 Basment: 19
## Mean : 6.518 Min2: 34 Mean :0.613 Po : 20 BuiltIn: 88
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313 CarPort: 9
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690 Detchd :387
## Typ :1360 NA's : 81
## GarageYrBlt GarageFinish GarageCars GarageArea GarageQual
## Min. :1900 Fin :352 Min. :0.000 Min. : 0.0 Ex : 3
## 1st Qu.:1961 RFn :422 1st Qu.:1.000 1st Qu.: 334.5 Fa : 48
## Median :1980 Unf :605 Median :2.000 Median : 480.0 Gd : 14
## Mean :1979 NA's: 81 Mean :1.767 Mean : 473.0 Po : 3
## 3rd Qu.:2002 3rd Qu.:2.000 3rd Qu.: 576.0 TA :1311
## Max. :2010 Max. :4.000 Max. :1418.0 NA's: 81
## NA's :81
## GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## Ex : 2 N: 90 Min. : 0.00 Min. : 0.00 Min. : 0.00
## Fa : 35 P: 30 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00
## Gd : 9 Y:1340 Median : 0.00 Median : 25.00 Median : 0.00
## Po : 7 Mean : 94.24 Mean : 46.66 Mean : 21.95
## TA :1326 3rd Qu.:168.00 3rd Qu.: 68.00 3rd Qu.: 0.00
## NA's: 81 Max. :857.00 Max. :547.00 Max. :552.00
##
## X3SsnPorch ScreenPorch PoolArea PoolQC
## Min. : 0.00 Min. : 0.00 Min. : 0.000 Ex : 2
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000 Fa : 2
## Median : 0.00 Median : 0.00 Median : 0.000 Gd : 3
## Mean : 3.41 Mean : 15.06 Mean : 2.759 NA's:1453
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :508.00 Max. :480.00 Max. :738.000
##
## Fence MiscFeature MiscVal MoSold
## GdPrv: 59 Gar2: 2 Min. : 0.00 Min. : 1.000
## GdWo : 54 Othr: 2 1st Qu.: 0.00 1st Qu.: 5.000
## MnPrv: 157 Shed: 49 Median : 0.00 Median : 6.000
## MnWw : 11 TenC: 1 Mean : 43.49 Mean : 6.322
## NA's :1179 NA's:1406 3rd Qu.: 0.00 3rd Qu.: 8.000
## Max. :15500.00 Max. :12.000
##
## YrSold SaleType SaleCondition SalePrice
## Min. :2006 WD :1267 Abnorml: 101 Min. : 34900
## 1st Qu.:2007 New : 122 AdjLand: 4 1st Qu.:129975
## Median :2008 COD : 43 Alloca : 12 Median :163000
## Mean :2008 ConLD : 9 Family : 20 Mean :180921
## 3rd Qu.:2009 ConLI : 5 Normal :1198 3rd Qu.:214000
## Max. :2010 ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
select_var <- c('Id','MSZoning','Utilities', 'Neighborhood','BldgType','HouseStyle',
'OverallQual','OverallCond','YearBuilt', 'ExterQual','ExterCond',
'BsmtQual','BsmtCond','TotalBsmtSF','Heating','HeatingQC',
'CentralAir','Electrical','GrLivArea','BedroomAbvGr','KitchenAbvGr',
'KitchenQual','TotRmsAbvGrd','Functional','Fireplaces','FireplaceQu',
'GarageArea','GarageQual','GarageCond','OpenPorchSF','PoolArea',
'Fence','MoSold','YrSold','SaleType','SaleCondition','SalePrice')
select_train <- house[,select_var]
head(select_train)
## Id MSZoning Utilities Neighborhood BldgType HouseStyle OverallQual
## 1 1 RL AllPub CollgCr 1Fam 2Story 7
## 2 2 RL AllPub Veenker 1Fam 1Story 6
## 3 3 RL AllPub CollgCr 1Fam 2Story 7
## 4 4 RL AllPub Crawfor 1Fam 2Story 7
## 5 5 RL AllPub NoRidge 1Fam 2Story 8
## 6 6 RL AllPub Mitchel 1Fam 1.5Fin 5
## OverallCond YearBuilt ExterQual ExterCond BsmtQual BsmtCond TotalBsmtSF
## 1 5 2003 Gd TA Gd TA 856
## 2 8 1976 TA TA Gd TA 1262
## 3 5 2001 Gd TA Gd TA 920
## 4 5 1915 TA TA TA Gd 756
## 5 5 2000 Gd TA Gd TA 1145
## 6 5 1993 TA TA Gd TA 796
## Heating HeatingQC CentralAir Electrical GrLivArea BedroomAbvGr
## 1 GasA Ex Y SBrkr 1710 3
## 2 GasA Ex Y SBrkr 1262 3
## 3 GasA Ex Y SBrkr 1786 3
## 4 GasA Gd Y SBrkr 1717 3
## 5 GasA Ex Y SBrkr 2198 4
## 6 GasA Ex Y SBrkr 1362 1
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## 1 1 Gd 8 Typ 0 <NA>
## 2 1 TA 6 Typ 1 TA
## 3 1 Gd 6 Typ 1 TA
## 4 1 Gd 7 Typ 1 Gd
## 5 1 Gd 9 Typ 1 TA
## 6 1 TA 5 Typ 0 <NA>
## GarageArea GarageQual GarageCond OpenPorchSF PoolArea Fence MoSold
## 1 548 TA TA 61 0 <NA> 2
## 2 460 TA TA 0 0 <NA> 5
## 3 608 TA TA 42 0 <NA> 9
## 4 642 TA TA 35 0 <NA> 2
## 5 836 TA TA 84 0 <NA> 12
## 6 480 TA TA 30 0 MnPrv 10
## YrSold SaleType SaleCondition SalePrice
## 1 2008 WD Normal 208500
## 2 2007 WD Normal 181500
## 3 2008 WD Normal 223500
## 4 2006 WD Abnorml 140000
## 5 2008 WD Normal 250000
## 6 2009 WD Normal 143000
summary(select_train)
## Id MSZoning Utilities Neighborhood BldgType
## Min. : 1.0 C (all): 10 AllPub:1459 NAmes :225 1Fam :1220
## 1st Qu.: 365.8 FV : 65 NoSeWa: 1 CollgCr:150 2fmCon: 31
## Median : 730.5 RH : 16 OldTown:113 Duplex: 52
## Mean : 730.5 RL :1151 Edwards:100 Twnhs : 43
## 3rd Qu.:1095.2 RM : 218 Somerst: 86 TwnhsE: 114
## Max. :1460.0 Gilbert: 79
## (Other):707
## HouseStyle OverallQual OverallCond YearBuilt ExterQual
## 1Story :726 Min. : 1.000 Min. :1.000 Min. :1872 Ex: 52
## 2Story :445 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954 Fa: 14
## 1.5Fin :154 Median : 6.000 Median :5.000 Median :1973 Gd:488
## SLvl : 65 Mean : 6.099 Mean :5.575 Mean :1971 TA:906
## SFoyer : 37 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## 1.5Unf : 14 Max. :10.000 Max. :9.000 Max. :2010
## (Other): 19
## ExterCond BsmtQual BsmtCond TotalBsmtSF Heating HeatingQC
## Ex: 3 Ex :121 Fa : 45 Min. : 0.0 Floor: 1 Ex:741
## Fa: 28 Fa : 35 Gd : 65 1st Qu.: 795.8 GasA :1428 Fa: 49
## Gd: 146 Gd :618 Po : 2 Median : 991.5 GasW : 18 Gd:241
## Po: 1 TA :649 TA :1311 Mean :1057.4 Grav : 7 Po: 1
## TA:1282 NA's: 37 NA's: 37 3rd Qu.:1298.2 OthW : 2 TA:428
## Max. :6110.0 Wall : 4
##
## CentralAir Electrical GrLivArea BedroomAbvGr KitchenAbvGr
## N: 95 FuseA: 94 Min. : 334 Min. :0.000 Min. :0.000
## Y:1365 FuseF: 27 1st Qu.:1130 1st Qu.:2.000 1st Qu.:1.000
## FuseP: 3 Median :1464 Median :3.000 Median :1.000
## Mix : 1 Mean :1515 Mean :2.866 Mean :1.047
## SBrkr:1334 3rd Qu.:1777 3rd Qu.:3.000 3rd Qu.:1.000
## NA's : 1 Max. :5642 Max. :8.000 Max. :3.000
##
## KitchenQual TotRmsAbvGrd Functional Fireplaces FireplaceQu
## Ex:100 Min. : 2.000 Maj1: 14 Min. :0.000 Ex : 24
## Fa: 39 1st Qu.: 5.000 Maj2: 5 1st Qu.:0.000 Fa : 33
## Gd:586 Median : 6.000 Min1: 31 Median :1.000 Gd :380
## TA:735 Mean : 6.518 Min2: 34 Mean :0.613 Po : 20
## 3rd Qu.: 7.000 Mod : 15 3rd Qu.:1.000 TA :313
## Max. :14.000 Sev : 1 Max. :3.000 NA's:690
## Typ :1360
## GarageArea GarageQual GarageCond OpenPorchSF
## Min. : 0.0 Ex : 3 Ex : 2 Min. : 0.00
## 1st Qu.: 334.5 Fa : 48 Fa : 35 1st Qu.: 0.00
## Median : 480.0 Gd : 14 Gd : 9 Median : 25.00
## Mean : 473.0 Po : 3 Po : 7 Mean : 46.66
## 3rd Qu.: 576.0 TA :1311 TA :1326 3rd Qu.: 68.00
## Max. :1418.0 NA's: 81 NA's: 81 Max. :547.00
##
## PoolArea Fence MoSold YrSold
## Min. : 0.000 GdPrv: 59 Min. : 1.000 Min. :2006
## 1st Qu.: 0.000 GdWo : 54 1st Qu.: 5.000 1st Qu.:2007
## Median : 0.000 MnPrv: 157 Median : 6.000 Median :2008
## Mean : 2.759 MnWw : 11 Mean : 6.322 Mean :2008
## 3rd Qu.: 0.000 NA's :1179 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :738.000 Max. :12.000 Max. :2010
##
## SaleType SaleCondition SalePrice
## WD :1267 Abnorml: 101 Min. : 34900
## New : 122 AdjLand: 4 1st Qu.:129975
## COD : 43 Alloca : 12 Median :163000
## ConLD : 9 Family : 20 Mean :180921
## ConLI : 5 Normal :1198 3rd Qu.:214000
## ConLw : 5 Partial: 125 Max. :755000
## (Other): 9
summary(select_train$SalePrice)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 34900 129975 163000 180921 214000 755000
summary(select_train$MSZoning)
## C (all) FV RH RL RM
## 10 65 16 1151 218
summary(select_train$SaleCondition)
## Abnorml AdjLand Alloca Family Normal Partial
## 101 4 12 20 1198 125
summary(select_train$Utilities)
## AllPub NoSeWa
## 1459 1
summary(select_train$HouseStyle)
## 1.5Fin 1.5Unf 1Story 2.5Fin 2.5Unf 2Story SFoyer SLvl
## 154 14 726 8 11 445 37 65
library(plyr)
ddply(select_train, .(BldgType), summarize,Total = length(BldgType),Max_price=max(SalePrice),Min_price=min(SalePrice))
## BldgType Total Max_price Min_price
## 1 1Fam 1220 755000 34900
## 2 2fmCon 31 228950 55000
## 3 Duplex 52 206300 82000
## 4 Twnhs 43 230000 75000
## 5 TwnhsE 114 392500 75500
select_train$HouseStyle2 <- as.numeric(factor(select_train$HouseStyle,
levels = c("1.5Fin", "1.5Unf","1Story", "2.5Fin","2.5Unf","2Story","SFoyer","SLvl"),
labels = c(8,7,6,5,2,4,3,1) ,ordered = TRUE))
PLOTS
library(lattice)
library(ggplot2)
histogram(select_train$SalePrice, Main="Range of Sale Price", xlab="Sale price", ylab="Count of houses")

boxplot(select_train$SalePrice, Main="Range of Sale Price", xlab="Sale price", horizontal = TRUE)

ggplot(select_train, aes(x = SalePrice, fill = ..count..)) +
geom_histogram(binwidth = 5000) +
ggtitle("Figure 1 Histogram of SalePrice") +
ylab("Count of houses") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.5))

ggplot(select_train, aes(x=MSZoning, y=SalePrice, fill=MSZoning)) +
geom_boxplot(alpha=0.3) +
stat_summary(fun.y=mean, geom="point", shape=20, size=4, color="red", fill="red")+
theme(legend.position="none")+
ggtitle("Figure 4 Boxplot of SalePrice by MSZoning")+
theme(plot.title = element_text(hjust = 0.5))

ggplot(select_train, aes(SalePrice)) +
geom_histogram(aes(fill = BldgType), position = position_stack(reverse = TRUE), binwidth = 20000) +
coord_flip() + ggtitle("Figure 5 Histogram of SalePrice") +
ylab("Count") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.5),legend.position=c(0.9,0.8), legend.background = element_rect(fill="grey90",
size=0.5, linetype="solid",
colour ="black"))

library("car")
scatterplot(select_train$SalePrice~select_train$OverallQual, main="Sale price Vs Overall quality", ylab="Sale price", xlab="Overall quality")

ggplot(select_train, aes(x=TotalBsmtSF, y=SalePrice)) +
geom_point(shape=1) +
geom_smooth(method=lm , color="red", se=FALSE)+
ggtitle("Figure 9 Scatter plot of SalePrice and TotalBsmtSF") +
theme(plot.title = element_text(hjust = 0.4))

CORRELATIONS
cor.test(select_train$SalePrice,select_train$YrSold)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$YrSold
## t = -1.1048, df = 1458, p-value = 0.2694
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08010603 0.02241298
## sample estimates:
## cor
## -0.02892259
cor.test(select_train$SalePrice,select_train$OverallQual)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$OverallQual
## t = 49.364, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7709644 0.8094376
## sample estimates:
## cor
## 0.7909816
cor.test(select_train$SalePrice,select_train$OverallCond)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$OverallCond
## t = -2.9819, df = 1458, p-value = 0.002912
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.12864437 -0.02666008
## sample estimates:
## cor
## -0.07785589
cor.test(select_train$SalePrice,select_train$GrLivArea)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$GrLivArea
## t = 38.348, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6821200 0.7332695
## sample estimates:
## cor
## 0.7086245
cor.test(select_train$SalePrice,select_train$TotalBsmtSF)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$TotalBsmtSF
## t = 29.671, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5805529 0.6445923
## sample estimates:
## cor
## 0.6135806
cor.test(select_train$SalePrice,select_train$BedroomAbvGr)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$BedroomAbvGr
## t = 6.5159, df = 1458, p-value = 9.927e-11
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1179285 0.2176373
## sample estimates:
## cor
## 0.1682132
cor.test(select_train$SalePrice,select_train$TotRmsAbvGrd)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$TotRmsAbvGrd
## t = 24.099, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4960020 0.5694337
## sample estimates:
## cor
## 0.5337232
cor.test(select_train$SalePrice,select_train$GarageArea)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$GarageArea
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5910324 0.6538222
## sample estimates:
## cor
## 0.6234314
cor.test(select_train$SalePrice,select_train$OpenPorchSF)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$OpenPorchSF
## t = 12.711, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2689114 0.3613039
## sample estimates:
## cor
## 0.3158562
cor.test(select_train$SalePrice,select_train$PoolArea)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$PoolArea
## t = 3.5435, df = 1458, p-value = 0.0004073
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.04129701 0.14302783
## sample estimates:
## cor
## 0.09240355
cor.test(select_train$SalePrice,select_train$YearBuilt)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$YearBuilt
## t = 23.424, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4845947 0.5591987
## sample estimates:
## cor
## 0.5228973
cor.test(select_train$SalePrice,select_train$HouseStyle2)
##
## Pearson's product-moment correlation
##
## data: select_train$SalePrice and select_train$HouseStyle2
## t = 6.9937, df = 1458, p-value = 4.064e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1300625 0.2293451
## sample estimates:
## cor
## 0.1801626
MODEL
select_train$ExterCond2 <- as.numeric(factor(select_train$ExterCond,
levels = c("Ex", "Fa","Gd", "TA","Po"),
labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$HeatingQC2 <- as.numeric(factor(select_train$HeatingQC,
levels = c("Ex", "Fa","Gd", "TA","Po"),
labels = c(5,2,4,3,1) ,ordered = TRUE))
select_train$CentralAir2 <- as.numeric(factor(select_train$CentralAir,
levels = c("N", "Y"),
labels = c(0,1) ,ordered = TRUE))
model_var <- c('SalePrice',
'OverallQual','OverallCond','YearBuilt','ExterCond2',
'TotalBsmtSF','HeatingQC2',
'CentralAir2','GrLivArea','BedroomAbvGr','KitchenAbvGr',
'TotRmsAbvGrd','Fireplaces',
'GarageArea','OpenPorchSF','PoolArea',
'YrSold')
heat <- select_train[,model_var]
library("corrplot")
## corrplot 0.84 loaded
corrplot(corr=cor(heat ,use="complete.obs"), method="ellipse")

library("corrgram")
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:plyr':
##
## baseball
corrgram(heat,upper.panel=panel.pie, main="Corrgram of real estate Data variables")

#Using linear regression model
model1 <- lm(SalePrice~.-SalePrice ,data=heat)
summary(model1)
##
## Call:
## lm(formula = SalePrice ~ . - SalePrice, data = heat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -515179 -18402 -2568 14445 290927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.033e+05 1.462e+06 -0.344 0.73069
## OverallQual 1.573e+04 1.184e+03 13.287 < 2e-16 ***
## OverallCond 6.680e+03 1.017e+03 6.569 7.06e-11 ***
## YearBuilt 4.733e+02 5.055e+01 9.362 < 2e-16 ***
## ExterCond2 -1.143e+03 2.412e+03 -0.474 0.63564
## TotalBsmtSF 2.780e+01 2.815e+00 9.878 < 2e-16 ***
## HeatingQC2 -2.364e+03 8.374e+02 -2.823 0.00482 **
## CentralAir2 -9.472e+03 4.574e+03 -2.071 0.03855 *
## GrLivArea 4.975e+01 4.077e+00 12.203 < 2e-16 ***
## BedroomAbvGr -9.825e+03 1.721e+03 -5.710 1.37e-08 ***
## KitchenAbvGr -2.254e+04 4.978e+03 -4.528 6.45e-06 ***
## TotRmsAbvGrd 5.407e+03 1.282e+03 4.218 2.62e-05 ***
## Fireplaces 8.621e+03 1.767e+03 4.880 1.18e-06 ***
## GarageArea 4.292e+01 5.960e+00 7.201 9.61e-13 ***
## OpenPorchSF -1.514e+01 1.576e+01 -0.961 0.33688
## PoolArea -3.215e+01 2.460e+01 -1.307 0.19140
## YrSold -2.326e+02 7.277e+02 -0.320 0.74933
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 36690 on 1443 degrees of freedom
## Multiple R-squared: 0.7891, Adjusted R-squared: 0.7868
## F-statistic: 337.4 on 16 and 1443 DF, p-value: < 2.2e-16
model_lin <- select_train[, model_var]
model_lin$lSalePrice <- log(model_lin$SalePrice)
set.seed(10000)
train.index <- sample(c(1:dim(model_lin)[1]), dim(model_lin)[1]*0.8)
model_lin_train = model_lin[train.index,]
model_lin_valid <- model_lin[-train.index,]
model2<- lm(SalePrice~.-SalePrice ,data=model_lin_train)
summary(model2)
##
## Call:
## lm(formula = SalePrice ~ . - SalePrice, data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -97776 -11306 -4266 5851 257436
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.169e+06 1.007e+06 -3.146 0.001696 **
## OverallQual 1.787e+03 8.864e+02 2.016 0.044082 *
## OverallCond -2.592e+03 7.486e+02 -3.463 0.000554 ***
## YearBuilt -1.009e+02 3.849e+01 -2.620 0.008898 **
## ExterCond2 -4.763e+03 1.706e+03 -2.793 0.005313 **
## TotalBsmtSF 9.338e+00 2.221e+00 4.205 2.82e-05 ***
## HeatingQC2 5.327e+02 5.864e+02 0.908 0.363812
## CentralAir2 -2.121e+04 3.262e+03 -6.503 1.17e-10 ***
## GrLivArea 1.984e+01 3.027e+00 6.555 8.40e-11 ***
## BedroomAbvGr -9.137e+03 1.205e+03 -7.585 6.85e-14 ***
## KitchenAbvGr -1.117e+04 3.603e+03 -3.101 0.001975 **
## TotRmsAbvGrd 1.075e+03 8.807e+02 1.221 0.222339
## Fireplaces -2.811e+03 1.276e+03 -2.203 0.027822 *
## GarageArea 1.761e+00 4.329e+00 0.407 0.684175
## OpenPorchSF -8.383e+00 1.064e+01 -0.788 0.430846
## PoolArea 5.335e+01 1.579e+01 3.378 0.000754 ***
## YrSold 7.727e+02 5.017e+02 1.540 0.123790
## lSalePrice 1.705e+05 4.794e+03 35.576 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22580 on 1150 degrees of freedom
## Multiple R-squared: 0.9179, Adjusted R-squared: 0.9167
## F-statistic: 756.7 on 17 and 1150 DF, p-value: < 2.2e-16
linreg <- lm(lSalePrice~.-SalePrice, data = model_lin_train)
summary(linreg)
##
## Call:
## lm(formula = lSalePrice ~ . - SalePrice, data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.58114 -0.06702 0.00342 0.07786 0.44064
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.602e+00 6.190e+00 0.905 0.36565
## OverallQual 6.725e-02 5.077e-03 13.246 < 2e-16 ***
## OverallCond 5.923e-02 4.259e-03 13.907 < 2e-16 ***
## YearBuilt 3.345e-03 2.152e-04 15.543 < 2e-16 ***
## ExterCond2 1.736e-02 1.047e-02 1.658 0.09769 .
## TotalBsmtSF 1.782e-04 1.261e-05 14.133 < 2e-16 ***
## HeatingQC2 -1.633e-02 3.573e-03 -4.570 5.41e-06 ***
## CentralAir2 5.788e-02 1.998e-02 2.896 0.00385 **
## GrLivArea 2.319e-04 1.732e-05 13.392 < 2e-16 ***
## BedroomAbvGr -1.188e-02 7.399e-03 -1.606 0.10866
## KitchenAbvGr -8.760e-02 2.200e-02 -3.981 7.29e-05 ***
## TotRmsAbvGrd 1.646e-02 5.394e-03 3.052 0.00233 **
## Fireplaces 6.310e-02 7.625e-03 8.276 3.49e-16 ***
## GarageArea 2.488e-04 2.559e-05 9.723 < 2e-16 ***
## OpenPorchSF -1.497e-07 6.541e-05 -0.002 0.99817
## PoolArea 1.315e-04 9.702e-05 1.355 0.17571
## YrSold -8.613e-04 3.085e-03 -0.279 0.78013
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1389 on 1151 degrees of freedom
## Multiple R-squared: 0.8787, Adjusted R-squared: 0.877
## F-statistic: 521 on 16 and 1151 DF, p-value: < 2.2e-16
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
#use predict() to make prediction on a new set
pred1 <- predict(linreg,model_lin_valid,type = "response")
residuals <- model_lin_valid$lSalePrice - pred1
linreg_pred <- data.frame("Predicted" = pred1, "Actual" = model_lin_valid$lSalePrice, "Residual" = residuals)
accuracy(pred1, model_lin_valid$lSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -0.01358129 0.2256403 0.1155791 -0.1262696 0.9675185
pred1train <- predict(linreg,model_lin_train,type = "response")
residualstrain <- model_lin_train$lSalePrice - pred1train
linreg_predtrain <- data.frame("Predicted" = pred1train, "Actual" = model_lin_train$lSalePrice, "Residual" = residualstrain)
accuracy(pred1train, model_lin_train$lSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -6.963971e-15 0.1378493 0.09716002 -0.0134659 0.8122805
#To get back actual Sales price values
pred1SP <- exp(pred1)
residualsSP <- model_lin_valid$SalePrice - pred1SP
linreg_predSP <- data.frame("Predicted" = pred1SP, "Actual" = model_lin_valid$SalePrice, "Residual" = residualsSP)
accuracy(pred1SP, model_lin_valid$SalePrice)
## ME RMSE MAE MPE MAPE
## Test set -7889.941 150563.4 27609.39 -7.064565 16.38413
pred2<- predict(model2,model_lin_valid,type = "response")
residuals2 <- model_lin_valid$SalePrice - pred2
model2_pred <- data.frame("Predicted" = pred2, "Actual" = model_lin_valid$SalePrice, "Residual" = residuals2)
accuracy(pred2, model_lin_valid$SalePrice)
## ME RMSE MAE MPE MAPE
## Test set 753.8356 24739.1 14932.91 1.386573 9.730968
pred2train<- predict(model2,model_lin_train,type = "response")
residuals2train <- model_lin_train$SalePrice - pred2train
model2_predtrain <- data.frame("Predicted" = pred2train, "Actual" = model_lin_train$SalePrice, "Residual" = residuals2train)
accuracy(pred2train, model_lin_train$SalePrice)
## ME RMSE MAE MPE MAPE
## Test set 6.346432e-09 22409.07 13554.7 1.526914 9.291755
library(gbm)
## Warning: package 'gbm' was built under R version 3.4.4
## Loading required package: survival
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.3
gbm1 <- gbm(lSalePrice~.-SalePrice, data = model_lin_train, distribution = "gaussian", n.trees = 10000, interaction.depth = 4, shrinkage = 0.01)
summary(gbm1)

## var rel.inf
## OverallQual OverallQual 32.3510863
## GrLivArea GrLivArea 21.2835300
## TotalBsmtSF TotalBsmtSF 12.7418111
## YearBuilt YearBuilt 9.9735082
## GarageArea GarageArea 8.2870102
## OverallCond OverallCond 4.1620693
## OpenPorchSF OpenPorchSF 2.8860044
## Fireplaces Fireplaces 2.3703428
## CentralAir2 CentralAir2 1.8781091
## TotRmsAbvGrd TotRmsAbvGrd 0.8918616
## ExterCond2 ExterCond2 0.7514918
## HeatingQC2 HeatingQC2 0.7201534
## YrSold YrSold 0.6993501
## BedroomAbvGr BedroomAbvGr 0.5546517
## KitchenAbvGr KitchenAbvGr 0.4490200
## PoolArea PoolArea 0.0000000
#n.trees = seq(from=100 ,to=10000, by=100)
predgbm <- predict(gbm1,model_lin_valid,type = "response", n.trees=100)
residualsgbm <- model_lin_valid$lSalePrice - predgbm
gbm_pred <- data.frame("Predicted" = predgbm, "Actual" = model_lin_valid$lSalePrice, "Residual" = residualsgbm)
accuracy(predgbm, model_lin_valid$lSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -0.01658722 0.2518658 0.1850822 -0.2050172 1.543412
#RANDOM FOREST
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.4.4
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
RF <- randomForest(lSalePrice ~.-SalePrice, data = model_lin_train,
importance =TRUE,ntree=500,nodesize=7, na.action=na.roughfix)
rf.pred <- predict(RF, newdata=model_lin_valid )
residualsrf <- model_lin_valid$lSalePrice - rf.pred
rf_pred <- data.frame("Predicted" = rf.pred, "Actual" = model_lin_valid$lSalePrice, "Residual" = residualsrf)
accuracy(rf.pred, model_lin_valid$lSalePrice)
## ME RMSE MAE MPE MAPE
## Test set -0.01170986 0.1633046 0.1061606 -0.1182802 0.8878536
# RF using sales price directly
rfmodel <- randomForest(SalePrice ~.-SalePrice, data = model_lin_train,
importance =TRUE,ntree=500,nodesize=7, na.action=na.roughfix)
rfpredsp <- predict(rfmodel, newdata=model_lin_valid )
residualsrfsp <- model_lin_valid$SalePrice - rfpredsp
rf_pred <- data.frame("Predicted" = rfpredsp, "Actual" = model_lin_valid$SalePrice, "Residual" = residualsrfsp)
accuracy(rfpredsp, model_lin_valid$SalePrice)
## ME RMSE MAE MPE MAPE
## Test set -1029.869 13463.5 6288.608 -1.786636 3.685443