library(tidyverse)
library(car)
library(matrixcalc)
library(MASS)
Load the data:
# Read the training and test sets from the local data folder.
train <- read.csv(file = "./house_prices_data/train.csv")
test <- read.csv(file = "./house_prices_data/test.csv")
# List the columns available in the training set.
colnames(train)
## [1] "Id" "MSSubClass" "MSZoning" "LotFrontage"
## [5] "LotArea" "Street" "Alley" "LotShape"
## [9] "LandContour" "Utilities" "LotConfig" "LandSlope"
## [13] "Neighborhood" "Condition1" "Condition2" "BldgType"
## [17] "HouseStyle" "OverallQual" "OverallCond" "YearBuilt"
## [21] "YearRemodAdd" "RoofStyle" "RoofMatl" "Exterior1st"
## [25] "Exterior2nd" "MasVnrType" "MasVnrArea" "ExterQual"
## [29] "ExterCond" "Foundation" "BsmtQual" "BsmtCond"
## [33] "BsmtExposure" "BsmtFinType1" "BsmtFinSF1" "BsmtFinType2"
## [37] "BsmtFinSF2" "BsmtUnfSF" "TotalBsmtSF" "Heating"
## [41] "HeatingQC" "CentralAir" "Electrical" "X1stFlrSF"
## [45] "X2ndFlrSF" "LowQualFinSF" "GrLivArea" "BsmtFullBath"
## [49] "BsmtHalfBath" "FullBath" "HalfBath" "BedroomAbvGr"
## [53] "KitchenAbvGr" "KitchenQual" "TotRmsAbvGrd" "Functional"
## [57] "Fireplaces" "FireplaceQu" "GarageType" "GarageYrBlt"
## [61] "GarageFinish" "GarageCars" "GarageArea" "GarageQual"
## [65] "GarageCond" "PavedDrive" "WoodDeckSF" "OpenPorchSF"
## [69] "EnclosedPorch" "X3SsnPorch" "ScreenPorch" "PoolArea"
## [73] "PoolQC" "Fence" "MiscFeature" "MiscVal"
## [77] "MoSold" "YrSold" "SaleType" "SaleCondition"
## [81] "SalePrice"
# Exploratory scatterplots of sale price against three predictors.
scatterplot(
  SalePrice ~ YearBuilt,
  data = train,
  xlab = "Year Built",
  ylab = "Sale Price",
  grid = FALSE
)
scatterplot(
  SalePrice ~ YrSold,
  data = train,
  xlab = "Year Sold",
  ylab = "Sale Price",
  grid = FALSE
)
scatterplot(
  SalePrice ~ X1stFlrSF,
  data = train,
  xlab = "Square Footage Floor 1",
  ylab = "Sale Price",
  grid = FALSE
)
# Per-column descriptive statistics for the training set.
summary(object = train)
## Id MSSubClass MSZoning LotFrontage
## Min. : 1.0 Min. : 20.0 Length:1460 Min. : 21.00
## 1st Qu.: 365.8 1st Qu.: 20.0 Class :character 1st Qu.: 59.00
## Median : 730.5 Median : 50.0 Mode :character Median : 69.00
## Mean : 730.5 Mean : 56.9 Mean : 70.05
## 3rd Qu.:1095.2 3rd Qu.: 70.0 3rd Qu.: 80.00
## Max. :1460.0 Max. :190.0 Max. :313.00
## NA's :259
## LotArea Street Alley LotShape
## Min. : 1300 Length:1460 Length:1460 Length:1460
## 1st Qu.: 7554 Class :character Class :character Class :character
## Median : 9478 Mode :character Mode :character Mode :character
## Mean : 10517
## 3rd Qu.: 11602
## Max. :215245
##
## LandContour Utilities LotConfig LandSlope
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Neighborhood Condition1 Condition2 BldgType
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## HouseStyle OverallQual OverallCond YearBuilt
## Length:1460 Min. : 1.000 Min. :1.000 Min. :1872
## Class :character 1st Qu.: 5.000 1st Qu.:5.000 1st Qu.:1954
## Mode :character Median : 6.000 Median :5.000 Median :1973
## Mean : 6.099 Mean :5.575 Mean :1971
## 3rd Qu.: 7.000 3rd Qu.:6.000 3rd Qu.:2000
## Max. :10.000 Max. :9.000 Max. :2010
##
## YearRemodAdd RoofStyle RoofMatl Exterior1st
## Min. :1950 Length:1460 Length:1460 Length:1460
## 1st Qu.:1967 Class :character Class :character Class :character
## Median :1994 Mode :character Mode :character Mode :character
## Mean :1985
## 3rd Qu.:2004
## Max. :2010
##
## Exterior2nd MasVnrType MasVnrArea ExterQual
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 0.0 Mode :character
## Mean : 103.7
## 3rd Qu.: 166.0
## Max. :1600.0
## NA's :8
## ExterCond Foundation BsmtQual BsmtCond
## Length:1460 Length:1460 Length:1460 Length:1460
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## Length:1460 Length:1460 Min. : 0.0 Length:1460
## Class :character Class :character 1st Qu.: 0.0 Class :character
## Mode :character Mode :character Median : 383.5 Mode :character
## Mean : 443.6
## 3rd Qu.: 712.2
## Max. :5644.0
##
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Length:1460
## 1st Qu.: 0.00 1st Qu.: 223.0 1st Qu.: 795.8 Class :character
## Median : 0.00 Median : 477.5 Median : 991.5 Mode :character
## Mean : 46.55 Mean : 567.2 Mean :1057.4
## 3rd Qu.: 0.00 3rd Qu.: 808.0 3rd Qu.:1298.2
## Max. :1474.00 Max. :2336.0 Max. :6110.0
##
## HeatingQC CentralAir Electrical X1stFlrSF
## Length:1460 Length:1460 Length:1460 Min. : 334
## Class :character Class :character Class :character 1st Qu.: 882
## Mode :character Mode :character Mode :character Median :1087
## Mean :1163
## 3rd Qu.:1391
## Max. :4692
##
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath
## Min. : 0 Min. : 0.000 Min. : 334 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.000 1st Qu.:1130 1st Qu.:0.0000
## Median : 0 Median : 0.000 Median :1464 Median :0.0000
## Mean : 347 Mean : 5.845 Mean :1515 Mean :0.4253
## 3rd Qu.: 728 3rd Qu.: 0.000 3rd Qu.:1777 3rd Qu.:1.0000
## Max. :2065 Max. :572.000 Max. :5642 Max. :3.0000
##
## BsmtHalfBath FullBath HalfBath BedroomAbvGr
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.00000 Median :2.000 Median :0.0000 Median :3.000
## Mean :0.05753 Mean :1.565 Mean :0.3829 Mean :2.866
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :8.000
##
## KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## Min. :0.000 Length:1460 Min. : 2.000 Length:1460
## 1st Qu.:1.000 Class :character 1st Qu.: 5.000 Class :character
## Median :1.000 Mode :character Median : 6.000 Mode :character
## Mean :1.047 Mean : 6.518
## 3rd Qu.:1.000 3rd Qu.: 7.000
## Max. :3.000 Max. :14.000
##
## Fireplaces FireplaceQu GarageType GarageYrBlt
## Min. :0.000 Length:1460 Length:1460 Min. :1900
## 1st Qu.:0.000 Class :character Class :character 1st Qu.:1961
## Median :1.000 Mode :character Mode :character Median :1980
## Mean :0.613 Mean :1979
## 3rd Qu.:1.000 3rd Qu.:2002
## Max. :3.000 Max. :2010
## NA's :81
## GarageFinish GarageCars GarageArea GarageQual
## Length:1460 Min. :0.000 Min. : 0.0 Length:1460
## Class :character 1st Qu.:1.000 1st Qu.: 334.5 Class :character
## Mode :character Median :2.000 Median : 480.0 Mode :character
## Mean :1.767 Mean : 473.0
## 3rd Qu.:2.000 3rd Qu.: 576.0
## Max. :4.000 Max. :1418.0
##
## GarageCond PavedDrive WoodDeckSF OpenPorchSF
## Length:1460 Length:1460 Min. : 0.00 Min. : 0.00
## Class :character Class :character 1st Qu.: 0.00 1st Qu.: 0.00
## Mode :character Mode :character Median : 0.00 Median : 25.00
## Mean : 94.24 Mean : 46.66
## 3rd Qu.:168.00 3rd Qu.: 68.00
## Max. :857.00 Max. :547.00
##
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea
## Min. : 0.00 Min. : 0.00 Min. : 0.00 Min. : 0.000
## 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.: 0.000
## Median : 0.00 Median : 0.00 Median : 0.00 Median : 0.000
## Mean : 21.95 Mean : 3.41 Mean : 15.06 Mean : 2.759
## 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.00 3rd Qu.: 0.000
## Max. :552.00 Max. :508.00 Max. :480.00 Max. :738.000
##
## PoolQC Fence MiscFeature MiscVal
## Length:1460 Length:1460 Length:1460 Min. : 0.00
## Class :character Class :character Class :character 1st Qu.: 0.00
## Mode :character Mode :character Mode :character Median : 0.00
## Mean : 43.49
## 3rd Qu.: 0.00
## Max. :15500.00
##
## MoSold YrSold SaleType SaleCondition
## Min. : 1.000 Min. :2006 Length:1460 Length:1460
## 1st Qu.: 5.000 1st Qu.:2007 Class :character Class :character
## Median : 6.000 Median :2008 Mode :character Mode :character
## Mean : 6.322 Mean :2008
## 3rd Qu.: 8.000 3rd Qu.:2009
## Max. :12.000 Max. :2010
##
## SalePrice
## Min. : 34900
## 1st Qu.:129975
## Median :163000
## Mean :180921
## 3rd Qu.:214000
## Max. :755000
##
# Frequency table: number of sales per year sold.
freq <- table(train$YrSold)
print(freq)
##
## 2006 2007 2008 2009 2010
## 314 329 304 338 175
print("Cumulative Frequency Table")
## [1] "Cumulative Frequency Table"
# Running total of sales across years. The result is stored under the name
# `cumsum`, so the base function is called with an explicit namespace.
cumsum <- base::cumsum(freq)
print(cumsum)
## 2006 2007 2008 2009 2010
## 314 643 947 1285 1460
print("Relative Frequency Table")
## [1] "Relative Frequency Table"
# Share of all sales that fall in each year.
prob <- prop.table(x = freq)
print(prob)
##
## 2006 2007 2008 2009 2010
## 0.2150685 0.2253425 0.2082192 0.2315068 0.1198630
# Scatterplot matrix of sale price against four numeric predictors.
pairs(
  SalePrice ~ YearBuilt + OverallQual + TotalBsmtSF + GrLivArea,
  data = train,
  main = "Simple Scatterplot Matrix"
)
# Keep three quantitative predictors and compute their pairwise correlation
# matrix with cor() at its default settings.
correlation_three <- train %>%
  dplyr::select(YearBuilt, OverallQual, GrLivArea)
corr_matrix <- cor(correlation_three)
print(corr_matrix)
## YearBuilt OverallQual GrLivArea
## YearBuilt 1.0000000 0.5723228 0.1990097
## OverallQual 0.5723228 1.0000000 0.5930074
## GrLivArea 0.1990097 0.5930074 1.0000000
\[ H_0:\rho=0 \ (\text{there is no linear relationship}) \\ H_1:\rho\neq 0 \ (\text{there is a linear relationship}) \]
# Normal Q-Q plot for YearBuilt
car::qqPlot(train$YearBuilt, ylab = "YearBuilt")
## [1] 1350 1138
# Normal Q-Q plot for OverallQual
car::qqPlot(train$OverallQual, ylab = "Overall Quality")
## [1] 376 534
Using Pearson’s correlation test:
# Pearson correlation test for OverallQual vs. GrLivArea, reporting an 80%
# confidence interval for the correlation.
test1 <- cor.test(
  formula = ~ OverallQual + GrLivArea,
  data = correlation_three,
  method = "pearson",
  conf.level = 0.80
)
print(test1)
##
## Pearson's product-moment correlation
##
## data: OverallQual and GrLivArea
## t = 28.121, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5708061 0.6143422
## sample estimates:
## cor
## 0.5930074
# TRUE when the test is significant at the 5% level.
test1[["p.value"]] < 0.05
## [1] TRUE
# Pearson correlation test for OverallQual vs. YearBuilt, reporting an 80%
# confidence interval for the correlation.
test2 <- cor.test(
  formula = ~ OverallQual + YearBuilt,
  data = correlation_three,
  method = "pearson",
  conf.level = 0.80
)
print(test2)
##
## Pearson's product-moment correlation
##
## data: OverallQual and YearBuilt
## t = 26.65, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.5493124 0.5944659
## sample estimates:
## cor
## 0.5723228
# TRUE when the test is significant at the 5% level.
test2[["p.value"]] < 0.05
## [1] TRUE
# Pearson correlation test for YearBuilt vs. GrLivArea, reporting an 80%
# confidence interval for the correlation.
test3 <- cor.test(
  formula = ~ YearBuilt + GrLivArea,
  data = correlation_three,
  method = "pearson",
  conf.level = 0.80
)
print(test3)
##
## Pearson's product-moment correlation
##
## data: YearBuilt and GrLivArea
## t = 7.754, df = 1458, p-value = 1.66e-14
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
## 0.1665605 0.2310283
## sample estimates:
## cor
## 0.1990097
# TRUE when the test is significant at the 5% level.
test3[["p.value"]] < 0.05
## [1] TRUE
When we perform one hypothesis test, the type I error rate is equal to the significance level (\(\alpha\)), which is commonly chosen to be 0.01, 0.05, or 0.10. However, when we conduct multiple hypothesis tests at once, the probability of getting a false positive increases. So we do have to worry about family wise error.
The family-wise error rate (FWER) is defined below.
\[ 1-(1-\alpha)^n \]
where \(\alpha\) is the significance level of each individual test and \(n\) is the total number of tests.
# Family-wise error rate for 3 tests, each at alpha = 0.05.
1 - (1 - 0.05)^3
## [1] 0.142625
In other words, the probability of getting a type I error on at least one of the hypothesis tests is 14.26%!
(This is known as the precision matrix and contains variance inflation factors on the diagonal.)
# Invert the correlation matrix to obtain the precision matrix, then show it.
precision <- solve(corr_matrix)
print(precision)
## YearBuilt OverallQual GrLivArea
## YearBuilt 1.557510 -1.091384 0.337239
## OverallQual -1.091384 2.307153 -1.150963
## GrLivArea 0.337239 -1.150963 1.615416
# Sanity check: the correlation matrix times its inverse should print the
# identity matrix, up to floating-point error.
corr_matrix %*% precision
## YearBuilt OverallQual GrLivArea
## YearBuilt 1.000000e+00 -8.326673e-17 0
## OverallQual -8.326673e-17 1.000000e+00 0
## GrLivArea 0.000000e+00 0.000000e+00 1
# Same check with the factors in the opposite order; the product should again
# print the identity matrix, up to floating-point error.
precision %*% corr_matrix
## YearBuilt OverallQual GrLivArea
## YearBuilt 1.000000e+00 -8.326673e-17 0
## OverallQual -8.326673e-17 1.000000e+00 0
## GrLivArea 0.000000e+00 0.000000e+00 1
# LU decomposition of the correlation matrix; prints the $L and $U factors.
matrixcalc::lu.decomposition(corr_matrix)
## $L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] 0.5723228 1.0000000 0
## [3,] 0.1990097 0.7124872 1
##
## $U
## [,1] [,2] [,3]
## [1,] 1 5.723228e-01 0.1990097
## [2,] 0 6.724466e-01 0.4791096
## [3,] 0 -5.551115e-17 0.6190356
Many times, it makes sense to fit a closed form distribution to data.
Shift it so that the minimum value is absolutely above zero if
necessary. Then load the MASS package and run fitdistr to
fit an exponential probability density function.
(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).
# Histograms of candidate variables, one facet per variable, used to pick a
# right-skewed column. The data are reshaped to long form (one row per
# variable/value pair) so ggplot2 can facet on the variable name.
# GrLivArea is listed once; pivot_longer() replaces the superseded gather().
train %>%
  dplyr::select(BsmtFinSF1, BsmtUnfSF, GrLivArea,
                LotFrontage, MasVnrArea,
                OpenPorchSF, X1stFlrSF) %>%
  tidyr::pivot_longer(cols = dplyr::everything(),
                      names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  # bins is stated explicitly (30 was the implicit default); na.rm drops the
  # missing LotFrontage / MasVnrArea values without emitting a warning.
  geom_histogram(bins = 30, na.rm = TRUE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 267 rows containing non-finite values (`stat_bin()`).
I am going to select the X1stFlrSF column (first-floor square footage), which is right-skewed, for this
question.
# Five-number summary (plus mean) of the chosen variable.
summary(train[["X1stFlrSF"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 334 882 1087 1163 1391 4692
Histogram of chosen variable:
# Density-scaled histogram of the observed variable. The title and x label
# name the data being plotted; this is the empirical distribution, not the
# fitted exponential.
hist(train$X1stFlrSF, freq = FALSE, col = "lightblue",
     main = "Histogram of X1stFlrSF", xlab = "X1stFlrSF", ylab = "Density")
# Keep a short alias for the variable.
x <- train$X1stFlrSF
# Normal Q-Q plot with a reference line to judge departure from normality.
qqnorm(x)
qqline(x, col = "red")
# Fit an exponential distribution to x and show the estimated rate.
exp_df <- MASS::fitdistr(x, densfun = "exponential")
print(exp_df)
## rate
## 0.0008601213
## (0.0000225104)
# Take the rate straight from the fitted object instead of a hand-copied,
# rounded constant, so the simulation stays in sync with the fit.
# unname() keeps lambda a plain number so later results carry no "rate" label.
lambda <- unname(exp_df$estimate["rate"])
# Draw 1000 samples from the fitted exponential and compare their histogram
# with the theoretical density curve.
X <- rexp(1000, rate = lambda)
hist(X, freq = FALSE, col = "lightblue", main = "Exponential Distribution", xlab = "X", ylab = "Density")
curve(dexp(x, rate = lambda), add = TRUE, col = "red", lwd = 2)
The PDF is the derivative of the CDF. So, to find the probability density function (PDF) of an exponential distribution, we can differentiate its cumulative distribution function (CDF), \(F(t) = 1 - P(T > t) = 1 - e^{-\lambda t}\), which gives \(f(t) = \lambda e^{-\lambda t}\) for \(t \ge 0\).
# 5th percentile of the fitted exponential distribution.
qexp(p = 0.05, rate = lambda)
## [1] 59.63495
# 95th percentile of the fitted exponential distribution.
qexp(p = 0.95, rate = lambda)
## [1] 3482.918
# Normal-theory confidence interval for the mean of X1stFlrSF:
# mean +/- z * sd / sqrt(n), rounded to 4 decimals.
# 1.96 is the standard-normal critical value for a two-sided 95% interval.
z <- 1.96
n <- length(train$X1stFlrSF)
# The results are stored under the names `mean` and `sd`, so the functions are
# called with explicit namespaces.
mean <- base::mean(train$X1stFlrSF)
sd <- stats::sd(train$X1stFlrSF)
upper_bound <- round(mean + z * sd / sqrt(n), digits = 4)
print(upper_bound)
## [1] 1182.457
lower_bound <- round(mean - z * sd / sqrt(n), digits = 4)
print(lower_bound)
## [1] 1142.796
The 95% confidence interval for the mean of X1stFlrSF has a lower bound of 1142.796 and an upper bound of 1182.457.
# Empirical 5th percentile of the observed data.
quantile(train$X1stFlrSF, probs = 0.05)
## 5%
## 672.95
# Empirical 95th percentile of the observed data.
quantile(train$X1stFlrSF, probs = 0.95)
## 95%
## 1831.25
10 points. Modeling.
Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.
First I will look at the train dataset.
# Show the first six rows of the training set.
head(train, n = 6L)
## Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1 1 60 RL 65 8450 Pave <NA> Reg Lvl
## 2 2 20 RL 80 9600 Pave <NA> Reg Lvl
## 3 3 60 RL 68 11250 Pave <NA> IR1 Lvl
## 4 4 70 RL 60 9550 Pave <NA> IR1 Lvl
## 5 5 60 RL 84 14260 Pave <NA> IR1 Lvl
## 6 6 50 RL 85 14115 Pave <NA> IR1 Lvl
## Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 2 AllPub FR2 Gtl Veenker Feedr Norm 1Fam
## 3 AllPub Inside Gtl CollgCr Norm Norm 1Fam
## 4 AllPub Corner Gtl Crawfor Norm Norm 1Fam
## 5 AllPub FR2 Gtl NoRidge Norm Norm 1Fam
## 6 AllPub Inside Gtl Mitchel Norm Norm 1Fam
## HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1 2Story 7 5 2003 2003 Gable CompShg
## 2 1Story 6 8 1976 1976 Gable CompShg
## 3 2Story 7 5 2001 2002 Gable CompShg
## 4 2Story 7 5 1915 1970 Gable CompShg
## 5 2Story 8 5 2000 2000 Gable CompShg
## 6 1.5Fin 5 5 1993 1995 Gable CompShg
## Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1 VinylSd VinylSd BrkFace 196 Gd TA PConc
## 2 MetalSd MetalSd None 0 TA TA CBlock
## 3 VinylSd VinylSd BrkFace 162 Gd TA PConc
## 4 Wd Sdng Wd Shng None 0 TA TA BrkTil
## 5 VinylSd VinylSd BrkFace 350 Gd TA PConc
## 6 VinylSd VinylSd None 0 TA TA Wood
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1 Gd TA No GLQ 706 Unf
## 2 Gd TA Gd ALQ 978 Unf
## 3 Gd TA Mn GLQ 486 Unf
## 4 TA Gd No ALQ 216 Unf
## 5 Gd TA Av GLQ 655 Unf
## 6 Gd TA No GLQ 732 Unf
## BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1 0 150 856 GasA Ex Y SBrkr
## 2 0 284 1262 GasA Ex Y SBrkr
## 3 0 434 920 GasA Ex Y SBrkr
## 4 0 540 756 GasA Gd Y SBrkr
## 5 0 490 1145 GasA Ex Y SBrkr
## 6 0 64 796 GasA Ex Y SBrkr
## X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1 856 854 0 1710 1 0 2
## 2 1262 0 0 1262 0 1 2
## 3 920 866 0 1786 1 0 2
## 4 961 756 0 1717 1 0 1
## 5 1145 1053 0 2198 1 0 2
## 6 796 566 0 1362 1 0 1
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1 1 3 1 Gd 8 Typ
## 2 0 3 1 TA 6 Typ
## 3 1 3 1 Gd 6 Typ
## 4 0 3 1 Gd 7 Typ
## 5 1 4 1 Gd 9 Typ
## 6 1 1 1 TA 5 Typ
## Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1 0 <NA> Attchd 2003 RFn 2
## 2 1 TA Attchd 1976 RFn 2
## 3 1 TA Attchd 2001 RFn 2
## 4 1 Gd Detchd 1998 Unf 3
## 5 1 TA Attchd 2000 RFn 3
## 6 0 <NA> Attchd 1993 Unf 2
## GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1 548 TA TA Y 0 61
## 2 460 TA TA Y 298 0
## 3 608 TA TA Y 0 42
## 4 642 TA TA Y 0 35
## 5 836 TA TA Y 192 84
## 6 480 TA TA Y 40 30
## EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1 0 0 0 0 <NA> <NA> <NA>
## 2 0 0 0 0 <NA> <NA> <NA>
## 3 0 0 0 0 <NA> <NA> <NA>
## 4 272 0 0 0 <NA> <NA> <NA>
## 5 0 0 0 0 <NA> <NA> <NA>
## 6 0 320 0 0 <NA> MnPrv Shed
## MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1 0 2 2008 WD Normal 208500
## 2 0 5 2007 WD Normal 181500
## 3 0 9 2008 WD Normal 223500
## 4 0 2 2006 WD Abnorml 140000
## 5 0 12 2008 WD Normal 250000
## 6 700 10 2009 WD Normal 143000
Next I will check if there are NA values in the data set.
# Count the missing values in every column of the training set.
train %>%
  is.na() %>%
  colSums()
## Id MSSubClass MSZoning LotFrontage LotArea
## 0 0 0 259 0
## Street Alley LotShape LandContour Utilities
## 0 1369 0 0 0
## LotConfig LandSlope Neighborhood Condition1 Condition2
## 0 0 0 0 0
## BldgType HouseStyle OverallQual OverallCond YearBuilt
## 0 0 0 0 0
## YearRemodAdd RoofStyle RoofMatl Exterior1st Exterior2nd
## 0 0 0 0 0
## MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 8 8 0 0 0
## BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1
## 37 37 38 37 0
## BsmtFinType2 BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating
## 38 0 0 0 0
## HeatingQC CentralAir Electrical X1stFlrSF X2ndFlrSF
## 0 0 1 0 0
## LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 0 0 0 0 0
## HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd
## 0 0 0 0 0
## Functional Fireplaces FireplaceQu GarageType GarageYrBlt
## 0 0 690 81 81
## GarageFinish GarageCars GarageArea GarageQual GarageCond
## 81 0 0 81 81
## PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch X3SsnPorch
## 0 0 0 0 0
## ScreenPorch PoolArea PoolQC Fence MiscFeature
## 0 0 1453 1179 1406
## MiscVal MoSold YrSold SaleType SaleCondition
## 0 0 0 0 0
## SalePrice
## 0
I will drop two columns that are mostly empty (PoolQC and Alley), along with Id, which is not needed for modeling. Then I will save the result into the dataframe training.
# Modeling frame: the training data without Id, PoolQC and Alley.
training <- dplyr::select(train, -c(Id, PoolQC, Alley))
Replace the remaining NA values with 0:
# Overwrite every remaining NA in the modeling frame with 0.
# NOTE(review): this fills character columns too (FireplaceQu shows 0 in the
# printed head) and zero-fills numeric columns such as LotFrontage and
# GarageYrBlt; confirm that 0 is the intended stand-in for "missing".
training <- replace(training, is.na(training), 0)
head(training, n = 6L)
## MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities
## 1 60 RL 65 8450 Pave Reg Lvl AllPub
## 2 20 RL 80 9600 Pave Reg Lvl AllPub
## 3 60 RL 68 11250 Pave IR1 Lvl AllPub
## 4 70 RL 60 9550 Pave IR1 Lvl AllPub
## 5 60 RL 84 14260 Pave IR1 Lvl AllPub
## 6 50 RL 85 14115 Pave IR1 Lvl AllPub
## LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle
## 1 Inside Gtl CollgCr Norm Norm 1Fam 2Story
## 2 FR2 Gtl Veenker Feedr Norm 1Fam 1Story
## 3 Inside Gtl CollgCr Norm Norm 1Fam 2Story
## 4 Corner Gtl Crawfor Norm Norm 1Fam 2Story
## 5 FR2 Gtl NoRidge Norm Norm 1Fam 2Story
## 6 Inside Gtl Mitchel Norm Norm 1Fam 1.5Fin
## OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## 1 7 5 2003 2003 Gable CompShg VinylSd
## 2 6 8 1976 1976 Gable CompShg MetalSd
## 3 7 5 2001 2002 Gable CompShg VinylSd
## 4 7 5 1915 1970 Gable CompShg Wd Sdng
## 5 8 5 2000 2000 Gable CompShg VinylSd
## 6 5 5 1993 1995 Gable CompShg VinylSd
## Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 1 VinylSd BrkFace 196 Gd TA PConc Gd
## 2 MetalSd None 0 TA TA CBlock Gd
## 3 VinylSd BrkFace 162 Gd TA PConc Gd
## 4 Wd Shng None 0 TA TA BrkTil TA
## 5 VinylSd BrkFace 350 Gd TA PConc Gd
## 6 VinylSd None 0 TA TA Wood Gd
## BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 1 TA No GLQ 706 Unf 0
## 2 TA Gd ALQ 978 Unf 0
## 3 TA Mn GLQ 486 Unf 0
## 4 Gd No ALQ 216 Unf 0
## 5 TA Av GLQ 655 Unf 0
## 6 TA No GLQ 732 Unf 0
## BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF
## 1 150 856 GasA Ex Y SBrkr 856
## 2 284 1262 GasA Ex Y SBrkr 1262
## 3 434 920 GasA Ex Y SBrkr 920
## 4 540 756 GasA Gd Y SBrkr 961
## 5 490 1145 GasA Ex Y SBrkr 1145
## 6 64 796 GasA Ex Y SBrkr 796
## X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1 854 0 1710 1 0 2 1
## 2 0 0 1262 0 1 2 0
## 3 866 0 1786 1 0 2 1
## 4 756 0 1717 1 0 1 0
## 5 1053 0 2198 1 0 2 1
## 6 566 0 1362 1 0 1 1
## BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 1 3 1 Gd 8 Typ 0
## 2 3 1 TA 6 Typ 1
## 3 3 1 Gd 6 Typ 1
## 4 3 1 Gd 7 Typ 1
## 5 4 1 Gd 9 Typ 1
## 6 1 1 TA 5 Typ 0
## FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## 1 0 Attchd 2003 RFn 2 548
## 2 TA Attchd 1976 RFn 2 460
## 3 TA Attchd 2001 RFn 2 608
## 4 Gd Detchd 1998 Unf 3 642
## 5 TA Attchd 2000 RFn 3 836
## 6 0 Attchd 1993 Unf 2 480
## GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 1 TA TA Y 0 61 0
## 2 TA TA Y 298 0 0
## 3 TA TA Y 0 42 0
## 4 TA TA Y 0 35 272
## 5 TA TA Y 192 84 0
## 6 TA TA Y 40 30 0
## X3SsnPorch ScreenPorch PoolArea Fence MiscFeature MiscVal MoSold YrSold
## 1 0 0 0 0 0 0 2 2008
## 2 0 0 0 0 0 0 5 2007
## 3 0 0 0 0 0 0 9 2008
## 4 0 0 0 0 0 0 2 2006
## 5 0 0 0 0 0 0 12 2008
## 6 320 0 0 MnPrv Shed 700 10 2009
## SaleType SaleCondition SalePrice
## 1 WD Normal 208500
## 2 WD Normal 181500
## 3 WD Normal 223500
## 4 WD Abnorml 140000
## 5 WD Normal 250000
## 6 WD Normal 143000
First I will build a full model and save it into
house.prices.lm.
# Full model: regress SalePrice on every other column of the modeling frame.
house.prices.lm <- lm(
  formula = SalePrice ~ .,
  data = training
)
The summary below shows all the p-values of the columns.
# Coefficient table, residual summary and fit statistics for the full model.
summary(object = house.prices.lm)
##
## Call:
## lm(formula = SalePrice ~ ., data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179735 -9252 36 9580 179735
##
## Coefficients: (8 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.330e+05 1.053e+06 -0.506 0.612718
## MSSubClass -4.832e+01 8.290e+01 -0.583 0.560141
## MSZoningFV 3.344e+04 1.199e+04 2.788 0.005382 **
## MSZoningRH 2.311e+04 1.192e+04 1.938 0.052818 .
## MSZoningRL 2.584e+04 1.024e+04 2.524 0.011742 *
## MSZoningRM 2.253e+04 9.582e+03 2.351 0.018890 *
## LotFrontage 8.328e+00 2.297e+01 0.363 0.717038
## LotArea 7.405e-01 1.091e-01 6.790 1.75e-11 ***
## StreetPave 3.322e+04 1.222e+04 2.719 0.006635 **
## LotShapeIR2 4.416e+03 4.229e+03 1.044 0.296632
## LotShapeIR3 5.541e+03 8.847e+03 0.626 0.531267
## LotShapeReg 1.458e+03 1.642e+03 0.888 0.374811
## LandContourHLS 7.585e+03 5.149e+03 1.473 0.140971
## LandContourLow -1.073e+04 6.426e+03 -1.669 0.095334 .
## LandContourLvl 5.285e+03 3.703e+03 1.427 0.153759
## UtilitiesNoSeWa -3.377e+04 2.646e+04 -1.276 0.202096
## LotConfigCulDSac 7.876e+03 3.262e+03 2.414 0.015911 *
## LotConfigFR2 -7.870e+03 4.012e+03 -1.962 0.050046 .
## LotConfigFR3 -1.735e+04 1.259e+04 -1.378 0.168605
## LotConfigInside -1.821e+03 1.759e+03 -1.035 0.300878
## LandSlopeMod 7.368e+03 3.984e+03 1.849 0.064677 .
## LandSlopeSev -4.434e+04 1.140e+04 -3.889 0.000106 ***
## NeighborhoodBlueste 8.738e+03 1.929e+04 0.453 0.650643
## NeighborhoodBrDale -1.548e+03 1.101e+04 -0.141 0.888262
## NeighborhoodBrkSide -5.112e+03 9.452e+03 -0.541 0.588705
## NeighborhoodClearCr -1.616e+04 9.218e+03 -1.753 0.079917 .
## NeighborhoodCollgCr -1.021e+04 7.260e+03 -1.406 0.159922
## NeighborhoodCrawfor 1.109e+04 8.566e+03 1.294 0.195881
## NeighborhoodEdwards -2.068e+04 8.006e+03 -2.583 0.009910 **
## NeighborhoodGilbert -1.121e+04 7.674e+03 -1.461 0.144410
## NeighborhoodIDOTRR -1.184e+04 1.075e+04 -1.102 0.270805
## NeighborhoodMeadowV -6.379e+03 1.121e+04 -0.569 0.569373
## NeighborhoodMitchel -2.224e+04 8.169e+03 -2.722 0.006580 **
## NeighborhoodNAmes -1.637e+04 7.851e+03 -2.085 0.037323 *
## NeighborhoodNoRidge 2.479e+04 8.442e+03 2.936 0.003389 **
## NeighborhoodNPkVill 1.424e+04 1.408e+04 1.011 0.312003
## NeighborhoodNridgHt 1.799e+04 7.535e+03 2.388 0.017101 *
## NeighborhoodNWAmes -1.767e+04 8.007e+03 -2.206 0.027550 *
## NeighborhoodOldTown -1.429e+04 9.595e+03 -1.489 0.136651
## NeighborhoodSawyer -1.023e+04 8.128e+03 -1.259 0.208237
## NeighborhoodSawyerW -3.334e+03 7.801e+03 -0.427 0.669194
## NeighborhoodSomerst -2.578e+03 9.018e+03 -0.286 0.775063
## NeighborhoodStoneBr 3.799e+04 8.312e+03 4.571 5.35e-06 ***
## NeighborhoodSWISU -9.284e+03 9.687e+03 -0.958 0.338003
## NeighborhoodTimber -1.051e+04 8.148e+03 -1.290 0.197176
## NeighborhoodVeenker 5.105e+01 1.051e+04 0.005 0.996126
## Condition1Feedr 6.387e+03 5.005e+03 1.276 0.202168
## Condition1Norm 1.555e+04 4.175e+03 3.724 0.000205 ***
## Condition1PosA 8.971e+03 9.960e+03 0.901 0.367926
## Condition1PosN 1.379e+04 7.476e+03 1.845 0.065339 .
## Condition1RRAe -1.663e+04 9.079e+03 -1.832 0.067151 .
## Condition1RRAn 1.233e+04 6.955e+03 1.772 0.076640 .
## Condition1RRNe -4.010e+03 1.753e+04 -0.229 0.819136
## Condition1RRNn 1.067e+04 1.287e+04 0.829 0.407027
## Condition2Feedr -4.753e+03 2.350e+04 -0.202 0.839715
## Condition2Norm -7.085e+03 2.033e+04 -0.348 0.727579
## Condition2PosA 4.128e+04 3.710e+04 1.113 0.265985
## Condition2PosN -2.382e+05 2.773e+04 -8.592 < 2e-16 ***
## Condition2RRAe -1.252e+05 6.531e+04 -1.918 0.055382 .
## Condition2RRAn -1.826e+04 3.159e+04 -0.578 0.563234
## Condition2RRNn 2.633e+02 2.714e+04 0.010 0.992263
## BldgType2fmCon -3.386e+03 1.249e+04 -0.271 0.786375
## BldgTypeDuplex -7.856e+03 7.397e+03 -1.062 0.288390
## BldgTypeTwnhs -1.953e+04 9.939e+03 -1.965 0.049652 *
## BldgTypeTwnhsE -1.529e+04 8.957e+03 -1.707 0.088137 .
## HouseStyle1.5Unf 1.374e+04 7.933e+03 1.732 0.083578 .
## HouseStyle1Story 7.419e+03 4.350e+03 1.706 0.088351 .
## HouseStyle2.5Fin -2.205e+04 1.232e+04 -1.789 0.073877 .
## HouseStyle2.5Unf -1.038e+04 9.258e+03 -1.121 0.262416
## HouseStyle2Story -6.006e+03 3.508e+03 -1.712 0.087102 .
## HouseStyleSFoyer 2.408e+03 6.270e+03 0.384 0.701010
## HouseStyleSLvl 4.745e+03 5.581e+03 0.850 0.395379
## OverallQual 6.781e+03 1.015e+03 6.682 3.59e-11 ***
## OverallCond 5.663e+03 8.735e+02 6.483 1.30e-10 ***
## YearBuilt 3.215e+02 7.695e+01 4.177 3.16e-05 ***
## YearRemodAdd 1.036e+02 5.570e+01 1.861 0.063033 .
## RoofStyleGable 6.160e+03 1.843e+04 0.334 0.738314
## RoofStyleGambrel 8.545e+03 2.016e+04 0.424 0.671688
## RoofStyleHip 5.766e+03 1.850e+04 0.312 0.755397
## RoofStyleMansard 1.648e+04 2.147e+04 0.767 0.443068
## RoofStyleShed 1.011e+05 3.463e+04 2.920 0.003561 **
## RoofMatlCompShg 6.895e+05 3.337e+04 20.662 < 2e-16 ***
## RoofMatlMembran 7.853e+05 4.780e+04 16.429 < 2e-16 ***
## RoofMatlMetal 7.547e+05 4.684e+04 16.114 < 2e-16 ***
## RoofMatlRoll 6.762e+05 4.191e+04 16.135 < 2e-16 ***
## RoofMatlTar&Grv 6.926e+05 3.812e+04 18.170 < 2e-16 ***
## RoofMatlWdShake 6.811e+05 3.692e+04 18.446 < 2e-16 ***
## RoofMatlWdShngl 7.433e+05 3.452e+04 21.530 < 2e-16 ***
## Exterior1stAsphShn -2.583e+04 3.309e+04 -0.780 0.435253
## Exterior1stBrkComm -7.723e+03 2.779e+04 -0.278 0.781143
## Exterior1stBrkFace 5.717e+03 1.277e+04 0.448 0.654368
## Exterior1stCBlock -1.558e+04 2.726e+04 -0.571 0.567889
## Exterior1stCemntBd -1.370e+04 1.905e+04 -0.719 0.472255
## Exterior1stHdBoard -1.510e+04 1.294e+04 -1.166 0.243729
## Exterior1stImStucc -3.048e+04 2.814e+04 -1.083 0.278994
## Exterior1stMetalSd -7.333e+03 1.460e+04 -0.502 0.615515
## Exterior1stPlywood -1.691e+04 1.275e+04 -1.326 0.185230
## Exterior1stStone -4.617e+03 2.438e+04 -0.189 0.849814
## Exterior1stStucco -8.160e+03 1.411e+04 -0.578 0.563081
## Exterior1stVinylSd -1.613e+04 1.329e+04 -1.214 0.225134
## Exterior1stWd Sdng -1.501e+04 1.238e+04 -1.212 0.225886
## Exterior1stWdShing -1.176e+04 1.333e+04 -0.882 0.377902
## Exterior2ndAsphShn 1.310e+04 2.227e+04 0.588 0.556343
## Exterior2ndBrk Cmn 7.080e+03 2.012e+04 0.352 0.724959
## Exterior2ndBrkFace 5.033e+03 1.325e+04 0.380 0.704173
## Exterior2ndCBlock NA NA NA NA
## Exterior2ndCmentBd 1.354e+04 1.875e+04 0.722 0.470344
## Exterior2ndHdBoard 9.496e+03 1.245e+04 0.763 0.445661
## Exterior2ndImStucc 2.506e+04 1.425e+04 1.759 0.078907 .
## Exterior2ndMetalSd 6.336e+03 1.423e+04 0.445 0.656094
## Exterior2ndOther -1.607e+04 2.718e+04 -0.591 0.554557
## Exterior2ndPlywood 8.309e+03 1.207e+04 0.688 0.491318
## Exterior2ndStone -1.007e+04 1.727e+04 -0.583 0.559882
## Exterior2ndStucco 7.749e+03 1.358e+04 0.571 0.568368
## Exterior2ndVinylSd 1.437e+04 1.280e+04 1.122 0.262106
## Exterior2ndWd Sdng 1.233e+04 1.196e+04 1.031 0.302752
## Exterior2ndWd Shng 6.307e+03 1.247e+04 0.506 0.613010
## MasVnrTypeBrkCmn -2.599e+03 1.094e+04 -0.238 0.812218
## MasVnrTypeBrkFace 2.086e+03 8.781e+03 0.238 0.812289
## MasVnrTypeNone 5.183e+03 8.622e+03 0.601 0.547838
## MasVnrTypeStone 6.878e+03 8.859e+03 0.776 0.437652
## MasVnrArea 1.956e+01 5.775e+00 3.387 0.000729 ***
## ExterQualFa -7.070e+03 1.109e+04 -0.638 0.523914
## ExterQualGd -2.010e+04 4.802e+03 -4.186 3.04e-05 ***
## ExterQualTA -1.981e+04 5.321e+03 -3.724 0.000205 ***
## ExterCondFa -2.889e+03 1.807e+04 -0.160 0.873011
## ExterCondGd -6.996e+03 1.726e+04 -0.405 0.685238
## ExterCondPo 4.888e+03 3.175e+04 0.154 0.877691
## ExterCondTA -4.406e+03 1.721e+04 -0.256 0.797995
## FoundationCBlock 3.106e+03 3.172e+03 0.979 0.327703
## FoundationPConc 4.272e+03 3.429e+03 1.246 0.213073
## FoundationSlab -7.401e+03 1.007e+04 -0.735 0.462499
## FoundationStone 8.826e+03 1.131e+04 0.780 0.435355
## FoundationWood -2.649e+04 1.479e+04 -1.791 0.073609 .
## BsmtQualEx -3.168e+04 3.637e+04 -0.871 0.383814
## BsmtQualFa -4.473e+04 3.616e+04 -1.237 0.216320
## BsmtQualGd -5.018e+04 3.614e+04 -1.389 0.165227
## BsmtQualTA -4.683e+04 3.604e+04 -1.299 0.194031
## BsmtCondFa -2.915e+03 4.252e+03 -0.686 0.493087
## BsmtCondGd -3.357e+03 3.245e+03 -1.034 0.301131
## BsmtCondPo 6.941e+04 3.010e+04 2.306 0.021254 *
## BsmtCondTA NA NA NA NA
## BsmtExposureAv 1.172e+04 2.308e+04 0.508 0.611763
## BsmtExposureGd 2.507e+04 2.317e+04 1.082 0.279540
## BsmtExposureMn 7.637e+03 2.315e+04 0.330 0.741543
## BsmtExposureNo 6.200e+03 2.304e+04 0.269 0.787857
## BsmtFinType1ALQ -3.485e+03 2.919e+03 -1.194 0.232690
## BsmtFinType1BLQ -4.870e+02 3.131e+03 -0.156 0.876432
## BsmtFinType1GLQ 2.706e+03 2.714e+03 0.997 0.318887
## BsmtFinType1LwQ -6.724e+03 3.780e+03 -1.779 0.075526 .
## BsmtFinType1Rec -3.255e+03 3.182e+03 -1.023 0.306439
## BsmtFinType1Unf NA NA NA NA
## BsmtFinSF1 3.936e+01 5.322e+00 7.395 2.63e-13 ***
## BsmtFinType2ALQ 2.918e+04 2.506e+04 1.165 0.244438
## BsmtFinType2BLQ 1.675e+04 2.481e+04 0.675 0.499672
## BsmtFinType2GLQ 2.680e+04 2.559e+04 1.047 0.295126
## BsmtFinType2LwQ 1.494e+04 2.483e+04 0.601 0.547622
## BsmtFinType2Rec 1.926e+04 2.478e+04 0.777 0.437215
## BsmtFinType2Unf 2.073e+04 2.472e+04 0.839 0.401908
## BsmtFinSF2 3.125e+01 9.091e+00 3.437 0.000609 ***
## BsmtUnfSF 2.080e+01 4.881e+00 4.262 2.19e-05 ***
## TotalBsmtSF NA NA NA NA
## HeatingGasA 9.172e+03 2.567e+04 0.357 0.720891
## HeatingGasW 5.051e+03 2.646e+04 0.191 0.848674
## HeatingGrav 3.554e+03 2.814e+04 0.126 0.899509
## HeatingOthW -1.162e+04 3.146e+04 -0.369 0.711979
## HeatingWall 2.355e+04 2.981e+04 0.790 0.429641
## HeatingQCFa -6.126e+02 4.691e+03 -0.131 0.896122
## HeatingQCGd -3.780e+03 2.075e+03 -1.822 0.068709 .
## HeatingQCPo 2.182e+03 2.664e+04 0.082 0.934734
## HeatingQCTA -3.470e+03 2.071e+03 -1.675 0.094179 .
## CentralAirY 1.007e+02 3.881e+03 0.026 0.979309
## ElectricalFuseA -1.204e+04 2.416e+04 -0.499 0.618209
## ElectricalFuseF -1.216e+04 2.456e+04 -0.495 0.620789
## ElectricalFuseP -2.048e+04 3.024e+04 -0.677 0.498273
## ElectricalMix -6.012e+04 5.075e+04 -1.185 0.236387
## ElectricalSBrkr -1.390e+04 2.396e+04 -0.580 0.561901
## X1stFlrSF 4.558e+01 5.652e+00 8.064 1.76e-15 ***
## X2ndFlrSF 6.615e+01 5.593e+00 11.827 < 2e-16 ***
## LowQualFinSF 6.703e+00 1.864e+01 0.360 0.719216
## GrLivArea NA NA NA NA
## BsmtFullBath 1.085e+03 1.981e+03 0.547 0.584146
## BsmtHalfBath -1.690e+02 3.032e+03 -0.056 0.955550
## FullBath 3.947e+03 2.206e+03 1.789 0.073859 .
## HalfBath 1.603e+03 2.102e+03 0.763 0.445826
## BedroomAbvGr -3.672e+03 1.365e+03 -2.690 0.007254 **
## KitchenAbvGr -1.385e+04 5.670e+03 -2.443 0.014698 *
## KitchenQualFa -1.960e+04 6.212e+03 -3.156 0.001641 **
## KitchenQualGd -2.401e+04 3.489e+03 -6.881 9.48e-12 ***
## KitchenQualTA -2.265e+04 3.930e+03 -5.762 1.05e-08 ***
## TotRmsAbvGrd 1.593e+03 9.560e+02 1.666 0.095904 .
## FunctionalMaj2 -1.023e+03 1.443e+04 -0.071 0.943518
## FunctionalMin1 7.684e+03 8.643e+03 0.889 0.374157
## FunctionalMin2 9.625e+03 8.661e+03 1.111 0.266664
## FunctionalMod -4.694e+03 1.061e+04 -0.443 0.658160
## FunctionalSev -4.415e+04 2.957e+04 -1.493 0.135655
## FunctionalTyp 1.924e+04 7.502e+03 2.565 0.010442 *
## Fireplaces 6.446e+03 2.572e+03 2.507 0.012321 *
## FireplaceQuEx -9.041e+03 6.238e+03 -1.449 0.147535
## FireplaceQuFa -1.129e+04 5.317e+03 -2.123 0.033975 *
## FireplaceQuGd -6.230e+03 3.434e+03 -1.814 0.069877 .
## FireplaceQuPo -1.636e+02 6.279e+03 -0.026 0.979214
## FireplaceQuTA -5.169e+03 3.593e+03 -1.439 0.150497
## GarageType2Types 4.263e+04 1.188e+05 0.359 0.719826
## GarageTypeAttchd 6.214e+04 1.187e+05 0.524 0.600668
## GarageTypeBasment 6.572e+04 1.184e+05 0.555 0.578997
## GarageTypeBuiltIn 6.083e+04 1.188e+05 0.512 0.608851
## GarageTypeCarPort 6.709e+04 1.194e+05 0.562 0.574159
## GarageTypeDetchd 6.526e+04 1.188e+05 0.549 0.582901
## GarageYrBlt -3.630e+01 6.085e+01 -0.597 0.550929
## GarageFinishFin 7.956e+01 2.431e+03 0.033 0.973902
## GarageFinishRFn -2.300e+03 2.160e+03 -1.065 0.287022
## GarageFinishUnf NA NA NA NA
## GarageCars 4.146e+03 2.277e+03 1.821 0.068900 .
## GarageArea 1.855e+01 7.909e+00 2.345 0.019181 *
## GarageQualEx 1.160e+05 2.992e+04 3.878 0.000111 ***
## GarageQualFa -6.395e+03 4.872e+03 -1.313 0.189506
## GarageQualGd 8.426e+02 7.620e+03 0.111 0.911969
## GarageQualPo -2.215e+04 2.403e+04 -0.922 0.356889
## GarageQualTA NA NA NA NA
## GarageCondEx -1.105e+05 3.458e+04 -3.194 0.001438 **
## GarageCondFa -1.467e+03 5.427e+03 -0.270 0.786895
## GarageCondGd 4.798e+02 9.109e+03 0.053 0.957998
## GarageCondPo 2.244e+03 1.393e+04 0.161 0.872024
## GarageCondTA NA NA NA NA
## PavedDriveP -3.238e+03 5.544e+03 -0.584 0.559290
## PavedDriveY -6.724e+02 3.471e+03 -0.194 0.846417
## WoodDeckSF 1.455e+01 5.868e+00 2.480 0.013292 *
## OpenPorchSF 2.375e+00 1.153e+01 0.206 0.836909
## EnclosedPorch 5.910e+00 1.244e+01 0.475 0.634769
## X3SsnPorch 3.257e+01 2.245e+01 1.451 0.147054
## ScreenPorch 3.417e+01 1.254e+01 2.725 0.006531 **
## PoolArea 1.159e+02 2.006e+01 5.776 9.74e-09 ***
## FenceGdPrv -9.759e+03 3.672e+03 -2.658 0.007966 **
## FenceGdWo -4.985e+02 3.582e+03 -0.139 0.889328
## FenceMnPrv 1.344e+03 2.268e+03 0.592 0.553656
## FenceMnWw -5.094e+03 7.499e+03 -0.679 0.497100
## MiscFeatureGar2 -1.521e+04 9.751e+04 -0.156 0.876039
## MiscFeatureOthr 1.204e+04 2.028e+04 0.594 0.552863
## MiscFeatureShed 2.106e+03 5.708e+03 0.369 0.712220
## MiscFeatureTenC -8.853e+04 2.914e+04 -3.038 0.002430 **
## MiscVal 1.009e+00 6.133e+00 0.165 0.869293
## MoSold -4.380e+02 2.454e+02 -1.785 0.074471 .
## YrSold -5.131e+02 5.166e+02 -0.993 0.320828
## SaleTypeCon 2.533e+04 1.760e+04 1.439 0.150376
## SaleTypeConLD 1.676e+04 9.696e+03 1.729 0.084146 .
## SaleTypeConLI 5.682e+03 1.156e+04 0.491 0.623285
## SaleTypeConLw 1.083e+03 1.218e+04 0.089 0.929196
## SaleTypeCWD 1.487e+04 1.290e+04 1.153 0.249191
## SaleTypeNew 2.223e+04 1.546e+04 1.438 0.150554
## SaleTypeOth 6.767e+03 1.453e+04 0.466 0.641551
## SaleTypeWD -9.000e+01 4.188e+03 -0.021 0.982859
## SaleConditionAdjLand 7.129e+03 1.451e+04 0.491 0.623396
## SaleConditionAlloca 3.393e+03 8.648e+03 0.392 0.694856
## SaleConditionFamily 5.736e+01 6.089e+03 0.009 0.992486
## SaleConditionNormal 5.749e+03 2.896e+03 1.985 0.047360 *
## SaleConditionPartial -1.777e+03 1.488e+04 -0.119 0.904971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22690 on 1211 degrees of freedom
## Multiple R-squared: 0.9323, Adjusted R-squared: 0.9184
## F-statistic: 67.24 on 248 and 1211 DF, p-value: < 2.2e-16
This is an example of overfitting. I will try to reduce the number of features. Feature selection is more difficult with 82 features, so I will pick the ones that I think make a house more or less expensive. There can of course be exceptions.
# Reduced training frame: Id, the hand-picked predictors, and the response.
# Each column is listed exactly once (GrLivArea and BsmtFinSF1 were previously
# repeated); the order matches the first appearance in the original list, so
# the coefficient order of any model fit with `.` is unchanged.
# select() is namespace-qualified because MASS, attached after tidyverse,
# also exports a select() that would otherwise mask dplyr's.
training2 <- train %>%
  dplyr::select(
    Id,
    BsmtFinSF1, BsmtUnfSF,
    GrLivArea, MasVnrArea, OpenPorchSF, X1stFlrSF,
    KitchenQual, Street, LotArea, MSZoning, CentralAir,
    SalePrice
  )
Next I refit the model using only the features I determined would make a difference.
# Ordinary least squares fit of SalePrice on every column of training2
# except the Id column, which is only a row identifier.
house.prices.lm <- lm(
  formula = SalePrice ~ . - Id,
  data = training2
)
# Print coefficient estimates, residual summary, R-squared and F-statistic.
summary(house.prices.lm)
##
## Call:
## lm(formula = SalePrice ~ . - Id, data = training2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -602354 -17660 168 16565 255614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.716e+04 2.016e+04 3.331 0.000889 ***
## BsmtFinSF1 3.272e+01 4.095e+00 7.990 2.76e-15 ***
## BsmtUnfSF 1.949e+01 3.942e+00 4.945 8.51e-07 ***
## GrLivArea 5.830e+01 2.700e+00 21.593 < 2e-16 ***
## MasVnrArea 5.228e+01 6.566e+00 7.963 3.39e-15 ***
## OpenPorchSF 2.949e+01 1.732e+01 1.703 0.088833 .
## X1stFlrSF 6.891e+00 4.732e+00 1.456 0.145530
## KitchenQualFa -1.037e+05 8.320e+03 -12.467 < 2e-16 ***
## KitchenQualGd -6.102e+04 4.618e+03 -13.212 < 2e-16 ***
## KitchenQualTA -9.756e+04 4.898e+03 -19.917 < 2e-16 ***
## StreetPave -2.221e+03 1.732e+04 -0.128 0.897984
## LotArea 4.564e-01 1.154e-01 3.955 8.03e-05 ***
## MSZoningFV 5.250e+04 1.433e+04 3.665 0.000256 ***
## MSZoningRH 2.486e+04 1.647e+04 1.510 0.131389
## MSZoningRL 4.199e+04 1.338e+04 3.138 0.001733 **
## MSZoningRM 2.305e+04 1.344e+04 1.715 0.086616 .
## CentralAirY 2.102e+04 4.689e+03 4.483 7.93e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39810 on 1435 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7506, Adjusted R-squared: 0.7478
## F-statistic: 270 on 16 and 1435 DF, p-value: < 2.2e-16
# Show the lm diagnostic plots together on a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so that
# any later plot is not silently drawn into a four-panel layout.
old_par <- par(mfrow = c(2, 2))
plot(house.prices.lm)
par(old_par)
The F-statistic tests whether there is a relationship between at least one of the predictors and the response. The further the F-statistic is above 1, the stronger the evidence of such a relationship. How large it needs to be depends on the size of the data: the fewer observations there are, the larger the F-statistic must be to be convincing. With 1460 observations in the training data (1452 used after dropping rows with missing values) and 16 coefficients, an F-statistic of 270 with a p-value below 2.2e-16 is very strong evidence that the predictors are related to the sale price.
R^2 measures how well the model fits the observed data. I will interpret the adjusted R^2 because it accounts for the number of predictors (degrees of freedom) in the model. The closer R^2 is to one, the better the fit. This model has an adjusted R^2 of 0.7478, meaning it explains about 74.78% of the variance in sale price, which is reasonably close to one.
Now I will use this model to predict sale prices for the test set and submit the predictions to Kaggle.
# Score the test set with the reduced model, then assemble the submission
# frame: one Id column from the test data plus the predicted SalePrice.
# Rows of `test` with a missing predictor get an NA prediction.
predicted_prices <- predict(house.prices.lm, newdata = test)
final_df <- test["Id"]
final_df$SalePrice <- predicted_prices
head(final_df)
## Id SalePrice
## 1 1461 97550.08
## 2 1462 204911.59
## 3 1463 167629.14
## 4 1464 199509.42
## 5 1465 183517.93
## 6 1466 154058.94
# Count missing values per column of the submission frame.
final_df %>%
  is.na() %>%
  colSums()
## Id SalePrice
## 0 21
# Fill predictions that came back NA with the mean of the available
# predictions, so that every Id in the submission has a SalePrice.
# NOTE(review): this imputes the prediction itself, not the missing
# predictors; confirm that is the intended fallback.
final_df$SalePrice <- replace(
  final_df$SalePrice,
  is.na(final_df$SalePrice),
  mean(final_df$SalePrice, na.rm = TRUE)
)
# Write the submission file.
write_csv(final_df, "SalePricesPredictions.csv")