library(tidyverse)
library(GGally)
library(modelr)
For this given data set, I need to build a linear model to predict the house price,this data set has total 81 variables and 1460 observation,and among them, 38 are numerical variables,43 are categorical.my target variable is SalePrice. I chose 3 numerical variables based on the correlation coefficient and 2 categorical variables based on ggpairs, they are: GrLivArea : Above grade (ground) living area square feet TotalBsmtSF : Total square feet of basement area X1stFlrSF : First Floor square feet OverallQual : Rates the overall material and finish of the house(10 levels) KetchenQual : Kitchen quality(5 levels).
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
numeric_column <- keep(my_data, is.numeric)
result_numeric <- cor(numeric_column$SalePrice,numeric_column)
result_numeric[result_numeric>0.6]
## [1] NA 0.7909816 NA 0.6135806 0.6058522 0.7086245 NA
## [8] 0.6404092 0.6234314 1.0000000
# OverallQual 0.7909816
# GrLivArea 0.7086245
# TotalBsmtSF 0.6135806
# X1stFlrSF 0.6058522
# GarageCars 0.6404092
# GarageArea 0.6234314
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair <- select(my_data, SalePrice,MSZoning,HouseStyle,Foundation,KitchenQual)
ggpairs(my_data_ggpair)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# MSZoning(a little less), KitchenQual both have big variance, neiborhood has 25 level which exceed the limit of 15 level, can't use ggpair
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair2 <- select(my_data, SalePrice,PavedDrive,GarageCond,GarageType,CentralAir,BsmtCond )
ggpairs(my_data_ggpair2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# there is no significant one
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair3 <- select(my_data, SalePrice,LandContour,Utilities,Condition1,Condition2,HouseStyle,RoofStyle,RoofMatl,Exterior1st)
ggpairs(my_data_ggpair3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Condition2 is an option
model_GrLivArea <- lm(SalePrice ~ GrLivArea, data = my_data_GrLivArea)
plot(model_GrLivArea, which = 1)
plot(model_GrLivArea, which = 2)
model_GrLivArea$coefficients
## (Intercept) GrLivArea
## 11860.0890 111.7895
grid <- my_data_GrLivArea %>%
data_grid(GrLivArea) %>%
add_predictions(model_GrLivArea)
ggplot(my_data_GrLivArea) + geom_point(aes(GrLivArea, SalePrice)) + geom_line(aes(GrLivArea,pred), data = grid, colour = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
ggplot(my_data_GrLivArea) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))
my_data_GrLivArea_TotalBsmtSF <- filter(my_data_GrLivArea, between(TotalBsmtSF, 0,2500))
ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))
model_TotalBsmtSF <- lm(SalePrice ~ TotalBsmtSF, data = my_data_GrLivArea_TotalBsmtSF)
plot(model_TotalBsmtSF, which = 1)
plot(model_TotalBsmtSF, which = 2)
model_TotalBsmtSF$coefficients
## (Intercept) TotalBsmtSF
## 55599.5638 117.3578
grid <- my_data_GrLivArea_TotalBsmtSF %>%
data_grid(TotalBsmtSF) %>%
add_predictions(model_TotalBsmtSF)
ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(TotalBsmtSF, SalePrice)) + geom_line(aes(TotalBsmtSF,pred), data = grid, colour = "red", size = 1)
ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF <- filter(my_data, between(X1stFlrSF, 0,2300))
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))
model_X1stFlrSF <- lm(SalePrice ~ X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_X1stFlrSF, which = 1)
plot(model_X1stFlrSF, which = 2)
model_X1stFlrSF$coefficients
## (Intercept) X1stFlrSF
## 31178.7511 128.5846
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
data_grid(X1stFlrSF) %>%
add_predictions(model_X1stFlrSF)
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(X1stFlrSF, SalePrice)) + geom_line(aes(X1stFlrSF,pred), data = grid, colour = "red", size = 1)
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(OverallQual)))
model_OverallQual <- lm(SalePrice ~as.character(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_OverallQual, which = 1)
plot(model_OverallQual, which = 2)
model_OverallQual$coefficients
## (Intercept) as.character(OverallQual)10
## 50150.000 371858.273
## as.character(OverallQual)2 as.character(OverallQual)3
## 1620.333 37323.750
## as.character(OverallQual)4 as.character(OverallQual)5
## 58270.655 83373.348
## as.character(OverallQual)6 as.character(OverallQual)7
## 111348.421 157566.423
## as.character(OverallQual)8 as.character(OverallQual)9
## 223550.424 311550.071
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
data_grid(OverallQual) %>%
add_predictions(model_OverallQual)
grid
## # A tibble: 10 × 2
## OverallQual pred
## <int> <dbl>
## 1 1 50150.
## 2 2 51770.
## 3 3 87474.
## 4 4 108421.
## 5 5 133523.
## 6 6 161498.
## 7 7 207716.
## 8 8 273700.
## 9 9 361700.
## 10 10 422008.
ggplot() + geom_point(aes(OverallQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(OverallQual, pred), data = grid, colour = "red", size = 4)
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(KitchenQual)))
model_KitchenQual <- lm(SalePrice ~ KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_KitchenQual, which = 1)
plot(model_KitchenQual, which = 2)
model_KitchenQual$coefficients
## (Intercept) KitchenQualFa KitchenQualGd KitchenQualTA
## 315122.9 -209557.7 -103622.1 -175243.1
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
data_grid(KitchenQual) %>%
add_predictions(model_KitchenQual)
grid
## # A tibble: 4 × 2
## KitchenQual pred
## <chr> <dbl>
## 1 Ex 315123.
## 2 Fa 105565.
## 3 Gd 211501.
## 4 TA 139880.
ggplot() + geom_point(aes(KitchenQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(KitchenQual, pred), data = grid, colour = "red", size = 4)
my_data_1 <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF
model_1 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)
plot(model_1, which = 1)
plot(model_1, which = 2)
summary(model_1)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181158 -21113 840 21952 236749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -31201.871 4051.015 -7.702 2.47e-14 ***
## GrLivArea 85.519 2.473 34.575 < 2e-16 ***
## TotalBsmtSF 78.460 3.052 25.708 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42880 on 1445 degrees of freedom
## Multiple R-squared: 0.6687, Adjusted R-squared: 0.6683
## F-statistic: 1458 on 2 and 1445 DF, p-value: < 2.2e-16
# Note : There is an obvious pattern that residuals versus fitted model
model_2 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_2, which = 1)
plot(model_2, which = 2)
summary(model_2)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF,
## data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -180717 -21471 774 21893 236582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30665.885 4267.434 -7.186 1.07e-12 ***
## GrLivArea 85.910 2.659 32.307 < 2e-16 ***
## TotalBsmtSF 79.850 4.621 17.278 < 2e-16 ***
## X1stFlrSF -2.237 5.586 -0.401 0.689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42900 on 1444 degrees of freedom
## Multiple R-squared: 0.6687, Adjusted R-squared: 0.6681
## F-statistic: 971.7 on 3 and 1444 DF, p-value: < 2.2e-16
# Note : The p-vlaue of variable XistFlrSF so big, it fails to reject the null hypothesis, means this variable has no effect in the model, both the pattern and R squared value has no change at all.
model_3 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_3, which = 1)
plot(model_3, which = 2)
summary(model_3)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF +
## as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143600 -17329 1672 17132 182881
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.220e+02 2.241e+04 0.041 0.96719
## GrLivArea 5.212e+01 2.263e+00 23.029 < 2e-16 ***
## TotalBsmtSF 3.793e+01 3.668e+00 10.341 < 2e-16 ***
## X1stFlrSF 6.484e+00 4.189e+00 1.548 0.12186
## as.factor(OverallQual)2 4.215e+03 2.883e+04 0.146 0.88376
## as.factor(OverallQual)3 1.032e+03 2.345e+04 0.044 0.96491
## as.factor(OverallQual)4 1.510e+04 2.256e+04 0.669 0.50351
## as.factor(OverallQual)5 2.526e+04 2.247e+04 1.124 0.26104
## as.factor(OverallQual)6 4.010e+04 2.251e+04 1.782 0.07503 .
## as.factor(OverallQual)7 6.825e+04 2.258e+04 3.022 0.00255 **
## as.factor(OverallQual)8 1.098e+05 2.276e+04 4.823 1.57e-06 ***
## as.factor(OverallQual)9 1.764e+05 2.327e+04 7.582 6.10e-14 ***
## as.factor(OverallQual)10 2.179e+05 2.476e+04 8.801 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31580 on 1435 degrees of freedom
## Multiple R-squared: 0.8216, Adjusted R-squared: 0.8201
## F-statistic: 550.7 on 12 and 1435 DF, p-value: < 2.2e-16
# Note : this variable helps to build the linear model a lot, throw away X1stFlrSF, OVerallQuall 2 ~ 5 levels since their p-vlaue are a lot greater than 0.05, means that they have no effect on this model.
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF,between(OverallQual,5,10))
model_3_2<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
plot(model_3_2, which = 1)
plot(model_3_2, which = 2)
summary(model_3_2)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual),
## data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143474 -17417 1564 18038 179441
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.384e+04 3.903e+03 6.108 1.33e-09 ***
## GrLivArea 5.481e+01 2.206e+00 24.847 < 2e-16 ***
## TotalBsmtSF 4.424e+01 2.710e+00 16.324 < 2e-16 ***
## as.factor(OverallQual)6 1.407e+04 2.371e+03 5.935 3.76e-09 ***
## as.factor(OverallQual)7 4.144e+04 2.647e+03 15.653 < 2e-16 ***
## as.factor(OverallQual)8 8.212e+04 3.523e+03 23.313 < 2e-16 ***
## as.factor(OverallQual)9 1.483e+05 5.869e+03 25.261 < 2e-16 ***
## as.factor(OverallQual)10 1.889e+05 1.035e+04 18.251 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32220 on 1299 degrees of freedom
## Multiple R-squared: 0.8074, Adjusted R-squared: 0.8064
## F-statistic: 778 on 7 and 1299 DF, p-value: < 2.2e-16
#Note : R square value drops a litter after I throw away those large p-value varibles, but it is still pretty good, and lower the risk of overfitting.
model_4 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
plot(model_4, which = 1)
plot(model_4, which = 2)
summary(model_4)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) +
## KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132817 -17198 1666 16446 175873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60234.422 6008.231 10.025 < 2e-16 ***
## GrLivArea 54.775 2.125 25.773 < 2e-16 ***
## TotalBsmtSF 41.656 2.617 15.916 < 2e-16 ***
## as.factor(OverallQual)6 11506.094 2313.341 4.974 7.45e-07 ***
## as.factor(OverallQual)7 30143.858 2879.204 10.470 < 2e-16 ***
## as.factor(OverallQual)8 65563.442 3793.894 17.281 < 2e-16 ***
## as.factor(OverallQual)9 120019.284 6624.391 18.118 < 2e-16 ***
## as.factor(OverallQual)10 162066.264 10441.460 15.521 < 2e-16 ***
## KitchenQualFa -61749.581 8275.431 -7.462 1.56e-13 ***
## KitchenQualGd -19074.511 4298.392 -4.438 9.87e-06 ***
## KitchenQualTA -36795.878 4636.779 -7.936 4.50e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30970 on 1296 degrees of freedom
## Multiple R-squared: 0.8224, Adjusted R-squared: 0.821
## F-statistic: 600.2 on 10 and 1296 DF, p-value: < 2.2e-16
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual,KitchenQual != "Fa")
model_5<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + KitchenQual + GrLivArea*(as.factor(OverallQual)), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
plot(model_5, which = 1)
plot(model_5, which = 2)
summary(model_5)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) +
## KitchenQual + GrLivArea * (as.factor(OverallQual)), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132968 -16385 1583 15208 161598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96708.527 7069.706 13.679 < 2e-16 ***
## GrLivArea 24.082 3.927 6.133 1.15e-09 ***
## TotalBsmtSF 44.657 2.516 17.750 < 2e-16 ***
## as.factor(OverallQual)6 -21355.374 7822.114 -2.730 0.00642 **
## as.factor(OverallQual)7 -27287.375 9100.512 -2.998 0.00277 **
## as.factor(OverallQual)8 -23815.930 11411.383 -2.087 0.03708 *
## as.factor(OverallQual)9 -55187.572 23981.694 -2.301 0.02154 *
## as.factor(OverallQual)10 -11372.264 37930.741 -0.300 0.76437
## KitchenQualGd -19868.609 4129.652 -4.811 1.68e-06 ***
## KitchenQualTA -37923.588 4443.829 -8.534 < 2e-16 ***
## GrLivArea:as.factor(OverallQual)6 26.995 5.546 4.867 1.27e-06 ***
## GrLivArea:as.factor(OverallQual)7 41.470 5.753 7.209 9.64e-13 ***
## GrLivArea:as.factor(OverallQual)8 56.263 6.346 8.866 < 2e-16 ***
## GrLivArea:as.factor(OverallQual)9 95.272 11.622 8.198 5.93e-16 ***
## GrLivArea:as.factor(OverallQual)10 83.628 14.918 5.606 2.54e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 29480 on 1272 degrees of freedom
## Multiple R-squared: 0.84, Adjusted R-squared: 0.8383
## F-statistic: 477.1 on 14 and 1272 DF, p-value: < 2.2e-16
#Note : filter out "Fa" since there is no data equals "Fa". every other variables has it's effect on the model which is pretty good, will stop here.
my_data_outlier <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual %>%
add_residuals(model_5, "resid1") %>%
filter (abs(resid1) > 50000) %>%
select(GrLivArea,TotalBsmtSF, X1stFlrSF,OverallQual, KitchenQual,SalePrice,resid1) %>%
arrange(desc(resid1)) %>%
print()
## # A tibble: 105 × 7
## GrLivArea TotalBsmtSF X1stFlrSF OverallQual KitchenQual SalePrice resid1
## <int> <int> <int> <int> <chr> <int> <dbl>
## 1 1419 1419 1419 8 Gd 392000 161598.
## 2 1976 1976 1976 8 Gd 440000 139972.
## 3 1652 1600 1652 8 Gd 392500 135295.
## 4 3279 1650 1690 8 Ex 538000 127972.
## 5 2822 1734 1734 9 Ex 582933 127161.
## 6 2036 2136 2036 7 TA 375000 114652.
## 7 1710 1710 1710 8 Gd 372402 105625.
## 8 1954 798 1137 7 Gd 311500 98224.
## 9 1973 1935 1973 8 Gd 395000 97044.
## 10 2234 2216 2234 9 Ex 501837 94721.
## # … with 95 more rows
my_data_outlier %>%
as.tibble() %>%
#add_residuals(model_5, "resid1") %>%
ggplot(aes(SalePrice, resid1)) +
geom_ref_line(h = 0) +
geom_point() +
ylim(-100000, 100000)
## Warning: Removed 11 rows containing missing values (`geom_point()`).
# Note : no pattern.
I find that the variable”OverallQual” has the strongest effect to the model, after I throw it in, the model becomes a lot better, I think it is probably the key factor for this data set; the variable X1stFlrSF has no contribution at all to build the linear model , even though it has very strong simple linear relationship with SalePrice, it is actually very similar to the variable”TotalBsmtSF” according to real estate definition; bedroom and bathromm don’t show strong relationship with SalePrice, it’s a bit strange with the common sense, it may due to the less quantity of data, or this is a special case for a special location, since I don’t have any backgroud information for this data set, I can’t get into that part; I can’t find any inter relationship with those potential outlier, it has no pattern at all, SalePrice varies randomly, then I decide just keep them in the model. So, model 5 is my best linear model to predict the house price.