library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.1.8 ✔ dplyr 1.1.0
## ✔ tidyr 1.3.0 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(modelr)
Here for this given data set, I need to build a linear model to predict the house price,this data set has total 81 variables and 1460 observation,and among them, 38 are numerical variables,43 are categorical.my target variable is SalePrice. I chose 3 numerical variables based on the correlation value and 2 categorical variables based on ggpairs.
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
numeric_column <- keep(my_data, is.numeric)
result_numeric <- cor(numeric_column$SalePrice,numeric_column)
result_numeric[result_numeric>0.6]
## [1] NA 0.7909816 NA 0.6135806 0.6058522 0.7086245 NA
## [8] 0.6404092 0.6234314 1.0000000
# OverallQual 0.7909816
# GrLivArea 0.7086245
# TotalBsmtSF 0.6135806
# X1stFlrSF 0.6058522
# GarageCars 0.6404092
# GarageArea 0.6234314
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair <- select(my_data, SalePrice,MSZoning,HouseStyle,Foundation,KitchenQual)
ggpairs(my_data_ggpair)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# MSZoning(a little less),KitchenQual both have big variance, neiborhood has 25 level which exceed the limit of 15 level, can't use ggpairs
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair2 <- select(my_data, SalePrice,PavedDrive,GarageCond,GarageType,CentralAir,BsmtCond )
ggpairs(my_data_ggpair2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# there is no significant one
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair3 <- select(my_data, SalePrice,LandContour,Utilities,Condition1,Condition2,HouseStyle,RoofStyle,RoofMatl,Exterior1st)
ggpairs(my_data_ggpair3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Condition2 is an option
model_GrLivArea <- lm(SalePrice ~ GrLivArea, data = my_data_GrLivArea)
plot(model_GrLivArea, which = 1)
plot(model_GrLivArea, which = 2)
model_GrLivArea$coefficients
## (Intercept) GrLivArea
## 11860.0890 111.7895
grid <- my_data_GrLivArea %>%
data_grid(GrLivArea) %>%
add_predictions(model_GrLivArea)
ggplot(my_data_GrLivArea) + geom_point(aes(GrLivArea, SalePrice)) + geom_line(aes(GrLivArea,pred), data = grid, colour = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
ggplot(my_data_GrLivArea) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))
my_data_GrLivArea_TotalBsmtSF <- filter(my_data_GrLivArea, between(TotalBsmtSF, 0,2500))
ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))
model_TotalBsmtSF <- lm(SalePrice ~ TotalBsmtSF, data = my_data_GrLivArea_TotalBsmtSF)
plot(model_TotalBsmtSF, which = 1)
plot(model_TotalBsmtSF, which = 2)
model_TotalBsmtSF$coefficients
## (Intercept) TotalBsmtSF
## 55599.5638 117.3578
grid <- my_data_GrLivArea_TotalBsmtSF %>%
data_grid(TotalBsmtSF) %>%
add_predictions(model_TotalBsmtSF)
ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(TotalBsmtSF, SalePrice)) + geom_line(aes(TotalBsmtSF,pred), data = grid, colour = "red", size = 1)
ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF <- filter(my_data, between(X1stFlrSF, 0,2300))
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))
model_X1stFlrSF <- lm(SalePrice ~ X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_X1stFlrSF, which = 1)
plot(model_X1stFlrSF, which = 2)
model_X1stFlrSF$coefficients
## (Intercept) X1stFlrSF
## 31178.7511 128.5846
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
data_grid(X1stFlrSF) %>%
add_predictions(model_X1stFlrSF)
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(X1stFlrSF, SalePrice)) + geom_line(aes(X1stFlrSF,pred), data = grid, colour = "red", size = 1)
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(OverallQual)))
model_OverallQual <- lm(SalePrice ~as.character(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_OverallQual, which = 1)
plot(model_OverallQual, which = 2)
model_OverallQual$coefficients
## (Intercept) as.character(OverallQual)10
## 50150.000 371858.273
## as.character(OverallQual)2 as.character(OverallQual)3
## 1620.333 37323.750
## as.character(OverallQual)4 as.character(OverallQual)5
## 58270.655 83373.348
## as.character(OverallQual)6 as.character(OverallQual)7
## 111348.421 157566.423
## as.character(OverallQual)8 as.character(OverallQual)9
## 223550.424 311550.071
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
data_grid(OverallQual) %>%
add_predictions(model_OverallQual)
grid
## # A tibble: 10 × 2
## OverallQual pred
## <int> <dbl>
## 1 1 50150.
## 2 2 51770.
## 3 3 87474.
## 4 4 108421.
## 5 5 133523.
## 6 6 161498.
## 7 7 207716.
## 8 8 273700.
## 9 9 361700.
## 10 10 422008.
ggplot() + geom_point(aes(OverallQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(OverallQual, pred), data = grid, colour = "red", size = 4)
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(KitchenQual)))
model_KitchenQual <- lm(SalePrice ~ KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_KitchenQual, which = 1)
plot(model_KitchenQual, which = 2)
model_KitchenQual$coefficients
## (Intercept) KitchenQualFa KitchenQualGd KitchenQualTA
## 315122.9 -209557.7 -103622.1 -175243.1
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
data_grid(KitchenQual) %>%
add_predictions(model_KitchenQual)
grid
## # A tibble: 4 × 2
## KitchenQual pred
## <chr> <dbl>
## 1 Ex 315123.
## 2 Fa 105565.
## 3 Gd 211501.
## 4 TA 139880.
ggplot() + geom_point(aes(KitchenQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(KitchenQual, pred), data = grid, colour = "red", size = 4)
### Modeling procedure:
my_data_1 <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF
model_1 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)
plot(model_1, which = 1)
plot(model_1, which = 2)
summary(model_1)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -181158 -21113 840 21952 236749
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -31201.871 4051.015 -7.702 2.47e-14 ***
## GrLivArea 85.519 2.473 34.575 < 2e-16 ***
## TotalBsmtSF 78.460 3.052 25.708 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42880 on 1445 degrees of freedom
## Multiple R-squared: 0.6687, Adjusted R-squared: 0.6683
## F-statistic: 1458 on 2 and 1445 DF, p-value: < 2.2e-16
model_2 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_2, which = 1)
plot(model_2, which = 2)
summary(model_2)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF,
## data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -180717 -21471 774 21893 236582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -30665.885 4267.434 -7.186 1.07e-12 ***
## GrLivArea 85.910 2.659 32.307 < 2e-16 ***
## TotalBsmtSF 79.850 4.621 17.278 < 2e-16 ***
## X1stFlrSF -2.237 5.586 -0.401 0.689
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 42900 on 1444 degrees of freedom
## Multiple R-squared: 0.6687, Adjusted R-squared: 0.6681
## F-statistic: 971.7 on 3 and 1444 DF, p-value: < 2.2e-16
model_3 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_3, which = 1)
plot(model_3, which = 2)
summary(model_3)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF +
## as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143600 -17329 1672 17132 182881
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.220e+02 2.241e+04 0.041 0.96719
## GrLivArea 5.212e+01 2.263e+00 23.029 < 2e-16 ***
## TotalBsmtSF 3.793e+01 3.668e+00 10.341 < 2e-16 ***
## X1stFlrSF 6.484e+00 4.189e+00 1.548 0.12186
## as.factor(OverallQual)2 4.215e+03 2.883e+04 0.146 0.88376
## as.factor(OverallQual)3 1.032e+03 2.345e+04 0.044 0.96491
## as.factor(OverallQual)4 1.510e+04 2.256e+04 0.669 0.50351
## as.factor(OverallQual)5 2.526e+04 2.247e+04 1.124 0.26104
## as.factor(OverallQual)6 4.010e+04 2.251e+04 1.782 0.07503 .
## as.factor(OverallQual)7 6.825e+04 2.258e+04 3.022 0.00255 **
## as.factor(OverallQual)8 1.098e+05 2.276e+04 4.823 1.57e-06 ***
## as.factor(OverallQual)9 1.764e+05 2.327e+04 7.582 6.10e-14 ***
## as.factor(OverallQual)10 2.179e+05 2.476e+04 8.801 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31580 on 1435 degrees of freedom
## Multiple R-squared: 0.8216, Adjusted R-squared: 0.8201
## F-statistic: 550.7 on 12 and 1435 DF, p-value: < 2.2e-16
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF,between(OverallQual,4,10))
model_3_2<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
plot(model_3_2, which = 1)
plot(model_3_2, which = 2)
summary(model_3_2)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual),
## data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -143634 -17057 1594 17332 181279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.651e+04 4.104e+03 4.023 6.05e-05 ***
## GrLivArea 5.353e+01 2.108e+00 25.391 < 2e-16 ***
## TotalBsmtSF 4.325e+01 2.544e+00 17.005 < 2e-16 ***
## as.factor(OverallQual)5 9.851e+03 3.379e+03 2.915 0.00361 **
## as.factor(OverallQual)6 2.425e+04 3.482e+03 6.963 5.09e-12 ***
## as.factor(OverallQual)7 5.205e+04 3.733e+03 13.943 < 2e-16 ***
## as.factor(OverallQual)8 9.331e+04 4.452e+03 20.958 < 2e-16 ***
## as.factor(OverallQual)9 1.599e+05 6.451e+03 24.794 < 2e-16 ***
## as.factor(OverallQual)10 2.011e+05 1.058e+04 19.005 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31650 on 1414 degrees of freedom
## Multiple R-squared: 0.8174, Adjusted R-squared: 0.8163
## F-statistic: 791.1 on 8 and 1414 DF, p-value: < 2.2e-16
model_4 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
plot(model_4, which = 1)
plot(model_4, which = 2)
summary(model_4)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) +
## KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132691 -16888 1465 16124 177051
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56272.441 6141.955 9.162 < 2e-16 ***
## GrLivArea 53.181 2.031 26.183 < 2e-16 ***
## TotalBsmtSF 40.834 2.459 16.605 < 2e-16 ***
## as.factor(OverallQual)5 7019.626 3283.808 2.138 0.0327 *
## as.factor(OverallQual)6 18807.258 3411.075 5.514 4.18e-08 ***
## as.factor(OverallQual)7 37941.099 3892.644 9.747 < 2e-16 ***
## as.factor(OverallQual)8 73894.586 4666.344 15.836 < 2e-16 ***
## as.factor(OverallQual)9 128726.570 7147.573 18.010 < 2e-16 ***
## as.factor(OverallQual)10 171397.419 10687.050 16.038 < 2e-16 ***
## KitchenQualFa -54364.944 7053.987 -7.707 2.42e-14 ***
## KitchenQualGd -19232.355 4226.252 -4.551 5.81e-06 ***
## KitchenQualTA -37345.678 4543.452 -8.220 4.57e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30460 on 1411 degrees of freedom
## Multiple R-squared: 0.8312, Adjusted R-squared: 0.8299
## F-statistic: 631.5 on 11 and 1411 DF, p-value: < 2.2e-16
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual,KitchenQual != "Fa")
model_4_2<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
plot(model_4_2, which = 1)
plot(model_4_2, which = 2)
summary(model_4_2)
##
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) +
## KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
##
## Residuals:
## Min 1Q Median 3Q Max
## -132671 -16730 1645 16234 176195
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53065.930 6269.341 8.464 < 2e-16 ***
## GrLivArea 54.340 2.070 26.252 < 2e-16 ***
## TotalBsmtSF 41.147 2.494 16.498 < 2e-16 ***
## as.factor(OverallQual)5 8359.011 3410.094 2.451 0.0144 *
## as.factor(OverallQual)6 20194.797 3538.488 5.707 1.40e-08 ***
## as.factor(OverallQual)7 38627.126 3993.897 9.672 < 2e-16 ***
## as.factor(OverallQual)8 74339.954 4758.560 15.622 < 2e-16 ***
## as.factor(OverallQual)9 128975.929 7227.592 17.845 < 2e-16 ***
## as.factor(OverallQual)10 171169.688 10762.870 15.904 < 2e-16 ***
## KitchenQualGd -19133.292 4241.338 -4.511 7.00e-06 ***
## KitchenQualTA -37094.673 4561.204 -8.133 9.27e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30570 on 1380 degrees of freedom
## Multiple R-squared: 0.8298, Adjusted R-squared: 0.8286
## F-statistic: 672.8 on 10 and 1380 DF, p-value: < 2.2e-16