library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(modelr)

Introduction

Here for this given data set, I need to build a linear model to predict the house price,this data set has total 81 variables and 1460 observation,and among them, 38 are numerical variables,43 are categorical.my target variable is SalePrice. I chose 3 numerical variables based on the correlation value and 2 categorical variables based on ggpairs.

check saleprice with numerical variables’ correlation:

my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
numeric_column <- keep(my_data, is.numeric) 
result_numeric <- cor(numeric_column$SalePrice,numeric_column)
result_numeric[result_numeric>0.6]
##  [1]        NA 0.7909816        NA 0.6135806 0.6058522 0.7086245        NA
##  [8] 0.6404092 0.6234314 1.0000000
# OverallQual 0.7909816
# GrLivArea  0.7086245
# TotalBsmtSF   0.6135806
# X1stFlrSF     0.6058522
# GarageCars   0.6404092
# GarageArea   0.6234314
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair <- select(my_data, SalePrice,MSZoning,HouseStyle,Foundation,KitchenQual)
ggpairs(my_data_ggpair)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# MSZoning(a little less),KitchenQual both have  big variance, neiborhood has 25 level which exceed the limit of 15 level, can't use ggpairs
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair2 <- select(my_data, SalePrice,PavedDrive,GarageCond,GarageType,CentralAir,BsmtCond )
ggpairs(my_data_ggpair2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# there is no significant one
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair3 <- select(my_data, SalePrice,LandContour,Utilities,Condition1,Condition2,HouseStyle,RoofStyle,RoofMatl,Exterior1st) 
ggpairs(my_data_ggpair3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Condition2 is an option

So, I pick 3 numerical variables, they are: GrLivArea, TotalBsmtSF, X1stFlrSF; and 2 categorical variables, they are OverallQual(10 levels) and KetchenQual(5 levels). and the following are the insights of every variables:

model_GrLivArea <- lm(SalePrice ~ GrLivArea, data = my_data_GrLivArea)
plot(model_GrLivArea, which = 1)

plot(model_GrLivArea, which = 2)

model_GrLivArea$coefficients
## (Intercept)   GrLivArea 
##  11860.0890    111.7895
grid <- my_data_GrLivArea %>%
  data_grid(GrLivArea) %>%
  add_predictions(model_GrLivArea)

ggplot(my_data_GrLivArea) + geom_point(aes(GrLivArea, SalePrice)) + geom_line(aes(GrLivArea,pred), data = grid, colour = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

ggplot(my_data_GrLivArea) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))

my_data_GrLivArea_TotalBsmtSF <- filter(my_data_GrLivArea, between(TotalBsmtSF, 0,2500))

ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))

model_TotalBsmtSF <- lm(SalePrice ~ TotalBsmtSF, data = my_data_GrLivArea_TotalBsmtSF)
plot(model_TotalBsmtSF, which = 1)

plot(model_TotalBsmtSF, which = 2)

model_TotalBsmtSF$coefficients
## (Intercept) TotalBsmtSF 
##  55599.5638    117.3578
grid <- my_data_GrLivArea_TotalBsmtSF %>%
  data_grid(TotalBsmtSF) %>%
  add_predictions(model_TotalBsmtSF)

ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(TotalBsmtSF, SalePrice)) + geom_line(aes(TotalBsmtSF,pred), data = grid, colour = "red", size = 1)

ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))

my_data_GrLivArea_TotalBsmtSF_X1stFlrSF <- filter(my_data, between(X1stFlrSF, 0,2300))
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))

model_X1stFlrSF <- lm(SalePrice ~ X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_X1stFlrSF, which = 1)

plot(model_X1stFlrSF, which = 2)

model_X1stFlrSF$coefficients
## (Intercept)   X1stFlrSF 
##  31178.7511    128.5846
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
  data_grid(X1stFlrSF) %>%
  add_predictions(model_X1stFlrSF)

ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(X1stFlrSF, SalePrice)) + geom_line(aes(X1stFlrSF,pred), data = grid, colour = "red", size = 1)

ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(OverallQual)))

model_OverallQual <- lm(SalePrice ~as.character(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)

plot(model_OverallQual, which = 1)

plot(model_OverallQual, which = 2)

model_OverallQual$coefficients
##                 (Intercept) as.character(OverallQual)10 
##                   50150.000                  371858.273 
##  as.character(OverallQual)2  as.character(OverallQual)3 
##                    1620.333                   37323.750 
##  as.character(OverallQual)4  as.character(OverallQual)5 
##                   58270.655                   83373.348 
##  as.character(OverallQual)6  as.character(OverallQual)7 
##                  111348.421                  157566.423 
##  as.character(OverallQual)8  as.character(OverallQual)9 
##                  223550.424                  311550.071
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
  data_grid(OverallQual) %>%
  add_predictions(model_OverallQual)

grid
## # A tibble: 10 × 2
##    OverallQual    pred
##          <int>   <dbl>
##  1           1  50150.
##  2           2  51770.
##  3           3  87474.
##  4           4 108421.
##  5           5 133523.
##  6           6 161498.
##  7           7 207716.
##  8           8 273700.
##  9           9 361700.
## 10          10 422008.
ggplot() + geom_point(aes(OverallQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(OverallQual, pred), data = grid, colour = "red", size = 4)

ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(KitchenQual)))

model_KitchenQual <- lm(SalePrice ~ KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)

plot(model_KitchenQual, which = 1)

plot(model_KitchenQual, which = 2)

model_KitchenQual$coefficients
##   (Intercept) KitchenQualFa KitchenQualGd KitchenQualTA 
##      315122.9     -209557.7     -103622.1     -175243.1
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
  data_grid(KitchenQual) %>%
  add_predictions(model_KitchenQual)

grid
## # A tibble: 4 × 2
##   KitchenQual    pred
##   <chr>         <dbl>
## 1 Ex          315123.
## 2 Fa          105565.
## 3 Gd          211501.
## 4 TA          139880.
ggplot() + geom_point(aes(KitchenQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(KitchenQual, pred), data = grid, colour = "red", size = 4)

### Modeling procedure:

my_data_1 <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF 
model_1 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)


plot(model_1, which = 1)

plot(model_1, which = 2)

summary(model_1)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -181158  -21113     840   21952  236749 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -31201.871   4051.015  -7.702 2.47e-14 ***
## GrLivArea       85.519      2.473  34.575  < 2e-16 ***
## TotalBsmtSF     78.460      3.052  25.708  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42880 on 1445 degrees of freedom
## Multiple R-squared:  0.6687, Adjusted R-squared:  0.6683 
## F-statistic:  1458 on 2 and 1445 DF,  p-value: < 2.2e-16
model_2 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)



plot(model_2, which = 1)

plot(model_2, which = 2)

summary(model_2)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF, 
##     data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -180717  -21471     774   21893  236582 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -30665.885   4267.434  -7.186 1.07e-12 ***
## GrLivArea       85.910      2.659  32.307  < 2e-16 ***
## TotalBsmtSF     79.850      4.621  17.278  < 2e-16 ***
## X1stFlrSF       -2.237      5.586  -0.401    0.689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42900 on 1444 degrees of freedom
## Multiple R-squared:  0.6687, Adjusted R-squared:  0.6681 
## F-statistic: 971.7 on 3 and 1444 DF,  p-value: < 2.2e-16
model_3 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)

plot(model_3, which = 1)

plot(model_3, which = 2)

summary(model_3)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF + 
##     as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -143600  -17329    1672   17132  182881 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              9.220e+02  2.241e+04   0.041  0.96719    
## GrLivArea                5.212e+01  2.263e+00  23.029  < 2e-16 ***
## TotalBsmtSF              3.793e+01  3.668e+00  10.341  < 2e-16 ***
## X1stFlrSF                6.484e+00  4.189e+00   1.548  0.12186    
## as.factor(OverallQual)2  4.215e+03  2.883e+04   0.146  0.88376    
## as.factor(OverallQual)3  1.032e+03  2.345e+04   0.044  0.96491    
## as.factor(OverallQual)4  1.510e+04  2.256e+04   0.669  0.50351    
## as.factor(OverallQual)5  2.526e+04  2.247e+04   1.124  0.26104    
## as.factor(OverallQual)6  4.010e+04  2.251e+04   1.782  0.07503 .  
## as.factor(OverallQual)7  6.825e+04  2.258e+04   3.022  0.00255 ** 
## as.factor(OverallQual)8  1.098e+05  2.276e+04   4.823 1.57e-06 ***
## as.factor(OverallQual)9  1.764e+05  2.327e+04   7.582 6.10e-14 ***
## as.factor(OverallQual)10 2.179e+05  2.476e+04   8.801  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31580 on 1435 degrees of freedom
## Multiple R-squared:  0.8216, Adjusted R-squared:  0.8201 
## F-statistic: 550.7 on 12 and 1435 DF,  p-value: < 2.2e-16
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF,between(OverallQual,4,10))
model_3_2<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
plot(model_3_2, which = 1)

plot(model_3_2, which = 2)

summary(model_3_2)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual), 
##     data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -143634  -17057    1594   17332  181279 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              1.651e+04  4.104e+03   4.023 6.05e-05 ***
## GrLivArea                5.353e+01  2.108e+00  25.391  < 2e-16 ***
## TotalBsmtSF              4.325e+01  2.544e+00  17.005  < 2e-16 ***
## as.factor(OverallQual)5  9.851e+03  3.379e+03   2.915  0.00361 ** 
## as.factor(OverallQual)6  2.425e+04  3.482e+03   6.963 5.09e-12 ***
## as.factor(OverallQual)7  5.205e+04  3.733e+03  13.943  < 2e-16 ***
## as.factor(OverallQual)8  9.331e+04  4.452e+03  20.958  < 2e-16 ***
## as.factor(OverallQual)9  1.599e+05  6.451e+03  24.794  < 2e-16 ***
## as.factor(OverallQual)10 2.011e+05  1.058e+04  19.005  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31650 on 1414 degrees of freedom
## Multiple R-squared:  0.8174, Adjusted R-squared:  0.8163 
## F-statistic: 791.1 on 8 and 1414 DF,  p-value: < 2.2e-16
model_4 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF  + as.factor(OverallQual) + KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)


plot(model_4, which = 1)

plot(model_4, which = 2)

summary(model_4)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + 
##     KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -132691  -16888    1465   16124  177051 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               56272.441   6141.955   9.162  < 2e-16 ***
## GrLivArea                    53.181      2.031  26.183  < 2e-16 ***
## TotalBsmtSF                  40.834      2.459  16.605  < 2e-16 ***
## as.factor(OverallQual)5    7019.626   3283.808   2.138   0.0327 *  
## as.factor(OverallQual)6   18807.258   3411.075   5.514 4.18e-08 ***
## as.factor(OverallQual)7   37941.099   3892.644   9.747  < 2e-16 ***
## as.factor(OverallQual)8   73894.586   4666.344  15.836  < 2e-16 ***
## as.factor(OverallQual)9  128726.570   7147.573  18.010  < 2e-16 ***
## as.factor(OverallQual)10 171397.419  10687.050  16.038  < 2e-16 ***
## KitchenQualFa            -54364.944   7053.987  -7.707 2.42e-14 ***
## KitchenQualGd            -19232.355   4226.252  -4.551 5.81e-06 ***
## KitchenQualTA            -37345.678   4543.452  -8.220 4.57e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30460 on 1411 degrees of freedom
## Multiple R-squared:  0.8312, Adjusted R-squared:  0.8299 
## F-statistic: 631.5 on 11 and 1411 DF,  p-value: < 2.2e-16
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual,KitchenQual != "Fa")
model_4_2<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
plot(model_4_2, which = 1)

plot(model_4_2, which = 2)

summary(model_4_2)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + 
##     KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -132671  -16730    1645   16234  176195 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               53065.930   6269.341   8.464  < 2e-16 ***
## GrLivArea                    54.340      2.070  26.252  < 2e-16 ***
## TotalBsmtSF                  41.147      2.494  16.498  < 2e-16 ***
## as.factor(OverallQual)5    8359.011   3410.094   2.451   0.0144 *  
## as.factor(OverallQual)6   20194.797   3538.488   5.707 1.40e-08 ***
## as.factor(OverallQual)7   38627.126   3993.897   9.672  < 2e-16 ***
## as.factor(OverallQual)8   74339.954   4758.560  15.622  < 2e-16 ***
## as.factor(OverallQual)9  128975.929   7227.592  17.845  < 2e-16 ***
## as.factor(OverallQual)10 171169.688  10762.870  15.904  < 2e-16 ***
## KitchenQualGd            -19133.292   4241.338  -4.511 7.00e-06 ***
## KitchenQualTA            -37094.673   4561.204  -8.133 9.27e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30570 on 1380 degrees of freedom
## Multiple R-squared:  0.8298, Adjusted R-squared:  0.8286 
## F-statistic: 672.8 on 10 and 1380 DF,  p-value: < 2.2e-16