library(tidyverse)
library(GGally)
library(modelr)


Introduction

For this given data set, I need to build a linear model to predict the house price,this data set has total 81 variables and 1460 observation,and among them, 38 are numerical variables,43 are categorical.my target variable is SalePrice. I chose 3 numerical variables based on the correlation coefficient and 2 categorical variables based on ggpairs, they are: GrLivArea : Above grade (ground) living area square feet TotalBsmtSF : Total square feet of basement area X1stFlrSF : First Floor square feet OverallQual : Rates the overall material and finish of the house(10 levels) KetchenQual : Kitchen quality(5 levels).


check SalePrice with numerical variables’correlation:


my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
## Warning: `as.tibble()` was deprecated in tibble 2.0.0.
## ℹ Please use `as_tibble()` instead.
## ℹ The signature and semantics have changed, see `?as_tibble`.
numeric_column <- keep(my_data, is.numeric) 
result_numeric <- cor(numeric_column$SalePrice,numeric_column)
result_numeric[result_numeric>0.6]
##  [1]        NA 0.7909816        NA 0.6135806 0.6058522 0.7086245        NA
##  [8] 0.6404092 0.6234314 1.0000000
# OverallQual 0.7909816
# GrLivArea  0.7086245
# TotalBsmtSF   0.6135806
# X1stFlrSF     0.6058522
# GarageCars   0.6404092
# GarageArea   0.6234314


check SalePrice with categorical variables’ correlation:


my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair <- select(my_data, SalePrice,MSZoning,HouseStyle,Foundation,KitchenQual)
ggpairs(my_data_ggpair)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# MSZoning(a little less), KitchenQual both have  big variance, neiborhood has 25 level which exceed the limit of 15 level, can't use ggpair
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair2 <- select(my_data, SalePrice,PavedDrive,GarageCond,GarageType,CentralAir,BsmtCond )
ggpairs(my_data_ggpair2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## Removed 81 rows containing non-finite values (`stat_g_gally_count()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# there is no significant one
my_data <-read.csv("train.csv")
my_data<- as.tibble(my_data)
my_data_ggpair3 <- select(my_data, SalePrice,LandContour,Utilities,Condition1,Condition2,HouseStyle,RoofStyle,RoofMatl,Exterior1st) 
ggpairs(my_data_ggpair3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Condition2 is an option


check SalePrice with each variable’s relationship:


1, GrlivArea


model_GrLivArea <- lm(SalePrice ~ GrLivArea, data = my_data_GrLivArea)
plot(model_GrLivArea, which = 1)

plot(model_GrLivArea, which = 2)

model_GrLivArea$coefficients
## (Intercept)   GrLivArea 
##  11860.0890    111.7895
grid <- my_data_GrLivArea %>%
  data_grid(GrLivArea) %>%
  add_predictions(model_GrLivArea)

ggplot(my_data_GrLivArea) + geom_point(aes(GrLivArea, SalePrice)) + geom_line(aes(GrLivArea,pred), data = grid, colour = "red", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.


2, TotalBsmtSF


ggplot(my_data_GrLivArea) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))

my_data_GrLivArea_TotalBsmtSF <- filter(my_data_GrLivArea, between(TotalBsmtSF, 0,2500))

ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice, x = TotalBsmtSF))

model_TotalBsmtSF <- lm(SalePrice ~ TotalBsmtSF, data = my_data_GrLivArea_TotalBsmtSF)
plot(model_TotalBsmtSF, which = 1)

plot(model_TotalBsmtSF, which = 2)

model_TotalBsmtSF$coefficients
## (Intercept) TotalBsmtSF 
##  55599.5638    117.3578
grid <- my_data_GrLivArea_TotalBsmtSF %>%
  data_grid(TotalBsmtSF) %>%
  add_predictions(model_TotalBsmtSF)

ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(TotalBsmtSF, SalePrice)) + geom_line(aes(TotalBsmtSF,pred), data = grid, colour = "red", size = 1)


3, X1stFlrSF


ggplot(my_data_GrLivArea_TotalBsmtSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))

my_data_GrLivArea_TotalBsmtSF_X1stFlrSF <- filter(my_data, between(X1stFlrSF, 0,2300))
ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(y = SalePrice,x = X1stFlrSF ))

model_X1stFlrSF <- lm(SalePrice ~ X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
plot(model_X1stFlrSF, which = 1)

plot(model_X1stFlrSF, which = 2)

model_X1stFlrSF$coefficients
## (Intercept)   X1stFlrSF 
##  31178.7511    128.5846
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
  data_grid(X1stFlrSF) %>%
  add_predictions(model_X1stFlrSF)

ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(X1stFlrSF, SalePrice)) + geom_line(aes(X1stFlrSF,pred), data = grid, colour = "red", size = 1)


4, OverallQual


ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(OverallQual)))

model_OverallQual <- lm(SalePrice ~as.character(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)

plot(model_OverallQual, which = 1)

plot(model_OverallQual, which = 2)

model_OverallQual$coefficients
##                 (Intercept) as.character(OverallQual)10 
##                   50150.000                  371858.273 
##  as.character(OverallQual)2  as.character(OverallQual)3 
##                    1620.333                   37323.750 
##  as.character(OverallQual)4  as.character(OverallQual)5 
##                   58270.655                   83373.348 
##  as.character(OverallQual)6  as.character(OverallQual)7 
##                  111348.421                  157566.423 
##  as.character(OverallQual)8  as.character(OverallQual)9 
##                  223550.424                  311550.071
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
  data_grid(OverallQual) %>%
  add_predictions(model_OverallQual)

grid
## # A tibble: 10 × 2
##    OverallQual    pred
##          <int>   <dbl>
##  1           1  50150.
##  2           2  51770.
##  3           3  87474.
##  4           4 108421.
##  5           5 133523.
##  6           6 161498.
##  7           7 207716.
##  8           8 273700.
##  9           9 361700.
## 10          10 422008.
ggplot() + geom_point(aes(OverallQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(OverallQual, pred), data = grid, colour = "red", size = 4)


5, KitchenQual


ggplot(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_boxplot(aes(x = SalePrice,y = as.factor(KitchenQual)))

model_KitchenQual <- lm(SalePrice ~ KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)

plot(model_KitchenQual, which = 1)

plot(model_KitchenQual, which = 2)

model_KitchenQual$coefficients
##   (Intercept) KitchenQualFa KitchenQualGd KitchenQualTA 
##      315122.9     -209557.7     -103622.1     -175243.1
grid <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF %>%
  data_grid(KitchenQual) %>%
  add_predictions(model_KitchenQual)

grid
## # A tibble: 4 × 2
##   KitchenQual    pred
##   <chr>         <dbl>
## 1 Ex          315123.
## 2 Fa          105565.
## 3 Gd          211501.
## 4 TA          139880.
ggplot() + geom_point(aes(KitchenQual, SalePrice), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF) + geom_point(aes(KitchenQual, pred), data = grid, colour = "red", size = 4)


Modeling procedure:(add variables in one by one)


my_data_1 <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF 
model_1 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)


plot(model_1, which = 1)

plot(model_1, which = 2)

summary(model_1)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF, data = my_data_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -181158  -21113     840   21952  236749 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -31201.871   4051.015  -7.702 2.47e-14 ***
## GrLivArea       85.519      2.473  34.575  < 2e-16 ***
## TotalBsmtSF     78.460      3.052  25.708  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42880 on 1445 degrees of freedom
## Multiple R-squared:  0.6687, Adjusted R-squared:  0.6683 
## F-statistic:  1458 on 2 and 1445 DF,  p-value: < 2.2e-16
# Note : There is an obvious pattern that residuals versus fitted model
model_2 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)



plot(model_2, which = 1)

plot(model_2, which = 2)

summary(model_2)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF, 
##     data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -180717  -21471     774   21893  236582 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -30665.885   4267.434  -7.186 1.07e-12 ***
## GrLivArea       85.910      2.659  32.307  < 2e-16 ***
## TotalBsmtSF     79.850      4.621  17.278  < 2e-16 ***
## X1stFlrSF       -2.237      5.586  -0.401    0.689    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42900 on 1444 degrees of freedom
## Multiple R-squared:  0.6687, Adjusted R-squared:  0.6681 
## F-statistic: 971.7 on 3 and 1444 DF,  p-value: < 2.2e-16
# Note : The p-vlaue of variable XistFlrSF so big, it fails to reject the null hypothesis, means this variable has no effect in the model, both the pattern and R squared value has no change at all.
model_3 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)

plot(model_3, which = 1)

plot(model_3, which = 2)

summary(model_3)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + X1stFlrSF + 
##     as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -143600  -17329    1672   17132  182881 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              9.220e+02  2.241e+04   0.041  0.96719    
## GrLivArea                5.212e+01  2.263e+00  23.029  < 2e-16 ***
## TotalBsmtSF              3.793e+01  3.668e+00  10.341  < 2e-16 ***
## X1stFlrSF                6.484e+00  4.189e+00   1.548  0.12186    
## as.factor(OverallQual)2  4.215e+03  2.883e+04   0.146  0.88376    
## as.factor(OverallQual)3  1.032e+03  2.345e+04   0.044  0.96491    
## as.factor(OverallQual)4  1.510e+04  2.256e+04   0.669  0.50351    
## as.factor(OverallQual)5  2.526e+04  2.247e+04   1.124  0.26104    
## as.factor(OverallQual)6  4.010e+04  2.251e+04   1.782  0.07503 .  
## as.factor(OverallQual)7  6.825e+04  2.258e+04   3.022  0.00255 ** 
## as.factor(OverallQual)8  1.098e+05  2.276e+04   4.823 1.57e-06 ***
## as.factor(OverallQual)9  1.764e+05  2.327e+04   7.582 6.10e-14 ***
## as.factor(OverallQual)10 2.179e+05  2.476e+04   8.801  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31580 on 1435 degrees of freedom
## Multiple R-squared:  0.8216, Adjusted R-squared:  0.8201 
## F-statistic: 550.7 on 12 and 1435 DF,  p-value: < 2.2e-16
# Note : this variable helps to build the linear model a lot, throw away X1stFlrSF, OVerallQuall 2 ~ 5 levels since their p-vlaue are a lot greater than 0.05, means that they have no effect on this model.
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF,between(OverallQual,5,10))
model_3_2<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
plot(model_3_2, which = 1)

plot(model_3_2, which = 2)

summary(model_3_2)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual), 
##     data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -143474  -17417    1564   18038  179441 
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              2.384e+04  3.903e+03   6.108 1.33e-09 ***
## GrLivArea                5.481e+01  2.206e+00  24.847  < 2e-16 ***
## TotalBsmtSF              4.424e+01  2.710e+00  16.324  < 2e-16 ***
## as.factor(OverallQual)6  1.407e+04  2.371e+03   5.935 3.76e-09 ***
## as.factor(OverallQual)7  4.144e+04  2.647e+03  15.653  < 2e-16 ***
## as.factor(OverallQual)8  8.212e+04  3.523e+03  23.313  < 2e-16 ***
## as.factor(OverallQual)9  1.483e+05  5.869e+03  25.261  < 2e-16 ***
## as.factor(OverallQual)10 1.889e+05  1.035e+04  18.251  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32220 on 1299 degrees of freedom
## Multiple R-squared:  0.8074, Adjusted R-squared:  0.8064 
## F-statistic:   778 on 7 and 1299 DF,  p-value: < 2.2e-16
#Note : R square value drops a litter after I throw away those large p-value varibles, but it is still pretty good, and lower the risk of overfitting.
model_4 <- lm(SalePrice ~ GrLivArea + TotalBsmtSF  + as.factor(OverallQual) + KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)


plot(model_4, which = 1)

plot(model_4, which = 2)

summary(model_4)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + 
##     KitchenQual, data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -132817  -17198    1666   16446  175873 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               60234.422   6008.231  10.025  < 2e-16 ***
## GrLivArea                    54.775      2.125  25.773  < 2e-16 ***
## TotalBsmtSF                  41.656      2.617  15.916  < 2e-16 ***
## as.factor(OverallQual)6   11506.094   2313.341   4.974 7.45e-07 ***
## as.factor(OverallQual)7   30143.858   2879.204  10.470  < 2e-16 ***
## as.factor(OverallQual)8   65563.442   3793.894  17.281  < 2e-16 ***
## as.factor(OverallQual)9  120019.284   6624.391  18.118  < 2e-16 ***
## as.factor(OverallQual)10 162066.264  10441.460  15.521  < 2e-16 ***
## KitchenQualFa            -61749.581   8275.431  -7.462 1.56e-13 ***
## KitchenQualGd            -19074.511   4298.392  -4.438 9.87e-06 ***
## KitchenQualTA            -36795.878   4636.779  -7.936 4.50e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 30970 on 1296 degrees of freedom
## Multiple R-squared:  0.8224, Adjusted R-squared:  0.821 
## F-statistic: 600.2 on 10 and 1296 DF,  p-value: < 2.2e-16
my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual <-filter(my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual,KitchenQual != "Fa")
model_5<- lm(SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + KitchenQual + GrLivArea*(as.factor(OverallQual)), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
plot(model_5, which = 1)

plot(model_5, which = 2)

summary(model_5)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + TotalBsmtSF + as.factor(OverallQual) + 
##     KitchenQual + GrLivArea * (as.factor(OverallQual)), data = my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -132968  -16385    1583   15208  161598 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         96708.527   7069.706  13.679  < 2e-16 ***
## GrLivArea                              24.082      3.927   6.133 1.15e-09 ***
## TotalBsmtSF                            44.657      2.516  17.750  < 2e-16 ***
## as.factor(OverallQual)6            -21355.374   7822.114  -2.730  0.00642 ** 
## as.factor(OverallQual)7            -27287.375   9100.512  -2.998  0.00277 ** 
## as.factor(OverallQual)8            -23815.930  11411.383  -2.087  0.03708 *  
## as.factor(OverallQual)9            -55187.572  23981.694  -2.301  0.02154 *  
## as.factor(OverallQual)10           -11372.264  37930.741  -0.300  0.76437    
## KitchenQualGd                      -19868.609   4129.652  -4.811 1.68e-06 ***
## KitchenQualTA                      -37923.588   4443.829  -8.534  < 2e-16 ***
## GrLivArea:as.factor(OverallQual)6      26.995      5.546   4.867 1.27e-06 ***
## GrLivArea:as.factor(OverallQual)7      41.470      5.753   7.209 9.64e-13 ***
## GrLivArea:as.factor(OverallQual)8      56.263      6.346   8.866  < 2e-16 ***
## GrLivArea:as.factor(OverallQual)9      95.272     11.622   8.198 5.93e-16 ***
## GrLivArea:as.factor(OverallQual)10     83.628     14.918   5.606 2.54e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 29480 on 1272 degrees of freedom
## Multiple R-squared:   0.84,  Adjusted R-squared:  0.8383 
## F-statistic: 477.1 on 14 and 1272 DF,  p-value: < 2.2e-16
#Note : filter out "Fa" since there is no data equals "Fa". every other variables has it's effect on the model which is pretty good, will stop here.


check potential outlier

my_data_outlier <- my_data_GrLivArea_TotalBsmtSF_X1stFlrSF_OverallQual_KitchenQual %>%
  add_residuals(model_5, "resid1") %>%

  filter (abs(resid1) > 50000) %>%
  select(GrLivArea,TotalBsmtSF, X1stFlrSF,OverallQual, KitchenQual,SalePrice,resid1) %>%
  arrange(desc(resid1)) %>%
  print() 
## # A tibble: 105 × 7
##    GrLivArea TotalBsmtSF X1stFlrSF OverallQual KitchenQual SalePrice  resid1
##        <int>       <int>     <int>       <int> <chr>           <int>   <dbl>
##  1      1419        1419      1419           8 Gd             392000 161598.
##  2      1976        1976      1976           8 Gd             440000 139972.
##  3      1652        1600      1652           8 Gd             392500 135295.
##  4      3279        1650      1690           8 Ex             538000 127972.
##  5      2822        1734      1734           9 Ex             582933 127161.
##  6      2036        2136      2036           7 TA             375000 114652.
##  7      1710        1710      1710           8 Gd             372402 105625.
##  8      1954         798      1137           7 Gd             311500  98224.
##  9      1973        1935      1973           8 Gd             395000  97044.
## 10      2234        2216      2234           9 Ex             501837  94721.
## # … with 95 more rows
my_data_outlier %>%
  as.tibble() %>%
  #add_residuals(model_5, "resid1") %>%
  ggplot(aes(SalePrice, resid1)) + 
  geom_ref_line(h = 0) + 
  geom_point() +
  ylim(-100000, 100000)
## Warning: Removed 11 rows containing missing values (`geom_point()`).

# Note : no pattern.


Conclusion:

I find that the variable”OverallQual” has the strongest effect to the model, after I throw it in, the model becomes a lot better, I think it is probably the key factor for this data set; the variable X1stFlrSF has no contribution at all to build the linear model , even though it has very strong simple linear relationship with SalePrice, it is actually very similar to the variable”TotalBsmtSF” according to real estate definition; bedroom and bathromm don’t show strong relationship with SalePrice, it’s a bit strange with the common sense, it may due to the less quantity of data, or this is a special case for a special location, since I don’t have any backgroud information for this data set, I can’t get into that part; I can’t find any inter relationship with those potential outlier, it has no pattern at all, SalePrice varies randomly, then I decide just keep them in the model. So, model 5 is my best linear model to predict the house price.