library(tidyverse)
library(car)
library(matrixcalc)
library(MASS)

Introduction

Tasks

Task 1

Descriptive and Inferential Statistics.

Load the data:

# Read the training and test CSV files from the local data directory.
train <- read.csv(file = "./house_prices_data/train.csv")
test <- read.csv(file = "./house_prices_data/test.csv")

Provide univariate descriptive statistics and appropriate plots for the training data set.

# List the column names of the training data.
colnames(train)
##  [1] "Id"            "MSSubClass"    "MSZoning"      "LotFrontage"  
##  [5] "LotArea"       "Street"        "Alley"         "LotShape"     
##  [9] "LandContour"   "Utilities"     "LotConfig"     "LandSlope"    
## [13] "Neighborhood"  "Condition1"    "Condition2"    "BldgType"     
## [17] "HouseStyle"    "OverallQual"   "OverallCond"   "YearBuilt"    
## [21] "YearRemodAdd"  "RoofStyle"     "RoofMatl"      "Exterior1st"  
## [25] "Exterior2nd"   "MasVnrType"    "MasVnrArea"    "ExterQual"    
## [29] "ExterCond"     "Foundation"    "BsmtQual"      "BsmtCond"     
## [33] "BsmtExposure"  "BsmtFinType1"  "BsmtFinSF1"    "BsmtFinType2" 
## [37] "BsmtFinSF2"    "BsmtUnfSF"     "TotalBsmtSF"   "Heating"      
## [41] "HeatingQC"     "CentralAir"    "Electrical"    "X1stFlrSF"    
## [45] "X2ndFlrSF"     "LowQualFinSF"  "GrLivArea"     "BsmtFullBath" 
## [49] "BsmtHalfBath"  "FullBath"      "HalfBath"      "BedroomAbvGr" 
## [53] "KitchenAbvGr"  "KitchenQual"   "TotRmsAbvGrd"  "Functional"   
## [57] "Fireplaces"    "FireplaceQu"   "GarageType"    "GarageYrBlt"  
## [61] "GarageFinish"  "GarageCars"    "GarageArea"    "GarageQual"   
## [65] "GarageCond"    "PavedDrive"    "WoodDeckSF"    "OpenPorchSF"  
## [69] "EnclosedPorch" "X3SsnPorch"    "ScreenPorch"   "PoolArea"     
## [73] "PoolQC"        "Fence"         "MiscFeature"   "MiscVal"      
## [77] "MoSold"        "YrSold"        "SaleType"      "SaleCondition"
## [81] "SalePrice"
# Scatterplots of sale price against three predictors, drawn without a
# background grid.
scatterplot(
  SalePrice ~ YearBuilt,
  data = train,
  xlab = "Year Built",
  ylab = "Sale Price",
  grid = FALSE
)

scatterplot(
  SalePrice ~ YrSold,
  data = train,
  xlab = "Year Sold",
  ylab = "Sale Price",
  grid = FALSE
)

scatterplot(
  SalePrice ~ X1stFlrSF,
  data = train,
  xlab = "Square Footage Floor 1",
  ylab = "Sale Price",
  grid = FALSE
)

# Per-column summary statistics for the whole training set.
summary(object = train)
##        Id           MSSubClass      MSZoning          LotFrontage    
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     LotArea          Street             Alley             LotShape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  LandContour         Utilities          LotConfig          LandSlope        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  Neighborhood        Condition1         Condition2          BldgType        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##   HouseStyle         OverallQual      OverallCond      YearBuilt   
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##   YearRemodAdd   RoofStyle           RoofMatl         Exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  Exterior2nd         MasVnrType          MasVnrArea      ExterQual        
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   ExterCond          Foundation          BsmtQual           BsmtCond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  BsmtExposure       BsmtFinType1         BsmtFinSF1     BsmtFinType2      
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##    BsmtFinSF2        BsmtUnfSF       TotalBsmtSF       Heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   HeatingQC          CentralAir         Electrical          X1stFlrSF   
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##    X2ndFlrSF     LowQualFinSF       GrLivArea     BsmtFullBath   
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##   BsmtHalfBath        FullBath        HalfBath       BedroomAbvGr  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##   KitchenAbvGr   KitchenQual         TotRmsAbvGrd     Functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    Fireplaces    FireplaceQu         GarageType         GarageYrBlt  
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  GarageFinish         GarageCars      GarageArea      GarageQual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##   GarageCond         PavedDrive          WoodDeckSF      OpenPorchSF    
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  EnclosedPorch      X3SsnPorch      ScreenPorch        PoolArea      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##     PoolQC             Fence           MiscFeature           MiscVal        
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##      MoSold           YrSold       SaleType         SaleCondition     
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    SalePrice     
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
# Frequency table of sales per year; the outer parentheses print the result.
(freq <- table(train$YrSold))
## 
## 2006 2007 2008 2009 2010 
##  314  329  304  338  175
print("Cumulative Frequency Table")
## [1] "Cumulative Frequency Table"
# Stored as cum_freq (not cumsum) so the base function cumsum() is not
# shadowed by a data object of the same name.
cum_freq <- cumsum(freq)
print(cum_freq)
## 2006 2007 2008 2009 2010 
##  314  643  947 1285 1460
print("Relative Frequency Table")
## [1] "Relative Frequency Table"
# Share of all sales that fall in each year.
prob <- prop.table(freq)
print(prob)
## 
##      2006      2007      2008      2009      2010 
## 0.2150685 0.2253425 0.2082192 0.2315068 0.1198630

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

# Scatterplot matrix of the response (SalePrice) and four numeric predictors.
pairs(
  SalePrice ~ YearBuilt + OverallQual + TotalBsmtSF + GrLivArea,
  data = train,
  main = "Simple Scatterplot Matrix"
)

Derive a correlation matrix for any three quantitative variables in the dataset.

# Keep the three quantitative variables of interest, then compute their
# pairwise correlation matrix and print it.
correlation_three <- train %>%
  dplyr::select(YearBuilt, OverallQual, GrLivArea)
corr_matrix <- cor(correlation_three)
corr_matrix
##             YearBuilt OverallQual GrLivArea
## YearBuilt   1.0000000   0.5723228 0.1990097
## OverallQual 0.5723228   1.0000000 0.5930074
## GrLivArea   0.1990097   0.5930074 1.0000000

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

\[ H_0:\rho=0 \ (\text{there is no linear relationship}) \\ H_1:\rho\neq 0 \ (\text{there is a linear relationship}) \]

# Quantile-comparison plot of YearBuilt
qqPlot(train$YearBuilt, ylab = "YearBuilt")

## [1] 1350 1138
# Quantile-comparison plot of OverallQual
qqPlot(train$OverallQual, ylab = "Overall Quality")

## [1] 376 534

Using Pearson’s correlation test:

# Test the correlation between OverallQual and GrLivArea (the 2nd and 3rd
# columns of correlation_three) and report an 80% confidence interval.
(test1 <- cor.test(formula = ~ OverallQual + GrLivArea,
                      data = correlation_three,
                      method = "pearson",
                      conf.level = 0.80))
## 
##  Pearson's product-moment correlation
## 
## data:  OverallQual and GrLivArea
## t = 28.121, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5708061 0.6143422
## sample estimates:
##       cor 
## 0.5930074
# TRUE when the p-value is below 0.05.
# NOTE(review): the interval above uses conf.level = 0.80 while this check
# uses a 0.05 threshold - confirm the mismatch is intended.
test1$p.value <0.05
## [1] TRUE
# Test the correlation between OverallQual and YearBuilt (the 2nd and 1st
# columns of correlation_three) and report an 80% confidence interval.
(test2 <- cor.test(formula = ~ OverallQual + YearBuilt,
                      data = correlation_three,
                      method = "pearson",
                      conf.level = 0.80))
## 
##  Pearson's product-moment correlation
## 
## data:  OverallQual and YearBuilt
## t = 26.65, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5493124 0.5944659
## sample estimates:
##       cor 
## 0.5723228
# TRUE when the p-value is below 0.05.
test2$p.value <0.05
## [1] TRUE
# Test the correlation between YearBuilt and GrLivArea (the 1st and 3rd
# columns of correlation_three) and report an 80% confidence interval.
(test3 <- cor.test(formula = ~ YearBuilt + GrLivArea,
                      data = correlation_three,
                      method = "pearson",
                      conf.level = 0.80))
## 
##  Pearson's product-moment correlation
## 
## data:  YearBuilt and GrLivArea
## t = 7.754, df = 1458, p-value = 1.66e-14
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.1665605 0.2310283
## sample estimates:
##       cor 
## 0.1990097
# TRUE when the p-value is below 0.05.
test3$p.value <0.05
## [1] TRUE

Discuss the meaning of your analysis. Would you be worried about familywise error? Why or why not?

When we perform one hypothesis test, the type I error rate is equal to the significance level (\(\alpha\)), which is commonly chosen to be 0.01, 0.05, or 0.10. However, when we conduct multiple hypothesis tests at once, the probability of getting a false positive increases. So we do have to worry about family wise error.

The familywise error rate (FWER) is defined below.

\[ 1-(1-\alpha)^n \]

Where \(\alpha\) is the significance level of each individual test and \(n\) is the total number of tests.

# Familywise error rate for three tests, each at significance level 0.05.
1 - (1 - 0.05)^3
## [1] 0.142625

In other words, the probability of getting a type I error on at least one of the hypothesis tests is 14.26%!

Task 2 Linear Algebra and Correlation.

Invert your correlation matrix from above.

(This is known as the precision matrix and contains variance inflation factors on the diagonal.)

# Invert the correlation matrix, then print the inverse.
precision <- solve(corr_matrix)
precision
##             YearBuilt OverallQual GrLivArea
## YearBuilt    1.557510   -1.091384  0.337239
## OverallQual -1.091384    2.307153 -1.150963
## GrLivArea    0.337239   -1.150963  1.615416

Multiply the correlation matrix by the precision matrix, and then multiply the precision matrix by the correlation matrix.

# Correlation matrix times its inverse; the recorded output below is the
# identity matrix up to floating-point error.
corr_matrix %*% precision
##                 YearBuilt   OverallQual GrLivArea
## YearBuilt    1.000000e+00 -8.326673e-17         0
## OverallQual -8.326673e-17  1.000000e+00         0
## GrLivArea    0.000000e+00  0.000000e+00         1
# Same product with the factors in the opposite order.
precision %*% corr_matrix
##                 YearBuilt   OverallQual GrLivArea
## YearBuilt    1.000000e+00 -8.326673e-17         0
## OverallQual -8.326673e-17  1.000000e+00         0
## GrLivArea    0.000000e+00  0.000000e+00         1

Conduct LU decomposition on the matrix.

# Factor the correlation matrix into lower (L) and upper (U) triangular parts.
matrixcalc::lu.decomposition(corr_matrix)
## $L
##           [,1]      [,2] [,3]
## [1,] 1.0000000 0.0000000    0
## [2,] 0.5723228 1.0000000    0
## [3,] 0.1990097 0.7124872    1
## 
## $U
##      [,1]          [,2]      [,3]
## [1,]    1  5.723228e-01 0.1990097
## [2,]    0  6.724466e-01 0.4791096
## [3,]    0 -5.551115e-17 0.6190356

Task 3 Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data.

Select a variable in the Kaggle.com training dataset that is skewed to the right.

Shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function.

(See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

# Faceted histograms of several numeric variables, used to pick one that is
# skewed to the right. Each facet gets its own axis scales.
train %>%
  dplyr::select(
    BsmtFinSF1, BsmtUnfSF, GrLivArea, LotFrontage,
    MasVnrArea, OpenPorchSF, X1stFlrSF
  ) %>%
  # pivot_longer() replaces the superseded gather(); the key/value column
  # names match gather()'s defaults so the plotting code is unchanged.
  pivot_longer(everything(), names_to = "key", values_to = "value") %>%
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 267 rows containing non-finite values (`stat_bin()`).

I am going to select X1stFlrSF column for this question.

# Five-number summary plus mean for the chosen variable.
summary(train[["X1stFlrSF"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     334     882    1087    1163    1391    4692

Histogram of chosen variable:

# Density-scaled histogram of the observed first-floor square footage.
# The title names the variable: this plot shows the empirical data, not the
# fitted exponential distribution.
hist(train$X1stFlrSF, freq = FALSE, col = "lightblue",
     main = "Histogram of X1stFlrSF", xlab = "X1stFlrSF", ylab = "Density")

# Normal Q-Q plot of the same variable with a red reference line.
x <- train$X1stFlrSF
qqnorm(x)
qqline(x, col = "red")

# Fit an exponential distribution to x by maximum likelihood and print the fit.
exp_df <- fitdistr(x, densfun = "exponential")
exp_df
##        rate    
##   0.0008601213 
##  (0.0000225104)

Find the optimal value of lambda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, )).

# Take the rate straight from the fitted object instead of retyping its
# rounded printout, so lambda always matches the fit.
lambda <- unname(exp_df$estimate["rate"])
# Draw 1000 samples from the fitted exponential distribution.
X <- rexp(1000, rate = lambda)

Plot a histogram and compare it with a histogram of your original variable.

# Density-scaled histogram of the simulated sample, with the exponential
# density for the same rate drawn on top in red.
hist(
  X,
  freq = FALSE,
  col = "lightblue",
  main = "Exponential Distribution",
  xlab = "X",
  ylab = "Density"
)
curve(dexp(x, rate = lambda), col = "red", lwd = 2, add = TRUE)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

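The PDF is the derivative of the CDF, so the exponential PDF can be obtained by differentiating its CDF, \(F(x) = P(X \le x) = 1 - P(X > x) = 1 - e^{-\lambda x}\). To find the \(p\)th percentile we set \(F(x) = p\) and solve for \(x\), which gives \(x = -\ln(1 - p)/\lambda\); this is the value that qexp() returns.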

# 5th percentile of the fitted exponential distribution.
qexp(p = 0.05, rate = lambda)
## [1] 59.63495
# 95th percentile of the fitted exponential distribution.
qexp(p = 0.95, rate = lambda)
## [1] 3482.918

Also generate a 95% confidence interval from the empirical data, assuming normality.

# 95% confidence interval for the mean of X1stFlrSF, assuming normality.
z <- 1.96  # rounded two-sided 95% critical value of the standard normal
n <- length(train$X1stFlrSF)
# Stored as x_mean / x_sd so the functions mean() and sd() are not shadowed
# by data objects of the same name.
x_mean <- mean(train$X1stFlrSF)

x_sd <- sd(train$X1stFlrSF)
# Half-width of the interval: critical value times the standard error.
margin <- z * x_sd / sqrt(n)
upper_bound <- round(x_mean + margin, 4)
upper_bound
## [1] 1182.457
lower_bound <- round(x_mean - margin, 4)
lower_bound
## [1] 1142.796

The 95% confidence interval for the mean of X1stFlrSF has a lower bound of 1142.796 and an upper bound of 1182.457.

Finally, provide the empirical 5th percentile and 95th percentile of the data.

# Empirical 5th percentile of the observed data.
quantile(train$X1stFlrSF, probs = 0.05)
##     5% 
## 672.95
# Empirical 95th percentile of the observed data.
quantile(train$X1stFlrSF, probs = 0.95)
##     95% 
## 1831.25

Discuss.

Task 4

10 points. Modeling.

Build some type of multiple regression model and submit your model to the competition board. Provide your complete model summary and results with analysis.

First I will look at the train dataset.

# Show the first six rows of the training data.
head(train, n = 6L)
##   Id MSSubClass MSZoning LotFrontage LotArea Street Alley LotShape LandContour
## 1  1         60       RL          65    8450   Pave  <NA>      Reg         Lvl
## 2  2         20       RL          80    9600   Pave  <NA>      Reg         Lvl
## 3  3         60       RL          68   11250   Pave  <NA>      IR1         Lvl
## 4  4         70       RL          60    9550   Pave  <NA>      IR1         Lvl
## 5  5         60       RL          84   14260   Pave  <NA>      IR1         Lvl
## 6  6         50       RL          85   14115   Pave  <NA>      IR1         Lvl
##   Utilities LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType
## 1    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 2    AllPub       FR2       Gtl      Veenker      Feedr       Norm     1Fam
## 3    AllPub    Inside       Gtl      CollgCr       Norm       Norm     1Fam
## 4    AllPub    Corner       Gtl      Crawfor       Norm       Norm     1Fam
## 5    AllPub       FR2       Gtl      NoRidge       Norm       Norm     1Fam
## 6    AllPub    Inside       Gtl      Mitchel       Norm       Norm     1Fam
##   HouseStyle OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl
## 1     2Story           7           5      2003         2003     Gable  CompShg
## 2     1Story           6           8      1976         1976     Gable  CompShg
## 3     2Story           7           5      2001         2002     Gable  CompShg
## 4     2Story           7           5      1915         1970     Gable  CompShg
## 5     2Story           8           5      2000         2000     Gable  CompShg
## 6     1.5Fin           5           5      1993         1995     Gable  CompShg
##   Exterior1st Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation
## 1     VinylSd     VinylSd    BrkFace        196        Gd        TA      PConc
## 2     MetalSd     MetalSd       None          0        TA        TA     CBlock
## 3     VinylSd     VinylSd    BrkFace        162        Gd        TA      PConc
## 4     Wd Sdng     Wd Shng       None          0        TA        TA     BrkTil
## 5     VinylSd     VinylSd    BrkFace        350        Gd        TA      PConc
## 6     VinylSd     VinylSd       None          0        TA        TA       Wood
##   BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2
## 1       Gd       TA           No          GLQ        706          Unf
## 2       Gd       TA           Gd          ALQ        978          Unf
## 3       Gd       TA           Mn          GLQ        486          Unf
## 4       TA       Gd           No          ALQ        216          Unf
## 5       Gd       TA           Av          GLQ        655          Unf
## 6       Gd       TA           No          GLQ        732          Unf
##   BsmtFinSF2 BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical
## 1          0       150         856    GasA        Ex          Y      SBrkr
## 2          0       284        1262    GasA        Ex          Y      SBrkr
## 3          0       434         920    GasA        Ex          Y      SBrkr
## 4          0       540         756    GasA        Gd          Y      SBrkr
## 5          0       490        1145    GasA        Ex          Y      SBrkr
## 6          0        64         796    GasA        Ex          Y      SBrkr
##   X1stFlrSF X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath
## 1       856       854            0      1710            1            0        2
## 2      1262         0            0      1262            0            1        2
## 3       920       866            0      1786            1            0        2
## 4       961       756            0      1717            1            0        1
## 5      1145      1053            0      2198            1            0        2
## 6       796       566            0      1362            1            0        1
##   HalfBath BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional
## 1        1            3            1          Gd            8        Typ
## 2        0            3            1          TA            6        Typ
## 3        1            3            1          Gd            6        Typ
## 4        0            3            1          Gd            7        Typ
## 5        1            4            1          Gd            9        Typ
## 6        1            1            1          TA            5        Typ
##   Fireplaces FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars
## 1          0        <NA>     Attchd        2003          RFn          2
## 2          1          TA     Attchd        1976          RFn          2
## 3          1          TA     Attchd        2001          RFn          2
## 4          1          Gd     Detchd        1998          Unf          3
## 5          1          TA     Attchd        2000          RFn          3
## 6          0        <NA>     Attchd        1993          Unf          2
##   GarageArea GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF
## 1        548         TA         TA          Y          0          61
## 2        460         TA         TA          Y        298           0
## 3        608         TA         TA          Y          0          42
## 4        642         TA         TA          Y          0          35
## 5        836         TA         TA          Y        192          84
## 6        480         TA         TA          Y         40          30
##   EnclosedPorch X3SsnPorch ScreenPorch PoolArea PoolQC Fence MiscFeature
## 1             0          0           0        0   <NA>  <NA>        <NA>
## 2             0          0           0        0   <NA>  <NA>        <NA>
## 3             0          0           0        0   <NA>  <NA>        <NA>
## 4           272          0           0        0   <NA>  <NA>        <NA>
## 5             0          0           0        0   <NA>  <NA>        <NA>
## 6             0        320           0        0   <NA> MnPrv        Shed
##   MiscVal MoSold YrSold SaleType SaleCondition SalePrice
## 1       0      2   2008       WD        Normal    208500
## 2       0      5   2007       WD        Normal    181500
## 3       0      9   2008       WD        Normal    223500
## 4       0      2   2006       WD       Abnorml    140000
## 5       0     12   2008       WD        Normal    250000
## 6     700     10   2009       WD        Normal    143000

Next I will check if there are NA values in the data set.

# Count the missing values in each column.
train %>%
  is.na() %>%
  colSums()
##            Id    MSSubClass      MSZoning   LotFrontage       LotArea 
##             0             0             0           259             0 
##        Street         Alley      LotShape   LandContour     Utilities 
##             0          1369             0             0             0 
##     LotConfig     LandSlope  Neighborhood    Condition1    Condition2 
##             0             0             0             0             0 
##      BldgType    HouseStyle   OverallQual   OverallCond     YearBuilt 
##             0             0             0             0             0 
##  YearRemodAdd     RoofStyle      RoofMatl   Exterior1st   Exterior2nd 
##             0             0             0             0             0 
##    MasVnrType    MasVnrArea     ExterQual     ExterCond    Foundation 
##             8             8             0             0             0 
##      BsmtQual      BsmtCond  BsmtExposure  BsmtFinType1    BsmtFinSF1 
##            37            37            38            37             0 
##  BsmtFinType2    BsmtFinSF2     BsmtUnfSF   TotalBsmtSF       Heating 
##            38             0             0             0             0 
##     HeatingQC    CentralAir    Electrical     X1stFlrSF     X2ndFlrSF 
##             0             0             1             0             0 
##  LowQualFinSF     GrLivArea  BsmtFullBath  BsmtHalfBath      FullBath 
##             0             0             0             0             0 
##      HalfBath  BedroomAbvGr  KitchenAbvGr   KitchenQual  TotRmsAbvGrd 
##             0             0             0             0             0 
##    Functional    Fireplaces   FireplaceQu    GarageType   GarageYrBlt 
##             0             0           690            81            81 
##  GarageFinish    GarageCars    GarageArea    GarageQual    GarageCond 
##            81             0             0            81            81 
##    PavedDrive    WoodDeckSF   OpenPorchSF EnclosedPorch    X3SsnPorch 
##             0             0             0             0             0 
##   ScreenPorch      PoolArea        PoolQC         Fence   MiscFeature 
##             0             0          1453          1179          1406 
##       MiscVal        MoSold        YrSold      SaleType SaleCondition 
##             0             0             0             0             0 
##     SalePrice 
##             0

I will drop some columns that are mostly empty, along with Id, which is not needed for modeling, and save the result into the data frame training.

# Drop the identifier and two mostly-missing columns.
training <- dplyr::select(train, -c(Id, PoolQC, Alley))

Replace the remaining missing values with 0

# Fill every remaining NA with 0, then preview the result.
# NOTE(review): this also fills NAs in the character columns - confirm that
# is intended.
training <- replace(training, is.na(training), 0)
head(training)
##   MSSubClass MSZoning LotFrontage LotArea Street LotShape LandContour Utilities
## 1         60       RL          65    8450   Pave      Reg         Lvl    AllPub
## 2         20       RL          80    9600   Pave      Reg         Lvl    AllPub
## 3         60       RL          68   11250   Pave      IR1         Lvl    AllPub
## 4         70       RL          60    9550   Pave      IR1         Lvl    AllPub
## 5         60       RL          84   14260   Pave      IR1         Lvl    AllPub
## 6         50       RL          85   14115   Pave      IR1         Lvl    AllPub
##   LotConfig LandSlope Neighborhood Condition1 Condition2 BldgType HouseStyle
## 1    Inside       Gtl      CollgCr       Norm       Norm     1Fam     2Story
## 2       FR2       Gtl      Veenker      Feedr       Norm     1Fam     1Story
## 3    Inside       Gtl      CollgCr       Norm       Norm     1Fam     2Story
## 4    Corner       Gtl      Crawfor       Norm       Norm     1Fam     2Story
## 5       FR2       Gtl      NoRidge       Norm       Norm     1Fam     2Story
## 6    Inside       Gtl      Mitchel       Norm       Norm     1Fam     1.5Fin
##   OverallQual OverallCond YearBuilt YearRemodAdd RoofStyle RoofMatl Exterior1st
## 1           7           5      2003         2003     Gable  CompShg     VinylSd
## 2           6           8      1976         1976     Gable  CompShg     MetalSd
## 3           7           5      2001         2002     Gable  CompShg     VinylSd
## 4           7           5      1915         1970     Gable  CompShg     Wd Sdng
## 5           8           5      2000         2000     Gable  CompShg     VinylSd
## 6           5           5      1993         1995     Gable  CompShg     VinylSd
##   Exterior2nd MasVnrType MasVnrArea ExterQual ExterCond Foundation BsmtQual
## 1     VinylSd    BrkFace        196        Gd        TA      PConc       Gd
## 2     MetalSd       None          0        TA        TA     CBlock       Gd
## 3     VinylSd    BrkFace        162        Gd        TA      PConc       Gd
## 4     Wd Shng       None          0        TA        TA     BrkTil       TA
## 5     VinylSd    BrkFace        350        Gd        TA      PConc       Gd
## 6     VinylSd       None          0        TA        TA       Wood       Gd
##   BsmtCond BsmtExposure BsmtFinType1 BsmtFinSF1 BsmtFinType2 BsmtFinSF2
## 1       TA           No          GLQ        706          Unf          0
## 2       TA           Gd          ALQ        978          Unf          0
## 3       TA           Mn          GLQ        486          Unf          0
## 4       Gd           No          ALQ        216          Unf          0
## 5       TA           Av          GLQ        655          Unf          0
## 6       TA           No          GLQ        732          Unf          0
##   BsmtUnfSF TotalBsmtSF Heating HeatingQC CentralAir Electrical X1stFlrSF
## 1       150         856    GasA        Ex          Y      SBrkr       856
## 2       284        1262    GasA        Ex          Y      SBrkr      1262
## 3       434         920    GasA        Ex          Y      SBrkr       920
## 4       540         756    GasA        Gd          Y      SBrkr       961
## 5       490        1145    GasA        Ex          Y      SBrkr      1145
## 6        64         796    GasA        Ex          Y      SBrkr       796
##   X2ndFlrSF LowQualFinSF GrLivArea BsmtFullBath BsmtHalfBath FullBath HalfBath
## 1       854            0      1710            1            0        2        1
## 2         0            0      1262            0            1        2        0
## 3       866            0      1786            1            0        2        1
## 4       756            0      1717            1            0        1        0
## 5      1053            0      2198            1            0        2        1
## 6       566            0      1362            1            0        1        1
##   BedroomAbvGr KitchenAbvGr KitchenQual TotRmsAbvGrd Functional Fireplaces
## 1            3            1          Gd            8        Typ          0
## 2            3            1          TA            6        Typ          1
## 3            3            1          Gd            6        Typ          1
## 4            3            1          Gd            7        Typ          1
## 5            4            1          Gd            9        Typ          1
## 6            1            1          TA            5        Typ          0
##   FireplaceQu GarageType GarageYrBlt GarageFinish GarageCars GarageArea
## 1           0     Attchd        2003          RFn          2        548
## 2          TA     Attchd        1976          RFn          2        460
## 3          TA     Attchd        2001          RFn          2        608
## 4          Gd     Detchd        1998          Unf          3        642
## 5          TA     Attchd        2000          RFn          3        836
## 6           0     Attchd        1993          Unf          2        480
##   GarageQual GarageCond PavedDrive WoodDeckSF OpenPorchSF EnclosedPorch
## 1         TA         TA          Y          0          61             0
## 2         TA         TA          Y        298           0             0
## 3         TA         TA          Y          0          42             0
## 4         TA         TA          Y          0          35           272
## 5         TA         TA          Y        192          84             0
## 6         TA         TA          Y         40          30             0
##   X3SsnPorch ScreenPorch PoolArea Fence MiscFeature MiscVal MoSold YrSold
## 1          0           0        0     0           0       0      2   2008
## 2          0           0        0     0           0       0      5   2007
## 3          0           0        0     0           0       0      9   2008
## 4          0           0        0     0           0       0      2   2006
## 5          0           0        0     0           0       0     12   2008
## 6        320           0        0 MnPrv        Shed     700     10   2009
##   SaleType SaleCondition SalePrice
## 1       WD        Normal    208500
## 2       WD        Normal    181500
## 3       WD        Normal    223500
## 4       WD       Abnorml    140000
## 5       WD        Normal    250000
## 6       WD        Normal    143000

First I will build a full model and save it into house.prices.lm.

# Fit the full model: SalePrice regressed on every other column of `training`.
house.prices.lm <- lm(formula = SalePrice ~ ., data = training)

The summary below shows all the p-values of the columns.

# Print the coefficient table and overall fit statistics for the full model.
summary(house.prices.lm)
## 
## Call:
## lm(formula = SalePrice ~ ., data = training)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179735   -9252      36    9580  179735 
## 
## Coefficients: (8 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -5.330e+05  1.053e+06  -0.506 0.612718    
## MSSubClass           -4.832e+01  8.290e+01  -0.583 0.560141    
## MSZoningFV            3.344e+04  1.199e+04   2.788 0.005382 ** 
## MSZoningRH            2.311e+04  1.192e+04   1.938 0.052818 .  
## MSZoningRL            2.584e+04  1.024e+04   2.524 0.011742 *  
## MSZoningRM            2.253e+04  9.582e+03   2.351 0.018890 *  
## LotFrontage           8.328e+00  2.297e+01   0.363 0.717038    
## LotArea               7.405e-01  1.091e-01   6.790 1.75e-11 ***
## StreetPave            3.322e+04  1.222e+04   2.719 0.006635 ** 
## LotShapeIR2           4.416e+03  4.229e+03   1.044 0.296632    
## LotShapeIR3           5.541e+03  8.847e+03   0.626 0.531267    
## LotShapeReg           1.458e+03  1.642e+03   0.888 0.374811    
## LandContourHLS        7.585e+03  5.149e+03   1.473 0.140971    
## LandContourLow       -1.073e+04  6.426e+03  -1.669 0.095334 .  
## LandContourLvl        5.285e+03  3.703e+03   1.427 0.153759    
## UtilitiesNoSeWa      -3.377e+04  2.646e+04  -1.276 0.202096    
## LotConfigCulDSac      7.876e+03  3.262e+03   2.414 0.015911 *  
## LotConfigFR2         -7.870e+03  4.012e+03  -1.962 0.050046 .  
## LotConfigFR3         -1.735e+04  1.259e+04  -1.378 0.168605    
## LotConfigInside      -1.821e+03  1.759e+03  -1.035 0.300878    
## LandSlopeMod          7.368e+03  3.984e+03   1.849 0.064677 .  
## LandSlopeSev         -4.434e+04  1.140e+04  -3.889 0.000106 ***
## NeighborhoodBlueste   8.738e+03  1.929e+04   0.453 0.650643    
## NeighborhoodBrDale   -1.548e+03  1.101e+04  -0.141 0.888262    
## NeighborhoodBrkSide  -5.112e+03  9.452e+03  -0.541 0.588705    
## NeighborhoodClearCr  -1.616e+04  9.218e+03  -1.753 0.079917 .  
## NeighborhoodCollgCr  -1.021e+04  7.260e+03  -1.406 0.159922    
## NeighborhoodCrawfor   1.109e+04  8.566e+03   1.294 0.195881    
## NeighborhoodEdwards  -2.068e+04  8.006e+03  -2.583 0.009910 ** 
## NeighborhoodGilbert  -1.121e+04  7.674e+03  -1.461 0.144410    
## NeighborhoodIDOTRR   -1.184e+04  1.075e+04  -1.102 0.270805    
## NeighborhoodMeadowV  -6.379e+03  1.121e+04  -0.569 0.569373    
## NeighborhoodMitchel  -2.224e+04  8.169e+03  -2.722 0.006580 ** 
## NeighborhoodNAmes    -1.637e+04  7.851e+03  -2.085 0.037323 *  
## NeighborhoodNoRidge   2.479e+04  8.442e+03   2.936 0.003389 ** 
## NeighborhoodNPkVill   1.424e+04  1.408e+04   1.011 0.312003    
## NeighborhoodNridgHt   1.799e+04  7.535e+03   2.388 0.017101 *  
## NeighborhoodNWAmes   -1.767e+04  8.007e+03  -2.206 0.027550 *  
## NeighborhoodOldTown  -1.429e+04  9.595e+03  -1.489 0.136651    
## NeighborhoodSawyer   -1.023e+04  8.128e+03  -1.259 0.208237    
## NeighborhoodSawyerW  -3.334e+03  7.801e+03  -0.427 0.669194    
## NeighborhoodSomerst  -2.578e+03  9.018e+03  -0.286 0.775063    
## NeighborhoodStoneBr   3.799e+04  8.312e+03   4.571 5.35e-06 ***
## NeighborhoodSWISU    -9.284e+03  9.687e+03  -0.958 0.338003    
## NeighborhoodTimber   -1.051e+04  8.148e+03  -1.290 0.197176    
## NeighborhoodVeenker   5.105e+01  1.051e+04   0.005 0.996126    
## Condition1Feedr       6.387e+03  5.005e+03   1.276 0.202168    
## Condition1Norm        1.555e+04  4.175e+03   3.724 0.000205 ***
## Condition1PosA        8.971e+03  9.960e+03   0.901 0.367926    
## Condition1PosN        1.379e+04  7.476e+03   1.845 0.065339 .  
## Condition1RRAe       -1.663e+04  9.079e+03  -1.832 0.067151 .  
## Condition1RRAn        1.233e+04  6.955e+03   1.772 0.076640 .  
## Condition1RRNe       -4.010e+03  1.753e+04  -0.229 0.819136    
## Condition1RRNn        1.067e+04  1.287e+04   0.829 0.407027    
## Condition2Feedr      -4.753e+03  2.350e+04  -0.202 0.839715    
## Condition2Norm       -7.085e+03  2.033e+04  -0.348 0.727579    
## Condition2PosA        4.128e+04  3.710e+04   1.113 0.265985    
## Condition2PosN       -2.382e+05  2.773e+04  -8.592  < 2e-16 ***
## Condition2RRAe       -1.252e+05  6.531e+04  -1.918 0.055382 .  
## Condition2RRAn       -1.826e+04  3.159e+04  -0.578 0.563234    
## Condition2RRNn        2.633e+02  2.714e+04   0.010 0.992263    
## BldgType2fmCon       -3.386e+03  1.249e+04  -0.271 0.786375    
## BldgTypeDuplex       -7.856e+03  7.397e+03  -1.062 0.288390    
## BldgTypeTwnhs        -1.953e+04  9.939e+03  -1.965 0.049652 *  
## BldgTypeTwnhsE       -1.529e+04  8.957e+03  -1.707 0.088137 .  
## HouseStyle1.5Unf      1.374e+04  7.933e+03   1.732 0.083578 .  
## HouseStyle1Story      7.419e+03  4.350e+03   1.706 0.088351 .  
## HouseStyle2.5Fin     -2.205e+04  1.232e+04  -1.789 0.073877 .  
## HouseStyle2.5Unf     -1.038e+04  9.258e+03  -1.121 0.262416    
## HouseStyle2Story     -6.006e+03  3.508e+03  -1.712 0.087102 .  
## HouseStyleSFoyer      2.408e+03  6.270e+03   0.384 0.701010    
## HouseStyleSLvl        4.745e+03  5.581e+03   0.850 0.395379    
## OverallQual           6.781e+03  1.015e+03   6.682 3.59e-11 ***
## OverallCond           5.663e+03  8.735e+02   6.483 1.30e-10 ***
## YearBuilt             3.215e+02  7.695e+01   4.177 3.16e-05 ***
## YearRemodAdd          1.036e+02  5.570e+01   1.861 0.063033 .  
## RoofStyleGable        6.160e+03  1.843e+04   0.334 0.738314    
## RoofStyleGambrel      8.545e+03  2.016e+04   0.424 0.671688    
## RoofStyleHip          5.766e+03  1.850e+04   0.312 0.755397    
## RoofStyleMansard      1.648e+04  2.147e+04   0.767 0.443068    
## RoofStyleShed         1.011e+05  3.463e+04   2.920 0.003561 ** 
## RoofMatlCompShg       6.895e+05  3.337e+04  20.662  < 2e-16 ***
## RoofMatlMembran       7.853e+05  4.780e+04  16.429  < 2e-16 ***
## RoofMatlMetal         7.547e+05  4.684e+04  16.114  < 2e-16 ***
## RoofMatlRoll          6.762e+05  4.191e+04  16.135  < 2e-16 ***
## RoofMatlTar&Grv       6.926e+05  3.812e+04  18.170  < 2e-16 ***
## RoofMatlWdShake       6.811e+05  3.692e+04  18.446  < 2e-16 ***
## RoofMatlWdShngl       7.433e+05  3.452e+04  21.530  < 2e-16 ***
## Exterior1stAsphShn   -2.583e+04  3.309e+04  -0.780 0.435253    
## Exterior1stBrkComm   -7.723e+03  2.779e+04  -0.278 0.781143    
## Exterior1stBrkFace    5.717e+03  1.277e+04   0.448 0.654368    
## Exterior1stCBlock    -1.558e+04  2.726e+04  -0.571 0.567889    
## Exterior1stCemntBd   -1.370e+04  1.905e+04  -0.719 0.472255    
## Exterior1stHdBoard   -1.510e+04  1.294e+04  -1.166 0.243729    
## Exterior1stImStucc   -3.048e+04  2.814e+04  -1.083 0.278994    
## Exterior1stMetalSd   -7.333e+03  1.460e+04  -0.502 0.615515    
## Exterior1stPlywood   -1.691e+04  1.275e+04  -1.326 0.185230    
## Exterior1stStone     -4.617e+03  2.438e+04  -0.189 0.849814    
## Exterior1stStucco    -8.160e+03  1.411e+04  -0.578 0.563081    
## Exterior1stVinylSd   -1.613e+04  1.329e+04  -1.214 0.225134    
## Exterior1stWd Sdng   -1.501e+04  1.238e+04  -1.212 0.225886    
## Exterior1stWdShing   -1.176e+04  1.333e+04  -0.882 0.377902    
## Exterior2ndAsphShn    1.310e+04  2.227e+04   0.588 0.556343    
## Exterior2ndBrk Cmn    7.080e+03  2.012e+04   0.352 0.724959    
## Exterior2ndBrkFace    5.033e+03  1.325e+04   0.380 0.704173    
## Exterior2ndCBlock            NA         NA      NA       NA    
## Exterior2ndCmentBd    1.354e+04  1.875e+04   0.722 0.470344    
## Exterior2ndHdBoard    9.496e+03  1.245e+04   0.763 0.445661    
## Exterior2ndImStucc    2.506e+04  1.425e+04   1.759 0.078907 .  
## Exterior2ndMetalSd    6.336e+03  1.423e+04   0.445 0.656094    
## Exterior2ndOther     -1.607e+04  2.718e+04  -0.591 0.554557    
## Exterior2ndPlywood    8.309e+03  1.207e+04   0.688 0.491318    
## Exterior2ndStone     -1.007e+04  1.727e+04  -0.583 0.559882    
## Exterior2ndStucco     7.749e+03  1.358e+04   0.571 0.568368    
## Exterior2ndVinylSd    1.437e+04  1.280e+04   1.122 0.262106    
## Exterior2ndWd Sdng    1.233e+04  1.196e+04   1.031 0.302752    
## Exterior2ndWd Shng    6.307e+03  1.247e+04   0.506 0.613010    
## MasVnrTypeBrkCmn     -2.599e+03  1.094e+04  -0.238 0.812218    
## MasVnrTypeBrkFace     2.086e+03  8.781e+03   0.238 0.812289    
## MasVnrTypeNone        5.183e+03  8.622e+03   0.601 0.547838    
## MasVnrTypeStone       6.878e+03  8.859e+03   0.776 0.437652    
## MasVnrArea            1.956e+01  5.775e+00   3.387 0.000729 ***
## ExterQualFa          -7.070e+03  1.109e+04  -0.638 0.523914    
## ExterQualGd          -2.010e+04  4.802e+03  -4.186 3.04e-05 ***
## ExterQualTA          -1.981e+04  5.321e+03  -3.724 0.000205 ***
## ExterCondFa          -2.889e+03  1.807e+04  -0.160 0.873011    
## ExterCondGd          -6.996e+03  1.726e+04  -0.405 0.685238    
## ExterCondPo           4.888e+03  3.175e+04   0.154 0.877691    
## ExterCondTA          -4.406e+03  1.721e+04  -0.256 0.797995    
## FoundationCBlock      3.106e+03  3.172e+03   0.979 0.327703    
## FoundationPConc       4.272e+03  3.429e+03   1.246 0.213073    
## FoundationSlab       -7.401e+03  1.007e+04  -0.735 0.462499    
## FoundationStone       8.826e+03  1.131e+04   0.780 0.435355    
## FoundationWood       -2.649e+04  1.479e+04  -1.791 0.073609 .  
## BsmtQualEx           -3.168e+04  3.637e+04  -0.871 0.383814    
## BsmtQualFa           -4.473e+04  3.616e+04  -1.237 0.216320    
## BsmtQualGd           -5.018e+04  3.614e+04  -1.389 0.165227    
## BsmtQualTA           -4.683e+04  3.604e+04  -1.299 0.194031    
## BsmtCondFa           -2.915e+03  4.252e+03  -0.686 0.493087    
## BsmtCondGd           -3.357e+03  3.245e+03  -1.034 0.301131    
## BsmtCondPo            6.941e+04  3.010e+04   2.306 0.021254 *  
## BsmtCondTA                   NA         NA      NA       NA    
## BsmtExposureAv        1.172e+04  2.308e+04   0.508 0.611763    
## BsmtExposureGd        2.507e+04  2.317e+04   1.082 0.279540    
## BsmtExposureMn        7.637e+03  2.315e+04   0.330 0.741543    
## BsmtExposureNo        6.200e+03  2.304e+04   0.269 0.787857    
## BsmtFinType1ALQ      -3.485e+03  2.919e+03  -1.194 0.232690    
## BsmtFinType1BLQ      -4.870e+02  3.131e+03  -0.156 0.876432    
## BsmtFinType1GLQ       2.706e+03  2.714e+03   0.997 0.318887    
## BsmtFinType1LwQ      -6.724e+03  3.780e+03  -1.779 0.075526 .  
## BsmtFinType1Rec      -3.255e+03  3.182e+03  -1.023 0.306439    
## BsmtFinType1Unf              NA         NA      NA       NA    
## BsmtFinSF1            3.936e+01  5.322e+00   7.395 2.63e-13 ***
## BsmtFinType2ALQ       2.918e+04  2.506e+04   1.165 0.244438    
## BsmtFinType2BLQ       1.675e+04  2.481e+04   0.675 0.499672    
## BsmtFinType2GLQ       2.680e+04  2.559e+04   1.047 0.295126    
## BsmtFinType2LwQ       1.494e+04  2.483e+04   0.601 0.547622    
## BsmtFinType2Rec       1.926e+04  2.478e+04   0.777 0.437215    
## BsmtFinType2Unf       2.073e+04  2.472e+04   0.839 0.401908    
## BsmtFinSF2            3.125e+01  9.091e+00   3.437 0.000609 ***
## BsmtUnfSF             2.080e+01  4.881e+00   4.262 2.19e-05 ***
## TotalBsmtSF                  NA         NA      NA       NA    
## HeatingGasA           9.172e+03  2.567e+04   0.357 0.720891    
## HeatingGasW           5.051e+03  2.646e+04   0.191 0.848674    
## HeatingGrav           3.554e+03  2.814e+04   0.126 0.899509    
## HeatingOthW          -1.162e+04  3.146e+04  -0.369 0.711979    
## HeatingWall           2.355e+04  2.981e+04   0.790 0.429641    
## HeatingQCFa          -6.126e+02  4.691e+03  -0.131 0.896122    
## HeatingQCGd          -3.780e+03  2.075e+03  -1.822 0.068709 .  
## HeatingQCPo           2.182e+03  2.664e+04   0.082 0.934734    
## HeatingQCTA          -3.470e+03  2.071e+03  -1.675 0.094179 .  
## CentralAirY           1.007e+02  3.881e+03   0.026 0.979309    
## ElectricalFuseA      -1.204e+04  2.416e+04  -0.499 0.618209    
## ElectricalFuseF      -1.216e+04  2.456e+04  -0.495 0.620789    
## ElectricalFuseP      -2.048e+04  3.024e+04  -0.677 0.498273    
## ElectricalMix        -6.012e+04  5.075e+04  -1.185 0.236387    
## ElectricalSBrkr      -1.390e+04  2.396e+04  -0.580 0.561901    
## X1stFlrSF             4.558e+01  5.652e+00   8.064 1.76e-15 ***
## X2ndFlrSF             6.615e+01  5.593e+00  11.827  < 2e-16 ***
## LowQualFinSF          6.703e+00  1.864e+01   0.360 0.719216    
## GrLivArea                    NA         NA      NA       NA    
## BsmtFullBath          1.085e+03  1.981e+03   0.547 0.584146    
## BsmtHalfBath         -1.690e+02  3.032e+03  -0.056 0.955550    
## FullBath              3.947e+03  2.206e+03   1.789 0.073859 .  
## HalfBath              1.603e+03  2.102e+03   0.763 0.445826    
## BedroomAbvGr         -3.672e+03  1.365e+03  -2.690 0.007254 ** 
## KitchenAbvGr         -1.385e+04  5.670e+03  -2.443 0.014698 *  
## KitchenQualFa        -1.960e+04  6.212e+03  -3.156 0.001641 ** 
## KitchenQualGd        -2.401e+04  3.489e+03  -6.881 9.48e-12 ***
## KitchenQualTA        -2.265e+04  3.930e+03  -5.762 1.05e-08 ***
## TotRmsAbvGrd          1.593e+03  9.560e+02   1.666 0.095904 .  
## FunctionalMaj2       -1.023e+03  1.443e+04  -0.071 0.943518    
## FunctionalMin1        7.684e+03  8.643e+03   0.889 0.374157    
## FunctionalMin2        9.625e+03  8.661e+03   1.111 0.266664    
## FunctionalMod        -4.694e+03  1.061e+04  -0.443 0.658160    
## FunctionalSev        -4.415e+04  2.957e+04  -1.493 0.135655    
## FunctionalTyp         1.924e+04  7.502e+03   2.565 0.010442 *  
## Fireplaces            6.446e+03  2.572e+03   2.507 0.012321 *  
## FireplaceQuEx        -9.041e+03  6.238e+03  -1.449 0.147535    
## FireplaceQuFa        -1.129e+04  5.317e+03  -2.123 0.033975 *  
## FireplaceQuGd        -6.230e+03  3.434e+03  -1.814 0.069877 .  
## FireplaceQuPo        -1.636e+02  6.279e+03  -0.026 0.979214    
## FireplaceQuTA        -5.169e+03  3.593e+03  -1.439 0.150497    
## GarageType2Types      4.263e+04  1.188e+05   0.359 0.719826    
## GarageTypeAttchd      6.214e+04  1.187e+05   0.524 0.600668    
## GarageTypeBasment     6.572e+04  1.184e+05   0.555 0.578997    
## GarageTypeBuiltIn     6.083e+04  1.188e+05   0.512 0.608851    
## GarageTypeCarPort     6.709e+04  1.194e+05   0.562 0.574159    
## GarageTypeDetchd      6.526e+04  1.188e+05   0.549 0.582901    
## GarageYrBlt          -3.630e+01  6.085e+01  -0.597 0.550929    
## GarageFinishFin       7.956e+01  2.431e+03   0.033 0.973902    
## GarageFinishRFn      -2.300e+03  2.160e+03  -1.065 0.287022    
## GarageFinishUnf              NA         NA      NA       NA    
## GarageCars            4.146e+03  2.277e+03   1.821 0.068900 .  
## GarageArea            1.855e+01  7.909e+00   2.345 0.019181 *  
## GarageQualEx          1.160e+05  2.992e+04   3.878 0.000111 ***
## GarageQualFa         -6.395e+03  4.872e+03  -1.313 0.189506    
## GarageQualGd          8.426e+02  7.620e+03   0.111 0.911969    
## GarageQualPo         -2.215e+04  2.403e+04  -0.922 0.356889    
## GarageQualTA                 NA         NA      NA       NA    
## GarageCondEx         -1.105e+05  3.458e+04  -3.194 0.001438 ** 
## GarageCondFa         -1.467e+03  5.427e+03  -0.270 0.786895    
## GarageCondGd          4.798e+02  9.109e+03   0.053 0.957998    
## GarageCondPo          2.244e+03  1.393e+04   0.161 0.872024    
## GarageCondTA                 NA         NA      NA       NA    
## PavedDriveP          -3.238e+03  5.544e+03  -0.584 0.559290    
## PavedDriveY          -6.724e+02  3.471e+03  -0.194 0.846417    
## WoodDeckSF            1.455e+01  5.868e+00   2.480 0.013292 *  
## OpenPorchSF           2.375e+00  1.153e+01   0.206 0.836909    
## EnclosedPorch         5.910e+00  1.244e+01   0.475 0.634769    
## X3SsnPorch            3.257e+01  2.245e+01   1.451 0.147054    
## ScreenPorch           3.417e+01  1.254e+01   2.725 0.006531 ** 
## PoolArea              1.159e+02  2.006e+01   5.776 9.74e-09 ***
## FenceGdPrv           -9.759e+03  3.672e+03  -2.658 0.007966 ** 
## FenceGdWo            -4.985e+02  3.582e+03  -0.139 0.889328    
## FenceMnPrv            1.344e+03  2.268e+03   0.592 0.553656    
## FenceMnWw            -5.094e+03  7.499e+03  -0.679 0.497100    
## MiscFeatureGar2      -1.521e+04  9.751e+04  -0.156 0.876039    
## MiscFeatureOthr       1.204e+04  2.028e+04   0.594 0.552863    
## MiscFeatureShed       2.106e+03  5.708e+03   0.369 0.712220    
## MiscFeatureTenC      -8.853e+04  2.914e+04  -3.038 0.002430 ** 
## MiscVal               1.009e+00  6.133e+00   0.165 0.869293    
## MoSold               -4.380e+02  2.454e+02  -1.785 0.074471 .  
## YrSold               -5.131e+02  5.166e+02  -0.993 0.320828    
## SaleTypeCon           2.533e+04  1.760e+04   1.439 0.150376    
## SaleTypeConLD         1.676e+04  9.696e+03   1.729 0.084146 .  
## SaleTypeConLI         5.682e+03  1.156e+04   0.491 0.623285    
## SaleTypeConLw         1.083e+03  1.218e+04   0.089 0.929196    
## SaleTypeCWD           1.487e+04  1.290e+04   1.153 0.249191    
## SaleTypeNew           2.223e+04  1.546e+04   1.438 0.150554    
## SaleTypeOth           6.767e+03  1.453e+04   0.466 0.641551    
## SaleTypeWD           -9.000e+01  4.188e+03  -0.021 0.982859    
## SaleConditionAdjLand  7.129e+03  1.451e+04   0.491 0.623396    
## SaleConditionAlloca   3.393e+03  8.648e+03   0.392 0.694856    
## SaleConditionFamily   5.736e+01  6.089e+03   0.009 0.992486    
## SaleConditionNormal   5.749e+03  2.896e+03   1.985 0.047360 *  
## SaleConditionPartial -1.777e+03  1.488e+04  -0.119 0.904971    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22690 on 1211 degrees of freedom
## Multiple R-squared:  0.9323, Adjusted R-squared:  0.9184 
## F-statistic: 67.24 on 248 and 1211 DF,  p-value: < 2.2e-16

This is an example of overfitting. I will try to reduce the number of features. Systematic feature selection is difficult with about 80 candidate features, so I will pick the ones that I think would make a house more or less expensive. There can of course be exceptions.

# Hand-picked subset of predictors, plus Id and the SalePrice response.
# Each column is listed exactly once (the original listed GrLivArea and
# BsmtFinSF1 twice; select() ignores repeats, so the result is unchanged).
# Id is kept here and excluded from the predictors in the model formula.
training2 <- train %>%
  dplyr::select(
    Id, BsmtFinSF1, BsmtUnfSF, GrLivArea, MasVnrArea,
    OpenPorchSF, X1stFlrSF, KitchenQual, Street, LotArea,
    MSZoning, CentralAir, SalePrice
  )

Next I refit the model using only the features that I judged would make a difference.

# Refit on the reduced feature set; `- Id` keeps the identifier out of the
# predictors. Then print the fit summary.
house.prices.lm <- lm(formula = SalePrice ~ . - Id, data = training2)
summary(object = house.prices.lm)
## 
## Call:
## lm(formula = SalePrice ~ . - Id, data = training2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -602354  -17660     168   16565  255614 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    6.716e+04  2.016e+04   3.331 0.000889 ***
## BsmtFinSF1     3.272e+01  4.095e+00   7.990 2.76e-15 ***
## BsmtUnfSF      1.949e+01  3.942e+00   4.945 8.51e-07 ***
## GrLivArea      5.830e+01  2.700e+00  21.593  < 2e-16 ***
## MasVnrArea     5.228e+01  6.566e+00   7.963 3.39e-15 ***
## OpenPorchSF    2.949e+01  1.732e+01   1.703 0.088833 .  
## X1stFlrSF      6.891e+00  4.732e+00   1.456 0.145530    
## KitchenQualFa -1.037e+05  8.320e+03 -12.467  < 2e-16 ***
## KitchenQualGd -6.102e+04  4.618e+03 -13.212  < 2e-16 ***
## KitchenQualTA -9.756e+04  4.898e+03 -19.917  < 2e-16 ***
## StreetPave    -2.221e+03  1.732e+04  -0.128 0.897984    
## LotArea        4.564e-01  1.154e-01   3.955 8.03e-05 ***
## MSZoningFV     5.250e+04  1.433e+04   3.665 0.000256 ***
## MSZoningRH     2.486e+04  1.647e+04   1.510 0.131389    
## MSZoningRL     4.199e+04  1.338e+04   3.138 0.001733 ** 
## MSZoningRM     2.305e+04  1.344e+04   1.715 0.086616 .  
## CentralAirY    2.102e+04  4.689e+03   4.483 7.93e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 39810 on 1435 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7506, Adjusted R-squared:  0.7478 
## F-statistic:   270 on 16 and 1435 DF,  p-value: < 2.2e-16
# Arrange the plotting area as a 2 x 2 grid and draw the plots produced by
# plot() for the fitted model.
par(mfrow = c(2, 2))
plot(house.prices.lm)

F Statistics

The F-statistic indicates whether there is a relationship between the predictors and the response. The further the F-statistic is from 1, the stronger the evidence for such a relationship. How large it needs to be also depends on the size of the data: a small data set needs a larger F-statistic. Relative to the 1460 observations in this data set, an F-statistic of 270 is not very large, although its p-value (< 2.2e-16) still indicates that the relationship is statistically significant.

R^2

The R^2 shows how well the model fits to the actual data. I will interpret the adjusted R^2 as it adjusts to the degrees of freedom in a model. The closer the R^2 is to one the better the fit of the model. This model has an adjusted R^2 of 0.7478, or 74.78%, which is pretty close to one.

Now I will use this model to predict sale prices for the test set and submit them to Kaggle.

# Start the submission table from the test-set Ids, attach the model's
# predictions for the test rows, and preview the first six rows.
final_df <- dplyr::select(test, Id)
final_df$SalePrice <- predict(house.prices.lm, newdata = test)
head(final_df, n = 6L)
##     Id SalePrice
## 1 1461  97550.08
## 2 1462 204911.59
## 3 1463 167629.14
## 4 1464 199509.42
## 5 1465 183517.93
## 6 1466 154058.94
# Count missing values per column of the submission table.
final_df %>%
  is.na() %>%
  colSums()
##        Id SalePrice 
##         0        21
# Fill predictions that came back NA with the mean of the available
# predictions, then write the submission file.
final_df$SalePrice <- replace(
  final_df$SalePrice,
  is.na(final_df$SalePrice),
  mean(final_df$SalePrice, na.rm = TRUE)
)
write_csv(final_df, "SalePricesPredictions.csv")