#Import Libraries
library(tidyverse)
library(ggcorrplot)
library(pastecs)
library(modelr)
#Import Training Set
train <- read_csv("house-prices-advanced-regression-techniques/train.csv")

Let's begin by retrieving some plots to explore our dependent variable.

hist(train$SalePrice)

summary(train$SalePrice)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   34900  129975  163000  180921  214000  755000
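The histogram shows a clear right skew, and a log scale typically makes this kind of price distribution look much more symmetric. A quick optional check, using ggplot2 (already loaded via the tidyverse):

train %>%
  ggplot(aes(x = SalePrice)) +
  geom_histogram(bins = 50) +
  scale_x_log10() +                       # log scale to counter the right skew
  labs(title = "SalePrice on a log10 scale")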

We now move on to the possible explanatory variables. The first selections involve the structure and type of the home.

train%>%
ggplot(aes(x = HouseStyle, y = SalePrice)) + geom_boxplot()
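The categories above appear in alphabetical order; a variant that orders the boxes by median sale price can make the comparison easier to read. A small optional sketch:

train %>%
  ggplot(aes(x = reorder(HouseStyle, SalePrice, FUN = median), y = SalePrice)) +
  geom_boxplot() +
  labs(x = "HouseStyle (ordered by median SalePrice)")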

Staying with home configuration, we find an interesting relationship with the square footage of each home level. These plots will prove important further down in our regression analysis.

sf_plot = train %>%
  ggplot() +
  geom_smooth(aes(x = TotalBsmtSF, y = SalePrice, col = "Basement")) +
  geom_smooth(aes(x = `1stFlrSF`, y = SalePrice, col = "1st Floor")) +
  geom_smooth(aes(x = `2ndFlrSF`, y = SalePrice, col = "2nd Floor")) +
  labs(title = "Home Levels and Sale Price", x = "Square Feet")

sf_plot

vars = c('TotalBsmtSF', '1stFlrSF', '2ndFlrSF')
train %>%
  select(all_of(vars), SalePrice) %>%   # all_of() avoids the ambiguous external-vector warning
  pairs()

Next, we take another set of quantitative variables and build a correlation matrix with the sale price. The matrix below was settled on after cycling through a series of random samples of variable pairs.

set.seed(997)
var_pairs = select_if(train,is.numeric)%>%
  select(SalePrice, sample(1:length(colnames(select_if(train, is.numeric))), 5, replace=F))%>%
  data.frame()
var_pairs%>%
  cor(use = 'na.or.complete') %>%
  round(2)%>%
  ggcorrplot( lab = TRUE)

#Test the hypotheses that the correlations between each pairwise set of variables are 0 and provide an 80% confidence interval.

Proceeding with a loop through all of the pairs from the matrix above, we collect test results for the null hypothesis that each correlation is zero.

rowcounter = 1
pair_cortest = data.frame(pair = character(),
                          cor = numeric(),
                          confidence_80 = numeric(),
                          p_value = numeric(),
                          t_Statistic = numeric(),
                          observations = numeric())
for (i in seq(1, length(colnames(var_pairs)))) {
  for (v in seq(1, length(colnames(var_pairs)) - i)) {
    if (i + v <= length(colnames(var_pairs))) {
      # test H0: correlation = 0, with an 80% confidence interval per the prompt
      cort = cor.test(var_pairs[, i], var_pairs[, i + v], conf.level = 0.80)
      pair_cortest[rowcounter, 1] = paste0(colnames(var_pairs)[i], "~", colnames(var_pairs)[i + v])
      pair_cortest[rowcounter, 2] = round(cort[["estimate"]][["cor"]], 2)
      pair_cortest[rowcounter, 3] = paste0(round(cort[["conf.int"]][1], 2), " - ", round(cort[["conf.int"]][2], 2))
      pair_cortest[rowcounter, 4] = round(cort[["p.value"]], 5)
      pair_cortest[rowcounter, 5] = round(cort[["statistic"]][["t"]], 2)
      pair_cortest[rowcounter, 6] = cort[["parameter"]][["df"]]
      rowcounter = rowcounter + 1
    }
  }
}

From the list of correlations below, we can reject the null hypothesis that the correlation between the pair is zero for all but the top two pairs in the list. The top two do not show much of a linear relationship, and their p-values support this observation.

pair_cortest[order(-pair_cortest$p_value),]
##                          pair   cor confidence_80 p_value   t_Statistic
## 11 EnclosedPorch~TotRmsAbvGrd  0.00  -0.05 - 0.06 0.87407          0.16
## 10  EnclosedPorch~LotFrontage  0.01  -0.05 - 0.07 0.71105          0.37
## 12  EnclosedPorch~OverallQual -0.11 -0.16 - -0.06 0.00001         -4.38
## 1        SalePrice~GarageArea  0.62   0.59 - 0.65 0.00000         30.45
## 2     SalePrice~EnclosedPorch -0.13 -0.18 - -0.08 0.00000         -4.95
## 3       SalePrice~LotFrontage  0.35     0.3 - 0.4 0.00000         13.01
## 4      SalePrice~TotRmsAbvGrd  0.53    0.5 - 0.57 0.00000         24.10
## 5       SalePrice~OverallQual  0.79   0.77 - 0.81 0.00000         49.36
## 6    GarageArea~EnclosedPorch -0.12 -0.17 - -0.07 0.00000         -4.68
## 7      GarageArea~LotFrontage  0.34   0.29 - 0.39 0.00000         12.73
## 8     GarageArea~TotRmsAbvGrd  0.34   0.29 - 0.38 0.00000         13.71
## 9      GarageArea~OverallQual  0.56    0.53 - 0.6 0.00000         25.95
## 13   LotFrontage~TotRmsAbvGrd  0.35     0.3 - 0.4 0.00000         13.03
## 14    LotFrontage~OverallQual  0.25     0.2 - 0.3 0.00000          9.00
## 15   TotRmsAbvGrd~OverallQual  0.43   0.38 - 0.47 0.00000         18.05
## 16    OverallQual~OverallQual  1.00         1 - 1 0.00000 2562469171.85
##    observations
## 11         1458
## 10         1199
## 12         1458
## 1          1458
## 2          1458
## 3          1199
## 4          1458
## 5          1458
## 6          1458
## 7          1199
## 8          1458
## 9          1458
## 13         1199
## 14         1199
## 15         1458
## 16         1458
We now invert the correlation matrix to obtain the precision matrix, and confirm that multiplying the correlation matrix by the precision matrix, and then by the correlation matrix again, returns the original correlation matrix.

cor_matrix = var_pairs%>%
  cor(use = 'na.or.complete')%>%
  matrix(nrow = length(colnames(var_pairs)), ncol = length(colnames(var_pairs)))
cor_matrix_P = solve(cor_matrix)
cor_matrix%*%(cor_matrix_P%*%cor_matrix)
##            [,1]       [,2]        [,3]       [,4]        [,5]       [,6]
## [1,]  1.0000000  0.6317615 -0.16400433 0.35179910  0.53721530  0.8022875
## [2,]  0.6317615  1.0000000 -0.12836568 0.34499672  0.35049561  0.5836331
## [3,] -0.1640043 -0.1283657  1.00000000 0.01070034 -0.03506514 -0.1516277
## [4,]  0.3517991  0.3449967  0.01070034 1.00000000  0.35209595  0.2516458
## [5,]  0.5372153  0.3504956 -0.03506514 0.35209595  1.00000000  0.4447412
## [6,]  0.8022875  0.5836331 -0.15162766 0.25164578  0.44474116  1.0000000
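Since the precision matrix is the inverse of the correlation matrix, multiplying the two in either order should return the identity matrix, up to floating-point error. A quick check:

# both products should be the 6 x 6 identity, up to rounding error
round(cor_matrix_P %*% cor_matrix, 10)
round(cor_matrix %*% cor_matrix_P, 10)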

Matrix LU decomposition of the correlation matrix.

# Doolittle-style elimination: L starts as the identity, U starts as the correlation matrix
cor_matrix_L = diag(length(colnames(var_pairs)))
cor_matrix_U = cor_matrix
for (j in seq(1, length(colnames(var_pairs)), 1)) {
  for (i in seq(1, length(colnames(var_pairs)), 1)) {
    if (i > j) {
      # eliminate the entry below the pivot and record the multiplier in L
      term = cor_matrix_U[i, j] / cor_matrix_U[j, j]
      cor_matrix_U[i, ] = cor_matrix_U[i, ] - (term * cor_matrix_U[j, ])
      cor_matrix_L[i, j] = term
    }
  }
}
print(cor_matrix_L)
##            [,1]        [,2]        [,3]        [,4]      [,5] [,6]
## [1,]  1.0000000  0.00000000  0.00000000  0.00000000 0.0000000    0
## [2,]  0.6317615  1.00000000  0.00000000  0.00000000 0.0000000    0
## [3,] -0.1640043 -0.04119653  1.00000000  0.00000000 0.0000000    0
## [4,]  0.3517991  0.20427395  0.07556303  1.00000000 0.0000000    0
## [5,]  0.5372153  0.01847911  0.05503433  0.18541969 1.0000000    0
## [6,]  0.8022875  0.12777780 -0.01737098 -0.05322309 0.0317967    1
print(cor_matrix_U)
##      [,1]          [,2]        [,3]       [,4]       [,5]        [,6]
## [1,]    1  6.317615e-01 -0.16400433 0.35179910 0.53721530  0.80228746
## [2,]    0  6.008774e-01 -0.02475406 0.12274360 0.01110368  0.07677880
## [3,]    0  0.000000e+00  0.97208280 0.07345352 0.05349793 -0.01688603
## [4,]    0  0.000000e+00  0.00000000 0.84561370 0.15679343 -0.04500618
## [5,]    0  0.000000e+00  0.00000000 0.00000000 0.67917773  0.02159561
## [6,]    0 -1.387779e-17  0.00000000 0.00000000 0.00000000  0.34314885

We can check that A = LU by comparing the original correlation matrix with the product of the factors.

print(cor_matrix)
##            [,1]       [,2]        [,3]       [,4]        [,5]       [,6]
## [1,]  1.0000000  0.6317615 -0.16400433 0.35179910  0.53721530  0.8022875
## [2,]  0.6317615  1.0000000 -0.12836568 0.34499672  0.35049561  0.5836331
## [3,] -0.1640043 -0.1283657  1.00000000 0.01070034 -0.03506514 -0.1516277
## [4,]  0.3517991  0.3449967  0.01070034 1.00000000  0.35209595  0.2516458
## [5,]  0.5372153  0.3504956 -0.03506514 0.35209595  1.00000000  0.4447412
## [6,]  0.8022875  0.5836331 -0.15162766 0.25164578  0.44474116  1.0000000
print(cor_matrix_L%*%cor_matrix_U)
##            [,1]       [,2]        [,3]       [,4]        [,5]       [,6]
## [1,]  1.0000000  0.6317615 -0.16400433 0.35179910  0.53721530  0.8022875
## [2,]  0.6317615  1.0000000 -0.12836568 0.34499672  0.35049561  0.5836331
## [3,] -0.1640043 -0.1283657  1.00000000 0.01070034 -0.03506514 -0.1516277
## [4,]  0.3517991  0.3449967  0.01070034 1.00000000  0.35209595  0.2516458
## [5,]  0.5372153  0.3504956 -0.03506514 0.35209595  1.00000000  0.4447412
## [6,]  0.8022875  0.5836331 -0.15162766 0.25164578  0.44474116  1.0000000
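A numerical check confirms the factorization, and the triangular factors also make it cheap to solve linear systems by forward and back substitution. A short sketch (the names b and x are just illustrative):

# largest absolute difference between the correlation matrix and L %*% U
max(abs(cor_matrix - cor_matrix_L %*% cor_matrix_U))

# solve cor_matrix %*% x = b via the factors: first L y = b, then U x = y
b <- cor_matrix[, 1]
x <- backsolve(cor_matrix_U, forwardsolve(cor_matrix_L, b))
all.equal(as.vector(cor_matrix %*% x), b)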

#Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data. Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary. Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ). Find the optimal value of lambda for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, lambda)). Plot a histogram and compare it with a histogram of your original variable. Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). Also generate a 95% confidence interval from the empirical data, assuming normality. Finally, provide the empirical 5th percentile and 95th percentile of the data. Discuss.

Looking for a right-skewed variable, we land on BsmtUnfSF. The histogram, along with the gap between the median and the mean, makes it a good example for this exercise.

train %>%
  ggplot(aes(x = BsmtUnfSF)) +
  geom_histogram(aes(y = ..density..), bins = 60) +   # density scale so the density curve is visible
  geom_density()

print(summary(train$BsmtUnfSF))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   223.0   477.5   567.2   808.0  2336.0
library(MASS,  include.only = 'fitdistr')
# shift by .001 so the minimum is strictly above zero before fitting the exponential
BsmtUnfSFexp = fitdistr(train$BsmtUnfSF + .001, "exponential")
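As a sanity check, the maximum-likelihood estimate of the exponential rate is simply the reciprocal of the sample mean, so with a mean of roughly 567 we expect a rate of about 0.0018:

# closed-form MLE for the exponential rate: 1 / sample mean
1 / mean(train$BsmtUnfSF + .001)
BsmtUnfSFexp$estimate["rate"]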

Below is a first look at the fitted exponential density curve drawn over the actual histogram.

basegg = ggplot(train, aes(x = BsmtUnfSF)) + geom_histogram(aes(y = ..density..), bins = 30)
basegg + stat_function(fun = dexp, args = list(rate = BsmtUnfSFexp$estimate["rate"]))

Below we simulate a sample of 1000 draws from the exponential distribution fitted above. The plot features an overlay of two histograms: blue represents the data pulled from the dataset, while red represents the simulated sample. The middle 90% (the 5th and 95th percentiles) is marked by vertical lines. We first notice that the simulation has a wider spread. It is also worth noting that the raw data's 5th percentile is 0, which makes sense given the cluster of observations at zero; the simulation, however, does not cluster at zero the way the empirical data does.

set.seed(92929)
c_int = t.test(train$BsmtUnfSF)
expo_sample = rexp(1000, BsmtUnfSFexp$estimate["rate"])
ggplot()+ 
  geom_histogram(aes(x = expo_sample, y=..density..*800),  fill = "red", alpha =.8) + 
  geom_histogram(aes(x = train$BsmtUnfSF, y=..density..*800),  fill = "blue", alpha = .3) +
  geom_vline(xintercept = quantile(expo_sample,.05), color = "red", show.legend = TRUE)+
  geom_vline(xintercept = quantile(expo_sample,.95), color = "red", show.legend = TRUE)+
  geom_vline(xintercept = quantile(train$BsmtUnfSF,.05), color = "blue", show.legend = TRUE)+
  geom_vline(xintercept = quantile(train$BsmtUnfSF,.95), color = "blue", show.legend = TRUE)+
  geom_label(aes(x = quantile(expo_sample, .05), y = .50, label = round(quantile(expo_sample, .05))), color = "red", size = 7, fontface = "bold") +
  geom_label(aes(x = quantile(expo_sample, .95), y = .50, label = round(quantile(expo_sample, .95))), color = "red", size = 7, fontface = "bold") +
  geom_label(aes(x = quantile(train$BsmtUnfSF, .05), y = .200, label = round(quantile(train$BsmtUnfSF, .05))), color = "blue", size = 7, fontface = "bold") +
  geom_label(aes(x = quantile(train$BsmtUnfSF, .95), y = .200, label = round(quantile(train$BsmtUnfSF, .95))), color = "blue", size = 7, fontface = "bold") +
  stat_ecdf(aes(x=expo_sample), geom = "step", color = "red")+
  scale_y_continuous("PDF",sec.axis = sec_axis(~ (. - 0), name = "CDF"))+
  labs(title = "data sample(BLUE) vs simulation(RED)", subtitle = "5th & 95th percentile marked", x = "value", color = "Legend")+
  theme(legend.position = "none",panel.grid = element_blank(), axis.text.y = element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
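The prompt also asks for the 5th and 95th percentiles from the fitted exponential CDF itself, rather than from the simulated sample. For an exponential distribution the p-th quantile is -log(1 - p) / lambda, and qexp gives this directly; it can be set beside the empirical quantiles for comparison:

# theoretical percentiles from the fitted exponential CDF
qexp(c(.05, .95), rate = BsmtUnfSFexp$estimate["rate"])
# empirical percentiles of the raw variable, for comparison
quantile(train$BsmtUnfSF, c(.05, .95))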

print(
  paste0("The 95% confidence interval for the mean of the empirical data is ", round(c_int[["conf.int"]][1]), " to ", round(c_int[["conf.int"]][2]))
)
## [1] "The 95% confidence interval for the mean of the empirical data is 545 to 590"

To begin our model, we first select our variables. These include the variables we explored earlier, plus a few more. The independent variables chosen here are mostly quantitative, with the exception of ‘KitchenQual,’ which is included based on the popular adage that “kitchens sell homes.”

We will proceed with the Backward Elimination method, trimming off what appear to be poorly performing variables.

mylm = lm(SalePrice ~ GrLivArea + LotArea + YearBuilt + WoodDeckSF + PoolArea + KitchenQual + TotalBsmtSF + BsmtUnfSF + BsmtFinSF2 + MasVnrArea + OverallCond + OverallQual + GarageArea + EnclosedPorch + LotFrontage + TotRmsAbvGrd + `1stFlrSF` + `2ndFlrSF`, data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + LotArea + YearBuilt + WoodDeckSF + 
##     PoolArea + KitchenQual + TotalBsmtSF + BsmtUnfSF + BsmtFinSF2 + 
##     MasVnrArea + OverallCond + OverallQual + GarageArea + EnclosedPorch + 
##     LotFrontage + TotRmsAbvGrd + `1stFlrSF` + `2ndFlrSF`, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -509162  -15267   -1721   12392  307145 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.233e+05  1.168e+05  -7.051 3.03e-12 ***
## GrLivArea      2.053e+01  2.144e+01   0.957  0.33869    
## LotArea        7.989e-01  1.535e-01   5.204 2.30e-07 ***
## YearBuilt      4.048e+02  5.834e+01   6.939 6.53e-12 ***
## WoodDeckSF     2.884e+01  9.476e+00   3.044  0.00239 ** 
## PoolArea      -8.140e+01  2.869e+01  -2.838  0.00462 ** 
## KitchenQualFa -4.456e+04  8.608e+03  -5.176 2.66e-07 ***
## KitchenQualGd -4.658e+04  4.562e+03 -10.210  < 2e-16 ***
## KitchenQualTA -5.768e+04  5.250e+03 -10.988  < 2e-16 ***
## TotalBsmtSF    2.152e+01  4.917e+00   4.377 1.31e-05 ***
## BsmtUnfSF     -1.498e+01  2.866e+00  -5.226 2.05e-07 ***
## BsmtFinSF2    -7.841e+00  7.274e+00  -1.078  0.28128    
## MasVnrArea     2.847e+01  6.795e+00   4.189 3.01e-05 ***
## OverallCond    5.641e+03  1.136e+03   4.966 7.83e-07 ***
## OverallQual    1.690e+04  1.354e+03  12.482  < 2e-16 ***
## GarageArea     3.443e+01  6.570e+00   5.241 1.89e-07 ***
## EnclosedPorch  8.198e+00  1.932e+01   0.424  0.67133    
## LotFrontage   -2.037e+01  5.386e+01  -0.378  0.70537    
## TotRmsAbvGrd   1.550e+03  1.220e+03   1.271  0.20408    
## `1stFlrSF`     2.573e+01  2.198e+01   1.170  0.24205    
## `2ndFlrSF`     2.061e+01  2.157e+01   0.956  0.33952    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36400 on 1174 degrees of freedom
##   (265 observations deleted due to missingness)
## Multiple R-squared:  0.8117, Adjusted R-squared:  0.8085 
## F-statistic:   253 on 20 and 1174 DF,  p-value: < 2.2e-16
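As an optional cross-check on the manual backward elimination, an AIC-based backward search could be run on the same starting formula. A minimal sketch, assuming complete cases are acceptable (stepAIC refits the model repeatedly, so the data must not change size between fits); MASS::stepAIC is called with :: to avoid masking dplyr::select, and the names model_vars, train_cc, and step_lm are just illustrative:

# hypothetical cross-check of the manual backward elimination using an AIC-based search
model_vars <- all.vars(formula(mylm))                 # response plus the current predictors
train_cc <- na.omit(train[model_vars])                # complete cases only, so every refit sees the same rows
step_lm <- MASS::stepAIC(lm(formula(mylm), data = train_cc),
                         direction = "backward", trace = FALSE)
summary(step_lm)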

Some variables under the gun are 1stFlrSF, 2ndFlrSF, and TotalBsmtSF. These variables refer to the square footage of the different home levels. GrLivArea, which is a combination of the former two, is also on the chopping block. From our earlier plots on these variables we know the 1st- and 2nd-floor footage follow a similar pattern, so we can move forward with the assumption that the aggregated variable can be used in lieu of the separate above-ground floors. We leave the basement alone for now.

print(stat.desc(train[c('GrLivArea','1stFlrSF','2ndFlrSF')]))
##                 GrLivArea     1stFlrSF     2ndFlrSF
## nbr.val      1.460000e+03 1.460000e+03 1.460000e+03
## nbr.null     0.000000e+00 0.000000e+00 8.290000e+02
## nbr.na       0.000000e+00 0.000000e+00 0.000000e+00
## min          3.340000e+02 3.340000e+02 0.000000e+00
## max          5.642000e+03 4.692000e+03 2.065000e+03
## range        5.308000e+03 4.358000e+03 2.065000e+03
## sum          2.212577e+06 1.697435e+06 5.066090e+05
## median       1.464000e+03 1.087000e+03 0.000000e+00
## mean         1.515464e+03 1.162627e+03 3.469925e+02
## SE.mean      1.375245e+01 1.011746e+01 1.142447e+01
## CI.mean.0.95 2.697669e+01 1.984633e+01 2.241014e+01
## var          2.761296e+05 1.494501e+05 1.905571e+05
## std.dev      5.254804e+02 3.865877e+02 4.365284e+02
## coef.var     3.467456e-01 3.325123e-01 1.258034e+00
mylm = update(mylm, .~. -`2ndFlrSF` -`1stFlrSF`, data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + LotArea + YearBuilt + WoodDeckSF + 
##     PoolArea + KitchenQual + TotalBsmtSF + BsmtUnfSF + BsmtFinSF2 + 
##     MasVnrArea + OverallCond + OverallQual + GarageArea + EnclosedPorch + 
##     LotFrontage + TotRmsAbvGrd, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -510158  -15234   -1539   12653  309237 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.289e+05  1.157e+05  -7.167 1.35e-12 ***
## GrLivArea      4.130e+01  4.489e+00   9.201  < 2e-16 ***
## LotArea        8.062e-01  1.534e-01   5.256 1.75e-07 ***
## YearBuilt      4.084e+02  5.783e+01   7.063 2.78e-12 ***
## WoodDeckSF     2.934e+01  9.469e+00   3.098  0.00199 ** 
## PoolArea      -8.406e+01  2.862e+01  -2.938  0.00337 ** 
## KitchenQualFa -4.492e+04  8.600e+03  -5.224 2.08e-07 ***
## KitchenQualGd -4.674e+04  4.548e+03 -10.279  < 2e-16 ***
## KitchenQualTA -5.781e+04  5.238e+03 -11.038  < 2e-16 ***
## TotalBsmtSF    2.474e+01  3.619e+00   6.838 1.29e-11 ***
## BsmtUnfSF     -1.521e+01  2.861e+00  -5.317 1.26e-07 ***
## BsmtFinSF2    -8.093e+00  7.269e+00  -1.113  0.26579    
## MasVnrArea     2.883e+01  6.773e+00   4.256 2.25e-05 ***
## OverallCond    5.642e+03  1.134e+03   4.977 7.41e-07 ***
## OverallQual    1.682e+04  1.352e+03  12.446  < 2e-16 ***
## GarageArea     3.536e+01  6.534e+00   5.412 7.57e-08 ***
## EnclosedPorch  7.556e+00  1.927e+01   0.392  0.69501    
## LotFrontage   -1.305e+01  5.323e+01  -0.245  0.80637    
## TotRmsAbvGrd   1.575e+03  1.220e+03   1.291  0.19695    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 36400 on 1176 degrees of freedom
##   (265 observations deleted due to missingness)
## Multiple R-squared:  0.8114, Adjusted R-squared:  0.8085 
## F-statistic:   281 on 18 and 1176 DF,  p-value: < 2.2e-16

We continue by dropping the variables with the highest p-values (results combined below for readability).

mylm = update(mylm, .~. -LotFrontage - EnclosedPorch -PoolArea -BsmtFinSF2, data = train)
summary(mylm)
## 
## Call:
## lm(formula = SalePrice ~ GrLivArea + LotArea + YearBuilt + WoodDeckSF + 
##     KitchenQual + TotalBsmtSF + BsmtUnfSF + MasVnrArea + OverallCond + 
##     OverallQual + GarageArea + TotRmsAbvGrd, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -539932  -14714   -1602   12212  264103 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -8.077e+05  9.266e+04  -8.717  < 2e-16 ***
## GrLivArea      4.230e+01  3.792e+00  11.156  < 2e-16 ***
## LotArea        5.854e-01  9.812e-02   5.965 3.07e-09 ***
## YearBuilt      3.991e+02  4.656e+01   8.572  < 2e-16 ***
## WoodDeckSF     2.763e+01  7.778e+00   3.552 0.000395 ***
## KitchenQualFa -4.487e+04  7.656e+03  -5.861 5.71e-09 ***
## KitchenQualGd -4.617e+04  4.104e+03 -11.249  < 2e-16 ***
## KitchenQualTA -5.672e+04  4.643e+03 -12.217  < 2e-16 ***
## TotalBsmtSF    2.487e+01  2.935e+00   8.473  < 2e-16 ***
## BsmtUnfSF     -1.311e+01  2.396e+00  -5.470 5.29e-08 ***
## MasVnrArea     2.712e+01  5.827e+00   4.655 3.54e-06 ***
## OverallCond    5.375e+03  9.361e+02   5.742 1.14e-08 ***
## OverallQual    1.636e+04  1.154e+03  14.172  < 2e-16 ***
## GarageArea     3.430e+01  5.665e+00   6.055 1.79e-09 ***
## TotRmsAbvGrd   1.473e+03  1.035e+03   1.422 0.155140    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34620 on 1437 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.8112, Adjusted R-squared:  0.8094 
## F-statistic:   441 on 14 and 1437 DF,  p-value: < 2.2e-16

Now that we have what appears to be a good collection of predictor variables, we can review the residuals. We already see from the summary above that the residuals are not perfectly centered around zero, which is not ideal. However, aside from a few outliers, the residuals follow a fairly normal distribution, which is a good sign for our model.

myres = resid(mylm)
scplot = plot(myres, type = 'p')

hplot = hist(myres, breaks = 100)

qqnorm(myres)
qqline(myres)
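Base R's built-in lm diagnostics cover the same checks (residuals vs. fitted, normal Q-Q, scale-location, and leverage) in a single call, if preferred:

par(mfrow = c(2, 2))   # arrange the four diagnostic panels in a grid
plot(mylm)
par(mfrow = c(1, 1))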

Now that we are satisfied with the predictor variables and the residuals, we can see how this model performs against the test data.

test <- read_csv("house-prices-advanced-regression-techniques/test.csv")
predictlm = predict(mylm, newdata = test)

A look at the prediction results.

hist(predictlm)
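A quick count shows how many of these predictions came back NA, which is what motivates the imputation step below:

# predictions are NA wherever a required predictor is missing in the test set
sum(is.na(predictlm))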

To avoid generating NA predictions, we first identify the predictor variables that contain NA values in the test data and impute them with reasonable values.

pred_vars = c('Id', colnames(mylm[["model"]])[-1])
print(summary(select(test,all_of(pred_vars))))
##        Id         GrLivArea       LotArea        YearBuilt      WoodDeckSF     
##  Min.   :1461   Min.   : 407   Min.   : 1470   Min.   :1879   Min.   :   0.00  
##  1st Qu.:1826   1st Qu.:1118   1st Qu.: 7391   1st Qu.:1953   1st Qu.:   0.00  
##  Median :2190   Median :1432   Median : 9399   Median :1973   Median :   0.00  
##  Mean   :2190   Mean   :1486   Mean   : 9819   Mean   :1971   Mean   :  93.17  
##  3rd Qu.:2554   3rd Qu.:1721   3rd Qu.:11518   3rd Qu.:2001   3rd Qu.: 168.00  
##  Max.   :2919   Max.   :5095   Max.   :56600   Max.   :2010   Max.   :1424.00  
##                                                                                
##  KitchenQual         TotalBsmtSF     BsmtUnfSF        MasVnrArea    
##  Length:1459        Min.   :   0   Min.   :   0.0   Min.   :   0.0  
##  Class :character   1st Qu.: 784   1st Qu.: 219.2   1st Qu.:   0.0  
##  Mode  :character   Median : 988   Median : 460.0   Median :   0.0  
##                     Mean   :1046   Mean   : 554.3   Mean   : 100.7  
##                     3rd Qu.:1305   3rd Qu.: 797.8   3rd Qu.: 164.0  
##                     Max.   :5095   Max.   :2140.0   Max.   :1290.0  
##                     NA's   :1      NA's   :1        NA's   :15      
##   OverallCond     OverallQual       GarageArea      TotRmsAbvGrd   
##  Min.   :1.000   Min.   : 1.000   Min.   :   0.0   Min.   : 3.000  
##  1st Qu.:5.000   1st Qu.: 5.000   1st Qu.: 318.0   1st Qu.: 5.000  
##  Median :5.000   Median : 6.000   Median : 480.0   Median : 6.000  
##  Mean   :5.554   Mean   : 6.079   Mean   : 472.8   Mean   : 6.385  
##  3rd Qu.:6.000   3rd Qu.: 7.000   3rd Qu.: 576.0   3rd Qu.: 7.000  
##  Max.   :9.000   Max.   :10.000   Max.   :1488.0   Max.   :15.000  
##                                   NA's   :1
#The NAs appear for variables that could be considered optional, so it is reasonable to set the numeric values to 0 and KitchenQual to 'Gd'. (Most likely a recording error in the data)
test = data.frame(test[pred_vars]) %>%
  replace_na(list(TotalBsmtSF = 0, GarageArea = 0, MasVnrArea = 0, BsmtUnfSF = 0, KitchenQual = 'Gd'))
submission = add_predictions(test, mylm, var = 'SalePrice') %>%
  select('Id', 'SalePrice')
write_csv( submission, "SalePrice_Predictions.csv")
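As a rough gauge before submitting, modelr (already loaded) can report in-sample fit. Note that the Kaggle leaderboard for this competition scores RMSE on the log of SalePrice for held-out data, so these values are not directly comparable to the score below; train_complete is an illustrative name for the rows with no missing predictors:

# in-sample fit; not directly comparable to the Kaggle log-RMSE score
train_complete <- na.omit(train[all.vars(formula(mylm))])   # drop the few rows with missing predictors
rmse(mylm, train_complete)      # in-sample root mean squared error, in dollars
rsquare(mylm, train_complete)   # in-sample R-squared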

#Results per Kaggle competition. Username: Mustafa Telab. Score: 0.33501