Initialization

library(tidyverse)
library(car)
library(MASS)
library(Hmisc)
library(pander)
library(corrplot)

Part 1

housing <- read_csv('C:\\Users\\pgood\\OneDrive\\Documents\\R\\Data 605\\train.csv')

area25 <- quantile(housing$`1stFlrSF`, .25)
price25<- quantile(housing$SalePrice, .25)
pr_a <- sum(housing$SalePrice > price25 & housing$`1stFlrSF` > area25)/sum(housing$SalePrice > price25)

pr_b <- sum(housing$SalePrice > price25 & housing$`1stFlrSF` > area25)/length(housing$SalePrice)

pr_c <- sum(housing$SalePrice > price25 & housing$`1stFlrSF` < area25)/sum(housing$SalePrice > price25)

\(P(X>x|Y>y)=\) 0.8273973

Out of the observations where y is above the first quartile, this is the probability that x is above the first quartile. If X and Y are not independent, this should be greater than .75.

\(P(X>x,Y>y)=\) 0.6205479

This is known as the intersection in set theory. It’s simply the proportion of observations where both are above the first quartile

\(P(X<x|Y>y)=\) 0.1707763

Out of the observations where y is above the first quartile, this is the probability that x is below the first quartile. If X and Y are not independent, this should be less than .25. This will also be the compliment of probability A (approximately for our psued continuous distributioon).

housing_table <- housing %>% mutate(
  area_above25 = ifelse(`1stFlrSF` <= area25, '<=1st quartile', '>1st quartile'),
  price_above25 = ifelse(SalePrice <= price25, '<=1st quartile', '>1st quartile')
)

pander(addmargins(table(housing_table$area_above25, housing_table$price_above25)))
  <=1st quartile >1st quartile Sum
<=1st quartile 179 189 368
>1st quartile 186 906 1092
Sum 365 1095 1460
my_table <- table(housing_table$area_above25, housing_table$price_above25)

To help determine whether SalePrice is independent of 1st floor area, we will compare the probability that both variables are greater than their first quartile value in their joint distribution. If independent this should be approximatley equal to \(.075^{2}\)

sum(housing$SalePrice > price25 & housing$`1stFlrSF` > area25)/length(housing$SalePrice)
## [1] 0.6205479

Since the distributions are technically discrete (there could easily be duplicate values of house prices), we look at a slightly more precise measure than \(.075^{2}\):

(sum(housing$SalePrice > price25)/length(housing$SalePrice)) * (sum(housing$`1stFlrSF` > area25)/length(housing$SalePrice))
## [1] 0.5609589

Finally, we run a Chi Square test:

chisq.test(my_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  my_table
## X-squared = 144.98, df = 1, p-value < 2.2e-16

The result is easily signficant, and we conclude the variables are not independent.

Part 2

Proceeding like we wanted to check regression assumptions of these variables, we check how each variables relates to the independent variable and how skewed the distribution is.

ggplot(housing) + geom_point(aes(x = `1stFlrSF`, y = SalePrice/1000)) + labs(x = '1st Floor Area', y = 'Sale Price (1000s)')

ggplot(housing) + geom_histogram(aes(x = `1stFlrSF`), color = 'black', fill = 'darkblue') + labs(x = '1st Floor Area')

ggplot(housing) + geom_point(aes(x = GarageArea, y = SalePrice/1000)) + labs(x = 'Garage Area', y = 'Sale Price (1000s)')

ggplot(housing) + geom_histogram(aes(x = GarageArea), color = 'black', fill = 'darkgreen') + labs(x = 'Garage Area')

ggplot(housing) + geom_point(aes(x = OverallQual, y = SalePrice/1000)) + labs(x = 'Overall Quality', y = 'Sale Price (1000s)')

ggplot(housing) + geom_histogram(aes(x = OverallQual), color = 'black', fill = 'darkred', binwidth = 1) + 
  labs(x = 'Overall Quality')

Next check the univariate statistics of each variable.

describe(dplyr::select (housing, `1stFlrSF`, GarageArea, OverallQual))
## dplyr::select(housing, `1stFlrSF`, GarageArea, OverallQual) 
## 
##  3  Variables      1460  Observations
## ---------------------------------------------------------------------------
## 1stFlrSF 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        0      753        1     1163    416.4    673.0    756.9 
##      .25      .50      .75      .90      .95 
##    882.0   1087.0   1391.2   1680.0   1831.2 
## 
## lowest :  334  372  438  480  483, highest: 2633 2898 3138 3228 4692
## ---------------------------------------------------------------------------
## GarageArea 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        0      441        1      473    234.9      0.0    240.0 
##      .25      .50      .75      .90      .95 
##    334.5    480.0    576.0    757.1    850.1 
## 
## lowest :    0  160  164  180  186, highest: 1220 1248 1356 1390 1418
## ---------------------------------------------------------------------------
## OverallQual 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     1460        0       10    0.951    6.099    1.522        4        5 
##      .25      .50      .75      .90      .95 
##        5        6        7        8        8 
##                                                                       
## Value          1     2     3     4     5     6     7     8     9    10
## Frequency      2     3    20   116   397   374   319   168    43    18
## Proportion 0.001 0.002 0.014 0.079 0.272 0.256 0.218 0.115 0.029 0.012
## ---------------------------------------------------------------------------

Finally, we see how the variables are correlated:

mat <- as.matrix(dplyr::select(housing, GarageArea, OverallQual, `1stFlrSF`))

cor_test <- function(var1, var2){ 
  interval <- cor.test(var1, var2, conf.level = .92)$conf.int
  value <- cor.test(var1, var2, conf.level = .92)$p.value
  attr(interval, 'conf.level') <- NULL
  return(c(interval, value))
}

correlations <- rbind(cor_test(housing$GarageArea, housing$OverallQual),
                      cor_test(housing$GarageArea, housing$`1stFlrSF`),
                      cor_test(housing$OverallQual, housing$`1stFlrSF`)
)
correlations <- cbind(c('Garage Area, Quality', 'Garage Area, 1st Floor Area', 'Quality, 1st Floor Area'), correlations) %>%
  as.data.frame()

colnames(correlations) <- c('Variables', 'Lower Bound', 'Upper Bound', 'P Value')
pander(correlations)
Table continues below
Variables Lower Bound Upper Bound
Garage Area, Quality 0.529837217493636 0.592589856982602
Garage Area, 1st Floor Area 0.45414368660521 0.523854751274019
Quality, 1st Floor Area 0.439994840701337 0.510905090783813
P Value
2.43428779844161e-122
6.16046533594049e-89
1.62319740337505e-83

The correlation between these variables is easily signficant. This indicates there is some multicollinearity in the dataset. Multicollinearity can cause problems with the parameter estimates. In this case we are not worried about familywise error because of level of signficance and the lack of a cost associated with falsely looking out for multicolinearity. Furthremore, there are only 3 hypothesis tests here and each of the correlations far exceedes the signficance needed even with an adustment. If it were a concern, we could perform the Bonferroni correction and change the signficance level from .05 to \(\frac{.05}{3}\). Performing this adjustment would be more likely if we were looking at the correlation between indpendent variables and the response variable.

Part 3

We invert the correlation matrix to create the precision matrix

corr_mat <- cor(mat)
prec_mat <- solve(corr_mat)
prec_mat
##             GarageArea OverallQual   1stFlrSF
## GarageArea   1.6120830  -0.6854719 -0.4631306
## OverallQual -0.6854719   1.5847769 -0.4189770
## 1stFlrSF    -0.4631306  -0.4189770  1.4263597

When multiplied by by the correlation matrix from the left and right, we are left with the identity matrix as expected:

round(corr_mat %*% prec_mat, 5)
##             GarageArea OverallQual 1stFlrSF
## GarageArea           1           0        0
## OverallQual          0           1        0
## 1stFlrSF             0           0        1
round(prec_mat %*% corr_mat, 5)
##             GarageArea OverallQual 1stFlrSF
## GarageArea           1           0        0
## OverallQual          0           1        0
## 1stFlrSF             0           0        1

Finally, we Perform LU factorization on the correlation matrix:

U <- prec_mat
rows <- nrow(U)
L <- diag(rows)
r <- 1
for (j in 1:(rows - 1)){ 
  if(r > rows){
    break
  }
    cur_row <- U[r, ]
    factor <- U[c(1:rows) > r, j]/U[r,j]
    U[c(1:rows) > r, ] <- U[c(1:rows) > r, ] - 
      matrix(rep(cur_row, each = rows - r), ncol = rows, nrow = rows - r) * factor
    L[c(1:rows) > r, j] <- factor
    r <- r + 1
  }

L 
##            [,1]       [,2] [,3]
## [1,]  1.0000000  0.0000000    0
## [2,] -0.4252088  1.0000000    0
## [3,] -0.2872871 -0.4762238    1
U
##             GarageArea OverallQual   1stFlrSF
## GarageArea    1.612083  -0.6854719 -0.4631306
## OverallQual   0.000000   1.2933083 -0.6159042
## 1stFlrSF      0.000000   0.0000000  1.0000000
near(L %*% U,  prec_mat)
##      GarageArea OverallQual 1stFlrSF
## [1,]       TRUE        TRUE     TRUE
## [2,]       TRUE        TRUE     TRUE
## [3,]       TRUE        TRUE     TRUE

The factorization was succesful

Part 4

We now fit an exponential distribution to the data for the variable “1stFlrSF”. Since the MLE estimate for an exponential is really easy, we compare that to the fit from the fitdistr function:

dist <- fitdistr(housing$`1stFlrSF`, 'exponential')
lambda <- 1/(sum(housing$`1stFlrSF`)/length(housing$`1stFlrSF`))
dist$estimate
##         rate 
## 0.0008601213

We sample from the fitted distrubtion and plot the result:

samples <- rexp(1000, lambda)
df <- data.frame(sample = samples)
ggplot(df) + geom_histogram(aes(sample), fill = 'darkblue', color = 'black') + labs(title = 'Fitted Exponential')

To solve for the 5th and 95th percentiles, we solve the equation \(1-P={ e }^{ -\lambda x }\)

\(log(1-P)=-\lambda x\)

q_95t <- log(.05)/lambda * -1
q_5t <- log(.95)/lambda * -1

The estimates for the 5th percentile is 59.6349542 and 95th percentile is 3482.9183642

We also generate percentiles for a normal fitted with normal approximation:

sdev <- sd(housing$`1stFlrSF`)
mean_a <- mean(housing$`1stFlrSF`)
delta <- qnorm(.975)*sdev

norms <- c(mean_a - delta, mean_a + delta)
names(norms) <- c('5%', '95%')
norms
##        5%       95% 
##  404.9287 1920.3248

And finally for the emperical:

quantile(housing$`1stFlrSF`, c(.05, .95))
##      5%     95% 
##  672.95 1831.25

We can see from the plot of the fitted exponential distribution that it is far more left skewed than the emperical (plotted in part 2). The dispersion is also much greater for the exponential. The dispersion of the emperical is not as great as the normal, indicating some possible kurtosis. In order to get a good fit for our oddly shaped distribution, we need to fit a non-symmetric distrubtion with at least a scale and a shape parameter.

Part 5

We will now build a model using this dataset. With so many variables, we will look to use programmatic model building techniques when possible. We start by isolating the numeric variables.

housing <- read_csv('https://raw.githubusercontent.com/TheFedExpress/Data/master/605train')

only_numeric <- housing %>% keep(is.numeric)
mat <- as.matrix(only_numeric)
cor_mat <- cor(mat)
corrplot(cor_mat, type = 'upper')

It looks like a correlation of about .3 will serve as a cutoff for separating the wheat from the chaff.

good_vars <- cor_mat %>%
  as.data.frame() %>%
  mutate(varname = rownames(cor_mat)) %>%
  filter(abs(SalePrice) >= .3) %>%
  dplyr::select(varname, SalePrice)



good_frame <- dplyr::select(only_numeric, good_vars$varname) %>%
  mutate(YearBuilt = 2017 - YearBuilt, YearRemodAdd = 2017 - YearRemodAdd )
model1 <- lm(SalePrice ~ ., data = good_frame)
pander(summary(model1))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) -51079 8476 -6.026 2.128e-09
OverallQual 19407 1174 16.54 2.188e-56
YearBuilt -191.6 49.72 -3.854 0.0001214
YearRemodAdd -323.4 62.26 -5.194 2.358e-07
BsmtFinSF1 18.32 2.602 7.041 2.944e-12
TotalBsmtSF 12.42 4.329 2.869 0.004181
1stFlrSF 32 20.96 1.527 0.1271
2ndFlrSF 23.65 20.61 1.148 0.2513
GrLivArea 17.73 20.52 0.8643 0.3876
FullBath -2423 2639 -0.9181 0.3587
TotRmsAbvGrd 1668 1097 1.52 0.1286
Fireplaces 7697 1792 4.296 1.852e-05
GarageCars 10217 2986 3.422 0.0006387
GarageArea 12.89 10.12 1.273 0.2031
WoodDeckSF 31.75 8.179 3.882 0.0001084
OpenPorchSF 6.312 15.77 0.4003 0.689
Fitting linear model: SalePrice ~ .
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1460 36708 0.7887 0.7865
plot(model1)

This model already offers a decent fit, but transforming the variables should help linearity and normality of residuals.

varnames <- vector()
for (i in 1:16){
  varnames <- c(as.name(good_vars$varname[i]), varnames)
}

good_frame <- good_frame + 1
params <- powerTransform(good_frame)$lambda

new_frame <- as.matrix(good_frame)
for (i in 1:ncol(good_frame)){
  new_frame[, i] <- new_frame[, i] ** params[i]
}
new_frame <- as.data.frame(new_frame)
model2 <- lm(SalePrice ~ . , new_frame)
pander(summary(model2))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.125 0.139 8.092 1.232e-15
OverallQual 0.009779 0.0005845 16.73 1.441e-57
YearBuilt 0.5784 0.1086 5.328 1.148e-07
YearRemodAdd 0.1428 0.01661 8.598 2.068e-17
BsmtFinSF1 0.003711 0.0002828 13.13 2.929e-37
TotalBsmtSF 7.393e-06 3.817e-06 1.937 0.05294
1stFlrSF -0.001713 0.002861 -0.5987 0.5494
2ndFlrSF -0.05745 0.02024 -2.838 0.004606
GrLivArea 0.09594 0.01152 8.328 1.892e-16
FullBath -0.0004374 0.001556 -0.2811 0.7787
TotRmsAbvGrd 0.00412 0.006858 0.6008 0.5481
Fireplaces -0.03739 0.004976 -7.515 9.936e-14
GarageCars 0.008931 0.002463 3.627 0.000297
GarageArea 7.238e-05 3.749e-05 1.931 0.05372
WoodDeckSF -0.01177 0.003281 -3.588 0.0003445
OpenPorchSF -0.0783 0.07537 -1.039 0.2991
Fitting linear model: SalePrice ~ .
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1460 0.01892 0.8563 0.8548
plot(model2)

That’s considerably better. Now we’ll work on the categoricals.

only_char <- housing %>% keep(is.character) %>%
  unclass %>%
  as.data.frame %>%
  dplyr::select(-c(Street, Condition2, Utilities, RoofMatl, Heating, PoolQC, Fence, MiscFeature,
                   GarageQual, GarageCond, GarageType)) #remove variables with many NAs

change_none <- function(col){
  col = ifelse(is.na(col), 'none', col)
}

only_char <- sapply(dplyr::select(only_char, -Alley), change_none) %>%
  cbind(housing$SalePrice) %>%
  as.data.frame()
all_df <- cbind(only_char, new_frame)

With the variables selected, we employ programmatic stepwise regression

all_model <- lm(SalePrice ~ . , data = dplyr::select(all_df, -V32))
summary(all_model)
selection <- stepAIC(all_model,direction="backward")

best_model <- lm(SalePrice ~ MSZoning + LotShape + LandContour + LotConfig + Neighborhood + 
                   Condition1 + BldgType + HouseStyle + Exterior1st + ExterCond + 
                   Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + 
                   BsmtFinType2 + HeatingQC + CentralAir + KitchenQual + Functional + 
                   SaleCondition + OverallQual + YearRemodAdd + BsmtFinSF1 + 
                   TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF + 
                   OpenPorchSF,
                 
   data = dplyr::select(all_df, -V32))
pander(data.frame(Adj.R.Squared = summary(best_model)$adj.r.squared))
Adj.R.Squared
0.9123
plot(best_model)

This can be improved by better handling of the categoticals in the stepwise regression

all_chars2 <- model.matrix(SalePrice ~ MSZoning + LotShape + LandContour + LotConfig + Neighborhood + 
  Condition1 + BldgType + HouseStyle + Exterior1st + ExterCond + 
  Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + 
  BsmtFinType2 + HeatingQC + CentralAir + KitchenQual + Functional + 
  SaleCondition + OverallQual + YearRemodAdd + BsmtFinSF1 + 
  TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF + 
  OpenPorchSF, all_df) %>% as.data.frame %>%
  dplyr::select(-`(Intercept)`)

all_df2 <- cbind(all_chars2, all_df$SalePrice) %>%
  rename(SalePrice = `all_df$SalePrice`)

all_model2 <- lm(SalePrice ~ ., data = all_df2)

selection <- stepAIC(all_model2,direction="backward")
best_model2 <- lm(SalePrice ~ MSZoning2 + MSZoning3 + MSZoning4 + MSZoning5 + MSZoningnone + 
                    LotShape2 + LotShape3 + LandContour2 + LandContour3 + LandContour4 + 
                    LotConfig2 + LotConfig3 + LotConfig5 + Neighborhood10 + Neighborhood11 + 
                    Neighborhood12 + Neighborhood13 + Neighborhood14 + Neighborhood16 + 
                    Neighborhood18 + Neighborhood19 + Neighborhood21 + Neighborhood22 + 
                    Neighborhood25 + Neighborhood5 + Neighborhood7 + Neighborhood8 + 
                    Condition13 + Condition17 + BldgType3 + BldgType4 + BldgType5 + 
                    HouseStyle2 + HouseStyle3 + HouseStyle6 + HouseStyle7 + Exterior1st13 + 
                    Exterior1st3 + Exterior1st4 + Exterior1st6 + Exterior1st9 + 
                    ExterCond2 + ExterCond3 + ExterCond4 + ExterCond5 + Foundation2 + 
                    Foundation3 + Foundation5 + BsmtQual2 + BsmtQual3 + BsmtQual4 + 
                    BsmtQualnone + BsmtCond2 + BsmtCond4 + BsmtExposure2 + BsmtExposure4 + 
                    BsmtFinType12 + BsmtFinType14 + BsmtFinType15 + BsmtFinType22 + 
                    BsmtFinType24 + BsmtFinType25 + BsmtFinType26 + HeatingQC2 + 
                    HeatingQC3 + HeatingQC5 + CentralAir2 + KitchenQual2 + KitchenQual3 + 
                    KitchenQual4 + Functional2 + Functional6 + Functional7 + 
                    Functionalnone + SaleCondition2 + SaleCondition3 + SaleCondition5 + 
                    SaleCondition6 + OverallQual + YearRemodAdd + BsmtFinSF1 + 
                    TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF + 
                    OpenPorchSF, data = all_df2)
pander(summary(best_model2))
  Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.652 0.06615 24.97 9.768e-114
MSZoning2 0.04969 0.006879 7.223 8.387e-13
MSZoning3 0.04786 0.006804 7.034 3.163e-12
MSZoning4 0.04871 0.005763 8.451 7.28e-17
MSZoning5 0.04465 0.005474 8.158 7.651e-16
MSZoningnone 0.06677 0.01218 5.481 5.03e-08
LotShape2 0.005345 0.002462 2.171 0.03012
LotShape3 -0.008062 0.004876 -1.653 0.09849
LandContour2 0.01083 0.002973 3.642 0.0002802
LandContour3 0.009827 0.003466 2.836 0.00464
LandContour4 0.008581 0.002045 4.196 2.897e-05
LotConfig2 0.003716 0.001856 2.002 0.04549
LotConfig3 -0.00661 0.002407 -2.746 0.006111
LotConfig5 -0.001791 0.001035 -1.731 0.08368
Neighborhood10 -0.006048 0.003351 -1.805 0.07134
Neighborhood11 -0.01601 0.004837 -3.309 0.0009606
Neighborhood12 -0.004975 0.002356 -2.111 0.03491
Neighborhood13 -0.003252 0.001516 -2.145 0.03214
Neighborhood14 0.01632 0.002593 6.295 4.139e-10
Neighborhood16 0.01364 0.002231 6.112 1.279e-09
Neighborhood18 -0.01134 0.002234 -5.078 4.342e-07
Neighborhood19 -0.003032 0.002066 -1.468 0.1424
Neighborhood21 0.007792 0.003482 2.238 0.02541
Neighborhood22 0.01788 0.003325 5.38 8.77e-08
Neighborhood25 0.01111 0.004663 2.382 0.01733
Neighborhood5 0.00718 0.003239 2.217 0.02679
Neighborhood7 0.01334 0.002392 5.577 2.947e-08
Neighborhood8 -0.01164 0.001829 -6.364 2.678e-10
Condition13 0.008508 0.001279 6.653 4.125e-11
Condition17 0.00575 0.003219 1.786 0.07427
BldgType3 -0.0099 0.002556 -3.874 0.0001121
BldgType4 -0.01579 0.002711 -5.824 7.168e-09
BldgType5 -0.01107 0.001773 -6.24 5.806e-10
HouseStyle2 0.007411 0.004311 1.719 0.0858
HouseStyle3 0.00646 0.001559 4.143 3.632e-05
HouseStyle6 -0.005169 0.001401 -3.689 0.0002338
HouseStyle7 0.007232 0.003123 2.316 0.02073
Exterior1st13 0.002578 0.001201 2.147 0.03197
Exterior1st3 -0.01549 0.01078 -1.436 0.1512
Exterior1st4 0.01262 0.002282 5.529 3.855e-08
Exterior1st6 0.003599 0.002567 1.402 0.1612
Exterior1st9 0.003806 0.001247 3.052 0.002316
ExterCond2 -0.03599 0.009181 -3.92 9.303e-05
ExterCond3 -0.0267 0.008686 -3.074 0.002151
ExterCond4 -0.06997 0.01744 -4.012 6.349e-05
ExterCond5 -0.02745 0.008641 -3.176 0.001525
Foundation2 0.006466 0.001592 4.061 5.16e-05
Foundation3 0.004641 0.001796 2.584 0.009865
Foundation5 0.01648 0.006264 2.631 0.008609
BsmtQual2 -0.007783 0.003632 -2.143 0.03229
BsmtQual3 -0.006477 0.001912 -3.388 0.0007231
BsmtQual4 -0.009467 0.002329 -4.065 5.074e-05
BsmtQualnone -0.02537 0.005767 -4.399 1.171e-05
BsmtCond2 0.008165 0.003093 2.64 0.008389
BsmtCond4 0.00666 0.002432 2.739 0.00625
BsmtExposure2 0.007059 0.001662 4.248 2.301e-05
BsmtExposure4 -0.002397 0.001039 -2.307 0.02122
BsmtFinType12 -0.003435 0.001479 -2.323 0.02034
BsmtFinType14 -0.006525 0.001991 -3.277 0.001075
BsmtFinType15 -0.003147 0.001576 -1.998 0.04596
BsmtFinType22 -0.01304 0.003763 -3.465 0.0005462
BsmtFinType24 -0.008412 0.00359 -2.343 0.01927
BsmtFinType25 -0.00715 0.003424 -2.088 0.03697
BsmtFinType26 -0.009818 0.002812 -3.491 0.0004965
HeatingQC2 -0.006132 0.002486 -2.467 0.01376
HeatingQC3 -0.002916 0.001216 -2.398 0.01663
HeatingQC5 -0.003363 0.001204 -2.792 0.005304
CentralAir2 0.009365 0.001967 4.76 2.141e-06
KitchenQual2 -0.01085 0.003521 -3.082 0.0021
KitchenQual3 -0.009746 0.001971 -4.945 8.535e-07
KitchenQual4 -0.01147 0.002236 -5.131 3.298e-07
Functional2 -0.03187 0.006956 -4.582 5.015e-06
Functional6 -0.03522 0.01514 -2.326 0.02016
Functional7 0.01064 0.001757 6.054 1.822e-09
Functionalnone 0.02537 0.01483 1.71 0.08745
SaleCondition2 0.01482 0.007828 1.893 0.0586
SaleCondition3 0.01074 0.004834 2.221 0.02652
SaleCondition5 0.009098 0.001463 6.22 6.568e-10
SaleCondition6 0.01253 0.002197 5.703 1.439e-08
OverallQual 0.0057 0.0005211 10.94 9.247e-27
YearRemodAdd 0.0818 0.01512 5.409 7.466e-08
BsmtFinSF1 0.00281 0.000252 11.15 1.04e-27
TotalBsmtSF -7.077e-06 3.61e-06 -1.96 0.05015
GrLivArea 0.1001 0.004298 23.3 2.051e-101
Fireplaces -0.01474 0.00409 -3.604 0.0003249
GarageCars 0.01053 0.001079 9.763 8.182e-22
WoodDeckSF -0.007983 0.00272 -2.935 0.00339
OpenPorchSF -0.1107 0.06034 -1.835 0.0667
Fitting linear model: SalePrice ~ MSZoning2 + MSZoning3 + MSZoning4 + MSZoning5 + MSZoningnone + LotShape2 + LotShape3 + LandContour2 + LandContour3 + LandContour4 + LotConfig2 + LotConfig3 + LotConfig5 + Neighborhood10 + Neighborhood11 + Neighborhood12 + Neighborhood13 + Neighborhood14 + Neighborhood16 + Neighborhood18 + Neighborhood19 + Neighborhood21 + Neighborhood22 + Neighborhood25 + Neighborhood5 + Neighborhood7 + Neighborhood8 + Condition13 + Condition17 + BldgType3 + BldgType4 + BldgType5 + HouseStyle2 + HouseStyle3 + HouseStyle6 + HouseStyle7 + Exterior1st13 + Exterior1st3 + Exterior1st4 + Exterior1st6 + Exterior1st9 + ExterCond2 + ExterCond3 + ExterCond4 + ExterCond5 + Foundation2 + Foundation3 + Foundation5 + BsmtQual2 + BsmtQual3 + BsmtQual4 + BsmtQualnone + BsmtCond2 + BsmtCond4 + BsmtExposure2 + BsmtExposure4 + BsmtFinType12 + BsmtFinType14 + BsmtFinType15 + BsmtFinType22 + BsmtFinType24 + BsmtFinType25 + BsmtFinType26 + HeatingQC2 + HeatingQC3 + HeatingQC5 + CentralAir2 + KitchenQual2 + KitchenQual3 + KitchenQual4 + Functional2 + Functional6 + Functional7 + Functionalnone + SaleCondition2 + SaleCondition3 + SaleCondition5 + SaleCondition6 + OverallQual + YearRemodAdd + BsmtFinSF1 + TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF + OpenPorchSF
Observations Residual Std. Error \(R^2\) Adjusted \(R^2\)
1460 0.01456 0.9191 0.914
plot(best_model2)

The plot diagnostics aren’t noticeably better, but is had far few variables, nearly all of which are signficant. This should help with overfitting on the test set. We also managed to improve adjusted R squared.

Finally, this code will output the predictions for submission
My user name is PeterGoodridge
The final score was 0.14396

housing_test <-  read_csv('https://raw.githubusercontent.com/TheFedExpress/Data/master/605test')

remove_na <- function(col){
  col = ifelse(is.na(col), 0, col)
  return (col)
}

only_numeric <- housing_test %>% keep(is.numeric) %>%
  lapply(remove_na) %>%
  as.data.frame
only_numeric <- rename(only_numeric, `1stFlrSF` = X1stFlrSF, `2ndFlrSF` = X2ndFlrSF)

good_frame <- dplyr::select(only_numeric, good_vars$varname[1:15]) %>%
  mutate(YearBuilt = 2017 - YearBuilt, YearRemodAdd = 2017 - YearRemodAdd)
good_frame <- good_frame + 1
new_frame <- as.matrix(good_frame)
for (i in 1:ncol(good_frame)){
  new_frame[, i] <- new_frame[, i] ** params[i]
}
new_frame <- as.data.frame(new_frame)
only_char <- housing_test %>% keep(is.character) %>%
  unclass %>%
  as.data.frame %>%
  dplyr::select(-c(Street, Condition2, Utilities, RoofMatl, Heating, PoolQC, Fence, MiscFeature,
                   GarageQual, GarageCond, GarageType, Alley)) 

change_none <- function(col){
  ifelse(is.na(col), 'none', col)
}

only_char <- sapply(only_char, change_none) %>%
  as.data.frame()


all_df <- cbind(only_char, new_frame)


final_df <- model.matrix(YearBuilt ~ MSZoning + LotShape + LandContour + LotConfig + Neighborhood + 
                             Condition1 + BldgType + HouseStyle + Exterior1st + ExterCond + 
                             Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 + 
                             BsmtFinType2 + HeatingQC + CentralAir + KitchenQual + Functional + 
                             SaleCondition + OverallQual + YearRemodAdd + BsmtFinSF1 + 
                             TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF + 
                             OpenPorchSF, all_df) %>% as.data.frame %>%
  dplyr::select(-`(Intercept)`)



predictions <- predict.lm(best_model2, final_df) ** (1/params[length(params)])

out_df <- data.frame(Id = housing_test$Id, SalePrice = predictions)
write_csv(out_df, 'C:\\Users\\pgood\\OneDrive\\Documents\\R\\Data 605\\submission.csv')