library(tidyverse)
library(car)
library(MASS)
library(Hmisc)
library(pander)
library(corrplot)
housing <- read_csv('C:\\Users\\pgood\\OneDrive\\Documents\\R\\Data 605\\train.csv')
area25 <- quantile(housing$`1stFlrSF`, .25)
price25<- quantile(housing$SalePrice, .25)
pr_a <- sum(housing$SalePrice > price25 & housing$`1stFlrSF` > area25)/sum(housing$SalePrice > price25)
pr_b <- sum(housing$SalePrice > price25 & housing$`1stFlrSF` > area25)/length(housing$SalePrice)
pr_c <- sum(housing$SalePrice > price25 & housing$`1stFlrSF` < area25)/sum(housing$SalePrice > price25)
\(P(X>x|Y>y)=\) 0.8273973
Out of the observations where Y is above its first quartile, this is the probability that X is also above its first quartile. If X and Y were independent, this would be approximately .75; a value well above .75 suggests the variables are positively associated.
\(P(X>x,Y>y)=\) 0.6205479
This is known as the intersection in set theory. It is simply the proportion of observations where both variables are above their first quartiles.
\(P(X<x|Y>y)=\) 0.1707763
Out of the observations where Y is above its first quartile, this is the probability that X is below its first quartile. If X and Y were independent, this would be approximately .25. This is also (approximately, for our pseudo-continuous distribution) the complement of the probability in part (a).
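As a quick sanity check on that complement relationship (a sketch using the quantities computed above; the sum falls slightly short of 1 because observations tied at the quartile are counted in neither probability):
pr_a + pr_c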
housing_table <- housing %>% mutate(
area_above25 = ifelse(`1stFlrSF` <= area25, '<=1st quartile', '>1st quartile'),
price_above25 = ifelse(SalePrice <= price25, '<=1st quartile', '>1st quartile')
)
pander(addmargins(table(housing_table$area_above25, housing_table$price_above25)))
| 1stFlrSF (rows) / SalePrice (columns) | <=1st quartile | >1st quartile | Sum |
|---|---|---|---|
| <=1st quartile | 179 | 189 | 368 |
| >1st quartile | 186 | 906 | 1092 |
| Sum | 365 | 1095 | 1460 |
my_table <- table(housing_table$area_above25, housing_table$price_above25)
To help determine whether SalePrice is independent of 1st floor area, we compare the joint probability that both variables are greater than their first-quartile values with what independence would imply. If the variables were independent, this should be approximately equal to \(.75^{2}\).
sum(housing$SalePrice > price25 & housing$`1stFlrSF` > area25)/length(housing$SalePrice)
## [1] 0.6205479
Since the distributions are technically discrete (there could easily be duplicate values of house prices), we compare against a slightly more precise measure than \(.75^{2}\):
(sum(housing$SalePrice > price25)/length(housing$SalePrice)) * (sum(housing$`1stFlrSF` > area25)/length(housing$SalePrice))
## [1] 0.5609589
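For reference, the idealized value under independence, if the quartile split were exactly 75/25:
.75 ^ 2  # 0.5625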
Finally, we run a Chi Square test:
chisq.test(my_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: my_table
## X-squared = 144.98, df = 1, p-value < 2.2e-16
The result is easily significant, and we conclude the variables are not independent.
To begin checking the regression assumptions for these variables, we examine how each candidate predictor relates to the dependent variable (SalePrice) and how skewed each distribution is.
ggplot(housing) + geom_point(aes(x = `1stFlrSF`, y = SalePrice/1000)) + labs(x = '1st Floor Area', y = 'Sale Price (1000s)')
ggplot(housing) + geom_histogram(aes(x = `1stFlrSF`), color = 'black', fill = 'darkblue') + labs(x = '1st Floor Area')
ggplot(housing) + geom_point(aes(x = GarageArea, y = SalePrice/1000)) + labs(x = 'Garage Area', y = 'Sale Price (1000s)')
ggplot(housing) + geom_histogram(aes(x = GarageArea), color = 'black', fill = 'darkgreen') + labs(x = 'Garage Area')
ggplot(housing) + geom_point(aes(x = OverallQual, y = SalePrice/1000)) + labs(x = 'Overall Quality', y = 'Sale Price (1000s)')
ggplot(housing) + geom_histogram(aes(x = OverallQual), color = 'black', fill = 'darkred', binwidth = 1) +
labs(x = 'Overall Quality')
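To put numbers on the skewness visible in the histograms, here is a quick moment-based calculation (a sketch; any packaged skewness function would work equally well):
skew <- function(x) mean((x - mean(x))^3) / sd(x)^3  # simple moment-based skewness
sapply(dplyr::select(housing, `1stFlrSF`, GarageArea, OverallQual), skew)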
Next, we check the univariate statistics of each variable.
describe(dplyr::select (housing, `1stFlrSF`, GarageArea, OverallQual))
## dplyr::select(housing, `1stFlrSF`, GarageArea, OverallQual)
##
## 3 Variables 1460 Observations
## ---------------------------------------------------------------------------
## 1stFlrSF
## n missing distinct Info Mean Gmd .05 .10
## 1460 0 753 1 1163 416.4 673.0 756.9
## .25 .50 .75 .90 .95
## 882.0 1087.0 1391.2 1680.0 1831.2
##
## lowest : 334 372 438 480 483, highest: 2633 2898 3138 3228 4692
## ---------------------------------------------------------------------------
## GarageArea
## n missing distinct Info Mean Gmd .05 .10
## 1460 0 441 1 473 234.9 0.0 240.0
## .25 .50 .75 .90 .95
## 334.5 480.0 576.0 757.1 850.1
##
## lowest : 0 160 164 180 186, highest: 1220 1248 1356 1390 1418
## ---------------------------------------------------------------------------
## OverallQual
## n missing distinct Info Mean Gmd .05 .10
## 1460 0 10 0.951 6.099 1.522 4 5
## .25 .50 .75 .90 .95
## 5 6 7 8 8
##
## Value 1 2 3 4 5 6 7 8 9 10
## Frequency 2 3 20 116 397 374 319 168 43 18
## Proportion 0.001 0.002 0.014 0.079 0.272 0.256 0.218 0.115 0.029 0.012
## ---------------------------------------------------------------------------
Finally, we see how the variables are correlated:
mat <- as.matrix(dplyr::select(housing, GarageArea, OverallQual, `1stFlrSF`))
cor_test <- function(var1, var2){
interval <- cor.test(var1, var2, conf.level = .92)$conf.int
value <- cor.test(var1, var2, conf.level = .92)$p.value
attr(interval, 'conf.level') <- NULL
return(c(interval, value))
}
correlations <- rbind(cor_test(housing$GarageArea, housing$OverallQual),
cor_test(housing$GarageArea, housing$`1stFlrSF`),
cor_test(housing$OverallQual, housing$`1stFlrSF`)
)
correlations <- cbind(c('Garage Area, Quality', 'Garage Area, 1st Floor Area', 'Quality, 1st Floor Area'), correlations) %>%
as.data.frame()
colnames(correlations) <- c('Variables', 'Lower Bound', 'Upper Bound', 'P Value')
pander(correlations)
| Variables | Lower Bound | Upper Bound | P Value |
|---|---|---|---|
| Garage Area, Quality | 0.5298 | 0.5926 | 2.434e-122 |
| Garage Area, 1st Floor Area | 0.4541 | 0.5239 | 6.160e-89 |
| Quality, 1st Floor Area | 0.4400 | 0.5109 | 1.623e-83 |
The correlation between each pair of variables is easily significant, which indicates there is some multicollinearity in the dataset. Multicollinearity can inflate the standard errors of the parameter estimates. In this case we are not worried about familywise error, given the level of significance and the lack of any real cost to falsely flagging multicollinearity. Furthermore, there are only 3 hypothesis tests here, and each correlation far exceeds the significance threshold even after an adjustment. If it were a concern, we could apply the Bonferroni correction and change the significance level from .05 to \(\frac{.05}{3}\). Performing this adjustment would be more important if we were testing correlations between the independent variables and the response variable.
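As a sketch of that adjustment, p.adjust with method = 'bonferroni' simply multiplies each p-value by the number of tests, which is equivalent to comparing against .05/3:
.05 / 3  # adjusted significance level for three tests
p.adjust(c(2.43e-122, 6.16e-89, 1.62e-83), method = 'bonferroni')  # p-values from the table above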
We invert the correlation matrix to create the precision matrix
corr_mat <- cor(mat)
prec_mat <- solve(corr_mat)
prec_mat
## GarageArea OverallQual 1stFlrSF
## GarageArea 1.6120830 -0.6854719 -0.4631306
## OverallQual -0.6854719 1.5847769 -0.4189770
## 1stFlrSF -0.4631306 -0.4189770 1.4263597
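As a side note, the diagonal of the precision matrix equals the variance inflation factor of each variable when regressed on the other two, which ties back to the multicollinearity discussion (a quick check):
diag(prec_mat)  # each entry is 1/(1 - R^2) for that variable regressed on the others
1/(1 - summary(lm(GarageArea ~ OverallQual + `1stFlrSF`, data = housing))$r.squared)  # matches the first entry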
When multiplied by the correlation matrix on the left and on the right, we are left with the identity matrix, as expected:
round(corr_mat %*% prec_mat, 5)
## GarageArea OverallQual 1stFlrSF
## GarageArea 1 0 0
## OverallQual 0 1 0
## 1stFlrSF 0 0 1
round(prec_mat %*% corr_mat, 5)
## GarageArea OverallQual 1stFlrSF
## GarageArea 1 0 0
## OverallQual 0 1 0
## 1stFlrSF 0 0 1
Finally, we perform LU factorization on the precision matrix:
U <- prec_mat
rows <- nrow(U)
L <- diag(rows)  # start with the identity; multipliers will fill the lower triangle
r <- 1
for (j in 1:(rows - 1)){
  if (r > rows){
    break
  }
  cur_row <- U[r, ]
  # multipliers for the rows below the current pivot U[r, j]
  factor <- U[c(1:rows) > r, j]/U[r, j]
  # eliminate the entries below the pivot in column j
  U[c(1:rows) > r, ] <- U[c(1:rows) > r, ] -
    matrix(rep(cur_row, each = rows - r), ncol = rows, nrow = rows - r) * factor
  # store the multipliers in L
  L[c(1:rows) > r, j] <- factor
  r <- r + 1
}
L
## [,1] [,2] [,3]
## [1,] 1.0000000 0.0000000 0
## [2,] -0.4252088 1.0000000 0
## [3,] -0.2872871 -0.4762238 1
U
## GarageArea OverallQual 1stFlrSF
## GarageArea 1.612083 -0.6854719 -0.4631306
## OverallQual 0.000000 1.2933083 -0.6159042
## 1stFlrSF 0.000000 0.0000000 1.0000000
near(L %*% U, prec_mat)
## GarageArea OverallQual 1stFlrSF
## [1,] TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE
The factorization was successful.
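As an independent check (a sketch, assuming the Matrix package is available), the built-in LU routine should recover equivalent factors, possibly after a row permutation:
library(Matrix)
lu_check <- expand(lu(prec_mat))  # permutation and triangular factors of prec_mat
lu_check$L
lu_check$U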
We now fit an exponential distribution to the data for the variable “1stFlrSF”. Since the MLE of the exponential rate has a simple closed form (the reciprocal of the sample mean), we compare it to the fit from the fitdistr function:
dist <- fitdistr(housing$`1stFlrSF`, 'exponential')
lambda <- 1/(sum(housing$`1stFlrSF`)/length(housing$`1stFlrSF`))
dist$estimate
## rate
## 0.0008601213
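The closed-form MLE computed above should match the fitdistr estimate (a quick check):
lambda
near(lambda, dist$estimate)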
We sample from the fitted distribution and plot the result:
samples <- rexp(1000, lambda)
df <- data.frame(sample = samples)
ggplot(df) + geom_histogram(aes(sample), fill = 'darkblue', color = 'black') + labs(title = 'Fitted Exponential')
To solve for the 5th and 95th percentiles, we solve the equation \(1-P=e^{-\lambda x}\), which gives \(\log(1-P)=-\lambda x\), i.e. \(x=-\frac{\log(1-P)}{\lambda}\).
q_95t <- log(.05)/lambda * -1
q_5t <- log(.95)/lambda * -1
The estimate for the 5th percentile is 59.6349542 and for the 95th percentile is 3482.9183642.
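As a cross-check, R's built-in exponential quantile function reproduces these values:
qexp(c(.05, .95), rate = lambda)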
We also generate an interval from a fitted normal distribution. Note that qnorm(.975) actually yields the 2.5th and 97.5th percentiles (a 95% central interval); to match the 5th and 95th percentiles above, qnorm(.95) would be used instead:
sdev <- sd(housing$`1stFlrSF`)
mean_a <- mean(housing$`1stFlrSF`)
delta <- qnorm(.975)*sdev
norms <- c(mean_a - delta, mean_a + delta)
names(norms) <- c('5%', '95%')
norms
## 5% 95%
## 404.9287 1920.3248
And finally, the empirical percentiles:
quantile(housing$`1stFlrSF`, c(.05, .95))
## 5% 95%
## 672.95 1831.25
We can see from the plot of the fitted exponential distribution that its mass sits much further to the left (and it is far more right-skewed) than the empirical distribution plotted in part 2. The dispersion is also much greater for the exponential. The dispersion of the empirical distribution is not as great as the normal approximation suggests, indicating some possible excess kurtosis. To get a good fit for this oddly shaped distribution, we would need a non-symmetric distribution with at least a scale and a shape parameter, such as the gamma sketched below.
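One candidate with both a shape and a scale parameter is the gamma distribution. A minimal sketch (assuming fitdistr converges here; the data are rescaled to help the optimizer, and the quantiles are scaled back afterwards):
gamma_fit <- fitdistr(housing$`1stFlrSF` / 1000, 'gamma')
qgamma(c(.05, .95), shape = gamma_fit$estimate['shape'], rate = gamma_fit$estimate['rate']) * 1000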
We will now build a model using this dataset. With so many variables, we will look to use programmatic model building techniques when possible. We start by isolating the numeric variables.
housing <- read_csv('https://raw.githubusercontent.com/TheFedExpress/Data/master/605train')
only_numeric <- housing %>% keep(is.numeric)
mat <- as.matrix(only_numeric)
cor_mat <- cor(mat)
corrplot(cor_mat, type = 'upper')
good_vars <- cor_mat %>%
as.data.frame() %>%
mutate(varname = rownames(cor_mat)) %>%
filter(abs(SalePrice) >= .3) %>%
dplyr::select(varname, SalePrice)
good_frame <- dplyr::select(only_numeric, good_vars$varname) %>%
mutate(YearBuilt = 2017 - YearBuilt, YearRemodAdd = 2017 - YearRemodAdd )
model1 <- lm(SalePrice ~ ., data = good_frame)
pander(summary(model1))
|   | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | -51079 | 8476 | -6.026 | 2.128e-09 |
| OverallQual | 19407 | 1174 | 16.54 | 2.188e-56 |
| YearBuilt | -191.6 | 49.72 | -3.854 | 0.0001214 |
| YearRemodAdd | -323.4 | 62.26 | -5.194 | 2.358e-07 |
| BsmtFinSF1 | 18.32 | 2.602 | 7.041 | 2.944e-12 |
| TotalBsmtSF | 12.42 | 4.329 | 2.869 | 0.004181 |
| 1stFlrSF | 32 | 20.96 | 1.527 | 0.1271 |
| 2ndFlrSF | 23.65 | 20.61 | 1.148 | 0.2513 |
| GrLivArea | 17.73 | 20.52 | 0.8643 | 0.3876 |
| FullBath | -2423 | 2639 | -0.9181 | 0.3587 |
| TotRmsAbvGrd | 1668 | 1097 | 1.52 | 0.1286 |
| Fireplaces | 7697 | 1792 | 4.296 | 1.852e-05 |
| GarageCars | 10217 | 2986 | 3.422 | 0.0006387 |
| GarageArea | 12.89 | 10.12 | 1.273 | 0.2031 |
| WoodDeckSF | 31.75 | 8.179 | 3.882 | 0.0001084 |
| OpenPorchSF | 6.312 | 15.77 | 0.4003 | 0.689 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1460 | 36708 | 0.7887 | 0.7865 |
plot(model1)
varnames <- vector()
for (i in 1:16){
varnames <- c(as.name(good_vars$varname[i]), varnames)
}
good_frame <- good_frame + 1  # shift every value up by 1 so the Box-Cox transform sees only positive values
params <- powerTransform(good_frame)$lambda  # estimated Box-Cox power (lambda) for each column
new_frame <- as.matrix(good_frame)
for (i in 1:ncol(good_frame)){
  new_frame[, i] <- new_frame[, i] ** params[i]  # apply each column's estimated power
}
new_frame <- as.data.frame(new_frame)
model2 <- lm(SalePrice ~ . , new_frame)
pander(summary(model2))
|   | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | 1.125 | 0.139 | 8.092 | 1.232e-15 |
| OverallQual | 0.009779 | 0.0005845 | 16.73 | 1.441e-57 |
| YearBuilt | 0.5784 | 0.1086 | 5.328 | 1.148e-07 |
| YearRemodAdd | 0.1428 | 0.01661 | 8.598 | 2.068e-17 |
| BsmtFinSF1 | 0.003711 | 0.0002828 | 13.13 | 2.929e-37 |
| TotalBsmtSF | 7.393e-06 | 3.817e-06 | 1.937 | 0.05294 |
| 1stFlrSF | -0.001713 | 0.002861 | -0.5987 | 0.5494 |
| 2ndFlrSF | -0.05745 | 0.02024 | -2.838 | 0.004606 |
| GrLivArea | 0.09594 | 0.01152 | 8.328 | 1.892e-16 |
| FullBath | -0.0004374 | 0.001556 | -0.2811 | 0.7787 |
| TotRmsAbvGrd | 0.00412 | 0.006858 | 0.6008 | 0.5481 |
| Fireplaces | -0.03739 | 0.004976 | -7.515 | 9.936e-14 |
| GarageCars | 0.008931 | 0.002463 | 3.627 | 0.000297 |
| GarageArea | 7.238e-05 | 3.749e-05 | 1.931 | 0.05372 |
| WoodDeckSF | -0.01177 | 0.003281 | -3.588 | 0.0003445 |
| OpenPorchSF | -0.0783 | 0.07537 | -1.039 | 0.2991 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1460 | 0.01892 | 0.8563 | 0.8548 |
plot(model2)
only_char <- housing %>% keep(is.character) %>%
unclass %>%
as.data.frame %>%
dplyr::select(-c(Street, Condition2, Utilities, RoofMatl, Heating, PoolQC, Fence, MiscFeature,
GarageQual, GarageCond, GarageType)) #remove variables with many NAs
change_none <- function(col){
col = ifelse(is.na(col), 'none', col)
}
only_char <- sapply(dplyr::select(only_char, -Alley), change_none) %>%
cbind(housing$SalePrice) %>%
as.data.frame()
all_df <- cbind(only_char, new_frame)
all_model <- lm(SalePrice ~ . , data = dplyr::select(all_df, -V32))
summary(all_model)
selection <- stepAIC(all_model,direction="backward")
best_model <- lm(SalePrice ~ MSZoning + LotShape + LandContour + LotConfig + Neighborhood +
Condition1 + BldgType + HouseStyle + Exterior1st + ExterCond +
Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 +
BsmtFinType2 + HeatingQC + CentralAir + KitchenQual + Functional +
SaleCondition + OverallQual + YearRemodAdd + BsmtFinSF1 +
TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF +
OpenPorchSF,
data = dplyr::select(all_df, -V32))
pander(data.frame(Adj.R.Squared = summary(best_model)$adj.r.squared))
| Adj.R.Squared |
|---|
| 0.9123 |
plot(best_model)
all_chars2 <- model.matrix(SalePrice ~ MSZoning + LotShape + LandContour + LotConfig + Neighborhood +
Condition1 + BldgType + HouseStyle + Exterior1st + ExterCond +
Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 +
BsmtFinType2 + HeatingQC + CentralAir + KitchenQual + Functional +
SaleCondition + OverallQual + YearRemodAdd + BsmtFinSF1 +
TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF +
OpenPorchSF, all_df) %>% as.data.frame %>%
dplyr::select(-`(Intercept)`)
all_df2 <- cbind(all_chars2, all_df$SalePrice) %>%
rename(SalePrice = `all_df$SalePrice`)
all_model2 <- lm(SalePrice ~ ., data = all_df2)
selection <- stepAIC(all_model2,direction="backward")
best_model2 <- lm(SalePrice ~ MSZoning2 + MSZoning3 + MSZoning4 + MSZoning5 + MSZoningnone +
LotShape2 + LotShape3 + LandContour2 + LandContour3 + LandContour4 +
LotConfig2 + LotConfig3 + LotConfig5 + Neighborhood10 + Neighborhood11 +
Neighborhood12 + Neighborhood13 + Neighborhood14 + Neighborhood16 +
Neighborhood18 + Neighborhood19 + Neighborhood21 + Neighborhood22 +
Neighborhood25 + Neighborhood5 + Neighborhood7 + Neighborhood8 +
Condition13 + Condition17 + BldgType3 + BldgType4 + BldgType5 +
HouseStyle2 + HouseStyle3 + HouseStyle6 + HouseStyle7 + Exterior1st13 +
Exterior1st3 + Exterior1st4 + Exterior1st6 + Exterior1st9 +
ExterCond2 + ExterCond3 + ExterCond4 + ExterCond5 + Foundation2 +
Foundation3 + Foundation5 + BsmtQual2 + BsmtQual3 + BsmtQual4 +
BsmtQualnone + BsmtCond2 + BsmtCond4 + BsmtExposure2 + BsmtExposure4 +
BsmtFinType12 + BsmtFinType14 + BsmtFinType15 + BsmtFinType22 +
BsmtFinType24 + BsmtFinType25 + BsmtFinType26 + HeatingQC2 +
HeatingQC3 + HeatingQC5 + CentralAir2 + KitchenQual2 + KitchenQual3 +
KitchenQual4 + Functional2 + Functional6 + Functional7 +
Functionalnone + SaleCondition2 + SaleCondition3 + SaleCondition5 +
SaleCondition6 + OverallQual + YearRemodAdd + BsmtFinSF1 +
TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF +
OpenPorchSF, data = all_df2)
pander(summary(best_model2))
|   | Estimate | Std. Error | t value | Pr(>|t|) |
|---|---|---|---|---|
| (Intercept) | 1.652 | 0.06615 | 24.97 | 9.768e-114 |
| MSZoning2 | 0.04969 | 0.006879 | 7.223 | 8.387e-13 |
| MSZoning3 | 0.04786 | 0.006804 | 7.034 | 3.163e-12 |
| MSZoning4 | 0.04871 | 0.005763 | 8.451 | 7.28e-17 |
| MSZoning5 | 0.04465 | 0.005474 | 8.158 | 7.651e-16 |
| MSZoningnone | 0.06677 | 0.01218 | 5.481 | 5.03e-08 |
| LotShape2 | 0.005345 | 0.002462 | 2.171 | 0.03012 |
| LotShape3 | -0.008062 | 0.004876 | -1.653 | 0.09849 |
| LandContour2 | 0.01083 | 0.002973 | 3.642 | 0.0002802 |
| LandContour3 | 0.009827 | 0.003466 | 2.836 | 0.00464 |
| LandContour4 | 0.008581 | 0.002045 | 4.196 | 2.897e-05 |
| LotConfig2 | 0.003716 | 0.001856 | 2.002 | 0.04549 |
| LotConfig3 | -0.00661 | 0.002407 | -2.746 | 0.006111 |
| LotConfig5 | -0.001791 | 0.001035 | -1.731 | 0.08368 |
| Neighborhood10 | -0.006048 | 0.003351 | -1.805 | 0.07134 |
| Neighborhood11 | -0.01601 | 0.004837 | -3.309 | 0.0009606 |
| Neighborhood12 | -0.004975 | 0.002356 | -2.111 | 0.03491 |
| Neighborhood13 | -0.003252 | 0.001516 | -2.145 | 0.03214 |
| Neighborhood14 | 0.01632 | 0.002593 | 6.295 | 4.139e-10 |
| Neighborhood16 | 0.01364 | 0.002231 | 6.112 | 1.279e-09 |
| Neighborhood18 | -0.01134 | 0.002234 | -5.078 | 4.342e-07 |
| Neighborhood19 | -0.003032 | 0.002066 | -1.468 | 0.1424 |
| Neighborhood21 | 0.007792 | 0.003482 | 2.238 | 0.02541 |
| Neighborhood22 | 0.01788 | 0.003325 | 5.38 | 8.77e-08 |
| Neighborhood25 | 0.01111 | 0.004663 | 2.382 | 0.01733 |
| Neighborhood5 | 0.00718 | 0.003239 | 2.217 | 0.02679 |
| Neighborhood7 | 0.01334 | 0.002392 | 5.577 | 2.947e-08 |
| Neighborhood8 | -0.01164 | 0.001829 | -6.364 | 2.678e-10 |
| Condition13 | 0.008508 | 0.001279 | 6.653 | 4.125e-11 |
| Condition17 | 0.00575 | 0.003219 | 1.786 | 0.07427 |
| BldgType3 | -0.0099 | 0.002556 | -3.874 | 0.0001121 |
| BldgType4 | -0.01579 | 0.002711 | -5.824 | 7.168e-09 |
| BldgType5 | -0.01107 | 0.001773 | -6.24 | 5.806e-10 |
| HouseStyle2 | 0.007411 | 0.004311 | 1.719 | 0.0858 |
| HouseStyle3 | 0.00646 | 0.001559 | 4.143 | 3.632e-05 |
| HouseStyle6 | -0.005169 | 0.001401 | -3.689 | 0.0002338 |
| HouseStyle7 | 0.007232 | 0.003123 | 2.316 | 0.02073 |
| Exterior1st13 | 0.002578 | 0.001201 | 2.147 | 0.03197 |
| Exterior1st3 | -0.01549 | 0.01078 | -1.436 | 0.1512 |
| Exterior1st4 | 0.01262 | 0.002282 | 5.529 | 3.855e-08 |
| Exterior1st6 | 0.003599 | 0.002567 | 1.402 | 0.1612 |
| Exterior1st9 | 0.003806 | 0.001247 | 3.052 | 0.002316 |
| ExterCond2 | -0.03599 | 0.009181 | -3.92 | 9.303e-05 |
| ExterCond3 | -0.0267 | 0.008686 | -3.074 | 0.002151 |
| ExterCond4 | -0.06997 | 0.01744 | -4.012 | 6.349e-05 |
| ExterCond5 | -0.02745 | 0.008641 | -3.176 | 0.001525 |
| Foundation2 | 0.006466 | 0.001592 | 4.061 | 5.16e-05 |
| Foundation3 | 0.004641 | 0.001796 | 2.584 | 0.009865 |
| Foundation5 | 0.01648 | 0.006264 | 2.631 | 0.008609 |
| BsmtQual2 | -0.007783 | 0.003632 | -2.143 | 0.03229 |
| BsmtQual3 | -0.006477 | 0.001912 | -3.388 | 0.0007231 |
| BsmtQual4 | -0.009467 | 0.002329 | -4.065 | 5.074e-05 |
| BsmtQualnone | -0.02537 | 0.005767 | -4.399 | 1.171e-05 |
| BsmtCond2 | 0.008165 | 0.003093 | 2.64 | 0.008389 |
| BsmtCond4 | 0.00666 | 0.002432 | 2.739 | 0.00625 |
| BsmtExposure2 | 0.007059 | 0.001662 | 4.248 | 2.301e-05 |
| BsmtExposure4 | -0.002397 | 0.001039 | -2.307 | 0.02122 |
| BsmtFinType12 | -0.003435 | 0.001479 | -2.323 | 0.02034 |
| BsmtFinType14 | -0.006525 | 0.001991 | -3.277 | 0.001075 |
| BsmtFinType15 | -0.003147 | 0.001576 | -1.998 | 0.04596 |
| BsmtFinType22 | -0.01304 | 0.003763 | -3.465 | 0.0005462 |
| BsmtFinType24 | -0.008412 | 0.00359 | -2.343 | 0.01927 |
| BsmtFinType25 | -0.00715 | 0.003424 | -2.088 | 0.03697 |
| BsmtFinType26 | -0.009818 | 0.002812 | -3.491 | 0.0004965 |
| HeatingQC2 | -0.006132 | 0.002486 | -2.467 | 0.01376 |
| HeatingQC3 | -0.002916 | 0.001216 | -2.398 | 0.01663 |
| HeatingQC5 | -0.003363 | 0.001204 | -2.792 | 0.005304 |
| CentralAir2 | 0.009365 | 0.001967 | 4.76 | 2.141e-06 |
| KitchenQual2 | -0.01085 | 0.003521 | -3.082 | 0.0021 |
| KitchenQual3 | -0.009746 | 0.001971 | -4.945 | 8.535e-07 |
| KitchenQual4 | -0.01147 | 0.002236 | -5.131 | 3.298e-07 |
| Functional2 | -0.03187 | 0.006956 | -4.582 | 5.015e-06 |
| Functional6 | -0.03522 | 0.01514 | -2.326 | 0.02016 |
| Functional7 | 0.01064 | 0.001757 | 6.054 | 1.822e-09 |
| Functionalnone | 0.02537 | 0.01483 | 1.71 | 0.08745 |
| SaleCondition2 | 0.01482 | 0.007828 | 1.893 | 0.0586 |
| SaleCondition3 | 0.01074 | 0.004834 | 2.221 | 0.02652 |
| SaleCondition5 | 0.009098 | 0.001463 | 6.22 | 6.568e-10 |
| SaleCondition6 | 0.01253 | 0.002197 | 5.703 | 1.439e-08 |
| OverallQual | 0.0057 | 0.0005211 | 10.94 | 9.247e-27 |
| YearRemodAdd | 0.0818 | 0.01512 | 5.409 | 7.466e-08 |
| BsmtFinSF1 | 0.00281 | 0.000252 | 11.15 | 1.04e-27 |
| TotalBsmtSF | -7.077e-06 | 3.61e-06 | -1.96 | 0.05015 |
| GrLivArea | 0.1001 | 0.004298 | 23.3 | 2.051e-101 |
| Fireplaces | -0.01474 | 0.00409 | -3.604 | 0.0003249 |
| GarageCars | 0.01053 | 0.001079 | 9.763 | 8.182e-22 |
| WoodDeckSF | -0.007983 | 0.00272 | -2.935 | 0.00339 |
| OpenPorchSF | -0.1107 | 0.06034 | -1.835 | 0.0667 |
| Observations | Residual Std. Error | \(R^2\) | Adjusted \(R^2\) |
|---|---|---|---|
| 1460 | 0.01456 | 0.9191 | 0.914 |
plot(best_model2)
The plot diagnostics aren’t noticeably better, but this model has far fewer variables, nearly all of which are significant. That should help prevent overfitting on the test set. We also managed to improve the adjusted R squared.
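As a quick comparison of the reduced model against the full factor model on the same data (a sketch; it does not replace validation on the held-out test set):
AIC(best_model, best_model2)  # lower AIC indicates a better fit-complexity trade-off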
Finally, this code will output the predictions for submission:
housing_test <- read_csv('https://raw.githubusercontent.com/TheFedExpress/Data/master/605test')
remove_na <- function(col){
col = ifelse(is.na(col), 0, col)
return (col)
}
only_numeric <- housing_test %>% keep(is.numeric) %>%
lapply(remove_na) %>%
as.data.frame
only_numeric <- rename(only_numeric, `1stFlrSF` = X1stFlrSF, `2ndFlrSF` = X2ndFlrSF)
good_frame <- dplyr::select(only_numeric, good_vars$varname[1:15]) %>%
mutate(YearBuilt = 2017 - YearBuilt, YearRemodAdd = 2017 - YearRemodAdd)
good_frame <- good_frame + 1
new_frame <- as.matrix(good_frame)
for (i in 1:ncol(good_frame)){
new_frame[, i] <- new_frame[, i] ** params[i]
}
new_frame <- as.data.frame(new_frame)
only_char <- housing_test %>% keep(is.character) %>%
unclass %>%
as.data.frame %>%
dplyr::select(-c(Street, Condition2, Utilities, RoofMatl, Heating, PoolQC, Fence, MiscFeature,
GarageQual, GarageCond, GarageType, Alley))
change_none <- function(col){
ifelse(is.na(col), 'none', col)
}
only_char <- sapply(only_char, change_none) %>%
as.data.frame()
all_df <- cbind(only_char, new_frame)
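# Note: YearBuilt appears below only as a placeholder response (the test set has no SalePrice);
# model.matrix needs every formula variable to exist in all_df, but only the right-hand side is used.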
final_df <- model.matrix(YearBuilt ~ MSZoning + LotShape + LandContour + LotConfig + Neighborhood +
Condition1 + BldgType + HouseStyle + Exterior1st + ExterCond +
Foundation + BsmtQual + BsmtCond + BsmtExposure + BsmtFinType1 +
BsmtFinType2 + HeatingQC + CentralAir + KitchenQual + Functional +
SaleCondition + OverallQual + YearRemodAdd + BsmtFinSF1 +
TotalBsmtSF + GrLivArea + Fireplaces + GarageCars + WoodDeckSF +
OpenPorchSF, all_df) %>% as.data.frame %>%
dplyr::select(-`(Intercept)`)
predictions <- predict.lm(best_model2, final_df) ** (1/params[length(params)])  # back-transform: invert the Box-Cox power that was applied to SalePrice
out_df <- data.frame(Id = housing_test$Id, SalePrice = predictions)
write_csv(out_df, 'C:\\Users\\pgood\\OneDrive\\Documents\\R\\Data 605\\submission.csv')