#The goal of this project is model the sales price of houses in Ames, Iowa based on various housing characteristics. The goal is to build a multiple regression model, check assumptions, and apply Box-Cox. The ultimate purpose is to answer the pratical questions about how much do features like living are, quality, bathroom size, etc. affect sale price, and whether or not bootstrap intervals are consistent with parametric intervals.
#The dataset used is Ames Housing, the response is SalePrice, the home sale price in dollars, and the predictors include the structure, size, quality, and amenity variables. A unit is considered a single home sale. The engineered variables within this project are total_baths, house_age, and years_since_remod.
#The methodology for this project included fitting a full multiple linear regression, assessing multicollinearity via VIF, and assessing residual plots for linearity, variance, and normality. A Box-Cox will be used for transformation, and a Stepwise AIC will be applied with R-Squared and Adjusted R-Squared.The model with the best balance of fit will be selected. Ultimately, the model with the best balance of fit and interpretability will be selected. Bootstrapping confidence intervals will also be implemented.
#The full model including all of the predictors explains 85% of the variation in sale price, with adjusted-R^2 = .853. Residual plots indicate slight skew and heteroscedasticity. The Box-Cox transformation applied shows lambda = 0.17, and the transformed model improved fit with an adjusted R-squared of 0.875, as well as a more normal residual plot. Some key takeaways include that living area and size variables had large and significant effects. Overall quality ratings also showed very strong assosciations. For example, homes rated excellent were worth approximately 138,000 more than very poor, with very excellent being worth 169,000 more. Condition ratings also contributed immensely, with good or excellent ratings contributing to $50,000-59,000 premiums. Amenities such as bathroom, garage, and fireplaces also provided signifcant value to the final sale price of a home. Negative values were observed for older homes and homes that have gone a long time without remodeling. Bootstrap confidence intervals matched closely to paramteric ones, confirming the robustness of the results. Overall, the analysis shows that quality, size of living areas, number of baths, and fireplaces were the strognest drivers of sale price for homes in Ames. The Box-Cox model slightly improved diagnostics, but the stepwise model provided a nearly identical fit to the original model. THe boostrap analysis confirmed the classical infernece, which gives confidence in the assumptions of the parametric models.
#The regression analysis of the Ames housing dataset provided insight into what the main driving factors of house pricing in Ames iowa are. The results indicate that structural characteristics like quality rating, living area, and number of bathrooms are the most influential factors. These findings are in line with expectations of buyers. From a research perspective, the project accurately identified what factors most strongly influence sales price, confirmed normality assumptions, and applied transformations. This study could be improved by adding additional factor, such as school district quality, closeness to important locations, or broader economic trends. Future work should include more interaction relationships, and validate without out of sample results or walk forward cross validation. Exploring non-linear techniques like XGBoosting or LightGBM would also be interesting. Overall, this study provided a solid foundation and helps explain what impacts pricing.
set.seed(321)
suppressPackageStartupMessages({
library(AmesHousing)
library(tidyverse)
library(GGally)
library(MASS)
library(car)
library(broom)
})
## Warning: package 'AmesHousing' was built under R version 4.4.3
## Warning: package 'car' was built under R version 4.4.2
## Warning: package 'carData' was built under R version 4.4.2
housing <- AmesHousing::make_ames()
dat0 <- housing %>%
dplyr::select(
Sale_Price,
Gr_Liv_Area, Total_Bsmt_SF, First_Flr_SF, Second_Flr_SF,
Overall_Qual, Overall_Cond,
Year_Built, Year_Remod_Add,
Full_Bath, Half_Bath, Bsmt_Full_Bath, Bsmt_Half_Bath,
Garage_Cars, Garage_Area,
Fireplaces,
Lot_Area, Mas_Vnr_Area
) %>%
tidyr::drop_na(Sale_Price)
# Feature engineering
dat <- dat0 %>%
dplyr::mutate(
Total_Baths = Full_Bath + 0.5*Half_Bath + Bsmt_Full_Bath + 0.5*Bsmt_Half_Bath,
House_Age = max(Year_Built, na.rm = TRUE) - Year_Built,
Years_Since_Remod = max(Year_Remod_Add, na.rm = TRUE) - Year_Remod_Add,
Overall_Qual_S = as.numeric(Overall_Qual),
Overall_Cond_S = as.numeric(Overall_Cond)
) %>%
dplyr::select(
Sale_Price,
Gr_Liv_Area, Total_Bsmt_SF, First_Flr_SF, Second_Flr_SF,
Overall_Qual_S, Overall_Cond_S,
House_Age, Years_Since_Remod,
Total_Baths, Garage_Cars, Garage_Area, Fireplaces,
Lot_Area, Mas_Vnr_Area
)
summary(dat$Sale_Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 12789 129500 160000 180796 213500 755000
sort(colSums(is.na(dat)), decreasing = TRUE)
## Sale_Price Gr_Liv_Area Total_Bsmt_SF First_Flr_SF
## 0 0 0 0
## Second_Flr_SF Overall_Qual_S Overall_Cond_S House_Age
## 0 0 0 0
## Years_Since_Remod Total_Baths Garage_Cars Garage_Area
## 0 0 0 0
## Fireplaces Lot_Area Mas_Vnr_Area
## 0 0 0
small <- dat %>% dplyr::sample_n(min(1000, nrow(dat)))
GGally::ggpairs(
dplyr::select(small, Sale_Price, Gr_Liv_Area, Total_Bsmt_SF, Overall_Qual_S, Total_Baths, Garage_Cars)
)
cat()
full_formula <- Sale_Price ~ Gr_Liv_Area + Total_Bsmt_SF + First_Flr_SF + Second_Flr_SF +
Overall_Qual_S + Overall_Cond_S + House_Age + Years_Since_Remod +
Total_Baths + Garage_Cars + Garage_Area + Fireplaces + Lot_Area + Mas_Vnr_Area
m_full <- lm(full_formula, data = dat)
summary(m_full)
##
## Call:
## lm(formula = full_formula, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -518019 -17759 -2451 14208 296657
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.770e+04 6.299e+03 -12.335 < 2e-16 ***
## Gr_Liv_Area 2.189e+01 1.403e+01 1.560 0.1189
## Total_Bsmt_SF 1.949e+01 2.639e+00 7.385 1.97e-13 ***
## First_Flr_SF 2.543e+01 1.440e+01 1.767 0.0774 .
## Second_Flr_SF 1.443e+01 1.420e+01 1.016 0.3097
## Overall_Qual_S 1.817e+04 7.712e+02 23.559 < 2e-16 ***
## Overall_Cond_S 4.595e+03 6.830e+02 6.728 2.06e-11 ***
## House_Age -2.637e+02 3.736e+01 -7.059 2.08e-12 ***
## Years_Since_Remod -2.433e+02 4.472e+01 -5.442 5.72e-08 ***
## Total_Baths 7.264e+03 1.168e+03 6.220 5.67e-10 ***
## Garage_Cars 2.841e+03 1.982e+03 1.434 0.1518
## Garage_Area 3.202e+01 6.810e+00 4.702 2.70e-06 ***
## Fireplaces 8.071e+03 1.174e+03 6.874 7.63e-12 ***
## Lot_Area 5.961e-01 8.903e-02 6.696 2.56e-11 ***
## Mas_Vnr_Area 3.817e+01 4.181e+00 9.128 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34630 on 2915 degrees of freedom
## Multiple R-squared: 0.813, Adjusted R-squared: 0.8121
## F-statistic: 905.3 on 14 and 2915 DF, p-value: < 2.2e-16
# Multicollinearity
sort(round(car::vif(m_full), 2), decreasing = TRUE)
## Gr_Liv_Area Second_Flr_SF First_Flr_SF Garage_Cars
## 122.87 90.44 77.75 5.56
## Garage_Area Total_Bsmt_SF House_Age Overall_Qual_S
## 5.25 3.31 3.12 2.89
## Total_Baths Years_Since_Remod Overall_Cond_S Fireplaces
## 2.17 2.13 1.41 1.41
## Mas_Vnr_Area Lot_Area
## 1.36 1.20
# residual diagnostics
op <- par(mfrow = c(2,2)); plot(m_full); par(op)
y <- dat$Sale_Price
eps <- ifelse(any(y <= 0), 1, 0)
bc <- MASS::boxcox(m_full, lambda = seq(-1, 1, by = 0.1), plotit = TRUE)
(lambda_hat <- bc$x[which.max(bc$y)])
## [1] 0.1515152
y_bc <- if (abs(lambda_hat) < 1e-8) log(y + eps) else ((y + eps)^lambda_hat - 1)/lambda_hat
m_bc <- lm(
y_bc ~ Gr_Liv_Area + Total_Bsmt_SF + First_Flr_SF + Second_Flr_SF +
Overall_Qual_S + Overall_Cond_S + House_Age + Years_Since_Remod +
Total_Baths + Garage_Cars + Garage_Area + Fireplaces + Lot_Area + Mas_Vnr_Area,
data = dat
)
summary(m_bc)
##
## Call:
## lm(formula = y_bc ~ Gr_Liv_Area + Total_Bsmt_SF + First_Flr_SF +
## Second_Flr_SF + Overall_Qual_S + Overall_Cond_S + House_Age +
## Years_Since_Remod + Total_Baths + Garage_Cars + Garage_Area +
## Fireplaces + Lot_Area + Mas_Vnr_Area, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.1262 -0.4469 0.0052 0.4908 3.2168
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.549e+01 1.681e-01 151.637 < 2e-16 ***
## Gr_Liv_Area 8.797e-04 3.744e-04 2.349 0.018874 *
## Total_Bsmt_SF 6.307e-04 7.044e-05 8.954 < 2e-16 ***
## First_Flr_SF 5.040e-04 3.842e-04 1.312 0.189683
## Second_Flr_SF 2.632e-04 3.791e-04 0.694 0.487479
## Overall_Qual_S 5.684e-01 2.058e-02 27.614 < 2e-16 ***
## Overall_Cond_S 3.081e-01 1.823e-02 16.904 < 2e-16 ***
## House_Age -1.622e-02 9.971e-04 -16.267 < 2e-16 ***
## Years_Since_Remod -6.867e-03 1.193e-03 -5.754 9.62e-09 ***
## Total_Baths 2.759e-01 3.117e-02 8.854 < 2e-16 ***
## Garage_Cars 2.375e-01 5.289e-02 4.490 7.40e-06 ***
## Garage_Area 6.377e-04 1.817e-04 3.509 0.000457 ***
## Fireplaces 3.425e-01 3.134e-02 10.929 < 2e-16 ***
## Lot_Area 1.835e-05 2.376e-06 7.722 1.56e-14 ***
## Mas_Vnr_Area 1.986e-04 1.116e-04 1.779 0.075309 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9242 on 2915 degrees of freedom
## Multiple R-squared: 0.8671, Adjusted R-squared: 0.8664
## F-statistic: 1358 on 14 and 2915 DF, p-value: < 2.2e-16
op <- par(mfrow = c(2,2)); plot(m_bc); par(op)
# Stepwise selection
m_step <- MASS::stepAIC(m_full, trace = FALSE)
summary(m_step)
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area + Total_Bsmt_SF + First_Flr_SF +
## Overall_Qual_S + Overall_Cond_S + House_Age + Years_Since_Remod +
## Total_Baths + Garage_Cars + Garage_Area + Fireplaces + Lot_Area +
## Mas_Vnr_Area, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517086 -17863 -2425 14259 297387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.772e+04 6.299e+03 -12.338 < 2e-16 ***
## Gr_Liv_Area 3.599e+01 2.063e+00 17.443 < 2e-16 ***
## Total_Bsmt_SF 1.938e+01 2.637e+00 7.350 2.56e-13 ***
## First_Flr_SF 1.114e+01 3.063e+00 3.639 0.000279 ***
## Overall_Qual_S 1.818e+04 7.711e+02 23.576 < 2e-16 ***
## Overall_Cond_S 4.626e+03 6.823e+02 6.781 1.44e-11 ***
## House_Age -2.670e+02 3.723e+01 -7.172 9.34e-13 ***
## Years_Since_Remod -2.423e+02 4.471e+01 -5.420 6.46e-08 ***
## Total_Baths 7.324e+03 1.166e+03 6.280 3.89e-10 ***
## Garage_Cars 2.887e+03 1.981e+03 1.457 0.145096
## Garage_Area 3.199e+01 6.810e+00 4.698 2.75e-06 ***
## Fireplaces 8.092e+03 1.174e+03 6.893 6.67e-12 ***
## Lot_Area 5.982e-01 8.900e-02 6.721 2.16e-11 ***
## Mas_Vnr_Area 3.841e+01 4.175e+00 9.200 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34630 on 2916 degrees of freedom
## Multiple R-squared: 0.8129, Adjusted R-squared: 0.8121
## F-statistic: 974.8 on 13 and 2916 DF, p-value: < 2.2e-16
round(car::vif(m_step), 2)
## Gr_Liv_Area Total_Bsmt_SF First_Flr_SF Overall_Qual_S
## 2.66 3.30 3.52 2.89
## Overall_Cond_S House_Age Years_Since_Remod Total_Baths
## 1.40 3.10 2.12 2.17
## Garage_Cars Garage_Area Fireplaces Lot_Area
## 5.55 5.25 1.41 1.20
## Mas_Vnr_Area
## 1.36
# GOF
gof_table <- function(m, mse_full, n){
e <- resid(m)
SSE <- sum(e^2)
R2 <- summary(m)$r.squared
R2adj <- summary(m)$adj.r.squared
p <- length(coef(m))
Cp <- SSE/mse_full - (n - 2*p)
X <- model.matrix(m)
H <- X %*% solve(t(X) %*% X) %*% t(X)
d <- e/(1 - diag(H))
PRESS <- sum(d^2)
tibble::tibble(Model = deparse(formula(m))[2],
SSE = SSE, R2 = R2, R2_Adj = R2adj, Cp = Cp, PRESS = PRESS, p = p)
}
n <- nobs(m_full)
mse_full <- mean(resid(m_full)^2)
gof <- dplyr::bind_rows(
Full_Model = gof_table(m_full, mse_full, n) %>% dplyr::mutate(Name = "Full"),
BoxCox_Model = gof_table(m_bc, mse_full, n) %>% dplyr::mutate(Name = "Box–Cox"),
Step_AIC = gof_table(m_step, mse_full, n) %>% dplyr::mutate(Name = "Stepwise (orig)")
) %>%
dplyr::select(Name, dplyr::everything(), -Model)
gof %>% dplyr::mutate(dplyr::across(where(is.numeric), ~round(., 3))) %>% dplyr::arrange(Cp)
m_final <- m_step
summary(m_final)
##
## Call:
## lm(formula = Sale_Price ~ Gr_Liv_Area + Total_Bsmt_SF + First_Flr_SF +
## Overall_Qual_S + Overall_Cond_S + House_Age + Years_Since_Remod +
## Total_Baths + Garage_Cars + Garage_Area + Fireplaces + Lot_Area +
## Mas_Vnr_Area, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -517086 -17863 -2425 14259 297387
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.772e+04 6.299e+03 -12.338 < 2e-16 ***
## Gr_Liv_Area 3.599e+01 2.063e+00 17.443 < 2e-16 ***
## Total_Bsmt_SF 1.938e+01 2.637e+00 7.350 2.56e-13 ***
## First_Flr_SF 1.114e+01 3.063e+00 3.639 0.000279 ***
## Overall_Qual_S 1.818e+04 7.711e+02 23.576 < 2e-16 ***
## Overall_Cond_S 4.626e+03 6.823e+02 6.781 1.44e-11 ***
## House_Age -2.670e+02 3.723e+01 -7.172 9.34e-13 ***
## Years_Since_Remod -2.423e+02 4.471e+01 -5.420 6.46e-08 ***
## Total_Baths 7.324e+03 1.166e+03 6.280 3.89e-10 ***
## Garage_Cars 2.887e+03 1.981e+03 1.457 0.145096
## Garage_Area 3.199e+01 6.810e+00 4.698 2.75e-06 ***
## Fireplaces 8.092e+03 1.174e+03 6.893 6.67e-12 ***
## Lot_Area 5.982e-01 8.900e-02 6.721 2.16e-11 ***
## Mas_Vnr_Area 3.841e+01 4.175e+00 9.200 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34630 on 2916 degrees of freedom
## Multiple R-squared: 0.8129, Adjusted R-squared: 0.8121
## F-statistic: 974.8 on 13 and 2916 DF, p-value: < 2.2e-16
op <- par(mfrow = c(2,2)); plot(m_final); par(op)
confint(m_final)
## 2.5 % 97.5 %
## (Intercept) -9.007652e+04 -6.537308e+04
## Gr_Liv_Area 3.194066e+01 4.003102e+01
## Total_Bsmt_SF 1.421075e+01 2.455175e+01
## First_Flr_SF 5.138295e+00 1.714841e+01
## Overall_Qual_S 1.666798e+04 1.969192e+04
## Overall_Cond_S 3.288468e+03 5.964038e+03
## House_Age -3.399672e+02 -1.939860e+02
## Years_Since_Remod -3.299438e+02 -1.546299e+02
## Total_Baths 5.037435e+03 9.610963e+03
## Garage_Cars -9.971247e+02 6.771933e+03
## Garage_Area 1.863786e+01 4.534281e+01
## Fireplaces 5.790373e+03 1.039430e+04
## Lot_Area 4.236867e-01 7.727211e-01
## Mas_Vnr_Area 3.022111e+01 4.659273e+01
broom::tidy(m_final) %>%
dplyr::mutate(dplyr::across(where(is.numeric), ~round(., 4))) %>%
dplyr::arrange(p.value) %>%
print(n = Inf)
## # A tibble: 14 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -77725. 6299. -12.3 0
## 2 Gr_Liv_Area 36.0 2.06 17.4 0
## 3 Total_Bsmt_SF 19.4 2.64 7.35 0
## 4 Overall_Qual_S 18180. 771. 23.6 0
## 5 Overall_Cond_S 4626. 682. 6.78 0
## 6 House_Age -267. 37.2 -7.17 0
## 7 Years_Since_Remod -242. 44.7 -5.42 0
## 8 Total_Baths 7324. 1166. 6.28 0
## 9 Garage_Area 32.0 6.81 4.70 0
## 10 Fireplaces 8092. 1174. 6.89 0
## 11 Lot_Area 0.598 0.089 6.72 0
## 12 Mas_Vnr_Area 38.4 4.17 9.20 0
## 13 First_Flr_SF 11.1 3.06 3.64 0.0003
## 14 Garage_Cars 2887. 1981. 1.46 0.145
set.seed(321)
B <- 1000
X <- model.matrix(m_final)
y_final <- model.response(model.frame(m_final))
p <- length(coef(m_final))
coef_names <- names(coef(m_final))
boot_beta_case <- matrix(NA_real_, nrow = B, ncol = p)
colnames(boot_beta_case) <- coef_names
for (b in 1:B) {
idx <- sample.int(nrow(X), replace = TRUE)
fit_b <- lm(y_final[idx] ~ X[idx, -1])
boot_beta_case[b, ] <- coef(fit_b)
}
# 95% bootstrap CIs
ci_case_mat <- apply(boot_beta_case, 2, quantile, probs = c(.025, .975), na.rm = TRUE)
ci_case_tbl <- tibble::tibble(
term = colnames(ci_case_mat),
`2.5%` = as.numeric(ci_case_mat[1, ]),
`97.5%`= as.numeric(ci_case_mat[2, ])
)
ci_case_tbl
set.seed(321)
fit_hat <- fitted(m_final)
resid_hat <- resid(m_final)
boot_beta_res <- matrix(NA_real_, nrow = B, ncol = p)
colnames(boot_beta_res) <- coef_names
for (b in 1:B) {
y_star <- fit_hat + sample(resid_hat, replace = TRUE)
fit_b <- lm(y_star ~ X[,-1])
boot_beta_res[b, ] <- coef(fit_b)
}
ci_resid_mat <- apply(boot_beta_res, 2, quantile, probs = c(.025, .975), na.rm = TRUE)
ci_resid_tbl <- tibble::tibble(
term = colnames(ci_resid_mat),
`2.5%` = as.numeric(ci_resid_mat[1, ]),
`97.5%`= as.numeric(ci_resid_mat[2, ])
)
ci_resid_tbl