Section 0: Install & Load Package
# Install AmesHousing from CRAN only when it is not already available.
# requireNamespace() checks for the package without attaching it, so the
# library() call below is the single place where the package is attached,
# and it stops with an error if the installation did not succeed.
if (!requireNamespace("AmesHousing", quietly = TRUE)) {
  install.packages("AmesHousing", repos = "https://cloud.r-project.org")
}
library(AmesHousing)
Section 1: Import and Clean dataset
# Build the Ames housing data set provided by the AmesHousing package
ames <- AmesHousing::make_ames()
# Visual check: dimensions first, then the column names
dim(ames) # 2930 × 81
## [1] 2930 81
colnames(ames)
## [1] "MS_SubClass" "MS_Zoning" "Lot_Frontage"
## [4] "Lot_Area" "Street" "Alley"
## [7] "Lot_Shape" "Land_Contour" "Utilities"
## [10] "Lot_Config" "Land_Slope" "Neighborhood"
## [13] "Condition_1" "Condition_2" "Bldg_Type"
## [16] "House_Style" "Overall_Qual" "Overall_Cond"
## [19] "Year_Built" "Year_Remod_Add" "Roof_Style"
## [22] "Roof_Matl" "Exterior_1st" "Exterior_2nd"
## [25] "Mas_Vnr_Type" "Mas_Vnr_Area" "Exter_Qual"
## [28] "Exter_Cond" "Foundation" "Bsmt_Qual"
## [31] "Bsmt_Cond" "Bsmt_Exposure" "BsmtFin_Type_1"
## [34] "BsmtFin_SF_1" "BsmtFin_Type_2" "BsmtFin_SF_2"
## [37] "Bsmt_Unf_SF" "Total_Bsmt_SF" "Heating"
## [40] "Heating_QC" "Central_Air" "Electrical"
## [43] "First_Flr_SF" "Second_Flr_SF" "Low_Qual_Fin_SF"
## [46] "Gr_Liv_Area" "Bsmt_Full_Bath" "Bsmt_Half_Bath"
## [49] "Full_Bath" "Half_Bath" "Bedroom_AbvGr"
## [52] "Kitchen_AbvGr" "Kitchen_Qual" "TotRms_AbvGrd"
## [55] "Functional" "Fireplaces" "Fireplace_Qu"
## [58] "Garage_Type" "Garage_Finish" "Garage_Cars"
## [61] "Garage_Area" "Garage_Qual" "Garage_Cond"
## [64] "Paved_Drive" "Wood_Deck_SF" "Open_Porch_SF"
## [67] "Enclosed_Porch" "Three_season_porch" "Screen_Porch"
## [70] "Pool_Area" "Pool_QC" "Fence"
## [73] "Misc_Feature" "Misc_Val" "Mo_Sold"
## [76] "Year_Sold" "Sale_Type" "Sale_Condition"
## [79] "Sale_Price" "Longitude" "Latitude"
# Save a copy of the data before any derived columns are added
utils::write.csv(x = ames, file = "AmesHousing_original.csv", row.names = FALSE)
# Response variable for the model: natural log of the sale price
ames[["log_Price"]] <- log(ames[["Sale_Price"]])
# Overall_Qual: map the quality labels to a numeric score from 1 to 10
qual_map <- c(
  "Very_Poor" = 1,
  "Poor" = 2,
  "Fair" = 3,
  "Below_Average" = 4,
  "Average" = 5,
  "Above_Average" = 6,
  "Good" = 7,
  "Very_Good" = 8,
  "Excellent" = 9,
  "Very_Excellent" = 10
)
qual_labels <- as.character(ames$Overall_Qual)
# A label that is missing from qual_map (or an NA label) would be looked up
# as NA. lm() would then drop that row, and the residual vectors would no
# longer line up with the rows of ames in the later residual plots.
stopifnot(all(qual_labels %in% names(qual_map)))
# Indexing a named vector by name returns a named vector; unname() keeps
# those label names out of the new numeric column.
ames$Overall_Qual_num <- unname(qual_map[qual_labels])
# Garage_Type: store as a factor so lm() expands it into indicator terms
ames[["Garage_Type"]] <- factor(ames[["Garage_Type"]])
levels(ames[["Garage_Type"]]) # the first level listed is the reference level
## [1] "Attchd" "Basment" "BuiltIn"
## [4] "CarPort" "Detchd" "More_Than_Two_Types"
## [7] "No_Garage"
# Save the data set including the derived columns
utils::write.csv(x = ames, file = "AmesHousing_cleaned.csv", row.names = FALSE)
Section 2: Fit the regression model
# Multiple linear regression of the log sale price on living area, numeric
# quality score, garage area and type, and wood deck / open porch area.
# The formula is written inline so the Call printed by summary() shows it.
model <- lm(
  formula = log_Price ~ Gr_Liv_Area + Overall_Qual_num +
    Garage_Area + Garage_Type +
    Wood_Deck_SF + Open_Porch_SF,
  data = ames
)
# Coefficient table and overall fit statistics
summary(model)
##
## Call:
## lm(formula = log_Price ~ Gr_Liv_Area + Overall_Qual_num + Garage_Area +
## Garage_Type + Wood_Deck_SF + Open_Porch_SF, data = ames)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90416 -0.08631 0.00962 0.10210 0.84864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.071e+01 1.853e-02 577.725 < 2e-16 ***
## Gr_Liv_Area 2.211e-04 8.767e-06 25.215 < 2e-16 ***
## Overall_Qual_num 1.392e-01 3.340e-03 41.670 < 2e-16 ***
## Garage_Area 3.336e-04 2.273e-05 14.679 < 2e-16 ***
## Garage_TypeBasment -8.400e-02 3.041e-02 -2.762 0.00578 **
## Garage_TypeBuiltIn -2.310e-02 1.432e-02 -1.613 0.10683
## Garage_TypeCarPort -2.002e-01 4.695e-02 -4.263 2.08e-05 ***
## Garage_TypeDetchd -1.298e-01 8.514e-03 -15.242 < 2e-16 ***
## Garage_TypeMore_Than_Two_Types -1.256e-01 3.870e-02 -3.245 0.00119 **
## Garage_TypeNo_Garage -1.209e-01 1.824e-02 -6.629 4.02e-11 ***
## Wood_Deck_SF 2.038e-04 2.788e-05 7.311 3.40e-13 ***
## Open_Porch_SF 8.731e-05 5.318e-05 1.642 0.10076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1797 on 2918 degrees of freedom
## Multiple R-squared: 0.8065, Adjusted R-squared: 0.8057
## F-statistic: 1105 on 11 and 2918 DF, p-value: < 2.2e-16
Section 3: Graph A — Response variable distribution
# Graph A: histograms of the response on the raw scale and on the log scale,
# drawn side by side.
# par() returns the settings it replaces; keeping them lets the layout be
# restored afterwards instead of assuming the device started as 1 x 1.
old_par <- par(mfrow = c(1, 2))
hist(ames$Sale_Price,
  breaks = 40, col = "steelblue",
  main = "(a) Sale Price",
  xlab = "Sale Price (USD)"
)
hist(ames$log_Price,
  breaks = 40, col = "tomato",
  main = "(b) log(Sale Price)",
  xlab = "log(Sale Price)"
)

par(old_par)
Section 4: Graph B — 6 Predictors distribution
# Graph B: distribution of each of the six predictors on a 2 x 3 grid.
# Histograms are used for the area variables, bar plots of counts for the
# quality score and the garage type.
# par() returns the settings it replaces; keeping them lets the layout be
# restored afterwards instead of assuming the device started as 1 x 1.
old_par <- par(mfrow = c(2, 3))
numeric_fill <- "lightblue"
hist(ames$Gr_Liv_Area,
  col = numeric_fill,
  main = "Gr_Liv_Area",
  xlab = "Above Ground Living Area (sqft)"
)
barplot(table(ames$Overall_Qual_num),
  col = numeric_fill,
  main = "Overall_Qual",
  xlab = "Quality Score (1–10)"
)
hist(ames$Garage_Area,
  col = numeric_fill,
  main = "Garage_Area",
  xlab = "Garage Area (sqft)"
)
hist(ames$Wood_Deck_SF,
  col = numeric_fill,
  main = "Wood_Deck_SF",
  xlab = "Wood Deck Area (sqft)"
)
hist(ames$Open_Porch_SF,
  col = numeric_fill,
  main = "Open_Porch_SF",
  xlab = "Open Porch Area (sqft)"
)
# las = 2 turns the axis labels perpendicular to the axis and cex.names
# shrinks them, which leaves room for the long garage type names.
barplot(table(ames$Garage_Type),
  col = "lightcoral",
  main = "Garage_Type",
  las = 2, cex.names = 0.7
)

par(old_par)
Section 5: Extract residuals
# Quantities reused by the diagnostic plots below
fitted_vals <- fitted.values(model) # fitted values on the log-price scale
resid_vals <- residuals(model) # raw residuals
std_resid <- rstandard(model) # standardized residuals
Section 6: Graph C — Residual Analysis (MLR Assumptions)
# Graph C, panels (a)-(c): residual diagnostics drawn in one row.
# par() returns the settings it replaces; keeping them lets the layout be
# restored afterwards instead of assuming the device started as 1 x 1.
old_par <- par(mfrow = c(1, 3))
# Semi-transparent blue so that overlapping points remain visible
point_col <- rgb(0, 0, 1, 0.3)
# (a) Residuals vs Fitted, with a dashed reference line at zero
plot(fitted_vals, resid_vals,
  pch = 16, col = point_col,
  main = "(a) Residuals vs Fitted",
  xlab = "Fitted Values",
  ylab = "Residuals"
)
abline(h = 0, col = "red", lwd = 2, lty = 2)
# (b) Normal Q-Q plot of the standardized residuals
qqnorm(std_resid,
  pch = 16, col = point_col,
  main = "(b) Normal Q-Q"
)
qqline(std_resid, col = "red", lwd = 2)
# (c) Scale-Location: square root of the absolute standardized residuals
# against the fitted values
plot(fitted_vals, sqrt(abs(std_resid)),
  pch = 16, col = point_col,
  main = "(c) Scale-Location",
  xlab = "Fitted Values",
  ylab = expression(sqrt("|Standardized Residuals|"))
)

par(old_par)
Section 7: Graph C (continued) — Leverage and Residual Distribution
# Graph C, panels (d)-(e): leverage plot and distribution of the residuals.
# par() returns the settings it replaces; keeping them lets the layout be
# restored afterwards instead of assuming the device started as 1 x 1.
old_par <- par(mfrow = c(1, 2))
# (d) Residuals vs Leverage, the fifth of the built-in lm diagnostic plots
# NOTE(review): plot.lm documents `main` as a title drawn in addition to its
# own panel caption; confirm the two do not overlap in the rendered figure.
plot(model,
  which = 5,
  main = "(d) Residuals vs Leverage"
)
# (e) Residual histogram on the density scale (freq = FALSE) so that the
# normal curve added below is on the same scale
hist(resid_vals,
  breaks = 40, col = "lightblue",
  main = "(e) Distribution of Residuals",
  xlab = "Residuals",
  freq = FALSE
)
# Normal density with mean 0 and the sample standard deviation of the
# residuals, overlaid for comparison
curve(dnorm(x, mean = 0, sd = sd(resid_vals)),
  add = TRUE, col = "red", lwd = 2
)

par(old_par)
Section 8: Graph D — Residuals vs. Predictors
# Graph D: residuals against each numeric area predictor on a 2 x 2 grid.
# The residuals are paired with columns of ames by position, which is only
# valid when the model used every row of ames.
stopifnot(length(resid_vals) == nrow(ames))
# Column name -> x-axis label for each panel, in plotting order
resid_panels <- c(
  Gr_Liv_Area = "Above Ground Living Area (sqft)",
  Garage_Area = "Garage Area (sqft)",
  Wood_Deck_SF = "Wood Deck Area (sqft)",
  Open_Porch_SF = "Open Porch Area (sqft)"
)
# par() returns the settings it replaces; keeping them lets the layout be
# restored afterwards instead of assuming the device started as 1 x 1.
old_par <- par(mfrow = c(2, 2))
for (predictor in names(resid_panels)) {
  plot(ames[[predictor]], resid_vals,
    pch = 16, col = rgb(0, 0, 1, 0.3),
    main = paste("Residuals vs", predictor),
    xlab = resid_panels[[predictor]],
    ylab = "Residuals"
  )
  # Dashed reference line at zero residual
  abline(h = 0, col = "red", lwd = 2, lty = 2)
}

par(old_par)