Section 0: Install & Load Package

# Install AmesHousing from CRAN only when it is not already available.
# requireNamespace() checks for the package without attaching it, so the
# package is attached exactly once, by library() below, which raises an
# error if the package still cannot be found after the install attempt.
if (!requireNamespace("AmesHousing", quietly = TRUE)) {
  install.packages("AmesHousing", repos = "https://cloud.r-project.org")
}
library(AmesHousing)

Section 1: Import and Clean dataset

# Build the Ames housing data frame via the AmesHousing package
ames <- AmesHousing::make_ames()

# Quick structural check: dimensions (2930 rows, 81 columns), then column names
dim(ames)
## [1] 2930   81
colnames(ames)
##  [1] "MS_SubClass"        "MS_Zoning"          "Lot_Frontage"      
##  [4] "Lot_Area"           "Street"             "Alley"             
##  [7] "Lot_Shape"          "Land_Contour"       "Utilities"         
## [10] "Lot_Config"         "Land_Slope"         "Neighborhood"      
## [13] "Condition_1"        "Condition_2"        "Bldg_Type"         
## [16] "House_Style"        "Overall_Qual"       "Overall_Cond"      
## [19] "Year_Built"         "Year_Remod_Add"     "Roof_Style"        
## [22] "Roof_Matl"          "Exterior_1st"       "Exterior_2nd"      
## [25] "Mas_Vnr_Type"       "Mas_Vnr_Area"       "Exter_Qual"        
## [28] "Exter_Cond"         "Foundation"         "Bsmt_Qual"         
## [31] "Bsmt_Cond"          "Bsmt_Exposure"      "BsmtFin_Type_1"    
## [34] "BsmtFin_SF_1"       "BsmtFin_Type_2"     "BsmtFin_SF_2"      
## [37] "Bsmt_Unf_SF"        "Total_Bsmt_SF"      "Heating"           
## [40] "Heating_QC"         "Central_Air"        "Electrical"        
## [43] "First_Flr_SF"       "Second_Flr_SF"      "Low_Qual_Fin_SF"   
## [46] "Gr_Liv_Area"        "Bsmt_Full_Bath"     "Bsmt_Half_Bath"    
## [49] "Full_Bath"          "Half_Bath"          "Bedroom_AbvGr"     
## [52] "Kitchen_AbvGr"      "Kitchen_Qual"       "TotRms_AbvGrd"     
## [55] "Functional"         "Fireplaces"         "Fireplace_Qu"      
## [58] "Garage_Type"        "Garage_Finish"      "Garage_Cars"       
## [61] "Garage_Area"        "Garage_Qual"        "Garage_Cond"       
## [64] "Paved_Drive"        "Wood_Deck_SF"       "Open_Porch_SF"     
## [67] "Enclosed_Porch"     "Three_season_porch" "Screen_Porch"      
## [70] "Pool_Area"          "Pool_QC"            "Fence"             
## [73] "Misc_Feature"       "Misc_Val"           "Mo_Sold"           
## [76] "Year_Sold"          "Sale_Type"          "Sale_Condition"    
## [79] "Sale_Price"         "Longitude"          "Latitude"
# Save the data to CSV before any derived columns are added
write.csv(x = ames, file = "AmesHousing_original.csv", row.names = FALSE)
# Response variable for the regression: natural log of the sale price
ames[["log_Price"]] <- log(ames[["Sale_Price"]])

# Overall_Qual: map each quality label to a numeric score from 1 to 10
qual_map <- c(
  "Very_Poor"      = 1,
  "Poor"           = 2,
  "Fair"           = 3,
  "Below_Average"  = 4,
  "Average"        = 5,
  "Above_Average"  = 6,
  "Good"           = 7,
  "Very_Good"      = 8,
  "Excellent"      = 9,
  "Very_Excellent" = 10
)
qual_labels <- as.character(ames$Overall_Qual)

# Fail fast on labels that are missing from qual_map: name-based indexing
# would otherwise return NA for them without any warning.
unmapped <- setdiff(qual_labels[!is.na(qual_labels)], names(qual_map))
if (length(unmapped) > 0) {
  stop(
    "Unmapped Overall_Qual level(s): ", paste(unmapped, collapse = ", "),
    call. = FALSE
  )
}

# unname() so the stored column is a plain numeric vector rather than one
# that carries the quality labels as element names
ames$Overall_Qual_num <- unname(qual_map[qual_labels])

# Garage_Type: store as a factor so lm() treats it as categorical
ames[["Garage_Type"]] <- factor(ames[["Garage_Type"]])
# List the levels; the first one listed is the reference level in the model
levels(ames[["Garage_Type"]])
## [1] "Attchd"              "Basment"             "BuiltIn"            
## [4] "CarPort"             "Detchd"              "More_Than_Two_Types"
## [7] "No_Garage"
# Save the data frame in its current state as the cleaned CSV
write.csv(x = ames, file = "AmesHousing_cleaned.csv", row.names = FALSE)

Section 2: Fit data

# Linear regression of log sale price on living area, numeric quality score,
# garage area and type, and wood deck / open porch areas.
# The predictor order is kept fixed so the coefficient table order is stable.
model <- lm(
  formula = log_Price ~
    Gr_Liv_Area +
    Overall_Qual_num +
    Garage_Area +
    Garage_Type +
    Wood_Deck_SF +
    Open_Porch_SF,
  data = ames
)

# Print the coefficient table and overall fit statistics
summary(model)
## 
## Call:
## lm(formula = log_Price ~ Gr_Liv_Area + Overall_Qual_num + Garage_Area + 
##     Garage_Type + Wood_Deck_SF + Open_Porch_SF, data = ames)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.90416 -0.08631  0.00962  0.10210  0.84864 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     1.071e+01  1.853e-02 577.725  < 2e-16 ***
## Gr_Liv_Area                     2.211e-04  8.767e-06  25.215  < 2e-16 ***
## Overall_Qual_num                1.392e-01  3.340e-03  41.670  < 2e-16 ***
## Garage_Area                     3.336e-04  2.273e-05  14.679  < 2e-16 ***
## Garage_TypeBasment             -8.400e-02  3.041e-02  -2.762  0.00578 ** 
## Garage_TypeBuiltIn             -2.310e-02  1.432e-02  -1.613  0.10683    
## Garage_TypeCarPort             -2.002e-01  4.695e-02  -4.263 2.08e-05 ***
## Garage_TypeDetchd              -1.298e-01  8.514e-03 -15.242  < 2e-16 ***
## Garage_TypeMore_Than_Two_Types -1.256e-01  3.870e-02  -3.245  0.00119 ** 
## Garage_TypeNo_Garage           -1.209e-01  1.824e-02  -6.629 4.02e-11 ***
## Wood_Deck_SF                    2.038e-04  2.788e-05   7.311 3.40e-13 ***
## Open_Porch_SF                   8.731e-05  5.318e-05   1.642  0.10076    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1797 on 2918 degrees of freedom
## Multiple R-squared:  0.8065, Adjusted R-squared:  0.8057 
## F-statistic:  1105 on 11 and 2918 DF,  p-value: < 2.2e-16

Section 3: Graph A — Response variable distribution

# Two panels side by side: sale price before and after the log transform
par(mfrow = c(1, 2))

# (a) response on its original scale
hist(
  x = ames[["Sale_Price"]],
  main = "(a) Sale Price",
  xlab = "Sale Price (USD)",
  col = "steelblue",
  breaks = 40
)

# (b) response on the log scale used by the model
hist(
  x = ames[["log_Price"]],
  main = "(b) log(Sale Price)",
  xlab = "log(Sale Price)",
  col = "tomato",
  breaks = 40
)

# Return to a single-panel layout
par(mfrow = c(1, 1))

Section 4: Graph B — 6 Predictors distribution

# Helper: histogram of one ames column, titled with the column name itself
draw_hist <- function(column, axis_label) {
  hist(ames[[column]],
       col = "lightblue",
       main = column,
       xlab = axis_label)
}

# 2 x 3 grid; panels are filled row by row in the order the plots are drawn
par(mfrow = c(2, 3))

draw_hist("Gr_Liv_Area", "Above Ground Living Area (sqft)")

# Quality score: bar chart of the count of houses at each score
quality_counts <- table(ames$Overall_Qual_num)
barplot(quality_counts,
        col = "lightblue",
        main = "Overall_Qual",
        xlab = "Quality Score (1–10)")

draw_hist("Garage_Area", "Garage Area (sqft)")
draw_hist("Wood_Deck_SF", "Wood Deck Area (sqft)")
draw_hist("Open_Porch_SF", "Open Porch Area (sqft)")

# Garage type: bar chart of counts per level; las = 2 turns the axis labels
# perpendicular to the axis and cex.names shrinks them
garage_counts <- table(ames$Garage_Type)
barplot(garage_counts,
        col = "lightcoral",
        main = "Garage_Type",
        las = 2, cex.names = 0.7)

# Return to a single-panel layout
par(mfrow = c(1, 1))

Section 5: Extract residuals

# Pull the per-observation quantities used by the diagnostic plots out of
# the fitted model: fitted values, raw residuals, standardized residuals
fitted_vals <- fitted.values(model)
resid_vals  <- residuals(model)
std_resid   <- rstandard(model)

Section 6: Graph C: Residual Analysis (MLR Assumptions)

# Three diagnostic panels in one row
par(mfrow = c(1, 3))

# Semi-transparent blue so overlapping points remain visible
point_col <- rgb(red = 0, green = 0, blue = 1, alpha = 0.3)

# (a) Residuals vs Fitted, with a dashed reference line at zero
plot(
  x = fitted_vals,
  y = resid_vals,
  main = "(a) Residuals vs Fitted",
  xlab = "Fitted Values",
  ylab = "Residuals",
  pch = 16,
  col = point_col
)
abline(h = 0, lty = 2, lwd = 2, col = "red")

# (b) Normal Q-Q plot of the standardized residuals with its reference line
qqnorm(
  y = std_resid,
  main = "(b) Normal Q-Q",
  pch = 16,
  col = point_col
)
qqline(y = std_resid, lwd = 2, col = "red")

# (c) Scale-Location: square root of |standardized residual| vs fitted value
root_abs_std_resid <- sqrt(abs(std_resid))
plot(
  x = fitted_vals,
  y = root_abs_std_resid,
  main = "(c) Scale-Location",
  xlab = "Fitted Values",
  ylab = expression(sqrt("|Standardized Residuals|")),
  pch = 16,
  col = point_col
)

# Return to a single-panel layout
par(mfrow = c(1, 1))

Section 7: Graph C (continued) — Leverage and Residual Distribution

# Two panels side by side
par(mfrow = c(1, 2))

# (d) Residuals vs Leverage: panel 5 of the built-in plot method for the model
plot(model,
     which = 5,
     main = "(d) Residuals vs Leverage")

# (e) Histogram of the raw residuals on the density scale (freq = FALSE),
#     so the normal density curve below can be overlaid on the same axis
hist(
  x = resid_vals,
  main = "(e) Distribution of Residuals",
  xlab = "Residuals",
  col = "lightblue",
  breaks = 40,
  freq = FALSE
)

# Normal density with mean 0 and the sample standard deviation of the residuals
resid_sd <- sd(resid_vals)
curve(
  dnorm(x, mean = 0, sd = resid_sd),
  add = TRUE,
  lwd = 2,
  col = "red"
)

# Return to a single-panel layout
par(mfrow = c(1, 1))

Section 8: Graph D — Residuals vs. Predictors

# 2 x 2 grid: residuals plotted against each area predictor
par(mfrow = c(2, 2))

# Column name -> x-axis label; the order here is the panel order
area_labels <- c(
  Gr_Liv_Area   = "Above Ground Living Area (sqft)",
  Garage_Area   = "Garage Area (sqft)",
  Wood_Deck_SF  = "Wood Deck Area (sqft)",
  Open_Porch_SF = "Open Porch Area (sqft)"
)

for (predictor in names(area_labels)) {
  plot(
    x = ames[[predictor]],
    y = resid_vals,
    main = paste("Residuals vs", predictor),
    xlab = area_labels[[predictor]],
    ylab = "Residuals",
    pch = 16,
    col = rgb(0, 0, 1, 0.3)
  )
  # Dashed reference line at zero residual
  abline(h = 0, lty = 2, lwd = 2, col = "red")
}

# Return to a single-panel layout
par(mfrow = c(1, 1))