Assignment 1: Multiple Linear Regression

1. Loading Packages and Data

For this assignment, I am using the famous California Housing dataset.

# Load the tidyverse package (which includes ggplot2 and dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.2.1     ✔ readr     2.2.0
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.3     ✔ tibble    3.3.1
## ✔ lubridate 1.9.5     ✔ tidyr     1.3.2
## ✔ purrr     1.2.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the California Housing CSV straight from its GitHub-hosted copy
house_data <- read.csv(
  file = "https://raw.githubusercontent.com/ageron/handson-ml/master/datasets/housing/housing.csv"
)
# Preview the top of the table (head() shows six rows by default)
head(house_data, n = 6L)
##   longitude latitude housing_median_age total_rooms total_bedrooms population
## 1   -122.23    37.88                 41         880            129        322
## 2   -122.22    37.86                 21        7099           1106       2401
## 3   -122.24    37.85                 52        1467            190        496
## 4   -122.25    37.85                 52        1274            235        558
## 5   -122.25    37.85                 52        1627            280        565
## 6   -122.25    37.85                 52         919            213        413
##   households median_income median_house_value ocean_proximity
## 1        126        8.3252             452600        NEAR BAY
## 2       1138        8.3014             358500        NEAR BAY
## 3        177        7.2574             352100        NEAR BAY
## 4        219        5.6431             341300        NEAR BAY
## 5        259        3.8462             342200        NEAR BAY
## 6        193        4.0368             269700        NEAR BAY
# Preview the bottom of the table (tail() shows six rows by default)
tail(house_data, n = 6L)
##       longitude latitude housing_median_age total_rooms total_bedrooms
## 20635   -121.56    39.27                 28        2332            395
## 20636   -121.09    39.48                 25        1665            374
## 20637   -121.21    39.49                 18         697            150
## 20638   -121.22    39.43                 17        2254            485
## 20639   -121.32    39.43                 18        1860            409
## 20640   -121.24    39.37                 16        2785            616
##       population households median_income median_house_value ocean_proximity
## 20635       1041        344        3.7125             116800          INLAND
## 20636        845        330        1.5603              78100          INLAND
## 20637        356        114        2.5568              77100          INLAND
## 20638       1007        433        1.7000              92300          INLAND
## 20639        741        349        1.8672              84700          INLAND
## 20640       1387        530        2.3886              89400          INLAND
# Per-column descriptive statistics; NA counts are reported where present
summary(object = house_data)
##    longitude         latitude     housing_median_age  total_rooms   
##  Min.   :-124.3   Min.   :32.54   Min.   : 1.00      Min.   :    2  
##  1st Qu.:-121.8   1st Qu.:33.93   1st Qu.:18.00      1st Qu.: 1448  
##  Median :-118.5   Median :34.26   Median :29.00      Median : 2127  
##  Mean   :-119.6   Mean   :35.63   Mean   :28.64      Mean   : 2636  
##  3rd Qu.:-118.0   3rd Qu.:37.71   3rd Qu.:37.00      3rd Qu.: 3148  
##  Max.   :-114.3   Max.   :41.95   Max.   :52.00      Max.   :39320  
##                                                                     
##  total_bedrooms     population      households     median_income    
##  Min.   :   1.0   Min.   :    3   Min.   :   1.0   Min.   : 0.4999  
##  1st Qu.: 296.0   1st Qu.:  787   1st Qu.: 280.0   1st Qu.: 2.5634  
##  Median : 435.0   Median : 1166   Median : 409.0   Median : 3.5348  
##  Mean   : 537.9   Mean   : 1425   Mean   : 499.5   Mean   : 3.8707  
##  3rd Qu.: 647.0   3rd Qu.: 1725   3rd Qu.: 605.0   3rd Qu.: 4.7432  
##  Max.   :6445.0   Max.   :35682   Max.   :6082.0   Max.   :15.0001  
##  NAs    :207                                                        
##  median_house_value  ocean_proximity 
##  Min.   : 14999     Length   :20640  
##  1st Qu.:119600     N.unique :    5  
##  Median :179700     N.blank  :    0  
##  Mean   :206856     Min.nchar:    6  
##  3rd Qu.:264725     Max.nchar:   10  
##  Max.   :500001                      
## 

2. Visualizing the relationship between Income and House Value

# Scatter of median_income against median_house_value, with a fitted
# straight line (method = "lm") drawn over the semi-transparent points
ggplot(
  data = house_data,
  mapping = aes(x = median_income, y = median_house_value)
) +
  geom_point(colour = "steelblue", alpha = 0.2) +
  geom_smooth(method = "lm", colour = "red") +
  labs(
    title = "Neighborhood Income vs. Median House Value",
    x = "Median Income",
    y = "House Value ($)"
  ) +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

3. Fitting the multiple linear regression model

# Regress median house value on income, housing age and total room count
model <- lm(
  formula = median_house_value ~ median_income + housing_median_age +
    total_rooms,
  data = house_data
)
# Print the coefficient table, residual quantiles and fit statistics
summary(object = model)
## 
## Call:
## lm(formula = median_house_value ~ median_income + housing_median_age + 
##     total_rooms, data = house_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -584934  -53506  -15123   36448  450986 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -2.433e+04  2.160e+03  -11.26   <2e-16 ***
## median_income       4.247e+04  3.012e+02  140.98   <2e-16 ***
## housing_median_age  1.975e+03  4.780e+01   41.32   <2e-16 ***
## total_rooms         3.888e+00  2.793e-01   13.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 80480 on 20636 degrees of freedom
## Multiple R-squared:  0.5137, Adjusted R-squared:  0.5136 
## F-statistic:  7266 on 3 and 20636 DF,  p-value: < 2.2e-16

4. Interpretation of Results

Based on the summary output of the linear regression model above:

  • Coefficients (The Impact):

    • Median Income: This is the strongest predictor. For every 1-unit increase in median income (which is scaled in tens of thousands of dollars), the median house value is expected to increase by about $42,470 (estimate 4.247e+04), assuming age and total rooms remain constant.

    • House Age: For every additional year of median housing age, the median house value is expected to increase by about $1,975 (estimate 1.975e+03), assuming the other variables are constant.
    • Total Rooms: Total rooms has a small positive coefficient (about $3.89 per additional room, estimate 3.888e+00), so its per-unit practical impact is very small compared to income.
  • Statistical Significance: The p-values for all three variables are < 2e-16 (well below the 0.05 threshold). This indicates that all three are highly statistically significant predictors of house value.

  • Model Fit (R-squared): The Multiple R-squared value is 0.5137. This indicates that these three variables alone explain roughly 51.4% of the variance in California median house values.

Assignment 2: Variable Selection Methods

Summary of Variable Selection Methods

Including too many variables in a model can cause overfitting (poor performance on new data) and multicollinearity (predictors that are highly correlated with one another, which makes individual coefficient estimates unstable). We use selection methods to find the optimal balance:

  • 1. Stepwise Regression: An automated process that adds variables one-by-one (Forward), removes them one-by-one (Backward), or does both (Bidirectional) based on a selection criterion such as statistical significance or AIC (R's step() function, used below, selects by AIC).
  • 2. Best Subset Selection: The computer tests every single possible combination of variables to find the absolute best mathematical fit.
  • 3. Regularization: Modern methods that fit all variables but mathematically penalize the size of their coefficients. Lasso can shrink the coefficients of unhelpful variables exactly to zero (removing them entirely), while Ridge shrinks coefficients close to zero without eliminating them, which reduces the effect of multicollinearity.
# 1. Fit the starting model with five predictors
full_model <- lm(
  formula = median_house_value ~ median_income + housing_median_age +
    total_rooms + population + households,
  data = house_data
)

# 2. Backward elimination starting from the full model; the positive
#    trace value makes step() print its search progress
optimal_model <- step(object = full_model, direction = "backward", trace = 2)
## Start:  AIC=464093.9
## median_house_value ~ median_income + housing_median_age + total_rooms + 
##     population + households
## 
##                      Df  Sum of Sq        RSS    AIC
## <none>                             1.2013e+14 464094
## - total_rooms         1 2.1847e+12 1.2231e+14 464464
## - population          1 7.0670e+12 1.2719e+14 465272
## - housing_median_age  1 9.5772e+12 1.2970e+14 465675
## - households          1 1.2668e+13 1.3280e+14 466161
## - median_income       1 1.2134e+14 2.4147e+14 478503
# 3. Summarise the model returned by step(). In this run no predictor was
#    dropped (removing any term raised the AIC), so the Call printed below
#    still lists all five predictors.
summary(object = optimal_model)
## 
## Call:
## lm(formula = median_house_value ~ median_income + housing_median_age + 
##     total_rooms + population + households, data = house_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -616450  -48510  -11922   34831  764038 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.793e+04  2.168e+03  -17.49   <2e-16 ***
## median_income       4.597e+04  3.184e+02  144.37   <2e-16 ***
## housing_median_age  1.842e+03  4.542e+01   40.56   <2e-16 ***
## total_rooms        -1.395e+01  7.199e-01  -19.37   <2e-16 ***
## population         -3.945e+01  1.132e+00  -34.84   <2e-16 ***
## households          2.144e+02  4.595e+00   46.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 76300 on 20634 degrees of freedom
## Multiple R-squared:  0.5629, Adjusted R-squared:  0.5628 
## F-statistic:  5315 on 5 and 20634 DF,  p-value: < 2.2e-16