pnt130_Assignment6

Exercise 6

In this exercise, you will further analyze the Wage data set considered throughout this chapter.

Exercise 6.(a)

Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.

library(ISLR2)
library(boot)
library(ggplot2)

# Remove rows with missing values
Wage <- na.omit(Wage)

set.seed(1)
CrossVali_errors <- rep(NA, 10)

for (d in 1:10) {
  fit <- glm(wage ~ poly(age, d), data = Wage)
  CrossVali_errors[d] <- cv.glm(Wage, fit, K = 10)$delta[1]
  cat(sprintf("Degree %2d CV error: %7.2f\n", d, CrossVali_errors[d]))
}

## Degree  1 CV error: 1676.83
## Degree  2 CV error: 1600.76
## Degree  3 CV error: 1598.40
## Degree  4 CV error: 1595.65
## Degree  5 CV error: 1594.98
## Degree  6 CV error: 1596.06
## Degree  7 CV error: 1594.30
## Degree  8 CV error: 1598.13
## Degree  9 CV error: 1593.91
## Degree 10 CV error: 1595.95

optimal_degree <- which.min(CrossVali_errors)
cat(sprintf("\n>> Optimal degree by 10-fold CV: %d\n\n", optimal_degree))

## 
## >> Optimal degree by 10-fold CV: 9

# Visualize the errors by degree
cv_df <- data.frame(
  Degree = 1:10,
  CV_Error = CrossVali_errors
)

ggplot(cv_df, aes(x = Degree, y = CV_Error)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "10-fold CV Error by Polynomial Degree",
    x = "Polynomial Degree",
    y = "Cross-Validation Error"
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# ANOVA: compare nested models for degree = 1~5
fit_list <- lapply(1:5, function(degree) lm(wage ~ poly(age, degree), data = Wage))

ANOVA_Res <- do.call(anova, fit_list)

print(ANOVA_Res)

## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, degree)
## Model 2: wage ~ poly(age, degree)
## Model 3: wage ~ poly(age, degree)
## Model 4: wage ~ poly(age, degree)
## Model 5: wage ~ poly(age, degree)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3) +
  stat_smooth(
    method  = "lm",
    formula = y ~ poly(x, optimal_degree),
    se      = FALSE,
    color   = "blue",
    size    = 1
  ) +
  labs(
    title = paste("Polynomial Regression of Wage on Age (Best Degree =", optimal_degree, ")"),
    x     = "Age",
    y     = "Wage (USD)"
  ) +
  theme_minimal()

Exercise 6.(b)

Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.

set.seed(1)

CrossVali_Errors <- numeric(9)            # K

for (K in 2:10) {
  
  brks <- seq(min(Wage$age), max(Wage$age), length.out = K + 1)
  
  
  Wage$ageGrp <- cut(Wage$age,
                     breaks         = brks,
                     include.lowest = TRUE,
                     right          = TRUE)
  
  Wage$ageGrp <- factor(Wage$ageGrp, levels = levels(Wage$ageGrp))
  
  
  fit_step <- glm(wage ~ ageGrp, data = Wage)
  CrossVali_Errors[K - 1] <- cv.glm(Wage, fit_step)$delta[1]
  
  cat(sprintf("Cuts = %2d  CV error = %7.2f\n", K, CrossVali_Errors[K - 1]))
}

## Cuts =  2  CV error = 1734.06
## Cuts =  3  CV error = 1682.76
## Cuts =  4  CV error = 1635.89
## Cuts =  5  CV error = 1631.45
## Cuts =  6  CV error = 1623.29
## Cuts =  7  CV error = 1611.60
## Cuts =  8  CV error = 1601.01
## Cuts =  9  CV error = 1610.21
## Cuts = 10  CV error = 1604.80

cuts_vec <- 2:10
bestK    <- cuts_vec[ which.min(CrossVali_Errors) ]
cat("\n>> Optimal cuts by CV:", bestK, "\n")

## 
## >> Optimal cuts by CV: 8

## Final Optimal MOdel with best K
best_brks <- seq(min(Wage$age), max(Wage$age), length.out = bestK + 1)
Wage$ageGrp <- cut(Wage$age,
                   breaks         = best_brks,
                   include.lowest = TRUE,
                   right          = TRUE)
Wage$ageGrp <- factor(Wage$ageGrp, levels = levels(Wage$ageGrp))

final_fit <- lm(wage ~ ageGrp, data = Wage)

#Visualize Result

Wage$wage_hat <- predict(final_fit)
ord <- order(Wage$age)             

ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3, color = "gray50") +
  geom_step(
    aes(x = age[ord], y = wage_hat[ord]),
    color = "blue", size = 1
  ) +
  labs(
    title = paste("Step-Function Fit with", bestK, "Cuts"),
    x = "Age", y = "Wage"
  ) +
  theme_minimal()

Exercise 10

This question relates to the College data set.

Exercise 10.(a)

Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.

library(ISLR2)
library(leaps)    # regsubsets()

head(College)

##                              Private Apps Accept Enroll Top10perc Top25perc
## Abilene Christian University     Yes 1660   1232    721        23        52
## Adelphi University               Yes 2186   1924    512        16        29
## Adrian College                   Yes 1428   1097    336        22        50
## Agnes Scott College              Yes  417    349    137        60        89
## Alaska Pacific University        Yes  193    146     55        16        44
## Albertson College                Yes  587    479    158        38        62
##                              F.Undergrad P.Undergrad Outstate Room.Board Books
## Abilene Christian University        2885         537     7440       3300   450
## Adelphi University                  2683        1227    12280       6450   750
## Adrian College                      1036          99    11250       3750   400
## Agnes Scott College                  510          63    12960       5450   450
## Alaska Pacific University            249         869     7560       4120   800
## Albertson College                    678          41    13500       3335   500
##                              Personal PhD Terminal S.F.Ratio perc.alumni Expend
## Abilene Christian University     2200  70       78      18.1          12   7041
## Adelphi University               1500  29       30      12.2          16  10527
## Adrian College                   1165  53       66      12.9          30   8735
## Agnes Scott College               875  92       97       7.7          37  19016
## Alaska Pacific University        1500  76       72      11.9           2  10922
## Albertson College                 675  67       73       9.4          11   9727
##                              Grad.Rate
## Abilene Christian University        60
## Adelphi University                  56
## Adrian College                      54
## Agnes Scott College                 59
## Alaska Pacific University           15
## Albertson College                   55

set.seed(1)
n <- nrow(College)
train_idx <- sample(seq_len(n), size = 0.7 * n)

College.train <- College[train_idx, ]   #Train 70%
College.test  <- College[-train_idx, ]  #Test  30%

# Forward Stepwise Selection
fwd.fit <- regsubsets(Outstate ~ ., data = College.train, nvmax = 17, method = "forward")

# Summary
fwd.sum <- summary(fwd.fit)

fwd.sum

## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College.train, nvmax = 17, 
##     method = "forward")
## 17 Variables  (and intercept)
##             Forced in Forced out
## PrivateYes      FALSE      FALSE
## Apps            FALSE      FALSE
## Accept          FALSE      FALSE
## Enroll          FALSE      FALSE
## Top10perc       FALSE      FALSE
## Top25perc       FALSE      FALSE
## F.Undergrad     FALSE      FALSE
## P.Undergrad     FALSE      FALSE
## Room.Board      FALSE      FALSE
## Books           FALSE      FALSE
## Personal        FALSE      FALSE
## PhD             FALSE      FALSE
## Terminal        FALSE      FALSE
## S.F.Ratio       FALSE      FALSE
## perc.alumni     FALSE      FALSE
## Expend          FALSE      FALSE
## Grad.Rate       FALSE      FALSE
## 1 subsets of each size up to 17
## Selection Algorithm: forward
##           PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1  ( 1 )  " "        " "  " "    " "    " "       " "       " "        
## 2  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 3  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 4  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 5  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 6  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 7  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 8  ( 1 )  "*"        " "  " "    " "    " "       " "       " "        
## 9  ( 1 )  "*"        " "  " "    " "    "*"       " "       " "        
## 10  ( 1 ) "*"        " "  " "    " "    "*"       " "       " "        
## 11  ( 1 ) "*"        " "  "*"    " "    "*"       " "       " "        
## 12  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       " "        
## 13  ( 1 ) "*"        "*"  "*"    " "    "*"       " "       "*"        
## 14  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 15  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 16  ( 1 ) "*"        "*"  "*"    " "    "*"       "*"       "*"        
## 17  ( 1 ) "*"        "*"  "*"    "*"    "*"       "*"       "*"        
##           P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 2  ( 1 )  " "         " "        " "   " "      " " " "      " "      
## 3  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 4  ( 1 )  " "         "*"        " "   " "      " " " "      " "      
## 5  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 6  ( 1 )  " "         "*"        " "   " "      "*" " "      " "      
## 7  ( 1 )  " "         "*"        " "   "*"      "*" " "      " "      
## 8  ( 1 )  " "         "*"        " "   "*"      "*" "*"      " "      
## 9  ( 1 )  " "         "*"        " "   "*"      "*" "*"      " "      
## 10  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 11  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 12  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 13  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 14  ( 1 ) " "         "*"        " "   "*"      "*" "*"      "*"      
## 15  ( 1 ) "*"         "*"        " "   "*"      "*" "*"      "*"      
## 16  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
## 17  ( 1 ) "*"         "*"        "*"   "*"      "*" "*"      "*"      
##           perc.alumni Expend Grad.Rate
## 1  ( 1 )  " "         "*"    " "      
## 2  ( 1 )  " "         "*"    " "      
## 3  ( 1 )  " "         "*"    " "      
## 4  ( 1 )  "*"         "*"    " "      
## 5  ( 1 )  "*"         "*"    " "      
## 6  ( 1 )  "*"         "*"    "*"      
## 7  ( 1 )  "*"         "*"    "*"      
## 8  ( 1 )  "*"         "*"    "*"      
## 9  ( 1 )  "*"         "*"    "*"      
## 10  ( 1 ) "*"         "*"    "*"      
## 11  ( 1 ) "*"         "*"    "*"      
## 12  ( 1 ) "*"         "*"    "*"      
## 13  ( 1 ) "*"         "*"    "*"      
## 14  ( 1 ) "*"         "*"    "*"      
## 15  ( 1 ) "*"         "*"    "*"      
## 16  ( 1 ) "*"         "*"    "*"      
## 17  ( 1 ) "*"         "*"    "*"

plot(fwd.sum$bic, type = "b", xlab = "Number of Predictors", ylab = "BIC")

# Best Size
best_size <- which.min(fwd.sum$bic)
cat("Lowest‐BIC model size:", best_size, "\n")

## Lowest‐BIC model size: 13

# Coeff of a model
best_coefs <- coef(fwd.fit, best_size)
print(best_coefs)

##   (Intercept)    PrivateYes          Apps        Accept     Top10perc 
## -1739.5725417  2276.7996721    -0.3358567     0.7814587    28.9687655 
##   F.Undergrad    Room.Board      Personal           PhD      Terminal 
##    -0.1559550     0.9134134    -0.3484815    11.8113175    24.9233138 
##     S.F.Ratio   perc.alumni        Expend     Grad.Rate 
##   -55.0649149    48.6046652     0.1744677    20.9498491

Exercise 10(b)

Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings. Answer: Apps and Expend have significant non-linear effects on out-of-state tuition, while Top10perc and PhD do not. Private is a strong predictor.

library(gam)

## Loading required package: splines

## Loading required package: foreach

## Loaded gam 1.22-5

best_size <- which.min(fwd.sum$bic)    # check the best size
selected_vars <- names(coef(fwd.fit, best_size))[-1]
print(selected_vars)

##  [1] "PrivateYes"  "Apps"        "Accept"      "Top10perc"   "F.Undergrad"
##  [6] "Room.Board"  "Personal"    "PhD"         "Terminal"    "S.F.Ratio"  
## [11] "perc.alumni" "Expend"      "Grad.Rate"

# GAM formula
gam_formula <- as.formula("Outstate ~ Private + s(Apps) + s(Top10perc) + s(PhD) + s(Expend)")

# fit GAM model
gam.fit <- gam(gam_formula, data = College.train)

# model Summary
summary(gam.fit)

## 
## Call: gam(formula = gam_formula, data = College.train)
## Deviance Residuals:
##     Min      1Q  Median      3Q     Max 
## -7445.4 -1101.0   237.1  1382.3  7794.1 
## 
## (Dispersion Parameter for gaussian family taken to be 4440828)
## 
##     Null Deviance: 9260683704 on 542 degrees of freedom
## Residual Deviance: 2331432484 on 524.9995 degrees of freedom
## AIC: 9872.011 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##               Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Private        1 2550500698 2550500698 574.330 < 2.2e-16 ***
## s(Apps)        1  771472280  771472280 173.723 < 2.2e-16 ***
## s(Top10perc)   1 1068974759 1068974759 240.715 < 2.2e-16 ***
## s(PhD)         1  182409160  182409160  41.075 3.263e-10 ***
## s(Expend)      1  814435659  814435659 183.397 < 2.2e-16 ***
## Residuals    525 2331432484    4440828                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##              Npar Df Npar F     Pr(F)    
## (Intercept)                              
## Private                                  
## s(Apps)            3  9.311 5.243e-06 ***
## s(Top10perc)       3  0.622    0.6013    
## s(PhD)             3  0.942    0.4199    
## s(Expend)          3 43.132 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Plot the result
par(mfrow = c(2, 2))
plot(gam.fit, se = TRUE, col = "blue")

Exercise 10.(c)

Evaluate the model obtained on the test set, and explain the results obtained. Answer: this valuse(= 3,918,303) shows that the GAM makes accurate predictions.

# Predict with Test set
gam.pred <- predict(gam.fit, newdata = College.test)

# Calc MSE
mse.test <- mean((College.test$Outstate - gam.pred)^2)
cat("Test MSE:", round(mse.test, 2), "\n")

## Test MSE: 3918303

Exercise 10.(d)

For which variables, if any, is there evidence of a non-linear relationship with the response?

Answer: non-linear relationship between the response (Outstate) and the variables Apps and Expend => their smooth terms had highly significant p-values (p < 0.001).