1. In this exercise, you will further analyze the Wage data set considered throughout this chapter.
  1. Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
# Libraries: ISLR2 for the Wage, Auto, Boston, and College data sets; boot for cv.glm()
library(ISLR2)
library(boot)

# 10-fold cross-validation over polynomial degrees 1-10
set.seed(17)
cv.error.10 <- rep(0, 10)
for (i in 1:10) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error.10[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

# Extracting Optimal Degree
optimal_degree <- which.min(cv.error.10)
optimal_degree
## [1] 2
# F-test p-value for the polynomial term at each degree (via anova() on each fit)
anova_results <- sapply(1:10, function(i) {
  lmfit <- lm(wage ~ poly(age, i), data = Wage)
  anova(lmfit)$"Pr(>F)"[1]
})
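For comparison with the chapter's approach, nested polynomial fits can also be compared directly with anova(); a minimal sketch (fit.d1 through fit.d5 are just illustrative names):

# Sequential F-tests: does each degree improve significantly on the one below it?
fit.d1 <- lm(wage ~ age, data = Wage)
fit.d2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.d3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.d4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.d5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit.d1, fit.d2, fit.d3, fit.d4, fit.d5)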

# Refit the selected degree-2 polynomial for plotting
fit2 <- lm(wage ~ poly(age, 2), data = Wage)

# Plotting CV error (MSE) by degree
plot(1:10, cv.error.10, xlab = "Degree", ylab = "MSE", type = "l")
points(optimal_degree, cv.error.10[optimal_degree], col = 'red', cex = 2, pch = 19)

# Plotting the degree-2 polynomial fit
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit2, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit,
                  preds$fit - 2 * preds$se.fit)

par(mfrow = c(1, 1), mar = c(4.5, 4.5, 1, 1), oma = c(0, 0, 4, 0))
plot(Wage$age, Wage$wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-2 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)

  1. Fit a step function to predict wage using age, and perform cross-validation to choose the optimal number of cuts. Make a plot of the fit obtained.
# 10-fold CV for step functions with 2-10 cuts
set.seed(123)
cv.error <- rep(NA, 10)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.error[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}

# Extract the optimal number of cuts
optimal_cuts <- which.min(cv.error)
optimal_cuts
## [1] 8
# Plotting CV error (MSE) by number of cuts
plot(2:10, cv.error[-1], xlab = "Cuts", ylab = "MSE", type = "l")
points(optimal_cuts, cv.error[optimal_cuts], col = "red", cex = 2, pch = 19)

# Plotting the 8-cut step-function fit
plot(wage ~ age, data = Wage, col = "darkgrey")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
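To see which age ranges the chosen step function uses, the cut levels can be tabulated (a small sketch):

# Intervals and observation counts for the 8-cut step function
table(cut(Wage$age, 8))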

  1. The Wage data set contains a number of other features not explored in this chapter, such as marital status (maritl), job class (jobclass), and others. Explore the relationships between some of these other predictors and wage, and use non-linear fitting techniques in order to fit flexible models to the data. Create plots of the results obtained, and write a summary of your findings.

Since the remaining parts of this assignment cover the other non-linear approaches, here I fit linear models that add relevant categorical predictors and then compare the candidate models with ANOVA.

boxplot(wage ~ education, data = Wage, col = c("lavender"), xlab = "education", ylab = "wage")

boxplot(wage ~ maritl, data = Wage, col = c("blue"), xlab = "mar. stat.", ylab = "wage")

boxplot(wage ~ race, data = Wage, col = c("pink"), xlab = "race", ylab = "wage")

boxplot(wage ~ health, data = Wage, col = c("yellow"), xlab = "health", ylab = "wage")

boxplot(wage ~ jobclass, data = Wage, col = c("red"), xlab = "jobclass", ylab = "wage")

boxplot(wage ~ year, data = Wage, col = c("purple"), xlab = "year", ylab = "wage")

There are noticeable differences in wage across factors such as race, marital status, education, and health. Let's try a few of them.

fit.2 <- lm(wage ~ education + poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ education + health + poly(age, 2), data = Wage)
fit.4 <- lm(wage ~ education + health + race + poly(age, 2), data = Wage)
anova(fit.2, fit.3, fit.4)
## Analysis of Variance Table
## 
## Model 1: wage ~ education + poly(age, 2)
## Model 2: wage ~ education + health + poly(age, 2)
## Model 3: wage ~ education + health + race + poly(age, 2)
##   Res.Df     RSS Df Sum of Sq       F    Pr(>F)    
## 1   2993 3725395                                   
## 2   2992 3685535  1     39860 32.4204 1.362e-08 ***
## 3   2989 3674876  3     10659  2.8899   0.03419 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Adding health to the education + poly(age, 2) model gives a highly significant improvement (p ≈ 1.4e-08), and adding race gives a further, more modest improvement (p ≈ 0.034), so both appear worth keeping alongside education and the degree-2 polynomial in age.
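As an additional non-linear option (a sketch, not part of the analysis above), the same predictors can be combined in a GAM, with the categorical variables entering linearly and age as a smoothing spline:

library(gam)  # assuming the gam package, which is also used in the next exercise

gam.wage <- gam(wage ~ education + health + race + s(age, 4), data = Wage)
par(mfrow = c(1, 4))
plot(gam.wage, se = TRUE, col = "blue")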
  1. Fit some of the non-linear models investigated in this chapter to the Auto data set. Is there evidence for non-linear relationships in this data set? Create some informative plots to justify your answer.
library(gam)  # for gam() and s() smoothing-spline terms

# GAM for mpg with smooth terms for displacement, weight, and acceleration
gam_auto <- gam(mpg ~ s(displacement, 3) + s(weight, 3) + s(acceleration, 2), data = Auto)

par(mfrow = c(1, 3))
plot(gam_auto, se = TRUE , col = "blue")

summary(gam_auto)
## 
## Call: gam(formula = mpg ~ s(displacement, 3) + s(weight, 3) + s(acceleration, 
##     2), data = Auto)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.4669  -2.4244  -0.2371   1.8486  17.4322 
## 
## (Dispersion Parameter for gaussian family taken to be 16.4602)
## 
##     Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 6304.25 on 382.9998 degrees of freedom
## AIC: 2221.313 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                     Df  Sum Sq Mean Sq  F value    Pr(>F)    
## s(displacement, 3)   1 15870.6 15870.6 964.1793 < 2.2e-16 ***
## s(weight, 3)         1   905.6   905.6  55.0156 7.769e-13 ***
## s(acceleration, 2)   1   127.8   127.8   7.7629  0.005598 ** 
## Residuals          383  6304.2    16.5                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                    Npar Df  Npar F     Pr(F)    
## (Intercept)                                     
## s(displacement, 3)       2  5.1132  0.006434 ** 
## s(weight, 3)             2 10.1768 4.938e-05 ***
## s(acceleration, 2)       1  2.7257  0.099562 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on the ANOVA for nonparametric effects, both displacement and weight show significant non-linear relationships with mpg, while the evidence for a non-linear effect of acceleration is weak (p ≈ 0.10).

plot(Auto$displacement, Auto$mpg, xlab = "displacement", ylab = "mpg")
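One way to quantify the evidence for non-linearity (a sketch, assuming the same gam package) is to compare the smooth fit above to an all-linear GAM with an F-test:

# All-linear fit versus the smooth fit gam_auto from above
gam_lin <- gam(mpg ~ displacement + weight + acceleration, data = Auto)
anova(gam_lin, gam_auto, test = "F")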

9. This question uses the variables dis (the weighted mean of distances to five Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

  1. Use the poly() function to fit a cubic polynomial regression to predict nox using dis. Report the regression output, and plot the resulting data and polynomial fits.
# Cubic polynomial regression of nox on dis
boston_fit <- lm(nox ~ poly(dis, 3), data = Boston)
summary(boston_fit)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
x_seq <- seq(min(Boston$dis), max(Boston$dis), length.out = 506)
predictions <- predict(boston_fit, newdata = list(dis = x_seq))
plot(Boston$dis, Boston$nox, col = "blue", xlab = "dis", ylab = "nox", main = "Polynomial Regression")
lines(x_seq, predictions, col = "red", lwd = 2)

  1. Plot the polynomial fits for a range of different polynomial degrees (say, from 1 to 10), and report the associated residual sum of squares.
# Training RSS for polynomial degrees 1-10 (deterministic, so no seed is needed)
rss <- rep(0, 10)
for (i in 1:10) {
  bos_fit <- lm(nox ~ poly(dis, i), data = Boston)
  rss[i] <- sum(bos_fit$residuals^2)
}
rss
##  [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
##  [9] 1.833331 1.832171
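The exercise also asks to plot the fits themselves; a minimal sketch overlaying the ten polynomial fits on the scatterplot (reusing x_seq from part (a); fit_cols and deg_fit are just illustrative names):

# Overlay fits of degree 1 through 10
plot(Boston$dis, Boston$nox, col = "grey", xlab = "dis", ylab = "nox")
fit_cols <- rainbow(10)
for (i in 1:10) {
  deg_fit <- lm(nox ~ poly(dis, i), data = Boston)
  lines(x_seq, predict(deg_fit, newdata = list(dis = x_seq)), col = fit_cols[i])
}
legend("topright", legend = paste("degree", 1:10), col = fit_cols, lwd = 1, cex = 0.7)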
  1. Perform cross-validation or another approach to select the optimal degree for the polynomial, and explain your results.
# 10-fold CV error for each polynomial degree
set.seed(42)
degrees <- rep(0, 10)
for (i in 1:10) {
  bos.fit <- glm(nox ~ poly(dis, i), data = Boston)
  degrees[i] <- cv.glm(Boston, bos.fit, K = 10)$delta[1]
}

# Extracting Optimal Degree
optimal_degree <- which.min(degrees)
optimal_degree
## [1] 4
plot(1:10, degrees, xlab = "Degree", ylab = "MSE", type = "l")
points(optimal_degree, degrees[optimal_degree], col = 'red', cex = 2, pch = 19)

Through this method we see that the optimal degree is 4.

  1. Use the bs() function to fit a regression spline to predict nox using dis. Report the output for the fit using four degrees of freedom. How did you choose the knots? Plot the resulting fit.
library(splines)
dislims <- range(Boston$dis)
dis.grid <- seq(from = dislims[1], to = dislims[2])
attr(bs(Boston$dis, df = 4), "knots")
## [1] 3.20745
fit <- lm(nox ~ bs(dis, knots = c(3.20745)), data = Boston)
summary(fit)
## 
## Call:
## lm(formula = nox ~ bs(dis, knots = c(3.20745)), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.124622 -0.039259 -0.008514  0.020850  0.193891 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   0.73447    0.01460  50.306  < 2e-16 ***
## bs(dis, knots = c(3.20745))1 -0.05810    0.02186  -2.658  0.00812 ** 
## bs(dis, knots = c(3.20745))2 -0.46356    0.02366 -19.596  < 2e-16 ***
## bs(dis, knots = c(3.20745))3 -0.19979    0.04311  -4.634 4.58e-06 ***
## bs(dis, knots = c(3.20745))4 -0.38881    0.04551  -8.544  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared:  0.7164, Adjusted R-squared:  0.7142 
## F-statistic: 316.5 on 4 and 501 DF,  p-value: < 2.2e-16
pred <- predict(fit, newdata = list(dis = dis.grid), se = TRUE)
plot(Boston$dis, Boston$nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2 * pred$se.fit, lty = "dashed")
lines(dis.grid, pred$fit - 2 * pred$se.fit, lty = "dashed")

The knot was chosen implicitly by specifying df = 4: bs() places knots at uniform quantiles of dis, and with four degrees of freedom and a cubic basis there is only a single knot, at the median of dis (about 3.21).
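Equivalently, the knot can be left implicit by passing df = 4 directly; a one-line sketch:

# bs() places the single knot at the median of dis when df = 4 with a cubic basis
fit_df4 <- lm(nox ~ bs(dis, df = 4), data = Boston)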

  1. Now fit a regression spline for a range of degrees of freedom, and plot the resulting fits and report the resulting RSS. Describe the results obtained.
# Training RSS for regression splines with 4-15 degrees of freedom
# (entries 1-3 of rss remain 0 because those df values are not fit)
rss <- numeric(15)
for (i in 4:15) {
  bos.spline <- lm(nox ~ bs(dis, df = i), data = Boston)
  rss[i] <- sum(bos.spline$residuals^2)
}
rss
##  [1] 0.000000 0.000000 0.000000 1.922775 1.840173 1.833966 1.829884 1.816995
##  [9] 1.825653 1.792535 1.796992 1.788999 1.782350 1.781838 1.782798
plot(4:15, rss[4:15], type = "b", xlab = "Degrees of freedom", ylab = "RSS")

Training RSS generally decreases as the degrees of freedom increase, with the largest drop between df = 4 and df = 5 and only small, non-monotonic improvements after that; the minimum RSS in this range occurs at df = 14.

  1. Perform cross-validation or another approach in order to select the best degrees of freedom for a regression spline on this data. Describe your results.
# 10-fold CV error for regression splines with 4-15 degrees of freedom
set.seed(123)
degrees <- rep(0, 15)
for (i in 4:15) {
  bos.spline <- glm(nox ~ bs(dis, df = i), data = Boston)
  degrees[i] <- cv.glm(Boston, bos.spline, K = 10)$delta[1]
}


plot(4:15, degrees[4:15], xlab = "Degrees of freedom", ylab = "MSE", type = "l")

It appears that a regression spline with 14 degrees of freedom has the lowest estimated test MSE, as determined through cross-validation.
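To extract that value programmatically rather than reading it off the plot, a small sketch (offsetting the index because entries 1-3 of degrees are unused):

# Degrees of freedom with the lowest 10-fold CV error
best_df <- which.min(degrees[4:15]) + 3
best_df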

  1. This question relates to the College data set.
  1. Split the data into a training set and a test set. Using out-of-state tuition as the response and the other variables as the predictors, perform forward stepwise selection on the training set in order to identify a satisfactory model that uses just a subset of the predictors.
library(leaps)  # for regsubsets()

set.seed(123)
train_indices <- sample(1:nrow(College), nrow(College)/2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]

regfit.fwd <- regsubsets(Outstate ~ ., data = train_data, nvmax = 17, method = "forward")
fwd_summary <- summary(regfit.fwd)
fwd_min_bic <- which.min(fwd_summary$bic)
fwd_min_cp <- which.min(fwd_summary$cp)
fwd_max_adjr2 <- which.max(fwd_summary$adjr2)

# Plot BIC, Cp, and adjusted R-squared
par(mfrow = c(2, 2))  

plot(fwd_summary$bic, type = "b", xlab = "Number of Variables", ylab = "BIC", main = "BIC vs Number of Variables")
points(fwd_min_bic, fwd_summary$bic[fwd_min_bic], col = "red", pch = 16)

plot(fwd_summary$cp, type = "b", xlab = "Number of Variables", ylab = "Cp", main = "Cp vs Number of Variables")
points(fwd_min_cp, fwd_summary$cp[fwd_min_cp], col = "red", pch = 16)

plot(fwd_summary$adjr2, type = "b", xlab = "Number of Variables", ylab = "Adjusted R-squared", main = "Adjusted R-squared vs Number of Variables")
points(fwd_max_adjr2, fwd_summary$adjr2[fwd_max_adjr2], col = "red", pch = 16)

The plots suggest that a model with roughly 11 predictors performs best.
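To see which predictors make up that model (assuming the 11-variable fit suggested by the plots), the coefficients can be pulled from the regsubsets object:

# Coefficients of the 11-variable forward-stepwise model
coef(regfit.fwd, id = 11)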

  1. Fit a GAM on the training data, using out-of-state tuition as the response and the features selected in the previous step as the predictors. Plot the results, and explain your findings.
gam.m1<-gam(Outstate ~ Private + s(Apps, 2) + s(Accept) + s(Top25perc, 3) + s(F.Undergrad, 2) + s(Room.Board) + s(PhD, 2) + s(perc.alumni, 3) + s(Expend, 2) + s(Grad.Rate), data = train_data)

par(mfrow = c(1, 3))
plot(gam.m1, se = TRUE , col = "blue")

  1. Evaluate the model obtained on the test set, and explain the results obtained.

preds <- predict(gam.m1, newdata = test_data)
err <- mean((preds - test_data$Outstate)^2)
err
## [1] 3703063
tss <- mean((test_data$Outstate - mean(test_data$Outstate))^2)
test_r2 <- 1 - err/tss
test_r2
## [1] 0.7619557

The test MSE is about 3.70 million, and the corresponding test R-squared is about 0.76, so the GAM explains roughly 76% of the variance in out-of-state tuition on the held-out data.
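For context (a hedged sketch, not part of the original answer), fitting the same predictors with an ordinary linear model gives a baseline test error to compare against:

# OLS with the same predictors, evaluated on the same test set
lm.fit <- lm(Outstate ~ Private + Apps + Accept + Top25perc + F.Undergrad +
               Room.Board + PhD + perc.alumni + Expend + Grad.Rate,
             data = train_data)
lm.preds <- predict(lm.fit, newdata = test_data)
mean((lm.preds - test_data$Outstate)^2)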
  1. For which variables, if any, is there evidence of a non-linear relationship with the response?
summary(gam.m1)
## 
## Call: gam(formula = Outstate ~ Private + s(Apps, 2) + s(Accept) + s(Top25perc, 
##     3) + s(F.Undergrad, 2) + s(Room.Board) + s(PhD, 2) + s(perc.alumni, 
##     3) + s(Expend, 2) + s(Grad.Rate), data = train_data)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -6592.54 -1076.03    82.27  1230.29  8355.16 
## 
## (Dispersion Parameter for gaussian family taken to be 3544258)
## 
##     Null Deviance: 6505853731 on 387 degrees of freedom
## Residual Deviance: 1275932360 on 359.9998 degrees of freedom
## AIC: 6981.4 
## 
## Number of Local Scoring Iterations: NA 
## 
## Anova for Parametric Effects
##                    Df     Sum Sq    Mean Sq  F value    Pr(>F)    
## Private             1 2034279795 2034279795 573.9649 < 2.2e-16 ***
## s(Apps, 2)          1  939551224  939551224 265.0911 < 2.2e-16 ***
## s(Accept)           1   65276430   65276430  18.4175 2.283e-05 ***
## s(Top25perc, 3)     1  426503525  426503525 120.3365 < 2.2e-16 ***
## s(F.Undergrad, 2)   1   97642501   97642501  27.5495 2.622e-07 ***
## s(Room.Board)       1  286861205  286861205  80.9369 < 2.2e-16 ***
## s(PhD, 2)           1   88858466   88858466  25.0711 8.669e-07 ***
## s(perc.alumni, 3)   1  153617026  153617026  43.3425 1.628e-10 ***
## s(Expend, 2)        1  335269096  335269096  94.5950 < 2.2e-16 ***
## s(Grad.Rate)        1   25752058   25752058   7.2659  0.007358 ** 
## Residuals         360 1275932360    3544258                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                   Npar Df Npar F     Pr(F)    
## (Intercept)                                   
## Private                                       
## s(Apps, 2)              1  0.276  0.599838    
## s(Accept)               3  5.501  0.001053 ** 
## s(Top25perc, 3)         2  1.439  0.238433    
## s(F.Undergrad, 2)       1  5.247  0.022567 *  
## s(Room.Board)           3  2.832  0.038253 *  
## s(PhD, 2)               1  2.415  0.121046    
## s(perc.alumni, 3)       2  0.177  0.837980    
## s(Expend, 2)            1 43.405 1.582e-10 ***
## s(Grad.Rate)            3  3.046  0.028798 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Based on the ANOVA for nonparametric effects, there is evidence of a non-linear relationship with Outstate for Accept, F.Undergrad, Room.Board, Expend, and Grad.Rate; the non-linear terms for Apps, Top25perc, PhD, and perc.alumni are not significant.

  1. In Section 7.7, it was mentioned that GAMs are generally fit using a backfitting approach. The idea behind backfitting is actually quite simple. We will now explore backfitting in the context of multiple linear regression.

Suppose that we would like to perform multiple linear regression, but we do not have software to do so. Instead, we only have software to perform simple linear regression. Therefore, we take the following iterative approach: we repeatedly hold all but one coefficient estimate fixed at its current value, and update only that coefficient estimate using a simple linear regression. The process is continued until convergence—that is, until the coefficient estimates stop changing. We now try this out on a toy example.

  1. Generate a response Y and two predictors X1 and X2, with n = 100.

set.seed(1)  # for reproducibility (seed value is arbitrary)
x1 <- rnorm(100, 0, .25)
x2 <- rnorm(100, 0, .5)
e <- rnorm(100, 0, 2)
y <- 2 + 3*x1 + 2*x2 + e
  1. Initialize β̂1 to take on a value of your choice. It does not matter what value you choose.
beta0 <- 2
beta1 <- 1
  1. Keeping β̂1 fixed, fit the model Y − β̂1X1 = β0 + β2X2 + ϵ.

You can do this as follows:

a <- y - beta1 * x1
beta2 <- lm(a ~ x2)$coef[2]
  1. Keeping β̂2 fixed, fit the model Y − β̂2X2 = β0 + β1X1 + ϵ. You can do this as follows:
a <- y - beta2 * x2
beta1 <- lm(a ~ x1)$coef[2]
  1. Write a for loop to repeat (c) and (d) 1,000 times. Report the estimates of β̂0, β̂1, and β̂2 at each iteration of the for loop. Create a plot in which each of these values is displayed, with β̂0, β̂1, and β̂2 each shown in a different color.
beta1_int <- 1
beta0 <- c()
beta1 <- c()
beta2 <- c()

# First iteration
# beta_2:
a <- y - beta1_int * x1
beta2[1] <- lm(a ~ x2)$coef[2]

# beta_1:
b <- y - beta2[1] * x2
lm_coef <- lm(b ~ x1)$coef
beta1[1] <- lm_coef[2]

# beta_0:
beta0[1] <- lm_coef[1]


for (i in 2:1000) {
  a <- y - beta1[i-1] * x1
  beta2[i] <- lm(a ~ x2)$coef[2]
  b <- y - beta2[i] * x2      # use the freshly updated beta2
  lm_coef <- lm(b ~ x1)$coef
  beta1[i] <- lm_coef[2]
  beta0[i] <- lm_coef[1]
}

iteration <- 1:1000
df <- data.frame(Iteration = iteration, Beta0 = beta0, Beta1 = beta1, Beta2 = beta2)

# Melt the data to long format for plotting with ggplot2
library(reshape2)
library(ggplot2)
## Warning: package 'reshape2' was built under R version 4.3.1
df_melted <- melt(df, id.vars = "Iteration")

# Plot using ggplot
ggplot(df_melted, aes(x = Iteration, y = value, color = variable)) +
  geom_line() +
  labs(x = "Iteration", y = "Coefficient Estimate", color = "Coefficient") +
  theme_minimal()

  1. Compare your answer in (e) to the results of simply performing multiple linear regression to predict Y using X1 and X2. Use the abline() function to overlay those multiple linear regression coefficient estimates on the plot obtained in (e).
# lm model
multiple_model <- lm(y ~ x1 + x2)

# Extract coefficients
multiple_beta0 <- coef(multiple_model)[1]
multiple_beta1 <- coef(multiple_model)[2]
multiple_beta2 <- coef(multiple_model)[3]

iteration <- 1:1000

# Data frame for coefficients from the iterative approach
df_iterative <- data.frame(
  Iteration = iteration,
  Beta0 = beta0,
  Beta1 = beta1,
  Beta2 = beta2
)

# Melt the iterative data frame
library(reshape2)
df_iterative_melted <- melt(df_iterative, id.vars = "Iteration")

# Data frame with the multiple regression coefficients
df_lm <- data.frame(
  Coefficient = c(multiple_beta0, multiple_beta1, multiple_beta2),
  Variable = c("Beta0", "Beta1", "Beta2")
)

# Plot the backfitting coefficient paths and overlay the multiple regression
# estimates as dashed horizontal lines
ggplot(df_iterative_melted, aes(x = Iteration, y = value, color = variable)) +
  geom_line() +
  geom_hline(data = df_lm, aes(yintercept = Coefficient, color = Variable),
             linetype = "dashed") +
  labs(x = "Iteration", y = "Coefficient Estimate", color = "Coefficient") +
  theme_minimal()

The backfitting estimates line up almost exactly with the multiple linear regression coefficients.
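The exercise mentions abline(); an equivalent base-graphics version of the overlay would look like this (a sketch reusing the vectors computed above):

# Coefficient paths from backfitting, with the multiple-regression estimates
# overlaid as dashed horizontal lines via abline()
plot(iteration, beta0, type = "l", col = "red",
     ylim = range(c(beta0, beta1, beta2)),
     xlab = "Iteration", ylab = "Coefficient Estimate")
lines(iteration, beta1, col = "blue")
lines(iteration, beta2, col = "darkgreen")
abline(h = coef(multiple_model), lty = 2)
legend("topright", legend = c("beta0", "beta1", "beta2"),
       col = c("red", "blue", "darkgreen"), lty = 1)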
