# Load necessary libraries
library(ISLR2) # For the Wage dataset
library(boot) # For cv.glm
library(ggplot2)
# Prepare data
data("Wage")
set.seed(1)
# Perform cross-validation to choose optimal degree of polynomial
cv.errors <- rep(0, 10)
for (d in 1:10) {
fit <- glm(wage ~ poly(age, d), data = Wage)
cv.errors[d] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
# Find optimal degree
optimal.degree <- which.min(cv.errors)
print(paste("Optimal degree from CV:", optimal.degree))
## [1] "Optimal degree from CV: 9"
# Compare with ANOVA
fit.list <- lapply(1:5, function(d) lm(wage ~ poly(age, d), data = Wage))
anova.result <- do.call(anova, fit.list)
print(anova.result)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, d)
## Model 2: wage ~ poly(age, d)
## Model 3: wage ~ poly(age, d)
## Model 4: wage ~ poly(age, d)
## Model 5: wage ~ poly(age, d)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
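The ANOVA indicates that terms beyond degree 3 or 4 add little (the quartic term is borderline, the quintic is not significant), even though CV picks a higher degree, typically with only a small reduction in estimated error. Because poly() builds orthogonal polynomials, the same sequential comparisons can be read off a single degree-5 fit; a sketch (fit5 and tab5 are names introduced here):
# Sketch: t-tests from a single degree-5 fit reproduce the sequential ANOVA F-tests
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
tab5 <- coef(summary(fit5))
tab5                   # p-values match the ANOVA comparisons above
tab5[-1, "t value"]^2  # squared t-statistics equal the F-statistics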
# Plot polynomial fit with optimal degree
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.4) +
stat_smooth(method = "lm", formula = y ~ poly(x, optimal.degree), color = "red") +
ggtitle(paste("Polynomial Regression (degree =", optimal.degree, ")")) +
theme_minimal()
# Try different number of cuts and use cross-validation
cv.step.errors <- rep(NA, 10)
for (k in 2:10) {
Wage$cut.age <- cut(Wage$age, k)
fit <- glm(wage ~ cut.age, data = Wage)
cv.step.errors[k] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
# Optimal number of cuts
optimal.cuts <- which.min(cv.step.errors)
print(paste("Optimal number of cuts from CV:", optimal.cuts))
## [1] "Optimal number of cuts from CV: 8"
# Fit and plot the step function
Wage$cut.age <- cut(Wage$age, optimal.cuts)
fit.step <- glm(wage ~ cut.age, data = Wage)
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.4) +
geom_step(aes(y = predict(fit.step, Wage)), color = "blue") +
ggtitle(paste("Step Function with", optimal.cuts, "cuts")) +
theme_minimal()
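The fitted values are piecewise constant in age, so a cleaner way to draw them is to sort the predictions by age before plotting, which removes any dependence on the row order of Wage. A sketch (step.df is a helper name introduced here):
# Sketch: draw the fitted step function with ages sorted in increasing order
step.df <- data.frame(age = Wage$age, pred = predict(fit.step, Wage))
step.df <- step.df[order(step.df$age), ]
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.4) +
  geom_step(data = step.df, aes(x = age, y = pred), color = "blue") +
  labs(title = paste("Step Function with", optimal.cuts, "cuts (sorted ages)")) +
  theme_minimal()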
# Load libraries
library(ISLR2)
library(ggplot2)
library(splines)
# Load the Wage dataset
data("Wage")
# Fit a linear model with categorical and non-linear predictors
fit <- lm(wage ~ maritl + jobclass + education + ns(age, 4) + ns(year, 3), data = Wage)
# View model summary
summary(fit)
##
## Call:
## lm(formula = wage ~ maritl + jobclass + education + ns(age, 4) +
## ns(year, 3), data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -108.905 -19.489 -2.708 14.069 214.108
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.0296 4.3191 10.426 < 2e-16 ***
## maritl2. Married 14.0613 1.8548 7.581 4.55e-14 ***
## maritl3. Widowed 0.5773 8.1817 0.071 0.943756
## maritl4. Divorced 0.5451 3.0053 0.181 0.856078
## maritl5. Separated 7.1463 4.9860 1.433 0.151884
## jobclass2. Information 4.5711 1.3381 3.416 0.000644 ***
## education2. HS Grad 10.7011 2.4032 4.453 8.79e-06 ***
## education3. Some College 22.8784 2.5451 8.989 < 2e-16 ***
## education4. College Grad 36.6756 2.5507 14.378 < 2e-16 ***
## education5. Advanced Degree 59.8567 2.8049 21.340 < 2e-16 ***
## ns(age, 4)1 32.7209 4.1555 7.874 4.76e-15 ***
## ns(age, 4)2 19.3932 4.0886 4.743 2.20e-06 ***
## ns(age, 4)3 37.4034 9.8247 3.807 0.000143 ***
## ns(age, 4)4 3.0620 7.6593 0.400 0.689354
## ns(year, 3)1 4.0459 2.6704 1.515 0.129856
## ns(year, 3)2 11.7705 3.7494 3.139 0.001710 **
## ns(year, 3)3 5.0475 1.8910 2.669 0.007643 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.69 on 2983 degrees of freedom
## Multiple R-squared: 0.3127, Adjusted R-squared: 0.309
## F-statistic: 84.82 on 16 and 2983 DF, p-value: < 2.2e-16
# Plot wage vs marital status
ggplot(Wage, aes(x = maritl, y = wage)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Wage vs Marital Status", x = "Marital Status", y = "Wage") +
theme_minimal()
# Plot wage vs job class
ggplot(Wage, aes(x = jobclass, y = wage)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Wage vs Job Class", x = "Job Class", y = "Wage") +
theme_minimal()
# Plot wage vs education
ggplot(Wage, aes(x = education, y = wage)) +
geom_boxplot(fill = "lightcoral") +
labs(title = "Wage vs Education Level", x = "Education", y = "Wage") +
theme_minimal()
# Create prediction grid for plotting the non-linear effect of age
age.grid <- data.frame(
age = seq(min(Wage$age), max(Wage$age), length.out = 100),
year = median(Wage$year),
maritl = "1. Never Married",
jobclass = "1. Industrial",
education = "1. < HS Grad"
)
# Predict wages from the model using the grid
preds <- predict(fit, newdata = age.grid)
# Plot the non-linear fit: Wage vs Age
ggplot(data.frame(age = age.grid$age, wage = preds), aes(x = age, y = wage)) +
geom_line(color = "purple", linewidth = 1.2) +
labs(title = "Non-linear Fit: Wage vs Age", x = "Age", y = "Predicted Wage") +
theme_minimal()
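The curve above shows only point predictions. predict() can also return standard errors, giving approximate 95% confidence bands for the age effect; a sketch (band.df is a helper name introduced here):
# Sketch: approximate 95% confidence band around the predicted age effect
pred.se <- predict(fit, newdata = age.grid, se.fit = TRUE)
band.df <- data.frame(age = age.grid$age,
                      fit = pred.se$fit,
                      lower = pred.se$fit - 2 * pred.se$se.fit,
                      upper = pred.se$fit + 2 * pred.se$se.fit)
ggplot(band.df, aes(x = age, y = fit)) +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "grey80") +
  geom_line(color = "purple", linewidth = 1.2) +
  labs(title = "Wage vs Age with Approximate 95% Confidence Band",
       x = "Age", y = "Predicted Wage") +
  theme_minimal()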
Key fit statistics: Multiple R-squared = 0.3127, Adjusted R-squared = 0.309, F-statistic = 84.82 on 16 and 2983 degrees of freedom.
The model explains approximately 31% of the variation in wage, which is quite reasonable for wage data, given the influence of many unobserved or individual-specific factors.
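One way to confirm that the non-linear age term earns its extra degrees of freedom is to compare this model against one in which age enters linearly; the two models are nested, so an F-test applies. A sketch (fit_linear_age is a name introduced here):
# Sketch: compare the spline-in-age model to a linear-in-age alternative
fit_linear_age <- lm(wage ~ maritl + jobclass + education + age + ns(year, 3), data = Wage)
anova(fit_linear_age, fit)  # F-test for the additional non-linear age terms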
# Load required libraries
library(MASS) # For Boston dataset
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(ggplot2) # For plotting
library(splines) # For splines
library(boot) # For cross-validation
# Load the data
data("Boston")
# Quick view of variables
str(Boston[, c("dis", "nox")])
## 'data.frame': 506 obs. of 2 variables:
## $ dis: num 4.09 4.97 4.97 6.06 6.06 ...
## $ nox: num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
# Fit a cubic polynomial regression
fit_poly3 <- lm(nox ~ poly(dis, 3), data = Boston)
# Regression summary
summary(fit_poly3)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
# Plot data and fitted cubic polynomial
ggplot(Boston, aes(x = dis, y = nox)) +
geom_point(alpha = 0.4) +
stat_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "red") +
labs(title = "Cubic Polynomial Fit: nox ~ dis", x = "dis", y = "nox") +
theme_minimal()
The model includes an intercept and three orthogonal polynomial terms (poly(dis, 3)). All polynomial terms are highly statistically significant (p < 0.001), indicating a strong non-linear relationship between dis and nox.
Residual standard error (RSE): 0.0621; Multiple R-squared: 0.7148; Adjusted R-squared: 0.7131; F-statistic: 419.3 on 3 and 502 DF (p < 2.2e-16).
These statistics suggest that the cubic model explains approximately 71% of the variability in nox, indicating a strong model fit.
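Because poly() uses an orthogonal basis, the individual coefficients above are not directly interpretable as coefficients on dis, dis^2, and dis^3. Refitting with raw = TRUE gives those coefficients while leaving the fitted values unchanged; a sketch (fit_poly3_raw is a name introduced here):
# Sketch: refit with raw polynomial terms; the fit is unchanged, only the basis differs
fit_poly3_raw <- lm(nox ~ poly(dis, 3, raw = TRUE), data = Boston)
coef(fit_poly3_raw)                                  # coefficients on dis, dis^2, dis^3
all.equal(fitted(fit_poly3), fitted(fit_poly3_raw))  # should be TRUE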
rss_values <- numeric(10)
for (d in 1:10) {
fit <- lm(nox ~ poly(dis, d), data = Boston)
rss_values[d] <- sum(resid(fit)^2)
}
# Print RSS for each degree
rss_values
## [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
## [9] 1.833331 1.832171
# Plot RSS vs Degree
plot(1:10, rss_values, type = "b", pch = 19, col = "blue",
xlab = "Degree of Polynomial", ylab = "RSS",
main = "RSS vs Polynomial Degree")
# Display the RSS values computed above in a readable table
data.frame(Degree = 1:10, RSS = round(rss_values, 6))
## Degree RSS
## 1 1 2.768563
## 2 2 2.035262
## 3 3 1.934107
## 4 4 1.932981
## 5 5 1.915290
## 6 6 1.878257
## 7 7 1.849484
## 8 8 1.835630
## 9 9 1.833331
## 10 10 1.832171
set.seed(1)
cv_errors <- numeric(10)
for (d in 1:10) {
fit <- glm(nox ~ poly(dis, d), data = Boston)
cv_errors[d] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
# Optimal degree
optimal_degree <- which.min(cv_errors)
optimal_degree
## [1] 4
# Plot CV error vs Degree
plot(1:10, cv_errors, type = "b", pch = 19, col = "darkgreen",
xlab = "Degree of Polynomial", ylab = "10-fold CV Error",
main = "Cross-Validation to Select Polynomial Degree")
Polynomial degrees 1 to 3 showed substantial improvements in both RSS (from part b) and CV error, capturing the non-linear relationship between dis and nox.
Degree 4 offered the best trade-off between bias and variance — it was flexible enough to capture curvature in the data, but not so flexible as to overfit.
Higher degrees (5–10) slightly reduced training error, but did not generalize better, as shown by higher or similar CV errors.
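To make this concrete, the fit of the CV-selected degree can be overlaid on the cubic fit from earlier; an optional sketch:
# Sketch: compare the cubic fit with the CV-selected degree on the same plot
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(alpha = 0.4) +
  stat_smooth(method = "lm", formula = y ~ poly(x, 3), se = FALSE, color = "red") +
  stat_smooth(method = "lm", formula = y ~ poly(x, optimal_degree), se = FALSE, color = "darkgreen") +
  labs(title = paste("Cubic (red) vs CV-selected degree", optimal_degree, "(green)"),
       x = "dis", y = "nox") +
  theme_minimal()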
# Fit spline with 4 degrees of freedom
fit_spline4 <- lm(nox ~ bs(dis, df = 4), data = Boston)
# Summary
summary(fit_spline4)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16 ***
## bs(dis, df = 4)1 -0.05810 0.02186 -2.658 0.00812 **
## bs(dis, df = 4)2 -0.46356 0.02366 -19.596 < 2e-16 ***
## bs(dis, df = 4)3 -0.19979 0.04311 -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881 0.04551 -8.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
# Plot fit
ggplot(Boston, aes(x = dis, y = nox)) +
geom_point(alpha = 0.4) +
geom_line(aes(y = predict(fit_spline4)), color = "purple", linewidth = 1.2) +
labs(title = "Regression Spline (df = 4)", x = "dis", y = "nox") +
theme_minimal()
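With the default cubic basis, df = 4 leaves room for df - degree = 1 interior knot, which bs() places at a quantile of dis (here it should sit at the median). This can be checked directly; a quick sketch:
# Sketch: inspect the interior knot chosen by bs() when df = 4
attr(bs(Boston$dis, df = 4), "knots")  # one interior knot (df - degree = 1)
median(Boston$dis)                     # the knot should match the median of dis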
rss_splines <- numeric(8)
for (df in 3:10) {
fit <- lm(nox ~ bs(dis, df = df), data = Boston)
rss_splines[df - 2] <- sum(resid(fit)^2)
}
# Print RSS
rss_splines
## [1] 1.934107 1.922775 1.840173 1.833966 1.829884 1.816995 1.825653 1.792535
# Plot RSS vs df
plot(3:10, rss_splines, type = "b", pch = 19, col = "orange",
xlab = "Degrees of Freedom", ylab = "RSS",
main = "Regression Splines: RSS vs Degrees of Freedom")
# Set the range to avoid bs() boundary warnings (optional)
range_dis <- range(Boston$dis)
# Initialize vector to store CV errors
cv_spline_errors <- numeric(8)
# 10-fold CV for df = 3 to 10
for (df in 3:10) {
fit <- glm(nox ~ bs(dis, df = df, Boundary.knots = range_dis), data = Boston)
cv_spline_errors[df - 2] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
# Best df
best_df <- which.min(cv_spline_errors) + 2
best_df
## [1] 8
# Plot
plot(3:10, cv_spline_errors, type = "b", pch = 19, col = "darkred",
xlab = "Degrees of Freedom", ylab = "10-fold CV Error",
main = "Spline CV Error vs Degrees of Freedom")
Ten-fold cross-validation selects df = 8 here, although the CV errors are fairly flat across moderate values of df, so the exact minimum can shift with the random fold assignment.
In practice, a spline with roughly 6 to 8 degrees of freedom offers enough flexibility to capture the non-linear relationship between dis and nox while avoiding both underfitting (as with very low df) and overfitting, striking a reasonable balance between bias and variance.
This aligns with the trend observed in the RSS values in part (e), where improvements tapered off around df = 6–7.
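As an alternative to scanning df by hand, a smoothing spline lets the data choose the effective degrees of freedom automatically; a sketch using smooth.spline, whose default criterion is generalized cross-validation (fit_ss is a name introduced here):
# Sketch: smoothing spline with effective df chosen by generalized cross-validation
fit_ss <- smooth.spline(Boston$dis, Boston$nox)  # GCV is the default criterion
fit_ss$df                                        # effective degrees of freedom selected
plot(Boston$dis, Boston$nox, col = "grey", pch = 16,
     xlab = "dis", ylab = "nox", main = "Smoothing Spline Fit (GCV)")
lines(fit_ss, col = "darkred", lwd = 2)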
set.seed(1)
n <- 100
x1 <- rnorm(n, mean = 0, sd = sqrt(2))
x2 <- rnorm(n, mean = 0, sd = sqrt(2))
epsilon <- rnorm(n, mean = 0, sd = 1)
beta0_true <- 1.5
beta1_true <- 3
beta2_true <- -4.5
y <- beta0_true + beta1_true * x1 + beta2_true * x2 + epsilon
beta1 <- 0
beta2 <- 0
beta0 <- mean(y) # Initialize intercept
# Store coefficient estimates
iterations <- 1000
beta0_vals <- numeric(iterations)
beta1_vals <- numeric(iterations)
beta2_vals <- numeric(iterations)
# Assume beta1 is already initialized (e.g., beta1 <- 0)
# Compute adjusted response
a <- y - beta1 * x1
# Fit simple linear regression: a ~ x2
fit <- lm(a ~ x2)
# Extract coefficients
beta0 <- fit$coef[1]
beta2 <- fit$coef[2]
# View updated values
beta0
## (Intercept)
## 1.989472
beta2
## x2
## -4.540618
# Assume beta2 was updated in step (c)
# Adjust the response by subtracting beta2 * x2
a <- y - beta2 * x2
# Fit simple linear regression: a ~ x1
fit <- lm(a ~ x1)
# Update coefficients
beta0 <- fit$coef[1]
beta1 <- fit$coef[2]
# View updated values
beta0
## (Intercept)
## 1.525204
beta1
## x1
## 3.014924
# (a) Generate the data
set.seed(1)
n <- 100
x1 <- rnorm(n, mean = 0, sd = sqrt(2))
x2 <- rnorm(n, mean = 0, sd = sqrt(2))
epsilon <- rnorm(n, mean = 0, sd = 1)
y <- 1.5 + 3 * x1 - 4.5 * x2 + epsilon
# (b) Initialize coefficients
beta1 <- 0
beta2 <- 0
beta0 <- mean(y) # intercept starts as mean of y
# Prepare to store values for 1000 iterations
iterations <- 1000
beta0_vals <- numeric(iterations)
beta1_vals <- numeric(iterations)
beta2_vals <- numeric(iterations)
# (e) Repeat steps (c) and (d) in a loop
for (i in 1:iterations) {
# (c) Update beta2, keeping beta1 fixed
a <- y - beta1 * x1
fit2 <- lm(a ~ x2)
beta0 <- fit2$coef[1]
beta2 <- fit2$coef[2]
# (d) Update beta1, keeping beta2 fixed
a <- y - beta2 * x2
fit1 <- lm(a ~ x1)
beta0 <- fit1$coef[1] # update intercept again
beta1 <- fit1$coef[2]
# Store the estimates
beta0_vals[i] <- beta0
beta1_vals[i] <- beta1
beta2_vals[i] <- beta2
}
# Print only: first 5 iterations, and then every 50th
print_steps <- c(1:5, seq(50, 1000, 50))
results <- data.frame(
Iteration = print_steps,
beta0 = round(beta0_vals[print_steps], 4),
beta1 = round(beta1_vals[print_steps], 4),
beta2 = round(beta2_vals[print_steps], 4)
)
print(results)
## Iteration beta0 beta1 beta2
## 1 1 1.5252 3.0149 -4.5406
## 2 2 1.5254 3.0149 -4.5378
## 3 3 1.5254 3.0149 -4.5378
## 4 4 1.5254 3.0149 -4.5378
## 5 5 1.5254 3.0149 -4.5378
## 6 50 1.5254 3.0149 -4.5378
## 7 100 1.5254 3.0149 -4.5378
## 8 150 1.5254 3.0149 -4.5378
## 9 200 1.5254 3.0149 -4.5378
## 10 250 1.5254 3.0149 -4.5378
## 11 300 1.5254 3.0149 -4.5378
## 12 350 1.5254 3.0149 -4.5378
## 13 400 1.5254 3.0149 -4.5378
## 14 450 1.5254 3.0149 -4.5378
## 15 500 1.5254 3.0149 -4.5378
## 16 550 1.5254 3.0149 -4.5378
## 17 600 1.5254 3.0149 -4.5378
## 18 650 1.5254 3.0149 -4.5378
## 19 700 1.5254 3.0149 -4.5378
## 20 750 1.5254 3.0149 -4.5378
## 21 800 1.5254 3.0149 -4.5378
## 22 850 1.5254 3.0149 -4.5378
## 23 900 1.5254 3.0149 -4.5378
## 24 950 1.5254 3.0149 -4.5378
## 25 1000 1.5254 3.0149 -4.5378
# Load plotting libraries
library(ggplot2)
library(tidyr)
# Prepare long-format data for ggplot
plot_df <- data.frame(
Iteration = 1:iterations,
beta0 = beta0_vals,
beta1 = beta1_vals,
beta2 = beta2_vals
)
plot_long <- pivot_longer(plot_df, cols = -Iteration,
names_to = "Coefficient", values_to = "Estimate")
# Plot each beta with a different color
ggplot(plot_long, aes(x = Iteration, y = Estimate, color = Coefficient)) +
geom_line(linewidth = 1) +
labs(title = "Backfitting Estimates of β0, β1, and β2 Over Iterations",
x = "Iteration", y = "Coefficient Estimate") +
theme_minimal() +
scale_color_manual(values = c("beta0" = "darkgreen", "beta1" = "blue", "beta2" = "red")) +
theme(legend.title = element_blank())
# Fit the full multiple linear regression for comparison
lm_fit <- lm(y ~ x1 + x2)
lm_coefs <- coef(lm_fit) # least-squares estimates (not the true betas)
# Overlay the multiple regression coefficients on the plot from (e)
ggplot(plot_long, aes(x = Iteration, y = Estimate, color = Coefficient)) +
geom_line(linewidth = 1) +
# Add dashed horizontal lines at the multiple regression estimates
geom_hline(yintercept = lm_coefs[1], linetype = "dashed", color = "darkgreen") + # beta0
geom_hline(yintercept = lm_coefs[2], linetype = "dashed", color = "blue") + # beta1
geom_hline(yintercept = lm_coefs[3], linetype = "dashed", color = "red") + # beta2
labs(title = "Backfitting Coefficient Estimates vs Multiple Regression Coefficients",
x = "Iteration", y = "Coefficient Estimate") +
theme_minimal() +
scale_color_manual(values = c("beta0" = "darkgreen", "beta1" = "blue", "beta2" = "red")) +
theme(legend.title = element_blank())
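A natural follow-up is how many backfitting passes are actually needed before the estimates settle; judging from the table above, very few. A sketch that counts iterations until beta1 stops changing and compares the final estimates with the multiple regression fit (tol, changes, and converged_at are helper names, and the 1e-8 tolerance is arbitrary):
# Sketch: count how many backfitting passes are needed before beta1 stops changing
tol <- 1e-8                       # arbitrary convergence tolerance for illustration
changes <- abs(diff(beta1_vals))  # change in beta1 between successive iterations
converged_at <- which(changes < tol)[1] + 1
converged_at
# Compare the final backfitting estimates with the multiple regression coefficients
rbind(backfitting = c(beta0_vals[iterations], beta1_vals[iterations], beta2_vals[iterations]),
      multiple_lm = coef(lm_fit))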