library(ISLR2)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:lattice':
##
## melanoma
library(splines)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:ISLR2':
##
## Boston
fit.1 <- lm(wage ~ age, data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
fit.6 <- lm(wage ~ poly(age, 6), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, fit.6)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6636 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8936 0.001675 **
## 4 2995 4771604 1 6070 3.8117 0.050989 .
## 5 2994 4770322 1 1283 0.8054 0.369565
## 6 2993 4766389 1 3932 2.4692 0.116201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The idea is to prefer the lowest-degree polynomial that still captures most of the variance in wage. Based on the ANOVA, a cubic or quartic (degree 3 or 4) polynomial is the best choice; higher-degree terms add no significant improvement (p > 0.1).
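Because poly() builds orthogonal polynomials, these sequential tests can also be read off the coefficient t-statistics of the largest fit: anova() uses the residual mean square of fit.6, so each squared t-value from summary(fit.6) reproduces the corresponding F-statistic above. A quick check (sketch; output omitted):
# Squared t-statistics for the quadratic through sextic terms match
# the F column of the ANOVA table above
coef(summary(fit.6))[, "t value"]^2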
set.seed(96)
cv.errors <- rep(0, 6)
# LOOCV error for polynomial degrees 1 through 6
for (d in 1:6) {
  fit <- glm(wage ~ poly(age, d), data = Wage)
  cv.errors[d] <- cv.glm(Wage, fit)$delta[1]
}
cv.errors
## [1] 1676.235 1600.529 1595.960 1594.596 1594.879 1594.119
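Plotting these errors (a quick base-R sketch, matching the style of the later CV plots) shows the sharp drop from degree 1 to 2 and the flattening thereafter:
plot(1:6, cv.errors, type = "b", pch = 19, col = "blue",
     xlab = "Polynomial Degree", ylab = "LOOCV Error",
     main = "LOOCV Error vs Polynomial Degree")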
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm",
              formula = y ~ poly(x, 4),
              se = FALSE, color = "blue") +
  labs(title = "Polynomial Fit (Degree: 4)",
       x = "Age", y = "Wage") +
  theme_minimal()
- Using cross-validation, the optimal degree for the polynomial regression model was 4; the errors for degrees 4 through 6 are essentially identical, so the simplest of them is preferred. ANOVA testing supports the inclusion of terms up to degree 3 (p < 0.05), with degree 4 being borderline (p ≈ 0.05). The two approaches are therefore in general agreement, suggesting a 3rd- or 4th-degree polynomial is appropriate for modeling the non-linear relationship between age and wage.
set.seed(96)
cv.errors <- rep(NA, 9)  # from 2 to 10 cuts
for (k in 2:10) {
  Wage$age.cut <- cut(Wage$age, k)
  fit <- glm(wage ~ age.cut, data = Wage)
  cv.errors[k - 1] <- cv.glm(Wage, fit)$delta[1]
}
# Plot CV errors
plot(2:10, cv.errors, type = "b", pch = 19, col = "blue",
     xlab = "Number of Cuts (bins)", ylab = "LOOCV Error",
     main = "CV Error vs Number of Cuts in Step Function")
- As the number of cuts increases, the LOOCV error gradually decreases, reaching a minimum at 8 cuts. We therefore take 8 as the optimal number of cuts.
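We can confirm this programmatically; since index k - 1 stores the error for k cuts, we add 1 back (output omitted):
which.min(cv.errors) + 1  # optimal number of cuts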
Wage$age.cut <- cut(Wage$age, 8)
fit.step <- glm(wage ~ age.cut, data = Wage)
ggplot(Wage, aes(x = age, y = wage)) +
  geom_point(alpha = 0.3) +
  stat_summary(aes(group = age.cut), fun = mean, geom = "line",
               color = "red", linewidth = 1.2) +
  labs(title = "Step Function Fit with 8 Cuts",
       x = "Age", y = "Wage") +
  theme_minimal()
# Marital status
ggplot(Wage, aes(x = maritl, y = wage)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Wage vs Marital Status", x = "Marital Status", y = "Wage") +
  theme_minimal()
# Job class
ggplot(Wage, aes(x = jobclass, y = wage)) +
  geom_boxplot(fill = "lightgreen") +
  labs(title = "Wage vs Job Class", x = "Job Class", y = "Wage") +
  theme_minimal()
# Education
ggplot(Wage, aes(x = education, y = wage)) +
  geom_boxplot(fill = "lightcoral") +
  labs(title = "Wage vs Education", x = "Education", y = "Wage") +
  theme_minimal()
fit.flex <- lm(wage ~ ns(age, 4) + education + jobclass + maritl, data = Wage)
summary(fit.flex)
##
## Call:
## lm(formula = wage ~ ns(age, 4) + education + jobclass + maritl,
## data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.364 -19.703 -2.945 14.484 209.691
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.1850 4.1608 11.821 < 2e-16 ***
## ns(age, 4)1 32.7041 4.1625 7.857 5.45e-15 ***
## ns(age, 4)2 20.4086 4.0902 4.990 6.40e-07 ***
## ns(age, 4)3 38.4561 9.8346 3.910 9.42e-05 ***
## ns(age, 4)4 2.5793 7.6689 0.336 0.73664
## education2. HS Grad 10.8338 2.4074 4.500 7.05e-06 ***
## education3. Some College 22.9062 2.5487 8.988 < 2e-16 ***
## education4. College Grad 36.7756 2.5559 14.389 < 2e-16 ***
## education5. Advanced Degree 60.1003 2.8106 21.384 < 2e-16 ***
## jobclass2. Information 4.5422 1.3401 3.389 0.00071 ***
## maritl2. Married 13.8448 1.8570 7.456 1.17e-13 ***
## maritl3. Widowed -0.3111 8.1927 -0.038 0.96971
## maritl4. Divorced 0.2407 3.0106 0.080 0.93627
## maritl5. Separated 7.2134 4.9974 1.443 0.14900
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.77 on 2986 degrees of freedom
## Multiple R-squared: 0.3088, Adjusted R-squared: 0.3058
## F-statistic: 102.6 on 13 and 2986 DF, p-value: < 2.2e-16
This flexible model shows that education is the strongest predictor of wage: higher attainment, especially an advanced degree, is associated with substantial wage increases, up to ~$60 more than workers with less than a high-school education. Job class has a moderate effect: workers in the Information sector earn ~$4.54 more than those in Industrial roles. Marital status also plays a role: married individuals earn ~$13.84 more than those who have never married, while the other marital categories show no significant difference.
The natural spline in age captures the classic wage curve: wages rise with age, then plateau or decline in later years, a pattern a linear term would miss. The model explains about 31% of the variance in wage (R² = 0.31), suggesting that while the chosen predictors are important, other factors (such as experience, location, and occupation) likely also play a large role.
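As a supplementary check (a sketch, not part of the original output; fit.lin is a helper introduced here), a nested-model F-test against a strictly linear age effect verifies that the spline term is warranted:
# Does ns(age, 4) improve significantly on a linear age term?
fit.lin <- lm(wage ~ age + education + jobclass + maritl, data = Wage)
anova(fit.lin, fit.flex)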
age.grid <- data.frame(
  age = seq(18, 80, length.out = 100),
  education = factor("1. < HS Grad", levels = levels(Wage$education)),
  jobclass = factor("1. Industrial", levels = levels(Wage$jobclass)),
  maritl = factor("1. Never Married", levels = levels(Wage$maritl))
)
preds <- predict(fit.flex, newdata = age.grid)
age.grid$predicted_wage <- preds
ggplot(age.grid, aes(x = age, y = predicted_wage)) +
  geom_line(color = "blue", linewidth = 1.2) +
  labs(title = "Effect of Age on Predicted Wage",
       x = "Age", y = "Predicted Wage") +
  theme_minimal()
fit_cubic <- lm(nox ~ poly(dis, 3), data = Boston)
summary(fit_cubic)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
# Plot with fitted curve
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), color = "blue", se = FALSE) +
  labs(title = "Cubic Polynomial Fit", x = "Distance to Employment Centers", y = "NOx Concentration") +
  theme_minimal()
rss_vec <- numeric(10)
# Training RSS for polynomial degrees 1 through 10
for (d in 1:10) {
  fit <- lm(nox ~ poly(dis, d), data = Boston)
  rss_vec[d] <- sum(residuals(fit)^2)
}
plot(1:10, rss_vec, type = "b", pch = 19, col = "red",
     xlab = "Polynomial Degree", ylab = "Residual Sum of Squares (RSS)",
     main = "RSS for Polynomial Fits (Degree 1 to 10)")
- As the degree increases, the training RSS decreases monotonically, as expected: bias falls as flexibility grows. This can mask overfitting, which we check next with cross-validation.
cv_error <- rep(0, 10)
set.seed(42)
# 10-fold CV error for polynomial degrees 1 through 10
for (d in 1:10) {
  fit <- glm(nox ~ poly(dis, d), data = Boston)
  cv_error[d] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
# Plot CV error
plot(1:10, cv_error, type = "b", pch = 19, col = "blue",
     xlab = "Polynomial Degree", ylab = "10-fold CV Error",
     main = "CV Error for Polynomial Degree")
fit_spline4 <- lm(nox ~ bs(dis, df = 4), data = Boston)
summary(fit_spline4)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16 ***
## bs(dis, df = 4)1 -0.05810 0.02186 -2.658 0.00812 **
## bs(dis, df = 4)2 -0.46356 0.02366 -19.596 < 2e-16 ***
## bs(dis, df = 4)3 -0.19979 0.04311 -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881 0.04551 -8.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
ggplot(Boston, aes(x = dis, y = nox)) +
  geom_point(alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ bs(x, df = 4), color = "darkgreen", se = FALSE) +
  labs(title = "Regression Spline with 4 Degrees of Freedom", x = "dis", y = "nox") +
  theme_minimal()
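With df = 4 and the default cubic degree, bs() uses df - degree = 1 interior knot, which it places at the median of dis. The chosen location can be inspected from the basis attributes (a quick sketch; output omitted):
# bs() selects interior knots at quantiles of dis; here, the 50th percentile
attr(bs(Boston$dis, df = 4), "knots")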
rss_spline <- numeric(10)
# Training RSS for B-spline fits with 3 to 10 degrees of freedom
for (d in 3:10) {
  fit <- lm(nox ~ bs(dis, df = d), data = Boston)
  rss_spline[d] <- sum(residuals(fit)^2)
}
# Plot RSS for splines
plot(3:10, rss_spline[3:10], type = "b", pch = 19, col = "darkorange",
     xlab = "Degrees of Freedom", ylab = "RSS",
     main = "RSS for Regression Splines")
cv_spline <- rep(0, 10)
set.seed(123)
# 10-fold CV error for B-spline fits with 3 to 10 degrees of freedom
for (d in 3:10) {
  fit <- glm(nox ~ bs(dis, df = d), data = Boston)
  cv_spline[d] <- cv.glm(Boston, fit, K = 10)$delta[1]
}
## Warning in bs(dis, degree = 3L, knots = numeric(0), Boundary.knots = c(1.1691,
## : some 'x' values beyond boundary knots may cause ill-conditioned bases
## (... the same warning repeats for each df value and CV fold; duplicates omitted ...)
These repeated warnings arise because, within each CV fold, bs() chooses its knots and boundary knots from the training observations only, so some held-out dis values fall outside the boundary knots. They do not invalidate the comparison across df values.
# Plot CV error
plot(3:10, cv_spline[3:10], type = "b", pch = 19, col = "purple",
     xlab = "Degrees of Freedom", ylab = "10-fold CV Error",
     main = "CV Error for Regression Splines")
- From the plot, increasing the degrees of freedom has not yet crossed into clear overfitting, but the CV error reaches its minimum at 5 df. The 5-df model also has a reasonably low training RSS (not the lowest, but the lowest RSS would likely indicate overfitting), so we take 5 df as the best choice.
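Confirming programmatically (a sketch; output omitted):
# cv_spline is indexed by df directly, but only positions 3:10 were filled
which.min(cv_spline[3:10]) + 2  # df with the lowest 10-fold CV error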
set.seed(96)
n <- 100
x1 <- rnorm(n)
x2 <- rnorm(n)
y <- 3 + 2 * x1 - 3 * x2 + rnorm(n)  # true values: beta0 = 3, beta1 = 2, beta2 = -3
# Initialize beta1 and take one backfitting step for beta2
beta1 <- 0
a <- y - beta1 * x1
beta2 <- lm(a ~ x2)$coef[2]
beta2
## x2
## -3.238452
a <- y - beta2 * x2
beta1 <- lm(a ~ x1)$coef[2]
beta1
## x1
## 2.046237
beta1_vals <- numeric(1000)
beta2_vals <- numeric(1000)
beta0_vals <- numeric(1000)
beta1 <- 0  # resetting to 0
beta2 <- 0  # resetting to 0
# Backfitting: alternately regress the partial residuals on x2 and x1
for (i in 1:1000) {
  a <- y - beta1 * x1
  beta2 <- lm(a ~ x2)$coef[2]
  a <- y - beta2 * x2
  lm_fit <- lm(a ~ x1)
  beta1 <- lm_fit$coef[2]
  beta0 <- lm_fit$coef[1]
  beta1_vals[i] <- beta1
  beta2_vals[i] <- beta2
  beta0_vals[i] <- beta0
}
plot(beta0_vals, type = "l", col = "blue",
     ylim = range(c(beta0_vals, beta1_vals, beta2_vals)),
     ylab = "Coefficient Values", xlab = "Iteration",
     main = "Backfitting Coefficient Estimates")
lines(beta1_vals, col = "red")
lines(beta2_vals, col = "green")
legend("topright", legend = c("beta0", "beta1", "beta2"),
       col = c("blue", "red", "green"), lty = 1)
- The estimates are very close to the true values within the first few iterations (fewer than 10).
plot(beta0_vals, type = "l", col = "blue",
     ylim = range(c(beta0_vals, beta1_vals, beta2_vals)),
     ylab = "Coefficient Values", xlab = "Iteration",
     main = "Backfitting Coefficient Estimates")
lines(beta1_vals, col = "red")
lines(beta2_vals, col = "green")
legend("topright", legend = c("beta0", "beta1", "beta2"),
       col = c("blue", "red", "green"), lty = 1)
# Overlay the multiple regression estimates as dashed horizontal lines
lm_full <- lm(y ~ x1 + x2)
abline(h = coef(lm_full)[1], col = "blue", lty = 2)
abline(h = coef(lm_full)[2], col = "red", lty = 2)
abline(h = coef(lm_full)[3], col = "green", lty = 2)
# Look at the differences between successive iterations
diff_beta1 <- abs(diff(beta1_vals))
diff_beta2 <- abs(diff(beta2_vals))
diff_beta0 <- abs(diff(beta0_vals))
# Check when all the changes are below a small threshold (e.g., 1e-6)
converged_iter <- which((diff_beta1 < 1e-6) & (diff_beta2 < 1e-6) & (diff_beta0 < 1e-6))[1]
converged_iter
## [1] 4
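As a final sanity check (a sketch reusing lm_full from the plot above; output omitted), the last backfitting iterates should agree with the full least-squares fit up to numerical precision:
# Compare the final backfitted estimates with lm(y ~ x1 + x2)
rbind(backfit = c(beta0_vals[1000], beta1_vals[1000], beta2_vals[1000]),
      lm_full = coef(lm_full))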