# Load required libraries
library(ISLR)
library(boot)
library(ggplot2)

# Load the Wage data
data(Wage)

# Set seed for reproducibility
set.seed(1)

# Step 1: Use cross-validation to choose the best degree (1 to 10)
cv.error <- rep(0, 10)

for (d in 1:10) {
  fit <- glm(wage ~ poly(age, d), data = Wage)
  cv.error[d] <- cv.glm(Wage, fit)$delta[1]
}

# Find best degree based on lowest CV error
best.degree <- which.min(cv.error)
cat("Best degree chosen by cross-validation:", best.degree, "\n")
## Best degree chosen by cross-validation: 9
# Plot the CV error vs degree
plot(1:10, cv.error, type = "b", xlab = "Polynomial Degree", ylab = "LOOCV Error",
     main = "LOOCV Error by Polynomial Degree")

# Step 2: Compare with ANOVA (nested hypothesis testing)
fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)

anova_result <- anova(fit.1, fit.2, fit.3, fit.4, fit.5)
print(anova_result)
## Analysis of Variance Table
## 
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
##   Res.Df     RSS Df Sum of Sq        F    Pr(>F)    
## 1   2998 5022216                                    
## 2   2997 4793430  1    228786 143.5931 < 2.2e-16 ***
## 3   2996 4777674  1     15756   9.8888  0.001679 ** 
## 4   2995 4771604  1      6070   3.8098  0.051046 .  
## 5   2994 4770322  1      1283   0.8050  0.369682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 3: Fit the final model using the best degree from CV
final_model <- lm(wage ~ poly(age, best.degree), data = Wage)

# Step 4: Make predictions and plot the polynomial fit
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
preds <- predict(final_model, newdata = list(age = age.grid))

# Plot the actual data and polynomial fit
plot(Wage$age, Wage$wage, col = "gray", main = paste("Polynomial Regression Fit (Degree =", best.degree, ")"),
     xlab = "Age", ylab = "Wage")
lines(age.grid, preds, col = "blue", lwd = 2)

We performed polynomial regression to predict wage from age on the Wage dataset. Using leave-one-out cross-validation (LOOCV), the optimal polynomial degree was 9, which gave the lowest prediction error. In contrast, the nested hypothesis tests from anova() showed that degree 3 was the last clearly significant term (p < 0.05); degree 4 was marginal (p ≈ 0.051), and higher degrees did not yield statistically significant improvements. The discrepancy illustrates a trade-off: cross-validation prioritizes predictive accuracy and may select more flexible models, while the ANOVA comparison emphasizes statistical significance and model simplicity. The LOOCV error plot supports the selection of degree 9, while the polynomial regression plot shows how this higher-degree polynomial captures more detail, possibly at the risk of overfitting.
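As a quick follow-up (our own addition, not part of the exercise), we can quantify how close the simpler degrees come to the LOOCV minimum; if degrees 3–4 are within a fraction of a percent of degree 9, the more parsimonious choice suggested by the ANOVA comparison is defensible on predictive grounds as well.

# Relative increase in LOOCV error over the minimum, in percent (supplementary check)
rel.increase <- 100 * (cv.error - min(cv.error)) / min(cv.error)
round(rel.increase, 2)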

# Set seed for reproducibility
set.seed(2)

# Step 1: Try different numbers of cuts (2 to 10), use CV to choose the best
cv.error.step <- rep(NA, 9)

for (k in 2:10) {
  Wage$age.cut <- cut(Wage$age, k)
  fit <- glm(wage ~ age.cut, data = Wage)
  cv.error.step[k - 1] <- cv.glm(Wage, fit, K = 10)$delta[1]
}

# Step 2: Best number of cuts (lowest CV error)
best.cuts <- which.min(cv.error.step) + 1
cat("Best number of cuts chosen by CV:", best.cuts, "\n")
## Best number of cuts chosen by CV: 8
# Plot CV errors for visual inspection
plot(2:10, cv.error.step, type = "b", xlab = "Number of Cuts", ylab = "CV Error",
     main = "CV Error for Step Function by Number of Cuts")

# Step 3: Fit model using best number of cuts
Wage$age.cut <- cut(Wage$age, best.cuts)
fit.step <- lm(wage ~ age.cut, data = Wage)

# Step 4: Plot the step function fit
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
# Note: this works because age.grid spans exactly the same range as Wage$age,
# so cut() reproduces the same bins for the grid as for the training data
pred.step <- predict(fit.step, newdata = data.frame(age.cut = cut(age.grid, best.cuts)))

plot(Wage$age, Wage$wage, col = "gray", main = paste("Step Function Fit (", best.cuts, " cuts)"),
     xlab = "Age", ylab = "Wage")
lines(age.grid, pred.step, col = "red", lwd = 2)

We fit a step function to predict wage using age by dividing the age variable into k intervals using the cut() function, and performed 10-fold cross-validation to choose the best number of cuts. As shown in the CV error plot, the lowest CV error occurred at 8 cuts, suggesting that this number of intervals provides the best trade-off between model flexibility and generalization error. The resulting fit was visualized as a piecewise constant function, where predicted wage remains constant within each age bin. Compared to polynomial regression, the step function offers a simpler, more interpretable structure, though it may lack smoothness and miss subtle nonlinear trends.
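One caveat with the prediction step above is that cut() is applied separately to the prediction grid; it only works because age.grid spans exactly the same range as Wage$age. A more robust variant (a sketch of our own, using the hypothetical names age.cut2, fit.step2, and pred.step2) stores equal-width breakpoints explicitly and reuses them at prediction time:

# Store explicit breakpoints so the same bins are used for fitting and prediction
breaks <- seq(min(Wage$age), max(Wage$age), length.out = best.cuts + 1)
Wage$age.cut2 <- cut(Wage$age, breaks = breaks, include.lowest = TRUE)
fit.step2 <- lm(wage ~ age.cut2, data = Wage)
pred.step2 <- predict(fit.step2,
                      newdata = data.frame(age.cut2 = cut(age.grid, breaks = breaks,
                                                          include.lowest = TRUE)))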

# Load required libraries
library(ISLR)
library(splines)
library(ggplot2)
library(dplyr)
# Load the Wage data
data(Wage)

# --- 1. Summarize categorical variables of interest ---
summary(Wage$maritl)
## 1. Never Married       2. Married       3. Widowed      4. Divorced 
##              648             2074               19              204 
##     5. Separated 
##               55
summary(Wage$jobclass)
##  1. Industrial 2. Information 
##           1544           1456
summary(Wage$education)
##       1. < HS Grad         2. HS Grad    3. Some College    4. College Grad 
##                268                971                650                685 
## 5. Advanced Degree 
##                426
# --- 2. Visualize Wage by categorical predictors ---
par(mfrow = c(1, 3))
boxplot(wage ~ maritl, data = Wage, main = "Wage by Marital Status", las = 2)
boxplot(wage ~ jobclass, data = Wage, main = "Wage by Job Class", las = 2)
boxplot(wage ~ education, data = Wage, main = "Wage by Education", las = 2)

# --- 3. Fit spline models for continuous predictors ---

# Spline fit for age (df = 5)
fit.age <- lm(wage ~ ns(age, df = 5), data = Wage)

# Spline fit for year (df = 4)
fit.year <- lm(wage ~ ns(year, df = 4), data = Wage)

# Interaction model: wage ~ age (spline) * education
fit.interaction <- lm(wage ~ ns(age, df = 4) * education, data = Wage)

# --- 4. Plot spline fit: wage vs age ---
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
pred.age <- predict(fit.age, newdata = list(age = age.grid))

plot(Wage$age, Wage$wage, col = "gray", main = "Spline Fit: Wage vs Age",
     xlab = "Age", ylab = "Wage")
lines(age.grid, pred.age, col = "blue", lwd = 2)

# --- 5. Plot spline fit: wage vs year ---
year.grid <- seq(from = min(Wage$year), to = max(Wage$year), length.out = 100)
pred.year <- predict(fit.year, newdata = list(year = year.grid))

plot(Wage$year, Wage$wage, col = "gray", main = "Spline Fit: Wage vs Year",
     xlab = "Year", ylab = "Wage")
lines(year.grid, pred.year, col = "red", lwd = 2)

# --- 6. Plot interaction: wage vs age across education levels ---
education.levels <- unique(Wage$education)
colors <- rainbow(length(education.levels))

plot(Wage$age, Wage$wage, col = "gray", main = "Wage vs Age by Education Level",
     xlab = "Age", ylab = "Wage")

for (i in seq_along(education.levels)) {
  ed <- education.levels[i]
  age.seq <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
  pred <- predict(fit.interaction, newdata = data.frame(age = age.seq, education = ed))
  lines(age.seq, pred, col = colors[i], lwd = 2)
}
legend("topright", legend = education.levels, col = colors, lwd = 2, cex = 0.6)

Wage by Marital Status: Married individuals have a slightly higher median wage than the other groups. Never Married and Divorced individuals show similar distributions with slightly lower medians. The Widowed and Separated groups are very small (n = 19 and n = 55) and show greater variance, likely a consequence of sample size. Marital status has some relationship with wage, but given the imbalance in group sizes it is probably secondary to education or job class.

Wage by Job Class: Individuals in Information jobs clearly earn higher wages on average than those in Industrial jobs. The median and interquartile range for the Information class are shifted upward, and extreme high earners are more frequent.

Wage by Education: There is a clear increasing trend in wage with education level. The < HS Grad group has the lowest median wage, while the Advanced Degree group has the highest median wage and the largest spread, giving a very clean progression from less to more education.

We analyzed the relationship between several predictors and wage using both categorical and continuous variables. Education, job class, and marital status show meaningful differences in wage distributions. Using natural splines, we found that wage increases non-linearly with age, peaking around 50–60, and grows slightly over calendar years. Most notably, an interaction between age and education revealed that wage increases with age are much steeper for more educated individuals, highlighting the compounding effect of education over time.
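To check whether the age-by-education interaction is actually needed (a supplementary check we added, not required by the exercise), the interaction model can be compared with a purely additive natural-spline model via a standard F-test:

# Additive model without the interaction, for comparison with fit.interaction
fit.additive <- lm(wage ~ ns(age, df = 4) + education, data = Wage)
anova(fit.additive, fit.interaction)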

9a)

# Load necessary libraries
library(MASS)      # for Boston dataset
library(ggplot2)

# Load the data
data(Boston)

# Fit a cubic polynomial regression: nox ~ poly(dis, 3)
fit.cubic <- lm(nox ~ poly(dis, 3), data = Boston)

# Print regression output
summary(fit.cubic)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
# Create prediction grid
dis.grid <- seq(from = min(Boston$dis), to = max(Boston$dis), length.out = 200)
pred.cubic <- predict(fit.cubic, newdata = data.frame(dis = dis.grid))

# Plot the data and the fitted cubic polynomial
plot(Boston$dis, Boston$nox, col = "gray", pch = 1,
     main = "Cubic Polynomial Fit: NOX vs DIS",
     xlab = "Distance to Employment Centers (dis)",
     ylab = "NOX Concentration")

lines(dis.grid, pred.cubic, col = "blue", lwd = 2)

There is a strong non-linear relationship between dis (distance to employment centers) and nox (NOX concentration). The significant higher-order polynomial terms indicate clear curvature: the curve decreases steeply at first and then levels out, matching the intuition that NOX declines as you move away from urban areas.

In the plot, the gray points are the actual data and the blue line is the fitted cubic polynomial. The curve shows a smooth downward trend, flattening beyond dis ≈ 6.

We used a cubic polynomial regression model to predict nox (nitrogen oxides concentration) from dis (distance to employment centers) using the poly() function in R. The model was highly significant (F = 419.3, p < 2e-16) and explained 71.5% of the variation in NOX. All three polynomial terms were statistically significant (p < 0.001). The resulting curve indicates that NOX levels decline non-linearly as distance from city centers increases, flattening out in outer suburbs. This relationship is consistent with urban pollution dynamics.
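Optionally (our own addition, not part of the original analysis), approximate 95% confidence bands can be overlaid on the cubic fit using the standard errors returned by predict():

# Approximate 95% confidence bands around the fitted cubic curve
pred.se <- predict(fit.cubic, newdata = data.frame(dis = dis.grid), se.fit = TRUE)
lines(dis.grid, pred.se$fit + 2 * pred.se$se.fit, col = "blue", lty = 2)
lines(dis.grid, pred.se$fit - 2 * pred.se$se.fit, col = "blue", lty = 2)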

9b)

# Load libraries
library(MASS)

# Load the dataset
data(Boston)

# Create prediction grid
dis.grid <- seq(min(Boston$dis), max(Boston$dis), length.out = 200)

# Store RSS and colors for each degree
degrees <- 1:10
rss.values <- numeric(length(degrees))
colors <- rainbow(length(degrees))

# Plot the raw data
plot(Boston$dis, Boston$nox, col = "gray", pch = 1,
     main = "Polynomial Fits of Degrees 1 to 10",
     xlab = "Distance to Employment Centers (dis)",
     ylab = "NOX Concentration")

# Fit and plot each polynomial
for (d in degrees) {
  fit <- lm(nox ~ poly(dis, d), data = Boston)
  pred <- predict(fit, newdata = data.frame(dis = dis.grid))
  lines(dis.grid, pred, col = colors[d], lwd = 2)
  
  # Calculate and store RSS
  rss.values[d] <- sum(residuals(fit)^2)
}

# Add legend
legend("topright", legend = paste("Degree", degrees), col = colors, lwd = 2, cex = 0.7)

# Print RSS values
cat("Residual Sum of Squares (RSS) by Degree:\n")
## Residual Sum of Squares (RSS) by Degree:
for (d in degrees) {
  cat(sprintf("Degree %2d: RSS = %.4f\n", d, rss.values[d]))
}
## Degree  1: RSS = 2.7686
## Degree  2: RSS = 2.0353
## Degree  3: RSS = 1.9341
## Degree  4: RSS = 1.9330
## Degree  5: RSS = 1.9153
## Degree  6: RSS = 1.8783
## Degree  7: RSS = 1.8495
## Degree  8: RSS = 1.8356
## Degree  9: RSS = 1.8333
## Degree 10: RSS = 1.8322

Degrees 1–3 reduce RSS significantly, capturing the general trend that nox decreases as dis increases. After degree 4, the curve becomes increasingly wiggly, especially at the boundaries (as seen for degrees 7–10). While RSS continues to drop, the improvement beyond degree 4 is minimal — indicating diminishing returns. Degree 10 has the lowest RSS, but the plot clearly shows signs of overfitting.

We fit polynomial regression models of degrees 1 through 10 to predict nox using dis. As the degree increased, the Residual Sum of Squares (RSS) decreased from 2.77 (degree 1) to 1.83 (degree 10), indicating better in-sample fit. However, the rate of improvement slowed drastically after degree 4, and higher-degree models produced curves with unnatural oscillations, especially near the boundaries of dis. A degree 3 or 4 model appears optimal, balancing model complexity and smoothness without overfitting. This finding aligns with visual inspection and supports avoiding excessive degrees despite slightly lower RSS.
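Because RSS always decreases with degree, a penalized criterion gives a fairer in-sample comparison. As a supplementary sketch (our addition), AIC can be computed for each degree:

# AIC penalizes the extra parameters that raw RSS ignores
aic.values <- sapply(degrees, function(d) AIC(lm(nox ~ poly(dis, d), data = Boston)))
plot(degrees, aic.values, type = "b",
     xlab = "Polynomial Degree", ylab = "AIC", main = "AIC by Polynomial Degree")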

9c)

library(MASS)
library(boot)

# Load the dataset
data(Boston)

# Store CV errors
cv.error <- numeric(10)

# Perform 10-fold CV for polynomial degrees 1 to 10
set.seed(123)  # for reproducibility
for (d in 1:10) {
  fit <- glm(nox ~ poly(dis, d), data = Boston)
  cv.error[d] <- cv.glm(Boston, fit, K = 10)$delta[1]
}

# Plot CV Error vs Degree
plot(1:10, cv.error, type = "b", pch = 19, col = "blue",
     xlab = "Polynomial Degree", ylab = "10-fold CV Error",
     main = "Cross-Validation Error by Polynomial Degree")

# Print CV Errors
cat("10-Fold CV Errors by Degree:\n")
## 10-Fold CV Errors by Degree:
for (d in 1:10) {
  cat(sprintf("Degree %2d: CV Error = %.4f\n", d, cv.error[d]))
}
## Degree  1: CV Error = 0.0055
## Degree  2: CV Error = 0.0041
## Degree  3: CV Error = 0.0039
## Degree  4: CV Error = 0.0039
## Degree  5: CV Error = 0.0041
## Degree  6: CV Error = 0.0054
## Degree  7: CV Error = 0.0103
## Degree  8: CV Error = 0.0137
## Degree  9: CV Error = 0.0149
## Degree 10: CV Error = 0.0089
The CV error drops steeply between degrees 1 and 3, reaches its minimum at degrees 3 and 4, and then increases sharply for degrees ≥ 6, indicating overfitting.

We used 10-fold cross-validation to identify the optimal polynomial degree for predicting nox from dis.
The cross-validation error was lowest for degrees 3 and 4 (CV error = 0.0039), indicating the best generalization performance.
Higher-degree models showed increased error, especially from degrees 6 to 9, which is a classic sign of overfitting.
Based on these results, we conclude that a cubic (degree 3) or quartic (degree 4) polynomial provides the optimal trade-off between bias and variance.
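Since K-fold CV depends on the random fold assignment, a quick stability check (our own addition) is to repeat the procedure over several seeds and record which degree wins each time:

# Repeat 10-fold CV across a few seeds to check that the selected degree is stable
seeds <- 1:5
best.by.seed <- sapply(seeds, function(s) {
  set.seed(s)
  errs <- sapply(1:10, function(d) {
    fit <- glm(nox ~ poly(dis, d), data = Boston)
    cv.glm(Boston, fit, K = 10)$delta[1]
  })
  which.min(errs)
})
best.by.seed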

9d)



library(MASS)
library(splines)

# Fit regression spline with 4 degrees of freedom
fit.spline <- lm(nox ~ bs(dis, df = 4), data = Boston)

# Display model summary
summary(fit.spline)
## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.124622 -0.039259 -0.008514  0.020850  0.193891 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.73447    0.01460  50.306  < 2e-16 ***
## bs(dis, df = 4)1 -0.05810    0.02186  -2.658  0.00812 ** 
## bs(dis, df = 4)2 -0.46356    0.02366 -19.596  < 2e-16 ***
## bs(dis, df = 4)3 -0.19979    0.04311  -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881    0.04551  -8.544  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared:  0.7164, Adjusted R-squared:  0.7142 
## F-statistic: 316.5 on 4 and 501 DF,  p-value: < 2.2e-16
# Show automatically chosen internal knots
knots <- attr(bs(Boston$dis, df = 4), "knots")
cat("Knots used (automatically selected):\n")
## Knots used (automatically selected):
print(knots)
## [1] 3.20745
# Generate prediction grid
dis.grid <- seq(min(Boston$dis), max(Boston$dis), length.out = 200)
pred.spline <- predict(fit.spline, newdata = data.frame(dis = dis.grid))

# Plot data and spline fit
plot(Boston$dis, Boston$nox, col = "gray", pch = 1,
     main = "Regression Spline Fit (df = 4)",
     xlab = "Distance to Employment Centers (dis)",
     ylab = "NOX Concentration")
lines(dis.grid, pred.spline, col = "darkgreen", lwd = 2)



The green spline curve captures the non-linear decrease in NOX as distance from employment centers increases.
Unlike the high-degree polynomials, the spline is smooth and does not oscillate or overfit.
The curve levels off after dis ≈ 6, which aligns with the flattening seen in the earlier polynomial fits.

We fit a regression spline using the bs() function with 4 degrees of freedom to model nox as a function of dis.
The model was highly significant (p < 2e-16) and explained 71.6% of the variance in NOX levels. All spline basis terms were statistically significant.
With df = 4, R placed one internal knot at dis ≈ 3.21, automatically chosen based on quantiles of the predictor.
The spline curve effectively captures the non-linear decrease in NOX with increasing distance, while remaining smooth and interpretable. It performs comparably to the cubic polynomial but avoids overfitting and erratic boundary behavior.
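Since the single automatic knot sits at the median of dis, the same model can be written with the knot stated explicitly instead of via df (a sketch of our own, using the hypothetical name fit.spline.knot); this makes the knot choice visible and should reproduce the df = 4 fit above:

# Equivalent specification with the interior knot placed explicitly at the median of dis
fit.spline.knot <- lm(nox ~ bs(dis, knots = median(Boston$dis)), data = Boston)
summary(fit.spline.knot)$r.squared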

9e)



library(MASS)
library(splines)

data(Boston)

# Prediction grid for smoother curves
dis.grid <- seq(min(Boston$dis), max(Boston$dis), length.out = 200)

# Degrees of freedom to test
df.range <- 3:10
colors <- rainbow(length(df.range))
rss <- numeric(length(df.range))

# Plot base scatter
plot(Boston$dis, Boston$nox, col = "gray", pch = 1,
     main = "Regression Splines with df = 3 to 10",
     xlab = "Distance to Employment Centers (dis)",
     ylab = "NOX Concentration")

# Loop over dfs: fit spline, compute RSS, plot line
for (i in seq_along(df.range)) {
  df <- df.range[i]
  fit <- lm(nox ~ bs(dis, df = df), data = Boston)
  pred <- predict(fit, newdata = data.frame(dis = dis.grid))
  lines(dis.grid, pred, col = colors[i], lwd = 2)
  
  # Compute RSS
  rss[i] <- sum(residuals(fit)^2)
}

# Legend
legend("topright", legend = paste("df =", df.range), col = colors, lwd = 2, cex = 0.7)

# Print RSS values
cat("RSS for regression splines:\n")
## RSS for regression splines:
for (i in seq_along(df.range)) {
  cat(sprintf("df = %2d: RSS = %.4f\n", df.range[i], rss[i]))
}
## df =  3: RSS = 1.9341
## df =  4: RSS = 1.9228
## df =  5: RSS = 1.8402
## df =  6: RSS = 1.8340
## df =  7: RSS = 1.8299
## df =  8: RSS = 1.8170
## df =  9: RSS = 1.8257
## df = 10: RSS = 1.7925

The plot shows the regression spline fits with df ranging from 3 to 10. Lower df values (3–5) produce smooth curves that generalize well. Higher df values (8–10) allow the model to follow the data closely, especially in regions with dense points, but may begin to overfit or introduce unnecessary wiggles.

We fit regression splines with degrees of freedom from 3 to 10 to model nox as a function of dis. As expected, the Residual Sum of Squares (RSS) decreased steadily, from 1.93 (df = 3) to 1.79 (df = 10). This confirms that additional flexibility improves the in-sample fit. However, the rate of RSS improvement diminishes beyond df = 6, and the plotted curves become increasingly complex. This suggests that degrees between 5 and 7 offer a good trade-off between bias and variance, while higher df (e.g., 9–10) may risk overfitting. The best approach would involve cross-validation to formally assess predictive accuracy for each spline.
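One way to tame the boundary wiggles noted above (an optional comparison we added) is a natural spline: ns() constrains the fit to be linear beyond the boundary knots. The sketch below compares ns() and bs() at the same degrees of freedom:

# Compare a natural spline and a B-spline at df = 5
fit.ns <- lm(nox ~ ns(dis, df = 5), data = Boston)
fit.bs <- lm(nox ~ bs(dis, df = 5), data = Boston)
plot(Boston$dis, Boston$nox, col = "gray", pch = 1,
     main = "Natural Spline vs B-Spline (df = 5)",
     xlab = "Distance to Employment Centers (dis)", ylab = "NOX Concentration")
lines(dis.grid, predict(fit.ns, newdata = data.frame(dis = dis.grid)), col = "black", lwd = 2)
lines(dis.grid, predict(fit.bs, newdata = data.frame(dis = dis.grid)), col = "purple", lwd = 2, lty = 2)
legend("topright", legend = c("ns, df = 5", "bs, df = 5"),
       col = c("black", "purple"), lwd = 2, lty = c(1, 2), cex = 0.7)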

9f)

library(MASS)
library(boot)
library(splines)

data(Boston)

set.seed(42)

df.range <- 3:10
cv.errors <- numeric(length(df.range))

# Use fixed boundary to prevent errors during CV
boundary <- range(Boston$dis)

# Perform 10-fold CV for each df
for (i in seq_along(df.range)) {
  df <- df.range[i]
  fit <- glm(nox ~ bs(dis, df = df, Boundary.knots = boundary), data = Boston)
  cv.errors[i] <- cv.glm(Boston, fit, K = 10)$delta[1]
}

# Plot CV error vs df
plot(df.range, cv.errors, type = "b", pch = 19, col = "blue",
     xlab = "Degrees of Freedom", ylab = "10-fold CV Error",
     main = "CV Error for Regression Spline")

# Print optimal df
best.df <- df.range[which.min(cv.errors)]
cat(sprintf("Best degrees of freedom by CV: %d\n", best.df))
## Best degrees of freedom by CV: 10

We applied 10-fold cross-validation to select the best degrees of freedom (df) for a regression spline modeling nox as a function of dis using the bs() function. The cross-validation error was evaluated for df values ranging from 3 to 10. While the error fluctuated slightly, the lowest CV error was observed at df = 10. The CV error curve suggests that models with moderate to high flexibility (df = 6 to 10) perform similarly, but df = 10 yielded the best generalization to unseen data based on this metric. Therefore, a regression spline with 10 degrees of freedom is selected as the optimal model for this analysis.

While several models from df = 5 to 9 performed comparably, df = 10 gave the lowest cross-validated error on this fold split and captured the non-linear relationship between nox and dis effectively, so we select df = 10 as the final regression spline, keeping in mind that the choice is close and a different random fold assignment could favor a slightly smaller df.
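As an alternative reference point not required by the exercise, smooth.spline() selects its own effective degrees of freedom by generalized cross-validation, which gives an independent indication of how much flexibility the data support:

# Smoothing spline with GCV-selected flexibility (the default criterion)
fit.ss <- smooth.spline(Boston$dis, Boston$nox)
fit.ss$df  # effective degrees of freedom chosen by GCV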

11a)

set.seed(42)

# Parameters
n <- 100
beta0 <- 1.5
beta1 <- 3.0
beta2 <- -4.5

# Generate X1 ~ N(0, 2)
X1 <- rnorm(n, mean = 0, sd = sqrt(2))

# Generate X2 ~ N(1, 2)
X2 <- rnorm(n, mean = 1, sd = sqrt(2))

# Generate error ε ~ N(0, 1)
epsilon <- rnorm(n, mean = 0, sd = 1)

# Generate response Y
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon

# Store in a data frame
data <- data.frame(X1, X2, Y)

# Display first few rows
head(data)
##           X1         X2          Y
## 1  1.9388280  2.6984215  -6.827342
## 2 -0.7986038  2.4775012 -11.710789
## 3  0.5135411 -0.4187513   6.096329
## 4  0.8950029  3.6141482 -10.019119
## 5  0.5717217  0.0570400   1.581624
## 6 -0.1500827  1.1492191  -5.272590

11b)

# Initialize coefficients (as per part b)
beta0 <- 0
beta1 <- 1  # You can choose any value here (e.g., 0, 1, or even random)
beta2 <- 0
print(beta0)
## [1] 0
print(beta1)
## [1] 1
print(beta2)
## [1] 0

beta0 is initialized to 0, beta1 to 1, and beta2 to 0.

11c)

# Assume beta1 is fixed (e.g., from initialization or previous iteration)
a <- Y - beta1 * X1  # Partial residual
beta2 <- lm(a ~ X2)$coef[2]  # Update beta2
beta0 <- lm(a ~ X2)$coef[1]
cat("Updated beta0:", beta0, "\n")
## Updated beta0: 1.468819
cat("Updated beta2:", beta2, "\n")
## Updated beta2: -4.371298

β₀ (the intercept) was updated to 1.4688, very close to the true value of 1.5, and β₂ (the slope for X2) was updated to -4.3713, very close to the true value of -4.5. These values come from a simple linear regression on the partial residual, holding β₁ fixed at 1. Since the estimates are already close to the truth, the backfitting update is working as expected after just one iteration.
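Since parts (c) and (d) perform the same kind of update with the roles of the predictors swapped, it can be packaged as a small helper function (a hypothetical helper of our own, update_coef, not part of the exercise):

# One backfitting step: regress the partial residual on the predictor being updated
update_coef <- function(y, x_fixed, beta_fixed, x_update) {
  a <- y - beta_fixed * x_fixed          # partial residual with the other coefficient held fixed
  fit <- lm(a ~ x_update)
  c(intercept = unname(coef(fit)[1]), slope = unname(coef(fit)[2]))
}
update_coef(Y, X1, 1, X2)  # reproduces the beta0 / beta2 update shown above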

11d)

# Keeping beta2 fixed, update beta1
a <- Y - beta2 * X2  # Partial residual
beta1 <- lm(a ~ X1)$coef[2]  # Update beta1
print(beta0)
## (Intercept) 
##    1.468819
print(beta1)
##       X1 
## 2.896525
print(beta2)
##        X2 
## -4.371298

We are gradually estimating the coefficients by iterating between updates of one coefficient at a time while holding the other fixed. After this step the estimates are β₁ ≈ 2.90 and β₂ ≈ -4.37 (with β₀ ≈ 1.47 from the previous update), already close to the true values used to simulate the data: β₀ = 1.5, β₁ = 3, β₂ = -4.5.

11e)

# Load necessary libraries
library(ggplot2)

# Set seed for reproducibility
set.seed(42)

# Generate data
n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))      # mean 0, variance 2
X2 <- rnorm(n, mean = 1, sd = sqrt(2))      # mean 1, variance 2
epsilon <- rnorm(n, mean = 0, sd = 1)       # variance 1

# True coefficients
beta0_true <- 1.5
beta1_true <- 3
beta2_true <- -4.5

# Response
Y <- beta0_true + beta1_true * X1 + beta2_true * X2 + epsilon

# Initialize
beta0 <- 0
beta1 <- 1  # you can choose any value
beta2 <- 0

# Store coefficient paths
beta0_path <- numeric(1000)
beta1_path <- numeric(1000)
beta2_path <- numeric(1000)

# Backfitting algorithm
for (i in 1:1000) {
  # Update beta2 (fix beta1)
  a <- Y - beta1 * X1
  lm_fit <- lm(a ~ X2)
  beta2 <- lm_fit$coef[2]
  beta0 <- lm_fit$coef[1]
  
  # Update beta1 (fix beta2)
  a <- Y - beta2 * X2
  lm_fit <- lm(a ~ X1)
  beta1 <- lm_fit$coef[2]
  beta0 <- lm_fit$coef[1]
  
  # Store
  beta0_path[i] <- beta0
  beta1_path[i] <- beta1
  beta2_path[i] <- beta2
}

# Print selected iterations
print_steps <- c(1:5, seq(50, 1000, by = 50))
output <- data.frame(
  Iteration = print_steps,
  beta0 = beta0_path[print_steps],
  beta1 = beta1_path[print_steps],
  beta2 = beta2_path[print_steps]
)
print(output)
##    Iteration    beta0    beta1     beta2
## 1          1 1.381611 2.896525 -4.371298
## 2          2 1.441396 2.898380 -4.439622
## 3          3 1.441455 2.898382 -4.439689
## 4          4 1.441455 2.898382 -4.439689
## 5          5 1.441455 2.898382 -4.439689
## 6         50 1.441455 2.898382 -4.439689
## 7        100 1.441455 2.898382 -4.439689
## 8        150 1.441455 2.898382 -4.439689
## 9        200 1.441455 2.898382 -4.439689
## 10       250 1.441455 2.898382 -4.439689
## 11       300 1.441455 2.898382 -4.439689
## 12       350 1.441455 2.898382 -4.439689
## 13       400 1.441455 2.898382 -4.439689
## 14       450 1.441455 2.898382 -4.439689
## 15       500 1.441455 2.898382 -4.439689
## 16       550 1.441455 2.898382 -4.439689
## 17       600 1.441455 2.898382 -4.439689
## 18       650 1.441455 2.898382 -4.439689
## 19       700 1.441455 2.898382 -4.439689
## 20       750 1.441455 2.898382 -4.439689
## 21       800 1.441455 2.898382 -4.439689
## 22       850 1.441455 2.898382 -4.439689
## 23       900 1.441455 2.898382 -4.439689
## 24       950 1.441455 2.898382 -4.439689
## 25      1000 1.441455 2.898382 -4.439689
# Plot evolution of coefficients
df_plot <- data.frame(
  Iteration = rep(1:1000, 3),
  Value = c(beta0_path, beta1_path, beta2_path),
  Coefficient = rep(c("beta0", "beta1", "beta2"), each = 1000)
)

ggplot(df_plot, aes(x = Iteration, y = Value, color = Coefficient)) +
  geom_line(linewidth = 1) +
  labs(title = "Backfitting Convergence of Coefficients",
       y = "Coefficient Estimate") +
  theme_minimal()

From the table and the plot, the values of β₀, β₁, and β₂ converge very quickly (within the first few iterations) and then remain constant for the rest of the 1,000 iterations. By iteration 3 the coefficients have stabilized, and no further meaningful updates occur.

The final estimates after convergence are approximately β₀ ≈ 1.441, β₁ ≈ 2.898, and β₂ ≈ -4.440, which are very close to the true values used in the data-generating process: β₀ = 1.5, β₁ = 3, β₂ = -4.5.

The flat lines in the plot indicate convergence — the coefficients no longer change after a few iterations. Each coefficient is represented in a different color, and the plot clearly shows stability, which is a key feature of successful backfitting.

The backfitting algorithm correctly alternates between updating β₁ and β₂ while holding the other fixed, and it converges rapidly and reliably to values close to the true coefficients. The plot and summary table clearly demonstrate this convergence behavior.
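Convergence can also be quantified rather than judged only from the plot (a supplementary check we added): track the largest absolute change in any coefficient between successive iterations and find the first iteration at which it drops below a small tolerance.

# Largest absolute change in any coefficient between successive iterations
max.change <- pmax(abs(diff(beta0_path)), abs(diff(beta1_path)), abs(diff(beta2_path)))
which(max.change < 1e-8)[1] + 1  # first iteration at which all coefficients are stable to 1e-8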

11f)

# Simulate the data
set.seed(42)
n <- 100
x1 <- rnorm(n, mean = 0, sd = sqrt(2))
x2 <- rnorm(n, mean = 1, sd = sqrt(2))
epsilon <- rnorm(n, mean = 0, sd = 1)

# True coefficients
beta0_true <- 1.5
beta1_true <- 3
beta2_true <- -4.5

# Generate response
y <- beta0_true + beta1_true * x1 + beta2_true * x2 + epsilon

# Initialize beta estimates
beta0 <- 0
beta1 <- 1
beta2 <- 0

# Store estimates
beta0_path <- numeric(1000)
beta1_path <- numeric(1000)
beta2_path <- numeric(1000)

# Backfitting iterations
for (i in 1:1000) {
  # Update beta2 (fix beta1)
  res2 <- y - beta1 * x1
  fit2 <- lm(res2 ~ x2)
  beta2 <- coef(fit2)[2]
  beta0 <- coef(fit2)[1]
  
  # Update beta1 (fix beta2)
  res1 <- y - beta2 * x2
  fit1 <- lm(res1 ~ x1)
  beta1 <- coef(fit1)[2]
  beta0 <- coef(fit1)[1]
  
  # Save estimates
  beta0_path[i] <- beta0
  beta1_path[i] <- beta1
  beta2_path[i] <- beta2
}

# Fit multiple regression model
df <- data.frame(x1 = x1, x2 = x2, y = y)
lm_fit <- lm(y ~ x1 + x2, data = df)
coef_lm <- coef(lm_fit)

# Plot backfitting paths using base R
plot(1:1000, beta0_path, type = "l", col = "red", ylim = range(c(beta0_path, beta1_path, beta2_path)),
     xlab = "Iteration", ylab = "Coefficient Estimate",
     main = "Backfitting Convergence with abline() Overlay")
lines(1:1000, beta1_path, col = "green")
lines(1:1000, beta2_path, col = "blue")

# Overlay lm() estimates using abline()
abline(h = coef_lm[1], col = "red", lty = 2)   # beta0
abline(h = coef_lm["x1"], col = "green", lty = 2)  # beta1
abline(h = coef_lm["x2"], col = "blue", lty = 2)   # beta2

# Add legend
legend("bottomright", legend = c("beta0", "beta1", "beta2",
                                 "lm beta0", "lm beta1", "lm beta2"),
       col = c("red", "green", "blue", "red", "green", "blue"),
       lty = c(1, 1, 1, 2, 2, 2), cex = 0.8)

The dashed lines, added via abline(), mark the OLS estimates from lm(y ~ x1 + x2); these are the closed-form multiple regression coefficients that backfitting should converge to.

All backfitting estimates converge quickly and stabilize by roughly the 3rd iteration, and the converged values match the OLS estimates shown by the dashed lines. This confirms the correctness of the backfitting implementation: it is functionally equivalent to multiple linear regression.
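The agreement can also be checked numerically rather than only visually (our own addition):

# Converged backfitting estimates versus the closed-form lm() coefficients
rbind(backfitting = c(beta0, beta1, beta2),
      lm = unname(coef_lm))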

11g)

Starting from iteration 3, the estimates remain unchanged to at least six decimal places. The final values, which match the lm() coefficients, were beta0 ≈ 1.441455, beta1 ≈ 2.898382, and beta2 ≈ -4.439689.

These values are reached by iteration 3, so only about three backfitting iterations were needed to obtain a good approximation of the multiple regression coefficients. This shows how fast and efficient the backfitting algorithm can be when the model is linear and the data are clean.