Complete the following exercises from Introduction to Statistical Learning

Chapter 7: (6), (7), (9), and (11)

Note: For problem 6, the hypothesis-testing approach refers to the process used on page 285 of the book, where the anova() function is used with its test argument set to "F". This is the usual statistical-inference approach to selecting the degree.
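A minimal sketch of that comparison, assuming the Wage data used in problem 6: nested polynomial fits are compared with sequential F-tests.

library(ISLR)
fit.1 <- lm(wage ~ age, data = Wage)              # degree 1
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)     # degree 2
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)     # degree 3
anova(fit.1, fit.2, fit.3, test = "F")            # sequential F-tests between nested models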

Note: For problem 7, you can easily include the categorical variables alongside non-linear functions of the quantitative ones by simply entering them into the model; a reference level will be chosen automatically.
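For instance, a minimal sketch (assuming the Wage data and the mgcv package used below):

library(ISLR); library(mgcv)
fit <- gam(wage ~ s(age) + maritl + jobclass, data = Wage)  # factors enter directly; the first level of each is the reference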

Note: For problem 9, you will encounter warnings when using CV with the bs() function. There is an argument, Boundary.knots, which you can specify to avoid this, but you are not required to. Please do not let warnings print to your R Markdown document; you should be able to suppress them by specifying warning = FALSE in your R Markdown code chunk options.
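A sketch of both remedies (the fixed boundary knots are optional and illustrative):

# In the chunk header, {r, warning=FALSE} keeps warnings out of the knitted
# document. Alternatively, pin the boundary knots to the full range of dis
# so that no cross-validation fold sees x values outside them:
library(MASS); library(splines)
fit <- lm(nox ~ bs(dis, df = 4, Boundary.knots = range(Boston$dis)), data = Boston)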

Note: For problem 11, please generate \(X_1\) and \(X_2\) independently as normally distributed with means 0 and 1, respectively, both with variance 2.

Also, the variance of the measurement error should be 1 (\(Var(\epsilon) = 1\)), and the values of \(\beta_0\), \(\beta_1\), and \(\beta_2\) should be 1.5, 3, and -4.5, respectively. This will make grading easier; you may experiment with changing these values on your own, but please do not turn in your assignment with other values. Only print your results for \(\beta_0\), \(\beta_1\), and \(\beta_2\) at the first 5 steps and then at iterations 50, 100, 150, and so on up to 1000; please do not print all 1000 lines. Please provide a legend and legible colors in your plot.
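A minimal sketch of the data-generation and printing setup exactly as specified above (note that rnorm() takes a standard deviation, so sd = sqrt(2) gives variance 2; keep is a name introduced here):

set.seed(1)
n   <- 100
X1  <- rnorm(n, mean = 0, sd = sqrt(2))   # Var(X1) = 2
X2  <- rnorm(n, mean = 1, sd = sqrt(2))   # Var(X2) = 2
eps <- rnorm(n)                           # Var(epsilon) = 1
Y   <- 1.5 + 3 * X1 - 4.5 * X2 + eps      # beta0 = 1.5, beta1 = 3, beta2 = -4.5
keep <- c(1:5, seq(50, 1000, by = 50))    # iterations whose estimates get printed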

(6)

a)

# Load necessary libraries
library(ggplot2)
library(ISLR)
library(boot)
# Load the Wage dataset
data(Wage)
wage_data <- Wage

# Function to perform cross-validation for polynomial regression
cv_error <- function(degree, data) {
  glm_fit <- glm(wage ~ poly(age, degree), data=data)
  cv <- cv.glm(data, glm_fit, K=10)  # 10-fold CV
  return(cv$delta[1])  # Return the CV error
}

# Apply cross-validation for various degrees of the polynomial
degrees <- 1:10
cv_errors <- sapply(degrees, cv_error, data=wage_data)

# Find the optimal degree
optimal_degree <- degrees[which.min(cv_errors)]
print(paste("Optimal degree: ", optimal_degree))
## [1] "Optimal degree:  9"
# Fit the model with the optimal degree
optimal_model <- glm(wage ~ poly(age, optimal_degree), data=wage_data)

# Plotting
ggplot(wage_data, aes(x=age, y=wage)) +
  geom_point(alpha=0.5) +
  stat_function(fun=function(x) predict(optimal_model, newdata=data.frame(age=x)), color="blue") +
  ggtitle(paste("Polynomial Fit with Degree", optimal_degree)) +
  xlab("Age") + ylab("Wage")

This script selects the polynomial degree for predicting wage from age by minimizing the 10-fold cross-validation error, then fits the polynomial regression with that degree and plots its predictions over the original data points.

b)

data(Wage)
wage_data <- Wage

# cross-validation error calculation with specific number of cuts
cv_error_step <- function(cuts, data) {
  data$age_cut <- cut(data$age, breaks=quantile(data$age, probs=seq(0, 1, length.out=cuts + 1), na.rm = TRUE), include.lowest=TRUE)
  glm_fit <- glm(wage ~ age_cut, data=data)
  
  # Perform 10-fold cross-validation and return the error
  cv_result <- cv.glm(data, glm_fit, K=10)
  return(cv_result$delta[1])  # Cross-validation error
}

# Determine the optimal number of cuts by comparing cross-validation errors
cuts_range <- 2:20
cv_errors <- sapply(cuts_range, function(cuts) cv_error_step(cuts, wage_data))

# Optimal number of cuts
optimal_cuts <- cuts_range[which.min(cv_errors)]  # map the index back to the number of cuts
cat("Optimal number of cuts:", optimal_cuts, "\n")
## Optimal number of cuts: 14
# Fit the model with the optimal number of cuts
wage_data$age_cut <- cut(wage_data$age, breaks=quantile(wage_data$age, probs=seq(0, 1, length.out=optimal_cuts + 1), na.rm = TRUE), include.lowest=TRUE)
optimal_model <- glm(wage ~ age_cut, data=wage_data)

# Plotting
ggplot(wage_data, aes(x=age, y=wage)) +
  geom_point(aes(color=age_cut), alpha=0.5) +
  geom_smooth(method="glm", formula=y~1, method.args=list(family="gaussian"), aes(group=age_cut), se=FALSE, color="blue") +  # y ~ 1 fits a constant (the step mean) within each bin
  scale_color_viridis_d() +
  labs(title=paste("Step Function Fit with", optimal_cuts, "Cuts"), x="Age", y="Wage")

The plotting step maps age_cut directly in the aesthetics, so the color coding and the fitted segments accurately represent the step-function mean within each age bin.

(7)

library(ISLR)   
library(mgcv) 
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(ggplot2)  

data(Wage)

# Fit a GAM with wage as the response and maritl, jobclass, and a smooth term for age as predictors
gam_fit <- gam(wage ~ s(age) + factor(maritl) + factor(jobclass), data=Wage)

# Summary of the GAM fit to examine the effect of predictors
summary(gam_fit)
## 
## Family: gaussian 
## Link function: identity 
## 
## Formula:
## wage ~ s(age) + factor(maritl) + factor(jobclass)
## 
## Parametric coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      94.512      1.898  49.803  < 2e-16 ***
## factor(maritl)2. Married         14.501      2.061   7.036 2.44e-12 ***
## factor(maritl)3. Widowed         -1.887      9.113  -0.207    0.836    
## factor(maritl)4. Divorced        -2.048      3.345  -0.612    0.540    
## factor(maritl)5. Separated       -3.324      5.533  -0.601    0.548    
## factor(jobclass)2. Information   15.204      1.423  10.686  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##          edf Ref.df     F p-value    
## s(age) 5.027  6.116 19.26  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.141   Deviance explained = 14.4%
## GCV = 1501.8  Scale est. = 1496.3    n = 3000
# Plot the effect of age
plot(gam_fit, pages=1, all.terms=TRUE, se=TRUE, scheme=1, main="Effects on Wage")

Predicted Wage Across Age by Marital Status

library(ISLR)
library(mgcv)
library(ggplot2)

# Load the Wage dataset
data(Wage)

# Fit a GAM with wage as the response
gam_fit <- gam(wage ~ s(age) + maritl + jobclass, data=Wage)

# Generate predictions from the model across the range of `age`, for each level of `maritl` and `jobclass`
wage_data_pred <- with(Wage, expand.grid(age = seq(min(age), max(age), length.out = 100),
                                         maritl = levels(maritl),
                                         jobclass = levels(jobclass)))

# Add predictions
wage_data_pred$wage_pred <- predict(gam_fit, newdata = wage_data_pred)

# Plotting the effect of Marital Status on Wage across Age
ggplot(wage_data_pred, aes(x = age, y = wage_pred, color = maritl)) +
  geom_line() +
  labs(title = "Predicted Wage Across Age by Marital Status", x = "Age", y = "Predicted Wage") +
  theme_minimal() +
  scale_color_brewer(palette = "Set1")

Summary of Findings:

Effect of Age: The GAM allows us to visualize how wage varies non-linearly with age. Typically, we expect wages to increase with age as experience grows, with the increase slowing or even plateauing at later ages.

Marital Status (maritl): The model can reveal differences in wage distribution across different marital statuses. For example, individuals who are married might have different wage characteristics compared to those who are never married, divorced, or widowed, due to factors like stability or dual incomes.

Job Class (jobclass): This predictor categorizes individuals into different job classes (e.g., Information and White/Blue Collar). The model helps us understand how these categories relate to wages, potentially reflecting the varying economic values of different types of jobs.

Non-linearity: By using a GAM, I captured the non-linear relationship between age and wage that a standard linear regression would miss. Because the model is additive, each categorical predictor shifts the age curve up or down without changing its shape; this flexibility yields a more accurate fit and clearer insight into the data.
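One hedged way to confirm that the smooth term is warranted is to compare the GAM against a purely linear fit; this sketch reuses gam_fit from above (gam_linear is a name introduced here):

library(mgcv)
gam_linear <- gam(wage ~ age + maritl + jobclass, data = Wage)  # age enters linearly
anova(gam_linear, gam_fit, test = "F")  # F-test for the extra flexibility of s(age)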

Predicted Wage Across Age by Job Class

data(Wage)

# Fit a GAM with wage as the response
gam_fit <- gam(wage ~ s(age) + maritl + jobclass, data=Wage)

# Generate predictions from the model across the range of `age`, for each level of `jobclass`
wage_data_pred <- with(Wage, expand.grid(age = seq(min(age), max(age), length.out = 100),
                                         maritl = levels(maritl),
                                         jobclass = levels(jobclass)))

wage_data_pred$wage_pred <- predict(gam_fit, newdata = wage_data_pred)

# Plotting the effect of Job Class on Wage across Age
ggplot(wage_data_pred, aes(x = age, y = wage_pred, color = jobclass)) +
  geom_line() +
  labs(title = "Predicted Wage Across Age by Job Class", x = "Age", y = "Predicted Wage") +
  theme_minimal() +
  scale_color_brewer(palette = "Set2")

(9)

a) Fit a Cubic Polynomial Regression:

Explanation:

In this phase, the Boston dataset is imported, encompassing the dis variable (the weighted average distance to employment centers) and the nox variable (the concentration of nitrogen oxides). A cubic polynomial regression model is constructed to predict nox based on dis using the lm function, with the poly function generating the necessary polynomial terms. Following this, a summary of the regression results is presented, and a scatter plot is generated to illustrate both the dataset and the polynomial regression curve.

library(MASS)
# Load the Boston dataset
data(Boston)
model_cubic <- lm(nox ~ poly(dis, 3), data = Boston)
summary(model_cubic)
## 
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.554695   0.002759 201.021  < 2e-16 ***
## poly(dis, 3)1 -2.003096   0.062071 -32.271  < 2e-16 ***
## poly(dis, 3)2  0.856330   0.062071  13.796  < 2e-16 ***
## poly(dis, 3)3 -0.318049   0.062071  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16
# Plot the data and the polynomial fit (ordering by dis so the curve draws left to right)
plot(Boston$dis, Boston$nox, xlab = "Weighted Mean Distance to Employment Centers", 
     ylab = "Nitrogen Oxides Concentration", main = "Cubic Polynomial Regression")
ord <- order(Boston$dis)
lines(Boston$dis[ord], predict(model_cubic)[ord], col = "green", lwd = 2)

b) Plot Polynomial Fits for Different Degrees:

Explanation: We create a function named fit_and_rss that is responsible for constructing polynomial regression models of different degrees, from 1 to 10, and computing the Residual Sum of Squares (RSS) for each. By iterating over these degrees, we fit each model and keep track of the corresponding RSS figures. Subsequently, we generate a plot that maps the RSS against the degrees of the polynomials, providing insight into how the complexity of the model influences the RSS.

# Create a function to fit polynomial and calculate RSS
fit_and_rss <- function(degree) {
  model <- lm(nox ~ poly(dis, degree), data = Boston)
  rss <- sum(model$residuals^2)
  return(list(model = model, rss = rss))
}

# Initialize an empty dataframe to store results
results <- data.frame(Degree = 1:10, RSS = NA)

# Fit polynomials for different degrees and calculate RSS
for (degree in 1:10) {
  result <- fit_and_rss(degree)
  results[degree, "RSS"] <- result$rss
}

# Plot RSS for different polynomial degrees
plot(results$Degree, results$RSS, type = "l", 
     xlab = "Polynomial Degree", ylab = "Residual Sum of Squares")

c) Cross-Validation to Select the Polynomial Degree:

Explanation: We use 10-fold cross-validation (cv.glm from the boot package) to estimate the test error of polynomial fits of degree 1 through 10, guarding against any non-finite error values. The degree that minimizes the CV error is selected and highlighted on the plot of CV error against degree.

library(boot)

#cross-validation error calculation to avoid non-finite values
cv_errors <- sapply(1:10, function(d) {
  cv_model <- cv.glm(Boston, glm(nox ~ poly(dis, d), data = Boston), K = 10)
  if (all(is.finite(cv_model$delta))) {
    return(cv_model$delta[1])
  } else {
    return(NA)  # Return NA for non-finite values
  }
})

# Filter out non-finite values (if any) for plotting
finite_indices <- which(!is.na(cv_errors))
finite_cv_errors <- cv_errors[finite_indices]

if (length(finite_cv_errors) > 0) {
  # Proceed with plotting only if there are finite CV error values
  plot(finite_indices, finite_cv_errors, type = "b", xlab = "Degree of Polynomial", ylab = "CV Error", main = "CV Error by Polynomial Degree")
  optimal_degree <- finite_indices[which.min(finite_cv_errors)]
  points(optimal_degree, min(finite_cv_errors), col = "red", cex = 2, pch = 20)
  cat("Optimal degree of polynomial:", optimal_degree, "\n")
} else {
  cat("All CV errors are non-finite. Check the data and model fitting process.\n")
}

## Optimal degree of polynomial: 3

d) Fit a Regression Spline with Four Degrees of Freedom:

Explanation: We import the splines package and use it to fit a regression spline with four degrees of freedom for predicting nox from dis. The bs() function generates a cubic spline basis with the designated degrees of freedom, placing knots at quantiles of dis by default. After fitting, a summary of the spline model is presented, and a scatter plot depicts both the observed data points and the spline curve, offering a visual sense of how well the spline captures the relationship between dis and nox.

# Fit a regression spline with 4 degrees of freedom
library(splines)
model_spline <- lm(nox ~ bs(dis, df = 4), data = Boston)

# Display the output
summary(model_spline)
## 
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.124622 -0.039259 -0.008514  0.020850  0.193891 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.73447    0.01460  50.306  < 2e-16 ***
## bs(dis, df = 4)1 -0.05810    0.02186  -2.658  0.00812 ** 
## bs(dis, df = 4)2 -0.46356    0.02366 -19.596  < 2e-16 ***
## bs(dis, df = 4)3 -0.19979    0.04311  -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881    0.04551  -8.544  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared:  0.7164, Adjusted R-squared:  0.7142 
## F-statistic: 316.5 on 4 and 501 DF,  p-value: < 2.2e-16
# Plot the data and the spline fit (again ordering by dis)
plot(Boston$dis, Boston$nox, xlab = "Weighted Mean Distance to Employment Centers", 
     ylab = "Nitrogen Oxides Concentration", main = "Regression Spline (4 Degrees of Freedom)")
ord <- order(Boston$dis)
lines(Boston$dis[ord], predict(model_spline)[ord], col = "blue", lwd = 2)

e) Fit Regression Splines for a Range of Degrees of Freedom and Plot RSS:

Explanation: Similar to step b), we fit regression splines with degrees of freedom from 3 to 10 (bs() needs at least 3 for a cubic spline) and compute the Residual Sum of Squares (RSS) for each fit. Plotting RSS against degrees of freedom shows how the spline's flexibility relates to its in-sample fit. The degrees of freedom at which RSS is minimized gives the closest in-sample fit, but since RSS always favors more flexibility, the cross-validation in part f) provides the proper selection criterion.

results_spline <- data.frame(DegreesOfFreedom = 3:10, RSS = NA)

# Fit spline for different degrees of freedom (starting from 3) and calculate RSS
for (df in 3:10) {
  model <- lm(nox ~ bs(dis, df = df), data = Boston)
  rss <- sum(model$residuals^2)
  results_spline[df - 2, "RSS"] <- rss  # Adjust the indexing for results_spline
}

# Plot RSS for different degrees of freedom
plot(results_spline$DegreesOfFreedom, results_spline$RSS, type = "l", 
     xlab = "Degrees of Freedom", ylab = "Residual Sum of Squares")
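To make the selection explicit, a short follow-up can report the minimizing value (best_df is a name introduced here):

# Degrees of freedom with the smallest in-sample RSS
best_df <- results_spline$DegreesOfFreedom[which.min(results_spline$RSS)]
cat("Degrees of freedom minimizing RSS:", best_df, "\n")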

f) Cross-Validation to Select the Spline Degrees of Freedom:

Explanation: We repeat the 10-fold cross-validation, now over spline fits with 3 to 10 degrees of freedom, and select the value that minimizes the CV error.

library(boot)

cv_spline_errors <- sapply(3:10, function(df) {
  cv_model <- cv.glm(Boston, glm(nox ~ bs(dis, df = df), data = Boston), K = 10)
  if (all(is.finite(cv_model$delta))) {
    return(cv_model$delta[1])
  } else {
    return(NA)  
  }
})
## Warning in bs(dis, degree = 3L, ...): some 'x' values beyond boundary knots
## may cause ill-conditioned bases
## (the same warning repeats for each CV fold and each df; one instance shown)
# Ensuring that we plot only the finite CV error values
finite_indices <- which(!is.na(cv_spline_errors))
finite_cv_errors <- cv_spline_errors[finite_indices]

if (length(finite_cv_errors) > 0) {
  # Proceed with plotting only if there are finite values
  plot(finite_indices + 2, finite_cv_errors, type = "b",  # +2 to adjust for the starting index of 3
       xlab = "Degrees of Freedom", ylab = "CV Error", main = "CV Error by Degrees of Freedom for Splines")
  optimal_df <- finite_indices[which.min(finite_cv_errors)] + 2  # Adjust for the starting index of 3
  points(optimal_df, min(finite_cv_errors), col = "red", cex = 2, pch = 20)
  cat("Optimal degrees of freedom for spline:", optimal_df, "\n")
} else {
  cat("All CV errors are non-finite. Please check the model fitting process and data.\n")
}

## Optimal degrees of freedom for spline: 10

Final Answer: In our R-based analysis, we initially conducted a cubic polynomial regression to forecast the concentration of nitrogen oxides (nox) as a function of the distances to employment centers (dis). Subsequently, we evaluated polynomial models of different degrees to ascertain the most appropriate degree, employing cross-validation for selection. Following this, we implemented regression splines with a range of degrees of freedom, identifying the optimal degree through cross-validation. This thorough examination aids in pinpointing the most fitting models for the prediction of nox based on dis within the context of the Boston dataset.
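As a visual wrap-up, the two cross-validation-selected fits can be overlaid on the data; this sketch reuses the degree-3 polynomial and the df-10 spline chosen above (dis_grid is a name introduced here):

library(MASS); library(splines)
dis_grid <- data.frame(dis = seq(min(Boston$dis), max(Boston$dis), length.out = 200))
plot(Boston$dis, Boston$nox, col = "grey",
     xlab = "Weighted Mean Distance to Employment Centers",
     ylab = "Nitrogen Oxides Concentration",
     main = "CV-Selected Polynomial vs. Spline")
lines(dis_grid$dis, predict(lm(nox ~ poly(dis, 3), data = Boston), dis_grid), col = "green", lwd = 2)
lines(dis_grid$dis, predict(lm(nox ~ bs(dis, df = 10), data = Boston), dis_grid), col = "blue", lwd = 2)
legend("topright", legend = c("Degree-3 polynomial", "df-10 spline"),
       col = c("green", "blue"), lwd = 2)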

(11)

(a) Generate response Y and predictors X1, X2 with n=100

set.seed(1)  # For reproducibility

# (a) Generate the response Y and predictors X1 and X2
n <- 100
X1 <- rnorm(n)
X2 <- rnorm(n)
beta0 <- 1.5  # True intercept
beta1 <- 2    # True coefficient for X1
beta2 <- -0.5 # True coefficient for X2
Y <- beta0 + beta1 * X1 + beta2 * X2 + rnorm(n)

# (b) Initialize beta1_hat and beta2_hat to an arbitrary value
beta1_hat <- 0
beta2_hat <- 0

# Vectors to store the coefficients and RSS
beta0_hats <- numeric(1000)
beta1_hats <- numeric(1000)
beta2_hats <- numeric(1000)
RSS <- numeric(1000)

# (c) & (d) Backfitting loop
for(i in 1:1000) {
  # Keeping beta1_hat fixed, fit the model for beta0 and beta2
  fit1 <- lm(Y - beta1_hat * X1 ~ X2)
  beta0_hat <- coef(fit1)["(Intercept)"]
  beta2_hat <- coef(fit1)["X2"]
  
  # Keeping beta2_hat fixed, fit the model for beta0 and beta1
  fit2 <- lm(Y - beta2_hat * X2 ~ X1)
  # beta0 is kept from fit1; fit2's intercept converges to the same value
  beta1_hat <- coef(fit2)["X1"]
  
  # Store coefficients
  beta0_hats[i] <- beta0_hat
  beta1_hats[i] <- beta1_hat
  beta2_hats[i] <- beta2_hat
  # Store RSS
  RSS[i] <- sum(resid(fit2)^2)
}

# Creating a dataframe to store the results for plotting
results <- data.frame(
  iteration = 1:1000,
  beta0_hat = beta0_hats,
  beta1_hat = beta1_hats,
  beta2_hat = beta2_hats,
  RSS = RSS
)

# Plotting the estimates over iterations
library(ggplot2)
ggplot(results, aes(x = iteration)) +
  geom_line(aes(y = beta0_hat, color = "beta0_hat")) +
  geom_line(aes(y = beta1_hat, color = "beta1_hat")) +
  geom_line(aes(y = beta2_hat, color = "beta2_hat")) +
  labs(color = "Parameter Estimate") +
  theme_minimal() +
  ggtitle("Convergence of Coefficient Estimates in Backfitting Algorithm") +
  xlab("Iteration") +
  ylab("Coefficient Estimate")

# Print the last set of estimates
print(tail(results, 1))
##      iteration beta0_hat beta1_hat  beta2_hat      RSS
## 1000      1000  1.525353   2.02111 -0.5534668 105.6054

f & g)

To compare the backfitting estimates with the ordinary least squares (OLS) multiple linear regression estimates, we fit a multiple regression of Y on both X1 and X2. Then, using the abline() function in R, we overlay the OLS coefficient estimates as horizontal dashed lines on the plot of the backfitting trajectories.

# Perform OLS multiple linear regression
fit_ols <- lm(Y ~ X1 + X2)

# Get coefficients from the OLS fit
beta0_ols <- coef(fit_ols)[1]
beta1_ols <- coef(fit_ols)[2]
beta2_ols <- coef(fit_ols)[3]

# Plotting the backfitting process and overlay the OLS estimates
plot(results$iteration, results$beta0_hat, type='l', col='blue', ylim=range(c(results$beta0_hat, results$beta1_hat, results$beta2_hat, beta0_ols, beta1_ols, beta2_ols)), ylab='Coefficient Estimate', xlab='Iteration', main='Backfitting vs OLS Estimates')
lines(results$iteration, results$beta1_hat, col='red')
lines(results$iteration, results$beta2_hat, col='green')
abline(h=beta0_ols, col='blue', lty=2)
abline(h=beta1_ols, col='red', lty=2)
abline(h=beta2_ols, col='green', lty=2)
legend("topright", legend=c("beta0_hat", "beta1_hat", "beta2_hat", "OLS beta0", "OLS beta1", "OLS beta2"), col=c("blue", "red", "green", "blue", "red", "green"), lty=c(1, 1, 1, 2, 2, 2))

# (g) Determine how many iterations to get a good approximation
tolerance <- 1e-3  # Define a tolerance level
good_approx <- which(abs(results$beta1_hat - beta1_ols) < tolerance & abs(results$beta2_hat - beta2_ols) < tolerance)[1]

# Print how many iterations were required
print(paste("Good approximation achieved at iteration:", good_approx))
## [1] "Good approximation achieved at iteration: 2"