Importing necessary libaries and datasets

library(ISLR)

## Warning: package 'ISLR' was built under R version 4.3.2

library(boot) 
library(MASS)
library(tidyverse)

## Warning: package 'dplyr' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(caret)

## Warning: package 'caret' was built under R version 4.3.2

## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

library(viridis)

## Warning: package 'viridis' was built under R version 4.3.3

## Loading required package: viridisLite

library(splines)
library(gam)

## Warning: package 'gam' was built under R version 4.3.3

## Loading required package: foreach

## Warning: package 'foreach' was built under R version 4.3.2

## 
## Attaching package: 'foreach'
## 
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## 
## Loaded gam 1.22-3

6

a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial ft to the data.

data(Wage)
set.seed(241)
train_indices <- sample(1:nrow(Wage), 0.7 * nrow(Wage))
train_data <- Wage[train_indices, ]
test_data <- Wage[-train_indices, ]

cross_val <- data.frame()
for (degree in 1:10) {
  model <- lm(wage ~ poly(age, degree, raw = TRUE), data = train_data)
  predictions <- predict(model, newdata = test_data)
  cv_error <- RMSE(predictions, test_data$wage)
  cross_val <- rbind(cross_val, data.frame(degree = degree, cv_error = cv_error))
}


optimal_degree <- cross_val %>% 
  arrange(cv_error) %>%
  slice(1) %>%
  pull(degree)
print(paste("Optimal degree selected by cross-validation:", optimal_degree))

## [1] "Optimal degree selected by cross-validation: 3"

The optimal degree selected by cross-validation for performing polynomial regression is “3”.

final_model <- lm(wage ~ poly(age, optimal_degree, raw = TRUE), data = Wage)
anova_results <- anova(final_model)
print(anova_results)

## Analysis of Variance Table
## 
## Response: wage
##                                         Df  Sum Sq Mean Sq F value    Pr(>F)
## poly(age, optimal_degree, raw = TRUE)    3  444411  148137  92.894 < 2.2e-16
## Residuals                             2996 4777674    1595                  
##                                          
## poly(age, optimal_degree, raw = TRUE) ***
## Residuals                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From the output of ANOVA, we can observe that the degree is “3”.

The degree chosen by both cross-validation and ANOVA for polynomial regression is 3.

age_grid <- seq(min(Wage$age), max(Wage$age), length.out = 100)
pred <- predict(final_model, newdata = data.frame(age = age_grid), se.fit = TRUE)


ggplot() +
  geom_point(data = Wage, aes(x = age, y = wage), color = viridis(1, alpha = 0.5), size = 2) +
  geom_line(aes(x = age_grid, y = pred$fit), color = "green", size = 1.2) +
  geom_ribbon(aes(x = age_grid, ymin = pred$fit - 1.96 * pred$se.fit, ymax = pred$fit + 1.96 * pred$se.fit),
              fill = "green", alpha = 0.2) +
  labs(x = "Age", y = "Wage", title = "Polynomial Regression", subtitle = paste("Degree:", optimal_degree)) +
  theme_minimal() +
  theme(plot.title = element_text(face = "bold", size = 16),
        plot.subtitle = element_text(size = 12),
        axis.title = element_text(size = 14),
        panel.grid.major = element_line(color = "grey90"))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#### b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the ft obtained.

mse <- function(y_true, y_pred) {
  mean((y_true - y_pred)^2)
}

cross_val <- data.frame()
for (cuts in 2:10) {
  wage_bins <- cut(train_data$age, cuts)
  wage_means <- tapply(train_data$wage, wage_bins, mean)
  cv_error <- mse(train_data$wage, wage_means[wage_bins])
  cross_val <- rbind(cross_val, data.frame(cuts = cuts, cv_error = cv_error))
}

optimal_cuts <- cross_val %>%
  arrange(cv_error) %>%
  slice(1) %>%
  pull(cuts)
print(paste("Optimal number of cuts selected by cross-validation:", optimal_cuts))

## [1] "Optimal number of cuts selected by cross-validation: 10"

wage_bins <- cut(Wage$age, optimal_cuts)
wage_means <- tapply(Wage$wage, wage_bins, mean)
breaks <- sort(unique(as.numeric(wage_bins)))
wage_means <- c(mean(Wage$wage), wage_means)
step_function <- stepfun(breaks, wage_means)

plot(step_function, col = "blue", main = "Step Function for Wage Prediction", xlab = "Age", ylab = "Mean Wage")

## 7 The Wage data set contains a number of other features not explored in this chapter, such as marital status (maritl), job class (jobclass), and others. Explore the relationships between some of these other predictors and wage, and use non-linear ftting techniques in order to ft fexible models to the data. Create plots of the results obtained, and write a summary of your fndings.

summary(Wage$maritl)

## 1. Never Married       2. Married       3. Widowed      4. Divorced 
##              648             2074               19              204 
##     5. Separated 
##               55

summary(Wage$jobclass)

##  1. Industrial 2. Information 
##           1544           1456

summary(Wage$education)

##       1. < HS Grad         2. HS Grad    3. Some College    4. College Grad 
##                268                971                650                685 
## 5. Advanced Degree 
##                426

summary(Wage$region)

##        1. New England    2. Middle Atlantic 3. East North Central 
##                     0                  3000                     0 
## 4. West North Central     5. South Atlantic 6. East South Central 
##                     0                     0                     0 
## 7. West South Central           8. Mountain            9. Pacific 
##                     0                     0                     0

summary(Wage$health)

##      1. <=Good 2. >=Very Good 
##            858           2142

summary(Wage$health_ins)

## 1. Yes  2. No 
##   2083    917

# Relationship between wage and education
Wage$education <- factor(Wage$education)
gam_model <- gam(wage ~ education, data = Wage)

Wage$predicted_wage <- predict(gam_model, newdata = Wage)
ggplot(Wage, aes(x = education, y = predicted_wage)) +
  geom_boxplot() +
  labs(title = "Predicted Wage by Education Level", x = "Education Level", y = "Predicted Wage") +
  theme_minimal()

From the above chart , we see that higher levels of education correlate with higher predicted wages. Those with an advanced degree have the highest predicted wage range.

# Relationship between wage and health condition
Wage$health <- factor(Wage$health)
gam_model <- gam(wage ~ health, data = Wage)

Wage$predicted_wage_health <- predict(gam_model, newdata = Wage)
ggplot(Wage, aes(x = health, y = predicted_wage_health)) +
  geom_boxplot() +
  labs(title = "Predicted Wage by Health Level", x = "Health Level", y = "Predicted Wage") +
  theme_minimal()

The above chart analyzes health level, shows a slight increase in predicted wage for those with a ‘Very Good’ health status compared to ‘Good’ or less. This suggests that health status may have a modest effect on wage predictions.

# Relationship between wage and job class
Wage$jobclass <- factor(Wage$jobclass)
gam_model <- gam(wage ~ jobclass, data = Wage)

Wage$predicted_wage_jobclass <- predict(gam_model, newdata = Wage)
ggplot(Wage, aes(x = jobclass, y = predicted_wage_jobclass)) +
  geom_boxplot() +
  labs(title = "Predicted Wage by Job Class", x = "Job Class", y = "Predicted Wage") +
  theme_minimal()

The above compares the predicted wage by job class and shows a higher predicted wage for those in the ‘Information’ sector compared to the ‘Industrial’ sector. This aligns with the contemporary trend of information sector jobs generally offering higher wages.

# Relationship between wage and marital status
Wage$maritl <- factor(Wage$maritl)
gam_model <- gam(wage ~ maritl, data = Wage)

Wage$predicted_wage_maritl <- predict(gam_model, newdata = Wage)
ggplot(Wage, aes(x = maritl, y = predicted_wage_maritl)) +
  geom_boxplot() +
  labs(title = "Predicted Wage by Marital Status", x = "Marital Status", y = "Predicted Wage") +
  theme_minimal()

Marital status also seems to play a role in wage prediction as seen in the above chart . Those who are ‘Married’ have a higher predicted wage range compared to those who are ‘Never Married’, ‘Widowed’, ‘Divorced’, or ‘Separated’.

# Relationship between wage and health insurance
Wage$health_ins <- factor(Wage$health_ins)
gam_model <- gam(wage ~ health_ins, data = Wage)

Wage$predicted_wage_health_ins <- predict(gam_model, newdata = Wage)
ggplot(Wage, aes(x = health_ins, y = predicted_wage_health_ins)) +
  geom_boxplot() +
  labs(title = "Predicted Wage by Health Insurance Status", x = "Health Insurance Status", y = "Predicted Wage") +
  theme_minimal()

This chart indicates that having health insurance is associated with a higher predicted wage. This could be due to a variety of factors, including that jobs which offer higher wages are more likely to provide health insurance benefits.

In summary, we can conclude that higher education, better health status, employment in the information sector, being married, and having health insurance are associated with higher wage predictions.

9 This question uses the variables dis (the weighted mean of distances to fve Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

a)

data(Boston)
dis <- Boston$dis
nox <- Boston$nox
poly_fit <- lm(nox ~ poly(dis, 3, raw = TRUE))
summary(poly_fit)

## 
## Call:
## lm(formula = nox ~ poly(dis, 3, raw = TRUE))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.121130 -0.040619 -0.009738  0.023385  0.194904 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                0.9341281  0.0207076  45.110  < 2e-16 ***
## poly(dis, 3, raw = TRUE)1 -0.1820817  0.0146973 -12.389  < 2e-16 ***
## poly(dis, 3, raw = TRUE)2  0.0219277  0.0029329   7.476 3.43e-13 ***
## poly(dis, 3, raw = TRUE)3 -0.0008850  0.0001727  -5.124 4.27e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared:  0.7148, Adjusted R-squared:  0.7131 
## F-statistic: 419.3 on 3 and 502 DF,  p-value: < 2.2e-16

The regression output indicates that all the coefficients of the cubic polynomial terms are statistically significant (p-values < 0.001). The model has an adjusted R-squared value of 0.7131, suggesting that the cubic polynomial regression explains a substantial portion of the variance in nox.

plot(dis, nox, col = "blue", main = "Cubic Polynomial Regression", xlab = "dis", ylab = "nox")
lines(sort(dis), predict(poly_fit)[order(dis)], col = "red", lw = 2)

TThe above plot visualizes the relationship between dis and nox and assess the fit of the cubic polynomial regression.

b)

calculate_RSS <- function(model) {
  return(sum(model$residuals^2))
}

degrees <- 1:10
RSS_values <- numeric(length(degrees))

for (deg in degrees) {
  poly_fit <- lm(nox ~ poly(dis, deg, raw = TRUE))
  
  plot(dis, nox, col = "blue", main = paste("Degree:", deg), xlab = "dis", ylab = "nox")
  lines(sort(dis), predict(poly_fit)[order(dis)], col = "red", lw = 2)
  
  RSS_values[deg] <- calculate_RSS(poly_fit)
  
  
}

cat("Degree\tRSS\n")

## Degree   RSS

for (i in 1:length(degrees)) {
  cat(degrees[i], "\t", RSS_values[i], "\n")
}

## 1     2.768563 
## 2     2.035262 
## 3     1.934107 
## 4     1.932981 
## 5     1.91529 
## 6     1.878257 
## 7     1.849484 
## 8     1.83563 
## 9     1.833331 
## 10    1.832171

c)

k <- 5
mse <- numeric(10)

for (deg in 1:10) {
  mse_fold <- numeric(k)
  
  for (i in 1:k) {
    suppressWarnings({
      set.seed(241) 
      indices <- sample(1:length(dis), replace = FALSE)
      fold_size <- length(dis) %/% k
      test_indices <- indices[((i - 1) * fold_size + 1):(i * fold_size)]
      train_indices <- setdiff(indices, test_indices)
      dis_train <- dis[train_indices]
      nox_train <- nox[train_indices]
      dis_test <- dis[test_indices]
      nox_test <- nox[test_indices]
      
    
      poly_fit <- lm(nox_train ~ poly(dis_train, deg, raw = TRUE))
      
      nox_pred <- predict(poly_fit, newdata = data.frame(dis = dis_test))
      
      mse_fold[i] <- mean((nox_test - nox_pred)^2)
    })
  }
  
  mse[deg] <- mean(mse_fold)
}

optimal_degree <- which.min(mse)
cat("Optimal degree selected by cross-validation:", optimal_degree, "\n")

## Optimal degree selected by cross-validation: 1

The above suggests that a linear regression (polynomial of degree 1) is the optimal choice based on the cross-validation results.

d)

fit <- lm(nox ~ bs(dis, df = 4), data = Boston)
plot(Boston$dis, Boston$nox, xlab = "dis", ylab = "nox", main = "Regression Spline Fit", col = "blue")
lines(sort(Boston$dis), fitted(fit)[order(Boston$dis)], col = "red", lwd = 2)

The output shows the coefficients, standard errors, t-values, and p-values for each basis function of the regression spline. The adjusted R-squared value of 0.7142 indicates that the model explains a substantial portion of the variance in nox.

Regarding the choice of knots, when using the bs() function, the knots are chosen automatically based on the quantiles of the predictor variable . By default, the bs() function places the knots at equally spaced quantiles of the predictor variable. The number of knots is determined by the degrees of freedom.

e)

df_range <- 2:10

RSS <- numeric(length(df_range))

plot(Boston$dis, Boston$nox, xlab = "dis", ylab = "nox", main = "Regression Spline Fits", col = "blue")

for (i in seq_along(df_range)) {

  fit <- lm(nox ~ bs(dis, df = df_range[i]), data = Boston)
  
  RSS[i] <- sum(residuals(fit)^2)
  
  lines(sort(Boston$dis), fitted(fit)[order(Boston$dis)], col = rainbow(length(df_range))[i])
}

## Warning in bs(dis, df = df_range[i]): 'df' was too small; have used 3

legend("topright", legend = paste("df =", df_range), col = rainbow(length(df_range)), lty = 1, cex = 0.8)

print(data.frame(Degrees_of_Freedom = df_range, RSS = RSS))

##   Degrees_of_Freedom      RSS
## 1                  2 1.934107
## 2                  3 1.934107
## 3                  4 1.922775
## 4                  5 1.840173
## 5                  6 1.833966
## 6                  7 1.829884
## 7                  8 1.816995
## 8                  9 1.825653
## 9                 10 1.792535

The above results demonstrate the impact of varying degrees of freedom on the flexibility and complexity of regression spline fits. While higher degrees of freedom can capture more intricate patterns, they also increase the risk of overfitting.

f)

set.seed(241)
num_folds <- 5

compute_RSS <- function(df, data) {
  fit <- suppressWarnings(lm(nox ~ bs(dis, df = df), data = data))  
  RSS <- sum(residuals(fit)^2)
  return(RSS)
}

cv_results <- suppressWarnings(sapply(2:10, function(df) {
  cv <- cv.glm(Boston, glm(nox ~ bs(dis, df = df)), K = num_folds)
  return(cv$delta[1])
}))

best_df <- which.min(cv_results) + 1  

cat("Best degrees of freedom:", best_df, "\n")

## Best degrees of freedom: 6

plot(2:10, cv_results, type = "b", xlab = "Degrees of Freedom", ylab = "Cross-Validated RSS")

The output indicates that the best degrees of freedom selected by cross-validation is 6.

11) In Section 7.7, it was mentioned that GAMs are generally ft using a backftting approach. The idea behind backftting is actually quite simple. We will now explore backftting in the context of multiple linear regression.Suppose that we would like to perform multiple linear regression, butwe do not have software to do so. Instead, we only have software to perform simple linear regression. Therefore, we take the followingiterative approach: we repeatedly hold all but one coefcient estimate fxed at its current value, and update only that coefcient estimate using a simple linear regression. The process is continued until convergence—that is, until the coefcient estimates stop changing.We now try this out on a toy example.

a)

set.seed(241)
n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Initialize empty data frame to store values
iteration_data <- data.frame(iteration = integer(),
                             beta0 = numeric(),
                             beta1 = numeric(),
                             beta2 = numeric())

# Backfitting Approach
beta0_hat <- 0
beta1_hat <- 0
beta2_hat <- 0

converged <- FALSE
iteration <- 0
threshold <- 1e-6  # Set a stricter convergence threshold

while (!converged) {
  iteration <- iteration + 1

  # Fitting models
  lm_beta0 <- lm(Y - beta1_hat * X1 - beta2_hat * X2 ~ 1)
  beta0_new <- coef(lm_beta0)[1]

  lm_beta1 <- lm(Y - beta0_hat - beta2_hat * X2 ~ X1)
  beta1_new <- coef(lm_beta1)[2]

  lm_beta2 <- lm(Y - beta0_hat - beta1_hat * X1 ~ X2)
  beta2_new <- coef(lm_beta2)[2]

  # Check convergence
  if (all(abs(c(beta0_new - beta0_hat, beta1_new - beta1_hat, beta2_new - beta2_hat)) < threshold)) {
    converged <- TRUE
  }

  # Update estimates
  beta0_hat <- beta0_new
  beta1_hat <- beta1_new
  beta2_hat <- beta2_new

  # Store values in data frame
  iteration_data <- rbind(iteration_data, data.frame(iteration = iteration,
                                                     beta0 = beta0_hat,
                                                     beta1 = beta1_hat,
                                                     beta2 = beta2_hat))
}

# Plot
ggplot(iteration_data, aes(x = iteration)) +
  geom_line(aes(y = beta0, color = "Beta0")) +
  geom_line(aes(y = beta1, color = "Beta1")) +
  geom_line(aes(y = beta2, color = "Beta2")) +
  labs(title = "Convergence of Beta Estimates",
       x = "Iteration",
       y = "Beta Value",
       color = "Parameter") +
  scale_color_manual(values = c("Beta0" = "blue", "Beta1" = "red", "Beta2" = "green")) +
  theme_minimal()

# Print Results for Selected Iterations
selected_iterations <- c(1:5, seq(50, 1000, by = 50))
selected_iterations <- selected_iterations[selected_iterations <= iteration]  # Ensure selected iterations are within the actual iteration count



for (i in selected_iterations) {
  cat("Iteration:", i, "\n")
  cat("Beta0:", round(iteration_data$beta0[i], 3), "Beta1:", round(iteration_data$beta1[i], 3), "Beta2:", round(iteration_data$beta2[i], 3), "\n\n")
}

## Iteration: 1 
## Beta0: -4.184 Beta1: 2.72 Beta2: -4.305 
## 
## Iteration: 2 
## Beta0: 1.227 Beta1: 2.953 Beta2: -4.412 
## 
## Iteration: 3 
## Beta0: 1.382 Beta1: 2.959 Beta2: -4.421 
## 
## Iteration: 4 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 5 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421

b)

n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Initialize empty data frame to store values
iteration_data <- data.frame(iteration = integer(),
                             beta0 = numeric(),
                             beta1 = numeric(),
                             beta2 = numeric())

# Backfitting Approach
beta0_hat <- 0
beta1_hat <- 10
beta2_hat <- 0

converged <- FALSE
iteration <- 0
threshold <- 1e-6  # Set a stricter convergence threshold

while (!converged) {
  iteration <- iteration + 1

  # Fitting models
  lm_beta0 <- lm(Y - beta1_hat * X1 - beta2_hat * X2 ~ 1)
  beta0_new <- coef(lm_beta0)[1]

  lm_beta1 <- lm(Y - beta0_hat - beta2_hat * X2 ~ X1)
  beta1_new <- coef(lm_beta1)[2]

  lm_beta2 <- lm(Y - beta0_hat - beta1_hat * X1 ~ X2)
  beta2_new <- coef(lm_beta2)[2]

  # Check convergence
  if (all(abs(c(beta0_new - beta0_hat, beta1_new - beta1_hat, beta2_new - beta2_hat)) < threshold)) {
    converged <- TRUE
  }

  # Update estimates
  beta0_hat <- beta0_new
  beta1_hat <- beta1_new
  beta2_hat <- beta2_new

  # Store values in data frame
  iteration_data <- rbind(iteration_data, data.frame(iteration = iteration,
                                                     beta0 = beta0_hat,
                                                     beta1 = beta1_hat,
                                                     beta2 = beta2_hat))
}

# Plot
ggplot(iteration_data, aes(x = iteration)) +
  geom_line(aes(y = beta0, color = "Beta0")) +
  geom_line(aes(y = beta1, color = "Beta1")) +
  geom_line(aes(y = beta2, color = "Beta2")) +
  labs(title = "Convergence of Beta Estimates",
       x = "Iteration",
       y = "Beta Value",
       color = "Parameter") +
  scale_color_manual(values = c("Beta0" = "blue", "Beta1" = "red", "Beta2" = "green")) +
  theme_minimal()

# Print Results for Selected Iterations
selected_iterations <- c(1:6, seq(50, 1000, by = 50))
selected_iterations <- selected_iterations[selected_iterations <= iteration]  # Ensure selected iterations are within the actual iteration count



for (i in selected_iterations) {
  cat("Iteration:", i, "\n")
  cat("Beta0:", round(iteration_data$beta0[i], 3), "Beta1:", round(iteration_data$beta1[i], 3), "Beta2:", round(iteration_data$beta2[i], 3), "\n\n")
}

## Iteration: 1 
## Beta0: -3.942 Beta1: 2.922 Beta2: -4.501 
## 
## Iteration: 2 
## Beta0: 1.479 Beta1: 2.946 Beta2: -4.477 
## 
## Iteration: 3 
## Beta0: 1.452 Beta1: 2.945 Beta2: -4.477 
## 
## Iteration: 4 
## Beta0: 1.452 Beta1: 2.945 Beta2: -4.477 
## 
## Iteration: 5 
## Beta0: 1.452 Beta1: 2.945 Beta2: -4.477

c)

n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Initialize empty data frame to store values
iteration_data <- data.frame(iteration = integer(),
                             beta0 = numeric(),
                             beta1 = numeric(),
                             beta2 = numeric())

# Backfitting Approach
beta0_hat <- 0
beta1_hat <- 10
beta2_hat <- 0

converged <- FALSE
iteration <- 0
threshold <- 1e-6  # Set a stricter convergence threshold

while (!converged) {
  iteration <- iteration + 1

  # Fitting models
  lm_beta0 <- lm(Y - beta1_hat * X1 - beta2_hat * X2 ~ 1)
  beta0_new <- coef(lm_beta0)[1]

  lm_beta2_adjusted <- lm(Y - beta1_hat * X1 ~ X2)
  beta2_new <- coef(lm_beta2_adjusted)[2]

  # Check convergence
  if (all(abs(c(beta0_new - beta0_hat, beta2_new - beta2_hat)) < threshold)) {
    converged <- TRUE
  }

  # Update estimates
  beta0_hat <- beta0_new
  beta2_hat <- beta2_new

  # Store values in data frame
  iteration_data <- rbind(iteration_data, data.frame(iteration = iteration,
                                                     beta0 = beta0_hat,
                                                     beta1 = beta1_hat,
                                                     beta2 = beta2_hat))
}

# Plot
ggplot(iteration_data, aes(x = iteration)) +
  geom_line(aes(y = beta0, color = "Beta0")) +
  geom_line(aes(y = beta1, color = "Beta1")) +
  geom_line(aes(y = beta2, color = "Beta2")) +
  labs(title = "Convergence of Beta Estimates",
       x = "Iteration",
       y = "Beta Value",
       color = "Parameter") +
  scale_color_manual(values = c("Beta0" = "blue", "Beta1" = "red", "Beta2" = "green")) +
  theme_minimal()

# Print Results for Selected Iterations
selected_iterations <- c(1:5, seq(50, 1000, by = 50))
selected_iterations <- selected_iterations[selected_iterations <= iteration]  # Ensure selected iterations are within the actual iteration count

for (i in selected_iterations) {
  cat("Iteration:", i, "\n")
  cat("Beta0:", round(iteration_data$beta0[i], 3), "Beta1:", round(beta1_hat, 3), "Beta2:", round(iteration_data$beta2[i], 3), "\n\n")
}

## Iteration: 1 
## Beta0: -2.36 Beta1: 10 Beta2: -3.988 
## 
## Iteration: 2 
## Beta0: 1.624 Beta1: 10 Beta2: -3.988 
## 
## Iteration: 3 
## Beta0: 1.624 Beta1: 10 Beta2: -3.988

d)

n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Initialize empty data frame to store values
iteration_data <- data.frame(iteration = integer(),
                             beta0 = numeric(),
                             beta1 = numeric(),
                             beta2 = numeric())

# Backfitting Approach
beta0_hat <- 0
beta1_hat <- 10
beta2_hat <- 0

converged <- FALSE
iteration <- 0
threshold <- 1e-6  # Set a stricter convergence threshold

while (!converged) {
  iteration <- iteration + 1

  # Fitting models
  lm_beta0 <- lm(Y - beta1_hat * X1 - beta2_hat * X2 ~ 1)
  beta0_new <- coef(lm_beta0)[1]

  lm_beta1_adjusted <- lm(Y - beta2_hat * X2 ~ X1)
  beta1_new <- coef(lm_beta1_adjusted)[2]

  # Check convergence
  if (all(abs(c(beta0_new - beta0_hat, beta1_new - beta1_hat)) < threshold)) {
    converged <- TRUE
  }

  # Update estimates
  beta0_hat <- beta0_new
  beta1_hat <- beta1_new

  # Store values in data frame
  iteration_data <- rbind(iteration_data, data.frame(iteration = iteration,
                                                     beta0 = beta0_hat,
                                                     beta1 = beta1_hat,
                                                     beta2 = beta2_hat))
}

# Plot
ggplot(iteration_data, aes(x = iteration)) +
  geom_line(aes(y = beta0, color = "Beta0")) +
  geom_line(aes(y = beta1, color = "Beta1")) +
  geom_line(aes(y = beta2, color = "Beta2")) +
  labs(title = "Convergence of Beta Estimates",
       x = "Iteration",
       y = "Beta Value",
       color = "Parameter") +
  scale_color_manual(values = c("Beta0" = "blue", "Beta1" = "red", "Beta2" = "green")) +
  theme_minimal()

# Print Results for Selected Iterations
selected_iterations <- c(1:5, seq(50, 1000, by = 50))
selected_iterations <- selected_iterations[selected_iterations <= iteration]  # Ensure selected iterations are within the actual iteration count

for (i in selected_iterations) {
  cat("Iteration:", i, "\n")
  cat("Beta0:", round(iteration_data$beta0[i], 3), "Beta1:", round(iteration_data$beta1[i], 3), "Beta2:", round(beta2_hat, 3), "\n\n")
}

## Iteration: 1 
## Beta0: -2.28 Beta1: 2.695 Beta2: 0 
## 
## Iteration: 2 
## Beta0: -2.43 Beta1: 2.695 Beta2: 0 
## 
## Iteration: 3 
## Beta0: -2.43 Beta1: 2.695 Beta2: 0

e)

library(ggplot2)

# Generate Data
set.seed(241)
n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Initialize empty data frame to store values
iteration_data <- data.frame(iteration = integer(),
                             beta0 = numeric(),
                             beta1 = numeric(),
                             beta2 = numeric())

# Backfitting Approach
beta0_hat <- 0
beta1_hat <- 10
beta2_hat <- 0

threshold <- 1e-6  # Set a stricter convergence threshold

for (iteration in 1:1000) {
  # Step (c): Keeping beta1 fixed, fit the model for beta2
  a <- Y - beta1_hat * X1
  beta2_hat <- lm(a ~ X2)$coef[2]

  # Step (d): Keeping beta2 fixed, fit the model for beta1
  a <- Y - beta2_hat * X2
  beta1_hat <- lm(a ~ X1)$coef[2]

  # Estimate beta0
  beta0_hat <- mean(Y - beta1_hat * X1 - beta2_hat * X2)

  # Store values in data frame
  iteration_data <- rbind(iteration_data, data.frame(iteration = iteration,
                                                     beta0 = beta0_hat,
                                                     beta1 = beta1_hat,
                                                     beta2 = beta2_hat))
}

# Plot
ggplot(iteration_data, aes(x = iteration)) +
  geom_line(aes(y = beta0, color = "Beta0")) +
  geom_line(aes(y = beta1, color = "Beta1")) +
  geom_line(aes(y = beta2, color = "Beta2")) +
  labs(title = "Convergence of Beta Estimates",
       x = "Iteration",
       y = "Beta Value",
       color = "Parameter") +
  scale_color_manual(values = c("Beta0" = "blue", "Beta1" = "red", "Beta2" = "green")) +
  theme_minimal()

# Print Results for Selected Iterations
selected_iterations <- c(1:5, seq(50, 1000, by = 50))

for (i in selected_iterations) {
  cat("Iteration:", i, "\n")
  cat("Beta0:", round(iteration_data$beta0[i], 3), "Beta1:", round(iteration_data$beta1[i], 3), "Beta2:", round(iteration_data$beta2[i], 3), "\n\n")
}

## Iteration: 1 
## Beta0: 1.723 Beta1: 2.975 Beta2: -4.699 
## 
## Iteration: 2 
## Beta0: 1.395 Beta1: 2.96 Beta2: -4.422 
## 
## Iteration: 3 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 4 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 5 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 50 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 100 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 150 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 200 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 250 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 300 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 350 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 400 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 450 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 500 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 550 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 600 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 650 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 700 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 750 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 800 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 850 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 900 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 950 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421 
## 
## Iteration: 1000 
## Beta0: 1.394 Beta1: 2.96 Beta2: -4.421

f)

n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Initialize empty data frame to store values
iteration_data <- data.frame(iteration = integer(),
                             beta0 = numeric(),
                             beta1 = numeric(),
                             beta2 = numeric())

# Backfitting Approach
beta0_hat <- 0
beta1_hat <- 10
beta2_hat <- 0

threshold <- 1e-6  # Set a stricter convergence threshold

for (iteration in 1:1000) {
  # Step (c): Keeping beta1 fixed, fit the model for beta2
  a <- Y - beta1_hat * X1
  beta2_hat <- lm(a ~ X2)$coef[2]

  # Step (d): Keeping beta2 fixed, fit the model for beta1
  a <- Y - beta2_hat * X2
  beta1_hat <- lm(a ~ X1)$coef[2]

  # Estimate beta0
  beta0_hat <- mean(Y - beta1_hat * X1 - beta2_hat * X2)

  # Store values in data frame
  iteration_data <- rbind(iteration_data, data.frame(iteration = iteration,
                                                     beta0 = beta0_hat,
                                                     beta1 = beta1_hat,
                                                     beta2 = beta2_hat))
}

# Multiple Linear Regression
mlr_model <- lm(Y ~ X1 + X2, data = data)
mlr_coefs <- coef(mlr_model)

# Plot with MLR Coefficients Overlay
ggplot(iteration_data, aes(x = iteration)) +
  geom_line(aes(y = beta0, color = "Beta0")) +
  geom_line(aes(y = beta1, color = "Beta1")) +
  geom_line(aes(y = beta2, color = "Beta2")) +
  geom_hline(yintercept = mlr_coefs[1], color = "blue", linetype = "dashed") +
  geom_hline(yintercept = mlr_coefs[2], color = "red", linetype = "dashed") +
  geom_hline(yintercept = mlr_coefs[3], color = "green", linetype = "dashed") +
  labs(title = "Convergence of Beta Estimates with MLR Coefficients",
       x = "Iteration",
       y = "Beta Value",
       color = "Parameter") +
  scale_color_manual(values = c("Beta0" = "blue", "Beta1" = "red", "Beta2" = "green")) +
  theme_minimal()

The plot showing the convergence of the iterative estimates of (beta_0), (beta_1), and (beta_2) across 1,000 iterations, with the estimates from the multiple linear regression model overlaid as dashed lines. This visual comparison will help to assess how closely the iterative approach approximates the results of the standard multiple linear regression analysis.

g)

n <- 100
X1 <- rnorm(n, mean = 0, sd = sqrt(2))
X2 <- rnorm(n, mean = 1, sd = sqrt(2))
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1)
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon
data <- data.frame(Y, X1, X2)

# Multiple Linear Regression for benchmark
mlr_model <- lm(Y ~ X1 + X2, data = data)
mlr_coefs <- coef(mlr_model)

# Initialize variables for iterative process
beta0_hat <- 0
beta1_hat <- 10 # Starting with a value far from the expected to demonstrate convergence
beta2_hat <- 0
convergence_threshold <- 0.01
max_iterations <- 1000
converged <- FALSE

# Iterative process
for (iteration in 1:max_iterations) {
  # Update beta2_hat
  a <- Y - beta1_hat * X1
  beta2_hat <- lm(a ~ X2)$coef[2]
  
  # Update beta1_hat
  a <- Y - beta2_hat * X2
  beta1_hat <- lm(a ~ X1)$coef[2]
  
  # Update beta0_hat
  beta0_hat <- mean(Y - beta1_hat * X1 - beta2_hat * X2)
  
  # Check convergence
  if (abs(beta0_hat - mlr_coefs[1]) < convergence_threshold &&
      abs(beta1_hat - mlr_coefs[2]) < convergence_threshold &&
      abs(beta2_hat - mlr_coefs[3]) < convergence_threshold) {
    converged <- TRUE
    cat("Converged at iteration:", iteration, "\n")
    break
  }
}

## Converged at iteration: 2

if (!converged) {
  cat("Did not converge within", max_iterations, "iterations.\n")
}

# Final Estimates
cat("Final Estimates after convergence check:\n")

## Final Estimates after convergence check:

cat("Beta0:", beta0_hat, "Beta1:", beta1_hat, "Beta2:", beta2_hat, "\n")

## Beta0: 1.332806 Beta1: 3.007769 Beta2: -4.441211

The model converged after two iterations, with the final estimates indicating that for every unit increase in Beta1, the response variable increases by approximately 3.01 units

Data_Analytics_Lab_8

Surya

Importing necessary libaries and datasets

6

a) Perform polynomial regression to predict wage using age. Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial ft to the data.

9 This question uses the variables dis (the weighted mean of distances to fve Boston employment centers) and nox (nitrogen oxides concentration in parts per 10 million) from the Boston data. We will treat dis as the predictor and nox as the response.

a)

b)

c)

d)

e)

f)

a)

b)

c)

d)

e)

f)

g)