library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.3
library(boot)
library(ggplot2)
library(splines)
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.9-1. For overview type 'help("mgcv-package")'.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:nlme':
##
## collapse
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Load the data
data(Wage)
# Set seed for reproducibility
set.seed(1)
# We'll try degrees from 1 to 10
cv.errors <- rep(0, 10)
for (d in 1:10) {
  glm.fit <- glm(wage ~ poly(age, d), data = Wage)
  cv.errors[d] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
# Find optimal degree
optimal.degree <- which.min(cv.errors)
cat("Optimal polynomial degree by cross-validation:", optimal.degree, "\n")
## Optimal polynomial degree by cross-validation: 9
# Fit final model using optimal degree
best.poly.fit <- glm(wage ~ poly(age, optimal.degree), data = Wage)
# Perform hypothesis testing using ANOVA
fit.list <- list()
for (d in 1:5) {
  fit.list[[d]] <- glm(wage ~ poly(age, d), data = Wage)
}
anova.result <- anova(fit.list[[1]], fit.list[[2]], fit.list[[3]],
                      fit.list[[4]], fit.list[[5]], test = "F")
print(anova.result)
## Analysis of Deviance Table
##
## Model 1: wage ~ poly(age, d)
## Model 2: wage ~ poly(age, d)
## Model 3: wage ~ poly(age, d)
## Model 4: wage ~ poly(age, d)
## Model 5: wage ~ poly(age, d)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Repeat the ANOVA with explicitly specified degrees 1 to 5 so the model labels in the table are informative
fit.1 <- glm(wage ~ poly(age, 1), data = Wage)
fit.2 <- glm(wage ~ poly(age, 2), data = Wage)
fit.3 <- glm(wage ~ poly(age, 3), data = Wage)
fit.4 <- glm(wage ~ poly(age, 4), data = Wage)
fit.5 <- glm(wage ~ poly(age, 5), data = Wage)
anova(fit.1, fit.2, fit.3, fit.4, fit.5, test = "F")
## Analysis of Deviance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Resid. Df Resid. Dev Df Deviance F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot the polynomial fit
age.grid <- seq(from = min(Wage$age), to = max(Wage$age), length.out = 100)
preds <- predict(best.poly.fit, newdata = list(age = age.grid))
plot(Wage$age, Wage$wage, col = "gray", pch = 19, cex = 0.5,
main = paste("Polynomial Regression (Degree =", optimal.degree, ")"))
lines(age.grid, preds, col = "blue", lwd = 2)
Based on the ANOVA F-tests, degree 3 is the most defensible choice: degree 4 is borderline (p ≈ 0.051) and degree 5 adds no real value.
This contrasts with cross-validation, which chose degree 9, favoring a more flexible model that lowers estimated prediction error at the cost of interpretability and of possible overfitting near the boundaries of age (visible in the plot above).
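One way to reconcile the two criteria is to prefer the simplest degree whose CV error is close to the minimum. A minimal sketch of that idea, reusing the cv.errors vector computed above (the 1% tolerance is an arbitrary choice, not part of the original analysis):
# Smallest degree whose CV error is within 1% of the best CV error
tol <- 1.01 * min(cv.errors)
simpler.degree <- min(which(cv.errors <= tol))
cat("Simplest degree within 1% of the minimum CV error:", simpler.degree, "\n")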
# Only try cuts from 2 to 10
cv.errors.step <- rep(NA, 9) # index 1 will correspond to 2 cuts
for (k in 2:10) {
  Wage$age.cut <- cut(Wage$age, k)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.errors.step[k - 1] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
# Find optimal number of cuts
optimal.cuts <- which.min(cv.errors.step) + 1
cat("Optimal number of cuts for step function:", optimal.cuts, "\n")
## Optimal number of cuts for step function: 8
# Fit final step function model
Wage$age.cut <- cut(Wage$age, optimal.cuts)
step.fit <- glm(wage ~ age.cut, data = Wage)
# Plot step function prediction
preds.step <- predict(step.fit, newdata = Wage)
plot(Wage$age, Wage$wage, col = "gray", pch = 19, cex = 0.5,
main = paste("Step Function with", optimal.cuts, "Cuts"))
lines(Wage$age[order(Wage$age)], preds.step[order(Wage$age)], col = "red", lwd = 2)
Cross-validation selected degree 9, which captures detailed trends but may overfit near the boundaries.
ANOVA testing suggested degree 3 as the optimal balance of fit and simplicity.
Cross-validation selected 8 cuts.
The step function clearly shows wage trends across age brackets and is easier to interpret than high-degree polynomials.
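To put the two model families on a common scale, the stored CV errors of the selected fits can be printed side by side; a quick check using the vectors computed above:
# 10-fold CV error of the selected polynomial vs. the selected step function
cat("CV error, degree-", optimal.degree, " polynomial:  ", round(cv.errors[optimal.degree], 2), "\n", sep = "")
cat("CV error, ", optimal.cuts, "-cut step function: ", round(cv.errors.step[optimal.cuts - 1], 2), "\n", sep = "")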
# Load data
data("Wage")
# Quick look at categorical variables
summary(Wage$maritl)
## 1. Never Married 2. Married 3. Widowed 4. Divorced
## 648 2074 19 204
## 5. Separated
## 55
summary(Wage$jobclass)
## 1. Industrial 2. Information
## 1544 1456
summary(Wage$education)
## 1. < HS Grad 2. HS Grad 3. Some College 4. College Grad
## 268 971 650 685
## 5. Advanced Degree
## 426
# Fit a GAM model with smooth age, and categorical factors
gam.fit <- gam(wage ~ s(age, k = 5) + maritl + jobclass + education, data = Wage)
summary(gam.fit)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## wage ~ s(age, k = 5) + maritl + jobclass + education
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.08862 2.61286 28.355 < 2e-16 ***
## maritl2. Married 14.23608 1.84011 7.737 1.39e-14 ***
## maritl3. Widowed -0.07275 8.18777 -0.009 0.992912
## maritl4. Divorced 0.65790 3.00234 0.219 0.826564
## maritl5. Separated 7.54212 4.99009 1.511 0.130787
## jobclass2. Information 4.56395 1.33990 3.406 0.000668 ***
## education2. HS Grad 10.80303 2.40656 4.489 7.43e-06 ***
## education3. Some College 22.84194 2.54528 8.974 < 2e-16 ***
## education4. College Grad 36.76036 2.55195 14.405 < 2e-16 ***
## education5. Advanced Degree 60.11438 2.80680 21.417 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(age) 2.968 3.48 24.48 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.306 Deviance explained = 30.8%
## GCV = 1214.2 Scale est. = 1209 n = 3000
# Predict for plotting
age.grid <- data.frame(age = seq(min(Wage$age), max(Wage$age), length = 100))
# Effect of age (smooth term)
plot(gam.fit, se = TRUE, col = "blue", main = "GAM Smooth Terms")
The smooth function of age (plot above) shows a non-linear relationship: wage increases steadily until around age 50, then flattens and gradually declines. The edf ≈ 3 confirms a non-linear but smooth curve, and the p-value < 2e-16 means the age term is highly significant.
**Marital status (maritl)**: "Married" individuals earn about \$14.24 more on average than "Never Married" (the baseline). The other categories (Widowed, Divorced, Separated) are not statistically significant.
**Job class (jobclass)**: Information-sector workers earn about \$4.56 more than Industrial workers (the baseline), a highly significant difference (p ≈ 0.0007).
**Education** (relative to "< HS Grad"):
- **HS Grad**: +\$10.80
- **Some College**: +\$22.84
- **College Grad**: +\$36.76
- **Advanced Degree**: +\$60.11
**R-squared (adjusted)**: 0.306, so the model explains roughly 31% of the variance in wage.
**GCV**: 1214.2, a relative measure of prediction error (lower is better).
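To read the age effect on the wage scale rather than as a centered partial effect, the fitted GAM can be used to predict over the age grid defined above while holding the categorical predictors at their baseline levels. A sketch (the baseline levels are chosen here only for illustration):
# Predicted wage across age for the baseline categories, with approximate
# pointwise intervals (fit +/- 2 standard errors)
newdat <- data.frame(age = age.grid$age,
                     maritl = factor(levels(Wage$maritl)[1], levels = levels(Wage$maritl)),
                     jobclass = factor(levels(Wage$jobclass)[1], levels = levels(Wage$jobclass)),
                     education = factor(levels(Wage$education)[1], levels = levels(Wage$education)))
pred <- predict(gam.fit, newdata = newdat, se.fit = TRUE)
plot(newdat$age, pred$fit, type = "l", xlab = "age", ylab = "predicted wage")
lines(newdat$age, pred$fit + 2 * pred$se.fit, lty = 2)
lines(newdat$age, pred$fit - 2 * pred$se.fit, lty = 2)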
# ggplot: Wage vs. age colored by marital status
ggplot(Wage, aes(x = age, y = wage, color = maritl)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Wage vs. Age by Marital Status")
## `geom_smooth()` using formula = 'y ~ x'
# ggplot: Wage vs. age colored by jobclass
ggplot(Wage, aes(x = age, y = wage, color = jobclass)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "loess", se = FALSE) +
labs(title = "Wage vs. Age by Job Class")
## `geom_smooth()` using formula = 'y ~ x'
# ggplot: Boxplot of wage by education level
ggplot(Wage, aes(x = education, y = wage, fill = education)) +
geom_boxplot() +
labs(title = "Wage Distribution by Education Level") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
The first plot shows wage vs. age by marital status. Married individuals generally have higher wages, especially during mid-career (ages 40-60). All groups show an increase in wages until middle age, followed by a decline, except for divorced individuals whose wages continue to rise at older ages. The highest individual wages appear among married people.
The second plot displays wage vs. age by job class. Information sector workers consistently earn higher wages than industrial workers across all age groups. Both sectors show the typical pattern of wages peaking in middle age (around 40-50 years) before declining, though the information sector maintains a more substantial advantage throughout the career span.
The third plot illustrates wage distribution by education level. There is a clear positive relationship between education and wages, with each successive education level showing higher median wages. Advanced degree holders earn the highest wages, followed by college graduates, those with some college, high school graduates, and those without high school diplomas. Higher education levels also show greater wage variability, particularly at the upper end.
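These group differences can also be tested formally within the GAM framework by comparing nested fits. A sketch using anova(); gam.base is a helper fit introduced here for the comparison, not part of the original analysis:
# Does adding maritl and jobclass improve on a GAM with only s(age) and education?
gam.base <- gam(wage ~ s(age, k = 5) + education, data = Wage)
anova(gam.base, gam.fit, test = "F")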
# Load the Boston dataset
data(Boston)
poly_fit3 <- lm(nox ~ poly(dis, 3), data = Boston)
summary(poly_fit3)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
# Plot the data and cubic polynomial fit
ggplot(Boston, aes(x = dis, y = nox)) +
geom_point(alpha = 0.5) +
stat_smooth(method = "lm", formula = y ~ poly(x, 3), color = "blue") +
labs(title = "Cubic Polynomial Regression Fit",
x = "Weighted Mean Distance to Employment Centers (dis)",
y = "Nitrogen Oxides Concentration (nox)")
The regression output indicates:
All polynomial terms are highly significant (p < 0.001)
The model explains approximately 71.5% of the variance (R-squared = 0.7148)
The cubic term is significant (t = -5.12), so there is curvature beyond a quadratic; because poly() uses an orthogonal basis, the sign of an individual coefficient is not interpreted as a raw-power coefficient
The plot visually confirms this relationship, showing:
High NOx concentrations (0.7-0.9) at short distances (1-2.5 units)
A steep decline in NOx between distances of 2-5 units
A leveling off of NOx values (around 0.4-0.45) at distances beyond 7.5 units
The gray band represents the 95% confidence interval, which widens at the extremes
This suggests that proximity to employment centers is strongly associated with higher pollution levels, with the effect diminishing at greater distances.
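Because poly() builds an orthogonal basis, the individual coefficients are not raw-power coefficients. A quick sketch confirming that the orthogonal and raw parameterizations give the same fitted curve (raw_fit3 is a helper fit introduced here):
# Same fitted values, different coefficient parameterization
raw_fit3 <- lm(nox ~ poly(dis, 3, raw = TRUE), data = Boston)
all.equal(fitted(poly_fit3), fitted(raw_fit3))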
max_degree <- 10
rss_values <- numeric(max_degree)
for (degree in 1:max_degree) {
  poly_fit <- lm(nox ~ poly(dis, degree), data = Boston)
  rss_values[degree] <- sum(poly_fit$residuals^2)
}
# Plot RSS vs degree
ggplot(data.frame(degree = 1:max_degree, rss = rss_values), aes(x = degree, y = rss)) +
geom_line() +
geom_point() +
labs(title = "RSS vs Polynomial Degree",
x = "Degree of Polynomial",
y = "Residual Sum of Squares (RSS)")
# Plot all polynomial fits
plot_data <- Boston[order(Boston$dis), ]
plot_data$dis_grid <- plot_data$dis
poly_plots <- ggplot(plot_data, aes(x = dis, y = nox)) +
geom_point(alpha = 0.3) +
labs(title = "Polynomial Fits of Different Degrees",
x = "Weighted Mean Distance to Employment Centers (dis)",
y = "Nitrogen Oxides Concentration (nox)")
for (degree in 1:max_degree) {
  poly_fit <- lm(nox ~ poly(dis, degree), data = Boston)
  pred_data <- data.frame(dis = plot_data$dis_grid)
  pred_data$pred <- predict(poly_fit, newdata = pred_data)
  # Store the degree in the layer data; referencing the loop variable inside
  # aes() would only be evaluated after the loop ends, coloring every line the same
  pred_data$degree <- factor(degree, levels = 1:max_degree)
  poly_plots <- poly_plots +
    geom_line(data = pred_data, aes(x = dis, y = pred, color = degree))
}
poly_plots +
scale_color_discrete(name = "Polynomial Degree") +
theme(legend.position = "right")
The first plot shows RSS (Residual Sum of Squares) versus polynomial degree. There’s a sharp decrease in RSS when moving from degree 1 to degree 3, indicating substantial improvement in model fit. After degree 3, the improvements become more gradual, with diminishing returns as the polynomial degree increases. The RSS continues to decrease slightly up to degree 10, but the rate of improvement slows considerably after degree 4.
The second plot displays the actual polynomial fits of different degrees overlaid on the scatter plot of the data. All polynomial models capture the general negative relationship between distance and NOx concentration. The higher-degree polynomials (shown in various colors) fit the data more closely, particularly at the extremes of the distance range where some models show different behaviors (some curve upward while others curve downward at distances beyond 10). The data shows a clear nonlinear pattern with NOx levels decreasing rapidly at shorter distances and then leveling off at greater distances from employment centers.
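The diminishing returns can also be read off numerically from the stored RSS values; a short sketch:
# RSS by polynomial degree and the improvement over the previous degree
rss_table <- data.frame(degree = 1:max_degree,
                        rss = round(rss_values, 3),
                        improvement = round(c(NA, -diff(rss_values)), 3))
print(rss_table)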
set.seed(123)
cv_errors <- numeric(max_degree)
for (degree in 1:max_degree) {
  cv_model <- glm(nox ~ poly(dis, degree), data = Boston)
  cv_errors[degree] <- cv.glm(Boston, cv_model, K = 10)$delta[1]
}
# Plot CV error vs degree
ggplot(data.frame(degree = 1:max_degree, cv_error = cv_errors), aes(x = degree, y = cv_error)) +
geom_line() +
geom_point() +
labs(title = "10-fold CV Error vs Polynomial Degree",
x = "Degree of Polynomial",
y = "Cross-Validation Error")
# Find optimal degree
optimal_degree <- which.min(cv_errors)
cat("Optimal polynomial degree based on CV:", optimal_degree, "\n")
## Optimal polynomial degree based on CV: 3
The 10-fold cross-validation plot shows that a polynomial of degree 3 achieves the lowest cross-validation error, confirming it as the optimal model complexity for this data. The U-shaped error curve illustrates the bias-variance tradeoff: lower-degree polynomials (degrees 1-2) have higher error due to underfitting (high bias), while higher-degree polynomials (degrees 4-10) show increasing error due to overfitting (high variance). The sharp increase in CV error after degree 5 indicates that the additional flexibility of higher-order polynomials leads to models that fit noise in the training data rather than the underlying relationship.
The cubic polynomial (degree 3) strikes the best balance between model complexity and predictive performance for the relationship between distance to employment centers and nitrogen oxide concentration.
# Place interior knots at the 33rd and 66th percentiles of dis. Note that when
# knots are supplied explicitly, bs() ignores the df argument, so a cubic basis
# with 2 interior knots yields the 5 basis functions seen in the summary below.
knots <- quantile(Boston$dis, c(0.33, 0.66))
spline_fit4 <- lm(nox ~ bs(dis, df = 4, knots = knots), data = Boston)
summary(spline_fit4)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4, knots = knots), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.132643 -0.038901 -0.008489 0.023569 0.198086
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.67148 0.02006 33.468 < 2e-16 ***
## bs(dis, df = 4, knots = knots)1 0.08525 0.02900 2.940 0.00343 **
## bs(dis, df = 4, knots = knots)2 -0.12934 0.01991 -6.495 2.01e-10 ***
## bs(dis, df = 4, knots = knots)3 -0.25755 0.03476 -7.409 5.47e-13 ***
## bs(dis, df = 4, knots = knots)4 -0.26347 0.04067 -6.478 2.23e-10 ***
## bs(dis, df = 4, knots = knots)5 -0.26165 0.05203 -5.029 6.88e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06066 on 500 degrees of freedom
## Multiple R-squared: 0.7287, Adjusted R-squared: 0.7259
## F-statistic: 268.5 on 5 and 500 DF, p-value: < 2.2e-16
# Plot the spline fit
ggplot(Boston, aes(x = dis, y = nox)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", formula = y ~ bs(x, df = 4, knots = knots), color = "red") +
labs(title = "Regression Spline with 4 Degrees of Freedom",
x = "Weighted Mean Distance to Employment Centers (dis)",
y = "Nitrogen Oxides Concentration (nox)") +
geom_vline(xintercept = knots, linetype = "dashed", color = "blue", alpha = 0.5)
The regression spline effectively models the relationship between distance to employment centers (dis) and nitrogen oxide concentration (nox); because the two interior knots were supplied explicitly, the fit uses five spline basis functions rather than the four degrees of freedom named in the plot title. The model explains about 73% of the variance (R-squared = 0.7287), slightly better than the cubic polynomial model from part (a).
The plot shows the spline fit (red line) with two knots (vertical blue dashed lines) placed at approximately the 33rd and 66th percentiles of the distance distribution (around 2.5 and 5.0). These knots allow the model to adapt to changes in the relationship between variables at different distance ranges.
All spline coefficients are highly significant (p < 0.01), with the intercept representing the baseline NOx level. The fit captures the nonlinear relationship well: NOx levels start high (around 0.7) at short distances, decrease rapidly between distances of 1-5, and then level off to approximately 0.4 at distances beyond 7.5 units. The widening gray confidence band at extreme distances reflects increased uncertainty in these regions due to fewer data points.
The spline approach effectively captures the diminishing effect of distance on pollution levels, showing how NOx concentration stabilizes at greater distances from employment centers.
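For comparison, when only df is supplied, bs() chooses the interior knots itself at quantiles of dis; a quick check of where a df = 4 cubic basis places its single interior knot:
# Interior knot location chosen automatically by bs() when only df is given
attr(bs(Boston$dis, df = 4), "knots")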
max_df <- 10
spline_rss <- numeric(max_df)
spline_plots <- ggplot(plot_data, aes(x = dis, y = nox)) +
geom_point(alpha = 0.3) +
labs(title = "Spline Fits with Different Degrees of Freedom",
x = "Weighted Mean Distance to Employment Centers (dis)",
y = "Nitrogen Oxides Concentration (nox)")
for (df in 3:max_df) {
  # Fit spline with df degrees of freedom
  spline_fit <- lm(nox ~ bs(dis, df = df), data = Boston)
  spline_rss[df] <- sum(spline_fit$residuals^2)
  # Add to plot; keep df in the layer data so each line gets its own color
  # (referencing the loop variable inside aes() would use only its final value)
  pred_data <- data.frame(dis = plot_data$dis_grid)
  pred_data$pred <- predict(spline_fit, newdata = pred_data)
  pred_data$df <- factor(df, levels = 3:max_df)
  spline_plots <- spline_plots +
    geom_line(data = pred_data, aes(x = dis, y = pred, color = df))
}
# Plot RSS vs degrees of freedom
ggplot(data.frame(df = 3:max_df, rss = spline_rss[3:max_df]), aes(x = df, y = rss)) +
geom_line() +
geom_point() +
labs(title = "RSS vs Spline Degrees of Freedom",
x = "Degrees of Freedom",
y = "Residual Sum of Squares (RSS)")
# Plot all spline fits
spline_plots +
scale_color_discrete(name = "Degrees of Freedom") +
theme(legend.position = "right")
The plot shows multiple spline fits with degrees of freedom ranging from 3 to 10 for the relationship between distance to employment centers (dis) and nitrogen oxide concentration (nox).
All spline models capture the same general pattern: NOx levels start high (around 0.7-0.75) at short distances, decrease rapidly between distances of 1-5 units, and then level off to approximately 0.4-0.45 at greater distances.
The fits are very similar across different degrees of freedom, with only minor differences visible at the extremes of the distance range (particularly beyond 10 units where some models diverge slightly).
This visual similarity suggests that increasing the degrees of freedom beyond a certain point provides minimal improvement in capturing the underlying relationship, which aligns with the cross-validation results that identified an optimal model with moderate complexity.
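The visual similarity can be checked numerically; a sketch reporting adjusted R-squared for each degrees-of-freedom setting:
# Adjusted R-squared for each spline fit
adj_r2 <- sapply(3:max_df, function(d)
  summary(lm(nox ~ bs(dis, df = d), data = Boston))$adj.r.squared)
data.frame(df = 3:max_df, adj_r2 = round(adj_r2, 4))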
set.seed(456)
# Initialize with NA so which.min() below ignores the unfitted entries (df = 1, 2)
cv_spline_errors <- rep(NA_real_, max_df)
for (df in 3:max_df) {
  # Suppress warnings from bs() about held-out dis values lying outside the
  # boundary knots of the training folds
  cv_model <- suppressWarnings(glm(nox ~ bs(dis, df = df), data = Boston))
  cv_result <- suppressWarnings(cv.glm(Boston, cv_model, K = 10))
  cv_spline_errors[df] <- cv_result$delta[1]
}
# Plot CV error vs degrees of freedom
ggplot(data.frame(df = 3:max_df, cv_error = cv_spline_errors[3:max_df]),
aes(x = df, y = cv_error)) +
geom_line() +
geom_point() +
labs(title = "10-fold CV Error vs Spline Degrees of Freedom",
x = "Degrees of Freedom",
y = "Cross-Validation Error")
# Find optimal df
optimal_df <- which.min(cv_spline_errors)
cat("Optimal spline degrees of freedom based on CV:", optimal_df, "\n")
## Optimal spline degrees of freedom based on CV: 7
The cross-validation plot for spline degrees of freedom shows that the optimal model has 7 degrees of freedom, which achieves the lowest cross-validation error.
The plot reveals an interesting pattern where CV error initially increases from 3 to 4 degrees of freedom, then drops sharply between 4 and 5, and continues to decrease until reaching its minimum at 7 degrees of freedom. After that point, the error begins to increase again for models with 8-10 degrees of freedom.
This result suggests that a spline with 7 degrees of freedom provides the best balance between model flexibility and generalization performance. The higher error for simpler models (3-4 df) indicates underfitting, while the increasing error for more complex models (8-10 df) suggests overfitting.
The optimal model captures the nonlinear relationship between distance to employment centers and nitrogen oxide concentration without fitting to noise in the data.
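As a final step, the CV-selected spline can be refit on the full data set; a short sketch reusing optimal_df from above:
# Refit the selected spline and report its training RSS and adjusted R-squared
best_spline <- lm(nox ~ bs(dis, df = optimal_df), data = Boston)
cat("RSS:", round(sum(residuals(best_spline)^2), 3),
    "  Adjusted R-squared:", round(summary(best_spline)$adj.r.squared, 4), "\n")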
set.seed(123)
n <- 100
X1 <- rnorm(n)
X2 <- rnorm(n)
Y <- 2 + 3 * X1 - X2 + rnorm(n)
# (b) Initialize beta1 to an arbitrary starting value
beta1 <- 0
# (c) Keeping beta1 fixed, fit the model Y - beta1*X1 = beta0 + beta2*X2 + epsilon
a <- Y - beta1 * X1
model_beta2 <- lm(a ~ X2)
beta2 <- model_beta2$coef[2]
beta0_c <- model_beta2$coef[1] # Also capturing beta0 from this model
# (d) Keeping beta2 fixed, fit the model Y - beta2*X2 = beta0 + beta1*X1 + epsilon
a <- Y - beta2 * X2
model_beta1 <- lm(a ~ X1)
beta1 <- model_beta1$coef[2]
beta0_d <- model_beta1$coef[1] # Also capturing beta0 from this model
iterations <- 1000
beta0_estimates <- numeric(iterations)
beta1_estimates <- numeric(iterations)
beta2_estimates <- numeric(iterations)
# (e) Repeat the updates from (c) and (d) for 1000 iterations, storing the estimates
for (i in 1:iterations) {
  # (c) Update beta2 with beta1 held fixed
  a <- Y - beta1 * X1
  model_beta2 <- lm(a ~ X2)
  beta2 <- coef(model_beta2)[2]
  # (d) Update beta1 with beta2 held fixed
  a <- Y - beta2 * X2
  model_beta1 <- lm(a ~ X1)
  beta1 <- coef(model_beta1)[2]
  # Store the current estimates (beta0 taken from the most recent fit)
  beta0_estimates[i] <- coef(model_beta1)[1]
  beta1_estimates[i] <- beta1
  beta2_estimates[i] <- beta2
}
# Plot the estimates
plot(1:iterations, beta0_estimates, type = "l", col = "red",
ylim = range(c(beta0_estimates, beta1_estimates, beta2_estimates)),
xlab = "Iteration", ylab = "Coefficient Estimate",
main = "Backfitting Coefficient Estimates")
lines(1:iterations, beta1_estimates, col = "blue")
lines(1:iterations, beta2_estimates, col = "green")
legend("topright", legend = c("beta0", "beta1", "beta2"),
col = c("red", "blue", "green"), lty = 1)
# (f) Compare with multiple linear regression
full_model <- lm(Y ~ X1 + X2)
abline(h = coef(full_model)[1], col = "red", lty = 2)
abline(h = coef(full_model)[2], col = "blue", lty = 2)
abline(h = coef(full_model)[3], col = "green", lty = 2)
# Scatter plot of X1 vs Y with the multiple-regression coefficient estimates
# overlaid as horizontal dashed lines for reference
plot(X1, Y)
full_model <- lm(Y ~ X1 + X2)
abline(h = coef(full_model)[1], col = "red", lty = 2)
abline(h = coef(full_model)[2], col = "blue", lty = 2)
abline(h = coef(full_model)[3], col = "green", lty = 2)
The scatter plot shows a positive linear relationship between X1 and Y. X1 ranges from about -2.5 to 2.5 and Y from about -4 to 10, and Y tends to increase with X1.
The three horizontal dashed lines (red at roughly Y = 2.5, blue at about Y = 3, green at approximately Y = -1) mark the coefficient estimates from the multiple linear regression, the values to which the backfitting estimates converge.
The points show some variability around the trend, with the relationship appearing strongest at the extremes of X1.
convergence_threshold <- 1e-6
diffs <- abs(diff(cbind(beta0_estimates, beta1_estimates, beta2_estimates)))
convergence_iteration <- which(apply(diffs, 1, max) < convergence_threshold)[1]
cat("Convergence achieved after", convergence_iteration, "iterations")
## Convergence achieved after 2 iterations
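For a final numerical check, the backfitting estimates from the last iteration can be set beside the multiple-regression coefficients; a sketch using objects defined above:
# Backfitting vs. lm(): the two sets of estimates should agree closely
comparison <- rbind(
  backfitting = c(beta0_estimates[iterations], beta1_estimates[iterations], beta2_estimates[iterations]),
  lm = coef(full_model)
)
colnames(comparison) <- c("beta0", "beta1", "beta2")
print(round(comparison, 6))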