library(ISLR2)
attach(Wage)
# Fit polynomials of different degrees
fit1 <- lm(wage ~ poly(age, 1), data = Wage)
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
fit4 <- lm(wage ~ poly(age, 4), data = Wage)
fit5 <- lm(wage ~ poly(age, 5), data = Wage)
anova(fit1, fit2, fit3, fit4, fit5)
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA provides strong evidence of a non-linear relationship between age and wage. The improvement from the linear to the quadratic model is dramatic, and adding a cubic term improves the fit further. The fourth- and fifth-degree terms do not contribute meaningfully, so a cubic model is preferred.
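Because poly() generates orthogonal polynomials and each model adds a single term, the sequential ANOVA F-statistics are the squares of the t-statistics from the largest fit. As an optional check (a sketch reusing fit5 from above):
# The squared t-statistics below match the F-statistics in the ANOVA table
coef(summary(fit5))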
library(boot)
set.seed(1)
cv.errors <- rep(NA, 10)
for (d in 1:10) {
fit <- glm(wage ~ poly(age, d), data = Wage)
cv.errors[d] <- cv.glm(Wage, fit, K=10)$delta[1]
}
cv.errors
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977 1596.061 1594.298 1598.134
## [9] 1593.913 1595.950
-The lowest cross-validation error occurs at degree 9, which at first glance contradicts the ANOVA. However, the substantial improvement comes from moving from degree 1 to degree 2, and improvements beyond degree 3 are much smaller, which matches the ANOVA results and affirms our findings. Terms beyond degree 3 yield only marginal, non-significant improvements, so a cubic model remains a reasonable choice.
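To make this comparison easier to see, the CV curve can be plotted with its minimum highlighted (a small optional sketch using the cv.errors vector computed above):
# Plot the 10-fold CV error by polynomial degree and mark the minimum
plot(1:10, cv.errors, type = "b",
     xlab = "Polynomial Degree", ylab = "10-fold CV Error",
     main = "CV Error vs. Degree (Wage ~ Age)")
points(which.min(cv.errors), min(cv.errors), col = "red", pch = 19)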
# Fit the cubic model
fit3 <- lm(wage ~ poly(age, 3), data = Wage)
age.grid <- seq(min(Wage$age), max(Wage$age), length.out = 100)
preds <- predict(fit3, newdata = list(age = age.grid))
# Plot wage versus age and overlay the cubic fit
plot(Wage$age, Wage$wage, col = "gray", pch = 16,
xlab = "Age", ylab = "Wage", main = "Cubic Polynomial Fit: Wage ~ Age")
lines(age.grid, preds, col = "blue", lwd = 2)
set.seed(2)
#Store the CV errors for models with k = 2 to 10 groups.
cv.step.errors <- rep(NA, 9)
for (k in 2:10) {
# Define fixed breakpoints based on the full dataset
breaks_k <- seq(from = min(Wage$age), to = max(Wage$age), length.out = k + 1)
# Create the step function
Wage$age.cut <- cut(Wage$age, breaks = breaks_k, include.lowest = TRUE)
# Fit the step function
fit.step <- glm(wage ~ age.cut, data = Wage)
# Store the 10-fold CV error
cv.step.errors[k - 1] <- cv.glm(Wage, fit.step, K = 10)$delta[1]
}
cv.step.errors
## [1] 1734.448 1681.623 1635.869 1632.864 1621.643 1613.148 1601.850 1610.852
## [9] 1603.607
-The results above suggest that 8 intervals yield the minimum CV error, so the optimal number of groups is 8.
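A small optional sketch to visualize this choice, plotting the CV error against the number of intervals (k = 2, ..., 10) and marking the minimum:
# Plot the step-function CV errors by number of intervals
plot(2:10, cv.step.errors, type = "b",
     xlab = "Number of Intervals", ylab = "10-fold CV Error",
     main = "CV Error vs. Number of Groups")
points(which.min(cv.step.errors) + 1, min(cv.step.errors), col = "red", pch = 19)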
optimal_k <- 8
breaks_opt <- seq(from = min(Wage$age), to = max(Wage$age), length.out = optimal_k + 1)
# Create the factor variable
Wage$age.cut <- cut(Wage$age, breaks = breaks_opt, include.lowest = TRUE)
# Fit the final model
fit.step.opt <- lm(wage ~ age.cut, data = Wage)
summary(fit.step.opt)
##
## Call:
## lm(formula = wage ~ age.cut, data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -99.697 -24.552 -5.307 15.417 198.560
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.282 2.630 29.007 < 2e-16 ***
## age.cut(25.8,33.5] 25.833 3.161 8.172 4.44e-16 ***
## age.cut(33.5,41.2] 40.226 3.049 13.193 < 2e-16 ***
## age.cut(41.2,49] 43.501 3.018 14.412 < 2e-16 ***
## age.cut(49,56.8] 40.136 3.177 12.634 < 2e-16 ***
## age.cut(56.8,64.5] 44.102 3.564 12.373 < 2e-16 ***
## age.cut(64.5,72.2] 28.948 6.042 4.792 1.74e-06 ***
## age.cut(72.2,80] 15.224 9.781 1.556 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.97 on 2992 degrees of freedom
## Multiple R-squared: 0.08467, Adjusted R-squared: 0.08253
## F-statistic: 39.54 on 7 and 2992 DF, p-value: < 2.2e-16
# Create a grid for age to plot
age.grid <- seq(min(Wage$age), max(Wage$age), length.out = 100)
age.cut.grid <- cut(age.grid, breaks = breaks_opt, include.lowest = TRUE)
# Predict wages
preds <- predict(fit.step.opt, newdata = data.frame(age.cut = age.cut.grid))
# Plot the data
plot(Wage$age, Wage$wage, col = "gray", pch = 16,
xlab = "Age", ylab = "Wage",
main = "Step Function Fit: Wage ~ Age (8 Groups)")
lines(age.grid, preds, type = "s", col = "red", lwd = 2)
library(gam)
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22-5
# Fit a GAM allowing non-linear effects for age and year,
# with jobclass and maritl as categorical predictors
gam_model <- gam(wage ~ s(age, df=4) + s(year, df=4) + jobclass + maritl,
data = Wage)
summary(gam_model)
##
## Call: gam(formula = wage ~ s(age, df = 4) + s(year, df = 4) + jobclass +
## maritl, data = Wage)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -110.151 -22.913 -4.708 15.053 205.674
##
## (Dispersion Parameter for gaussian family taken to be 1489.99)
##
## Null Deviance: 5222086 on 2999 degrees of freedom
## Residual Deviance: 4449111 on 2986 degrees of freedom
## AIC: 30449.17
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(age, df = 4) 1 200549 200549 134.598 < 2.2e-16 ***
## s(year, df = 4) 1 21014 21014 14.104 0.0001763 ***
## jobclass 1 174612 174612 117.190 < 2.2e-16 ***
## maritl 4 142382 35595 23.890 < 2.2e-16 ***
## Residuals 2986 4449111 1490
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(age, df = 4) 3 30.887 <2e-16 ***
## s(year, df = 4) 3 0.600 0.615
## jobclass
## maritl
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the output above, there is a significant non-linear effect of age on wage. The effect of year is also significant, but the ANOVA for non-parametric effects (p = 0.615 for s(year)) suggests it is approximately linear. There is a significant difference in salary between job classes, and marital status is significantly associated with wage, with being married appearing to have a positive effect on earnings.
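One way to follow up on the year term is to compare the smooth-year model against a model in which year enters linearly; a small F-statistic would support the linear form. This is a sketch, with gam_lin_year as a new model name:
# GAM with a linear year term, all other terms unchanged
gam_lin_year <- gam(wage ~ s(age, df = 4) + year + jobclass + maritl, data = Wage)
# Compare the linear-year and smooth-year models
anova(gam_lin_year, gam_model, test = "F")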
# Visualization: plot each fitted GAM term with standard-error bands.
# Note: the gam package's plot method (plot.Gam) has no 'select' argument
# (that belongs to mgcv's plot.gam), so we plot all terms in a 2x2 layout.
par(mfrow = c(2, 2))
plot(gam_model, se = TRUE, col = "blue")
# Boxplots for categorical variables:
par(mfrow = c(1, 2))
boxplot(wage ~ jobclass, data = Wage,
main = "Wage by Jobclass", xlab = "Job Class", ylab = "Wage", col = "lightgreen")
boxplot(wage ~ maritl, data = Wage,
main = "Wage by Marital Status", xlab = "Marital Status", ylab = "Wage", col = "lightblue")
-The GAM indicates that wage increases with age in a non-linear fashion, typically rising through mid-career and then leveling off or declining slightly. The effect of year is smaller and approximately linear, consistent with a gradual rise in wages over the period. There is a clear disparity between job classes: information-sector jobs tend to earn more than industrial jobs. Marital status also has a significant association with wage, suggesting that social or familial circumstances influence earnings. By using splines and smooth functions, the GAM framework lets us capture these complex, non-linear relationships while keeping the model additive and interpretable.
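As a quick numeric check of the group differences visible in the boxplots (an optional sketch using the same Wage columns):
# Mean wage by job class and median wage by marital status
tapply(Wage$wage, Wage$jobclass, mean)
tapply(Wage$wage, Wage$maritl, median)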
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(splines)
data("Boston")
attach(Boston)
## The following object is masked from Wage:
##
## age
#Fit a cubic polynomial model:
fit_poly3 <- lm(dis ~ poly(nox, 3), data = Boston)
summary(fit_poly3)
##
## Call:
## lm(formula = dis ~ poly(nox, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.2094 -0.6112 -0.1121 0.4798 4.9914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.7950 0.0467 81.264 < 2e-16 ***
## poly(nox, 3)1 -36.3999 1.0505 -34.650 < 2e-16 ***
## poly(nox, 3)2 17.9570 1.0505 17.094 < 2e-16 ***
## poly(nox, 3)3 -6.1479 1.0505 -5.852 8.75e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.05 on 502 degrees of freedom
## Multiple R-squared: 0.7526, Adjusted R-squared: 0.7511
## F-statistic: 509 on 3 and 502 DF, p-value: < 2.2e-16
# Create a grid of nox values for plotting
nox.grid <- seq(min(nox), max(nox), length.out = 100)
# Predict dis
pred_poly3 <- predict(fit_poly3, newdata = data.frame(nox = nox.grid))
#Plot the original data and the fitted curve
plot(nox, dis,
xlab = "NOX concentration",
ylab = "DIS (distance)",
pch = 16, col = "darkgray",
main = "Cubic Polynomial Fit: dis ~ nox")
lines(nox.grid, pred_poly3, col = "blue", lwd = 2)
#Store RSS values for degrees 1 through 10
rss.values <- numeric(10)
plot(nox, dis,
xlab = "NOX concentration",
ylab = "DIS (distance)",
pch = 16, col = "darkgray",
main = "Polynomial Fits (Degrees 1 to 10)")
colors <- rainbow(10)
nox.grid <- seq(min(nox), max(nox), length.out = 100)
for(d in 1:10) {
#Fit polynomial of degree
fit <- lm(dis ~ poly(nox, d), data = Boston)
#Compute RSS
rss.values[d] <- sum(resid(fit)^2)
#Predict for plotting
preds <- predict(fit, newdata = data.frame(nox = nox.grid))
#Add line
lines(nox.grid, preds, col = colors[d], lwd = 1)
}
#Legend
legend("topright", legend = paste("Deg", 1:10),
col = colors, lty = 1, cex = 0.7)
#Now plot RSS by polynomial degree
plot(1:10, rss.values, type = "b",
xlab = "Polynomial Degree", ylab = "RSS",
main = "RSS for Polynomial Degrees 1 to 10")
print(rss.values)
## [1] 914.2227 591.7683 553.9713 553.2096 542.6460 542.6052 527.7509 524.5965
## [9] 520.4882 511.8114
set.seed(1)
cv.error <- rep(NA, 10)
for(d in 1:10){
fit <- glm(dis ~ poly(nox, d), data = Boston)
cv.error[d] <- cv.glm(Boston, fit, K = 10)$delta[1] # K=10-fold CV
}
cv.error
## [1] 1.832495 1.183275 1.131704 1.107037 1.106649 1.098316 1.069238 1.077748
## [9] 1.089051 1.043549
plot(1:10, cv.error, type = "b",
xlab = "Polynomial Degree", ylab = "10-fold CV Error",
main = "CV Error vs. Degree")
best.degree <- which.min(cv.error)
best.degree
## [1] 10
-The lowest CV error occurs at degree 10, but beyond degree 3 the additional degrees yield only minimal improvements. So, again, I would suggest degree 3 as the preferred degree.
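A small optional check of how close the cubic model comes to the CV minimum (using the cv.error vector above); a gap of only a few percent supports the simpler choice:
# Proportional gap between the degree-3 CV error and the minimum CV error
(cv.error[3] - min(cv.error)) / min(cv.error)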
# (d) Fit a regression spline with df=4 using bs()
fit_spline4 <- lm(dis ~ bs(nox, df = 4), data = Boston)
summary(fit_spline4)
##
## Call:
## lm(formula = dis ~ bs(nox, df = 4), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1869 -0.6051 -0.1267 0.4976 4.9651
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.4743 0.2704 31.340 < 2e-16 ***
## bs(nox, df = 4)1 -2.9047 0.4546 -6.389 3.81e-10 ***
## bs(nox, df = 4)2 -7.5820 0.3489 -21.729 < 2e-16 ***
## bs(nox, df = 4)3 -6.3879 0.5264 -12.135 < 2e-16 ***
## bs(nox, df = 4)4 -6.7810 0.3660 -18.528 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.051 on 501 degrees of freedom
## Multiple R-squared: 0.7527, Adjusted R-squared: 0.7508
## F-statistic: 381.3 on 4 and 501 DF, p-value: < 2.2e-16
# Generate predictions over a grid
nox.grid <- seq(min(nox), max(nox), length.out = 100)
pred_spline4 <- predict(fit_spline4, newdata = data.frame(nox = nox.grid))
# Plot
plot(nox, dis, col = "darkgray", pch = 16,
xlab = "NOX concentration", ylab = "DIS (distance)",
main = "Regression Spline with 4 DF")
lines(nox.grid, pred_spline4, col = "red", lwd = 2)
-By default, when df is specified but knots are not, bs() places the interior knots at quantiles of the predictor (here, nox), so that the resulting intervals contain roughly equal numbers of observations. With the default cubic degree, df = 4 corresponds to a single interior knot at the median of nox.
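We can confirm this by inspecting the knot attribute of the basis (an optional sketch):
# With the default cubic degree, df = 4 implies one interior knot,
# which bs() places at the median of nox
attr(bs(Boston$nox, df = 4), "knots")
quantile(Boston$nox, 0.5)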
dfs <- 3:10
rss.spline <- numeric(length(dfs))
plot(nox, dis, col = "darkgray", pch = 16,
xlab = "NOX", ylab = "DIS",
main = "Regression Splines (Various DF)")
colors <- rainbow(length(dfs))
for(i in seq_along(dfs)){
df_val <- dfs[i]
fit_tmp <- lm(dis ~ bs(nox, df = df_val), data = Boston)
rss.spline[i] <- sum(resid(fit_tmp)^2)
# Plot each fit
preds_tmp <- predict(fit_tmp, newdata = data.frame(nox = nox.grid))
lines(nox.grid, preds_tmp, col = colors[i], lwd = 1)
}
legend("topright", legend = paste("DF =", dfs),
col = colors, lty = 1, cex = 0.8)
# Plot RSS vs. DF
plot(dfs, rss.spline, type = "b",
xlab = "Degrees of Freedom", ylab = "RSS",
main = "RSS for Spline Models with Varying DF")
-RSS plot conclusion: higher df leads to lower RSS, but the gains diminish after a certain point, which suggests that moderate flexibility (roughly df = 5 to 7) may be enough to capture the main structure without overfitting.
-Fitted curves: more df produces more localized curvature in the spline, which can be better if the relationship truly has fine detail, but risks overfitting noise if the underlying relationship is relatively smooth.
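For reference, the RSS values can be tabulated against df (a small optional sketch using the vectors computed above):
# RSS by spline degrees of freedom
data.frame(df = dfs, RSS = round(rss.spline, 2))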
set.seed(2)
dfs <- 3:10
cv.spline <- numeric(length(dfs))
for(i in seq_along(dfs)){
df_val <- dfs[i]
fit_tmp <- glm(dis ~ bs(nox, df = df_val), data = Boston)
cv.spline[i] <- cv.glm(Boston, fit_tmp, K = 10)$delta[1]
}
## Warning in bs(nox, degree = 3L, knots = numeric(0), Boundary.knots = c(0.389, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = numeric(0), Boundary.knots = c(0.389, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = 0.538, Boundary.knots = c(0.389, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = 0.538, Boundary.knots = c(0.389, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.489, 0.597), Boundary.knots =
## c(0.389, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(nox, degree = 3L, knots = c(0.489, 0.597), Boundary.knots =
## c(0.389, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(nox, degree = 3L, knots = c(0.449, 0.538, 0.624), Boundary.knots
## = c(0.392, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(nox, degree = 3L, knots = c(0.449, 0.538, 0.624), Boundary.knots
## = c(0.392, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
## Warning in bs(nox, degree = 3L, knots = c(0.4429, 0.507, 0.58, 0.671),
## Boundary.knots = c(0.389, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.4429, 0.507, 0.58, 0.671),
## Boundary.knots = c(0.389, : some 'x' values beyond boundary knots may cause
## ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.437, 0.489, 0.538, 0.597, 0.693:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.437, 0.489, 0.538, 0.597, 0.693:
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.433, 0.462857142857143, 0.51, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.433, 0.462857142857143, 0.51, :
## some 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.431, 0.449, 0.50025, 0.538, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
## Warning in bs(nox, degree = 3L, knots = c(0.431, 0.449, 0.50025, 0.538, : some
## 'x' values beyond boundary knots may cause ill-conditioned bases
cv.spline
## [1] 1.113732 1.125110 1.112849 1.058856 1.088869 1.050953 1.037519 1.039459
plot(dfs, cv.spline, type = "b",
xlab = "Degrees of Freedom", ylab = "10-fold CV Error",
main = "CV Error for Splines with Varying DF")
best.df <- dfs[which.min(cv.spline)]
best.df
## [1] 9
From the CV error plot, we see that as we increase the spline's degrees of freedom, the out-of-sample error initially decreases, indicating that the added flexibility helps. Past a certain point, however, the CV error levels off or fluctuates, suggesting diminishing returns from additional complexity. The lowest CV error is achieved at df = 9. Thus, a spline with 9 degrees of freedom best balances flexibility against the risk of overfitting and appears to be the best choice for predicting dis from nox.
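To close the loop, the CV-selected spline can be refit and overlaid on the data (a sketch reusing best.df and nox.grid from above):
# Fit and plot the spline with the CV-selected degrees of freedom
fit_best <- lm(dis ~ bs(nox, df = best.df), data = Boston)
plot(nox, dis, col = "darkgray", pch = 16,
     xlab = "NOX concentration", ylab = "DIS (distance)",
     main = paste("Regression Spline with", best.df, "DF"))
lines(nox.grid, predict(fit_best, newdata = data.frame(nox = nox.grid)),
      col = "blue", lwd = 2)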
a.)
set.seed(1)
n <- 100
x1 <- rnorm(n)
x2 <- rnorm(n)
# Generate Y with noise:
y <- 2 + 1.5 * x1 + 2.5 * x2 + rnorm(n, mean = 0, sd = 1)
b.)
beta1 <- 0 # Initial guess for beta1
c.)
# Compute the partial residual "a"
a <- y - beta1 * x1
# Fit the model
fit_beta2 <- lm(a ~ x2)
# Extract beta2 from the regression
beta2 <- fit_beta2$coef[2]
beta2
## x2
## 2.445115
d.)
# Compute the partial residual "a" with beta2 fixed
a <- y - beta2 * x2
# Fit the model
fit_beta1 <- lm(a ~ x1)
beta1 <- fit_beta1$coef[2]
beta1
## x1
## 1.521109
e.)
# Number of iterations
num_iter <- 1000
# Create vectors
beta0_vals <- numeric(num_iter)
beta1_vals <- numeric(num_iter)
beta2_vals <- numeric(num_iter)
# Reinitialize beta1 and 2
beta1 <- 0
beta2 <- 0
# For each iteration, update beta2 then beta1, and update beta0 as well.
for(i in 1:num_iter){
#Keeping beta1 fixed, update beta2:
a <- y - beta1 * x1
fit_beta2 <- lm(a ~ x2)
beta2 <- fit_beta2$coef[2]
#Keeping beta2 fixed, update beta1:
a <- y - beta2 * x2
fit_beta1 <- lm(a ~ x1)
beta1 <- fit_beta1$coef[2]
beta0 <- mean(y) - beta1 * mean(x1) - beta2 * mean(x2)
beta0_vals[i] <- beta0
beta1_vals[i] <- beta1
beta2_vals[i] <- beta2
}
# Report the final estimates after 1000 iterations
cat("Final estimates after 1000 iterations:\n")
## Final estimates after 1000 iterations:
cat("beta0 =", beta0, "\n")
## beta0 = 2.025353
cat("beta1 =", beta1, "\n")
## beta1 = 1.52111
cat("beta2 =", beta2, "\n")
## beta2 = 2.446533
# Set up the plot area. Adjust y-limits to encompass all the values.
plot(1:num_iter, beta0_vals, type = "l", col = "blue", lwd = 2,
xlab = "Iteration", ylab = "Parameter Estimates",
main = "Convergence of beta0 (blue), beta1 (red), and beta2 (green)",
ylim = range(c(beta0_vals, beta1_vals, beta2_vals)))
lines(1:num_iter, beta1_vals, col = "red", lwd = 2)
lines(1:num_iter, beta2_vals, col = "green", lwd = 2)
legend("topright", legend = c("beta0", "beta1", "beta2"),
col = c("blue", "red", "green"), lwd = 2)
f.)
# Perform a standard multiple linear regression of Y on X1 and X2
lm_fit <- lm(y ~ x1 + x2)
lm_coef <- lm_fit$coef
print(lm_coef)
## (Intercept) x1 x2
## 2.025353 1.521110 2.446533
#Create the base plot with the convergence data.
plot(1:num_iter, beta0_vals, type = "l", col = "blue",
xlab = "Iteration", ylab = "Parameter Estimates",
main = "Convergence of beta0, beta1, & beta2",
ylim = range(c(beta0_vals, beta1_vals, beta2_vals)))
lines(1:num_iter, beta1_vals, col = "red")
lines(1:num_iter, beta2_vals, col = "green")
#Overlay the multiple regression estimates.
abline(h = lm_coef[1], col = "blue", lty = 2, lwd = 2) # beta0
abline(h = lm_coef[2], col = "red", lty = 2, lwd = 2) # beta1
abline(h = lm_coef[3], col = "green", lty = 2, lwd = 2) # beta2
legend("topright", legend = c("beta0 (lm)", "beta1 (lm)", "beta2 (lm)"),
col = c("blue", "red", "green"), lty = 2, lwd = 2)
By inspecting the convergence plot (with the horizontal dashed lines marking the multiple regression coefficients), we see that the parameter estimates stabilize very early in the process. In our simulation, approximately 10 iterations were sufficient for the backfitting estimates of β0, β1, and β2 to converge to values that very closely match those from the standard multiple regression; further iterations make no visible difference. Thus, only about 10 iterations were required to obtain a "good" approximation to the multiple regression coefficient estimates.
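One way to quantify this (a sketch, using an assumed tolerance of 1e-6 for "converged"):
# First iteration at which both slope estimates are within 1e-6 of the lm() coefficients
which(abs(beta1_vals - lm_coef["x1"]) < 1e-6 &
      abs(beta2_vals - lm_coef["x2"]) < 1e-6)[1]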