# Load the libraries
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(boot)
## Warning: package 'boot' was built under R version 4.4.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Load the Wage data
data(Wage)
# Set seed for reproducibility
set.seed(1)
# Cross-validation to choose optimal degree
cv.error <- rep(0, 10)
for (d in 1:10) {
glm.fit <- glm(wage ~ poly(age, d), data = Wage)
cv.error[d] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
# Plot cross-validation errors
plot(1:10, cv.error, type = "b", xlab = "Degree", ylab = "CV Error", main = "CV Error vs Polynomial Degree")
# Optimal degree
optimal.degree <- which.min(cv.error)
cat("Optimal degree selected by CV:", optimal.degree, "\n")
## Optimal degree selected by CV: 9
# Fit the polynomial model with optimal degree
fit.poly <- glm(wage ~ poly(age, optimal.degree), data = Wage)
# ANOVA to compare degrees
fit.1 <- lm(wage ~ poly(age, 1), data = Wage)
fit.2 <- lm(wage ~ poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ poly(age, 3), data = Wage)
fit.4 <- lm(wage ~ poly(age, 4), data = Wage)
fit.5 <- lm(wage ~ poly(age, 5), data = Wage)
# ANOVA table comparing nested models
anova(fit.1, fit.2, fit.3, fit.4, fit.5, test = "F")
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, 1)
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Plot polynomial fit
age.lims <- range(Wage$age)
age.grid <- seq(from = age.lims[1], to = age.lims[2])
preds <- predict(fit.poly, newdata = list(age = age.grid), se = TRUE)
plot(Wage$age, Wage$wage, col = "gray", main = "Polynomial Fit (Degree = Optimal)")
lines(age.grid, preds$fit, col = "blue", lwd = 2)
lines(age.grid, preds$fit + 2 * preds$se.fit, col = "blue", lty = "dashed")
lines(age.grid, preds$fit - 2 * preds$se.fit, col = "blue", lty = "dashed")
Cross-validation selects the degree with the lowest estimated test error, whereas the ANOVA tests whether each additional polynomial term is statistically justified. The ANOVA above suggests a cubic or quartic fit is sufficient (the degree-5 term is not significant), while CV picks degree 9, which attains the lowest CV error but may slightly overfit; the quick comparison below shows how close the lower-degree CV errors are to that minimum.
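As an added, hedged check (not part of the original output), each degree's CV error can be compared with the minimum; if the degree-3 or degree-4 error is only marginally larger, the simpler model favored by the ANOVA is a defensible choice.
# Difference between each degree's CV error and the smallest CV error
round(cv.error - min(cv.error), 3)
# Degrees whose CV error is within 1 unit of the minimum (the threshold is arbitrary)
which(cv.error - min(cv.error) < 1)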
# Create bins and perform cross-validation for step functions
cv.errors.step <- rep(NA, 10)
for (i in 2:10) {
Wage$cut.age <- cut(Wage$age, i)
fit.step <- glm(wage ~ cut.age, data = Wage)
cv.errors.step[i] <- cv.glm(Wage, fit.step, K = 10)$delta[1]
}
# Plot CV errors for step function
plot(2:10, cv.errors.step[2:10], type = "b", xlab = "Number of Cuts", ylab = "CV Error", main = "CV Error vs Number of Cuts")
# Optimal cut
best.cut <- which.min(cv.errors.step)
cat("Optimal number of cuts:", best.cut, "\n")
## Optimal number of cuts: 8
# Fit the final step function
Wage$cut.age <- cut(Wage$age, best.cut)
fit.final.step <- lm(wage ~ cut.age, data = Wage)
# Plot step function
plot(Wage$age, Wage$wage, col = "gray", main = "Step Function Fit")
lines(sort(Wage$age), fitted(fit.final.step)[order(Wage$age)], col = "red", lwd = 2)
# View structure of the Wage dataset
str(Wage)
## 'data.frame': 3000 obs. of 12 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : Factor w/ 5 levels "1. Never Married",..: 1 1 2 2 4 2 2 1 1 2 ...
## $ race : Factor w/ 4 levels "1. White","2. Black",..: 1 1 1 3 1 1 4 3 2 1 ...
## $ education : Factor w/ 5 levels "1. < HS Grad",..: 1 4 3 4 2 4 3 3 3 2 ...
## $ region : Factor w/ 9 levels "1. New England",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ jobclass : Factor w/ 2 levels "1. Industrial",..: 1 2 1 2 2 2 1 2 2 2 ...
## $ health : Factor w/ 2 levels "1. <=Good","2. >=Very Good": 1 2 1 2 1 2 2 1 2 2 ...
## $ health_ins: Factor w/ 2 levels "1. Yes","2. No": 2 2 1 1 1 1 1 1 1 1 ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
## $ cut.age : Factor w/ 8 levels "(17.9,25.8]",..: 1 1 4 4 5 5 4 2 3 5 ...
# Plot wage vs marital status
ggplot(Wage, aes(x = maritl, y = wage)) +
geom_boxplot(fill = "skyblue") +
labs(title = "Wage by Marital Status", x = "Marital Status", y = "Wage")
# Plot wage vs jobclass
ggplot(Wage, aes(x = jobclass, y = wage)) +
geom_boxplot(fill = "orange") +
labs(title = "Wage by Job Class", x = "Job Class", y = "Wage")
# Plot wage vs education
ggplot(Wage, aes(x = education, y = wage)) +
geom_boxplot(fill = "lightgreen") +
labs(title = "Wage by Education Level", x = "Education Level", y = "Wage")
# Fit a linear model with categorical predictors (each factor enters as dummy variables)
fit <- lm(wage ~ maritl + jobclass + education, data = Wage)
# Summary of the model
summary(fit)
##
## Call:
## lm(formula = wage ~ maritl + jobclass + education, data = Wage)
##
## Residuals:
## Min 1Q Median 3Q Max
## -109.75 -19.81 -2.74 14.85 219.37
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.632 2.502 26.635 < 2e-16 ***
## maritl2. Married 22.332 1.594 14.006 < 2e-16 ***
## maritl3. Widowed 7.309 8.214 0.890 0.373597
## maritl4. Divorced 9.723 2.833 3.432 0.000607 ***
## maritl5. Separated 15.783 4.972 3.174 0.001518 **
## jobclass2. Information 5.199 1.355 3.836 0.000127 ***
## education2. HS Grad 11.268 2.440 4.618 4.04e-06 ***
## education3. Some College 23.128 2.579 8.966 < 2e-16 ***
## education4. College Grad 37.956 2.584 14.689 < 2e-16 ***
## education5. Advanced Degree 61.881 2.840 21.793 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.27 on 2990 degrees of freedom
## Multiple R-squared: 0.2876, Adjusted R-squared: 0.2855
## F-statistic: 134.1 on 9 and 2990 DF, p-value: < 2.2e-16
Marital status (`maritl`):
- There are notable differences in wage based on marital status.
- Married individuals tend to have higher median wages than those who never married or are widowed.

Job class (`jobclass`):
- People in *Information* sector jobs tend to earn more than those in *Industrial* jobs.
- This aligns with the economic structure where tech and white-collar jobs often pay better.

Education (`education`):
- Wage tends to increase with higher education levels.
- Those with advanced degrees have visibly higher wages than those without high school diplomas.

Model notes:
- The linear model with categorical variables treats each category as a separate factor level, allowing a flexible mean estimate for each group.
- No transformation is needed; R automatically chooses a reference level and estimates effects relative to it, as illustrated below.
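As an optional illustration (an addition, not part of the original analysis), the encoding R uses for these factors can be inspected directly; the first level of each factor acts as the reference category.
# Reference level R chooses for each factor used in the model above
levels(Wage$maritl)[1]
levels(Wage$education)[1]
# Design matrix built by lm(): one indicator column per non-reference level
head(model.matrix(wage ~ maritl + jobclass + education, data = Wage))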
# Load required libraries
library(MASS)
## Warning: package 'MASS' was built under R version 4.4.3
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
# Load the Boston data
data("Boston")
# Fit a cubic polynomial regression to predict nox using dis
fit_cubic <- lm(nox ~ poly(dis, 3), data = Boston)
# Summarize the fit
summary(fit_cubic)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
# Plot the resulting data and polynomial fits
ggplot(Boston, aes(x = dis, y = nox)) +
geom_point(color = "gray") +
geom_smooth(method = "lm", formula = y ~ poly(x, 3), color = "blue") +
labs(title = "Cubic Polynomial Fit: nox vs dis", x = "Distance to Employment Centers", y = "Nitrogen Oxides Concentration (nox)")
# Fit polynomial regressions for degrees 1 to 10
rss_values <- rep(0, 10)
for (degree in 1:10) {
fit <- lm(nox ~ poly(dis, degree), data = Boston)
rss_values[degree] <- sum(resid(fit)^2)
}
# Plot the polynomial fits and report RSS
plot(1:10, rss_values, type = "b", xlab = "Degree", ylab = "Residual Sum of Squares (RSS)",
main = "RSS vs Polynomial Degree for Polynomial Fits")
# Load required libraries
library(splines)
# Remove rows with missing values in 'nox' and 'dis'
Boston_clean <- na.omit(Boston[, c("nox", "dis")])
# Initialize cv_errors vector to store cross-validation errors
cv_errors <- rep(NA, 10)
# Perform k-fold cross-validation for polynomial degrees 1 to 10
k <- 10 # Number of folds
folds <- sample(1:k, nrow(Boston_clean), replace = TRUE)
for (degree in 1:10) {
cv_error <- 0
# Perform cross-validation for each fold
for (i in 1:k) {
# Split data into training and validation sets
train_data <- Boston_clean[folds != i, ]
val_data <- Boston_clean[folds == i, ]
# Fit the polynomial model to the training data
fit <- lm(nox ~ poly(dis, degree), data = train_data)
# Predict on validation data
pred <- predict(fit, newdata = val_data)
# Calculate the residual sum of squares (RSS) for this fold
rss <- sum((val_data$nox - pred)^2)
# Accumulate the validation RSS for this fold (averaged over folds after the loop)
cv_error <- cv_error + rss
}
# Compute the average CV error for the degree
cv_errors[degree] <- cv_error / k
# Debugging: Print each cv error to check the results
cat("Degree:", degree, "CV Error:", cv_errors[degree], "\n")
}
## Degree: 1 CV Error: 0.281412
## Degree: 2 CV Error: 0.2078067
## Degree: 3 CV Error: 0.1963942
## Degree: 4 CV Error: 0.1978465
## Degree: 5 CV Error: 0.2119629
## Degree: 6 CV Error: 0.2562822
## Degree: 7 CV Error: 0.5853273
## Degree: 8 CV Error: 0.5162523
## Degree: 9 CV Error: 1.92793
## Degree: 10 CV Error: 1.173036
# Remove NA values from the cv_errors vector
cv_errors <- na.omit(cv_errors)
# Ensure that valid degrees are selected
valid_degrees <- 1:length(cv_errors)
# Plot CV errors to visualize
plot(valid_degrees, cv_errors, type = "b", xlab = "Degree", ylab = "Cross-Validation Error",
main = "CV Error vs Polynomial Degree")
# Report the optimal polynomial degree based on cross-validation
optimal_degree <- valid_degrees[which.min(cv_errors)]
cat("Optimal polynomial degree chosen by cross-validation:", optimal_degree, "\n")
## Optimal polynomial degree chosen by cross-validation: 3
# Fit a regression spline using bs() function with 4 degrees of freedom
fit_spline <- lm(nox ~ bs(dis, df = 4), data = Boston)
# Summarize the spline fit
summary(fit_spline)
##
## Call:
## lm(formula = nox ~ bs(dis, df = 4), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16 ***
## bs(dis, df = 4)1 -0.05810 0.02186 -2.658 0.00812 **
## bs(dis, df = 4)2 -0.46356 0.02366 -19.596 < 2e-16 ***
## bs(dis, df = 4)3 -0.19979 0.04311 -4.634 4.58e-06 ***
## bs(dis, df = 4)4 -0.38881 0.04551 -8.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
# Plot the regression spline fit
ggplot(Boston, aes(x = dis, y = nox)) +
geom_point(color = "gray") +
geom_smooth(method = "lm", formula = y ~ bs(x, df = 4), color = "red") +
labs(title = "Regression Spline Fit: nox vs dis", x = "Distance to Employment Centers", y = "Nitrogen Oxides Concentration (nox)")
`bs(dis, df = 4)`: We fit a regression spline with 4 degrees of freedom, which allows the model to flexibly capture the relationship between `dis` and `nox`. The fit is drawn with `geom_smooth()`, which is similar to polynomial fitting but more flexible; a quick check of the knot placement follows below.
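As a small added check (not in the original solution), we can confirm where `bs()` places its interior knot: with the default cubic basis, df = 4 implies a single interior knot, placed at the median of `dis`.
# Interior knot chosen by bs() with df = 4 (cubic basis => one interior knot)
attr(bs(Boston$dis, df = 4), "knots")
# It should coincide with the 50th percentile of dis
quantile(Boston$dis, 0.5)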
# Fit regression splines for a range of degrees of freedom (3 to 12)
rss_spline <- rep(NA, 10)
for (df in 3:12) {
fit_spline <- lm(nox ~ bs(dis, df = df), data = Boston)
rss_spline[df - 2] <- sum(resid(fit_spline)^2)
}
# Plot the resulting fits and report the RSS
plot(3:12, rss_spline, type = "b", xlab = "Degrees of Freedom", ylab = "Residual Sum of Squares (RSS)",
main = "RSS vs Degrees of Freedom for Regression Spline Fits")
`rss_spline[df - 2]`: We compute the residual sum of squares (RSS) for regression splines with degrees of freedom ranging from 3 to 12; because each model is fit and evaluated on the same data, the RSS generally keeps falling as the degrees of freedom increase. A cross-validation sketch for choosing the degrees of freedom follows below.
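Since the training RSS alone will favor the most flexible spline, a hedged sketch of the same manual k-fold procedure used for the polynomial fits can be applied to pick the degrees of freedom; it reuses the `folds`, `k`, and `Boston_clean` objects defined above and is an addition, not part of the original output.
# Sketch: 10-fold CV over spline degrees of freedom 3 to 12, reusing `folds` from above
cv_spline <- rep(NA, 10)
for (df_val in 3:12) {
fold_rss <- 0
for (i in 1:k) {
train_data <- Boston_clean[folds != i, ]
val_data <- Boston_clean[folds == i, ]
spline_fit <- lm(nox ~ bs(dis, df = df_val), data = train_data)
spline_pred <- predict(spline_fit, newdata = val_data)
fold_rss <- fold_rss + sum((val_data$nox - spline_pred)^2)
}
cv_spline[df_val - 2] <- fold_rss / k
}
# Degrees of freedom with the lowest average validation RSS
which.min(cv_spline) + 2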
# Fit a simple linear model to check if basic regression works
fit_linear <- lm(nox ~ dis, data = Boston)
# Summarize the linear model
summary(fit_linear)
##
## Call:
## lm(formula = nox ~ dis, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12239 -0.05212 -0.01257 0.04391 0.23041
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.715343 0.006796 105.26 <2e-16 ***
## dis -0.042331 0.001566 -27.03 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07412 on 504 degrees of freedom
## Multiple R-squared: 0.5917, Adjusted R-squared: 0.5909
## F-statistic: 730.4 on 1 and 504 DF, p-value: < 2.2e-16
# Plot the fitted linear model
plot(Boston$dis, Boston$nox, main = "Linear Fit: nox vs dis", xlab = "Distance to Employment Centers", ylab = "Nitrogen Oxides Concentration (nox)")
abline(fit_linear, col = "red")
set.seed(123) # For reproducibility
n <- 100 # Sample size
X1 <- rnorm(n, mean = 0, sd = sqrt(2)) # X1 with mean 0 and variance 2
X2 <- rnorm(n, mean = 0, sd = sqrt(2)) # X2 with mean 0 and variance 2
beta0 <- 1.5
beta1 <- 3
beta2 <- -4.5
epsilon <- rnorm(n, mean = 0, sd = 1) # Error term with variance 1
Y <- beta0 + beta1 * X1 + beta2 * X2 + epsilon # Response variable Y
Y
## [1] 5.84191325 0.20105468 9.41783176 4.55408339 7.69024494
## [6] 8.58671229 7.66200047 6.15289995 2.65658437 -6.29327034
## [11] 10.47407012 -0.59881304 14.72894765 1.80711735 -5.15622857
## [16] 8.84039905 2.49851830 -3.48928787 8.64681331 4.72691593
## [21] -4.35306414 7.22288810 1.37876838 0.74492550 -13.24974027
## [26] -1.44729996 2.85184197 0.93734906 3.67716256 6.25769550
## [31] -3.92843695 -2.71555607 5.24983067 7.17583053 17.47806994
## [36] -4.09514891 12.96260468 -3.05268386 -11.62324602 8.29314157
## [41] -6.70214223 1.78429758 7.63250279 19.20408313 16.63801093
## [46] 2.01608407 8.99228510 -6.21756723 -9.22066915 9.82236678
## [51] -2.81400277 -4.07714484 -1.13992545 13.81425382 2.90083534
## [56] 9.62969976 -7.57284562 6.98124593 -4.30559140 3.26707846
## [61] -4.10985815 5.55587321 8.15305410 -22.14710253 1.89871935
## [66] 2.43743525 -0.78266252 3.04711154 1.73477971 7.93890405
## [71] 1.63241736 -7.74996818 6.66809929 -16.44954330 4.14850372
## [76] 12.37945114 0.22612925 -5.58040285 -0.08064612 3.85242795
## [81] 6.62394635 -4.16775896 2.53861068 9.47629667 2.18637322
## [86] 4.29649001 -0.68897618 4.44790004 -4.90062989 9.71951827
## [91] 5.51874216 6.94712076 3.05619020 3.95660965 17.61713205
## [96] -13.69017303 8.82403225 14.61445570 4.41042773 5.93952856
X1
## [1] -0.79263226 -0.32552013 2.20434644 0.09971392 0.18284047 2.42546816
## [7] 0.65183395 -1.78906676 -0.97135662 -0.63026120 1.73111308 0.50885359
## [13] 0.56677642 0.15652900 -0.78607807 2.52707679 0.70406690 -2.78121665
## [19] 0.99186703 -0.66862802 -1.51013077 -0.30826308 -1.45098941 -1.03080786
## [25] -0.88393901 -2.38534456 1.18480980 0.21690234 -1.60956869 1.77316207
## [31] 0.60311149 -0.41729409 1.26589885 1.24186829 1.16189111 0.97388439
## [37] 0.78335786 -0.08755638 -0.43269655 -0.53806725 -0.98246403 -0.29403943
## [43] -1.78954068 3.06736694 1.70831624 -1.58831539 -0.56976520 -0.65995033
## [49] 1.10303725 -0.11790166 0.35824648 -0.04037121 -0.06062798 1.93549591
## [55] -0.31928839 2.14461330 -2.19026722 0.82676869 0.17515635 0.30538750
## [61] 0.53689131 -0.71039264 -0.47122640 -1.44048312 -1.51574169 0.42925432
## [67] 0.63386435 0.07495930 1.30428316 2.89925757 -0.69442293 -3.26565794
## [73] 1.42232906 -1.00296134 -0.97299112 1.45037694 -0.40272985 -1.72635554
## [79] 0.25640184 -0.19642205 0.00815179 0.54486877 -0.52419244 0.91128605
## [85] -0.31181509 0.46921055 1.55116461 0.61543957 -0.46093687 1.62465931
## [91] 1.40502663 0.77555042 0.33761766 -0.88799329 1.92425315 -0.84889525
## [97] 3.09335598 2.16743873 -0.33333064 -1.45157836
X2
## [1] -1.00466660 0.36328843 -0.34887500 -0.49149946 -1.34579188 -0.06367882
## [7] -1.11002255 -2.35882611 -0.53772150 1.29965747 -0.81366348 0.85979139
## [13] -2.28803167 -0.07857649 0.73455271 0.42589517 0.14944871 -0.90609513
## [19] -1.20166341 -1.44833683 0.16637741 -1.33993145 -0.69375299 -0.36216905
## [25] 2.60761465 -0.92199639 0.33288688 0.11025329 -1.36027070 -0.10084486
## [31] 2.04290342 0.63852316 0.05831216 -0.59750075 -2.90373007 1.59995243
## [37] -2.06565700 1.04644381 2.69988016 -2.04197329 0.99247292 -0.37080325
## [43] -2.22334759 -2.14206354 -2.26491418 -0.75081520 -2.06723457 0.97286123
## [49] 2.97000255 -1.82013595 1.11403096 1.08758997 0.46980539 -1.42605988
## [55] -0.16893150 -0.39653889 0.79618743 -0.52670794 1.38164901 -0.52973733
## [61] 1.48875883 -1.48376035 -1.78212864 4.58352263 -0.58952565 0.42175750
## [67] 0.90024547 -0.68416912 0.73095331 0.52179464 -0.30459403 0.09233829
## [73] -0.04817837 3.01008554 -1.04840756 -1.54997279 0.05344087 0.43908609
## [79] 0.61733742 -0.64822647 -1.50377024 1.78641361 -0.49448032 -1.22402003
## [85] -0.33414977 -0.27884882 1.56966433 0.11983663 1.06639309 -0.70610554
## [91] 0.30327147 -0.45917522 0.13376131 -1.26623500 -1.85375331 2.82448626
## [97] 0.84953057 -1.76956493 -0.86431913 -1.67652201
beta1_init <- 0
a <- Y - beta1_init * X1
beta2_est <- lm(a ~ X2)$coef[2]
a <- Y - beta2_est * X2
beta1_est <- lm(a ~ X1)$coef[2]
beta1_vals <- c(beta1_init) # Store beta1 values
beta2_vals <- c(beta2_est) # Store beta2 values
convergence <- FALSE
iterations <- 0
# Perform backfitting loop
while (!convergence && iterations < 1000) {
iterations <- iterations + 1
# Update beta2 by keeping beta1 fixed
a <- Y - beta1_vals[iterations] * X1
beta2_new <- lm(a ~ X2)$coef[2]
# Update beta1 by keeping beta2 fixed
a <- Y - beta2_new * X2
beta1_new <- lm(a ~ X1)$coef[2]
# Store new values
beta1_vals <- c(beta1_vals, beta1_new)
beta2_vals <- c(beta2_vals, beta2_new)
# Check for convergence (small change in coefficients)
if (abs(beta1_new - beta1_vals[iterations]) < 0.0001 && abs(beta2_new - beta2_vals[iterations]) < 0.0001) {
convergence <- TRUE
}
}
# Plot the results (coefficient estimates over iterations; ylim covers both series so the Beta2 line is visible)
plot(1:iterations, beta1_vals[2:(iterations + 1)], type = "b", col = "blue", ylim = range(c(beta1_vals[-1], beta2_vals[-1])), xlab = "Iterations", ylab = "Coefficient estimate", main = "Backfitting: Coefficient Convergence")
lines(1:iterations, beta2_vals[2:(iterations + 1)], type = "b", col = "red")
legend("topright", legend = c("Beta1", "Beta2"), col = c("blue", "red"), lty = 1)
# Re-plot the backfitting estimates so the multiple regression coefficients can be overlaid
plot(1:iterations, beta1_vals[2:(iterations + 1)], type = "b", col = "blue", ylim = range(c(beta1_vals[-1], beta2_vals[-1])), xlab = "Iterations", ylab = "Coefficient estimate", main = "Backfitting vs Multiple Regression")
lines(1:iterations, beta2_vals[2:(iterations + 1)], type = "b", col = "red")
# Now perform multiple linear regression
lm_fit <- lm(Y ~ X1 + X2)
# Overlay the multiple regression coefficients as horizontal dashed lines
abline(h = lm_fit$coef[2], col = "blue", lty = 2) # Beta1 from lm()
abline(h = lm_fit$coef[3], col = "red", lty = 2) # Beta2 from lm()
# Perform multiple linear regression
lm_fit <- lm(Y ~ X1 + X2)
beta0_true <- lm_fit$coef[1]
beta1_true <- lm_fit$coef[2]
beta2_true <- lm_fit$coef[3]
# Initialize backfitting
beta1_vals <- c(beta1_init) # Store beta1 values
beta2_vals <- c(beta2_est) # Store beta2 values
convergence <- FALSE
iterations <- 0
convergence_threshold <- 0.01 # Define good approximation as less than 0.01 difference
# Perform backfitting loop
while (!convergence && iterations < 1000) {
iterations <- iterations + 1
# Update beta2 by keeping beta1 fixed
a <- Y - beta1_vals[iterations] * X1
beta2_new <- lm(a ~ X2)$coef[2]
# Update beta1 by keeping beta2 fixed
a <- Y - beta2_new * X2
beta1_new <- lm(a ~ X1)$coef[2]
# Store new values
beta1_vals <- c(beta1_vals, beta1_new)
beta2_vals <- c(beta2_vals, beta2_new)
# Check for convergence
if (abs(beta1_new - beta1_true) < convergence_threshold && abs(beta2_new - beta2_true) < convergence_threshold) {
convergence <- TRUE
}
}
# Report the number of iterations required for good approximation
cat("Number of backfitting iterations for good approximation:", iterations, "\n")
## Number of backfitting iterations for good approximation: 2