# Loop for cross-validation of polynomial degrees 1-10
set.seed(17)
cv.error.10 <- rep(0, 10)
# cv.glm() comes from the boot package (assumed to be loaded earlier)
for (i in 1:10) {
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error.10[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
# Extracting Optimal Degree
optimal_degree <- which.min(cv.error.10)
optimal_degree
## [1] 2
# Overall F-test p-value for the poly(age, i) term at each degree
anova_results <- sapply(1:10, function(i) {
  lmfit <- lm(wage ~ poly(age, i), data = Wage)
  anova(lmfit)$"Pr(>F)"[1]
})
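An alternative, closer to the ISLR approach, is to fit nested polynomial models and compare them with sequential F-tests; a sketch for the first five degrees:
# Sequential ANOVA across nested polynomial fits (degrees 1-5 as an illustration)
fits <- lapply(1:5, function(i) lm(wage ~ poly(age, i), data = Wage))
do.call(anova, fits)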
fit2 <- lm(wage ~ poly(age, 2), data = Wage)
# Plotting CV MSE by polynomial degree
plot(1:10, cv.error.10, xlab = "Degree", ylab = "MSE", type = "l")
points(optimal_degree, cv.error.10[optimal_degree], col = "red", cex = 2, pch = 19)
# Plotting the degree-2 polynomial fit
agelims <- range(Wage$age)
age.grid <- seq(from = agelims[1], to = agelims[2])
preds <- predict(fit2, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit,
                  preds$fit - 2 * preds$se.fit)
par(mfrow = c(1, 2), mar = c(4.5, 4.5, 1, 1),
    oma = c(0, 0, 4, 0))
plot(Wage$age, Wage$wage, xlim = agelims, cex = .5, col = "darkgrey")
title("Degree-2 Polynomial", outer = T)
lines(age.grid, preds$fit, lwd = 2, col = "blue")
matlines(age.grid, se.bands, lwd = 1, col = "blue", lty = 3)
# Cross-validating step functions with 2 to 10 cuts
set.seed(123)
cv.error <- rep(NA, 10)
for (i in 2:10) {
  Wage$age.cut <- cut(Wage$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.error[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
# Extracting the optimal number of cuts
optimal_cuts <- which.min(cv.error)
optimal_cuts
## [1] 8
# Plotting CV MSE by number of cuts
plot(2:10, cv.error[-1], xlab = "Cuts", ylab = "MSE", type = "l")
points(optimal_cuts, cv.error[optimal_cuts], col = "red", cex = 2, pch = 19)
# plotting Optimal Cut
plot(wage ~ age, data = Wage, col = "darkgrey")
fit <- glm(wage ~ cut(age, 8), data = Wage)
preds <- predict(fit, list(age = age.grid))
lines(age.grid, preds, col = "red", lwd = 2)
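For comparison with the polynomial plot, standard-error bands can be added to the step-function fit as well; a sketch using predict() with se.fit = TRUE:
# Approximate 95% confidence bands for the 8-cut step function
preds.cut <- predict(fit, newdata = list(age = age.grid), se.fit = TRUE)
matlines(age.grid, cbind(preds.cut$fit + 2 * preds.cut$se.fit,
                         preds.cut$fit - 2 * preds.cut$se.fit),
         lwd = 1, col = "red", lty = 3)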
Since the rest of this assignment considers the other non-linear approaches, here I explore the remaining predictors: I fit linear models that add relevant variables and compare them through ANOVA.
boxplot(wage ~ education, data = Wage, col = c("lavender"), xlab = "education", ylab = "wage")
boxplot(wage ~ maritl, data = Wage, col = c("blue"), xlab = "mar. stat.", ylab = "wage")
boxplot(wage ~ race, data = Wage, col = c("pink"), xlab = "race", ylab = "wage")
boxplot(wage ~ health, data = Wage, col = c("yellow"), xlab = "health", ylab = "wage")
boxplot(wage ~ jobclass, data = Wage, col = c("red"), xlab = "jobclass", ylab = "wage")
boxplot(wage ~ year, data = Wage, col = c("purple"), xlab = "year", ylab = "wage")
There are noticeable differences in wage across factors such as race, marital status, education, and health. Let's try a few of them.
fit.2 <- lm(wage ~ education + poly(age, 2), data = Wage)
fit.3 <- lm(wage ~ education + health + poly(age, 2), data = Wage)
fit.4 <- lm(wage ~ education + health + race + poly(age, 2), data = Wage)
anova(fit.2, fit.3, fit.4)
## Analysis of Variance Table
##
## Model 1: wage ~ education + poly(age, 2)
## Model 2: wage ~ education + health + poly(age, 2)
## Model 3: wage ~ education + health + race + poly(age, 2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2993 3725395
## 2 2992 3685535 1 39860 32.4204 1.362e-08 ***
## 3 2989 3674876 3 10659 2.8899 0.03419 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fit a GAM for mpg with smoothing splines (gam() from the gam package, assumed loaded earlier)
gam_auto <- gam(mpg ~ s(displacement, 3) + s(weight, 3) + s(acceleration, 2), data = Auto)
par(mfrow = c(1, 3))
plot(gam_auto, se = TRUE , col = "blue")
summary(gam_auto)
##
## Call: gam(formula = mpg ~ s(displacement, 3) + s(weight, 3) + s(acceleration,
## 2), data = Auto)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -12.4669 -2.4244 -0.2371 1.8486 17.4322
##
## (Dispersion Parameter for gaussian family taken to be 16.4602)
##
## Null Deviance: 23818.99 on 391 degrees of freedom
## Residual Deviance: 6304.25 on 382.9998 degrees of freedom
## AIC: 2221.313
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(displacement, 3) 1 15870.6 15870.6 964.1793 < 2.2e-16 ***
## s(weight, 3) 1 905.6 905.6 55.0156 7.769e-13 ***
## s(acceleration, 2) 1 127.8 127.8 7.7629 0.005598 **
## Residuals 383 6304.2 16.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(displacement, 3) 2 5.1132 0.006434 **
## s(weight, 3) 2 10.1768 4.938e-05 ***
## s(acceleration, 2) 1 2.7257 0.099562 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the nonparametric ANOVA, displacement and weight show clearly significant non-linear relationships with mpg, while the non-linear term for acceleration is only marginal.
plot(Auto$displacement, Auto$mpg, xlab = "displacement", ylab = "mpg")
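To check whether the smooth term for acceleration is needed at all, one option is an F-test between nested GAMs (a sketch using the gam package's anova method):
# Compare a GAM with a linear acceleration term against the smooth-term fit
gam_lin_acc <- gam(mpg ~ s(displacement, 3) + s(weight, 3) + acceleration, data = Auto)
anova(gam_lin_acc, gam_auto, test = "F")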
9. This question uses the variables dis (the weighted mean of distances
to five Boston employment centers) and nox (nitrogen oxides
concentration in parts per 10 million) from the Boston data. We will
treat dis as the predictor and nox as the response.
boston_fit<-lm(nox ~ poly(dis, 3), data = Boston)
summary(boston_fit)
##
## Call:
## lm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06207 on 502 degrees of freedom
## Multiple R-squared: 0.7148, Adjusted R-squared: 0.7131
## F-statistic: 419.3 on 3 and 502 DF, p-value: < 2.2e-16
x_seq <- seq(min(Boston$dis), max(Boston$dis), length.out = 506)
predictions <- predict(boston_fit, newdata = list(dis = x_seq))
plot(Boston$dis, Boston$nox, col = "blue", xlab = "dis", ylab = "nox", main = "Cubic Polynomial Regression")
lines(x_seq, predictions, col = "red", lwd = 2)
# Training RSS for polynomial degrees 1 to 10
set.seed(123)
rss <- rep(0, 10)
for (i in 1:10) {
  bos_fit <- lm(nox ~ poly(dis, i), data = Boston)
  rss[i] <- sum(bos_fit$residuals^2)
}
rss
## [1] 2.768563 2.035262 1.934107 1.932981 1.915290 1.878257 1.849484 1.835630
## [9] 1.833331 1.832171
# 10-fold CV error for polynomial degrees 1 to 10
set.seed(42)
cv.errors <- rep(0, 10)
for (i in 1:10) {
  bos.fit <- glm(nox ~ poly(dis, i), data = Boston)
  cv.errors[i] <- cv.glm(Boston, bos.fit, K = 10)$delta[1]
}
# Extracting optimal degree
optimal_degree <- which.min(cv.errors)
optimal_degree
## [1] 4
plot(1:10, cv.errors, xlab = "Degree", ylab = "MSE", type = "l")
points(optimal_degree, cv.errors[optimal_degree], col = "red", cex = 2, pch = 19)
Through cross-validation, the optimal polynomial degree is 4.
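For reference, the CV-selected degree-4 polynomial can be overlaid on the data (a sketch reusing x_seq from above):
# Plot the degree-4 fit chosen by cross-validation
fit4 <- lm(nox ~ poly(dis, 4), data = Boston)
plot(Boston$dis, Boston$nox, col = "blue", xlab = "dis", ylab = "nox", main = "Degree-4 Polynomial")
lines(x_seq, predict(fit4, newdata = list(dis = x_seq)), col = "red", lwd = 2)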
library(splines)
dislims <- range(Boston$dis)
dis.grid <- seq(from = dislims[1], to = dislims[2])
attr(bs(Boston$dis, df = 4), "knots")
## [1] 3.20745
fit <- lm(nox ~ bs(dis, knots = c(3.20745)), data = Boston)
summary(fit)
##
## Call:
## lm(formula = nox ~ bs(dis, knots = c(3.20745)), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.124622 -0.039259 -0.008514 0.020850 0.193891
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.73447 0.01460 50.306 < 2e-16 ***
## bs(dis, knots = c(3.20745))1 -0.05810 0.02186 -2.658 0.00812 **
## bs(dis, knots = c(3.20745))2 -0.46356 0.02366 -19.596 < 2e-16 ***
## bs(dis, knots = c(3.20745))3 -0.19979 0.04311 -4.634 4.58e-06 ***
## bs(dis, knots = c(3.20745))4 -0.38881 0.04551 -8.544 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06195 on 501 degrees of freedom
## Multiple R-squared: 0.7164, Adjusted R-squared: 0.7142
## F-statistic: 316.5 on 4 and 501 DF, p-value: < 2.2e-16
pred <- predict(fit, newdata = list(dis = dis.grid), se.fit = TRUE)
plot(Boston$dis, Boston$nox, col = "gray")
lines(dis.grid, pred$fit, lwd = 2)
lines(dis.grid, pred$fit + 2 * pred$se.fit, lty = "dashed")
lines(dis.grid, pred$fit - 2 * pred$se.fit, lty = "dashed")
The knot was chosen automatically by bs() when specifying df = 4; with four degrees of freedom and a cubic basis there is room for only one interior knot, placed at 3.21.
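As a quick check (a sketch): bs() places its knots at quantiles of the data, so the single knot reported above should coincide with the median of dis.
# The interior knot from bs(dis, df = 4) sits at the 50th percentile of dis
median(Boston$dis)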
set.seed(123)
# Training RSS for regression splines with 4 to 15 degrees of freedom
rss <- numeric(15)
for (i in 4:15) {
  bos.spline <- lm(nox ~ bs(dis, df = i), data = Boston)
  rss[i] <- sum(bos.spline$residuals^2)
}
rss
## [1] 0.000000 0.000000 0.000000 1.922775 1.840173 1.833966 1.829884 1.816995
## [9] 1.825653 1.792535 1.796992 1.788999 1.782350 1.781838 1.782798
plot(4:15, rss[4:15], type = "b", xlab = "Degrees of freedom", ylab = "RSS")
# 10-fold CV error for regression splines with 4 to 15 degrees of freedom
set.seed(123)
cv.spline <- rep(0, 15)
for (i in 4:15) {
  bos.spline <- glm(nox ~ bs(dis, df = i), data = Boston)
  cv.spline[i] <- cv.glm(Boston, bos.spline, K = 10)$delta[1]
}
plot(4:15, cv.spline[4:15], xlab = "Degrees of freedom", ylab = "MSE", type = "l")
Cross-validation suggests that df = 14 yields the lowest estimated test MSE.
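As a follow-up sketch, the spline can be refit at the CV-selected degrees of freedom and overlaid on the data:
# Refit with the df chosen by cross-validation and plot the resulting spline
best.df <- which.min(cv.spline[4:15]) + 3
fit.best <- lm(nox ~ bs(dis, df = best.df), data = Boston)
plot(Boston$dis, Boston$nox, col = "gray", xlab = "dis", ylab = "nox")
lines(dis.grid, predict(fit.best, newdata = list(dis = dis.grid)), col = "red", lwd = 2)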
set.seed(123)
train_indices <- sample(1:nrow(College), nrow(College) / 2)
train_data <- College[train_indices, ]
test_data <- College[-train_indices, ]
# Forward stepwise selection (regsubsets() from the leaps package, assumed loaded earlier)
regfit.fwd <- regsubsets(Outstate ~ ., data = train_data, nvmax = 17, method = "forward")
fwd_summary <- summary(regfit.fwd)
fwd_min_bic <- which.min(fwd_summary$bic)
fwd_min_cp <- which.min(fwd_summary$cp)
fwd_max_adjr2 <- which.max(fwd_summary$adjr2)
# Plot BIC, Cp, and adjusted R-squared
par(mfrow = c(2, 2))
plot(fwd_summary$bic, type = "b", xlab = "Number of Variables", ylab = "BIC", main = "BIC vs Number of Variables")
points(fwd_min_bic, fwd_summary$bic[fwd_min_bic], col = "red", pch = 16)
plot(fwd_summary$cp, type = "b", xlab = "Number of Variables", ylab = "Cp", main = "Cp vs Number of Variables")
points(fwd_min_cp, fwd_summary$cp[fwd_min_cp], col = "red", pch = 16)
plot(fwd_summary$adjr2, type = "b", xlab = "Number of Variables", ylab = "Adjusted R-squared", main = "Adjusted R-squared vs Number of Variables")
points(fwd_max_adjr2, fwd_summary$adjr2[fwd_max_adjr2], col = "red", pch = 16)
A model with roughly 11 predictors appears to perform best across BIC, Cp, and adjusted R-squared.
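To see which variables make up a model of that size (a sketch using the coef method from leaps), useful when choosing terms for the GAM below:
# Predictors selected in the 11-variable forward-stepwise model
names(coef(regfit.fwd, id = 11))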
gam.m1<-gam(Outstate ~ Private + s(Apps, 2) + s(Accept) + s(Top25perc, 3) + s(F.Undergrad, 2) + s(Room.Board) + s(PhD, 2) + s(perc.alumni, 3) + s(Expend, 2) + s(Grad.Rate), data = train_data)
par(mfrow = c(1, 3))
plot(gam.m1, se = TRUE , col = "blue")
(c) Evaluate the model obtained on the test set, and explain the results
obtained.
preds <- predict(gam.m1, newdata = test_data)
err <- mean((preds - test_data$Outstate)^2)
err
## [1] 3703063
tss <- mean((test_data$Outstate - mean(test_data$Outstate))^2)
test_r2 <- 1 - err / tss
test_r2
## [1] 0.7619557
The test MSE corresponds to an R-squared of about 0.76, so the GAM explains roughly 76% of the variance in out-of-state tuition on the held-out test set.
summary(gam.m1)
##
## Call: gam(formula = Outstate ~ Private + s(Apps, 2) + s(Accept) + s(Top25perc,
## 3) + s(F.Undergrad, 2) + s(Room.Board) + s(PhD, 2) + s(perc.alumni,
## 3) + s(Expend, 2) + s(Grad.Rate), data = train_data)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6592.54 -1076.03 82.27 1230.29 8355.16
##
## (Dispersion Parameter for gaussian family taken to be 3544258)
##
## Null Deviance: 6505853731 on 387 degrees of freedom
## Residual Deviance: 1275932360 on 359.9998 degrees of freedom
## AIC: 6981.4
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2034279795 2034279795 573.9649 < 2.2e-16 ***
## s(Apps, 2) 1 939551224 939551224 265.0911 < 2.2e-16 ***
## s(Accept) 1 65276430 65276430 18.4175 2.283e-05 ***
## s(Top25perc, 3) 1 426503525 426503525 120.3365 < 2.2e-16 ***
## s(F.Undergrad, 2) 1 97642501 97642501 27.5495 2.622e-07 ***
## s(Room.Board) 1 286861205 286861205 80.9369 < 2.2e-16 ***
## s(PhD, 2) 1 88858466 88858466 25.0711 8.669e-07 ***
## s(perc.alumni, 3) 1 153617026 153617026 43.3425 1.628e-10 ***
## s(Expend, 2) 1 335269096 335269096 94.5950 < 2.2e-16 ***
## s(Grad.Rate) 1 25752058 25752058 7.2659 0.007358 **
## Residuals 360 1275932360 3544258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Apps, 2) 1 0.276 0.599838
## s(Accept) 3 5.501 0.001053 **
## s(Top25perc, 3) 2 1.439 0.238433
## s(F.Undergrad, 2) 1 5.247 0.022567 *
## s(Room.Board) 3 2.832 0.038253 *
## s(PhD, 2) 1 2.415 0.121046
## s(perc.alumni, 3) 2 0.177 0.837980
## s(Expend, 2) 1 43.405 1.582e-10 ***
## s(Grad.Rate) 3 3.046 0.028798 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the nonparametric ANOVA, Accept, F.Undergrad, Room.Board, Expend, and Grad.Rate show evidence of a non-linear relationship with Outstate, while the non-linear terms for Apps, Top25perc, PhD, and perc.alumni are not significant.
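As a sanity check (a sketch), the smooth for PhD can be replaced with a linear term and the two models compared with an F-test:
# gam.m2: same model as gam.m1 but with PhD entering linearly
gam.m2 <- gam(Outstate ~ Private + s(Apps, 2) + s(Accept) + s(Top25perc, 3) +
                s(F.Undergrad, 2) + s(Room.Board) + PhD + s(perc.alumni, 3) +
                s(Expend, 2) + s(Grad.Rate), data = train_data)
anova(gam.m2, gam.m1, test = "F")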
Suppose that we would like to perform multiple linear regression, but we do not have software to do so. Instead, we only have software to perform simple linear regression. Therefore, we take the following iterative approach: we repeatedly hold all but one coefficient estimate fixed at its current value, and update only that coefficient estimate using a simple linear regression. The process is continued until convergence, that is, until the coefficient estimates stop changing. We now try this out on a toy example.
(a) Generate a response Y and two predictors X1 and X2, with n = 100.
# Simulate two predictors and a response with known coefficients
x1 <- rnorm(100, 0, .25)
x2 <- rnorm(100, 0, .5)
e <- rnorm(100, 0, 2)
y <- 2 + 3 * x1 + 2 * x2 + e
# Initial coefficient values
beta0 <- 2
beta1 <- 1
You can do this as follows:
a <- y - beta1 * x1
beta2 <- lm(a ~ x2)$coef[2]
a <- y - beta2 * x2
beta1 <- lm(a ~ x1)$coef[2]
# Repeat the backfitting updates for 1000 iterations
beta1_int <- 1
beta0 <- c()
beta1 <- c()
beta2 <- c()
# beta_2:
a <- y - beta1_int * x1
beta2[1] <- lm(a ~ x2)$coef[2]
# beta_1:
b <- y - beta2[1] * x2
lm_coef <- lm(b ~ x1)$coef
beta1[1] <- lm_coef[2]
# beta_0:
beta0[1] <- lm_coef[1]
for (i in 2:1000) {
  a <- y - beta1[i - 1] * x1
  beta2[i] <- lm(a ~ x2)$coef[2]
  b <- y - beta2[i] * x2   # use the freshly updated beta2, matching the first iteration above
  lm_coef <- lm(b ~ x1)$coef
  beta1[i] <- lm_coef[2]
  beta0[i] <- lm_coef[1]
}
iteration <- 1:1000
df <- data.frame(Iteration = iteration, Beta0 = beta0, Beta1 = beta1, Beta2 = beta2)
# Melt the data to long format for plotting with ggplot2
library(reshape2)
library(ggplot2)
df_melted <- melt(df, id.vars = "Iteration")
# Plot the coefficient paths across iterations
ggplot(df_melted, aes(x = Iteration, y = value, color = variable)) +
  geom_line() +
  labs(x = "Iteration", y = "Coefficient Estimate", color = "Coefficient") +
  theme_minimal()
# lm model
multiple_model <- lm(y ~ x1 + x2)
# Extract coefficients
multiple_beta0 <- coef(multiple_model)[1]
multiple_beta1 <- coef(multiple_model)[2]
multiple_beta2 <- coef(multiple_model)[3]
iteration <- 1:1000
# Data frame for coefficients from the iterative approach
df_iterative <- data.frame(
Iteration = iteration,
Beta0 = beta0,
Beta1 = beta1,
Beta2 = beta2
)
# Melt the iterative data frame
library(reshape2)
df_iterative_melted <- melt(df_iterative, id.vars = "Iteration")
# Data frame for coefficients from the lm model
df_lm <- data.frame(
Iteration = rep(0, 3),
Coefficient = c(multiple_beta0, multiple_beta1, multiple_beta2),
Variable = c("Beta0", "Beta1", "Beta2")
)
# Plot the backfitting paths with the lm() coefficients as dashed reference lines
ggplot(df_iterative_melted, aes(x = Iteration, y = value, color = variable, linetype = variable)) +
  geom_line() +
  geom_hline(data = df_lm, aes(yintercept = Coefficient, color = Variable), linetype = "dashed") +
  labs(x = "Iteration", y = "Coefficient Estimate", color = "Coefficient", linetype = "Coefficient") +
  theme_minimal()
The backfitting estimates settle almost immediately at values essentially identical to the multiple regression coefficients.
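A quick numeric comparison backs this up (a sketch; the 1e-8 tolerance is an arbitrary choice):
# Final backfitting estimates next to the multiple regression coefficients
rbind(backfitting = c(beta0[1000], beta1[1000], beta2[1000]),
      lm = coef(multiple_model))
# First iteration at which beta1 changes by less than 1e-8
which(abs(diff(beta1)) < 1e-8)[1]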