a.)Perform polynomial regression to predict wage using age . Use cross-validation to select the optimal degree d for the polynomial. What degree was chosen, and how does this compare to the results of hypothesis testing using ANOVA? Make a plot of the resulting polynomial fit to the data.
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.3.2
csv_file<-"C:/Masters/Predictive Modeling/ALL CSV FILES - 2nd Edition/Wage.csv"
Wage<-read.csv(csv_file)
str(Wage)
## 'data.frame': 3000 obs. of 11 variables:
## $ year : int 2006 2004 2003 2003 2005 2008 2009 2008 2006 2004 ...
## $ age : int 18 24 45 43 50 54 44 30 41 52 ...
## $ maritl : chr "1. Never Married" "1. Never Married" "2. Married" "2. Married" ...
## $ race : chr "1. White" "1. White" "1. White" "3. Asian" ...
## $ education : chr "1. < HS Grad" "4. College Grad" "3. Some College" "4. College Grad" ...
## $ region : chr "2. Middle Atlantic" "2. Middle Atlantic" "2. Middle Atlantic" "2. Middle Atlantic" ...
## $ jobclass : chr "1. Industrial" "2. Information" "1. Industrial" "2. Information" ...
## $ health : chr "1. <=Good" "2. >=Very Good" "1. <=Good" "2. >=Very Good" ...
## $ health_ins: chr "2. No" "2. No" "1. Yes" "1. Yes" ...
## $ logwage : num 4.32 4.26 4.88 5.04 4.32 ...
## $ wage : num 75 70.5 131 154.7 75 ...
set.seed(123)
train <- sample(nrow(Wage), nrow(Wage)/2)
test <- (-train)
Wage.train <- Wage[train, ]
Wage.test <- Wage[test, ]
library(boot)
cv.error <- rep(0, 10)
for (d in 1:10) {
fit <- glm(wage ~ poly(age, d), data = Wage.train)
cv.error[d] <- cv.glm(Wage, fit, K = 10)$delta[1]
}
opt.d <- which.min(cv.error)
cat("The optimal degree is", opt.d, "\n")
## The optimal degree is
plot(wage ~ age, data = Wage, col = "pink")
age.range <- range(Wage$age)
age.grid <- seq(from = age.range[1], to = age.range[2])
fit <- lm(wage ~ poly(age, 3), data = Wage)
preds <- predict(fit, newdata = list(age = age.grid))
lines(age.grid, preds, col = "blue", lwd = 2)
plot(Wage.train$age, Wage.train$wage, col = "gray", xlab = "Age", ylab = "Wage")
points(Wage.test$age, Wage.test$wage, col = "gray", pch = 20)
age.grid <- seq(from = min(Wage$age), to = max(Wage$age))
preds <- predict(fit, newdata = list(age = age.grid), se = TRUE)
se.bands <- cbind(preds$fit + 2 * preds$se.fit, preds$fit - 2 * preds$se.fit)
matlines(age.grid, se.bands, col = "blue", lty = 2)
lines(age.grid, preds$fit, lwd = 2, col = "red")
csv_file<-"C:/Masters/Predictive Modeling/ALL CSV FILES - 2nd Edition/College.csv"
College<-read.csv(csv_file)
str(College)
## 'data.frame': 777 obs. of 19 variables:
## $ X : chr "Abilene Christian University" "Adelphi University" "Adrian College" "Agnes Scott College" ...
## $ Private : chr "Yes" "Yes" "Yes" "Yes" ...
## $ Apps : int 1660 2186 1428 417 193 587 353 1899 1038 582 ...
## $ Accept : int 1232 1924 1097 349 146 479 340 1720 839 498 ...
## $ Enroll : int 721 512 336 137 55 158 103 489 227 172 ...
## $ Top10perc : int 23 16 22 60 16 38 17 37 30 21 ...
## $ Top25perc : int 52 29 50 89 44 62 45 68 63 44 ...
## $ F.Undergrad: int 2885 2683 1036 510 249 678 416 1594 973 799 ...
## $ P.Undergrad: int 537 1227 99 63 869 41 230 32 306 78 ...
## $ Outstate : int 7440 12280 11250 12960 7560 13500 13290 13868 15595 10468 ...
## $ Room.Board : int 3300 6450 3750 5450 4120 3335 5720 4826 4400 3380 ...
## $ Books : int 450 750 400 450 800 500 500 450 300 660 ...
## $ Personal : int 2200 1500 1165 875 1500 675 1500 850 500 1800 ...
## $ PhD : int 70 29 53 92 76 67 90 89 79 40 ...
## $ Terminal : int 78 30 66 97 72 73 93 100 84 41 ...
## $ S.F.Ratio : num 18.1 12.2 12.9 7.7 11.9 9.4 11.5 13.7 11.3 11.5 ...
## $ perc.alumni: int 12 16 30 37 2 11 26 37 23 15 ...
## $ Expend : int 7041 10527 8735 19016 10922 9727 8861 11487 11644 8991 ...
## $ Grad.Rate : int 60 56 54 59 15 55 63 73 80 52 ...
set.seed(5)
train <- sample(nrow(College), nrow(College)/4)
test <- (-train)
College.train <- College[train, ]
College.test <- College[test, ]
coef(fit, 6)
## (Intercept) poly(age, 3)1 poly(age, 3)2 poly(age, 3)3
## 111.7036 447.0679 -478.3158 125.5217
library(gam)
## Warning: package 'gam' was built under R version 4.3.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.3.2
## Loaded gam 1.22-3
gam.mod <- gam(Outstate ~ Private + s(Room.Board, 4) + s(Terminal, 4) + s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4), data = College, subset = train)
plot(gam.mod, se = TRUE)
## Warning in gplot.default(x = c("No", "No", "Yes", "Yes", "Yes", "Yes", "Yes", :
## The "x" component of "partial for Private" has class "character"; no gplot()
## methods available
(c) Evaluate the model obtained on the test set, and explain the results
obtained
pred.gam <- predict(gam.mod, College[test,])
(mse.gam <- mean((College[test,'Outstate']-pred.gam)^2))
## [1] 3807477
tss.gam <- mean((College[test,'Outstate'] - mean(College[test,'Outstate']))^2)
(rss.test = 1 - mse.gam/tss.gam)
## [1] 0.7655204
summary(gam.mod)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 4) + s(Terminal,
## 4) + s(perc.alumni, 4) + s(Expend, 4) + s(Grad.Rate, 4),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5093.1 -931.3 73.7 1183.0 3755.3
##
## (Dispersion Parameter for gaussian family taken to be 3053873)
##
## Null Deviance: 3081045758 on 193 degrees of freedom
## Residual Deviance: 525265433 on 171.9998 degrees of freedom
## AIC: 3469.99
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 800522655 800522655 262.1336 < 2.2e-16 ***
## s(Room.Board, 4) 1 732080071 732080071 239.7219 < 2.2e-16 ***
## s(Terminal, 4) 1 300944143 300944143 98.5451 < 2.2e-16 ***
## s(perc.alumni, 4) 1 199410159 199410159 65.2975 1.091e-13 ***
## s(Expend, 4) 1 204161196 204161196 66.8532 6.166e-14 ***
## s(Grad.Rate, 4) 1 15087815 15087815 4.9406 0.02754 *
## Residuals 172 525265433 3053873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 4) 3 2.2509 0.0842 .
## s(Terminal, 4) 3 2.0745 0.1054
## s(perc.alumni, 4) 3 1.4466 0.2310
## s(Expend, 4) 3 8.7914 1.865e-05 ***
## s(Grad.Rate, 4) 3 0.8806 0.4524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The summary statistics show that there isnt a strong non-linear relationship between certain variables:Expend, Grad.Rate, and Personal