#QUESTION 6 PART A)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.4.3
library(boot)
attach(Wage)
set.seed(1)
poly.mse=c()
for(degree in 1:10){
fit=glm(wage~poly(age,degree,raw=T),data=Wage)
mse=cv.glm(fit,data=Wage,K=10)$delta[1]
poly.mse=c(poly.mse,mse)
}
plot(poly.mse,xlab='Degree',ylab='Test MSE',type='l')
x=which.min(poly.mse)
points(x,poly.mse[x],pch=20,cex=2,col='blue')

#PLOT OF THE RESULTING POLYNOMIAL FIT
plot(wage~age, data=Wage)
age.range=range(Wage$age)
age.grid=seq(from=age.range[1], to=age.range[2])
fit=lm(wage~poly(age,4), data=Wage)
preds=predict(fit, newdata=list(age=age.grid))
lines(age.grid, preds, col="red", lwd=2)

#QUESTION 6 PART B)
set.seed(1)
step.mse=c()
for(degree in 2:10){
Wage.model=model.frame(wage~cut(age,degree),data=Wage)
names(Wage.model)=c('wage','age')
step.fit=glm(wage~age,data=Wage.model)
mse=cv.glm(step.fit,data=Wage.model,K=10)$delta[1]
step.mse=c(step.mse,mse)
}
plot(step.mse,xlab='Cuts',ylab='Test MSE',type='l')
x=which.min(step.mse)
points(x,step.mse[x],pch=20,cex=2,col='red')

#The optimal number of cuts is 7 from the plot above.
plot(wage~age,data=Wage)
fit=glm(wage~cut(age,7), data=Wage)
preds=predict(fit, list(age=age.grid))
lines(age.grid, preds, col="red", lwd=2)

#QUESTION 10 PART A)
library(ISLR)
## Warning: package 'ISLR' was built under R version 4.4.2
##
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
##
## Auto, Credit
library(leaps)
## Warning: package 'leaps' was built under R version 4.4.3
set.seed(1)
train=sample(dim(College)[1], dim(College)[1]/2)
test=-train
fit=regsubsets(Outstate~., data=College, subset=train, method='forward')
summary(fit)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " "*" " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " "*" " " " " " " " " " "
## 2 ( 1 ) " " "*" " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " " " " " " "
## 6 ( 1 ) " " "*" " " " " " " "*" " "
## 7 ( 1 ) " " "*" " " "*" " " "*" " "
## 8 ( 1 ) " " "*" " " "*" " " "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " " " " "
## 2 ( 1 ) "*" " " " "
## 3 ( 1 ) "*" "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" "*"
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
coef(fit,id=6)
## (Intercept) PrivateYes Room.Board Terminal perc.alumni
## -4726.8810613 2717.7019276 1.1032433 36.9990286 59.0863753
## Expend Grad.Rate
## 0.1930814 33.8303314
#QUESTION 10 PART B)
library(gam)
## Warning: package 'gam' was built under R version 4.4.2
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 4.4.2
## Loaded gam 1.22-5
gam.fit=gam(Outstate~Private+s(Room.Board)+s(Terminal)+s(perc.alumni)+s(Expend)+s(Grad.Rate),data=College[train,])
par(mfrow=c(2, 3))
plot(gam.fit,se=T,col='red')

#Looking at the plots and the shape of the fit curves, we can see that Expend, Grad.Rate have non-linear relationship with outstate.
#QUESTION 10 PART C)
gam.pred=predict(gam.fit, College[test,])
mean((College[test,"Outstate"]-gam.pred)^2)
## [1] 3353802
1-mean(abs(College[test,"Outstate"]-gam.pred))/mean(College[test,"Outstate"])
## [1] 0.8640176
#The R squared on the test set is 0.8571672. This means that using this model, we can predict the out of state tuition with approximately 86% accuracy.
#QUESTION 10 PART D)
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(Terminal) +
## s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College[train,
## ])
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7128.62 -1133.86 -74.25 1231.50 7369.50
##
## (Dispersion Parameter for gaussian family taken to be 3724586)
##
## Null Deviance: 6989966760 on 387 degrees of freedom
## Residual Deviance: 1363197370 on 365.9997 degrees of freedom
## AIC: 6995.069
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1764398916 1764398916 473.717 < 2.2e-16 ***
## s(Room.Board) 1 1616561254 1616561254 434.024 < 2.2e-16 ***
## s(Terminal) 1 287918343 287918343 77.302 < 2.2e-16 ***
## s(perc.alumni) 1 354690429 354690429 95.230 < 2.2e-16 ***
## s(Expend) 1 601731164 601731164 161.556 < 2.2e-16 ***
## s(Grad.Rate) 1 90312393 90312393 24.248 1.284e-06 ***
## Residuals 366 1363197370 3724586
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 1.9107 0.1274
## s(Terminal) 3 1.4636 0.2241
## s(perc.alumni) 3 0.3498 0.7893
## s(Expend) 3 26.1184 2.442e-15 ***
## s(Grad.Rate) 3 0.9075 0.4375
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#According to the ANOVA for nonparametric effects, Expend seems to have a strong non-linear relationship with Outstate. Furthermore, Grad.Rate and PhD also have non-linear relationship with the Outstate as well.