library(ISLR)
library(boot)
set.seed(1)
#Cross validation for polynomial
data("Wage")
poly.mse <- c()
for(degree in 1:7){
poly.fit <- glm(wage~poly(age,degree,raw=TRUE),data=Wage)
mse <- cv.glm(poly.fit,data = Wage,K=10)$delta[1]
poly.mse <- c(poly.mse,mse)
}
#Plot the test MSE with degree of polynomial
plot(poly.mse,xlab='Degree of Polynomial',ylab='MSE',type='b')
x <- which.min(poly.mse)
points(x,poly.mse[x],pch=19,cex=2,col='red')
# ANOVA
set.seed(1)
models <- list()
for (i in 1:7){
anova.model <- lm(wage ~ poly(age, degree=i, raw = TRUE), data = Wage)
models[[i]] <- anova.model
}
anova(models[[1]],models[[2]],models[[3]],models[[4]],models[[5]],models[[6]],models[[7]])
## Analysis of Variance Table
##
## Model 1: wage ~ poly(age, degree = i, raw = TRUE)
## Model 2: wage ~ poly(age, degree = i, raw = TRUE)
## Model 3: wage ~ poly(age, degree = i, raw = TRUE)
## Model 4: wage ~ poly(age, degree = i, raw = TRUE)
## Model 5: wage ~ poly(age, degree = i, raw = TRUE)
## Model 6: wage ~ poly(age, degree = i, raw = TRUE)
## Model 7: wage ~ poly(age, degree = i, raw = TRUE)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6926 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8956 0.001673 **
## 4 2995 4771604 1 6070 3.8125 0.050966 .
## 5 2994 4770322 1 1283 0.8055 0.369516
## 6 2993 4766389 1 3932 2.4697 0.116165
## 7 2992 4763834 1 2555 1.6049 0.205311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In the polynomial cross-validation, minimum MSE occurs at degree of 7. However, we can see that after degree of 4, the MSE doesn’t reduce much. This also justify by ANOVA test where degree of 4 is the best (p-value approximately 5%).
#Plot of polynomial = 4 fit
library(ggplot2)
ggplot(Wage, aes(age,wage))+
geom_point(color='grey')+
stat_smooth(method = 'lm', formula = y ~ poly(x,4), size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
set.seed(1)
# Use 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
CV_RMSE <- c()
for (i in 2:20) {
model_temp <- train(y = Wage$wage,
x = data.frame(cut(Wage$age, i)),
method = "lm",
metric = "RMSE",
trControl = ctrl)
CV_RMSE[i-1] <- model_temp$results$RMSE
}
#Identify optimal cut
optimal.cuts <- as.numeric(min(CV_RMSE) == CV_RMSE)
optimal.cuts
## [1] 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
The optimal cut is at 10
ggplot(Wage, aes(x = age, y = wage)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "lm", formula = "y ~ cut(x, 10)")+
labs(title = "Step function with 10 intervals")
# Split the data:
tr.ind <- sample(1:nrow(College), size = 0.7 * nrow(College))
train <- College[tr.ind,]
test <- College[-tr.ind,]
# Perform forward stepwise selection
library(leaps)
fit <- regsubsets(Outstate ~., data = train, method = 'forward')
summary(fit)
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = train, method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " " " " " " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " " " "*" "*" " "
## 8 ( 1 ) " " "*" " " "*" "*" "*" " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
# Print out the most satisfactory predictors
coef(fit, id = 8)
## (Intercept) PrivateYes Room.Board Personal PhD
## -3391.6280309 2793.8059822 0.9195291 -0.2773126 19.4035537
## Terminal perc.alumni Expend Grad.Rate
## 23.5747223 39.6201101 0.2217800 27.5704240
## Loading required package: splines
## Loading required package: foreach
## Loaded gam 1.22
model.gam <- gam(Outstate ~ Private + s(Top25perc) + s(Room.Board) + s(Personal) + s(Terminal) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = train)
par(mfrow = c(2,3))
plot(model.gam, se = T, col = "red")
The private variable presents that most of the data is from public schools. There’s a slightly positive linear relationship between out-of-state tuition and top25perc. Room and board highly affect the out-of-state tuition since more out of state students will result in higher spending on dorms. Personal spending negatively affect the out-of-state tuition. And so on, terminal, perc.alumni also shows a positive correlation to out-of-state.
preds <- predict(model.gam, test)
RSS <- sum((test$Outstate - preds)^2)
TSS <- sum((test$Outstate - mean(test$Outstate)) ^ 2)
1 - (RSS / TSS)
## [1] 0.7912961
This gam model has an r-squared value of 74.68%, meaning it can other variables in the model can explain 74.68% variance of the out-of-state variables.
summary(model.gam)
##
## Call: gam(formula = Outstate ~ Private + s(Top25perc) + s(Room.Board) +
## s(Personal) + s(Terminal) + s(perc.alumni) + s(Expend) +
## s(Grad.Rate), data = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7234.04 -1031.34 79.55 1227.96 7439.31
##
## (Dispersion Parameter for gaussian family taken to be 3379256)
##
## Null Deviance: 8537435416 on 542 degrees of freedom
## Residual Deviance: 1733557283 on 512.9997 degrees of freedom
## AIC: 9735.116
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 2188513966 2188513966 647.632 < 2.2e-16 ***
## s(Top25perc) 1 1306721465 1306721465 386.689 < 2.2e-16 ***
## s(Room.Board) 1 1005894952 1005894952 297.668 < 2.2e-16 ***
## s(Personal) 1 47116500 47116500 13.943 0.0002096 ***
## s(Terminal) 1 169653988 169653988 50.205 4.599e-12 ***
## s(perc.alumni) 1 179575924 179575924 53.141 1.182e-12 ***
## s(Expend) 1 732608492 732608492 216.796 < 2.2e-16 ***
## s(Grad.Rate) 1 95970557 95970557 28.400 1.480e-07 ***
## Residuals 513 1733557283 3379256
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Top25perc) 3 0.9618 0.41046
## s(Room.Board) 3 1.2393 0.29475
## s(Personal) 3 3.5956 0.01355 *
## s(Terminal) 3 1.5246 0.20719
## s(perc.alumni) 3 1.2422 0.29373
## s(Expend) 3 27.6073 2.22e-16 ***
## s(Grad.Rate) 3 3.4863 0.01572 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Based on the Anova for nonparametic effects, ‘Expend’ is significant and has a non-linear relationship with ‘out-of-state’. ‘Room.Board’ and ‘Grad.Rate’ is moderately non linear to ‘out-of-state’. This matches with plots from part b)
For Anova for Parametric Effects, the test shows that all variables in the model are significant and have a non-linear relationship.