library(titanic)
library(caret)
library(lattice)
library(ggplot2)
library(gam)
library(car)
library(ROCR)
library(ISLR)
library(ggmosaic)
library(boot)
##
## Attaching package: 'boot'
## The following object is masked from 'package:car':
##
## logit
## The following object is masked from 'package:lattice':
##
## melanoma
data('Wage')
set.seed(1)
cv.error <- rep(0,5)
for (i in 1:5){
  glm.fit <- glm(wage ~ poly(age, i), data = Wage)
  cv.error[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.error
## [1] 1676.826 1600.763 1598.399 1595.651 1594.977
Plot the test MSE by polynomial degree:
plot(cv.error, type="b", xlab="Degree", ylab="Test MSE")
points(which.min(cv.error), cv.error[which.min(cv.error)], col="red", pch=20, cex=2)
The optimal degree chosen by cross-validation was 5.
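As a quick sketch using the cv.error vector computed above, the chosen degree can also be extracted programmatically and the selected polynomial refit on the full data (best.degree and best.fit are names introduced here for illustration):
# Pull out the CV-selected degree instead of reading it off the plot
best.degree <- which.min(cv.error)
best.degree
# Refit the selected polynomial on the full Wage data and inspect the coefficients
best.fit <- glm(wage ~ poly(age, best.degree), data = Wage)
coef(summary(best.fit))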
ANOVA fit
fit_1 <- lm(wage ~ age, data=Wage)
fit_2 <- lm(wage ~ poly(age, 2), data=Wage)
fit_3 <- lm(wage ~ poly(age, 3), data=Wage)
fit_4 <- lm(wage ~ poly(age, 4), data=Wage)
fit_5 <- lm(wage ~ poly(age, 5), data=Wage)
anova(fit_1, fit_2, fit_3, fit_4, fit_5)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.5931 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8888 0.001679 **
## 4 2995 4771604 1 6070 3.8098 0.051046 .
## 5 2994 4770322 1 1283 0.8050 0.369682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Using the ANOVA, the jump from degree 1 to degree 2 is by far the most significant (p < 2.2e-16); degree 3 is also significant (p ≈ 0.002), while degrees 4 and 5 add little, so a quadratic or cubic fit appears sufficient.
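As a small sketch (not part of the original output), the sequential F-test p-values can be pulled directly from the ANOVA table to make this comparison explicit; fit.anova is an illustrative name:
# Store the ANOVA table and extract the sequential F-test p-values
fit.anova <- anova(fit_1, fit_2, fit_3, fit_4, fit_5)
fit.anova[["Pr(>F)"]]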
library(boot)
set.seed(1)
cv.errors <- rep(NA, 10)
for(i in 2:10){
  Wage$age.cut <- cut(Wage$age, i)
  glm.fit <- glm(wage ~ age.cut, data = Wage)
  cv.errors[i] <- cv.glm(Wage, glm.fit, K = 10)$delta[1]
}
cv.errors
## [1] NA 1734.489 1684.271 1635.552 1632.080 1623.415 1614.996 1601.318
## [9] 1613.954 1606.331
plot(2:10, cv.errors[-1], type="b", xlab="Number of cuts", ylab="Test MSE")
points(which.min(cv.errors), cv.errors[which.min(cv.errors)], col="red", pch=20, cex=2)
In the plot call I plotted 2 through 10 on the x-axis, since cv.errors[1] (a single interval, i.e. no cut) has no value. The optimal number of cuts when using the step function came out to 8.
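As a hedged illustration of the selected model, assuming 8 cuts as found above, the step function can be refit on the full data and its predictions overlaid on the raw points (fit.step, age.grid and preds.step are names introduced here):
# Fit the 8-interval step function on the full data
fit.step <- glm(wage ~ cut(age, 8), data = Wage)
# Predict over the observed age range and overlay the fitted step function
age.grid <- seq(min(Wage$age), max(Wage$age))
preds.step <- predict(fit.step, newdata = data.frame(age = age.grid))
plot(Wage$age, Wage$wage, col = "darkgrey", xlab = "Age", ylab = "Wage")
lines(age.grid, preds.step, col = "red", lwd = 2)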
library(leaps)
train <- sample(1: nrow(College), nrow(College)/2)
test <- -train
fit <- regsubsets(Outstate ~ ., data = College, subset = train, method = 'forward')
fit.summary <- summary(fit)
fit.summary
## Subset selection object
## Call: regsubsets.formula(Outstate ~ ., data = College, subset = train,
## method = "forward")
## 17 Variables (and intercept)
## Forced in Forced out
## PrivateYes FALSE FALSE
## Apps FALSE FALSE
## Accept FALSE FALSE
## Enroll FALSE FALSE
## Top10perc FALSE FALSE
## Top25perc FALSE FALSE
## F.Undergrad FALSE FALSE
## P.Undergrad FALSE FALSE
## Room.Board FALSE FALSE
## Books FALSE FALSE
## Personal FALSE FALSE
## PhD FALSE FALSE
## Terminal FALSE FALSE
## S.F.Ratio FALSE FALSE
## perc.alumni FALSE FALSE
## Expend FALSE FALSE
## Grad.Rate FALSE FALSE
## 1 subsets of each size up to 8
## Selection Algorithm: forward
## PrivateYes Apps Accept Enroll Top10perc Top25perc F.Undergrad
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) "*" " " " " " " " " " " " "
## 3 ( 1 ) "*" " " " " " " " " " " " "
## 4 ( 1 ) "*" " " " " " " " " " " " "
## 5 ( 1 ) "*" " " " " " " " " " " " "
## 6 ( 1 ) "*" " " " " " " " " " " " "
## 7 ( 1 ) "*" " " " " " " " " " " " "
## 8 ( 1 ) "*" " " "*" " " " " " " " "
## P.Undergrad Room.Board Books Personal PhD Terminal S.F.Ratio
## 1 ( 1 ) " " " " " " " " " " " " " "
## 2 ( 1 ) " " " " " " " " " " " " " "
## 3 ( 1 ) " " "*" " " " " " " " " " "
## 4 ( 1 ) " " "*" " " " " " " " " " "
## 5 ( 1 ) " " "*" " " " " "*" " " " "
## 6 ( 1 ) " " "*" " " " " "*" " " " "
## 7 ( 1 ) " " "*" " " "*" "*" " " " "
## 8 ( 1 ) " " "*" " " "*" "*" " " " "
## perc.alumni Expend Grad.Rate
## 1 ( 1 ) " " "*" " "
## 2 ( 1 ) " " "*" " "
## 3 ( 1 ) " " "*" " "
## 4 ( 1 ) "*" "*" " "
## 5 ( 1 ) "*" "*" " "
## 6 ( 1 ) "*" "*" "*"
## 7 ( 1 ) "*" "*" "*"
## 8 ( 1 ) "*" "*" "*"
These are the coefficients of the six-variable model chosen by forward selection:
coef(fit, id = 6)
## (Intercept) PrivateYes Room.Board PhD perc.alumni
## -3191.6057181 2950.2717035 0.8732379 37.0498937 51.0144153
## Expend Grad.Rate
## 0.1848395 29.9264164
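As an aside not in the original output, the six-variable stopping point can be sanity-checked by plotting the usual selection criteria stored in fit.summary:
# Plot Cp, BIC and adjusted R-squared against model size
par(mfrow = c(1, 3))
plot(fit.summary$cp, xlab = "Number of variables", ylab = "Cp", type = "b")
plot(fit.summary$bic, xlab = "Number of variables", ylab = "BIC", type = "b")
plot(fit.summary$adjr2, xlab = "Number of variables", ylab = "Adjusted R-squared", type = "b")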
gam.mod <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) +
                 s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
               data = College, subset = train)
par(mfrow = c(2,3))
plot(gam.mod, se = TRUE, col = 'red')
Looking at the shapes of the fitted curves, Expend and Grad.Rate show strongly non-linear relationships with the variable Outstate.
preds <- predict(gam.mod, College[test, ])
RSS <- sum((College[test, ]$Outstate - preds)^2)
TSS <- sum((College[test, ]$Outstate - mean(College[test, ]$Outstate)) ^ 2)
1 - (RSS / TSS)
## [1] 0.7834314
Based on this model, the test R-squared is about 0.783.
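As a small follow-up sketch, the test-set RMSE reports the same fit quality on the scale of tuition dollars, reusing the preds vector computed above:
# Root mean squared error on the held-out observations
sqrt(mean((College[test, ]$Outstate - preds)^2))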
summary(gam.mod)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, 5) + s(Terminal,
## 5) + s(perc.alumni, 5) + s(Expend, 5) + s(Grad.Rate, 5),
## data = College, subset = train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7159.85 -1162.81 47.96 1095.18 7648.37
##
## (Dispersion Parameter for gaussian family taken to be 3334500)
##
## Null Deviance: 5860806972 on 387 degrees of freedom
## Residual Deviance: 1203754008 on 360.9998 degrees of freedom
## AIC: 6956.806
##
## Number of Local Scoring Iterations: NA
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1624052654 1624052654 487.045 < 2.2e-16 ***
## s(Room.Board, 5) 1 1068282762 1068282762 320.373 < 2.2e-16 ***
## s(Terminal, 5) 1 345093780 345093780 103.492 < 2.2e-16 ***
## s(perc.alumni, 5) 1 280343396 280343396 84.074 < 2.2e-16 ***
## s(Expend, 5) 1 507867707 507867707 152.307 < 2.2e-16 ***
## s(Grad.Rate, 5) 1 69135717 69135717 20.733 7.227e-06 ***
## Residuals 361 1203754008 3334500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, 5) 4 1.5341 0.1917
## s(Terminal, 5) 4 1.0288 0.3922
## s(perc.alumni, 5) 4 1.8927 0.1111
## s(Expend, 5) 4 16.7207 1.387e-12 ***
## s(Grad.Rate, 5) 4 0.7858 0.5350
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From this summary, the ANOVA for nonparametric effects shows that Expend has a strongly significant non-linear relationship with Outstate (p < 0.001), while the non-linear terms for the other predictors, including Grad.Rate, are not significant.
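As a hedged check, assuming the gam package loaded above, the non-linearity in Expend can also be tested by comparing the fitted GAM to one in which Expend enters linearly; gam.lin is a name introduced here for illustration:
# GAM identical to gam.mod except that Expend enters as a linear term
gam.lin <- gam(Outstate ~ Private + s(Room.Board, 5) + s(Terminal, 5) +
                 s(perc.alumni, 5) + Expend + s(Grad.Rate, 5),
               data = College, subset = train)
# F-test comparing the linear-Expend model to the smooth-Expend model
anova(gam.lin, gam.mod, test = "F")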