Wage<-read.table("C:/Users/Joshua Escareno/Desktop/Wage.txt", header = T )
lm.fit1<-lm(wage~age, data = Wage)
lm.fit2<-lm(wage~poly(age,2), data = Wage)
lm.fit3<-lm(wage~poly(age,3), data = Wage)
lm.fit4<-lm(wage~poly(age,4), data = Wage)
lm.fit5<-lm(wage~poly(age,5), data = Wage)
lm.fit6<-lm(wage~poly(age,6), data = Wage)
lm.fit7<-lm(wage~poly(age,7), data = Wage)
anova(lm.fit1,lm.fit2,lm.fit3,lm.fit4,lm.fit5,lm.fit6,lm.fit7)
## Analysis of Variance Table
##
## Model 1: wage ~ age
## Model 2: wage ~ poly(age, 2)
## Model 3: wage ~ poly(age, 3)
## Model 4: wage ~ poly(age, 4)
## Model 5: wage ~ poly(age, 5)
## Model 6: wage ~ poly(age, 6)
## Model 7: wage ~ poly(age, 7)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2998 5022216
## 2 2997 4793430 1 228786 143.6926 < 2.2e-16 ***
## 3 2996 4777674 1 15756 9.8956 0.001673 **
## 4 2995 4771604 1 6070 3.8125 0.050966 .
## 5 2994 4770322 1 1283 0.8055 0.369516
## 6 2993 4766389 1 3932 2.4697 0.116165
## 7 2992 4763834 1 2555 1.6049 0.205311
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(boot)
deltas = rep(NA,10)
set.seed(1) #I set the seed because everytime I ran a new set of numbers would occur.
for (i in 1:10){
fit = glm(wage~poly(age,i),data = Wage)
deltas[i] = cv.glm(Wage,fit,K=10)$delta[1]
}
plot(1:10,deltas, xlab = "Degree", ylab = "Our Testing MSE",type = "l")
delta.min<-which.min(deltas)
delta.min
## [1] 4
plot(wage~age,data = Wage, col = "blue")
agelims = range(Wage$age)
grid.age = seq(from = agelims[1], to=agelims[2])
lm.fit<-lm(wage~poly(age,3), data = Wage)
lm.pred<-predict(lm.fit,newdata = list(age = grid.age))
lines(grid.age,lm.pred,col = "green", lwd = "5")
(b) Fit a step function to predict wage using age, and perform crossvalidation to choose the optimal number of cuts. Make a plot of the ???t obtained.
cvs = rep(NA,10)
for(i in 2:10)
{
Wage$age.cut = cut(Wage$age,i)
fit = glm(wage~age.cut, data = Wage)
cvs[i] = cv.glm(Wage,fit,K = 10)$delta[1]
}
plot(2:10,cvs[-1],xlab = "cuts", ylab = "Test Mean Sqaured Error", type = "o")
dim.min = which.min(cvs)
dim.min
## [1] 8
Using chross validations we will chose 8.
plot(wage~age,data = Wage, col = "blue")
agelims = range(Wage$age)
grid.age = seq(from = agelims[1], to=agelims[2])
glm.fit<-glm(wage~poly(age,8), data = Wage)
glm.pred<-predict(glm.fit,newdata = list(age = grid.age))
lines(grid.age,glm.pred,col = "green", lwd = "10")
library(ISLR)
## Warning: package 'ISLR' was built under R version 3.5.3
##
## Attaching package: 'ISLR'
## The following object is masked _by_ '.GlobalEnv':
##
## Wage
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.3
attach(College)
train = sample(length(Outstate), length(Outstate)/2)
test = -train
ctrain<-College[train,]
ctest<-College[test,]
reg.fit<-regsubsets(Outstate~.,data = ctrain, nvmax = 17,method = 'forward')
reg.summary<- summary(reg.fit)
par=(mfrow = c(1,3))
plot(reg.summary$cp,xlab = "OUTSTATE", ylab = "CP", type = "l")
min.cp = min(reg.summary$cp)
std.cp = sd(reg.summary$cp)
abline(h=min.cp+.02*std.cp,col="red", lty = 2)
abline(h=min.cp-.02*std.cp,col="red", lty = 2)
plot(reg.summary$bic,xlab = "OUtSTATE", ylab = "BIC", type = "l")
min.bic = min(reg.summary$bic)
std.bic = sd(reg.summary$bic)
abline(h=min.bic+.02*std.bic,col="red", lty = 2)
abline(h=std.bic+.02*std.bic,col="red", lty = 2)
plot(reg.summary$bic,xlab = "OUtSTATE", ylab = "BIC", type = "l")
min.adj = min(reg.summary$adjr2)
std.adj = sd(reg.summary$adjr2)
abline(h=min.adj+.02*std.adj,col="red", lty = 2)
abline(h=min.adj-.02*std.adj,col="red", lty = 2)
plot(reg.summary$adjr2,xlab = "y", ylab = "c", type = "l")
coefi = coef(reg.fit,id = 6)
names(coefi)
## [1] "(Intercept)" "PrivateYes" "Room.Board" "Terminal" "perc.alumni"
## [6] "Expend" "Grad.Rate"
library(gam)
## Warning: package 'gam' was built under R version 3.5.3
## Loading required package: splines
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.5.3
## Loaded gam 1.16
gam.fit= gam(Outstate~ Private+s(Room.Board,df = 2)+s(PhD,df= 2)+ s(perc.alumni,df = 2)+s(Expend,df = 5)+s(Grad.Rate, df = 2),data = ctrain)
plot(Outstate~ Room.Board)
It would appear that there are positive relations between Out of state applicants and Room and Board. Those who live furtyher pay more then someone say who is in the local area.
gam.fit2= gam(Outstate~ Private+s(Room.Board,df = 8)+s(PhD,df= 5)+ s(perc.alumni,df = 2)+s(Expend,df = 10)+s(Grad.Rate, df = 7),data = ctrain)
plot(gam.fit2,se = T, col = "green")
gam.pred<- predict(gam.fit, ctest)
gam.pred2<- predict(gam.fit2, ctest)
mean((ctest$Outstate-gam.pred)^2)
## [1] 3491988
mean((ctest$Outstate-gam.pred2)^2)
## [1] 3667829
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 2) + s(PhD,
## df = 2) + s(perc.alumni, df = 2) + s(Expend, df = 5) + s(Grad.Rate,
## df = 2), data = ctrain)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7707.66 -1070.71 -58.67 1283.51 7348.01
##
## (Dispersion Parameter for gaussian family taken to be 3518025)
##
## Null Deviance: 6433245471 on 387 degrees of freedom
## Residual Deviance: 1312223032 on 372.9999 degrees of freedom
## AIC: 6966.282
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1743281330 1743281330 495.528 < 2.2e-16 ***
## s(Room.Board, df = 2) 1 1349150163 1349150163 383.497 < 2.2e-16 ***
## s(PhD, df = 2) 1 327440246 327440246 93.075 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 245007093 245007093 69.643 1.40e-15 ***
## s(Expend, df = 5) 1 559665043 559665043 159.085 < 2.2e-16 ***
## s(Grad.Rate, df = 2) 1 55662319 55662319 15.822 8.36e-05 ***
## Residuals 373 1312223032 3518025
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 2) 1 2.5160 0.11354
## s(PhD, df = 2) 1 1.7649 0.18482
## s(perc.alumni, df = 2) 1 3.2300 0.07311 .
## s(Expend, df = 5) 4 19.3548 1.754e-14 ***
## s(Grad.Rate, df = 2) 1 2.0473 0.15331
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(gam.fit2)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board, df = 8) + s(PhD,
## df = 5) + s(perc.alumni, df = 2) + s(Expend, df = 10) + s(Grad.Rate,
## df = 7), data = ctrain)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6961.22 -1047.50 -38.53 1300.74 6783.71
##
## (Dispersion Parameter for gaussian family taken to be 3437620)
##
## Null Deviance: 6433245471 on 387 degrees of freedom
## Residual Deviance: 1216917265 on 354 degrees of freedom
## AIC: 6975.026
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1791042090 1791042090 521.012 < 2.2e-16 ***
## s(Room.Board, df = 8) 1 1323839009 1323839009 385.103 < 2.2e-16 ***
## s(PhD, df = 5) 1 312273254 312273254 90.840 < 2.2e-16 ***
## s(perc.alumni, df = 2) 1 222294451 222294451 64.665 1.347e-14 ***
## s(Expend, df = 10) 1 583734943 583734943 169.808 < 2.2e-16 ***
## s(Grad.Rate, df = 7) 1 66070271 66070271 19.220 1.538e-05 ***
## Residuals 354 1216917265 3437620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board, df = 8) 7 1.3300 0.23491
## s(PhD, df = 5) 4 2.5832 0.03698 *
## s(perc.alumni, df = 2) 1 2.6372 0.10527
## s(Expend, df = 10) 9 9.2786 1.147e-12 ***
## s(Grad.Rate, df = 7) 6 1.5369 0.16507
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It would appear that Expend have a non- linear relationship, also with Grad. rate and Ph.D.