Notebook prepared by Everton Lima
X=seq(-2,2,0.1)
Y=1+X-2*(X-1)^2*I(X>=1)
plot(X,Y,type='l')
The function is linear up to \(X = 1\); afterwards the \(-2(X-1)^2\) term bends the curve downward, making it quadratic.
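Written out, the curve being plotted is
\[
Y = \begin{cases}
1 + X, & X < 1,\\
1 + X - 2(X - 1)^2, & X \ge 1.
\end{cases}
\]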
Using the cv.glm function of the boot package one can easily calculate the cross-validation error, as shown below.
library(boot)
library(ISLR)
set.seed(532)
poly.mse=c()
for(degree in 1:7){
  poly.fit=glm(wage~poly(age,degree,raw=T),data=Wage)
  mse=cv.glm(data=Wage,glmfit=poly.fit,K=10)$delta[1]
  poly.mse=c(poly.mse,mse)
}
Plotting the cross-validation errors results in the graph below; the optimal choice is a polynomial of degree four. It is important to keep in mind that cross-validation results depend on how the K folds are partitioned, which may skew the results obtained.
plot(poly.mse,xlab='Degree of Polynomial',ylab='Cross Validation Error',type='l')
x=which.min(poly.mse)
points(x,poly.mse[x],pch=20,cex=2,col='red')
From the curve we can see that there is no significant difference in error between the degree-4 polynomial and higher-degree polynomials. Keeping this in mind, one should select a polynomial that does not overfit the data.
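For reference (a sketch, not part of the original output), the degree-4 polynomial fit can be overlaid on the data:
poly.best=glm(wage~poly(age,4),data=Wage)
age.grid=seq(min(Wage$age),max(Wage$age),length.out = 100)
plot(Wage$age,Wage$wage,col='darkgray',xlab='age',ylab='wage')
lines(age.grid,predict(poly.best,data.frame(age=age.grid)),col='red',lwd=2)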
The results agree with the ANOVA approach, as there is no significant reduction in the cross-validation error from adding terms of degree higher than 4.
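As a sketch of that ANOVA comparison (not part of the original output), one can fit the nested polynomial models and run sequential F-tests:
fits=lapply(1:7,function(d) lm(wage~poly(age,d),data=Wage))
do.call(anova,fits)  # each row tests whether the next-higher degree is worthwhile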
To answer this question we repeat the same process as before, substituting the polynomial fit with a step function. Note that when using the cut function we need to recompute the cuts on the entire dataset for each choice of breaks; otherwise, during cross-validation the cutoff values for age would differ from fold to fold, making it impossible to form predictions.
set.seed(42)
step.mse=c()
for(br in 2:10){
  # Cut age on the full dataset so every fold shares the same break points.
  Wage.model=model.frame(wage~cut(age,br),data=Wage)
  names(Wage.model)=c('wage','age')
  step.fit=glm(wage~age,data=Wage.model)
  mse=cv.glm(data=Wage.model,glmfit=step.fit,K=10)$delta[1]
  step.mse=c(step.mse,mse)
}
plot(step.mse,xlab='Number of Cut Points',ylab='Cross Validation Error',type='l')
x=which.min(step.mse)
points(x,step.mse[x],pch=20,cex=2,col='red')
From the graph above we can see that the optimal number of cutoff points is 7.
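As an illustration (not part of the original output), the step function reported as optimal above, i.e. 7 interior cut points from cut(age, 8), can be refit and overlaid on the data:
step.best=glm(wage~cut(age,8),data=Wage)   # 8 intervals = 7 interior cut points
plot(Wage$age,Wage$wage,col='darkgray',xlab='age',ylab='wage')
o=order(Wage$age)
lines(Wage$age[o],fitted(step.best)[o],col='red',lwd=2)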
Below an overview of the features in the Wage dataset is given.
summary(Wage[,c('wage','maritl','jobclass','race','health')])
## wage maritl jobclass
## Min. : 20.09 1. Never Married: 648 1. Industrial :1544
## 1st Qu.: 85.38 2. Married :2074 2. Information:1456
## Median :104.92 3. Widowed : 19
## Mean :111.70 4. Divorced : 204
## 3rd Qu.:128.68 5. Separated : 55
## Max. :318.34
## race health
## 1. White:2480 1. <=Good : 858
## 2. Black: 293 2. >=Very Good:2142
## 3. Asian: 190
## 4. Other: 37
##
##
First, we plot boxplots of wage against each of the qualitative predictors.
feat=c('maritl','jobclass','race','health')
par(mfrow=c(2,2))
for(p in feat)
  boxplot(wage~.,data=Wage[,c('wage',p)])
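A possible next step (a sketch, not part of the original output) is to regress wage on these qualitative predictors; since they are factors, each one enters the model as a step function:
qual.fit=lm(wage~maritl+jobclass+race+health,data=Wage)
summary(qual.fit)  # each factor level gets its own constant, i.e. a step function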
This problem can be solved by picking a pair of variables from the Auto dataset and regressing one onto an orthogonal polynomial of the other, as generated by the poly function.
glm.fit=glm(mpg~poly(displacement,5),data = Auto)
summary(glm.fit)
##
## Call:
## glm(formula = mpg ~ poly(displacement, 5), data = Auto)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -15.3360 -2.3445 -0.2895 2.1635 20.3439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.4459 0.2209 106.158 < 2e-16 ***
## poly(displacement, 5)1 -124.2585 4.3728 -28.416 < 2e-16 ***
## poly(displacement, 5)2 31.0895 4.3728 7.110 5.67e-12 ***
## poly(displacement, 5)3 -4.4655 4.3728 -1.021 0.308
## poly(displacement, 5)4 0.7747 4.3728 0.177 0.859
## poly(displacement, 5)5 3.2991 4.3728 0.754 0.451
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 19.12134)
##
## Null deviance: 23819.0 on 391 degrees of freedom
## Residual deviance: 7380.8 on 386 degrees of freedom
## AIC: 2277.1
##
## Number of Fisher Scoring iterations: 2
Below you can observe the quadratic fit obtained by regressing mpg onto poly(displacement,2).
glm.fit=glm(mpg~poly(displacement,2),data = Auto)
disp.grid=seq(min(Auto$displacement), max(Auto$displacement), length.out = 100)
pred=predict(glm.fit, newdata = data.frame(displacement=disp.grid))
plot(Auto[,c('displacement','mpg')])
lines(disp.grid,pred,col='red')
library(MASS)
attach(Boston)
poly.fit=glm(nox~poly(dis,3),data=Boston)
summary(poly.fit)
##
## Call:
## glm(formula = nox ~ poly(dis, 3), data = Boston)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.121130 -0.040619 -0.009738 0.023385 0.194904
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.554695 0.002759 201.021 < 2e-16 ***
## poly(dis, 3)1 -2.003096 0.062071 -32.271 < 2e-16 ***
## poly(dis, 3)2 0.856330 0.062071 13.796 < 2e-16 ***
## poly(dis, 3)3 -0.318049 0.062071 -5.124 4.27e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.003852802)
##
## Null deviance: 6.7810 on 505 degrees of freedom
## Residual deviance: 1.9341 on 502 degrees of freedom
## AIC: -1370.9
##
## Number of Fisher Scoring iterations: 2
plot(Boston[,c('dis','nox')])
dis.grid=seq(min(dis),max(dis),length.out = 100)
pred=predict(poly.fit,data.frame(dis=dis.grid))
lines(dis.grid,pred,col='red')
x=seq(min(dis),max(dis),length.out = 100)
clrs=rainbow(10)
plot(Boston[,c('dis','nox')])
rss=c()
for(d in 1:10){
  poly.fit=glm(nox~poly(dis,d),data = Boston)
  pred=predict(poly.fit,data.frame(dis=x))
  lines(x,pred,col=clrs[d])
  rss=c(rss,sum(poly.fit$residuals^2))
}
legend(x='topright',legend = 1:10,col=clrs,lty = c(1,1),lwd = c(2,2))
plot(rss,xlab='Degree of Polynomial',ylab='Train RSS',type='l')
This can be done using the same approach as in question 6a. The minimum achieved is shown by the red point on the curve below.
library(boot)
set.seed(532)
poly.mse=c()
for(degree in 1:7){
  poly.fit=glm(nox~poly(dis,degree,raw=T),data=Boston)
  mse=cv.glm(data=Boston,glmfit=poly.fit,K=10)$delta[1]
  poly.mse=c(poly.mse,mse)
}
plot(poly.mse,type='l',xlab='Polynomial Degree',ylab='Cross Validation MSE')
points(which.min(poly.mse),poly.mse[which.min(poly.mse)],col='red',pch=20,cex=2)
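As a quick illustration (not part of the original output), one could refit nox onto dis with the CV-selected degree and overlay the resulting curve:
best.d=which.min(poly.mse)
best.fit=glm(nox~poly(dis,best.d),data=Boston)
dis.grid=seq(min(Boston$dis),max(Boston$dis),length.out = 100)
plot(Boston[,c('dis','nox')])
lines(dis.grid,predict(best.fit,data.frame(dis=dis.grid)),col='red',lwd=2)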
library(splines)
library(MASS)
spline.fit=lm(nox~bs(dis,df =4),data=Boston)
x=seq(min(Boston[,'dis']),max(Boston[,'dis']),length.out = 100)
y=predict(spline.fit,data.frame(dis=x))
plot(Boston[,c('dis','nox')],ylim=c(0,1))
lines(x,y,col=clrs[4])
plot(Boston[,c('dis','nox')],ylim=c(0,1))
clrs=rainbow(10)
x=seq(min(Boston[,'dis']),max(Boston[,'dis']),length.out = 100)
rss=c()
for(df in 3:10){
  spline.fit=lm(nox~bs(dis,df=df),data=Boston)
  y=predict(spline.fit,data.frame(dis=x))
  lines(x,y,col=clrs[df])
  rss=c(rss,sum(spline.fit$residuals^2))
}
legend(x='topright',legend=3:10,text.col=clrs[3:10],text.width=0.5,bty = 'n',horiz = T)
plot(3:10,rss,xlab='Df',ylab='Train RSS',type='l')
As before with the step model obtained from the cut function, if we let bs generate the knots automatically we must do so on the entire dataset; otherwise predictions cannot be made.
Why is fitting the knots within each fold a bad approach? When only df is supplied, bs places the interior knots at quantiles of the observed dis values, so each fold would end up with a different set of knots; we cannot know where to place the knots when we only partially observe the data.
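As a quick check (not in the original output; df = 6 is used only as an example), the quantile-based knots that bs chooses on the full data can be inspected directly:
attr(bs(Boston$dis,df=6),'knots')  # interior knots placed at quantiles of dis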
library(boot)
set.seed(42)
spline.mse=c()
for(df in 3:10){
  # Because bs() sets the knots automatically, build the model frame on the
  # entire dataset; otherwise predictions cannot be made across folds.
  Boston.model=model.frame(nox~bs(dis,df=df),data=Boston)
  names(Boston.model)=c('nox','bs.dis')
  spline.fit=glm(nox~bs.dis,data=Boston.model)
  mse=cv.glm(data=Boston.model,glmfit=spline.fit,K=10)$delta[1]
  spline.mse=c(spline.mse,mse)
}
plot(3:10,spline.mse,type='l',xlab='Df',ylab='Cross Val. MSE for Splines')
x=which.min(spline.mse)
points(x+2,spline.mse[x],col='red',pch=20,cex=2)
It is clear that the model with 6 degrees of freedom performs well on this dataset.
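As an illustration (not part of the original output), the df = 6 spline could be refit on its own and overlaid on the data:
dis.grid=seq(min(Boston$dis),max(Boston$dis),length.out = 100)
spline.best=lm(nox~bs(dis,df=6),data=Boston)
plot(Boston[,c('dis','nox')],ylim=c(0,1))
lines(dis.grid,predict(spline.best,data.frame(dis=dis.grid)),col='red',lwd=2)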
First, we select the observations of the College dataset that will form the training set by random sampling.
library(ISLR)
set.seed(25)
train = sample(1:nrow(College),500)
test = -train
Perform forward stepwise subset selection of the predictors by making use of the regsubsets function from the leaps library; the subset of predictors with the best training MSE is then selected.
library(leaps)
forward=regsubsets(Outstate~.,data=College,method = 'forward',nvmax = 17)
plot(1/nrow(College)*summary(forward)$rss,type='l',xlab='Number of Predictors',ylab='Train MSE Score',xaxt='n')
axis(side=1,at=seq(1,17,2),labels = seq(1,17,2))
From the plot we see that a good choice is the model with 7 predictors. The predictors in this model are shown below.
which(summary(forward)$which[7,-1])
## PrivateYes Room.Board Personal PhD perc.alumni Expend
## 1 9 11 12 15 16
## Grad.Rate
## 17
Here we fit a GAM by making use of smoothing splines for each of the predictors selected, except ‘Private’ since it is a qualitative predictor.
library(gam)
## Loading required package: foreach
## Loaded gam 1.14
gam.fit=gam(Outstate~Private+s(Room.Board)+s(Personal)+s(PhD)+s(perc.alumni)+s(Expend)+s(Grad.Rate),data=College[train,])
par(mfrow = c(2, 3))
plot(gam.fit,se=T,col='blue')
gam.pred=predict(gam.fit,College[test,])
gam.mse=mean((College[test,'Outstate']-gam.pred)^2)
gam.mse
## [1] 3788945
The test MSE is slightly higher than the training MSE (the residual deviance divided by the number of training observations), as one would expect when evaluating on held-out data. Nevertheless, the model generalizes well: as shown below, about 78% of the variance in the test set is explained by this model.
gam.tss = mean((College[test,'Outstate'] - mean(College[test,'Outstate']))^2)
test.r2 = 1 - gam.mse/gam.tss   # proportion of test-set variance explained
test.r2
## [1] 0.7836849
summary(gam.fit)
##
## Call: gam(formula = Outstate ~ Private + s(Room.Board) + s(Personal) +
## s(PhD) + s(perc.alumni) + s(Expend) + s(Grad.Rate), data = College[train,
## ])
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -6791.51 -1009.57 83.44 1235.44 6617.10
##
## (Dispersion Parameter for gaussian family taken to be 3318310)
##
## Null Deviance: 7706811402 on 499 degrees of freedom
## Residual Deviance: 1572880753 on 474.0005 degrees of freedom
## AIC: 8953.721
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## Private 1 1825444340 1825444340 550.113 < 2.2e-16 ***
## s(Room.Board) 1 1758531768 1758531768 529.948 < 2.2e-16 ***
## s(Personal) 1 68428237 68428237 20.621 7.107e-06 ***
## s(PhD) 1 643164758 643164758 193.823 < 2.2e-16 ***
## s(perc.alumni) 1 317101104 317101104 95.561 < 2.2e-16 ***
## s(Expend) 1 633174905 633174905 190.812 < 2.2e-16 ***
## s(Grad.Rate) 1 55821167 55821167 16.822 4.831e-05 ***
## Residuals 474 1572880753 3318310
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## Private
## s(Room.Board) 3 2.7349 0.043148 *
## s(Personal) 3 3.4526 0.016504 *
## s(PhD) 3 3.0147 0.029717 *
## s(perc.alumni) 3 1.2507 0.290793
## s(Expend) 3 22.8687 7.683e-14 ***
## s(Grad.Rate) 3 3.9867 0.008012 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the output of the ANOVA for nonparametric effects we can note that there is significant evidence of a non-linear relationship for Expend, Grad.Rate, PhD, Personal, and Room.Board.
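As a sketch of a possible follow-up (not part of the original output), the non-significant smooth term for perc.alumni could be compared with a model in which it enters linearly:
gam.lin=gam(Outstate~Private+s(Room.Board)+s(Personal)+s(PhD)+perc.alumni+s(Expend)+s(Grad.Rate),data=College[train,])
anova(gam.lin,gam.fit,test='F')  # tests whether the smooth term for perc.alumni is needed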
set.seed(40)
n=100
K=sample(1:100,1)
X1=rnorm(n,mean=sample(1:100,1),sd=sample(1:25,1))
X2=rnorm(n,mean=sample(1:100,1),sd=sample(1:25,1))
Y=K+3*X1+7*X2
plot(Y,col='darkgray')
B1=0 # Initial guess for beta1 (not shown in the original; any starting value works).
step_b0=c()
step_b1=c()
step_b2=c()
for(step in 1:1000){
  # Hold beta1 fixed and fit beta2, then hold beta2 fixed and fit beta0 and beta1.
  a=Y-B1*X1
  m=lm(a~X2)
  B2=m$coef[2]
  a=Y-B2*X2
  m=lm(a~X1)
  B1=m$coef[2]
  B0=m$coef[1]
  step_b1=c(step_b1,B1)
  step_b2=c(step_b2,B2)
  step_b0=c(step_b0,B0)
}
plot(step_b0,xlab='Step',ylab='Estimates',col='black',ylim=c(0,100),xlim=c(1,100),type='l')
lines(step_b1,col='red')
lines(step_b2,col='blue')
From the plot we can see that the algorithm converges very quickly, finding the correct values in fewer than 10 steps.
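As a quick sanity check (not in the original output), the backfitting estimates can be compared with the coefficients of a single multiple regression, which should recover K, 3, and 7 exactly since no noise was added:
coef(lm(Y~X1+X2))  # multiple regression coefficients
c(B0,B1,B2)        # final backfitting estimates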
The intercept, coefficients, and observations are generated from the loop below.
set.seed(42)
n=1000
p=100
Bs=c()
Xs=c()
for(i in 1:p){
  mu=sample(1:100,1)
  sd=sample(1:25,1)
  beta=rnorm(n = 1,mean = mu,sd = sd)
  x=rnorm(n = n,mean = mu,sd = sd)
  Bs=rbind(Bs,beta)
  Xs=cbind(Xs,x)
}
B0=sample(1:100,1)
Y=as.vector((Xs%*%Bs)+B0)
Now we can perform backfitting as before, with an added inner loop that iterates over the predictors at each step.
maxstep=1000
diff=0.1
Bhs=c()
for(i in 1:100)                    # Guess starting values for each predictor.
  Bhs=rbind(Bhs,sample(1:100,1))
mse=mean((Bhs-Bs)^2)               # Check how good an approximation has been achieved.
err=c(mse)
for(step in 1:maxstep){
  for(pred in 1:p){
    a=Y-Xs[,-pred] %*% Bhs[-pred,]       # Subtract the contribution of all other predictors.
    m=lm(a~Xs[,pred])                    # Fit a simple regression for this predictor.
    Bhs[pred]=m$coef[2]
  }
  mse=mean((Bhs-Bs)^2)                   # Check how good an approximation has been achieved.
  if( abs(mse-err[length(err)]) < diff){ # Stop when no significant change occurs.
    err=c(err,mse)
    break
  } else {
    err=c(err,mse)
  }
}
plot(err,xlab='Iteration',ylab='MSE of Coefficients',type='l',
     main='Number of Iterations Necessary \nfor Convergence in Backfitting')
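As a final check (a sketch, not part of the original output), the backfit coefficients can be compared with those of a single multiple regression on all 100 predictors:
lm.fit=lm(Y~Xs)
mean((coef(lm.fit)[-1]-Bhs)^2)  # discrepancy between the two sets of slope estimates
mean((Bhs-Bs)^2)                # final backfitting error from the loop above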