Problem 1: Boston
#install.packages("ISLR")
library(ISLR)
#install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.2
library(Matrix)
library(lme4)
## Warning: package 'lme4' was built under R version 3.5.2
library(ggplot2)
Part A: Polynomial Regression
# polynomial regression
# poly uses matrix of orthogonal polynomials
fitMASS=lm(nox~poly(dis,3),data=Boston) #first responce 2nd is
coef(summary(fitMASS))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5546951 0.00275939 201.020894 0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2 0.8563300 0.06207094 13.795987 6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094 -5.123959 4.274950e-07
Part B: Polynomial Degrees
#Creating a training set
set.seed(1)
# gives a 50-50 split of the indexes
train<-sample(dim(Boston)[1], floor(dim(Boston)[1]/2)) #This form of spliting the data is more universal
test.nox<-Boston$nox[-train]
# we can put this code in a loop to test different amounts of flexibility
degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){
polyMSE[i,1]<-i
this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
polyMSE[i,2]<-mean((Boston$nox-predict(this.fit, Boston))[-train]^2)
}
polyDF<-as.data.frame(polyMSE)
head(polyDF)
## degree MSE
## 1 1 0.005160163
## 2 2 0.003796198
## 3 3 0.003658465
## 4 4 0.003652429
## 5 5 0.003656329
## 6 6 0.003635610
ggplot(data=polyDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
Part C: Cross Validation
After splitting and testing the data it is clear that after the 5th degree the MSE increases on some of the testing data sets. To avoid overfitting it would be best to cut it at the 5th polynomial.
### Let's create many random splits and compare their results
# we can put this code in a double loop to test different amounts of flexibility
degreePoly<-10
splits<-10
splitMat<-matrix(nrow=degreePoly*splits, ncol=3)
colnames(splitMat)<-c("run","MSE", "degree")
for(i in 1:splits){
a=(i-1)*degreePoly+1
b=i*degreePoly
set.seed(i*3)
splitMat[a:b,1]<-i
train<-sample(392, 196) #UNSURE OF WHAT TO WRITE IN HERE
for(j in 1:degreePoly){
c=a+(j-1)
this.fit<-lm(Boston$nox~poly(dis,j), data=Boston, subset=train)
splitMat[c,2]<-mean((Boston$nox-predict(this.fit, Boston))[-train]^2)
splitMat[c,3]<-j
}
}
splitDF<-as.data.frame(splitMat)
head(splitDF)
## run MSE degree
## 1 1 0.006880444 1
## 2 1 0.004879994 2
## 3 1 0.004763359 3
## 4 1 0.004950247 4
## 5 1 0.005059731 5
## 6 1 0.004937652 6
ggplot(data=splitDF, aes(x=degree, y=MSE, color=as.factor(run)))+
geom_point()+
geom_line()
Part D: Fitting a Regression Spline
# knots chosen by R
#attr(bs(Boston$dis ,df=6) ,"knots")
# Splines
#install.packages("splines")
library(splines)
attach((Boston))
#basis functions
dislims = range(dis)
dis.grid=seq(from=dislims[1], to=dislims[2])
dis.grid = dis.grid
par(mfrow=c(1,1))
fit=lm(nox~bs(dis ,knots=c(3 ,6,9) ),data=Boston)
pred=predict(fit ,newdata =list(dis=dis.grid),se=T)
plot(dis ,nox ,col="gray")
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
Problem 2: College
#install.packages("ISLR")
library(ISLR)
#install.packages("gam")
library(gam)
## Warning: package 'gam' was built under R version 3.5.2
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.5.2
## Loaded gam 1.16.1
library(lme4)
College <- read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",header=TRUE)
Part A: Foward Stepwise Selection
#test-train
#dividing the data
set.seed(1)
# gives a 50-50 split of the indexes
train<-sample(dim(College)[1], floor(dim(College)[1]/2)) #This form of spliting the data is more universal
College.train<-College[train,]
College.test<-College[-train,]
CollegeData<- College[-c(1),]
modV3<-lm(Outstate~Apps, data=College.train) #Apps
anova(modV3)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Apps 1 22927838 22927838 1.4277 0.2329
## Residuals 386 6199070694 16059769
modV4<-lm(Outstate~Accept, data=College.train) # Accept
anova(modV4)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Accept 1 59247 59247 0.0037 0.9517
## Residuals 386 6221939285 16119014
modV5<-lm(Outstate~Enroll, data=College.train) # Enroll
anova(modV5)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Enroll 1 99455586 99455586 6.2702 0.01269 *
## Residuals 386 6122542946 15861510
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV6<-lm(Outstate~Top10perc, data=College.train) # Top10perc
anova(modV6)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Top10perc 1 1.743e+09 1743043060 150.22 < 2.2e-16 ***
## Residuals 386 4.479e+09 11603512
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV7<-lm(Outstate~Top25perc, data=College.train) # Top25perc
anova(modV7)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Top25perc 1 1218341158 1218341158 93.987 < 2.2e-16 ***
## Residuals 386 5003657374 12962843
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV8<-lm(Outstate~F.Undergrad, data=College.train) # F.Undergrad
anova(modV8)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## F.Undergrad 1 216544009 216544009 13.918 0.0002196 ***
## Residuals 386 6005454523 15558172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV9<-lm(Outstate~P.Undergrad, data=College.train) # P.Undergrad
anova(modV9)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## P.Undergrad 1 414732944 414732944 27.567 2.512e-07 ***
## Residuals 386 5807265588 15044730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV11<-lm(Outstate~Room.Board, data=College.train) # Room.Board
anova(modV11)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Room.Board 1 2648219045 2648219045 286.03 < 2.2e-16 ***
## Residuals 386 3573779487 9258496
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV12<-lm(Outstate~Books, data=College.train) # Books
anova(modV12)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Books 1 12385341 12385341 0.7699 0.3808
## Residuals 386 6209613191 16087081
modV13<-lm(Outstate~Personal, data=College.train) # Personal
anova(modV13)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Personal 1 455832189 455832189 30.514 6.103e-08 ***
## Residuals 386 5766166343 14938255
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV14<-lm(Outstate~PhD, data=College.train) # PhD
anova(modV14)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## PhD 1 805196840 805196840 57.378 2.686e-13 ***
## Residuals 386 5416801692 14033165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV15<-lm(Outstate~Terminal,data=College.train) # Terminal
anova(modV15)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Terminal 1 963689965 963689965 70.742 7.985e-16 ***
## Residuals 386 5258308567 13622561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV16<-lm(Outstate~S.F.Ratio,data=College.train) # S.F.Ratio
anova(modV16)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## S.F.Ratio 1 2034178367 2034178367 187.49 < 2.2e-16 ***
## Residuals 386 4187820165 10849275
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV17<-lm(Outstate~perc.alumni,data=College.train) # perc.alumni
anova(modV17)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## perc.alumni 1 2012057851 2012057851 184.48 < 2.2e-16 ***
## Residuals 386 4209940681 10906582
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV18<-lm(Outstate~Expend, data=College.train) # Expend
anova(modV18)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Expend 1 2810662153 2810662153 318.03 < 2.2e-16 ***
## Residuals 386 3411336379 8837659
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV19<-lm(Outstate~Grad.Rate,data=College.train) # Grad.Rate
anova(modV19)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Grad.Rate 1 2084067618 2084067618 194.41 < 2.2e-16 ***
## Residuals 386 4137930914 10720028
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model1a<-lm(Outstate~Top10perc+Enroll+P.Undergrad+F.Undergrad+Room.Board+PhD+S.F.Ratio+perc.alumni+Expend, data=College.train)
anova(model1a)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Top10perc 1 1743043060 1743043060 372.500 < 2.2e-16 ***
## Enroll 1 410629702 410629702 87.754 < 2.2e-16 ***
## P.Undergrad 1 27430208 27430208 5.862 0.0159413 *
## F.Undergrad 1 185005076 185005076 39.537 8.894e-10 ***
## Room.Board 1 1337879697 1337879697 285.913 < 2.2e-16 ***
## PhD 1 52560740 52560740 11.233 0.0008848 ***
## S.F.Ratio 1 278180783 278180783 59.449 1.122e-13 ***
## perc.alumni 1 311763425 311763425 66.626 4.941e-15 ***
## Expend 1 106724139 106724139 22.808 2.565e-06 ***
## Residuals 378 1768781702 4679317
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Part B: Fitting a GAM
#Fit a GAM on the training data, using outofstate tuition as the response
#and the features selected in the previous step as the predictors.
#Plot the results, and explain your findings
# model with natural splines
# Once you get the right variables modify this code
gam1=lm(Outstate~ns(Top10perc ,4)+ns(Enroll,4)+ns(F.Undergrad,4) , data=College.train)
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ ns(Top10perc, 4) + ns(Enroll, 4) + ns(F.Undergrad,
## 4), data = College.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9151.7 -1872.8 79.1 1889.6 10574.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8679.0 1327.6 6.537 2.05e-10 ***
## ns(Top10perc, 4)1 2439.7 992.4 2.458 0.01441 *
## ns(Top10perc, 4)2 4838.5 1056.0 4.582 6.29e-06 ***
## ns(Top10perc, 4)3 6534.2 2366.1 2.762 0.00604 **
## ns(Top10perc, 4)4 11423.0 1386.5 8.239 2.94e-15 ***
## ns(Enroll, 4)1 -1175.7 1772.5 -0.663 0.50756
## ns(Enroll, 4)2 4685.8 3350.7 1.398 0.16280
## ns(Enroll, 4)3 5982.7 4441.0 1.347 0.17875
## ns(Enroll, 4)4 10403.2 4713.4 2.207 0.02791 *
## ns(F.Undergrad, 4)1 1169.5 1904.0 0.614 0.53944
## ns(F.Undergrad, 4)2 -8778.2 3362.9 -2.610 0.00941 **
## ns(F.Undergrad, 4)3 -9096.4 4542.9 -2.002 0.04597 *
## ns(F.Undergrad, 4)4 -14668.7 5185.9 -2.829 0.00493 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3146 on 375 degrees of freedom
## Multiple R-squared: 0.4036, Adjusted R-squared: 0.3845
## F-statistic: 21.14 on 12 and 375 DF, p-value: < 2.2e-16
# plot.Gam
plot.Gam(gam1, se=TRUE ,col ="red")
Part C: Evaluating the Model
It is dificult to evaluate our model as the residuals are relative only to its model and we don’t have any other model that we can compare it with.
Part D:
When looking at the GAM models I found that most of my models were surprisingly linear with some non linear variations on the ends.