Problem 1: Boston

#install.packages("ISLR")
library(ISLR)
#install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.2
library(Matrix)
library(lme4)
## Warning: package 'lme4' was built under R version 3.5.2
library(ggplot2)

Part A: Polynomial Regression

# polynomial regression 
# poly uses matrix of orthogonal polynomials

fitMASS=lm(nox~poly(dis,3),data=Boston) #first responce 2nd is 
coef(summary(fitMASS))
##                 Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)    0.5546951 0.00275939 201.020894  0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2  0.8563300 0.06207094  13.795987  6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094  -5.123959  4.274950e-07

Part B: Polynomial Degrees

#Creating a training set 

set.seed(1)
# gives a 50-50 split of the indexes
train<-sample(dim(Boston)[1], floor(dim(Boston)[1]/2)) #This form of spliting the data is more universal 
test.nox<-Boston$nox[-train]

# we can put this code in a loop to test different amounts of flexibility 
degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
  polyMSE[i,2]<-mean((Boston$nox-predict(this.fit, Boston))[-train]^2)
}

polyDF<-as.data.frame(polyMSE)
head(polyDF)
##   degree         MSE
## 1      1 0.005160163
## 2      2 0.003796198
## 3      3 0.003658465
## 4      4 0.003652429
## 5      5 0.003656329
## 6      6 0.003635610
ggplot(data=polyDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

Part C: Cross Validation

After splitting and testing the data it is clear that after the 5th degree the MSE increases on some of the testing data sets. To avoid overfitting it would be best to cut it at the 5th polynomial.

### Let's create many random splits and compare their results
# we can put this code in a double loop to test different amounts of flexibility 

degreePoly<-10
splits<-10

splitMat<-matrix(nrow=degreePoly*splits, ncol=3)
colnames(splitMat)<-c("run","MSE", "degree")

for(i in 1:splits){
  a=(i-1)*degreePoly+1
  b=i*degreePoly
  
  set.seed(i*3)
  splitMat[a:b,1]<-i
  train<-sample(392, 196) #UNSURE OF WHAT TO WRITE IN HERE 
  
  for(j in 1:degreePoly){ 
    c=a+(j-1)
    
    this.fit<-lm(Boston$nox~poly(dis,j), data=Boston, subset=train)
    splitMat[c,2]<-mean((Boston$nox-predict(this.fit, Boston))[-train]^2)
    splitMat[c,3]<-j
  }
}
splitDF<-as.data.frame(splitMat)
head(splitDF)
##   run         MSE degree
## 1   1 0.006880444      1
## 2   1 0.004879994      2
## 3   1 0.004763359      3
## 4   1 0.004950247      4
## 5   1 0.005059731      5
## 6   1 0.004937652      6
ggplot(data=splitDF, aes(x=degree, y=MSE, color=as.factor(run)))+
  geom_point()+
  geom_line()

Part D: Fitting a Regression Spline

# knots chosen by R
#attr(bs(Boston$dis ,df=6) ,"knots")
# Splines
#install.packages("splines")
library(splines)
attach((Boston))
#basis functions
dislims = range(dis)
dis.grid=seq(from=dislims[1], to=dislims[2])
dis.grid = dis.grid
par(mfrow=c(1,1))
fit=lm(nox~bs(dis ,knots=c(3 ,6,9) ),data=Boston)
pred=predict(fit ,newdata =list(dis=dis.grid),se=T)
plot(dis ,nox ,col="gray") 
lines(dis.grid ,pred$fit ,lwd=2)
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")

Problem 2: College

#install.packages("ISLR")
library(ISLR)
#install.packages("gam")
library(gam)
## Warning: package 'gam' was built under R version 3.5.2
## Loading required package: foreach
## Warning: package 'foreach' was built under R version 3.5.2
## Loaded gam 1.16.1
library(lme4)
College <- read.csv("http://faculty.marshall.usc.edu/gareth-james/ISL/College.csv",header=TRUE)

Part A: Foward Stepwise Selection

#test-train 
#dividing the data
set.seed(1)
# gives a 50-50 split of the indexes
train<-sample(dim(College)[1], floor(dim(College)[1]/2)) #This form of spliting the data is more universal 
College.train<-College[train,]
College.test<-College[-train,]

CollegeData<- College[-c(1),] 
modV3<-lm(Outstate~Apps, data=College.train) #Apps 
anova(modV3) 
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq  Mean Sq F value Pr(>F)
## Apps        1   22927838 22927838  1.4277 0.2329
## Residuals 386 6199070694 16059769
modV4<-lm(Outstate~Accept, data=College.train) #    Accept                          
anova(modV4)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq  Mean Sq F value Pr(>F)
## Accept      1      59247    59247  0.0037 0.9517
## Residuals 386 6221939285 16119014
modV5<-lm(Outstate~Enroll, data=College.train) # Enroll
anova(modV5)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq  Mean Sq F value  Pr(>F)  
## Enroll      1   99455586 99455586  6.2702 0.01269 *
## Residuals 386 6122542946 15861510                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV6<-lm(Outstate~Top10perc, data=College.train) # Top10perc
anova(modV6)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df    Sum Sq    Mean Sq F value    Pr(>F)    
## Top10perc   1 1.743e+09 1743043060  150.22 < 2.2e-16 ***
## Residuals 386 4.479e+09   11603512                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV7<-lm(Outstate~Top25perc, data=College.train) # Top25perc
anova(modV7)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Top25perc   1 1218341158 1218341158  93.987 < 2.2e-16 ***
## Residuals 386 5003657374   12962843                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV8<-lm(Outstate~F.Undergrad, data=College.train) # F.Undergrad
anova(modV8)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq   Mean Sq F value    Pr(>F)    
## F.Undergrad   1  216544009 216544009  13.918 0.0002196 ***
## Residuals   386 6005454523  15558172                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV9<-lm(Outstate~P.Undergrad, data=College.train) # P.Undergrad
anova(modV9)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq   Mean Sq F value    Pr(>F)    
## P.Undergrad   1  414732944 414732944  27.567 2.512e-07 ***
## Residuals   386 5807265588  15044730                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV11<-lm(Outstate~Room.Board, data=College.train) # Room.Board
anova(modV11)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Room.Board   1 2648219045 2648219045  286.03 < 2.2e-16 ***
## Residuals  386 3573779487    9258496                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV12<-lm(Outstate~Books, data=College.train) # Books
anova(modV12)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq  Mean Sq F value Pr(>F)
## Books       1   12385341 12385341  0.7699 0.3808
## Residuals 386 6209613191 16087081
modV13<-lm(Outstate~Personal, data=College.train) # Personal
anova(modV13)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## Personal    1  455832189 455832189  30.514 6.103e-08 ***
## Residuals 386 5766166343  14938255                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV14<-lm(Outstate~PhD, data=College.train) # PhD
anova(modV14)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## PhD         1  805196840 805196840  57.378 2.686e-13 ***
## Residuals 386 5416801692  14033165                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV15<-lm(Outstate~Terminal,data=College.train) # Terminal
anova(modV15)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq   Mean Sq F value    Pr(>F)    
## Terminal    1  963689965 963689965  70.742 7.985e-16 ***
## Residuals 386 5258308567  13622561                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV16<-lm(Outstate~S.F.Ratio,data=College.train) # S.F.Ratio   
anova(modV16)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## S.F.Ratio   1 2034178367 2034178367  187.49 < 2.2e-16 ***
## Residuals 386 4187820165   10849275                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV17<-lm(Outstate~perc.alumni,data=College.train) # perc.alumni   
anova(modV17)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## perc.alumni   1 2012057851 2012057851  184.48 < 2.2e-16 ***
## Residuals   386 4209940681   10906582                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV18<-lm(Outstate~Expend, data=College.train) # Expend
anova(modV18)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Expend      1 2810662153 2810662153  318.03 < 2.2e-16 ***
## Residuals 386 3411336379    8837659                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
modV19<-lm(Outstate~Grad.Rate,data=College.train) # Grad.Rate
anova(modV19)
## Analysis of Variance Table
## 
## Response: Outstate
##            Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Grad.Rate   1 2084067618 2084067618  194.41 < 2.2e-16 ***
## Residuals 386 4137930914   10720028                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
model1a<-lm(Outstate~Top10perc+Enroll+P.Undergrad+F.Undergrad+Room.Board+PhD+S.F.Ratio+perc.alumni+Expend, data=College.train)
anova(model1a)
## Analysis of Variance Table
## 
## Response: Outstate
##              Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Top10perc     1 1743043060 1743043060 372.500 < 2.2e-16 ***
## Enroll        1  410629702  410629702  87.754 < 2.2e-16 ***
## P.Undergrad   1   27430208   27430208   5.862 0.0159413 *  
## F.Undergrad   1  185005076  185005076  39.537 8.894e-10 ***
## Room.Board    1 1337879697 1337879697 285.913 < 2.2e-16 ***
## PhD           1   52560740   52560740  11.233 0.0008848 ***
## S.F.Ratio     1  278180783  278180783  59.449 1.122e-13 ***
## perc.alumni   1  311763425  311763425  66.626 4.941e-15 ***
## Expend        1  106724139  106724139  22.808 2.565e-06 ***
## Residuals   378 1768781702    4679317                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Part B: Fitting a GAM

#Fit a GAM on the training data, using outofstate tuition as the response
#and the features selected in the previous step as the predictors. 
#Plot the results, and explain your findings
  
# model with natural splines
# Once you get the right variables modify this code

gam1=lm(Outstate~ns(Top10perc ,4)+ns(Enroll,4)+ns(F.Undergrad,4) , data=College.train)
summary(gam1)
## 
## Call:
## lm(formula = Outstate ~ ns(Top10perc, 4) + ns(Enroll, 4) + ns(F.Undergrad, 
##     4), data = College.train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9151.7 -1872.8    79.1  1889.6 10574.9 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           8679.0     1327.6   6.537 2.05e-10 ***
## ns(Top10perc, 4)1     2439.7      992.4   2.458  0.01441 *  
## ns(Top10perc, 4)2     4838.5     1056.0   4.582 6.29e-06 ***
## ns(Top10perc, 4)3     6534.2     2366.1   2.762  0.00604 ** 
## ns(Top10perc, 4)4    11423.0     1386.5   8.239 2.94e-15 ***
## ns(Enroll, 4)1       -1175.7     1772.5  -0.663  0.50756    
## ns(Enroll, 4)2        4685.8     3350.7   1.398  0.16280    
## ns(Enroll, 4)3        5982.7     4441.0   1.347  0.17875    
## ns(Enroll, 4)4       10403.2     4713.4   2.207  0.02791 *  
## ns(F.Undergrad, 4)1   1169.5     1904.0   0.614  0.53944    
## ns(F.Undergrad, 4)2  -8778.2     3362.9  -2.610  0.00941 ** 
## ns(F.Undergrad, 4)3  -9096.4     4542.9  -2.002  0.04597 *  
## ns(F.Undergrad, 4)4 -14668.7     5185.9  -2.829  0.00493 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3146 on 375 degrees of freedom
## Multiple R-squared:  0.4036, Adjusted R-squared:  0.3845 
## F-statistic: 21.14 on 12 and 375 DF,  p-value: < 2.2e-16
# plot.Gam  
plot.Gam(gam1, se=TRUE ,col ="red")

Part C: Evaluating the Model

It is dificult to evaluate our model as the residuals are relative only to its model and we don’t have any other model that we can compare it with.

Part D:

When looking at the GAM models I found that most of my models were surprisingly linear with some non linear variations on the ends.