Problem Set 1

a.

library(ISLR)
library(tidyverse)
## ── Attaching packages ─────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
attach(Boston)
data(Boston)


ggplot(Boston, aes(dis, nox))+
  geom_point()

fit <- lm(nox~poly(dis, 3) , data = Boston)
coef(summary(fit))
##                 Estimate Std. Error    t value      Pr(>|t|)
## (Intercept)    0.5546951 0.00275939 201.020894  0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2  0.8563300 0.06207094  13.795987  6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094  -5.123959  4.274950e-07
dislims <- range(dis)
dis.grid <- seq(from=1.13, to = 12.13)
preds <- predict(fit, newdata = list(dis=dis.grid), se= TRUE)
se.bands <- cbind(preds$fit +2*preds$se.fit, preds$fit - 2*preds$se.fit)

par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")+
title(" Degree-3 Polynomial ",outer=T)+
lines(dis.grid ,preds$fit ,lwd=2,col="blue")+
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)

## integer(0)

b.

set.seed(1)
train <- sample (506, 253)
polyfit <- lm(nox~dis, data = Boston, subset = train)

degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){ 
  polyMSE[i,1]<-i
  this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
  polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
}

polyDF<-as.data.frame(polyMSE)
head(polyDF)
##   degree         MSE
## 1      1 0.005838344
## 2      2 0.004252326
## 3      3 0.004012238
## 4      4 0.004002253
## 5      5 0.004234507
## 6      6 0.004864446
ggplot(data=polyDF, aes(x=degree, y=MSE))+
  geom_point()+
  geom_line()

#### c.

#I'm going to perform cross validation with maximum splits 
degreePoly<-10
splits<-10

splitMat<-matrix(nrow=degreePoly*splits, ncol=3)
colnames(splitMat)<-c("run","MSE", "degree")

for(i in 1:splits){
  a=(i-1)*degreePoly+1
  b=i*degreePoly
  
  set.seed(i*3)
  splitMat[a:b,1]<-i
  train<-sample(392, 196)
  
  for(j in 1:degreePoly){ 
    c=a+(j-1)
    
    this.fit<-lm(nox~poly(dis,j), data=Boston, subset=train)
    splitMat[c,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
    splitMat[c,3]<-j
  }
}
splitDF<-as.data.frame(splitMat)
head(splitDF)
##   run         MSE degree
## 1   1 0.006832097      1
## 2   1 0.005003684      2
## 3   1 0.004854090      3
## 4   1 0.004943255      4
## 5   1 0.004995259      5
## 6   1 0.007894782      6
ggplot(data=splitDF, aes(x=degree, y=MSE, color= as.factor(run)))+
  geom_point()+
  geom_line()

splitDF
##     run          MSE degree
## 1     1  0.006832097      1
## 2     1  0.005003684      2
## 3     1  0.004854090      3
## 4     1  0.004943255      4
## 5     1  0.004995259      5
## 6     1  0.007894782      6
## 7     1  0.030518408      7
## 8     1  0.056770844      8
## 9     1  0.116310173      9
## 10    1  0.029334220     10
## 11    2  0.005681372      1
## 12    2  0.004060449      2
## 13    2  0.004105233      3
## 14    2  0.004508962      4
## 15    2  0.004634270      5
## 16    2  0.004504236      6
## 17    2  0.004419429      7
## 18    2  0.004508230      8
## 19    2  0.004579759      9
## 20    2  0.004590997     10
## 21    3  0.005585983      1
## 22    3  0.004165962      2
## 23    3  0.003962041      3
## 24    3  0.004018728      4
## 25    3  0.004019972      5
## 26    3  0.009400245      6
## 27    3  0.090105266      7
## 28    3  0.192330636      8
## 29    3  0.396259159      9
## 30    3  0.182964794     10
## 31    4  0.006986331      1
## 32    4  0.005054783      2
## 33    4  0.004800766      3
## 34    4  0.004854835      4
## 35    4  0.004856246      5
## 36    4  0.009126129      6
## 37    4  0.054871487      7
## 38    4  0.162090601      8
## 39    4  0.048007994      9
## 40    4  0.236357271     10
## 41    5  0.004854515      1
## 42    5  0.003741380      2
## 43    5  0.004004685      3
## 44    5  0.004243153      4
## 45    5  0.004225028      5
## 46    5  0.004152459      6
## 47    5  0.004196447      7
## 48    5  0.004279911      8
## 49    5  0.004385722      9
## 50    5  0.004415623     10
## 51    6  0.005314151      1
## 52    6  0.003928096      2
## 53    6  0.004089739      3
## 54    6  0.004075101      4
## 55    6  0.004895301      5
## 56    6  0.012145802      6
## 57    6  0.054230988      7
## 58    6  0.128488168      8
## 59    6  0.263125742      9
## 60    6  3.081664349     10
## 61    7  0.005762151      1
## 62    7  0.004216960      2
## 63    7  0.004094056      3
## 64    7  0.004209381      4
## 65    7  0.004158368      5
## 66    7  0.004072725      6
## 67    7  0.004089253      7
## 68    7  0.004228465      8
## 69    7  0.004267710      9
## 70    7  0.004267725     10
## 71    8  0.005891821      1
## 72    8  0.004105193      2
## 73    8  0.004235233      3
## 74    8  0.004491641      4
## 75    8  0.004564999      5
## 76    8  0.012183853      6
## 77    8  0.074788246      7
## 78    8  0.108301705      8
## 79    8  0.121847376      9
## 80    8  0.010121529     10
## 81    9  0.006573843      1
## 82    9  0.004951756      2
## 83    9  0.005027012      3
## 84    9  0.005444605      4
## 85    9  0.005594238      5
## 86    9  0.005995383      6
## 87    9  0.024677953      7
## 88    9  0.081190687      8
## 89    9  0.100257059      9
## 90    9 12.383529830     10
## 91   10  0.005167166      1
## 92   10  0.003961295      2
## 93   10  0.003914861      3
## 94   10  0.003930742      4
## 95   10  0.004307891      5
## 96   10  0.013746126      6
## 97   10  0.092094653      7
## 98   10  0.133653648      8
## 99   10  0.024250929      9
## 100  10 18.202678270     10

#### The best fit for this model would be a polynomial of 6 degrees. After 6 degrees, there is a dramtic increase in the MSE that would result in overfitting in the model.

d.

library(splines)
attr(bs(dis ,df=4) ,"knots")
##     50% 
## 3.20745
par(mfrow=c(1,1))
fit=lm(nox~bs(dis ,knots= c(2,3,6)),data=Boston)
pred=predict (fit ,newdata =list(dis=dis.grid),se=T)
## Warning in bs(dis, degree = 3L, knots = c(2, 3, 6), Boundary.knots =
## c(1.1296, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
plot(dis ,nox ,col="gray")+
lines(dis.grid ,pred$fit ,lwd=2)+
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")+
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")

## integer(0)

##### On 4 degrees of freedom, I chose 3 knots at 2, 3, and 6. This was based on the outputs for knots that r would have given for 4 degrees of freedom, including a third, higher value knot to include more of the data.

Problem 2

a.

attach(College)
dim(College)
## [1] 777  18
view(College)
set.seed(2)
train <- sample(777,388)
test <- sample(1:777, 388, replace = FALSE)
mod <- lm(Outstate~Room.Board+Private+Grad.Rate, data = College, subset = train)
anova(mod)
## Analysis of Variance Table
## 
## Response: Outstate
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## Room.Board   1 2637863392 2637863392 401.224 < 2.2e-16 ***
## Private      1  738162530  738162530 112.276 < 2.2e-16 ***
## Grad.Rate    1  393890725  393890725  59.912 8.874e-14 ***
## Residuals  384 2524621641    6574536                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#I Found that room and board, private college, and graduatioin rate were significatnt predictors of outstate. 

b.

library(gam)
## Loading required package: foreach
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
## Loaded gam 1.16.1
#Fit a GAM with Natural splines

gam1 <- lm(Outstate~ns(Room.Board, 3)+ns(Grad.Rate,4)+Private, data = College, subset = train)
summary(gam1)
## 
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 3) + ns(Grad.Rate, 4) + 
##     Private, data = College, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7047.8 -1602.6  -189.9  1594.3 12003.3 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2140.62    1725.79   1.240    0.216    
## ns(Room.Board, 3)1  5416.21     708.79   7.642 1.78e-13 ***
## ns(Room.Board, 3)2 11603.42    2426.86   4.781 2.50e-06 ***
## ns(Room.Board, 3)3  8257.87    1224.65   6.743 5.80e-11 ***
## ns(Grad.Rate, 4)1   1194.06    1310.97   0.911    0.363    
## ns(Grad.Rate, 4)2   5628.10    1043.68   5.393 1.22e-07 ***
## ns(Grad.Rate, 4)3   1914.44    2999.25   0.638    0.524    
## ns(Grad.Rate, 4)4    -99.88    1590.32  -0.063    0.950    
## PrivateYes          2509.77     314.39   7.983 1.72e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2506 on 379 degrees of freedom
## Multiple R-squared:  0.6218, Adjusted R-squared:  0.6138 
## F-statistic: 77.88 on 8 and 379 DF,  p-value: < 2.2e-16

##### It appears that in this GAM model there is a loss of significance among most of the predictors #### c.

gam2 <- lm(Outstate~ns(Room.Board, 3)+ns(Grad.Rate,4)+Private, data = College, subset = test)
summary(gam2)
## 
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 3) + ns(Grad.Rate, 4) + 
##     Private, data = College, subset = test)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7957.2 -1487.3  -175.3  1498.0  7377.9 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -24.12    1901.73  -0.013   0.9899    
## ns(Room.Board, 3)1  4569.56     695.35   6.572 1.65e-10 ***
## ns(Room.Board, 3)2 12714.51    2357.44   5.393 1.22e-07 ***
## ns(Room.Board, 3)3  9605.09    1314.88   7.305 1.65e-12 ***
## ns(Grad.Rate, 4)1   2376.47    1576.11   1.508   0.1324    
## ns(Grad.Rate, 4)2   6395.30    1198.75   5.335 1.65e-07 ***
## ns(Grad.Rate, 4)3   7410.74    3604.40   2.056   0.0405 *  
## ns(Grad.Rate, 4)4   2996.87    1761.30   1.702   0.0897 .  
## PrivateYes          2933.10     303.23   9.673  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2483 on 379 degrees of freedom
## Multiple R-squared:  0.6516, Adjusted R-squared:  0.6442 
## F-statistic: 88.59 on 8 and 379 DF,  p-value: < 2.2e-16

on the testing set, it appears that the model shows much more significance for the predictors than on the training set. this is probably due to the nature of the splits within the training and testing set. #### d. It appears that the variable of room and board has a non-linear reationship to whether or not a school is out of state.