library(ISLR)
library(tidyverse)
## ── Attaching packages ─────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
attach(Boston)
data(Boston)
ggplot(Boston, aes(dis, nox))+
geom_point()
fit <- lm(nox~poly(dis, 3) , data = Boston)
coef(summary(fit))
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5546951 0.00275939 201.020894 0.000000e+00
## poly(dis, 3)1 -2.0030959 0.06207094 -32.271071 1.597201e-124
## poly(dis, 3)2 0.8563300 0.06207094 13.795987 6.133104e-37
## poly(dis, 3)3 -0.3180490 0.06207094 -5.123959 4.274950e-07
dislims <- range(dis)
dis.grid <- seq(from=1.13, to = 12.13)
preds <- predict(fit, newdata = list(dis=dis.grid), se= TRUE)
se.bands <- cbind(preds$fit +2*preds$se.fit, preds$fit - 2*preds$se.fit)
par(mfrow=c(1,1),mar=c(4.5,4.5,1,1) ,oma=c(0,0,4,0))
plot(dis ,nox ,xlim=dislims ,cex =.5,col=" darkgrey ")+
title(" Degree-3 Polynomial ",outer=T)+
lines(dis.grid ,preds$fit ,lwd=2,col="blue")+
matlines (dis.grid ,se.bands ,lwd=1, col=" blue",lty=3)
## integer(0)
set.seed(1)
train <- sample (506, 253)
polyfit <- lm(nox~dis, data = Boston, subset = train)
degreePoly<-10
polyMSE<-matrix(nrow=degreePoly, ncol=2)
colnames(polyMSE)<-c("degree", "MSE")
for(i in 1:degreePoly){
polyMSE[i,1]<-i
this.fit<-lm(nox~poly(dis,i), data=Boston, subset=train)
polyMSE[i,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
}
polyDF<-as.data.frame(polyMSE)
head(polyDF)
## degree MSE
## 1 1 0.005838344
## 2 2 0.004252326
## 3 3 0.004012238
## 4 4 0.004002253
## 5 5 0.004234507
## 6 6 0.004864446
ggplot(data=polyDF, aes(x=degree, y=MSE))+
geom_point()+
geom_line()
#### c.
#I'm going to perform cross validation with maximum splits
degreePoly<-10
splits<-10
splitMat<-matrix(nrow=degreePoly*splits, ncol=3)
colnames(splitMat)<-c("run","MSE", "degree")
for(i in 1:splits){
a=(i-1)*degreePoly+1
b=i*degreePoly
set.seed(i*3)
splitMat[a:b,1]<-i
train<-sample(392, 196)
for(j in 1:degreePoly){
c=a+(j-1)
this.fit<-lm(nox~poly(dis,j), data=Boston, subset=train)
splitMat[c,2]<-mean((nox-predict(this.fit, Boston))[-train]^2)
splitMat[c,3]<-j
}
}
splitDF<-as.data.frame(splitMat)
head(splitDF)
## run MSE degree
## 1 1 0.006832097 1
## 2 1 0.005003684 2
## 3 1 0.004854090 3
## 4 1 0.004943255 4
## 5 1 0.004995259 5
## 6 1 0.007894782 6
ggplot(data=splitDF, aes(x=degree, y=MSE, color= as.factor(run)))+
geom_point()+
geom_line()
splitDF
## run MSE degree
## 1 1 0.006832097 1
## 2 1 0.005003684 2
## 3 1 0.004854090 3
## 4 1 0.004943255 4
## 5 1 0.004995259 5
## 6 1 0.007894782 6
## 7 1 0.030518408 7
## 8 1 0.056770844 8
## 9 1 0.116310173 9
## 10 1 0.029334220 10
## 11 2 0.005681372 1
## 12 2 0.004060449 2
## 13 2 0.004105233 3
## 14 2 0.004508962 4
## 15 2 0.004634270 5
## 16 2 0.004504236 6
## 17 2 0.004419429 7
## 18 2 0.004508230 8
## 19 2 0.004579759 9
## 20 2 0.004590997 10
## 21 3 0.005585983 1
## 22 3 0.004165962 2
## 23 3 0.003962041 3
## 24 3 0.004018728 4
## 25 3 0.004019972 5
## 26 3 0.009400245 6
## 27 3 0.090105266 7
## 28 3 0.192330636 8
## 29 3 0.396259159 9
## 30 3 0.182964794 10
## 31 4 0.006986331 1
## 32 4 0.005054783 2
## 33 4 0.004800766 3
## 34 4 0.004854835 4
## 35 4 0.004856246 5
## 36 4 0.009126129 6
## 37 4 0.054871487 7
## 38 4 0.162090601 8
## 39 4 0.048007994 9
## 40 4 0.236357271 10
## 41 5 0.004854515 1
## 42 5 0.003741380 2
## 43 5 0.004004685 3
## 44 5 0.004243153 4
## 45 5 0.004225028 5
## 46 5 0.004152459 6
## 47 5 0.004196447 7
## 48 5 0.004279911 8
## 49 5 0.004385722 9
## 50 5 0.004415623 10
## 51 6 0.005314151 1
## 52 6 0.003928096 2
## 53 6 0.004089739 3
## 54 6 0.004075101 4
## 55 6 0.004895301 5
## 56 6 0.012145802 6
## 57 6 0.054230988 7
## 58 6 0.128488168 8
## 59 6 0.263125742 9
## 60 6 3.081664349 10
## 61 7 0.005762151 1
## 62 7 0.004216960 2
## 63 7 0.004094056 3
## 64 7 0.004209381 4
## 65 7 0.004158368 5
## 66 7 0.004072725 6
## 67 7 0.004089253 7
## 68 7 0.004228465 8
## 69 7 0.004267710 9
## 70 7 0.004267725 10
## 71 8 0.005891821 1
## 72 8 0.004105193 2
## 73 8 0.004235233 3
## 74 8 0.004491641 4
## 75 8 0.004564999 5
## 76 8 0.012183853 6
## 77 8 0.074788246 7
## 78 8 0.108301705 8
## 79 8 0.121847376 9
## 80 8 0.010121529 10
## 81 9 0.006573843 1
## 82 9 0.004951756 2
## 83 9 0.005027012 3
## 84 9 0.005444605 4
## 85 9 0.005594238 5
## 86 9 0.005995383 6
## 87 9 0.024677953 7
## 88 9 0.081190687 8
## 89 9 0.100257059 9
## 90 9 12.383529830 10
## 91 10 0.005167166 1
## 92 10 0.003961295 2
## 93 10 0.003914861 3
## 94 10 0.003930742 4
## 95 10 0.004307891 5
## 96 10 0.013746126 6
## 97 10 0.092094653 7
## 98 10 0.133653648 8
## 99 10 0.024250929 9
## 100 10 18.202678270 10
#### The best fit for this model would be a polynomial of 6 degrees. After 6 degrees, there is a dramtic increase in the MSE that would result in overfitting in the model.
library(splines)
attr(bs(dis ,df=4) ,"knots")
## 50%
## 3.20745
par(mfrow=c(1,1))
fit=lm(nox~bs(dis ,knots= c(2,3,6)),data=Boston)
pred=predict (fit ,newdata =list(dis=dis.grid),se=T)
## Warning in bs(dis, degree = 3L, knots = c(2, 3, 6), Boundary.knots =
## c(1.1296, : some 'x' values beyond boundary knots may cause ill-conditioned
## bases
plot(dis ,nox ,col="gray")+
lines(dis.grid ,pred$fit ,lwd=2)+
lines(dis.grid ,pred$fit+2*pred$se ,lty="dashed")+
lines(dis.grid ,pred$fit-2*pred$se ,lty="dashed")
## integer(0)
##### On 4 degrees of freedom, I chose 3 knots at 2, 3, and 6. This was based on the outputs for knots that r would have given for 4 degrees of freedom, including a third, higher value knot to include more of the data.
attach(College)
dim(College)
## [1] 777 18
view(College)
set.seed(2)
train <- sample(777,388)
test <- sample(1:777, 388, replace = FALSE)
mod <- lm(Outstate~Room.Board+Private+Grad.Rate, data = College, subset = train)
anova(mod)
## Analysis of Variance Table
##
## Response: Outstate
## Df Sum Sq Mean Sq F value Pr(>F)
## Room.Board 1 2637863392 2637863392 401.224 < 2.2e-16 ***
## Private 1 738162530 738162530 112.276 < 2.2e-16 ***
## Grad.Rate 1 393890725 393890725 59.912 8.874e-14 ***
## Residuals 384 2524621641 6574536
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#I Found that room and board, private college, and graduatioin rate were significatnt predictors of outstate.
library(gam)
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loaded gam 1.16.1
#Fit a GAM with Natural splines
gam1 <- lm(Outstate~ns(Room.Board, 3)+ns(Grad.Rate,4)+Private, data = College, subset = train)
summary(gam1)
##
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 3) + ns(Grad.Rate, 4) +
## Private, data = College, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7047.8 -1602.6 -189.9 1594.3 12003.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2140.62 1725.79 1.240 0.216
## ns(Room.Board, 3)1 5416.21 708.79 7.642 1.78e-13 ***
## ns(Room.Board, 3)2 11603.42 2426.86 4.781 2.50e-06 ***
## ns(Room.Board, 3)3 8257.87 1224.65 6.743 5.80e-11 ***
## ns(Grad.Rate, 4)1 1194.06 1310.97 0.911 0.363
## ns(Grad.Rate, 4)2 5628.10 1043.68 5.393 1.22e-07 ***
## ns(Grad.Rate, 4)3 1914.44 2999.25 0.638 0.524
## ns(Grad.Rate, 4)4 -99.88 1590.32 -0.063 0.950
## PrivateYes 2509.77 314.39 7.983 1.72e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2506 on 379 degrees of freedom
## Multiple R-squared: 0.6218, Adjusted R-squared: 0.6138
## F-statistic: 77.88 on 8 and 379 DF, p-value: < 2.2e-16
##### It appears that in this GAM model there is a loss of significance among most of the predictors #### c.
gam2 <- lm(Outstate~ns(Room.Board, 3)+ns(Grad.Rate,4)+Private, data = College, subset = test)
summary(gam2)
##
## Call:
## lm(formula = Outstate ~ ns(Room.Board, 3) + ns(Grad.Rate, 4) +
## Private, data = College, subset = test)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7957.2 -1487.3 -175.3 1498.0 7377.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.12 1901.73 -0.013 0.9899
## ns(Room.Board, 3)1 4569.56 695.35 6.572 1.65e-10 ***
## ns(Room.Board, 3)2 12714.51 2357.44 5.393 1.22e-07 ***
## ns(Room.Board, 3)3 9605.09 1314.88 7.305 1.65e-12 ***
## ns(Grad.Rate, 4)1 2376.47 1576.11 1.508 0.1324
## ns(Grad.Rate, 4)2 6395.30 1198.75 5.335 1.65e-07 ***
## ns(Grad.Rate, 4)3 7410.74 3604.40 2.056 0.0405 *
## ns(Grad.Rate, 4)4 2996.87 1761.30 1.702 0.0897 .
## PrivateYes 2933.10 303.23 9.673 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2483 on 379 degrees of freedom
## Multiple R-squared: 0.6516, Adjusted R-squared: 0.6442
## F-statistic: 88.59 on 8 and 379 DF, p-value: < 2.2e-16
on the testing set, it appears that the model shows much more significance for the predictors than on the training set. this is probably due to the nature of the splits within the training and testing set. #### d. It appears that the variable of room and board has a non-linear reationship to whether or not a school is out of state.