Question#1 (5.4.6)
library(ISLR)
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
attach(Default)
set.seed(100)
glm.fit=glm(default~income+balance,data=Default,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
Std. Error income=4.99exp(-06) Std. Error balance=2.27exp(-04)
dasboot=function(data,index) return(coef(glm(default~income+balance,data=data,family=binomial,subset=index)))
library(boot)
boot(Default,dasboot,50)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = dasboot, R = 50)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 2.151888e-02 3.888043e-01
## t2* 2.080898e-05 -2.361924e-07 4.269663e-06
## t3* 5.647103e-03 -1.405635e-05 2.194840e-04
Std.Error (t1)= 3.89exp(-01) Std.Error (t2)= 4.27exp(-06) Std.Error (t3)= 2.19exp(-04)
Question#2 (5.4.7)
library(ISLR)
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.1499
## 3rd Qu.: 1.4050
## Max. : 12.0260
set.seed(2000)
attach(Weekly)
glm.fit=glm(Direction~Lag1+Lag2,family=binomial,data=Weekly)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.623 -1.261 1.001 1.083 1.506
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
glm.fit=glm(Direction~Lag1+Lag2,data=Weekly[-1,],family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = binomial, data = Weekly[-1,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6258 -1.2617 0.9999 1.0819 1.5071
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
predict.glm(glm.fit,Weekly[1,],type="response")>0.5
## 1
## TRUE
The prediction was UP, but the true Direction was DOWN.
count=rep(0,dim(Weekly)[1])
for(i in 1:(dim(Weekly)[1])){
glm.fit=glm(Direction~Lag1+Lag2,data=Weekly[-i,],family=binomial)
is_up=predict.glm(glm.fit,Weekly[i,],type="response") > 0.5
is_true_up=Weekly[i,]$Direction == "Up"
if (is_up !=is_true_up)
count[i]=1
}
sum(count)
## [1] 490
There were 490 errors.
mean(count)
## [1] 0.4499541
LOOCV estimates a test error rate of 45%.
Question#3 (5.4.8)
set.seed(1)
y = rnorm(100)
x = rnorm(100)
y = x - 2 * x^2 + rnorm(100)
n=100 p=2
plot(x,y)
I find a quadratic trend X from -2 to 2, Y from -8 to 2.
library(boot)
Data = data.frame(x, y)
set.seed(100)
#i.
glm.fit = glm(y ~ x)
cv.glm(Data, glm.fit)$delta
## [1] 5.890979 5.888812
#ii.
glm.fit = glm(y ~ poly(x, 2))
cv.glm(Data, glm.fit)$delta
## [1] 1.086596 1.086326
#iii.
glm.fit = glm(y ~ poly(x, 3))
cv.glm(Data, glm.fit)$delta
## [1] 1.102585 1.102227
#iv.
glm.fit = glm(y ~ poly(x, 4))
cv.glm(Data, glm.fit)$delta
## [1] 1.114772 1.114334
set.seed(20)
# i.
glm.fit = glm(y ~ x)
cv.glm(Data, glm.fit)$delta
## [1] 5.890979 5.888812
# ii.
glm.fit = glm(y ~ poly(x, 2))
cv.glm(Data, glm.fit)$delta
## [1] 1.086596 1.086326
# iii.
glm.fit = glm(y ~ poly(x, 3))
cv.glm(Data, glm.fit)$delta
## [1] 1.102585 1.102227
# iv.
glm.fit = glm(y ~ poly(x, 4))
cv.glm(Data, glm.fit)$delta
## [1] 1.114772 1.114334
Exact same, because LOOCV will be the same since its from n folds of a single observation.
THe quadratic polynomial had the lowest LOOCV error rate. This is expected because it matches the true from of Y.
summary(glm.fit)
##
## Call:
## glm(formula = y ~ poly(x, 4))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8914 -0.5244 0.0749 0.5932 2.7796
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8277 0.1041 -17.549 <2e-16 ***
## poly(x, 4)1 2.3164 1.0415 2.224 0.0285 *
## poly(x, 4)2 -21.0586 1.0415 -20.220 <2e-16 ***
## poly(x, 4)3 -0.3048 1.0415 -0.293 0.7704
## poly(x, 4)4 -0.4926 1.0415 -0.473 0.6373
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.084654)
##
## Null deviance: 552.21 on 99 degrees of freedom
## Residual deviance: 103.04 on 95 degrees of freedom
## AIC: 298.78
##
## Number of Fisher Scoring iterations: 2
The p-values that are significant are the linear and quadratic terms, which does agree with the CV results found.