\[ Var(\alpha X + (1-\alpha)Y) \\ = Var(\alpha X) + Var((1-\alpha)Y) +2 Cov(\alpha X, (1-\alpha)Y) \\ = \alpha^2 \sigma_X^2 + (1-\alpha)^2 \sigma_Y^2 + 2 \alpha (1-\alpha) \sigma_{XY} \\ = \alpha^2 \sigma_X^2 + (1+\alpha^2-2\alpha) \sigma_Y^2 + (2\alpha - 2\alpha^2) \sigma_{XY} \\ = \alpha^2 \sigma_X^2 + \sigma_Y^2+\alpha^2\sigma_Y^2-2\alpha\sigma_Y^2 + 2\alpha \sigma_{XY} - 2\alpha^2 \sigma_{XY} \]
\[ \frac{\partial }{\partial \alpha}: 2\alpha\sigma_X^2 + 0 + 2\alpha\sigma_Y^2 - 2\sigma_Y^2 + 2\sigma_{XY} - 4\alpha\sigma_{XY} = 0 \]
\[ (2\sigma_X^2 + 2\sigma_Y^2 - 4\sigma_{XY}) \alpha = 2\sigma_Y^2 - 2\sigma_{XY} \]
\[ \alpha = \frac{\sigma_Y^2 - \sigma_{XY}}{\sigma_X^2 + \sigma_Y^2 - 2\sigma_{XY}} \]
x = 1:100000
plot(x, 1 - (1 - 1/x)^x)
set.seed(42)
store=rep(NA, 10000)
for(i in 1:10000){
store[i]=sum(sample(1:100, rep=TRUE)==4)>0
}
mean(store)
## [1] 0.6313
We can estimate the standard deviation of our prediction by using the bootstrap method. Since we often don’t have access to new data sets from the population we instead use repeated samples with replacement from the one data set available. With different results for the Y value we can estimate the standard deviation of our prediction.
library(ISLR)
attach(Default)
set.seed(42)
glm.fit = glm(default~ balance+income, data=Default, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = default ~ balance + income, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
train = sample(nrow(Default), nrow(Default)*0.5)
glm.fit_val = glm(default~balance+income, data=Default, family=binomial, subset = train)
glm.probs = predict(glm.fit_val, Default[-train,], type="response")
glm.pred = ifelse(glm.probs > 0.5, "Yes", "No")
table(glm.pred, Default[-train, ]$default)
##
## glm.pred No Yes
## No 4817 110
## Yes 20 53
mean(glm.pred !=Default[-train,]$default)
## [1] 0.026
train = sample(nrow(Default), nrow(Default)*0.5)
glm.fit_val = glm(default~balance+income, data=Default, family=binomial, subset = train)
glm.probs = predict(glm.fit_val, Default[-train,], type="response")
glm.pred = ifelse(glm.probs > 0.5, "Yes", "No")
table(glm.pred, Default[-train, ]$default)
##
## glm.pred No Yes
## No 4816 116
## Yes 14 54
mean(glm.pred !=Default[-train,]$default)
## [1] 0.026
train = sample(nrow(Default), nrow(Default)*0.5)
glm.fit_val = glm(default~balance+income, data=Default, family=binomial, subset = train)
glm.probs = predict(glm.fit_val, Default[-train,], type="response")
glm.pred = ifelse(glm.probs > 0.5, "Yes", "No")
table(glm.pred, Default[-train, ]$default)
##
## glm.pred No Yes
## No 4802 135
## Yes 12 51
mean(glm.pred !=Default[-train,]$default)
## [1] 0.0294
train = sample(nrow(Default), nrow(Default)*0.5)
glm.fit_val = glm(default~balance+income, data=Default, family=binomial, subset = train)
glm.probs = predict(glm.fit_val, Default[-train,], type="response")
glm.pred = ifelse(glm.probs > 0.5, "Yes", "No")
table(glm.pred, Default[-train, ]$default)
##
## glm.pred No Yes
## No 4817 109
## Yes 20 54
mean(glm.pred !=Default[-train,]$default)
## [1] 0.0258
Since every train set has its own unique set of observations the error rate varies across every run.
train = sample(nrow(Default), nrow(Default)*0.5)
glm.fit_val = glm(default~balance+income+student, data=Default, family=binomial, subset = train)
glm.probs = predict(glm.fit_val, Default[-train,], type="response")
glm.pred = ifelse(glm.probs > 0.5, "Yes", "No")
table(glm.pred, Default[-train, ]$default)
##
## glm.pred No Yes
## No 4810 128
## Yes 11 51
mean(glm.pred !=Default[-train,]$default)
## [1] 0.0278
Adding a student dummy variable doesn’t seem to decrease the validation set estimate for the test error.
set.seed(42)
require(ISLR)
attach(Default)
## The following objects are masked from Default (pos = 3):
##
## balance, default, income, student
glm.fit = glm(default~income+balance, data=Default, family="binomial")
summary(glm.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4725 -0.1444 -0.0574 -0.0211 3.7245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot.fn = function(data, index){
return(coef(glm(default~income+balance, data=Default, family="binomial", subset=index)))
}
boot.fn(Default, 1:nrow(Default))
## (Intercept) income balance
## -1.154047e+01 2.080898e-05 5.647103e-03
library(boot)
boot(Default, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -2.292405e-02 4.435269e-01
## t2* 2.080898e-05 2.737444e-08 5.073444e-06
## t3* 5.647103e-03 1.176249e-05 2.299133e-04
set.seed(1)
attach(Weekly)
glm.fit1 = glm(Direction ~ Lag1 + Lag2, data=Weekly, family="binomial")
summary(glm.fit1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.623 -1.261 1.001 1.083 1.506
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22122 0.06147 3.599 0.000319 ***
## Lag1 -0.03872 0.02622 -1.477 0.139672
## Lag2 0.06025 0.02655 2.270 0.023232 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1488.2 on 1086 degrees of freedom
## AIC: 1494.2
##
## Number of Fisher Scoring iterations: 4
glm.fit2 = glm(Direction ~ Lag1 + Lag2, data=Weekly[-1, ], family="binomial")
summary(glm.fit2)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly[-1,
## ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6258 -1.2617 0.9999 1.0819 1.5071
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
predict(glm.fit2, Weekly[1, ], type="response")>0.5
## 1
## TRUE
Weekly[1,]$Direction
## [1] Down
## Levels: Down Up
This oberservation was not correctly classified.
error <- rep(0, nrow(Weekly))
for (i in 1:nrow(Weekly)){
fit = glm(Direction ~ Lag1+Lag2, data=Weekly[-i, ], family=binomial)
pred = predict(fit, Weekly[i, ], type="response") > 0.5
true <- Weekly[i, ]$Direction == "Up"
if (pred != true)
error[i] = 1
}
mean(error)
## [1] 0.4499541
set.seed(1)
x=rnorm(100)
y=x-2*x^2+rnorm(100)
In this data set, what is n and what is p? Write out the model used to generate the data in equation form.
n = 100 p = 2 *\(y =X -2X^2 + \epsilon\)
plot(x, y)
library(boot)
set.seed(1)
df <- data.frame(x, y)
glm.fit1 <- glm(y ~ x)
cv.glm(df, glm.fit1)$delta[1]
## [1] 7.288162
glm.fit2 <- glm(y ~ poly(x, 2))
cv.glm(df, glm.fit2)$delta[1]
## [1] 0.9374236
glm.fit3 <- glm(y ~ poly(x, 3))
cv.glm(df, glm.fit3)$delta[1]
## [1] 0.9566218
glm.fit4 <- glm(y ~ poly(x, 4))
cv.glm(df, glm.fit4)$delta[1]
## [1] 0.9539049
set.seed(42)
df <- data.frame(x, y)
glm.fit1 <- glm(y ~ x)
cv.glm(df, glm.fit1)$delta[1]
## [1] 7.288162
glm.fit2 <- glm(y ~ poly(x, 2))
cv.glm(df, glm.fit2)$delta[1]
## [1] 0.9374236
glm.fit3 <- glm(y ~ poly(x, 3))
cv.glm(df, glm.fit3)$delta[1]
## [1] 0.9566218
glm.fit4 <- glm(y ~ poly(x, 4))
cv.glm(df, glm.fit4)$delta[1]
## [1] 0.9539049
The results are the same because LOOCV evaluates the model n times over n-1 training observations.
The quadratic model performs best. This is exactly what I expected because we used a quadratic function to generate the data.
summary(glm.fit4)
##
## Call:
## glm(formula = y ~ poly(x, 4))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0550 -0.6212 -0.1567 0.5952 2.2267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.55002 0.09591 -16.162 < 2e-16 ***
## poly(x, 4)1 6.18883 0.95905 6.453 4.59e-09 ***
## poly(x, 4)2 -23.94830 0.95905 -24.971 < 2e-16 ***
## poly(x, 4)3 0.26411 0.95905 0.275 0.784
## poly(x, 4)4 1.25710 0.95905 1.311 0.193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.9197797)
##
## Null deviance: 700.852 on 99 degrees of freedom
## Residual deviance: 87.379 on 95 degrees of freedom
## AIC: 282.3
##
## Number of Fisher Scoring iterations: 2
Only \(X\) and \(X^2\) are statistically significant. This is what we expected since the quadratic model performed best.
mu = mean(Boston$medv)
mu
## [1] 22.53281
se_mu = sd(Boston$medv)/sqrt(nrow(Boston))
se_mu
## [1] 0.4088611
set.seed(1)
boot.fn <- function(data, index) {
mu <- mean(data[index])
return (mu)
}
boot(Boston$medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.007650791 0.4106622
Both estimates are almost equal.
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
conf = c(22.53 - 2 * 0.4119, 22.53 + 2 * 0.4119)
conf
## [1] 21.7062 23.3538
mu_med = median(Boston$medv)
mu_med
## [1] 21.2
set.seed(1)
boot.fn <- function(data, index) {
mu <- median(data[index])
return (mu)
}
boot(Boston$medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.02295 0.3778075
Same value of 21.2 with a rather small standard error of 0.38.
mu_01 = quantile(Boston$medv, c(0.1))
mu_01
## 10%
## 12.75
boot.fn <- function(data, index) {
mu <- quantile(data[index], c(0.1))
return (mu)
}
boot(Boston$medv, boot.fn, 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0.01455 0.4823468