library(ISLR2)
library(tidyr)
library(moments)
library(ggplot2)
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:ISLR2':
##
## Boston
library(class)
library("corrplot")
library(boot)
data("Default")
summary(Default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
def.fit<-glm(default ~ income + balance, data = Default, family = binomial)
summary(def.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
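To make these coefficients concrete, we can ask the fitted model for a predicted default probability. A minimal sketch; the income and balance values are made up purely for illustration:
# predicted probability of default for a hypothetical customer
predict(def.fit, newdata = data.frame(income = 40000, balance = 1500),
        type = "response")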
# split data 50/50
set.seed(42)
ind <- sample(1:nrow(Default), size = 0.5 * nrow(Default))
train <- Default[ind, ]
test <- Default[-ind, ]
cross.fit <- glm(default ~ income + balance, data = train, family = "binomial")
summary(cross.fit)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.123e+01 5.898e-01 -19.033 < 2e-16 ***
## income 1.808e-05 6.975e-06 2.592 0.00954 **
## balance 5.528e-03 3.115e-04 17.746 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1483.83 on 4999 degrees of freedom
## Residual deviance: 813.39 on 4997 degrees of freedom
## AIC: 819.39
##
## Number of Fisher Scoring iterations: 8
def_pred<-predict(cross.fit, test)
def_prob<-rep("No", length(def_pred))
def_prob[def_pred > 0.5] = "Yes"
table(def_prob, test$default)
##
## def_prob No Yes
## No 4831 124
## Yes 6 39
mean(def_prob != test$default)
## [1] 0.026
# split data 60/40
set.seed(42)
ind <- sample(1:nrow(Default), size = 0.6 * nrow(Default))
train6 <- Default[ind, ]
test6 <- Default[-ind, ]
cross.fit6 <- glm(default ~ income + balance, data = train6, family = "binomial")
summary(cross.fit6)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = train6)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.151e+01 5.602e-01 -20.540 < 2e-16 ***
## income 1.681e-05 6.415e-06 2.621 0.00876 **
## balance 5.718e-03 2.982e-04 19.177 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1753.7 on 5999 degrees of freedom
## Residual deviance: 941.4 on 5997 degrees of freedom
## AIC: 947.4
##
## Number of Fisher Scoring iterations: 8
def_pred6<-predict(cross.fit6, test6)
def_prob6<-rep("No", length(def_pred6))
def_prob[def_pred6 > 0.5] = "Yes"
table(def_prob6, test6$default)
##
## def_prob6 No Yes
## No 3867 133
mean(def_prob6 != test6$default)
## [1] 0.03325
# split data 70/30
set.seed(42)
ind <- sample(1:nrow(Default), size = 0.7 * nrow(Default))
train7 <- Default[ind, ]
test7 <- Default[-ind, ]
cross.fit7 <- glm(default ~ income + balance, data = train7, family = "binomial")
summary(cross.fit7)
def_pred7<-predict(cross.fit7, test7)
def_prob7<-rep("No", length(def_pred7))
def_prob[def_pred7 > 0.5] = "Yes"
table(def_prob7, test7$default)
##
## def_prob7 No Yes
## No 2904 96
mean(def_prob7 != test7$default)
## [1] 0.032
# split data 80/20
set.seed(42)
ind <- sample(1:nrow(Default), size = 0.8 * nrow(Default))
train8 <- Default[ind, ]
test8 <- Default[-ind, ]
cross.fit8 <- glm(default ~ income + balance, data = train8, family = "binomial")
summary(cross.fit8)
def_pred8<-predict(cross.fit8, test8)
def_prob8<-rep("No", length(def_pred8))
def_prob[def_pred8 > 0.5] = "Yes"
table(def_prob8, test8$default)
##
## def_prob8 No Yes
## No 1939 61
mean(def_prob8 != test8$default)
## [1] 0.0305
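The four splits above repeat the same recipe, so the comparison can be wrapped in a helper function. A sketch; val_err is a hypothetical name introduced here, not part of the original analysis:
# validation-set error for a given training fraction (hypothetical helper)
val_err <- function(frac, seed = 42) {
  set.seed(seed)
  idx  <- sample(1:nrow(Default), size = frac * nrow(Default))
  fit  <- glm(default ~ income + balance, data = Default[idx, ],
              family = "binomial")
  prob <- predict(fit, Default[-idx, ], type = "response")
  pred <- ifelse(prob > 0.5, "Yes", "No")
  mean(pred != Default[-idx, ]$default)
}
sapply(c(0.5, 0.6, 0.7, 0.8), val_err)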
dummy.fit<-glm(default ~ income + balance + student, data = train, family = "binomial")
summary(dummy.fit)
##
## Call:
## glm(formula = default ~ income + balance + student, family = "binomial",
## data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.052e+01 6.677e-01 -15.763 <2e-16 ***
## income -4.771e-07 1.126e-05 -0.042 0.9662
## balance 5.616e-03 3.171e-04 17.713 <2e-16 ***
## studentYes -6.790e-01 3.223e-01 -2.107 0.0352 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1483.8 on 4999 degrees of freedom
## Residual deviance: 809.0 on 4996 degrees of freedom
## AIC: 817
##
## Number of Fisher Scoring iterations: 8
dummy_pred<-predict(dummy.fit, test)
dummy_prob<-rep("No", length(dummy_pred))
dummy_prob[dummy_pred > 0.5] = "Yes"
table(dummy_prob, test$default)
##
## dummy_prob No Yes
## No 4829 121
## Yes 8 42
mean(dummy_prob != test$default)
## [1] 0.0258
Adding the student variable doesn't meaningfully change the test error rate (0.0258 vs. 0.026). In this fit the student coefficient is only marginally significant, and income loses its significance entirely.
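A quick AIC comparison of the two training fits tells the same story; a minimal sketch:
# both models were fit on the same training set, so their AICs are comparable
AIC(cross.fit, dummy.fit)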
set.seed(1)
def_log<-glm(default ~ income + balance, data = Default, family = 'binomial')
summary(def_log)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
boot_fun<- function(data = Default, index)
{fit <- glm(default ~ income + balance, data = Default, family = "binomial", subset = index)
return (coef(fit))}
boot(data = Default, boot_fun, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot_fun, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -3.945460e-02 4.344722e-01
## t2* 2.080898e-05 1.680317e-07 4.866284e-06
## t3* 5.647103e-03 1.855765e-05 2.298949e-04
The bootstrap standard errors are very close to the formula-based standard errors from glm(), which supports relying on the model's standard errors here.
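For a direct side-by-side, the formula-based standard errors can be pulled out of the glm summary. A minimal sketch:
# glm() standard errors, to compare against the bootstrap std. error column above
summary(def_log)$coefficients[, "Std. Error"]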
# Weekly data (ISLR2); attach() is unnecessary since data = Weekly is passed explicitly
week_log <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = "binomial")
Next, fit a logistic regression model predicting Direction from Lag1 and Lag2 using all but the first observation, then use that model to predict the first observation's direction: predict "Up" if P(Direction = "Up" | Lag1, Lag2) > 0.5. Was this observation correctly classified?
set.seed(1)
sub_log<-glm(Direction ~ Lag1 + Lag2, data = Weekly[-1,], family = "binomial")
pred_sub<-predict(sub_log, Weekly[1,], type = "response") > 0.5
print(pred_sub)
## 1
## TRUE
print(Weekly[1,])
## Year Lag1 Lag2 Lag3 Lag4 Lag5 Volume Today Direction
## 1 1990 0.816 1.572 -3.936 -0.229 -3.484 0.154976 -0.27 Down
The model predicted the market would go up, but the actual direction was down, so the first observation was misclassified.
# LOOCV by hand: refit the model n times, leaving out one observation each time
n <- nrow(Weekly)
err_total <- rep(0, n)
for (i in 1:n) {
  ith_fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i, ], family = binomial)
  pred_ith <- predict(ith_fit, Weekly[i, ], type = "response") > 0.5
  tru_ith <- Weekly[i, ]$Direction == "Up"
  if (pred_ith != tru_ith) err_total[i] <- 1
}
sum(err_total)
## [1] 490
mean(err_total)
## [1] 0.4499541
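For reference, boot::cv.glm can reproduce this LOOCV misclassification rate if we supply a 0/1 cost function (its default cost is mean squared error). A minimal sketch, assuming the same 0.5 threshold; cost_fn and full_fit are names introduced here:
# cost() receives the observed 0/1 response and the held-out predicted
# probabilities; this counts a miss whenever they sit on opposite sides of 0.5
cost_fn <- function(y, prob) mean(abs(y - prob) > 0.5)
full_fit <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = binomial)
cv.glm(Weekly, full_fit, cost = cost_fn)$delta[1]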
set.seed(1)
x <- rnorm(100)
y <- x - 2 * x^2 + rnorm(100)
data <- data.frame(x, y)
In this data set, what is n and what is p? Write out the model used to generate the data in equation form. Here n = 100 and p = 2 (the predictors x and x^2), and the generating model is y = x - 2x^2 + ε.
plot(x, y)
The relationship looks curvilinear, suggesting a quadratic relationship between x and y.
set.seed(1)
# linear fit
fit1 <- glm(y ~ x, data = data)
cv.glm(data, fit1)$delta[1]
## [1] 7.288162
# quadratic fit
fit2 <- glm(y ~ poly(x, 2), data = data)
cv.glm(data, fit2)$delta[1]
## [1] 0.9374236
# cubic fit
fit3 <- glm(y ~ poly(x, 3), data = data)
cv.glm(data, fit3)$delta[1]
## [1] 0.9566218
# quartic fit
fit4 <- glm(y ~ poly(x, 4), data = data)
cv.glm(data, fit4)$delta[1]
## [1] 0.9539049
set.seed(42)
cv.error <- rep(0, 4)
for (i in 1:4) {
  new1.fit <- glm(y ~ poly(x, i), data = data)
  cv.error[i] <- cv.glm(data, new1.fit)$delta[1]
}
cv.error
## [1] 7.2881616 0.9374236 0.9566218 0.9539049
The quadratic fit had the lowest LOOCV error, as expected given the quadratic relationship between x and y; the cubic and quartic terms add no real improvement.
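LOOCV is feasible here, but k-fold cross-validation is the cheaper default for larger problems. A sketch using 10 folds; unlike LOOCV, the result depends on the fold assignment and hence on the seed:
# 10-fold CV for the same four polynomial degrees
set.seed(42)
cv10 <- sapply(1:4, function(i) {
  fit <- glm(y ~ poly(x, i), data = data)
  cv.glm(data, fit, K = 10)$delta[1]
})
cv10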
summary(new1.fit) # the last fit from the loop, i.e. the quartic (degree 4) model
##
## Call:
## glm(formula = y ~ poly(x, i), data = data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.55002 0.09591 -16.162 < 2e-16 ***
## poly(x, i)1 6.18883 0.95905 6.453 4.59e-09 ***
## poly(x, i)2 -23.94830 0.95905 -24.971 < 2e-16 ***
## poly(x, i)3 0.26411 0.95905 0.275 0.784
## poly(x, i)4 1.25710 0.95905 1.311 0.193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.9197797)
##
## Null deviance: 700.852 on 99 degrees of freedom
## Residual deviance: 87.379 on 95 degrees of freedom
## AIC: 282.3
##
## Number of Fisher Scoring iterations: 2
Consistent with the cross-validation results, only the linear and quadratic terms are statistically significant.
mu_hat<-mean(Boston$medv)
print(mu_hat)
## [1] 22.53281
stan_err <- sqrt(var(Boston$medv)/nrow(Boston))
print(stan_err)
## [1] 0.4088611
set.seed(42)
boot.fn<-function(data, indices){
resampled<-data[indices]
result<-mean(resampled)
return(result)
}
result<-boot(Boston$medv, boot.fn, 1000)
print(result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 22.53281 0.02671186 0.4009216
Using the bootstrap, the estimated standard error is about 0.401, very close to the formula-based estimate of 0.409.
t.test(Boston$medv)
##
## One Sample t-test
##
## data: Boston$medv
## t = 55.111, df = 505, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 21.72953 23.33608
## sample estimates:
## mean of x
## 22.53281
low <- mu_hat - (2* stan_err)
high<- mu_hat + (2*stan_err)
boot_con<-c(low, high)
boot_con
## [1] 21.71508 23.35053
Again, this interval is very close to the t-test's 95% confidence interval.
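boot.ci() from the boot package computes bootstrap confidence intervals directly from the boot object; a minimal sketch using the normal approximation on the mean bootstrap from above:
# normal-approximation bootstrap CI for the mean of medv
boot.ci(result, type = "norm")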
med<-median(Boston$medv)
print(med)
## [1] 21.2
set.seed(42)
boot.fn<-function(data, indices){
resampled<-data[indices]
result<-median(resampled)
return(result)
}
result<-boot(Boston$medv, boot.fn, 1000)
print(result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 21.2 0.0106 0.3661785
The bootstrap point estimate matches the sample median exactly (21.2), and the bootstrap adds a standard-error estimate (about 0.37) that the plain sample median does not provide.
mu_hat1<-quantile(Boston$medv, c(0.1))
print(mu_hat1)
## 10%
## 12.75
set.seed(42)
boot.fn<-function(data, indices){
resampled<-data[indices]
result<-quantile(Boston$medv, c(0.1))
return(result)
}
result<-boot(Boston$medv, boot.fn, 1000)
print(result)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Boston$medv, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 12.75 0 0