library(ISLR2)
library(ISLR)
##
## Attaching package: 'ISLR'
## The following objects are masked from 'package:ISLR2':
##
## Auto, Credit
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(boot)
data(Default)
model_lgr <- glm(default ~ income + balance, data = Default, family = 'binomial')
summary(model_lgr)
##
## Call:
## glm(formula = default ~ income + balance, family = "binomial",
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
set.seed(96)
n <- nrow(Default)
train <- sample(n, n/2)
training_set <- Default[train,]
validation_set <- Default[-train,]
### fitting lgr
model_vsa <- glm(default ~ income + balance, data = training_set, family = 'binomial')
probs <- predict(model_vsa, validation_set, type = "response")
predict_class <- ifelse(probs > 0.5, 1, 0)
true_classes <- ifelse(validation_set$default == 'Yes', 1, 0)
### validation set error rate
error_rate <- mean(predict_class != true_classes)
error_rate
## [1] 0.0272
validation_error_rate <- function(seed = 96){
set.seed(seed)
n <- nrow(Default)
train <- sample(n, n/2)
lg <- glm(default ~ income + balance, data = Default, family = 'binomial', subset = train)
probs <- predict(lg, Default[-train,], type = "response")
predict_class <- ifelse(probs > 0.5, 1, 0)
true_classes <- ifelse(Default$default[-train] == 'Yes', 1, 0)
error_rate <- mean(predict_class != true_classes) # Misclassification rate
return(error_rate)
}
seed = 96
for (i in 2:4) {
print(paste("Validation error rate at seed:", seed * i, "is", validation_error_rate(seed * i)))
}
## [1] "Validation error rate at seed: 192 is 0.027"
## [1] "Validation error rate at seed: 288 is 0.0264"
## [1] "Validation error rate at seed: 384 is 0.026"
### including student variable
validation_error_rate_with_student <- function(seed = 96){
set.seed(seed)
n <- nrow(Default)
train <- sample(n, n/2)
# Fit logistic regression with student variable
lg <- glm(default ~ income + balance + student, data = Default, family = 'binomial', subset = train) ## added student variable
probs <- predict(lg, Default[-train,], type = "response") # Correct subsetting for validation set
predict_class <- ifelse(probs > 0.5, 1, 0)
true_classes <- ifelse(Default$default[-train] == 'Yes', 1, 0)
error_rate <- mean(predict_class != true_classes) # Misclassification rate
return(error_rate)
}
seed = 96
for (i in 2:4) {
print(paste("Validation error rate (for model including dummy variable student) at seed", seed * i, "is", validation_error_rate_with_student(seed * i)))
}
## [1] "Validation error rate (for model including dummy variable student) at seed 192 is 0.0272"
## [1] "Validation error rate (for model including dummy variable student) at seed 288 is 0.0262"
## [1] "Validation error rate (for model including dummy variable student) at seed 384 is 0.0256"
set.seed(96)
# Fit the logistic regression model
glm_fit <- glm(default ~ income + balance, data = Default, family = binomial)
# Display model summary to get standard errors
summary(glm_fit)
##
## Call:
## glm(formula = default ~ income + balance, family = binomial,
## data = Default)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.154e+01 4.348e-01 -26.545 < 2e-16 ***
## income 2.081e-05 4.985e-06 4.174 2.99e-05 ***
## balance 5.647e-03 2.274e-04 24.836 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2920.6 on 9999 degrees of freedom
## Residual deviance: 1579.0 on 9997 degrees of freedom
## AIC: 1585
##
## Number of Fisher Scoring iterations: 8
income
in model
glm
is 4.985e-06 and for coeff. of balance
is
2.274e-04boot.fn <- function(data, index) {
# Fit logistic regression model on bootstrapped sample
boot_model <- glm(default ~ income + balance, data = data, family = binomial, subset = index)
# Return coefficients of interest (income and balance)
return(coef(boot_model)[2:3])
}
# Perform bootstrap with 1000 resamples
set.seed(96) # Ensuring reproducibility
boot_results <- boot(Default, boot.fn, R = 1000)
# Display estimated standard errors from bootstrap
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = Default, statistic = boot.fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 2.080898e-05 -1.770689e-07 4.838230e-06
## t2* 5.647103e-03 1.103567e-05 2.309823e-04
boot_se <- apply(boot_results$t, 2, sd)
boot_se
## [1] 4.838230e-06 2.309823e-04
income
as 4.838230e-06 and for coeff. of
balance
as 2.309823e-04Coefficient | SE (glm) | SE (Bootstrap) |
---|---|---|
Income | 4.985e-06 | 4.838230e-06 |
Balance | 2.274e-04 | 2.309823e-04 |
summary(Weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
# set.seed(96)
lm_full <- glm(Direction ~ Lag1 + Lag2, data = Weekly, family = 'binomial')
lm_1 <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-1,], family = 'binomial')
summary(lm_1)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2, family = "binomial", data = Weekly[-1,
## ])
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22324 0.06150 3.630 0.000283 ***
## Lag1 -0.03843 0.02622 -1.466 0.142683
## Lag2 0.06085 0.02656 2.291 0.021971 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1494.6 on 1087 degrees of freedom
## Residual deviance: 1486.5 on 1085 degrees of freedom
## AIC: 1492.5
##
## Number of Fisher Scoring iterations: 4
prob <- predict(lm_1, Weekly[1,], type = 'response')
pred_class <- ifelse(prob > 0.5, 'Up', 'Down')
print(paste('The Predicted Direction class of the first observation is:', pred_class, "::and the true class is:", Weekly[1, 'Direction']))
## [1] "The Predicted Direction class of the first observation is: Up ::and the true class is: Down"
n <- nrow(Weekly)
errors <- rep(0, n)
for(i in 1:n){
lm_i <- glm(Direction ~ Lag1 + Lag2, data = Weekly[-i,], family = 'binomial')
prob <- predict(lm_i, Weekly[i,], type = 'response')
pred_class <- ifelse(prob > 0.5, 'Up', 'Down')
true_class <- Weekly[i, 'Direction']
errors[i] <- ifelse(pred_class != true_class, 1, 0)
}
print(paste('Total errors are:', sum(errors), "out of", n))
## [1] "Total errors are: 490 out of 1089"
validation_error_rate <- sum(errors)/n
validation_error_rate
## [1] 0.4499541
data(Boston)
mu_hat <- mean(Boston$medv)
mu_hat
## [1] 22.53281
medv
(median value of
owner-occupied homes) is 22.53.n <- nrow(Boston)
mu_sd <- sd(Boston$medv)
mu_se <- mu_sd/sqrt(n)
mu_se
## [1] 0.4088611
# Define bootstrap function
boot_mean <- function(data, index) {
return(mean(data[index]))
}
# Perform bootstrap with 1000 resamples
set.seed(96)
boot_res <- boot(Boston$medv, boot_mean, R = 1000)
# Bootstrap SE
boot_se_mu <- sd(boot_res$t)
boot_se_mu
## [1] 0.3996253
# Approximate 95% Confidence Interval
ci_boot <- c(mu_hat - 2 * boot_se_mu, mu_hat + 2 * boot_se_mu)
ci_boot
## [1] 21.73356 23.33206
t.test(Boston$medv)$conf.int
## [1] 21.72953 23.33608
## attr(,"conf.level")
## [1] 0.95
mu_med <- median(Boston$medv)
mu_med
## [1] 21.2
boot_median <- function(data, index) {
return(median(data[index]))
}
set.seed(96)
boot_med_res <- boot(Boston$medv, boot_median, R = 1000)
boot_se_med <- sd(boot_med_res$t)
boot_se_med
## [1] 0.3649334
medv
.mu_0.1 <- quantile(Boston$medv, 0.1)
mu_0.1
## 10%
## 12.75
medv
is
12.75.boot_percentile <- function(data, index) {
return(quantile(data[index], 0.1))
}
set.seed(96)
boot_pctl_res <- boot(Boston$medv, boot_percentile, R = 1000)
boot_se_0.1 <- sd(boot_pctl_res$t)
boot_se_0.1
## [1] 0.4872043
The SE for the 10th percentile is 0.4872, which
is higher than that for the mean and median.
This suggests that the lower tail of the distribution has more variability, meaning housing values in the bottom 10% fluctuate more than central values.
Summary Table:
Estimate | Value | Bootstrap SE |
---|---|---|
Mean (µ ) |
22.53281 | 0.3996253 |
Median (µmed ) |
21.2 | 0.3649334 |
10th Percentile (µ0.1 ) |
12.75 | 0.4872043 |