Chapter 5: (5), (6), (7), and (9)
library(ISLR2)
## Warning: package 'ISLR2' was built under R version 4.3.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
library(boot)
data(Default)
default = as.data.frame(Default)
# Removing missing values
# weekly <- na.omit(weekly)
summary(default)
## default student balance income
## No :9667 No :7056 Min. : 0.0 Min. : 772
## Yes: 333 Yes:2944 1st Qu.: 481.7 1st Qu.:21340
## Median : 823.6 Median :34553
## Mean : 835.4 Mean :33517
## 3rd Qu.:1166.3 3rd Qu.:43808
## Max. :2654.3 Max. :73554
dim(default)
## [1] 10000 4
seednum <- 140
set.seed(seednum)
model = glm(default ~ income + balance, data = default, family = "binomial")
set.seed(seednum)
train <- sample(10000, 7000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")
pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")
val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 7000 | Validation Error: ",val_err))
## Train Size: 7000 | Validation Error: 0.024
set.seed(seednum)
train <- sample(10000, 3000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")
pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")
val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 3000 | Validation Error: ",val_err))
## Train Size: 3000 | Validation Error: 0.025
set.seed(seednum)
train <- sample(10000, 5000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")
pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")
val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 5000 | Validation Error: ",val_err))
## Train Size: 5000 | Validation Error: 0.0236
set.seed(seednum)
train <- sample(10000, 9000)
model <- glm(default ~ income + balance, data = default[train,], family = "binomial")
pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")
val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 5000 | Validation Error: ",val_err))
## Train Size: 5000 | Validation Error: 0.021
Lowest Validation error with Training-Test split of 9000-1000
set.seed(seednum)
train <- sample(10000, 9000)
model <- glm(default ~ income + balance + student, data = default[train,], family = "binomial")
pred <- predict(model, newdata = default[-train,], type = "response")
pred_class <- ifelse(pred > 0.5, "Yes", "No")
val_err <- mean(default[-train,]$default != pred_class)
cat(c("Train Size: 9000 | Validation Error: ",val_err))
## Train Size: 9000 | Validation Error: 0.02
The validation error was reduced substantially with the edition of student field.
set.seed(seednum)
model <- glm(default ~ income + balance, data = default, family = "binomial")
summary(model)$coefficients[,c("Estimate", "Std. Error")]
## Estimate Std. Error
## (Intercept) -1.154047e+01 4.347564e-01
## income 2.080898e-05 4.985167e-06
## balance 5.647103e-03 2.273731e-04
set.seed(seednum)
boot_fn <- function(data, index) {
modelfn <- glm(default ~ income + balance, data = default[index,], family = "binomial")
return (coef(modelfn))
}
set.seed(seednum)
boot_fn <- function(data, index) {
modelfn <- glm(default ~ income + balance, data = default[index,], family = "binomial")
return (coef(modelfn))
}
result1 <- boot(data = default, boot_fn, R=1000)
result1
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = default, statistic = boot_fn, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.154047e+01 -6.371372e-02 4.433311e-01
## t2* 2.080898e-05 1.166079e-07 4.675269e-06
## t3* 5.647103e-03 3.500014e-05 2.324025e-04
data(Weekly)
weekly = as.data.frame(Weekly)
# Removing missing values
# weekly <- na.omit(weekly)
summary(weekly)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume Today
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747 Min. :-18.1950
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202 1st Qu.: -1.1540
## Median : 0.2380 Median : 0.2340 Median :1.00268 Median : 0.2410
## Mean : 0.1458 Mean : 0.1399 Mean :1.57462 Mean : 0.1499
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373 3rd Qu.: 1.4050
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821 Max. : 12.0260
## Direction
## Down:484
## Up :605
##
##
##
##
dim(weekly)
## [1] 1089 9
set.seed(seednum)
model <- glm(Direction ~ Lag1 + Lag2, data = weekly, family = "binomial")
set.seed(seednum)
dim(weekly[-1,])
## [1] 1088 9
model <- glm(Direction ~ Lag1 + Lag2, data = weekly[-1,], family = "binomial")
set.seed(seednum)
model <- glm(Direction ~ Lag1 + Lag2, data = weekly[-1,], family = "binomial")
pred <- predict(model, newdata = weekly[1,], type = "response")
row <- weekly[1,]
pred_class <- ifelse(pred > 0.5, "Up", "Down")
cat(c("Pred: ",pred_class, " | True Val: ","Down"))
## Pred: Up | True Val: Down
The prediction is incorrect.
n <- nrow(weekly)
loocv_err <- numeric(n)
for (i in 1:n) {
set.seed(seednum)
model <- glm(Direction ~ Lag1 + Lag2, data = weekly[-i,], family = "binomial")
pred <- predict(model, newdata = weekly[-i,], type = "response")
pred_class <- ifelse(pred > 0.5, "Up", "Down")
loocv_err <- mean(weekly$Direction[i] != pred_class)
}
loocv_err_mean <- mean(loocv_err)
loocv_err_mean
## [1] 0.07169118
data(Boston)
boston = as.data.frame(Boston)
summary(boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio lstat
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 1.73
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.: 6.95
## Median : 5.000 Median :330.0 Median :19.05 Median :11.36
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :12.65
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:16.95
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
dim(boston)
## [1] 506 13
# MU Hat
mean(Boston$medv)
## [1] 22.53281
data_sd <- sd(boston$medv)
data_n <- length(boston$medv)
data_se <- data_sd / sqrt(data_n)
data_se
## [1] 0.4088611
dim(boston$medv)
## NULL
mu_hat <- function(data, indices) {
set.seed(seednum)
sample_data <- data[indices, ]
return(mean(sample_data$medv))
}
result <- boot(boston,statistic = mu_hat, R = 506)
se <- sd(result$t)
se
## [1] 0.4199194
The standard Error is higher for the bootstrap method.
med <- median(Boston$medv)
med
## [1] 21.2
dim(boston$medv)
## NULL
mu_hat <- function(data, indices) {
set.seed(seednum)
sample_data <- data[indices, ]
return(median(sample_data$medv))
}
result <- boot(boston,statistic = mu_hat, R = 506)
se <- sd(result$t)
se
## [1] 0.3914139
The value is closer to the se for mean but lower. A 95% confidence interval for median of medv is [20.8085861, 21.5914139]
mu0_1 <- quantile(boston$medv, 0.1)
mu0_1
## 10%
## 12.75
mu_hat <- function(data, indices) {
set.seed(seednum)
sample_data <- data[indices, ]
return(quantile(sample_data$medv, 0.1))
}
result <- boot(boston,statistic = mu_hat, R = 506)
se <- sd(result$t)
se
## [1] 0.5079199
The value is higher than other values. A 95% confidence interval for 10%percentile of medv is [12.2420801, 13.2579199]