Homework 4

Assignment 4.

Starting Problem 1.

Load necessary packages and data.

library(dplyr)

## 
## 載入套件：'dplyr'

## 下列物件被遮斷自 'package:stats':
## 
##     filter, lag

## 下列物件被遮斷自 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyr)
library(caret)

## 載入需要的套件：lattice

library(skimr)
library(psych)

## 
## 載入套件：'psych'

## 下列物件被遮斷自 'package:ggplot2':
## 
##     %+%, alpha

library(e1071)

pokemon <- read.csv(
  "https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1"
  )

Subsetting and renaming data.

pokemon_sub <- pokemon[ ,
  c("is_legendary", 
  "hp",
  "weight_kg", 
  "height_m", 
  "sp_attack", 
  "sp_defense", 
  "type1", 
  "capture_rate") 
  ]

pokemon_sub <- pokemon_sub %>%
  rename(type = type1)
table(pokemon_sub$type) ## 18 types.

## 
##      bug     dark   dragon electric    fairy fighting     fire   flying 
##       72       29       27       39       18       28       52        3 
##    ghost    grass   ground      ice   normal   poison  psychic     rock 
##       27       78       32       23      105       32       53       45 
##    steel    water 
##       24      114

Converting NAs and variable classes.

pokemon_sub$weight_kg[is.na(pokemon_sub$weight_kg)] <- 0
pokemon_sub$height_m[is.na(pokemon_sub$height_m)] <- 0

class(pokemon_sub$capture_rate) ## Character class, means something's off.

## [1] "character"

pokemon_sub$is_legendary <- as.factor(pokemon_sub$is_legendary)

If we convert “capture_rate” to factor class, there would be a new NA value. We first deal with this problem.

poke <- pokemon_sub ## Temporarily assign a new df to solve the problem. 
poke$capture_rate <- as.numeric(poke$capture_rate) ## 強制變更過程中產生了 NA

## Warning: 強制變更過程中產生了 NA

sum(is.na(poke$capture_rate)) ## 1 NA in this column.

## [1] 1

which(is.na(poke$capture_rate), arr.ind = T) ## The 774th row is the NA.

## [1] 774

rm(poke) ## Remove the temporarily created data frame. 

## Now we go back to the original data and see what's in the 774th row.
pokemon[774, c("japanese_name", "capture_rate")] ## Metenoメテノ, 30 (Meteorite)255 (Core)

##     japanese_name             capture_rate
## 774  Metenoメテノ 30 (Meteorite)255 (Core)

pokemon[774, c("capture_rate")] <- 30 ## Keep its capture rate to only 30. 
pokemon[774, c("capture_rate")] ## Now it has only one capture rate: 30.

## [1] "30"

## Then we can convert this column to factor class. 
pokemon_sub$capture_rate <- as.numeric(pokemon_sub$capture_rate)

## Warning: 強制變更過程中產生了 NA

One-hot-encoding.

## Assigning is.legendary to y. 
y <- pokemon_sub$is_legendary
## One-hot-encoding. First convert is.legendary to numeric class. 
pokemon_sub$is_legendary <- as.numeric(pokemon_sub$is_legendary)

pokemon_ohe_model <- dummyVars(type ~ ., data = pokemon_sub) ## Creating dummies. 
data_mat <- predict(pokemon_ohe_model, newdata = pokemon_sub) ## OHE. 
pokemon_ohe <- data.frame(data_mat)

## Add y back in. 
pokemon_ohe$is_legendary <- y

Only 2 more features were created: is_legendary.0 and is_legendary.1.

dim(pokemon_ohe)

## [1] 801   7

colnames(pokemon_ohe)

## [1] "is_legendary" "hp"           "weight_kg"    "height_m"     "sp_attack"   
## [6] "sp_defense"   "capture_rate"

75-25 train-test split.

set.seed(123)
sample <- createDataPartition(pokemon_ohe$is_legendary, p = .75, list = F)
train_pokemon_ohe <- pokemon_ohe[sample, ]
test_pokemon_ohe <- pokemon_ohe[-sample, ]

Train an SVM using train data, then predict the outcome on test data and create a confusion matrix.

## SVM on original data. 
svm.fit.ohe <- svm(
  is_legendary ~ .,
  data = train_pokemon_ohe
  )

predict_train_svm <- predict(svm.fit.ohe, test_pokemon_ohe)
confmat_train_svm <- table(
  Predicted = predict_train_svm, 
  Actual = test_pokemon_ohe$is_legendary
  )

## Take a look at the confusion matrix. 
confmat_train_svm

##          Actual
## Predicted   0   1
##         0 181  10
##         1   1   7

## Calculate prediction accuracy. We get 94.47236% accuracy. 
(confmat_train_svm[1, 1] + confmat_train_svm[2, 2]) / sum(confmat_train_svm) *100

## [1] 94.47236

Reload libraries and data for next part.

pokemon <- read.csv(
  "https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1"
  )

Subset data, convert capture_rate to numeric class and is_legendary to factor class.

pokemon_sub_2 <- pokemon[ ,
  c("is_legendary", 
  "hp",
  "weight_kg", 
  "height_m", 
  "sp_attack", 
  "sp_defense", 
  "type1", 
  "capture_rate") 
  ]

pokemon_sub_2 <-pokemon_sub_2 %>%
  rename(type = type1)

pokemon_sub_2$capture_rate <- as.numeric(pokemon_sub_2$capture_rate)

## Warning: 強制變更過程中產生了 NA

pokemon_sub_2$is_legendary <- as.factor(pokemon_sub_2$is_legendary)

Check for NAs using colSums(is.na()).

# 1 NA in capture_rate, 20 in both height_m and weight_kg.
colSums(is.na(pokemon_sub_2))

## is_legendary           hp    weight_kg     height_m    sp_attack   sp_defense 
##            0            0           20           20            0            0 
##         type capture_rate 
##            0            1

Using KNN to impute NAs.

library(RANN)

knn_na <- preProcess(as.data.frame(pokemon_sub_2), method = "knnImpute")
pokemon_knn <- predict(knn_na, newdata = pokemon_sub_2)

## Check for NAs again. 
colSums(is.na(pokemon_knn)) ## Not anymore.

## is_legendary           hp    weight_kg     height_m    sp_attack   sp_defense 
##            0            0            0            0            0            0 
##         type capture_rate 
##            0            0

Repeat OHE, then fit another SVM model.

y <- pokemon_knn$is_legendary

pokemon_knn$is_legendary <- as.numeric(pokemon_knn$is_legendary)
pokemon_ohe_model_2 <- dummyVars(type ~ ., data = pokemon_knn) ## Creating dummies. 
data_mat <- predict(pokemon_ohe_model, newdata = pokemon_knn) ## OHE. 
pokemon_knn <- data.frame(data_mat)

pokemon_knn$is_legendary <- y

sample <- createDataPartition(pokemon_knn$is_legendary, p = 0.75, list = F)
train_pokemon_knn <- pokemon_knn[sample, ]
test_pokemon_knn <- pokemon_knn[-sample, ]

svm.fit.knn <- svm(
  is_legendary ~ .,
  data = train_pokemon_knn
)

predict_train_knn <- predict(svm.fit.knn, newdata = test_pokemon_knn)
confmat_train_knn <- table(
  Predicted = predict_train_knn,
  Actual = test_pokemon_knn$is_legendary
)

## Take a look at the KNN confusion matrix. 
confmat_train_knn

##          Actual
## Predicted   0   1
##         0 181   7
##         1   1  10

## Calculate prediction accuracy. We got 93.96985% accuracy. 
(confmat_train_knn[1, 1] + confmat_train_knn[2, 2]) / sum(confmat_train_knn) *100

## [1] 95.9799

The KNN imputed method DOES NOT have better prediction accuracy. OHE method has 95.9799% accuacy, while KNN method has 93.96985%.

Starting Problem 2.

Load necessary packages and data.

library(gbm)

## Loaded gbm 2.2.2

## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## 載入套件：'pROC'

## 下列物件被遮斷自 'package:stats':
## 
##     cov, smooth, var

library(plotROC)

## 
## 載入套件：'plotROC'

## 下列物件被遮斷自 'package:pROC':
## 
##     ggroc

library(ISLR)

default <- as.data.frame(Default)

Train-test split.

set.seed(123)
sample <- createDataPartition(default$default, p = 0.75, list = F)
train_default <- default[sample, ]
test_default <- default[-sample, ]

Models fitting: logit, probit, SVM, GBM.

## Logit fitting. 
logit.fit.default <- glm(
  default ~ ., 
  family = binomial(link = "logit"), 
  data = train_default
  )

## Probit fitting. 
probit.fit.default <- glm(
  default ~ ., 
  family = binomial(link = "probit"), 
  data = train_default
)

## SVM fitting. 
svm.fit.default <- svm(
  default ~ .,
  data = train_default
)

## Hyperparameters tuning first. 
hyper_grid <- expand.grid(
  shrinkage = c(0.01, 0.1, 0.2),
  interaction.depth = c(1, 3, 5),
  n.minobsinnode = c(5, 10, 15),
  bag.fraction = c(0.5, 0.75, 1),
  optimal_trees = 0,
  min_RMSE = 0
  )

nrow(hyper_grid) ## 81 unique value combinations.

## [1] 81

## Converting values. 
default$default <- ifelse(default$default == "Yes", 1, 0)

## GBM fitting. 
random_index <- sample(1:nrow(default), nrow(train_default))
random_train <- default[random_index, ]

for(i in 1:nrow(hyper_grid)) {
  gbm.fit.default <- gbm(
    default ~ .,
    data = random_train,
    distribution = "bernoulli",
    cv.folds = 5,
    interaction.depth = hyper_grid$interaction.depth[i], 
    shrinkage = hyper_grid$shrinkage[i],
    n.trees = 500,
    n.minobsinnode = hyper_grid$n.minobsinnode[i], 
    bag.fraction = hyper_grid$bag.fraction[i],
    train.fraction = 0.75,
    n.cores = NULL,
    verbose = F
  )
  
  hyper_grid$optimal_trees[i] <- which.min(gbm.fit.default$valid.error)
  hyper_grid$min_RMSE[i] <- sqrt(min(gbm.fit.default$valid.error))
}

## Optimal hyperparameter from GBM. 
optimal_para <- hyper_grid %>%
  dplyr::arrange(min_RMSE) %>%
  head(1) 

## Now fit the optimal GBM using the optimal hyperparameters.
gbm.tuned.default <- gbm(
  default ~ .,
  data = random_train,
  distribution = "bernoulli",
  cv.folds = 5,
  interaction.depth = optimal_para$interaction.depth, 
  shrinkage = optimal_para$shrinkage,
  n.trees = 500,
  n.minobsinnode = optimal_para$n.minobsinnode, 
  bag.fraction = optimal_para$bag.fraction,
  train.fraction = 0.75,
  n.cores = NULL,
  verbose = F
)

Making predictions on all 4 models.

## Logit prediction, calculate prediction accuracy. 
logit.pred <- predict(logit.fit.default, test_default, type = "response")
logit.pred.class <- as.numeric(ifelse(logit.pred > 0.5, 1, 0))
confmat.logit <- table(
  Predicted = logit.pred.class,
  Actual = test_default$default
)
(confmat.logit[1, 1] + confmat.logit[2, 2]) / sum(confmat.logit)

## [1] 0.9711885

## Probit prediction, calculate prediction accuracy. 
probit.pred <- predict(probit.fit.default, test_default, type = "response")
probit.pred.class <- as.numeric(ifelse(probit.pred > 0.5, 1, 0))
confmat.probit <- table(
  Predicted = probit.pred.class,
  Actual = test_default$default
)
(confmat.probit[1, 1] + confmat.probit[2, 2]) / sum(confmat.probit)

## [1] 0.9707883

## SVM prediction, calculate prediction accuracy.
svm.pred <- predict(svm.fit.default, test_default, type = "response")
svm.pred.class <- as.numeric(ifelse(svm.pred == "Yes", 1, 0))
confmat.svm <- table(
  Predicted = svm.pred,
  Actual = test_default$default
)
(confmat.svm[1, 1] + confmat.svm[2, 2]) / sum(confmat.svm)

## [1] 0.9707883

## GBM prediction, calculate prediction accuracy. 
gbm.pred <- predict(gbm.tuned.default, test_default, type = "response")

## Using 193 trees...

gbm.pred.class <- as.numeric(ifelse(gbm.pred > 0.5, 1, 0))
confmat.gbm <- table(
  Predicted = gbm.pred.class,
  Actual = test_default$default
)
(confmat.gbm[1, 1] + confmat.gbm[2, 2]) / sum(confmat.gbm)

## [1] 0.9743898

Generate ROC.

train_default$default <- as.factor(train_default$default)
test_default$default <- as.factor(test_default$default)

## Assign values to roc() to compute classification accuracy. 
logit.roc <- roc(test_default$default, logit.pred.class)

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

probit.roc <- roc(test_default$default, probit.pred.class)

## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

svm.roc <- roc(test_default$default, svm.pred.class)

## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

gbm.roc <- roc(test_default$default, gbm.pred.class)

## Setting levels: control = No, case = Yes
## Setting direction: controls < cases

Calculate AUC for all four models.

auc(logit.roc) ## 0.6477.

## Area under the curve: 0.6477

auc(probit.roc) ## 0.6184.

## Area under the curve: 0.6184

auc(svm.roc) ## 0.601.

## Area under the curve: 0.601

auc(gbm.roc) ## 0.7075.

## Area under the curve: 0.7075

## The GBM has the largest AUC.

Plot all ROC. The closer the line to the upper left corner, the better the performance. From visual inspection, GBM (the last one) seems to have the best performance.

plot.roc(logit.roc)

plot.roc(probit.roc)

plot.roc(svm.roc)

plot.roc(gbm.roc)

## Starting Problem 3. Load and initiate h2o, convert data to h2o format.

library(h2o)

## 
## ----------------------------------------------------------------------
## 
## Your next step is to start H2O:
##     > h2o.init()
## 
## For H2O package documentation, ask for help:
##     > ??h2o
## 
## After starting H2O, you can use the Web UI at http://localhost:54321
## For more information visit https://docs.h2o.ai
## 
## ----------------------------------------------------------------------

## 
## 載入套件：'h2o'

## 下列物件被遮斷自 'package:pROC':
## 
##     var

## 下列物件被遮斷自 'package:stats':
## 
##     cor, sd, var

## 下列物件被遮斷自 'package:base':
## 
##     %*%, %in%, &&, ||, apply, as.factor, as.numeric, colnames,
##     colnames<-, ifelse, is.character, is.factor, is.numeric, log,
##     log10, log1p, log2, round, signif, trunc

h2o.init() ## Initiate h2o.

## 
## H2O is not running yet, starting it now...

## Warning in .h2o.startJar(ip = ip, port = port, name = name, nthreads = nthreads, : You have a 32-bit version of Java. H2O works best with 64-bit Java.
## Please download the latest Java SE JDK from the following URL:
## http://docs.h2o.ai/h2o/latest-stable/h2o-docs/welcome.html#java-requirements

## 
## Note:  In case of errors look at the following log files:
##     C:\Users\JACKCH~1\AppData\Local\Temp\RtmpQ5wk06\file69c06e743366/h2o_Jack_Chen_started_from_r.out
##     C:\Users\JACKCH~1\AppData\Local\Temp\RtmpQ5wk06\file69c027cb6e1f/h2o_Jack_Chen_started_from_r.err
## 
## 
## Starting H2O JVM and connecting:  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         3 seconds 957 milliseconds 
##     H2O cluster timezone:       Asia/Taipei 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.44.0.3 
##     H2O cluster version age:    1 year, 5 months and 7 days 
##     H2O cluster name:           H2O_started_from_R_Jack_Chen_qlm670 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   0.96 GB 
##     H2O cluster total cores:    12 
##     H2O cluster allowed cores:  12 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     R Version:                  R version 4.3.3 (2024-02-29 ucrt)

## Warning in h2o.clusterInfo(): 
## Your H2O cluster version is (1 year, 5 months and 7 days) old. There may be a newer version available.
## Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html

train_default_h2o <- as.h2o(train_default)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

test_default_h2o <- as.h2o(test_default)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

Hard-voting fitting.

## Fitting the base models first. Define X and Y. 
Y_default <- "default"
X_default <- names(default)[-1]

## Base logit. 
base_logit <- h2o.glm(
  x = X_default, y = Y_default, train = train_default_h2o,
  family = "binomial",
  nfold = 5,
  keep_cross_validation_predictions = T,
  auc_type = "AUTO",
  seed = 123
)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Base Naive Bayes. 
base_nb <- h2o.naiveBayes(
  x = X_default, y = Y_default, train = train_default_h2o,
  nfold = 5,
  keep_cross_validation_predictions = T,
  auc_type = "AUTO",
  seed = 123
)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Base GBM. 
base_gbm <- h2o.gbm(
  x = X_default, y = Y_default, train = train_default_h2o,
  ntrees = 500,
  max_depth = optimal_para$interaction.depth, 
  learn_rate = optimal_para$shrinkage,
  min_rows = optimal_para$n.minobsinnode, 
  sample_rate = optimal_para$bag.fraction,
  nfold = 5,
  keep_cross_validation_predictions = T,
  auc_type = "AUTO",
  seed = 123
)

##   |                                                                              |                                                                      |   0%  |                                                                              |==                                                                    |   2%  |                                                                              |=================                                                     |  24%  |                                                                              |==========================================================            |  83%  |                                                                              |==================================================================    |  94%  |                                                                              |======================================================================| 100%

Hard-voting accuracy: 96.7587%; AUC: 0.7098.

## Hard voting.
## Predict using logit. 
base_logit.pred <- as.vector(
  h2o.predict(base_logit, test_default_h2o)$predict
  )

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Predict using Naive Bayes. 
base_nb.pred <- as.vector(
  h2o.predict(base_nb, test_default_h2o)$predict
  )

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Predict using GBM.
base_gbm.pred <- as.vector(
  h2o.predict(base_gbm, test_default_h2o)$predict
  )

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

predictions <- data.frame(base_logit.pred, base_gbm.pred, base_nb.pred)

## Define function for hard voting. 
hard_vote <- function(preds) {
  apply(preds, 1, function(row) {
    names(sort(table(row), decreasing = TRUE))[1]
  })
}

## Final voting. 
hv_prediction <- hard_vote(predictions)

## Convert to factor class with the right ordering. 
hv_prediction <- as.factor(hv_prediction)

confmat.hv <- table(Predicted = hv_prediction, Actual = test_default$default)
(confmat.hv[1, 1] + confmat.hv[2, 2]) / sum(confmat.hv) * 100 ## 96.7587%.

## [1] 96.7587

hv_prediction_class <- as.numeric(ifelse(hv_prediction == "Yes", 1, 0))
hv_roc <- roc(test_default$default, hv_prediction_class)

## Setting levels: control = No, case = Yes

## Setting direction: controls < cases

auc(hv_roc) ## AUC is 0.7098.

## Area under the curve: 0.7098

Soft-voting accuracy.

## Create a model list.
base_models <- c(
  base_logit@model_id, 
  base_nb@model_id, 
  base_gbm@model_id
  )

## Create a stacked ensemble. 
sv_ensemble <- h2o.stackedEnsemble(
  x = X_default, y = Y_default, training_frame = train_default_h2o,
  base_models = base_models
)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Predict using soft voting model. 
sv_pred <- h2o.predict(sv_ensemble, newdata = test_default_h2o)

##   |                                                                              |                                                                      |   0%  |                                                                              |======================================================================| 100%

## Define AUC function and get AUCs. 
get_auc <- function(model) {
  results <- h2o.performance(model, newdata = test_default_h2o)
  h2o.auc(results)
}

voting_perf <- list(base_logit, base_nb, base_gbm, sv_ensemble) %>%
  purrr::map_dbl(get_auc)

tab_voting_auc <- data.frame(AUC = voting_perf)
rownames(tab_voting_auc) <- c("Logit", "Naive Bayes", "GBM", "Soft Voting")
tab_voting_auc

##                   AUC
## Logit       0.9538942
## Naive Bayes 0.9483538
## GBM         0.9499746
## Soft Voting 0.9539441

## I somehow cannot use grid.arrange(), so I look at tab_voting_auc directly instead. 

## theme_1 <- ttheme_default()
## grid.arrange(tableGrob(tab_voting_auc, theme = theme_1))

h2o.shutdown(prompt = F)

Soft Voting has the best AUC: 0.9539441, showing that using a voting model does have some benefit. ## Starting Problem 4. Load necessary packages and data.

library(dplyr)
library(rsample)

## 
## 載入套件：'rsample'

## 下列物件被遮斷自 'package:e1071':
## 
##     permutations

library(boot)

## 
## 載入套件：'boot'

## 下列物件被遮斷自 'package:psych':
## 
##     logit

## 下列物件被遮斷自 'package:lattice':
## 
##     melanoma

library(purrr)

## 
## 載入套件：'purrr'

## 下列物件被遮斷自 'package:caret':
## 
##     lift

default <- as.data.frame(Default)

Estimate a logit model and perform a 10-fold CV.

## Define a function to calculate the k-fold CV MSE. 
kf_cv_error <- function(x) {
  set.seed(123)
  logit.fit.kf <- glm(
  default ~ student + balance + poly(income, x),
  family = binomial(link = "logit"),
  data = default
  )
  cv.glm(default, logit.fit.kf, K = 10)$delta[1]
}

1:10 %>% map_dbl(kf_cv_error) ## The first degree has the smallest MSE.

## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一
## Warning: glm.fit:擬合機率算出來是數值零或一

##  [1] 0.02140615 0.02140915 0.02144320 0.02142835 0.02144654 0.02143820
##  [7] 0.02144703 0.02147494 0.02147267 0.02148085

Bootstrapping.

logit.boot <- glm(
  default ~ student + balance + income,
  family = binomial(link = "logit"),
  data = default
)

boot_result <- function(data, index) {
  logit.boot <- glm(
  default ~ student + balance + income,
  family = binomial(link = "logit"),
  data = data,
  subset = index
  )
  coef(logit.boot)
}

## Use boot() to bootstrap. 
set.seed(123)
boot(default, boot_result, R = 1000)

## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = default, statistic = boot_result, R = 1000)
## 
## 
## Bootstrap Statistics :
##          original        bias     std. error
## t1* -1.086905e+01 -3.291558e-02 4.797385e-01
## t2* -6.467758e-01 -3.118653e-03 2.427747e-01
## t3*  5.736505e-03  1.681596e-05 2.294104e-04
## t4*  3.033450e-06  1.160860e-07 8.249317e-06

The t3* term (balance) has the largest bias. ## End of Homework 4.

Homework 4

Jack_uwu

2025-05-20

Assignment 4.

Starting Problem 1.

Starting Problem 2.