Appendix - Code
> ## ----include=FALSE, echo=FALSE-------------------------------------------
> # Global knitr setup: attach knitr, then set the chunk defaults applied to
> # every later chunk (empty output prefix, console-style prompts, 4 x 4
> # figures displayed 400 wide).
> library(knitr)
> opts_chunk$set(
+   comment = "",
+   prompt = TRUE,
+   out.width = 400,
+   fig.height = 4,
+   fig.width = 4
+ )
> # Request the "eng" locale for all categories.
> # NOTE(review): locale names are OS-specific; "eng" presumably targets
> # Windows -- confirm.
> Sys.setlocale(category = "LC_ALL", locale = "eng")
> # NOTE(review): machine-specific absolute path; setwd() errors wherever this
> # directory does not exist.
> setwd(dir = "E:/Dropbox/00.2018/01.2018_1_semester/01.DataMining/04.HW/HW3")
>
> ## ----echo=FALSE----------------------------------------------------------
> # Attach ISLR and load its Default data set, then show the first six rows
> # as a captioned table and print the structure of the data frame.
> library(ISLR)
> data("Default")
> kable(x = head(Default, n = 6L), caption = "head(Default)")
> str(object = Default)
>
> ## ----echo=FALSE----------------------------------------------------------
> library(MVN)
> # Normality check for the two numeric predictors. Each variable is compared
> # with a normal distribution that has the variable's own mean and standard
> # deviation. Comparing the raw values with a standard-normal sample rejects
> # for any variable that is not on a N(0, 1) scale, and an unseeded rnorm()
> # draw makes the printed result change on every run.
> # NOTE(review): the parameters are estimated from the same data, so the
> # p-values are approximate; ks.test() also warns when the data contain ties.
> ks.test(
+   Default$balance, "pnorm",
+   mean = mean(Default$balance), sd = sd(Default$balance),
+   alternative = "two.sided"
+ )
> ks.test(
+   Default$income, "pnorm",
+   mean = mean(Default$income), sd = sd(Default$income),
+   alternative = "two.sided"
+ )
>
>
> ## ----echo=FALSE----------------------------------------------------------
>
> # Validation-set estimate of the LDA test error, repeated over 30 random
> # half/half splits (one split per seed).
> n <- dim(Default)[1]
>
> # Per-split 2 x 2 confusion tables (pred x true) and error rates.
> table1 <- array(NA, c(2, 2, 30))
> error.rate.of.lda.vs <- rep(NA, 30)
>
> for (i in 1:30) {
+   set.seed(i)
+   train <- sample(1:n, n / 2)
+   default.train <- Default[train, ]
+   default.test <- Default[-train, ]
+   # lda() is called through its namespace because MASS is only attached by
+   # a library() call much later in this file.
+   fit2 <- MASS::lda(default ~ balance + income, data = default.train)
+   pred.lda <- predict(fit2, default.test)$class
+   table1[, , i] <- table(pred = pred.lda, true = default.test[, 1])
+   error.rate.of.lda.vs[i] <- mean(pred.lda != default.test[, 1])
+ }
>
> ## ----echo=FALSE----------------------------------------------------------
> # Cell-by-cell mean of the 30 LDA confusion tables (rows = pred, cols = true).
> # The array to average is table1; the bare name `table` does not hold the
> # LDA results at this point in the script.
> # NOTE(review): %>% and kable_styling() are not provided by any package
> # attached in the visible code (presumably kableExtra) -- confirm it is loaded.
> tmp1 <- apply(table1, 1:2, mean)
> dimnames(tmp1) <- list(c("No", "Yes"), c("No", "Yes"))
> kable(tmp1, "html", digits = 1) %>%
+   kable_styling(full_width = FALSE)
>
> ## ------------------------------------------------------------------------
> # Mean validation-set error rate of LDA over the 30 splits.
> mean(error.rate.of.lda.vs)
>
>
> ## ----echo=FALSE----------------------------------------------------------
> # Error rate per split as a line, with the overall mean drawn as a
> # horizontal reference line. The y-axis spans 0.9 x min to 1.1 x max.
> plot(
+   x = error.rate.of.lda.vs,
+   type = "l",
+   ylim = range(error.rate.of.lda.vs) * c(0.9, 1.1),
+   xlab = "K",
+   ylab = "Error Rate"
+ )
> abline(h = mean(error.rate.of.lda.vs), col = 2, lty = 2)
>
> ## ----echo=FALSE----------------------------------------------------------
>
> # Validation-set estimate of the QDA test error over the same 30 seeded
> # half/half splits used for LDA.
> # Per-split 2 x 2 confusion tables (pred x true) and error rates.
> table2 <- array(NA, c(2, 2, 30))
> error.rate.of.qda.vs <- rep(NA, 30)
>
> for (i in 1:30) {
+   set.seed(i)
+   train <- sample(1:n, n / 2)
+   # qda() is called through its namespace because MASS is only attached by
+   # a library() call much later in this file.
+   fit2 <- MASS::qda(default ~ balance + income, data = Default, subset = train)
+   pred.qda <- predict(fit2, Default[-train, ])$class
+   table2[, , i] <- table(pred = pred.qda, true = Default[-train, 1])
+   error.rate.of.qda.vs[i] <- mean(pred.qda != Default[-train, 1])
+ }
>
> ## ----echo=FALSE----------------------------------------------------------
> # Cell-by-cell mean of the 30 QDA confusion tables (rows = pred, cols = true),
> # rendered as an HTML table rounded to one decimal place.
> tmp2 <- apply(table2, MARGIN = c(1, 2), FUN = mean)
> dimnames(tmp2) <- rep(list(c("No", "Yes")), 2)
> kable_styling(kable(tmp2, "html", digits = 1), full_width = FALSE)
>
> ## ------------------------------------------------------------------------
> # Mean validation-set error rate of QDA over the 30 splits.
> mean(error.rate.of.qda.vs)
>
> ## ----echo=FALSE----------------------------------------------------------
> # Error rate per split as a line, with the overall mean drawn as a
> # horizontal reference line. The y-axis spans 0.9 x min to 1.1 x max.
> plot(
+   x = error.rate.of.qda.vs,
+   type = "l",
+   ylim = range(error.rate.of.qda.vs) * c(0.9, 1.1),
+   xlab = "K",
+   ylab = "Error Rate"
+ )
> abline(h = mean(error.rate.of.qda.vs), col = 2, lty = 2)
>
> ## ------------------------------------------------------------------------
> # Leave-one-out CV for LDA. tmp.cv.lda is not created anywhere in the
> # visible code, so it is fitted here; with CV = TRUE, lda() returns the
> # leave-one-out posterior probabilities for every row.
> # NOTE(review): the predictors are assumed to match the other LDA fits
> # (balance + income) -- confirm.
> tmp.cv.lda <- MASS::lda(default ~ balance + income, data = Default, CV = TRUE)
> pprob <- tmp.cv.lda$posterior
>
> # Column holding the larger posterior in each row, mapped 1 -> "No" and
> # 2 -> "Yes"; the factor keeps both levels even if one is never predicted.
> Y.pred.loocv <- unname(apply(pprob, 1, which.max))
> Y.hat.loocv <- factor(c("No", "Yes")[Y.pred.loocv], levels = c("No", "Yes"))
>
> kable(table(pred = Y.hat.loocv, true = Default[, 1]), "html") %>%
+   kable_styling(full_width = FALSE)
>
> ## ------------------------------------------------------------------------
> # LOOCV misclassification rate of LDA.
> mean(Y.hat.loocv != Default[, 1])
>
> ## ----echo=FALSE----------------------------------------------------------
> # Leave-one-out CV for QDA. tmp.cv.qda is not created anywhere in the
> # visible code, so it is fitted here; with CV = TRUE, qda() returns the
> # leave-one-out posterior probabilities for every row.
> # NOTE(review): the predictors are assumed to match the other QDA fits
> # (balance + income) -- confirm.
> tmp.cv.qda <- MASS::qda(default ~ balance + income, data = Default, CV = TRUE)
> pprob.qda <- tmp.cv.qda$posterior
>
> # Column holding the larger posterior in each row, mapped 1 -> "No" and
> # 2 -> "Yes". As in the LDA version, the result is a two-level factor so
> # the confusion table keeps both rows even if one class is never predicted.
> Y.pred.loocv.qda <- unname(apply(pprob.qda, 1, which.max))
> Y.hat.loocv.qda <- factor(
+   c("No", "Yes")[Y.pred.loocv.qda],
+   levels = c("No", "Yes")
+ )
>
> kable(table(pred = Y.hat.loocv.qda, true = Default[, 1]), "html") %>%
+   kable_styling(full_width = FALSE)
>
> ## ------------------------------------------------------------------------
> # LOOCV misclassification rate of QDA.
> mean(Y.hat.loocv.qda != Default[, 1])
>
> ## ----echo=FALSE----------------------------------------------------------
> # K-fold CV for LDA.
> K <- 10
> # Balanced fold labels 1..K, then shuffled so that rows are assigned to
> # folds at random. The seed is a fixed constant: seeding from `i` would make
> # the folds depend on whatever loop happened to run last.
> ind <- (1:n) %% K + 1
> set.seed(1)
> folds <- sample(ind, n)
> predcv <- character(n)
> for (k in 1:K) {
+   # Fit on every fold except k and predict the held-out fold. The shuffled
+   # `folds` labels are used here; `ind` alone is a fixed, unshuffled pattern.
+   fit <- MASS::lda(
+     default ~ income + balance,
+     data = Default, subset = which(folds != k)
+   )
+   predcv[folds == k] <- as.character(predict(fit, Default[folds == k, ])$class)
+ }
> # Fix the prediction levels so the table keeps both rows even if one class
> # is never predicted.
> table.10fold.lda <- table(
+   pred = factor(predcv, levels = levels(Default[, 1])),
+   true = Default[, 1]
+ )
> kable(table.10fold.lda, "html") %>%
+   kable_styling(full_width = FALSE)
>
>
> ## ------------------------------------------------------------------------
> # 10-fold CV misclassification rate of LDA.
> error.rate.10fold <- mean(predcv != Default[, 1])
> error.rate.10fold
>
> ## ----echo=FALSE----------------------------------------------------------
> # K-fold CV for QDA.
> K <- 10
> # Balanced fold labels 1..K, then shuffled so that rows are assigned to
> # folds at random. The seed is a fixed constant: seeding from `i` would make
> # the folds depend on whatever loop happened to run last.
> ind <- (1:n) %% K + 1
> set.seed(1)
> folds <- sample(ind, n)
> predcv.qda <- character(n)
> for (k in 1:K) {
+   # This section reports QDA results, so the model must be qda(); fitting
+   # lda() here would simply repeat the LDA cross-validation.
+   fit <- MASS::qda(
+     default ~ income + balance,
+     data = Default, subset = which(folds != k)
+   )
+   predcv.qda[folds == k] <-
+     as.character(predict(fit, Default[folds == k, ])$class)
+ }
> # Fix the prediction levels so the table keeps both rows even if one class
> # is never predicted.
> table.10fold.qda <- table(
+   pred = factor(predcv.qda, levels = levels(Default[, 1])),
+   true = Default[, 1]
+ )
> kable(table.10fold.qda, "html") %>%
+   kable_styling(full_width = FALSE)
>
>
> ## ------------------------------------------------------------------------
> # 10-fold CV misclassification rate of QDA.
> error.rate.10fold.qda <- mean(predcv.qda != Default[, 1])
> error.rate.10fold.qda
>
> ## ----echo=FALSE, message=FALSE, warning=FALSE----------------------------
> library(MASS)
> library(naivebayes)
> library(e1071)
>
> # Validation-set estimate of the naive Bayes test error over 30 random
> # half/half splits (one split per seed).
> error.rate.nb.vs <- rep(NA, 30)
> n <- dim(Default)[1]
> # NOTE(review): this array reuses the name of base::table(); calls written
> # as table(...) still resolve to the function, but the name is easy to
> # confuse with it.
> table <- array(NA, c(2, 2, 30))
>
> for (i in 1:30) {
+   set.seed(i)
+   train <- sample(1:n, n / 2)
+   fit.nb.vs <- naiveBayes(default ~ ., data = Default[train, ])
+   pred.nb.vs <- predict(fit.nb.vs, newdata = Default[-train, ])
+   table[, , i] <- as.matrix(table(pred = pred.nb.vs, true = Default[-train, 1]))
+   # Score this split's own predictions; `pred1` is not defined in this file.
+   error.rate.nb.vs[i] <- mean(pred.nb.vs != Default[-train, ]$default)
+ }
>
>
> ## ----echo=FALSE----------------------------------------------------------
> # Cell-by-cell mean of the 30 naive Bayes confusion tables (rows = pred,
> # cols = true), rendered as an HTML table rounded to one decimal place.
> tmp <- apply(table, MARGIN = c(1, 2), FUN = mean)
> dimnames(tmp) <- rep(list(c("No", "Yes")), 2)
> kable_styling(kable(tmp, "html", digits = 1), full_width = FALSE)
>
> ## ------------------------------------------------------------------------
> # Mean validation-set error rate of naive Bayes over the 30 splits.
> mean(error.rate.nb.vs)
>
> ## ----echo=FALSE----------------------------------------------------------
> # Error rate per split as a line, with the overall mean drawn as a
> # horizontal reference line. The axis limits and the reference line are
> # computed from error.rate.nb.vs, the series being plotted; no object named
> # `error.rate` is defined in this file.
> plot(
+   error.rate.nb.vs, type = "l",
+   ylim = c(min(error.rate.nb.vs) * 0.9, max(error.rate.nb.vs) * 1.1),
+   ylab = "Error Rate",
+   xlab = "K"
+ )
> abline(h = mean(error.rate.nb.vs), lty = 2, col = 2)
>
> ## ----echo=FALSE, message=FALSE, warning=FALSE----------------------------
>
> # Leave-one-out CV for naive Bayes: refit without row i, predict row i, and
> # record both the predicted label and whether it was wrong.
> n <- nrow(Default)
> pred.nb.loocv <- rep(NA, times = n)
> error.loocv <- logical(n)
>
> for (i in seq_len(n)) {
+   fit.loocv <- naiveBayes(default ~ ., data = Default[-i, ])
+   pred2 <- predict(fit.loocv, newdata = Default[i, ])
+   error.loocv[i] <- (pred2 != Default[i, ]$default)
+   pred.nb.loocv[i] <- as.character(pred2)
+ }
>
>
> ## ----echo=FALSE----------------------------------------------------------
> # Confusion table of the LOOCV naive Bayes predictions against the truth.
> tab.loocv <- table(pred = pred.nb.loocv, true = Default[, 1])
> kable_styling(kable(tab.loocv, "html"), full_width = FALSE)
>
> ## ------------------------------------------------------------------------
>
> # LOOCV misclassification rate of naive Bayes.
> mean(error.loocv)
>
> ## ----echo=FALSE----------------------------------------------------------
>
>
> library(leaps)
>
> # 10-fold CV for naive Bayes. Each row receives a fold label drawn with
> # replacement and equal probability, so fold sizes are not forced to be equal.
> k <- 10
> n <- dim(Default)[1]
> error.kfold <- rep(NA, times = k)
> pred.10fold <- character(n)
>
> set.seed(1)
> folds <- sample(
+   seq_len(k),
+   size = nrow(Default),
+   replace = TRUE,
+   prob = rep(1 / k, times = k)
+ )
>
> for (i in seq_len(k)) {
+   # Train on every fold except i, then label the rows of fold i.
+   fit.10fold <- naive_bayes(default ~ ., data = Default[folds != i, ])
+   pred.10fold[folds == i] <- as.character(
+     predict(fit.10fold, newdata = Default[folds == i, ])
+   )
+ }
>
>
> ## ----echo=FALSE----------------------------------------------------------
> # Confusion table of the 10-fold naive Bayes predictions against the truth.
> tab1 <- table(pred = pred.10fold, true = Default[, 1])
> kable_styling(kable(tab1, "html"), full_width = FALSE)
>
> ## ------------------------------------------------------------------------
> # Row-wise misclassification indicator and its mean (the 10-fold error rate).
> error.kfold <- pred.10fold != Default[, 1]
> mean(error.kfold)
