library(dplyr)
library(ggplot2)
library(tidyr)
library(caret)
library(skimr)
library(psych)
library(e1071)
library(data.table)
library(Matrix)
library(keras)
poke <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
poke = tbl_df(poke)
# tibble is a kind of dataframe of which the format is more free and lazy. (week 10 rmd also rename classification. While it's not used here, I thus skip the process.)
classify_legendary = select(poke, is_legendary, hp, weight_kg, height_m, sp_attack, sp_defense, type1, capture_rate)
names(classify_legendary) ## rename No.7 column 'type1' <- 'type'
## [1] "is_legendary" "hp" "weight_kg" "height_m" "sp_attack"
## [6] "sp_defense" "type1" "capture_rate"
colnames(classify_legendary)[7] <- 'type'
names(classify_legendary) ## checked rename
## [1] "is_legendary" "hp" "weight_kg" "height_m" "sp_attack"
## [6] "sp_defense" "type" "capture_rate"
print(unique(classify_legendary$type)) ## check out how many unique “types” are in this variable
## [1] "grass" "fire" "water" "bug" "normal" "poison"
## [7] "electric" "ground" "fairy" "fighting" "psychic" "rock"
## [13] "ghost" "ice" "dragon" "dark" "steel" "flying"
classify_legendary$capture_rate <- as.numeric(classify_legendary$capture_rate)
classify_legendary$is_legendary <- as.factor(classify_legendary$is_legendary)
classify_legendary$weight_kg[is.na(classify_legendary$weight_kg)] <- 0
classify_legendary$height_m[is.na(classify_legendary$height_m)] <- 0
colSums(is.na(classify_legendary))
## is_legendary hp weight_kg height_m sp_attack sp_defense
## 0 0 0 0 0 0
## type capture_rate
## 0 1
which(is.na(classify_legendary$capture_rate) == TRUE)
## [1] 774
Because I found out 774th row capture_rate contained two values which
missed when I try to convert its class, I thus decide to address the
problem before next step.
## original data within 774th row capture_rate: 30 (Meteorite)255 (Core)
## According to instruction at the end of sheet, the first value is recommended to adopt.
classify_legendary[774, c("capture_rate")] <- 30
colSums(is.na(classify_legendary)) ## check na value
## is_legendary hp weight_kg height_m sp_attack sp_defense
## 0 0 0 0 0 0
## type capture_rate
## 0 0
dummies_model <- dummyVars(is_legendary ~ ., data = classify_legendary)
data_mat <- predict(dummies_model, newdata = classify_legendary) # one-hot-encode all categorical variables
classify_legendary_ohe <- data.frame(data_mat)
## concatenate
classify_legendary_ohe <- cbind(classify_legendary$is_legendary, classify_legendary_ohe)
colnames(classify_legendary_ohe)[1] <- "is_legendary"
count = 0
for (i in names(classify_legendary_ohe)){
exist = FALSE
for (j in names(classify_legendary)){
if (i == j){
exist = TRUE
}
}
if (exist == FALSE){
print(i)
count = count + 1
}
}
## [1] "typebug"
## [1] "typedark"
## [1] "typedragon"
## [1] "typeelectric"
## [1] "typefairy"
## [1] "typefighting"
## [1] "typefire"
## [1] "typeflying"
## [1] "typeghost"
## [1] "typegrass"
## [1] "typeground"
## [1] "typeice"
## [1] "typenormal"
## [1] "typepoison"
## [1] "typepsychic"
## [1] "typerock"
## [1] "typesteel"
## [1] "typewater"
print(paste0("new columns generated by one hot encoding: ", count))
## [1] "new columns generated by one hot encoding: 18"
set.seed(123)
train_row_numbers <- createDataPartition(classify_legendary_ohe$is_legendary, p=0.75, list=FALSE)
train <- classify_legendary_ohe[train_row_numbers, ]
test <- classify_legendary_ohe[-train_row_numbers, ]
svm.fit <- svm(is_legendary~.,
scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
kernel = "radial", # other options are available
degree = 3, # if kernel is of type = "polynomial"
# gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can provide one for variable x
cost = 1, # the cost function (C)
probability = FALSE, # whether to output probability predictions
na.action = na.omit,
data = train)
predict_test_svm <- predict(svm.fit, test)
confusion_svm <- table(Predicted = predict_test_svm,
Actual = test$is_legendary)
print(confusion_svm)
## Actual
## Predicted 0 1
## 0 181 16
## 1 1 1
print(paste0('acc score: ', (confusion_svm[1, 1] + confusion_svm[2, 2]) / sum(confusion_svm) * 100))
## [1] "acc score: 91.4572864321608"
poke_1 <- read.csv("https://www.dropbox.com/s/znbta9u9tub2ox9/pokemon.csv?dl=1")
poke_1 = tbl_df(poke_1)
KNN_classify_legendary = select(poke_1, is_legendary, hp, weight_kg, height_m, sp_attack, sp_defense, type1, capture_rate)
colnames(KNN_classify_legendary)[7] <- 'type'
KNN_classify_legendary$capture_rate <- as.numeric(KNN_classify_legendary$capture_rate)
KNN_classify_legendary$is_legendary <- as.factor(KNN_classify_legendary$is_legendary)
colSums(is.na(KNN_classify_legendary))
## is_legendary hp weight_kg height_m sp_attack sp_defense
## 0 0 20 20 0 0
## type capture_rate
## 0 1
library(RANN)
pre_process_missing_data <- preProcess(as.data.frame(KNN_classify_legendary), method="knnImpute")
KNN_classify_legendary <- predict(pre_process_missing_data, newdata = KNN_classify_legendary)
colSums(is.na(KNN_classify_legendary))
## is_legendary hp weight_kg height_m sp_attack sp_defense
## 0 0 0 0 0 0
## type capture_rate
## 0 0
set.seed(123)
KNN_train_row_numbers <- createDataPartition(KNN_classify_legendary$is_legendary, p=0.75, list=FALSE)
KNN_train <- KNN_classify_legendary[KNN_train_row_numbers, ]
KNN_test <- KNN_classify_legendary[-KNN_train_row_numbers, ]
KNN_svm.fit <- svm(is_legendary~.,
scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
kernel = "radial", # other options are available
degree = 3, # if kernel is of type = "polynomial"
# gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can provide one for variable x
cost = 1, # the cost function (C)
probability = FALSE, # whether to output probability predictions
na.action = na.omit,
data = KNN_train)
KNN_predict_test_svm <- predict(KNN_svm.fit, KNN_test)
KNN_confusion_svm <- table(Predicted = KNN_predict_test_svm,
Actual = KNN_test$is_legendary)
print(paste0('The KNN-imputed model has higher prediction accuracy. KNN acc score: ', (KNN_confusion_svm[1, 1] + KNN_confusion_svm[2, 2]) / sum(KNN_confusion_svm) * 100))
## [1] "The KNN-imputed model has higher prediction accuracy. KNN acc score: 93.4673366834171"
library(e1071)
library(caret)
library(gbm)
library(pROC)
library(plotROC)
library(ISLR)
data('Default')
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
colSums(is.na(Default))
## default student balance income
## 0 0 0 0
Default$default <- ifelse(Default$default == "No", 0, 1)
Default$student <- ifelse(Default$student == "No", 0, 1)
set.seed(123)
default_train_row_numbers <- createDataPartition(Default$default, p=0.75, list=FALSE)
default_train <- Default[default_train_row_numbers, ]
default_test <- Default[-default_train_row_numbers, ]
logit model
logit_deft <- glm(default ~ ., family = binomial, data = default_train)
probit model
probit_deft <- glm(default ~ ., family = binomial(link = "probit"), data = default_train)
GBM
library(rsample)
library(gbm)
library(xgboost)
library(pdp)
library(ggplot2)
library(purrr)
## adjust hyperparameters
## random_index <- sample(1:nrow(Default), nrow(default_train))
## random_training <- Default[random_index, ]
hyper_grid <- expand.grid(
shrinkage = c(.01, .1, .2),
interaction.depth = c(1, 3, 5),
n.minobsinnode = c(5, 10, 15),
bag.fraction = c(.5, .75, 1),
optimal_trees = 0, # you will fill in values from loop
min_RMSE = 0)
## iterate to find the best combination set of hyperparameters
for(i in 1:nrow(hyper_grid)) {
# reproducibility
set.seed(123)
# train model
gbm.tune <- gbm(
default ~ .,
data = default_train,
distribution = "bernoulli",
cv.folds = 5,
n.trees = 500,
interaction.depth = hyper_grid$interaction.depth[i],
shrinkage = hyper_grid$shrinkage[i],
n.minobsinnode = hyper_grid$n.minobsinnode[i],
bag.fraction = hyper_grid$bag.fraction[i],
train.fraction = .75,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
# locate minimum training error from the n-th tree, add it to grid
hyper_grid$optimal_trees[i] <- which.min(gbm.tune$valid.error)
hyper_grid$min_RMSE[i] <- sqrt(min(gbm.tune$valid.error))
}
optimal_para <- hyper_grid %>% dplyr::arrange(min_RMSE) %>% head(1)
# define a list of optimal hyperparameters
shrink <- optimal_para$shrinkage
ntrees <- optimal_para$optimal_trees
minobs <- optimal_para$n.minobsinnode
bagfrac <- optimal_para$bag.fraction
depth <- optimal_para$interaction.depth
# train final GBM model
gbm_deft <- gbm(
default ~ .,
data = default_train,
distribution = "bernoulli",
cv.folds = 5,
n.trees = ntrees,
interaction.depth = depth,
shrinkage = shrink,
n.minobsinnode = minobs,
bag.fraction = bagfrac,
n.cores = NULL, # will use all cores by default
verbose = FALSE
)
SVM
default_test$default <- as.factor(default_test$default)
class(default_test$default)
## [1] "factor"
default_train$default <- as.factor(default_train$default)
class(default_train$default)
## [1] "factor"
svm_deft <- svm(default ~ .,
scale = TRUE, # the default, x and y are scaled to zero mean and unit variance
kernel = "radial", # other options are available
degree = 3, # if kernel is of type = "polynomial"
# gamma = if (is.vector(x)) 1 else 1 / ncol(x), # you can provide one for variable x
cost = 1, # the cost function (C)
probability = FALSE, # whether to output probability predictions
na.action = na.omit,
data = default_train)
pred_logit_deft <- predict(logit_deft, default_test, type = "response")
pred_probit_deft <- predict(probit_deft, default_test, type = "response")
pred_svm_deft <- predict(svm_deft, default_test)
pred_gbm_deft <- predict(gbm_deft, default_test, type = "response")
(convert logit, probit, gbm results to 0/1)
convert_gbm_pred <- as.numeric(pred_gbm_deft > 0.5)
confmat_gbm <- table(Predicted = convert_gbm_pred, Actual = default_test$default)
(confmat_gbm[1 ,1] + confmat_gbm[2, 2]) / sum(confmat_gbm)
## [1] 0.9736
convert_logit_pred <- as.numeric(pred_logit_deft > 0.5)
confmat_logit <- table(Predicted = convert_logit_pred, Actual = default_test$default)
(confmat_logit[1 ,1] + confmat_logit[2, 2]) / sum(confmat_logit)
## [1] 0.9732
convert_probit_pred <- as.numeric(pred_probit_deft > 0.5)
confmat_probit <- table(Predicted = convert_probit_pred, Actual = default_test$default)
(confmat_probit[1 ,1] + confmat_probit[2, 2]) / sum(confmat_probit)
## [1] 0.9736
confmat_svm <- table(Predicted = pred_svm_deft, Actual = default_test$default)
(confmat_svm[1 ,1] + confmat_svm[2, 2]) / sum(confmat_svm)
## [1] 0.9704
library(pROC)
library(plotROC)
ROC_logit <- roc(default_test$default, convert_logit_pred)
ROC_probit <- roc(default_test$default, convert_probit_pred)
ROC_svm <- roc(default_test$default, as.numeric(pred_svm_deft))
ROC_gbm <- roc(default_test$default, convert_gbm_pred)
auc(ROC_logit)
## Area under the curve: 0.6445
auc(ROC_probit)
## Area under the curve: 0.6388
auc(ROC_svm)
## Area under the curve: 0.5841
auc(ROC_gbm)
## Area under the curve: 0.6447
The AUC of Gradient Boost Machine is largest.
par(mar=c(1,1,1,1))
plot(ROC_logit, xlim = c(1.1, -0.1))
lines(ROC_probit, col="blue")
lines(ROC_svm, col="red")
lines(ROC_gbm, col="green")
legend(0.2, 0.5,
legend = c("model AUC", "logit: 0.6445", "probit: 0.6388", "SVM: 0.5841", "GBM: 0.6447"),
col = c("white", "black", "blue", "red", "green"),
lty = c(1, 1, 2, 3, 4),
pch = c(NA, NA, NA, NA, NA),
cex = 0.7)
Overall, Gradient Boost Machine is the best predictive model.
library(dplyr)
library(rsample)
library(boot)
library(purrr)
kfcv_error <- function(x) {
set.seed(123)
logit_deft_2 <- glm(default ~ student + balance + poly(income, x, raw = FALSE), family = binomial, data = default_train)
cv.glm(default_train, logit_deft_2, K = 10)$delta[1] # I set K = 10
}
me_li <- 1:10 %>% map_dbl(kfcv_error)
me_li
## [1] 0.02147460 0.02150658 0.02153383 0.02157427 0.02160413 0.02162116
## [7] 0.02164158 0.02165694 0.02165978 0.02171254
min_me = 1
min_degree = 0
degree = 0
for (me in me_li){
degree = degree + 1
if (min_me > me){
min_me = me
min_degree = degree
}
}
print(paste0("The lowest mean prediction error ", min_me, " at ", min_degree,"-th degree."))
## [1] "The lowest mean prediction error 0.0214745998145375 at 1-th degree."
result <- function(data, index) {
set.seed(123)
logit_deft_3 <- glm(default ~ student + balance + income, family = binomial, data = default_train, subset = index)
coef(logit_deft_3)
}
set.seed(123)
boot(default_train, result, R = 1000)
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = default_train, statistic = result, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* -1.107857e+01 -3.623368e-02 5.653469e-01
## t2* -5.506691e-01 -1.486520e-02 2.704693e-01
## t3* 5.772478e-03 2.829435e-05 2.660595e-04
## t4* 7.082431e-06 -2.660294e-07 9.532645e-06
t3* which is balance has the largest bias.