DATA622 ASSIGNMENT 1 - Run the model exercise
library(tidyverse)
library(tidymodels)
library(GGally)
library(discrim)
library(kknn)
library(knitr)
library(kableExtra)
1 Data Exploration
1.1 Load Data
# Read the assignment data set from GitHub and convert every character column
# to a factor. across(where(...)) replaces the superseded mutate_if().
data <- read_csv("https://raw.githubusercontent.com/oggyluky11/DATA622-FALL-2020/main/HW1/data.csv") %>%
  mutate(across(where(is.character), as.factor))
data
1.2 Data Summary
## X Y label
## Min. : 5 a:6 BLACK:22
## 1st Qu.:19 b:6 BLUE :14
## Median :43 c:6
## Mean :38 d:6
## 3rd Qu.:55 e:6
## Max. :63 f:6
1.3 Train Test Split
# Fix the RNG seed so the random split below is reproducible.
set.seed(123)

# Assign 75% of the rows to training, stratifying on the `label` column.
train_test_split <- initial_split(data = data, prop = 0.75, strata = "label")

# Materialise the two partitions of the split.
data_test <- testing(train_test_split)
data_train <- training(train_test_split)
1.3.1 Training Set
## X Y label
## Min. : 5.00 a:6 BLACK:17
## 1st Qu.:19.00 b:6 BLUE :11
## Median :43.00 c:4
## Mean :38.71 d:4
## 3rd Qu.:55.00 e:4
## Max. :63.00 f:4
1.3.2 Testing Set
## X Y label
## Min. : 5.0 a:0 BLACK:5
## 1st Qu.:15.5 b:0 BLUE :3
## Median :43.0 c:2
## Mean :35.5 d:2
## 3rd Qu.:52.0 e:2
## Max. :63.0 f:2
2 Logistic Regression
2.1 Construct Logistic Regression Classifier
# Specify a logistic regression classifier on the glm engine and fit it to the
# training set, predicting `label` from all remaining columns.
LR_fit <- logistic_reg() %>%
  set_mode("classification") %>%
  set_engine("glm") %>%
  fit(formula = label ~ ., data = data_train)
LR_fit
## parsnip model object
##
## Fit time: 21ms
##
## Call: stats::glm(formula = label ~ ., family = stats::binomial, data = data)
##
## Coefficients:
## (Intercept) X Yb Yc Yd Ye
## -6.081e-01 -2.247e-03 -2.732e-16 1.797e+00 -4.046e-01 -3.943e-01
## Yf
## 6.879e-01
##
## Degrees of Freedom: 27 Total (i.e. Null); 21 Residual
## Null Deviance: 37.52
## Residual Deviance: 34.31 AIC: 48.31
2.2 Model Prediction
2.2.1 Prediction on Training Set
# Combine the predicted class, the original training columns and the per-class
# probabilities, then move the predicted and true labels to the front.
LR_train_pred <- bind_cols(
  predict(LR_fit, data_train),
  data_train,
  predict(LR_fit, data_train, type = "prob")
) %>%
  select(.pred_class, label, everything())
LR_train_pred
2.2.2 Model Evaluation on Training Data
2.2.2.1 ROC Curve & AUC on Training Data
# ROC AUC on the training predictions, scored with the BLACK probability.
# Only the estimate is kept, as a single column named AUC.
LR_train_pred_AUC <- LR_train_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)
LR_train_pred_AUC
2.2.2.2 Metrics from Confusion Matrix
# Compute accuracy metrics from a 2x2 confusion matrix.
#
# The cells are read as m[1, 1] = true positives, m[1, 2] = false positives,
# m[2, 1] = false negatives and m[2, 2] = true negatives, i.e. predicted
# classes in rows and true classes in columns, positive class first.
#
# Args:
#   m: a 2x2 matrix or table of counts.
#
# Returns:
#   A one-row data.frame with columns ACCURACY, TPR, FPR, TNR and FNR.
#
# Raises:
#   An error if `m` is not 2x2, because the fixed cell indexing below would
#   otherwise silently read the wrong cells.
confusion_matrix_metrics <- function(m) {
  if (!identical(dim(m), c(2L, 2L))) {
    stop("`m` must be a 2x2 confusion matrix.", call. = FALSE)
  }

  tp <- m[1, 1]
  fp <- m[1, 2]
  fn <- m[2, 1]
  tn <- m[2, 2]

  ACCURACY <- (tp + tn) / sum(m)
  TPR <- tp / (tp + fn)
  TNR <- tn / (tn + fp)
  # The error rates are the complements of the corresponding hit rates.
  FPR <- 1 - TNR
  FNR <- 1 - TPR

  data.frame(ACCURACY, TPR, FPR, TNR, FNR)
}
# Cross-tabulate the predicted class (rows) against the true label (columns)
# for the training predictions.
LR_train_conf_m <- as.matrix(table(select(LR_train_pred, .pred_class, label)))
LR_train_conf_m
## label
## .pred_class BLACK BLUE
## BLACK 15 7
## BLUE 2 4
2.2.3 Overall Performance Metrics Table
# Derive the accuracy metrics from the training confusion matrix; the summary
# row below requires them.
LR_train_conf_m_metrics <- confusion_matrix_metrics(LR_train_conf_m)

# One-row summary for logistic regression on the training set: identifying
# labels, the AUC, and the confusion-matrix metrics side by side.
LR_train_metrics <- bind_cols(
  Algo = "LR",
  Data_Set = "Train",
  AUC = LR_train_pred_AUC,
  LR_train_conf_m_metrics
)
LR_train_metrics
2.2.4 Prediction on Testing data & Calculate Overall Performance Metrics Table
Use the same approach to make predictions on the testing data.
# Score the testing rows: predicted class, original columns and per-class
# probabilities, with the predicted and true labels first.
LR_test_pred <- bind_cols(
  predict(LR_fit, data_test),
  data_test,
  predict(LR_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# Plot the ROC curve for the testing data.
autoplot(roc_curve(LR_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a single column named AUC.
LR_test_pred_AUC <- LR_test_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data (predictions in rows, truth in columns).
LR_test_conf_m <- as.matrix(table(select(LR_test_pred, .pred_class, label)))

# Accuracy metrics derived from the confusion matrix.
LR_test_conf_m_metrics <- confusion_matrix_metrics(LR_test_conf_m)

# One-row summary for logistic regression on the testing set.
LR_test_metrics <- bind_cols(
  Algo = "LR",
  Data_Set = "Test",
  AUC = LR_test_pred_AUC,
  LR_test_conf_m_metrics
)
LR_test_metrics
3 Naive Bayes Classifier
Construct a Naive Bayes Classifier
# Specify a naive Bayes classifier on the klaR engine and fit it to the
# training set, predicting `label` from all remaining columns.
NB_fit <- naive_Bayes() %>%
  set_mode("classification") %>%
  set_engine("klaR") %>%
  fit(formula = label ~ ., data = data_train)
NB_fit
## parsnip model object
##
## Fit time: 0ms
## $apriori
## grouping
## BLACK BLUE
## 0.6071429 0.3928571
##
## $tables
## $tables$X
## $tables$X$BLACK
##
## Call:
## density.default(x = xx)
##
## Data: xx (17 obs.); Bandwidth 'bw' = 7.622
##
## x y
## Min. :-17.866 Min. :3.922e-05
## 1st Qu.: 8.067 1st Qu.:3.201e-03
## Median : 34.000 Median :8.266e-03
## Mean : 34.000 Mean :9.628e-03
## 3rd Qu.: 59.933 3rd Qu.:1.476e-02
## Max. : 85.866 Max. :2.592e-02
##
## $tables$X$BLUE
##
## Call:
## density.default(x = xx)
##
## Data: xx (11 obs.); Bandwidth 'bw' = 12.72
##
## x y
## Min. :-33.1542 Min. :0.0000346
## 1st Qu.: 0.4229 1st Qu.:0.0016252
## Median : 34.0000 Median :0.0093722
## Mean : 34.0000 Mean :0.0074337
## 3rd Qu.: 67.5771 3rd Qu.:0.0122704
## Max. :101.1542 Max. :0.0144140
##
##
## $tables$Y
## var
## grouping a b c d e f
## BLACK 0.23529412 0.23529412 0.05882353 0.17647059 0.17647059 0.11764706
## BLUE 0.18181818 0.18181818 0.27272727 0.09090909 0.09090909 0.18181818
##
##
## $levels
## [1] "BLACK" "BLUE"
##
## $call
## NaiveBayes.default(x = ~as.data.frame(x), grouping = ~y, usekernel = ~TRUE)
##
## $x
## X Y
## 1 5 a
## 2 5 b
## 3 5 d
## 4 5 f
## 5 19 a
## 6 19 b
## 7 19 c
## 8 19 e
## 9 19 f
## 10 35 a
## 11 35 b
## 12 35 c
## 13 35 d
## 14 35 e
## 15 51 a
## 16 51 b
## 17 51 c
## 18 51 d
## 19 55 a
## 20 55 b
## 21 55 c
## 22 55 e
## 23 55 f
## 24 63 a
## 25 63 b
## 26 63 d
## 27 63 e
## 28 63 f
##
## $usekernel
## [1] TRUE
##
## $varnames
## [1] "X" "Y"
##
## attr(,"class")
## [1] "NaiveBayes"
3.1 Model Prediction
3.1.1 Prediction on Training Set
# Combine the predicted class, the original training columns and the per-class
# probabilities, then move the predicted and true labels to the front.
NB_train_pred <- bind_cols(
  predict(NB_fit, data_train),
  data_train,
  predict(NB_fit, data_train, type = "prob")
) %>%
  select(.pred_class, label, everything())
NB_train_pred
3.1.2 Model Evaluation on Training Data
3.1.2.1 ROC Curve & AUC on Training Data
# ROC AUC on the training predictions, scored with the BLACK probability.
# Only the estimate is kept, as a single column named AUC.
NB_train_pred_AUC <- NB_train_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)
NB_train_pred_AUC
3.1.2.2 Metrics from Confusion Matrix
# Cross-tabulate the predicted class (rows) against the true label (columns)
# for the training predictions.
NB_train_conf_m <- as.matrix(table(select(NB_train_pred, .pred_class, label)))
NB_train_conf_m
## label
## .pred_class BLACK BLUE
## BLACK 16 5
## BLUE 1 6
3.1.3 Overall Performance Metrics Table
# Derive the accuracy metrics from the training confusion matrix; the summary
# row below requires them.
NB_train_conf_m_metrics <- confusion_matrix_metrics(NB_train_conf_m)

# One-row summary for naive Bayes on the training set: identifying labels,
# the AUC, and the confusion-matrix metrics side by side.
NB_train_metrics <- bind_cols(
  Algo = "NB",
  Data_Set = "Train",
  AUC = NB_train_pred_AUC,
  NB_train_conf_m_metrics
)
NB_train_metrics
3.1.4 Prediction on Testing data & Calculate Overall Performance Metrics Table
Use the same approach to make predictions on the testing data.
# Score the testing rows: predicted class, original columns and per-class
# probabilities, with the predicted and true labels first.
NB_test_pred <- bind_cols(
  predict(NB_fit, data_test),
  data_test,
  predict(NB_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# Plot the ROC curve for the testing data.
autoplot(roc_curve(NB_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a single column named AUC.
NB_test_pred_AUC <- NB_test_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data (predictions in rows, truth in columns).
NB_test_conf_m <- as.matrix(table(select(NB_test_pred, .pred_class, label)))

# Accuracy metrics derived from the confusion matrix.
NB_test_conf_m_metrics <- confusion_matrix_metrics(NB_test_conf_m)

# One-row summary for naive Bayes on the testing set.
NB_test_metrics <- bind_cols(
  Algo = "NB",
  Data_Set = "Test",
  AUC = NB_test_pred_AUC,
  NB_test_conf_m_metrics
)
NB_test_metrics
4 KNN Classifier (k = 3)
Construct a KNN Classifier
# Specify a 3-nearest-neighbour classifier on the kknn engine, translate the
# specification, and fit it to the training set with `label` as the outcome.
KNN_3_fit <- nearest_neighbor(neighbors = 3) %>%
  set_mode("classification") %>%
  set_engine("kknn") %>%
  translate() %>%
  fit(formula = label ~ ., data = data_train)
KNN_3_fit
## parsnip model object
##
## Fit time: 10ms
##
## Call:
## kknn::train.kknn(formula = label ~ ., data = data, ks = ~3)
##
## Type of response variable: nominal
## Minimal misclassification: 0.4642857
## Best kernel: optimal
## Best k: 3
4.1 Model Prediction
4.1.1 Prediction on Training Set
# Combine the predicted class, the original training columns and the per-class
# probabilities, then move the predicted and true labels to the front.
# The class predictions are bound directly, without an extra data.frame()
# wrapper, so the result has the same shape and class as the LR and NB
# prediction tables.
KNN_3_train_pred <- predict(KNN_3_fit, data_train) %>%
  bind_cols(data_train) %>%
  bind_cols(predict(KNN_3_fit, data_train, type = "prob")) %>%
  select(.pred_class, label, everything())
KNN_3_train_pred
4.1.2 Model Evaluation on Training Data
4.1.2.1 ROC Curve & AUC on Training Data
# ROC AUC on the k = 3 training predictions, scored with the BLACK
# probability. Only the estimate is kept, as a single column named AUC.
KNN_3_train_pred_AUC <- KNN_3_train_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)
KNN_3_train_pred_AUC
4.1.2.2 Metrics from Confusion Matrix
# Cross-tabulate the predicted class (rows) against the true label (columns)
# for the k = 3 training predictions.
KNN_3_train_conf_m <- as.matrix(
  table(select(KNN_3_train_pred, .pred_class, label))
)
KNN_3_train_conf_m
## label
## .pred_class BLACK BLUE
## BLACK 17 0
## BLUE 0 11
# Accuracy, TPR, FPR, TNR and FNR derived from the k = 3 training confusion
# matrix.
KNN_3_train_conf_m_metrics <- KNN_3_train_conf_m %>%
  confusion_matrix_metrics()
KNN_3_train_conf_m_metrics
4.1.3 Overall Performance Metrics Table
# One-row summary for KNN (k = 3) on the training set: identifying labels,
# the AUC, and the confusion-matrix metrics side by side.
KNN_3_train_metrics <- bind_cols(
  Algo = "KNN (k=3)",
  Data_Set = "Train",
  AUC = KNN_3_train_pred_AUC,
  KNN_3_train_conf_m_metrics
)
KNN_3_train_metrics
4.1.4 Prediction on Testing data & Calculate Overall Performance Metrics Table
Use the same approach to make predictions on the testing data.
# Score the testing rows: predicted class, original columns and per-class
# probabilities, with the predicted and true labels first.
KNN_3_test_pred <- bind_cols(
  predict(KNN_3_fit, data_test),
  data_test,
  predict(KNN_3_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# Plot the ROC curve for the testing data.
autoplot(roc_curve(KNN_3_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a single column named AUC.
KNN_3_test_pred_AUC <- KNN_3_test_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data (predictions in rows, truth in columns).
KNN_3_test_conf_m <- as.matrix(
  table(select(KNN_3_test_pred, .pred_class, label))
)

# Accuracy metrics derived from the confusion matrix.
KNN_3_test_conf_m_metrics <- confusion_matrix_metrics(KNN_3_test_conf_m)

# One-row summary for KNN (k = 3) on the testing set.
KNN_3_test_metrics <- bind_cols(
  Algo = "KNN (k=3)",
  Data_Set = "Test",
  AUC = KNN_3_test_pred_AUC,
  KNN_3_test_conf_m_metrics
)
KNN_3_test_metrics
5 KNN Classifier (k = 5)
Construct a KNN Classifier
# Specify a 5-nearest-neighbour classifier on the kknn engine, translate the
# specification, and fit it to the training set with `label` as the outcome.
KNN_5_fit <- nearest_neighbor(neighbors = 5) %>%
  set_mode("classification") %>%
  set_engine("kknn") %>%
  translate() %>%
  fit(formula = label ~ ., data = data_train)
KNN_5_fit
## parsnip model object
##
## Fit time: 0ms
##
## Call:
## kknn::train.kknn(formula = label ~ ., data = data, ks = ~5)
##
## Type of response variable: nominal
## Minimal misclassification: 0.3571429
## Best kernel: optimal
## Best k: 5
5.1 Model Prediction
5.1.1 Prediction on Training Set
# Combine the predicted class, the original training columns and the per-class
# probabilities, then move the predicted and true labels to the front.
# The class predictions are bound directly, without an extra data.frame()
# wrapper, so the result has the same shape and class as the LR and NB
# prediction tables.
KNN_5_train_pred <- predict(KNN_5_fit, data_train) %>%
  bind_cols(data_train) %>%
  bind_cols(predict(KNN_5_fit, data_train, type = "prob")) %>%
  select(.pred_class, label, everything())
KNN_5_train_pred
5.1.2 Model Evaluation on Training Data
5.1.2.1 ROC Curve & AUC on Training Data
# ROC AUC on the k = 5 training predictions, scored with the BLACK
# probability. The input must be KNN_5_train_pred: scoring the k = 3
# predictions here would report the k = 3 model's AUC under the k = 5 name.
KNN_5_train_pred_AUC <- KNN_5_train_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(.estimate) %>%
  rename(AUC = `.estimate`)
KNN_5_train_pred_AUC
5.1.2.2 Metrics from Confusion Matrix
# Cross-tabulate the predicted class (rows) against the true label (columns)
# for the k = 5 training predictions.
KNN_5_train_conf_m <- as.matrix(
  table(select(KNN_5_train_pred, .pred_class, label))
)
KNN_5_train_conf_m
## label
## .pred_class BLACK BLUE
## BLACK 17 3
## BLUE 0 8
# Accuracy, TPR, FPR, TNR and FNR derived from the k = 5 training confusion
# matrix.
KNN_5_train_conf_m_metrics <- KNN_5_train_conf_m %>%
  confusion_matrix_metrics()
KNN_5_train_conf_m_metrics
5.1.3 Overall Performance Metrics Table
# One-row summary for KNN (k = 5) on the training set: identifying labels,
# the AUC, and the confusion-matrix metrics side by side.
KNN_5_train_metrics <- bind_cols(
  Algo = "KNN (K=5)",
  Data_Set = "Train",
  AUC = KNN_5_train_pred_AUC,
  KNN_5_train_conf_m_metrics
)
KNN_5_train_metrics
5.1.4 Prediction on Testing data & Calculate Overall Performance Metrics Table
Use the same approach to make predictions on the testing data.
# Score the testing rows: predicted class, original columns and per-class
# probabilities, with the predicted and true labels first.
KNN_5_test_pred <- bind_cols(
  predict(KNN_5_fit, data_test),
  data_test,
  predict(KNN_5_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# Plot the ROC curve for the testing data.
autoplot(roc_curve(KNN_5_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a single column named AUC.
KNN_5_test_pred_AUC <- KNN_5_test_pred %>%
  roc_auc(truth = label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data (predictions in rows, truth in columns).
KNN_5_test_conf_m <- as.matrix(
  table(select(KNN_5_test_pred, .pred_class, label))
)

# Accuracy metrics derived from the confusion matrix.
KNN_5_test_conf_m_metrics <- confusion_matrix_metrics(KNN_5_test_conf_m)

# One-row summary for KNN (k = 5) on the testing set.
KNN_5_test_metrics <- bind_cols(
  Algo = "KNN (k=5)",
  Data_Set = "Test",
  AUC = KNN_5_test_pred_AUC,
  KNN_5_test_conf_m_metrics
)
KNN_5_test_metrics
6 Summary
6.1 Model Performance Metrics
6.1.1 Metrics of Training Set (Model’s Capacity to Learn)
# Stack the four training-set summary rows, drop the Data_Set column, and
# render the result as a captioned table.
list(
  LR_train_metrics,
  NB_train_metrics,
  KNN_3_train_metrics,
  KNN_5_train_metrics
) %>%
  bind_rows() %>%
  select(-Data_Set) %>%
  kable(caption = "Model Performance Metrics on Training Data")
Algo | AUC | ACCURACY | TPR | FPR | TNR | FNR |
---|---|---|---|---|---|---|
LR | 0.6737968 | 0.6785714 | 0.8823529 | 0.6363636 | 0.3636364 | 0.1176471 |
NB | 0.8235294 | 0.7857143 | 0.9411765 | 0.4545455 | 0.5454545 | 0.0588235 |
KNN (k=3) | 1.0000000 | 1.0000000 | 1.0000000 | 0.0000000 | 1.0000000 | 0.0000000 |
KNN (K=5) | 1.0000000 | 0.8928571 | 1.0000000 | 0.2727273 | 0.7272727 | 0.0000000 |
6.1.2 Metrics of Testing Set (Model’s Ability to Generalize)
# Stack the four testing-set summary rows, drop the Data_Set column, and
# render the result as a captioned table.
list(
  LR_test_metrics,
  NB_test_metrics,
  KNN_3_test_metrics,
  KNN_5_test_metrics
) %>%
  bind_rows() %>%
  select(-Data_Set) %>%
  kable(caption = "Model Performance Metrics on Testing Data")
Algo | AUC | ACCURACY | TPR | FPR | TNR | FNR |
---|---|---|---|---|---|---|
LR | 0.8000000 | 0.750 | 0.8 | 0.3333333 | 0.6666667 | 0.2 |
NB | 1.0000000 | 0.875 | 1.0 | 0.3333333 | 0.6666667 | 0.0 |
KNN (k=3) | 0.6333333 | 0.625 | 0.8 | 0.6666667 | 0.3333333 | 0.2 |
KNN (k=5) | 0.7333333 | 0.750 | 0.8 | 0.3333333 | 0.6666667 | 0.2 |
6.2 Model Performance Analysis
6.2.1 Model Description
1. Logistic Regression (LR) LR is a discriminative classifier, meaning it directly computes the conditional probability P(class|data). The classification is determined based on a given probability threshold. Also, given a sufficient sample size, logistic regression performs better than the Naive Bayes classifier in terms of accuracy.
2. Naive Bayes Classifier (NB) NB is a generative classifier because it first models the joint probability P(class, data) and then computes the conditional probability P(class|data). The generated probability is an N-valued discrete output, where N is the number of classes in the response variable (the label); the classification is determined to be the class with the largest P(class|data). Also, regardless of sample size, Naive Bayes always converges faster to its asymptotic error than LR, which means it ‘learns’ faster.
3. KNN Classifier KNN is a discriminative algorithm since it directly models the conditional probability P(class|data). The performance of KNN depends on the choice of K and is negatively impacted by the complexity of the variable space (the number of variables), because it relies directly on the distance between the target data point and the other data points. As the variable space becomes more and more complex, distances computed in high dimensions stop being a good representation of similarity. The data used in this project has only two predictor variables, therefore KNN can perform well.
6.2.2 Selection of Model that Has Best Capacity to Learn
According to the model performance statistics on the training data, the KNN model with K = 3 has the best performance statistics in terms of AUC, Accuracy, TPR, FPR, TNR and FNR, followed by KNN with K = 5, Naive Bayes and Logistic Regression, in that order. Therefore KNN with K = 3 shows the best capacity to learn among all models in this project.
6.2.3 Selection of Model that Has Best Performance on Generalizing.
According to the model performance statistics on the testing data, Naive Bayes (NB) has the best performance statistics in terms of AUC, Accuracy, TPR, FPR, TNR and FNR, followed by Logistic Regression, KNN with K = 5 and KNN with K = 3, in that order. Therefore Naive Bayes shows the best ability to generalize among all models in this project.
However, from a business perspective, because the models are built on a data set that contains only 36 observations, their performance should be re-evaluated as more data are collected in future business operations.