library(tidyverse)
library(tidymodels)
library(GGally)
library(discrim)
library(kknn)
library(knitr)
library(kableExtra)

1 Data Exploration

1.1 Load Data

# Load the assignment data set and convert every character column to a factor.
# across(where(...)) replaces the superseded mutate_if() with the same result.
data <- read_csv('https://raw.githubusercontent.com/oggyluky11/DATA622-FALL-2020/main/HW1/data.csv') %>%
  mutate(across(where(is.character), as.factor))

data

1.2 Data Summary

summary(data)

##        X      Y       label   
##  Min.   : 5   a:6   BLACK:22  
##  1st Qu.:19   b:6   BLUE :14  
##  Median :43   c:6             
##  Mean   :38   d:6             
##  3rd Qu.:55   e:6             
##  Max.   :63   f:6

1.3 Train Test Split

# Fix the RNG seed so the split below is reproducible.
set.seed(123)

# 75/25 split, stratified on the response so both sets contain both classes.
train_test_split <- data %>%
  initial_split(prop = 0.75, strata = "label")
data_train <- train_test_split %>% training()
data_test <- train_test_split %>% testing()

1.3.1 Training Set

data_train

summary(data_train)

##        X         Y       label   
##  Min.   : 5.00   a:6   BLACK:17  
##  1st Qu.:19.00   b:6   BLUE :11  
##  Median :43.00   c:4             
##  Mean   :38.71   d:4             
##  3rd Qu.:55.00   e:4             
##  Max.   :63.00   f:4

1.3.2 Testing Set

data_test

summary(data_test)

##        X        Y       label  
##  Min.   : 5.0   a:0   BLACK:5  
##  1st Qu.:15.5   b:0   BLUE :3  
##  Median :43.0   c:2            
##  Mean   :35.5   d:2            
##  3rd Qu.:52.0   e:2            
##  Max.   :63.0   f:2

2 Logistic Regression

2.1 Construct Logistic Regression Classifier

# Logistic regression (glm engine) predicting label from all other columns,
# fitted on the training set only.
LR_fit <- logistic_reg(mode = "classification") %>%
  set_engine("glm") %>%
  fit(formula = label ~ ., data = data_train)


LR_fit

## parsnip model object
## 
## Fit time:  21ms 
## 
## Call:  stats::glm(formula = label ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
## (Intercept)            X           Yb           Yc           Yd           Ye  
##  -6.081e-01   -2.247e-03   -2.732e-16    1.797e+00   -4.046e-01   -3.943e-01  
##          Yf  
##   6.879e-01  
## 
## Degrees of Freedom: 27 Total (i.e. Null);  21 Residual
## Null Deviance:       37.52 
## Residual Deviance: 34.31     AIC: 48.31

2.2 Model Prediction

2.2.1 Prediction on Training Set

# Predicted class, the original training columns, and the per-class
# probabilities side by side; .pred_class and label are moved to the front.
LR_train_pred <- bind_cols(
  predict(LR_fit, data_train),
  data_train,
  predict(LR_fit, data_train, type = "prob")
) %>%
  select(.pred_class, label, everything())


LR_train_pred

2.2.2 Model Evaluation on Training Data

2.2.2.1 ROC Curve & AUC on Training Data

# ROC curve on the training data, scored by the predicted probability of BLACK.
autoplot(roc_curve(LR_train_pred, label, .pred_BLACK))

# Training AUC, kept as a one-column tibble named AUC.
LR_train_pred_AUC <- LR_train_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

LR_train_pred_AUC

2.2.2.2 Metrics from Confusion Matrix

#' Compute summary metrics from a 2x2 confusion matrix
#'
#' Expected layout: rows are predicted classes, columns are true classes, and
#' the positive class comes first, so `m[1, 1]` holds the true positives and
#' `m[2, 2]` the true negatives.
#'
#' @param m A 2x2 matrix or table of counts.
#' @return A one-row data.frame with columns ACCURACY, TPR, FPR, TNR and FNR.
#'   A rate is NaN when its denominator is zero (e.g. no positive cases).
confusion_matrix_metrics <- function(m) {
  # Fail fast on anything that is not 2x2: indexing a larger table below would
  # silently ignore cells and report misleading metrics.
  if (length(dim(m)) != 2L || any(dim(m) != 2L)) {
    stop("`m` must be a 2x2 confusion matrix.", call. = FALSE)
  }

  tp <- m[1, 1]
  fp <- m[1, 2]
  fn <- m[2, 1]
  tn <- m[2, 2]

  ACCURACY <- (tp + tn) / sum(m)
  TPR <- tp / (tp + fn)
  TNR <- tn / (tn + fp)
  FPR <- 1 - TNR
  FNR <- 1 - TPR

  data.frame(ACCURACY, TPR, FPR, TNR, FNR)
}

# Cross-tabulate predictions (rows) against true labels (columns).
LR_train_conf_m <- table(
  .pred_class = LR_train_pred$.pred_class,
  label = LR_train_pred$label
) %>%
  as.matrix()

LR_train_conf_m

##            label
## .pred_class BLACK BLUE
##       BLACK    15    7
##       BLUE      2    4

# Derive ACCURACY/TPR/FPR/TNR/FNR from the LR training confusion matrix,
# then print the one-row result.
LR_train_conf_m_metrics <- confusion_matrix_metrics(LR_train_conf_m)
LR_train_conf_m_metrics

2.2.3 Overall Performance Metrics Table

# One-row summary for LR on the training data: identifying labels, the AUC,
# and the confusion-matrix metrics.
LR_train_metrics <- bind_cols(
  tibble(Algo = "LR", Data_Set = "Train"),
  LR_train_pred_AUC,
  LR_train_conf_m_metrics
)

LR_train_metrics

2.2.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

Using the same approaches to make prediction on testing data

# Score the testing rows: predicted class, original columns, class probabilities.
LR_test_pred <- bind_cols(
  predict(LR_fit, data_test),
  data_test,
  predict(LR_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# ROC curve on the testing data, scored by the predicted probability of BLACK.
autoplot(roc_curve(LR_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a one-column tibble named AUC.
LR_test_pred_AUC <- LR_test_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data: predictions in rows, truth in columns.
LR_test_conf_m <- table(
  .pred_class = LR_test_pred$.pred_class,
  label = LR_test_pred$label
) %>%
  as.matrix()

# Accuracy and rate metrics derived from the confusion matrix.
LR_test_conf_m_metrics <- confusion_matrix_metrics(LR_test_conf_m)

# One-row summary of all LR metrics on the testing data.
LR_test_metrics <- bind_cols(
  tibble(Algo = "LR", Data_Set = "Test"),
  LR_test_pred_AUC,
  LR_test_conf_m_metrics
)

LR_test_metrics

3 Naive Bayes Classifier

Construct a Naive Bayes Classifier

# Naive Bayes (klaR engine) predicting label from all other columns,
# fitted on the training set only.
NB_fit <- naive_Bayes(mode = "classification") %>%
  set_engine("klaR") %>%
  fit(formula = label ~ ., data = data_train)

NB_fit

## parsnip model object
## 
## Fit time:  0ms 
## $apriori
## grouping
##     BLACK      BLUE 
## 0.6071429 0.3928571 
## 
## $tables
## $tables$X
## $tables$X$BLACK
## 
## Call:
##  density.default(x = xx)
## 
## Data: xx (17 obs.);  Bandwidth 'bw' = 7.622
## 
##        x                 y            
##  Min.   :-17.866   Min.   :3.922e-05  
##  1st Qu.:  8.067   1st Qu.:3.201e-03  
##  Median : 34.000   Median :8.266e-03  
##  Mean   : 34.000   Mean   :9.628e-03  
##  3rd Qu.: 59.933   3rd Qu.:1.476e-02  
##  Max.   : 85.866   Max.   :2.592e-02  
## 
## $tables$X$BLUE
## 
## Call:
##  density.default(x = xx)
## 
## Data: xx (11 obs.);  Bandwidth 'bw' = 12.72
## 
##        x                  y            
##  Min.   :-33.1542   Min.   :0.0000346  
##  1st Qu.:  0.4229   1st Qu.:0.0016252  
##  Median : 34.0000   Median :0.0093722  
##  Mean   : 34.0000   Mean   :0.0074337  
##  3rd Qu.: 67.5771   3rd Qu.:0.0122704  
##  Max.   :101.1542   Max.   :0.0144140  
## 
## 
## $tables$Y
##         var
## grouping          a          b          c          d          e          f
##    BLACK 0.23529412 0.23529412 0.05882353 0.17647059 0.17647059 0.11764706
##    BLUE  0.18181818 0.18181818 0.27272727 0.09090909 0.09090909 0.18181818
## 
## 
## $levels
## [1] "BLACK" "BLUE" 
## 
## $call
## NaiveBayes.default(x = ~as.data.frame(x), grouping = ~y, usekernel = ~TRUE)
## 
## $x
##     X Y
## 1   5 a
## 2   5 b
## 3   5 d
## 4   5 f
## 5  19 a
## 6  19 b
## 7  19 c
## 8  19 e
## 9  19 f
## 10 35 a
## 11 35 b
## 12 35 c
## 13 35 d
## 14 35 e
## 15 51 a
## 16 51 b
## 17 51 c
## 18 51 d
## 19 55 a
## 20 55 b
## 21 55 c
## 22 55 e
## 23 55 f
## 24 63 a
## 25 63 b
## 26 63 d
## 27 63 e
## 28 63 f
## 
## $usekernel
## [1] TRUE
## 
## $varnames
## [1] "X" "Y"
## 
## attr(,"class")
## [1] "NaiveBayes"

3.1 Model Prediction

3.1.1 Prediction on Training Set

# Predicted class, the original training columns, and the per-class
# probabilities side by side; .pred_class and label are moved to the front.
NB_train_pred <- bind_cols(
  predict(NB_fit, data_train),
  data_train,
  predict(NB_fit, data_train, type = "prob")
) %>%
  select(.pred_class, label, everything())


NB_train_pred

3.1.2 Model Evaluation on Training Data

3.1.2.1 ROC Curve & AUC on Training Data

# ROC curve on the training data, scored by the predicted probability of BLACK.
autoplot(roc_curve(NB_train_pred, label, .pred_BLACK))

# Training AUC, kept as a one-column tibble named AUC.
NB_train_pred_AUC <- NB_train_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

NB_train_pred_AUC

3.1.2.2 Metrics from Confusion Matrix

# Cross-tabulate predictions (rows) against true labels (columns).
NB_train_conf_m <- table(
  .pred_class = NB_train_pred$.pred_class,
  label = NB_train_pred$label
) %>%
  as.matrix()

NB_train_conf_m

##            label
## .pred_class BLACK BLUE
##       BLACK    16    5
##       BLUE      1    6

# Derive ACCURACY/TPR/FPR/TNR/FNR from the NB training confusion matrix,
# then print the one-row result.
NB_train_conf_m_metrics <- confusion_matrix_metrics(NB_train_conf_m)
NB_train_conf_m_metrics

3.1.3 Overall Performance Metrics Table

# One-row summary for NB on the training data: identifying labels, the AUC,
# and the confusion-matrix metrics.
NB_train_metrics <- bind_cols(
  tibble(Algo = "NB", Data_Set = "Train"),
  NB_train_pred_AUC,
  NB_train_conf_m_metrics
)

NB_train_metrics

3.1.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

Using the same approaches to make prediction on testing data

# Score the testing rows: predicted class, original columns, class probabilities.
NB_test_pred <- bind_cols(
  predict(NB_fit, data_test),
  data_test,
  predict(NB_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# ROC curve on the testing data, scored by the predicted probability of BLACK.
autoplot(roc_curve(NB_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a one-column tibble named AUC.
NB_test_pred_AUC <- NB_test_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data: predictions in rows, truth in columns.
NB_test_conf_m <- table(
  .pred_class = NB_test_pred$.pred_class,
  label = NB_test_pred$label
) %>%
  as.matrix()

# Accuracy and rate metrics derived from the confusion matrix.
NB_test_conf_m_metrics <- confusion_matrix_metrics(NB_test_conf_m)

# One-row summary of all NB metrics on the testing data.
NB_test_metrics <- bind_cols(
  tibble(Algo = "NB", Data_Set = "Test"),
  NB_test_pred_AUC,
  NB_test_conf_m_metrics
)

NB_test_metrics

4 KNN Classifier (k = 3)

Construct a KNN Classifier

# k-nearest-neighbours (kknn engine) with k = 3, predicting label from all
# other columns, fitted on the training set only.
# NOTE(review): translate() before fit() looks redundant; kept as-is, confirm.
KNN_3_fit <- nearest_neighbor(mode = "classification", neighbors = 3) %>%
  set_engine("kknn") %>%
  translate() %>%
  fit(formula = label ~ ., data = data_train)

KNN_3_fit

## parsnip model object
## 
## Fit time:  10ms 
## 
## Call:
## kknn::train.kknn(formula = label ~ ., data = data, ks = ~3)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4642857
## Best kernel: optimal
## Best k: 3

4.1 Model Prediction

4.1.1 Prediction on Training Set

# Predicted class, the original training columns, and the per-class
# probabilities side by side; .pred_class and label are moved to the front.
# NOTE(review): the data.frame() wrapper looks redundant (the testing-set
# pipeline omits it); it is kept so the result's class is unchanged.
KNN_3_train_pred <- data.frame(`.pred_class` = predict(KNN_3_fit, data_train)) %>%
  bind_cols(data_train, predict(KNN_3_fit, data_train, type = "prob")) %>%
  select(.pred_class, label, everything())


KNN_3_train_pred

4.1.2 Model Evaluation on Training Data

4.1.2.1 ROC Curve & AUC on Training Data

# ROC curve on the training data, scored by the predicted probability of BLACK.
autoplot(roc_curve(KNN_3_train_pred, label, .pred_BLACK))

# Training AUC, kept as a one-column tibble named AUC.
KNN_3_train_pred_AUC <- KNN_3_train_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

KNN_3_train_pred_AUC

4.1.2.2 Metrics from Confusion Matrix

# Cross-tabulate predictions (rows) against true labels (columns).
KNN_3_train_conf_m <- table(
  .pred_class = KNN_3_train_pred$.pred_class,
  label = KNN_3_train_pred$label
) %>%
  as.matrix()

KNN_3_train_conf_m

##            label
## .pred_class BLACK BLUE
##       BLACK    17    0
##       BLUE      0   11

# Derive ACCURACY/TPR/FPR/TNR/FNR from the KNN (k = 3) training confusion
# matrix, then print the one-row result.
KNN_3_train_conf_m_metrics <- confusion_matrix_metrics(KNN_3_train_conf_m)
KNN_3_train_conf_m_metrics

4.1.3 Overall Performance Metrics Table

# One-row summary for KNN (k = 3) on the training data: identifying labels,
# the AUC, and the confusion-matrix metrics.
KNN_3_train_metrics <- bind_cols(
  tibble(Algo = "KNN (k=3)", Data_Set = "Train"),
  KNN_3_train_pred_AUC,
  KNN_3_train_conf_m_metrics
)

KNN_3_train_metrics

4.1.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

Using the same approaches to make prediction on testing data

# Score the testing rows: predicted class, original columns, class probabilities.
KNN_3_test_pred <- bind_cols(
  predict(KNN_3_fit, data_test),
  data_test,
  predict(KNN_3_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# ROC curve on the testing data, scored by the predicted probability of BLACK.
autoplot(roc_curve(KNN_3_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a one-column tibble named AUC.
KNN_3_test_pred_AUC <- KNN_3_test_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data: predictions in rows, truth in columns.
KNN_3_test_conf_m <- table(
  .pred_class = KNN_3_test_pred$.pred_class,
  label = KNN_3_test_pred$label
) %>%
  as.matrix()

# Accuracy and rate metrics derived from the confusion matrix.
KNN_3_test_conf_m_metrics <- confusion_matrix_metrics(KNN_3_test_conf_m)

# One-row summary of all KNN (k = 3) metrics on the testing data.
KNN_3_test_metrics <- bind_cols(
  tibble(Algo = "KNN (k=3)", Data_Set = "Test"),
  KNN_3_test_pred_AUC,
  KNN_3_test_conf_m_metrics
)

KNN_3_test_metrics

5 KNN Classifier (k = 5)

Construct a KNN Classifier

# k-nearest-neighbours (kknn engine) with k = 5, predicting label from all
# other columns, fitted on the training set only.
# NOTE(review): translate() before fit() looks redundant; kept as-is, confirm.
KNN_5_fit <- nearest_neighbor(mode = "classification", neighbors = 5) %>%
  set_engine("kknn") %>%
  translate() %>%
  fit(formula = label ~ ., data = data_train)

KNN_5_fit

## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## kknn::train.kknn(formula = label ~ ., data = data, ks = ~5)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.3571429
## Best kernel: optimal
## Best k: 5

5.1 Model Prediction

5.1.1 Prediction on Training Set

# Predicted class, the original training columns, and the per-class
# probabilities side by side; .pred_class and label are moved to the front.
# NOTE(review): the data.frame() wrapper looks redundant (the testing-set
# pipeline omits it); it is kept so the result's class is unchanged.
KNN_5_train_pred <- data.frame(`.pred_class` = predict(KNN_5_fit, data_train)) %>%
  bind_cols(data_train, predict(KNN_5_fit, data_train, type = "prob")) %>%
  select(.pred_class, label, everything())


KNN_5_train_pred

5.1.2 Model Evaluation on Training Data

5.1.2.1 ROC Curve & AUC on Training Data

# ROC curve for the k = 5 model on the training data, scored by the predicted
# probability of BLACK.
KNN_5_train_pred %>% roc_curve(label, .pred_BLACK) %>% autoplot()

# Training AUC for the k = 5 model, kept as a one-column tibble named AUC.
# The input must be the k = 5 predictions; reading KNN_3_train_pred here would
# report the k = 3 model's AUC under the k = 5 name.
KNN_5_train_pred_AUC <- KNN_5_train_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(.estimate) %>%
  rename(AUC = `.estimate`)

KNN_5_train_pred_AUC

5.1.2.2 Metrics from Confusion Matrix

# Cross-tabulate predictions (rows) against true labels (columns).
KNN_5_train_conf_m <- table(
  .pred_class = KNN_5_train_pred$.pred_class,
  label = KNN_5_train_pred$label
) %>%
  as.matrix()

KNN_5_train_conf_m

##            label
## .pred_class BLACK BLUE
##       BLACK    17    3
##       BLUE      0    8

# Derive ACCURACY/TPR/FPR/TNR/FNR from the KNN (k = 5) training confusion
# matrix, then print the one-row result.
KNN_5_train_conf_m_metrics <- confusion_matrix_metrics(KNN_5_train_conf_m)
KNN_5_train_conf_m_metrics

5.1.3 Overall Performance Metrics Table

# One-row summary for KNN (k = 5) on the training data: identifying labels,
# the AUC, and the confusion-matrix metrics. The label uses a lowercase "k" so
# it matches the 'KNN (k=3)' and 'KNN (k=5)' labels used in the other tables.
KNN_5_train_metrics <- bind_cols(Algo = 'KNN (k=5)', 
                              Data_Set = 'Train',
                              AUC = KNN_5_train_pred_AUC,
                              KNN_5_train_conf_m_metrics)

KNN_5_train_metrics

5.1.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

Using the same approaches to make prediction on testing data

# Score the testing rows: predicted class, original columns, class probabilities.
KNN_5_test_pred <- bind_cols(
  predict(KNN_5_fit, data_test),
  data_test,
  predict(KNN_5_fit, data_test, type = "prob")
) %>%
  select(.pred_class, label, everything())

# ROC curve on the testing data, scored by the predicted probability of BLACK.
autoplot(roc_curve(KNN_5_test_pred, label, .pred_BLACK))

# AUC on the testing data, kept as a one-column tibble named AUC.
KNN_5_test_pred_AUC <- KNN_5_test_pred %>%
  roc_auc(label, .pred_BLACK) %>%
  select(AUC = .estimate)

# Confusion matrix on the testing data: predictions in rows, truth in columns.
KNN_5_test_conf_m <- table(
  .pred_class = KNN_5_test_pred$.pred_class,
  label = KNN_5_test_pred$label
) %>%
  as.matrix()

# Accuracy and rate metrics derived from the confusion matrix.
KNN_5_test_conf_m_metrics <- confusion_matrix_metrics(KNN_5_test_conf_m)

# One-row summary of all KNN (k = 5) metrics on the testing data.
KNN_5_test_metrics <- bind_cols(
  tibble(Algo = "KNN (k=5)", Data_Set = "Test"),
  KNN_5_test_pred_AUC,
  KNN_5_test_conf_m_metrics
)

KNN_5_test_metrics

6 Summary

6.1 Model Performance Metrics

6.1.1 Metrics of Training Set (Model’s Capacity to Learn)

# Stack the per-model training metrics into one table. Data_Set is dropped
# because every row here is a training-set row, then the table is rendered.
list(
  LR_train_metrics,
  NB_train_metrics,
  KNN_3_train_metrics,
  KNN_5_train_metrics
) %>%
  bind_rows() %>%
  select(-Data_Set) %>%
  kable(caption = "Model Performance Metrics on Training Data") #%>%

Model Performance Metrics on Training Data
Algo	AUC	ACCURACY	TPR	FPR	TNR	FNR
LR	0.6737968	0.6785714	0.8823529	0.6363636	0.3636364	0.1176471
NB	0.8235294	0.7857143	0.9411765	0.4545455	0.5454545	0.0588235
KNN (k=3)	1.0000000	1.0000000	1.0000000	0.0000000	1.0000000	0.0000000
KNN (K=5)	1.0000000	0.8928571	1.0000000	0.2727273	0.7272727	0.0000000

  #kable_styling(bootstrap_options = c('bordered','striped')) %>%
  #add_header_above(c('Model Performance Metrics on Training Data'=7))

6.1.2 Metrics of Testing Set (model’s Ability to Generalize)

# Stack the per-model testing metrics into one table. Data_Set is dropped
# because every row here is a testing-set row, then the table is rendered.
list(
  LR_test_metrics,
  NB_test_metrics,
  KNN_3_test_metrics,
  KNN_5_test_metrics
) %>%
  bind_rows() %>%
  select(-Data_Set) %>%
  kable(caption = "Model Performance Metrics on Testing Data") #%>%

Model Performance Metrics on Testing Data
Algo	AUC	ACCURACY	TPR	FPR	TNR	FNR
LR	0.8000000	0.750	0.8	0.3333333	0.6666667	0.2
NB	1.0000000	0.875	1.0	0.3333333	0.6666667	0.0
KNN (k=3)	0.6333333	0.625	0.8	0.6666667	0.3333333	0.2
KNN (k=5)	0.7333333	0.750	0.8	0.3333333	0.6666667	0.2

  #kable_styling(bootstrap_options = c('bordered','striped')) %>%
  #add_header_above(c('Model Performance Metrics on Testing Data'=7))

6.2 Model Performance Analysis

6.2.1 Model Description

1. Logistic Regression (LR): LR is a discriminative classifier, meaning it directly computes the conditional probability P(class|data). The classification is determined based on a given probability threshold. Also, given a sufficient sample size, logistic regression performs better than the Naive Bayes classifier in terms of accuracy.

2. Naive Bayes Classifier (NB): NB is a generative classifier because it first generates the joint probability P(class, data) and then computes the conditional probability P(class|data). The generated probability is an N-valued discrete output, where N is the number of classes in the response variable (the label); the classification is determined to be the class that holds the largest P(class|data). Also, regardless of sample size, Naive Bayes always converges faster to its asymptotic error than LR, which means it ‘learns’ faster.

3. KNN Classifier: KNN is a discriminative algorithm since it directly models the conditional probability P(class|data). The performance of KNN depends on the choice of K and is negatively impacted by the complexity of the variable space (the number of variables) because it directly focuses on the distance between the target data point and the other data points. As the variable space becomes more and more complex, distances computed in high dimensions become a poor representation of similarity. The data used in this project has only two predictor variables, therefore KNN can perform well.

6.2.2 Selection of Model that Has Best Capacity to Learn

According to the model performance statistics of the training data, the KNN model with K = 3 has the best performance statistics in terms of AUC, Accuracy, TPR, FPR, TNR and FNR, followed by KNN with K = 5, Naive Bayes and Logistic Regression, in that order. Therefore KNN with K = 3 shows the best capacity to learn among all models in this project.

6.2.3 Selection of Model that Has Best Performance on Generalizing.

According to the model performance statistics of the testing data, Naive Bayes (NB) has the best performance statistics in terms of AUC, Accuracy, TPR, FPR, TNR and FNR, followed by Logistic Regression, KNN with K = 5 and KNN with K = 3, in that order. Therefore Naive Bayes shows the best ability to generalize among all models in this project.

However, from a business perspective, because the models are built on a data set that contains only 36 observations, the performance of the models should be re-evaluated once more data are collected in future business operations.

DATA622 ASSIGNMENT 1 - Run the model exercise

DATA622 ASSIGNMENT 1 - Run the model exercise

1 Data Exploration

1.1 Load Data

1.2 Data Summary

1.3 Train Test Split

1.3.1 Training Set

1.3.2 Testing Set

2 Logistic Regression

2.1 Construct Logistic Regression Classifier

2.2 Model Prediction

2.2.1 Prediction on Training Set

2.2.2 Model Evaluation on Training Data

2.2.2.1 ROC Curve & AUC on Training Data

2.2.2.2 Metrics from Confusion Matrix

2.2.3 Overall Performance Metrics Table

2.2.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

3 Naive Bayes Classifier

3.1 Model Prediction

3.1.1 Prediction on Training Set

3.1.2 Model Evaluation on Training Data

3.1.2.1 ROC Curve & AUC on Training Data

3.1.2.2 Metrics from Confusion Matrix

3.1.3 Overall Performance Metrics Table

3.1.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

4 KNN Classifier (k = 3)

4.1 Model Prediction

4.1.1 Prediction on Training Set

4.1.2 Model Evaluation on Training Data

4.1.2.1 ROC Curve & AUC on Training Data

4.1.2.2 Metrics from Confusion Matrix

4.1.3 Overall Performance Metrics Table

4.1.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

5 KNN Classifier (k = 5)

5.1 Model Prediction

5.1.1 Prediction on Training Set

5.1.2 Model Evaluation on Training Data

5.1.2.1 ROC Curve & AUC on Training Data

5.1.2.2 Metrics from Confusion Matrix

5.1.3 Overall Performance Metrics Table

5.1.4 Prediction on Testing Data & Calculate Overall Performance Metrics Table

6 Summary

6.1 Model Performance Metrics

6.1.1 Metrics of Training Set (Model’s Capacity to Learn)

6.1.2 Metrics of Testing Set (model’s Ability to Generalize)

6.2 Model Performance Analysis

6.2.1 Model Description

6.2.2 Selection of Model that Has Best Capacity to Learn

6.2.3 Selection of Model that Has Best Performance on Generalizing.