1 Data Exploration

The dataset contains 36 observations of 3 variables and has no missing values.

X and Y are the predictor variables and label is the target variable; X is numerical, while Y and label are categorical.

## Rows: 36
## Columns: 3
## $ X     <dbl> 5, 5, 5, 5, 5, 5, 19, 19, 19, 19, 19, 19, 35, 35, 35, 35...
## $ Y     <fct> a, b, c, d, e, f, a, b, c, d, e, f, a, b, c, d, e, f, a,...
## $ label <fct> BLUE, BLACK, BLUE, BLACK, BLACK, BLACK, BLUE, BLUE, BLUE...
##        X      Y       label   
##  Min.   : 5   a:6   BLACK:22  
##  1st Qu.:19   b:6   BLUE :14  
##  Median :43   c:6             
##  Mean   :38   d:6             
##  3rd Qu.:55   e:6             
##  Max.   :63   f:6
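
The output above comes from standard structure and summary calls. A minimal sketch of how it could be reproduced, assuming the data are loaded into a data frame named df (the file name below is hypothetical):

library(dplyr)
library(readr)

df <- read_csv("data.csv") %>%                        # hypothetical file name
  mutate(Y = as.factor(Y), label = as.factor(label))  # Y and label as categorical factors

glimpse(df)     # 36 rows, 3 columns, column types
summary(df)     # distribution of X, level counts of Y and label
sum(is.na(df))  # 0, i.e. no missing values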

2 Building Models

Below, I model the dataset using Logistic Regression, Naive Bayes, and kNN (k = 3 and k = 5), fitting each model on a training set and evaluating it on a held-out testing set.
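
The confusion matrices below imply a training set of 26 observations and a testing set of 10. The report does not show the splitting code, so the sketch below is an assumption using rsample (the seed and stratification are hypothetical):

library(rsample)

set.seed(123)                                   # hypothetical seed
data_split <- initial_split(df, prop = 26/36, strata = label)
train <- training(data_split)                   # ~26 observations
test  <- testing(data_split)                    # ~10 observations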

2.1 Logistic Regression

## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:  stats::glm(formula = label ~ ., family = stats::binomial, data = data)
## 
## Coefficients:
## (Intercept)            X           Yb           Yc           Yd  
##   -21.76313      0.04214     20.30670     39.69995     18.87678  
##          Ye           Yf  
##    18.64541     19.33477  
## 
## Degrees of Freedom: 25 Total (i.e. Null);  19 Residual
## Null Deviance:       34.65 
## Residual Deviance: 19.69     AIC: 33.69
##           label
## prediction BLACK BLUE
##      BLACK    15    4
##      BLUE      1    6

Training
Algo   AUC      ACCURACY   TPR      FPR   TNR   FNR
LR     0.8875   0.8076923  0.9375   0.4   0.6   0.0625
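
These training metrics follow directly from the confusion matrix above, with BLACK treated as the positive class. A quick check by hand:

# Training confusion matrix counts (prediction vs. label), BLACK as positive class
TP <- 15; FN <- 1   # actual BLACK: predicted BLACK / predicted BLUE
FP <- 4;  TN <- 6   # actual BLUE:  predicted BLACK / predicted BLUE

TP / (TP + FN)                    # TPR = 15/16 = 0.9375
FP / (FP + TN)                    # FPR = 4/10  = 0.4
TN / (FP + TN)                    # TNR = 0.6
FN / (TP + FN)                    # FNR = 0.0625
(TP + TN) / (TP + TN + FP + FN)   # accuracy = 21/26 ≈ 0.8077
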
##           label
## prediction BLACK BLUE
##      BLACK     3    3
##      BLUE      3    1

Testing
Algo   AUC    ACCURACY   TPR    FPR    TNR    FNR
LR     0.25   0.4        0.5    0.75   0.25   0.5
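
A minimal sketch of how the parsnip logistic regression fit and the training confusion matrix above could be produced, assuming the train data frame from the split sketch in Section 2:

library(parsnip)

lr_fit <- logistic_reg() %>%
  set_engine("glm") %>%            # stats::glm with a binomial family, as in the call above
  set_mode("classification") %>%
  fit(label ~ ., data = train)

lr_train_pred <- predict(lr_fit, new_data = train)$.pred_class
table(prediction = lr_train_pred, label = train$label)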

2.2 Naive Bayes

## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##     BLACK      BLUE 
## 0.6153846 0.3846154 
## 
## Conditional probabilities:
##        X
## Y         [,1]     [,2]
##   BLACK 38.125 19.91942
##   BLUE  45.800 20.48739
## 
##        Y
## Y           a     b     c     d     e     f
##   BLACK 0.250 0.125 0.000 0.125 0.250 0.250
##   BLUE  0.000 0.200 0.400 0.100 0.100 0.200
##           label
## prediction BLACK BLUE
##      BLACK    15    5
##      BLUE      1    5

Training
Algo   AUC       ACCURACY   TPR      FPR   TNR   FNR
LR     0.88750   0.8076923  0.9375   0.4   0.6   0.0625
NB     0.89375   0.7692308  0.9375   0.5   0.5   0.0625
##           label
## prediction BLACK BLUE
##      BLACK     4    3
##      BLUE      2    1

Testing
Algo   AUC        ACCURACY   TPR        FPR    TNR    FNR
LR     0.2500000  0.4        0.5000000  0.75   0.25   0.5000000
NB     0.2916667  0.5        0.6666667  0.75   0.25   0.3333333
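
The call shown above (naiveBayes.default) matches e1071::naiveBayes. A minimal sketch, again assuming the train data frame from the earlier split:

library(e1071)

nb_fit <- naiveBayes(label ~ ., data = train, laplace = 0)
nb_fit                                               # priors and conditional probabilities, as printed above

nb_train_pred <- predict(nb_fit, newdata = train)    # class predictions
table(prediction = nb_train_pred, label = train$label)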

2.3 kNN

2.3.1 k=3

## parsnip model object
## 
## Fit time:  10ms 
## 
## Call:
## kknn::train.kknn(formula = label ~ ., data = data, ks = ~3)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.4230769
## Best kernel: optimal
## Best k: 3
##           label
## prediction BLACK BLUE
##      BLACK    16    0
##      BLUE      0   10

Training
Algo    AUC       ACCURACY   TPR      FPR   TNR   FNR
LR      0.88750   0.8076923  0.9375   0.4   0.6   0.0625
NB      0.89375   0.7692308  0.9375   0.5   0.5   0.0625
kNN_3   1.00000   1.0000000  1.0000   0.0   1.0   0.0000
##           label
## prediction BLACK BLUE
##      BLACK     5    3
##      BLUE      1    1

Testing
Algo    AUC        ACCURACY   TPR        FPR    TNR    FNR
LR      0.2500000  0.4        0.5000000  0.75   0.25   0.5000000
NB      0.2916667  0.5        0.6666667  0.75   0.25   0.3333333
kNN_3   0.5208333  0.6        0.8333333  0.75   0.25   0.1666667
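
A minimal sketch of the k = 3 fit; parsnip's nearest_neighbor with the kknn engine produces the kknn::train.kknn call shown above, and the k = 5 model is identical except for neighbors = 5 (again assuming the train data frame from the earlier split):

library(parsnip)

knn3_fit <- nearest_neighbor(neighbors = 3) %>%
  set_engine("kknn") %>%
  set_mode("classification") %>%        # nominal response
  fit(label ~ ., data = train)

knn3_train_pred <- predict(knn3_fit, new_data = train)$.pred_class
table(prediction = knn3_train_pred, label = train$label)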

2.3.2 k=5

## parsnip model object
## 
## Fit time:  0ms 
## 
## Call:
## kknn::train.kknn(formula = label ~ ., data = data, ks = ~5)
## 
## Type of response variable: nominal
## Minimal misclassification: 0.1923077
## Best kernel: optimal
## Best k: 5
##           label
## prediction BLACK BLUE
##      BLACK    16    3
##      BLUE      0    7

Training
Algo    AUC        ACCURACY   TPR      FPR   TNR   FNR
LR      0.887500   0.8076923  0.9375   0.4   0.6   0.0625
NB      0.893750   0.7692308  0.9375   0.5   0.5   0.0625
kNN_3   1.000000   1.0000000  1.0000   0.0   1.0   0.0000
kNN_5   0.984375   0.8846154  1.0000   0.3   0.7   0.0000
##           label
## prediction BLACK BLUE
##      BLACK     5    2
##      BLUE      1    2

Testing
Algo    AUC        ACCURACY   TPR        FPR    TNR    FNR
LR      0.2500000  0.4        0.5000000  0.75   0.25   0.5000000
NB      0.2916667  0.5        0.6666667  0.75   0.25   0.3333333
kNN_3   0.5208333  0.6        0.8333333  0.75   0.25   0.1666667
kNN_5   0.6041667  0.7        0.8333333  0.50   0.50   0.1666667

3 Performance

3.1 Metrics of Training Data

Training
Algo   AUC      ACCURACY   TPR      FPR   TNR   FNR
LR     0.8875   0.8077     0.9375   0.4   0.6   0.0625
NB     0.8938   0.7692     0.9375   0.5   0.5   0.0625
kNN3   1        1          1        0     1     0
kNN5   0.9844   0.8846     1        0.3   0.7   0

3.2 Metrics of Testing Data

Testing
Algo   AUC      ACCURACY   TPR      FPR    TNR    FNR
LR     0.25     0.4        0.5      0.75   0.25   0.5
NB     0.2917   0.5        0.6667   0.75   0.25   0.3333
kNN3   0.5208   0.6        0.8333   0.75   0.25   0.1667
kNN5   0.6042   0.7        0.8333   0.5    0.5    0.1667
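
TPR is the sensitivity (recall for the positive class), TNR is the specificity, and FPR and FNR are their complements. A minimal sketch of how metrics like these could be computed with yardstick, assuming the lr_fit model and test data frame from the earlier sketches and BLACK as the first factor level:

library(tibble)
library(yardstick)

lr_results <- tibble(
  truth      = test$label,                                        # actual labels
  pred_class = predict(lr_fit, new_data = test)$.pred_class,      # hard class predictions
  pred_BLACK = predict(lr_fit, new_data = test, type = "prob")$.pred_BLACK  # P(label = BLACK)
)

accuracy(lr_results, truth, pred_class)
roc_auc(lr_results, truth, pred_BLACK)    # AUC, with BLACK as the event level
sens(lr_results, truth, pred_class)       # TPR; FNR = 1 - TPR
spec(lr_results, truth, pred_class)       # TNR; FPR = 1 - TNR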

3.3 Short Summary

Logistic regression is a discriminative model. On the training data it has the lowest AUC and the second-lowest accuracy of the four models, and on the testing data it has the lowest AUC and accuracy. Overall, it performs the worst of the four models.

Naive Bayes is a generative model that models the joint probability of the predictors and the label. On the training data it has the second-lowest AUC and the lowest accuracy, and on the testing data it has the second-lowest AUC and accuracy. Overall, it does not perform well relative to the other models.

kNN stores the available cases and classifies new cases based on a similarity measure.

- The kNN model with k = 3 has the highest AUC and accuracy on the training data; it performs well on the training data.
- The kNN model with k = 3 has the second-highest AUC and accuracy on the testing data; it performs fairly well on the testing data.
- The kNN model with k = 5 has the second-highest AUC and accuracy on the training data; it performs fairly well on the training data.
- The kNN model with k = 5 has the highest AUC and accuracy on the testing data; it performs well on the testing data.
- Overall, the kNN models perform well.

4 Conclusion

4.1 Best Ability to Learn

The performance data show that the kNN models are the best of the four. While the results for k = 3 and k = 5 are similar, the kNN model with k = 3 has the best performance on the training data, i.e. the best ability to learn among all the models.

4.2 Best Ability to Generalize

The performance data show that the kNN models are the best of the four. While the results for k = 3 and k = 5 are similar, the kNN model with k = 5 has the best performance on the testing data, i.e. the best ability to generalize among all the models.

4.3 Future Improvement

For a small dataset with only 36 observations, it is hard to say which model will always perform best. The amount and structure of the data are not ideal, and not realistic compared to real-world problems, for any particular classifier. A larger dataset, with on the order of 1k~1M observations and more variables, would strengthen this comparison of the 4 models and support a more conclusive answer as to which model performs best.