my_confusion_matrix <- function(cf_table) {
  # Unpack the 2x2 table (rows = predicted, columns = actual,
  # with factor levels ordered c('low', 'high')) by linear index
  true_positive <- cf_table[4]
  true_negative <- cf_table[1]
  false_positive <- cf_table[2]
  false_negative <- cf_table[3]
  # Standard classification metrics
  accuracy <- (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)
  sensitivity_recall <- true_positive / (true_positive + false_negative)
  specificity_selectivity <- true_negative / (true_negative + false_positive)
  precision <- true_positive / (true_positive + false_positive)
  neg_pred_value <- true_negative / (true_negative + false_negative)
  print(cf_table)
  # Return the counts and metrics as labeled strings
  my_list <- list(sprintf("%1.0f = True Positive (TP), Hit", true_positive),
                  sprintf("%1.0f = True Negative (TN), Rejection", true_negative),
                  sprintf("%1.0f = False Positive (FP), Type 1 Error", false_positive),
                  sprintf("%1.0f = False Negative (FN), Type 2 Error", false_negative),
                  sprintf("%1.4f = Accuracy ((TP+TN)/(TP+TN+FP+FN))", accuracy),
                  sprintf("%1.4f = Sensitivity, Recall, Hit Rate, True Positive Rate (How many positives did the model get right? TP/(TP+FN))", sensitivity_recall),
                  sprintf("%1.4f = Specificity, Selectivity, True Negative Rate (How many negatives did the model get right? TN/(TN+FP))", specificity_selectivity),
                  sprintf("%1.4f = Precision, Positive Predictive Value (How good are the model's positive predictions? TP/(TP+FP))", precision),
                  sprintf("%1.4f = Negative Predictive Value (How good are the model's negative predictions? TN/(TN+FN))", neg_pred_value)
  )
  return(my_list)
}
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
df <- read_rds("mod6HE_logit.rds")
knn1 <- df %>% ungroup() %>%
select(store, week, region, high_med_rev, high_med_gp, high_med_gpm)
knn2 <- df %>%
mutate(high_med_units = factor(if_else(high_med_units==1, 'high', 'low'), levels=c('low', 'high')))
knn2 <- knn2 %>% ungroup() %>%
select(high_med_units, size, region,
promo_units_per,
altbev_units_per, confect_units_per, salty_units_per,
velocityA_units_per, velocityB_units_per, velocityC_units_per, velocityD_units_per, velocityNEW_units_per)
library(fastDummies)
knn2 <- fastDummies::dummy_cols(knn2, select_columns = c("region"), remove_selected_columns=T)
contrasts(knn2$high_med_units)
## high
## low 0
## high 1
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(77)
partition <- caret::createDataPartition(y=knn2$high_med_units, p=.75, list=FALSE)
data_train <- knn2[partition, ]
data_test <- knn2[-partition, ]
X_train <- data_train %>%
select(-high_med_units)
X_test <- data_test %>%
select(-high_med_units)
y_train <- data_train$high_med_units
y_test <- data_test$high_med_units
# Note: each set is scaled with its own means/SDs here; scaling X_test with
# the training set's centering and scaling values is the more standard practice
X_train <- scale(X_train)
X_test <- scale(X_test)
library(class)
# k is set to half the square root of the training-set size, a common heuristic
knn_prediction <- class::knn(train=X_train, test=X_test, cl=y_train, k=round(sqrt(nrow(data_train))/2))
table2 <- table(knn_prediction, y_test)
my_confusion_matrix(table2)
## y_test
## knn_prediction low high
## low 936 215
## high 324 1042
## [[1]]
## [1] "1042 = True Positive (TP), Hit"
##
## [[2]]
## [1] "936 = True Negative (TN), Rejection"
##
## [[3]]
## [1] "324 = False Positive (FP), Type 1 Error"
##
## [[4]]
## [1] "215 = False Negative (FN), Type 2 Error"
##
## [[5]]
## [1] "0.7859 = Accuracy (TP+TN/(TP+TN+FP+FN))"
##
## [[6]]
## [1] "0.8290 = Sensitivity, Recall, Hit Rate, True Positive Rate (How many positives did the model get right? TP/(TP+FN))"
##
## [[7]]
## [1] "0.7429 = Specificity, Selectivity, True Negative Rate (How many negatives did the model get right? TN/(TN+FP))"
##
## [[8]]
## [1] "0.7628 = Precision, Positive Predictive Value (How good are the model's positive predictions? TP/(TP+FP))"
##
## [[9]]
## [1] "0.8132 = Negative Predictive Value (How good are the model's negative predictions? TN/(TN+FN)"
data_test$knn <- knn_prediction
data_test <- data_test %>%
mutate(correct_knn = if_else(knn == high_med_units, 'correct', 'WRONG!'))
temp1 <- knn1[-partition, ]
full_test_knn <- bind_cols(temp1, data_test)
full_test_knn <- full_test_knn %>%
select(store, week, high_med_units, knn, correct_knn, size, region, promo_units_per, salty_units_per)
slice_sample(full_test_knn, n=10)
## # A tibble: 10 × 9
## store week high_med_units knn correct_knn size region promo_un…¹ salty…²
## <fct> <dbl> <fct> <fct> <chr> <int> <fct> <dbl> <dbl>
## 1 186 49 high high correct 966 ONTARIO 0.279 0.210
## 2 68287 36 high high correct 949 WEST 0.331 0.177
## 3 38862 37 high high correct 930 WEST 0.270 0.161
## 4 77831 21 high high correct 916 WEST 0.256 0.139
## 5 37668 8 high high correct 971 WEST 0.304 0.191
## 6 37668 45 low high WRONG! 971 WEST 0.348 0.130
## 7 448 39 low low correct 969 ONTARIO 0.324 0.147
## 8 37531 40 low high WRONG! 891 WEST 0.325 0.223
## 9 2669 31 high high correct 944 WEST 0.377 0.153
## 10 186 35 high low WRONG! 966 ONTARIO 0.331 0.169
## # … with abbreviated variable names ¹promo_units_per, ²salty_units_per
tree1 <- df %>% ungroup() %>%
select(store, week, high_med_rev, high_med_gp, high_med_gpm)
tree2 <- df %>%
mutate(high_med_units = factor(if_else(high_med_units==1, 'high', 'low'), levels=c('low', 'high')),
region = factor(region))
tree2 <- tree2 %>% ungroup() %>%
select(high_med_units, size, region,
promo_units_per,
altbev_units_per, confect_units_per, salty_units_per,
velocityA_units_per, velocityB_units_per, velocityC_units_per, velocityD_units_per, velocityNEW_units_per)
contrasts(tree2$high_med_units)
## high
## low 0
## high 1
library(caret)
set.seed(77)
partition <- caret::createDataPartition(y=tree2$high_med_units, p=.75, list=FALSE)
data_train <- tree2[partition, ]
data_test <- tree2[-partition, ]
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.2
model_tree <- rpart::rpart(high_med_units ~ ., data_train)
predict_tree <- predict(model_tree, data_test, type='class')
table1 <- table(predict_tree, data_test$high_med_units)
my_confusion_matrix(table1)
##
## predict_tree low high
## low 852 217
## high 408 1040
## [[1]]
## [1] "1040 = True Positive (TP), Hit"
##
## [[2]]
## [1] "852 = True Negative (TN), Rejection"
##
## [[3]]
## [1] "408 = False Positive (FP), Type 1 Error"
##
## [[4]]
## [1] "217 = False Negative (FN), Type 2 Error"
##
## [[5]]
## [1] "0.7517 = Accuracy (TP+TN/(TP+TN+FP+FN))"
##
## [[6]]
## [1] "0.8274 = Sensitivity, Recall, Hit Rate, True Positive Rate (How many positives did the model get right? TP/(TP+FN))"
##
## [[7]]
## [1] "0.6762 = Specificity, Selectivity, True Negative Rate (How many negatives did the model get right? TN/(TN+FP))"
##
## [[8]]
## [1] "0.7182 = Precision, Positive Predictive Value (How good are the model's positive predictions? TP/(TP+FP))"
##
## [[9]]
## [1] "0.7970 = Negative Predictive Value (How good are the model's negative predictions? TN/(TN+FN)"
rpart.plot::rpart.plot(model_tree, box.palette = 'RdBu', shadow.col = 'gray', nn=TRUE, yesno=2)
data_test$tree <- predict_tree
data_test <- data_test %>%
mutate(correct_tree = if_else(tree == high_med_units, 'correct', 'WRONG!'))
temp1 <- tree1[-partition, ]
full_test_tree <- bind_cols(temp1, data_test)
full_test_tree <- full_test_tree %>%
select(store, week, high_med_units, tree, correct_tree, size, region, promo_units_per, salty_units_per)
slice_sample(full_test_tree, n=10)
## # A tibble: 10 × 9
## store week high_med_units tree correct_tree size region promo_…¹ salty…²
## <fct> <dbl> <fct> <fct> <chr> <int> <fct> <dbl> <dbl>
## 1 12685 12 low low correct 896 ATLANTIC 0.361 0.179
## 2 2519 22 low high WRONG! 917 WEST 0.307 0.181
## 3 68287 18 high high correct 949 WEST 0.383 0.167
## 4 16110 2 low low correct 915 QUEBEC 0.385 0.166
## 5 77809 31 high high correct 982 WEST 0.396 0.143
## 6 38869 11 high high correct 1031 WEST 0.319 0.186
## 7 501 48 low low correct 931 ONTARIO 0.388 0.145
## 8 59222 51 high high correct 981 ONTARIO 0.306 0.200
## 9 1446 41 high low WRONG! 944 ONTARIO 0.390 0.196
## 10 38967 43 high high correct 1046 WEST 0.294 0.219
## # … with abbreviated variable names ¹promo_units_per, ²salty_units_per
full_test <- bind_cols(full_test_knn %>%
select(store, week, high_med_units, knn, correct_knn),
full_test_tree %>%
select(-store, -week, -high_med_units))
slice_sample(full_test, n=10)
## # A tibble: 10 × 11
## store week high_m…¹ knn corre…² tree corre…³ size region promo…⁴ salty…⁵
## <fct> <dbl> <fct> <fct> <chr> <fct> <chr> <int> <fct> <dbl> <dbl>
## 1 38952 51 low high WRONG! high WRONG! 924 WEST 0.333 0.161
## 2 85964 14 high high correct high correct 946 WEST 0.295 0.197
## 3 85527 31 high high correct high correct 997 WEST 0.379 0.256
## 4 12816 40 high high correct high correct 897 QUEBEC 0.286 0.199
## 5 14136 19 high high correct high correct 893 QUEBEC 0.312 0.149
## 6 37531 7 high high correct high correct 891 WEST 0.340 0.235
## 7 13541 14 low low correct low correct 904 QUEBEC 0.457 0.141
## 8 14161 1 low high WRONG! high WRONG! 923 QUEBEC 0.308 0.153
## 9 91298 14 low high WRONG! high WRONG! 939 WEST 0.319 0.175
## 10 92589 37 low low correct low correct 904 WEST 0.454 0.166
## # … with abbreviated variable names ¹high_med_units, ²correct_knn,
## # ³correct_tree, ⁴promo_units_per, ⁵salty_units_per
Questions
Question 1: The False Positive (FP), Type 1 Error is higher, at 324, compared to 215 False Negatives (FN), Type 2 Errors.
Question 2a: Sensitivity is higher, at 0.8290 (83%), versus specificity at 0.7429 (74%). Sensitivity = TP/(TP+FN); Specificity = TN/(TN+FP).
Question 2b: The equations for sensitivity and specificity are:
- Sensitivity = TP/(TP+FN) = 1042/(1042+215) = 1042/1257 = 0.83
- Specificity = TN/(TN+FP) = 936/(936+324) = 936/1260 = 0.74
Sensitivity comes out larger because TP (1042) dwarfs FN (215), while specificity comes out smaller because FP (324) makes up a bigger share of the actual negatives.
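As a quick check, these values can be computed directly from the KNN confusion matrix table2 above, using the same linear indexing as my_confusion_matrix:

# Verify the Question 2b arithmetic from the KNN confusion matrix
tp <- table2[4]; tn <- table2[1]; fp <- table2[2]; fn <- table2[3]
tp / (tp + fn)  # sensitivity: 1042/1257 = 0.8290
tn / (tn + fp)  # specificity: 936/1260 = 0.7429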
Question 2c: The higher sensitivity rate (the highest of all the measures) means the model is good at telling NANSE which stores will sell a high number of units.
Question 2d: Matthews Correlation Coefficient:
MCC = (TP×TN − FP×FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
The value of MCC ranges from -1 to 1, where:
- -1 indicates total disagreement between predicted and actual classes
- 0 is synonymous with completely random guessing
- 1 indicates total agreement between predicted and actual classes
The MCC is useful when the classes are unbalanced. It takes into account all four values in the confusion matrix, and a high value (close to 1) means that both classes are predicted well, even if one class is disproportionately under- (or over-) represented.
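As a sketch, the MCC for the KNN model can be computed in base R from table2; the counts are converted to doubles first because the product under the square root overflows R's 32-bit integers:

# Matthews Correlation Coefficient for the KNN model
tp <- as.numeric(table2[4]); tn <- as.numeric(table2[1])
fp <- as.numeric(table2[2]); fn <- as.numeric(table2[3])
mcc <- (tp * tn - fp * fn) /
  sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc  # roughly 0.57 here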
Question 3a: Size is the most important variable.
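One way to check this against the fitted model is rpart's built-in importance scores; a minimal sketch using the model_tree object from above:

# Importance score for each predictor, largest first
model_tree$variable.importance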
Question 3b: The answer is not Ontario or Atlantic; therefore, it is Quebec and West.
Question 4a: The KNN model is better. It has a higher accuracy rate, 0.7859 (79%), versus 0.7517 (75%) for the decision tree model and 0.7465 (75%) for the logistic regression model.
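As a sketch, the two accuracy figures computed here can be reproduced directly from the stored tables (table2 for KNN, table1 for the tree):

# Accuracy = correct predictions / all predictions
sum(diag(table2)) / sum(table2)  # KNN: 0.7859
sum(diag(table1)) / sum(table1)  # tree: 0.7517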
Question 4b: The KNN model is better here as well. Its precision, or positive predictive value, is 0.7628 (76%) versus 0.7182 (72%) for the decision tree model and 0.7420 (74%) for the logistic regression model. The KNN model is also slightly better than the decision tree in sensitivity, 0.8290 (83%) versus 0.8274 (83%). While that difference is largely insignificant, given the KNN model's higher scores on the other two relevant measures it is the better model. The logistic regression model has a sensitivity rate of 0.7653 (77%).
Question 5a: Naive Bayes is simple to implement: the conditional probabilities are easy to evaluate. It is also very fast: no iterations are required, since the probabilities can be computed directly, which makes the technique useful where training speed is important. If the conditional independence assumption holds, it can give great results.
Question 5b: The conditional independence assumption does not always hold; in most situations the features show some form of dependency. There is also the zero probability problem: when the test data contain words for a particular class that are not present in the training data, we can end up with zero class probabilities. For example, P(bumper | ham) is 0 if "bumper" does not occur in any ham (non-spam) documents in the training data.
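A minimal sketch of the standard fix, Laplace (add-one) smoothing, assuming the naivebayes package mentioned in Question 5c and a made-up toy data set (the names toy and bumper are hypothetical):

library(naivebayes)
# Toy spam/ham data in which 'bumper' never appears in a ham document
toy <- data.frame(label  = factor(c('ham', 'ham', 'spam', 'spam')),
                  bumper = factor(c('no', 'no', 'yes', 'no'), levels = c('no', 'yes')))
# laplace = 0 reproduces the problem: P(bumper = 'yes' | ham) = 0
m0 <- naive_bayes(label ~ bumper, data = toy, laplace = 0)
# laplace = 1 adds one pseudo-count per cell, so every probability is positive
m1 <- naive_bayes(label ~ bumper, data = toy, laplace = 1)
tables(m1)  # inspect the smoothed conditional probability tables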
Question 5c:

# Install and load packages
# install.packages("naivebayes")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("psych")
library(naivebayes)
library(dplyr)
library(ggplot2)
library(psych)

# Load data
df <- read.csv("D:/RStudio/NaiveClassifiaction/binary.csv", header = TRUE)
head(df)