my_confusion_matrix <- function(cf_table) {
  # Unpack the 2x2 table (rows = predicted, columns = actual,
  # with factor levels ordered c('low', 'high')) by linear index
  true_positive <- cf_table[4]
  true_negative <- cf_table[1]
  false_positive <- cf_table[2]
  false_negative <- cf_table[3]
  # Standard classification metrics
  accuracy <- (true_positive + true_negative) / (true_positive + true_negative + false_positive + false_negative)
  sensitivity_recall <- true_positive / (true_positive + false_negative)
  specificity_selectivity <- true_negative / (true_negative + false_positive)
  precision <- true_positive / (true_positive + false_positive)
  neg_pred_value <- true_negative / (true_negative + false_negative)
  print(cf_table)
  # Return the counts and metrics as labeled strings
  my_list <- list(sprintf("%1.0f = True Positive (TP), Hit", true_positive),
                  sprintf("%1.0f = True Negative (TN), Rejection", true_negative),
                  sprintf("%1.0f = False Positive (FP), Type 1 Error", false_positive),
                  sprintf("%1.0f = False Negative (FN), Type 2 Error", false_negative),
                  sprintf("%1.4f = Accuracy ((TP+TN)/(TP+TN+FP+FN))", accuracy),
                  sprintf("%1.4f = Sensitivity, Recall, Hit Rate, True Positive Rate (How many positives did the model get right? TP/(TP+FN))", sensitivity_recall),
                  sprintf("%1.4f = Specificity, Selectivity, True Negative Rate (How many negatives did the model get right? TN/(TN+FP))", specificity_selectivity),
                  sprintf("%1.4f = Precision, Positive Predictive Value (How good are the model's positive predictions? TP/(TP+FP))", precision),
                  sprintf("%1.4f = Negative Predictive Value (How good are the model's negative predictions? TN/(TN+FN))", neg_pred_value)
  )
  return(my_list)
}
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
df <- read_rds("mod6HE_logit.rds")
knn1 <- df %>% ungroup() %>%
select(store, week, region, high_med_rev, high_med_gp, high_med_gpm)
knn2 <- df %>%
mutate(high_med_units = factor(if_else(high_med_units==1, 'high', 'low'), levels=c('low', 'high')))
knn2 <- knn2 %>% ungroup() %>%
select(high_med_units, size, region,
promo_units_per,
altbev_units_per, confect_units_per, salty_units_per,
velocityA_units_per, velocityB_units_per, velocityC_units_per, velocityD_units_per, velocityNEW_units_per)
library(fastDummies)
knn2 <- fastDummies::dummy_cols(knn2, select_columns = c("region"), remove_selected_columns=T)
contrasts(knn2$high_med_units)
## high
## low 0
## high 1
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
set.seed(77)
partition <- caret::createDataPartition(y=knn2$high_med_units, p=.75, list=FALSE)
data_train <- knn2[partition, ]
data_test <- knn2[-partition, ]
X_train <- data_train %>%
select(-high_med_units)
X_test <- data_test %>%
select(-high_med_units)
y_train <- data_train$high_med_units
y_test <- data_test$high_med_units
# Note: each set is scaled with its own means/SDs here; scaling X_test with
# the training set's centering and scaling values is the more standard practice
X_train <- scale(X_train)
X_test <- scale(X_test)
library(class)
# k is set to half the square root of the training-set size, a common heuristic
knn_prediction <- class::knn(train=X_train, test=X_test, cl=y_train, k=round(sqrt(nrow(data_train))/2))
table2 <- table(knn_prediction, y_test)
my_confusion_matrix(table2)
## y_test
## knn_prediction low high
## low 936 215
## high 324 1042
## [[1]]
## [1] "1042 = True Positive (TP), Hit"
##
## [[2]]
## [1] "936 = True Negative (TN), Rejection"
##
## [[3]]
## [1] "324 = False Positive (FP), Type 1 Error"
##
## [[4]]
## [1] "215 = False Negative (FN), Type 2 Error"
##
## [[5]]
## [1] "0.7859 = Accuracy (TP+TN/(TP+TN+FP+FN))"
##
## [[6]]
## [1] "0.8290 = Sensitivity, Recall, Hit Rate, True Positive Rate (How many positives did the model get right? TP/(TP+FN))"
##
## [[7]]
## [1] "0.7429 = Specificity, Selectivity, True Negative Rate (How many negatives did the model get right? TN/(TN+FP))"
##
## [[8]]
## [1] "0.7628 = Precision, Positive Predictive Value (How good are the model's positive predictions? TP/(TP+FP))"
##
## [[9]]
## [1] "0.8132 = Negative Predictive Value (How good are the model's negative predictions? TN/(TN+FN)"
data_test$knn <- knn_prediction
data_test <- data_test %>%
mutate(correct_knn = if_else(knn == high_med_units, 'correct', 'WRONG!'))
temp1 <- knn1[-partition, ]
full_test_knn <- bind_cols(temp1, data_test)
full_test_knn <- full_test_knn %>%
select(store, week, high_med_units, knn, correct_knn, size, region, promo_units_per, salty_units_per)
slice_sample(full_test_knn, n=10)
## # A tibble: 10 × 9
## store week high_med_units knn correct_knn size region promo_un…¹ salty…²
## <fct> <dbl> <fct> <fct> <chr> <int> <fct> <dbl> <dbl>
## 1 186 49 high high correct 966 ONTARIO 0.279 0.210
## 2 68287 36 high high correct 949 WEST 0.331 0.177
## 3 38862 37 high high correct 930 WEST 0.270 0.161
## 4 77831 21 high high correct 916 WEST 0.256 0.139
## 5 37668 8 high high correct 971 WEST 0.304 0.191
## 6 37668 45 low high WRONG! 971 WEST 0.348 0.130
## 7 448 39 low low correct 969 ONTARIO 0.324 0.147
## 8 37531 40 low high WRONG! 891 WEST 0.325 0.223
## 9 2669 31 high high correct 944 WEST 0.377 0.153
## 10 186 35 high low WRONG! 966 ONTARIO 0.331 0.169
## # … with abbreviated variable names ¹promo_units_per, ²salty_units_per
tree1 <- df %>% ungroup() %>%
select(store, week, high_med_rev, high_med_gp, high_med_gpm)
tree2 <- df %>%
mutate(high_med_units = factor(if_else(high_med_units==1, 'high', 'low'), levels=c('low', 'high')),
region = factor(region))
tree2 <- tree2 %>% ungroup() %>%
select(high_med_units, size, region,
promo_units_per,
altbev_units_per, confect_units_per, salty_units_per,
velocityA_units_per, velocityB_units_per, velocityC_units_per, velocityD_units_per, velocityNEW_units_per)
contrasts(tree2$high_med_units)
## high
## low 0
## high 1
library(caret)
set.seed(77)
partition <- caret::createDataPartition(y=tree2$high_med_units, p=.75, list=FALSE)
data_train <- tree2[partition, ]
data_test <- tree2[-partition, ]
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.2.2
model_tree <- rpart::rpart(high_med_units ~ ., data_train)
predict_tree <- predict(model_tree, data_test, type='class')
table1 <- table(predict_tree, data_test$high_med_units)
my_confusion_matrix(table1)
##
## predict_tree low high
## low 852 217
## high 408 1040
## [[1]]
## [1] "1040 = True Positive (TP), Hit"
##
## [[2]]
## [1] "852 = True Negative (TN), Rejection"
##
## [[3]]
## [1] "408 = False Positive (FP), Type 1 Error"
##
## [[4]]
## [1] "217 = False Negative (FN), Type 2 Error"
##
## [[5]]
## [1] "0.7517 = Accuracy (TP+TN/(TP+TN+FP+FN))"
##
## [[6]]
## [1] "0.8274 = Sensitivity, Recall, Hit Rate, True Positive Rate (How many positives did the model get right? TP/(TP+FN))"
##
## [[7]]
## [1] "0.6762 = Specificity, Selectivity, True Negative Rate (How many negatives did the model get right? TN/(TN+FP))"
##
## [[8]]
## [1] "0.7182 = Precision, Positive Predictive Value (How good are the model's positive predictions? TP/(TP+FP))"
##
## [[9]]
## [1] "0.7970 = Negative Predictive Value (How good are the model's negative predictions? TN/(TN+FN)"
rpart.plot::rpart.plot(model_tree, box.palette = 'RdBu', shadow.col = 'gray', nn=TRUE, yesno=2)
data_test$tree <- predict_tree
data_test <- data_test %>%
mutate(correct_tree = if_else(tree == high_med_units, 'correct', 'WRONG!'))
temp1 <- tree1[-partition, ]
full_test_tree <- bind_cols(temp1, data_test)
full_test_tree <- full_test_tree %>%
select(store, week, high_med_units, tree, correct_tree, size, region, promo_units_per, salty_units_per)
slice_sample(full_test_tree, n=10)
## # A tibble: 10 × 9
## store week high_med_units tree correct_tree size region promo_…¹ salty…²
## <fct> <dbl> <fct> <fct> <chr> <int> <fct> <dbl> <dbl>
## 1 12685 12 low low correct 896 ATLANTIC 0.361 0.179
## 2 2519 22 low high WRONG! 917 WEST 0.307 0.181
## 3 68287 18 high high correct 949 WEST 0.383 0.167
## 4 16110 2 low low correct 915 QUEBEC 0.385 0.166
## 5 77809 31 high high correct 982 WEST 0.396 0.143
## 6 38869 11 high high correct 1031 WEST 0.319 0.186
## 7 501 48 low low correct 931 ONTARIO 0.388 0.145
## 8 59222 51 high high correct 981 ONTARIO 0.306 0.200
## 9 1446 41 high low WRONG! 944 ONTARIO 0.390 0.196
## 10 38967 43 high high correct 1046 WEST 0.294 0.219
## # … with abbreviated variable names ¹promo_units_per, ²salty_units_per
full_test <- bind_cols(full_test_knn %>%
select(store, week, high_med_units, knn, correct_knn),
full_test_tree %>%
select(-store, -week, -high_med_units))
slice_sample(full_test, n=10)
## # A tibble: 10 × 11
## store week high_m…¹ knn corre…² tree corre…³ size region promo…⁴ salty…⁵
## <fct> <dbl> <fct> <fct> <chr> <fct> <chr> <int> <fct> <dbl> <dbl>
## 1 38952 51 low high WRONG! high WRONG! 924 WEST 0.333 0.161
## 2 85964 14 high high correct high correct 946 WEST 0.295 0.197
## 3 85527 31 high high correct high correct 997 WEST 0.379 0.256
## 4 12816 40 high high correct high correct 897 QUEBEC 0.286 0.199
## 5 14136 19 high high correct high correct 893 QUEBEC 0.312 0.149
## 6 37531 7 high high correct high correct 891 WEST 0.340 0.235
## 7 13541 14 low low correct low correct 904 QUEBEC 0.457 0.141
## 8 14161 1 low high WRONG! high WRONG! 923 QUEBEC 0.308 0.153
## 9 91298 14 low high WRONG! high WRONG! 939 WEST 0.319 0.175
## 10 92589 37 low low correct low correct 904 WEST 0.454 0.166
## # … with abbreviated variable names ¹high_med_units, ²correct_knn,
## # ³correct_tree, ⁴promo_units_per, ⁵salty_units_per
Questions
Question 1: The False Positive (FP), Type 1 Error is higher, at 324, compared to 215 False Negatives (FN), Type 2 Errors.
Question 2a: Sensitivity is higher, at 0.8290 (83%), versus specificity at 0.7429 (74%). Sensitivity = TP/(TP+FN); Specificity = TN/(TN+FP).
Question 2b: The equations for sensitivity and specificity are:
- Sensitivity = TP/(TP+FN) = 1042/(1042+215) = 1042/1257 = 0.83
- Specificity = TN/(TN+FP) = 936/(936+324) = 936/1260 = 0.74
Sensitivity comes out larger because TP (1042) dwarfs FN (215), while specificity comes out smaller because FP (324) makes up a bigger share of the actual negatives.
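As a quick check, these values can be computed directly from the KNN confusion matrix table2 above, using the same linear indexing as my_confusion_matrix:

# Verify the Question 2b arithmetic from the KNN confusion matrix
tp <- table2[4]; tn <- table2[1]; fp <- table2[2]; fn <- table2[3]
tp / (tp + fn)  # sensitivity: 1042/1257 = 0.8290
tn / (tn + fp)  # specificity: 936/1260 = 0.7429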
Question 2c: The higher sensitivity rate (the highest of all the measures) means the model is good at telling NANSE which stores will sell a high number of units.
Question 2d: Matthews Correlation Coefficient:
MCC = (TP×TN − FP×FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN))
The value of MCC ranges from -1 to 1, where:
- -1 indicates total disagreement between predicted and actual classes
- 0 is synonymous with completely random guessing
- 1 indicates total agreement between predicted and actual classes
The MCC is useful when the classes are unbalanced. It takes into account all four values in the confusion matrix, and a high value (close to 1) means that both classes are predicted well, even if one class is disproportionately under- (or over-) represented.
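As a sketch, the MCC for the KNN model can be computed in base R from table2; the counts are converted to doubles first because the product under the square root overflows R's 32-bit integers:

# Matthews Correlation Coefficient for the KNN model
tp <- as.numeric(table2[4]); tn <- as.numeric(table2[1])
fp <- as.numeric(table2[2]); fn <- as.numeric(table2[3])
mcc <- (tp * tn - fp * fn) /
  sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
mcc  # roughly 0.57 here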
Question 3a: Size is the most important variable.
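One way to check this against the fitted model is rpart's built-in importance scores; a minimal sketch using the model_tree object from above:

# Importance score for each predictor, largest first
model_tree$variable.importance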
Question 3b: The answer is not Ontario or Atlantic; therefore, it is Quebec and West.
Question 4a: The KNN model is better. It has a higher accuracy rate, 0.7859 (79%), versus 0.7517 (75%) for the decision tree model and 0.7465 (75%) for the logistic regression model.
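As a sketch, the two accuracy figures computed here can be reproduced directly from the stored tables (table2 for KNN, table1 for the tree):

# Accuracy = correct predictions / all predictions
sum(diag(table2)) / sum(table2)  # KNN: 0.7859
sum(diag(table1)) / sum(table1)  # tree: 0.7517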
Question 4b: The KNN model is better here as well. Its precision, or positive predictive value, is 0.7628 (76%) versus 0.7182 (72%) for the decision tree model and 0.7420 (74%) for the logistic regression model. The KNN model is also slightly better than the decision tree in sensitivity, 0.8290 (83%) versus 0.8274 (83%). While that difference is largely insignificant, given the KNN model's higher scores on the other two relevant measures it is the better model. The logistic regression model has a sensitivity rate of 0.7653 (77%).
Question 5a: Naive Bayes is simple to implement: the conditional probabilities are easy to evaluate. It is also very fast: no iterations are required, since the probabilities can be computed directly, which makes the technique useful where training speed is important. If the conditional independence assumption holds, it can give great results.
Question 5b: The conditional independence assumption does not always hold; in most situations the features show some form of dependency. There is also the zero probability problem: when the test data contain words for a particular class that are not present in the training data, we can end up with zero class probabilities. For example, P(bumper | ham) is 0 if "bumper" does not occur in any ham (non-spam) documents in the training data.
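A minimal sketch of the standard fix, Laplace (add-one) smoothing, assuming the naivebayes package mentioned in Question 5c and a made-up toy data set (the names toy and bumper are hypothetical):

library(naivebayes)
# Toy spam/ham data in which 'bumper' never appears in a ham document
toy <- data.frame(label  = factor(c('ham', 'ham', 'spam', 'spam')),
                  bumper = factor(c('no', 'no', 'yes', 'no'), levels = c('no', 'yes')))
# laplace = 0 reproduces the problem: P(bumper = 'yes' | ham) = 0
m0 <- naive_bayes(label ~ bumper, data = toy, laplace = 0)
# laplace = 1 adds one pseudo-count per cell, so every probability is positive
m1 <- naive_bayes(label ~ bumper, data = toy, laplace = 1)
tables(m1)  # inspect the smoothed conditional probability tables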
Question 5c:

# Install and load packages
# install.packages("naivebayes")
# install.packages("dplyr")
# install.packages("ggplot2")
# install.packages("psych")
library(naivebayes)
library(dplyr)
library(ggplot2)
library(psych)

# Load data
df <- read.csv("D:/RStudio/NaiveClassifiaction/binary.csv", header = TRUE)
head(df)