#1. Loading packages

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.4.3
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/

#2. Importing Data

appdata_tv <- read.csv("C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Assignments/Assignment 5/Running App Data, Training & Validation.csv")
appdata_test <- read.csv("C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Assignments/Assignment 5/Running App Data, Test.csv")

#3. Prepare for Classification

appdata_tv$premium_member <- factor(appdata_tv$premium_member)
appdata_test$premium_member <- factor(appdata_test$premium_member)

#4. Assess Outcome Variable Balance in Training & Validation

table(appdata_tv$premium_member)
## 
##      0      1 
## 113815   1450

Question: In the training & validation data, how many observations are there for premium members? How many observations are there for non-premium members? Is this a balanced or imbalanced dataset?

Answer: There are 1,450 premium members and 113,815 non-premium members. This dataset is imbalanced, with non-premium users making up the vast majority.

#5. Set Random Seed

set.seed(1234)

#6a. Oversampling Training & Validation

premium <- appdata_tv[which(appdata_tv$premium_member == 1), ]
non_premium <- appdata_tv[which(appdata_tv$premium_member == 0), ]

sample <- sample.int(n = nrow(non_premium), size = 1450, replace = FALSE)
non_premium_reduced <- non_premium[sample, ]

oversampled_data <- rbind(premium, non_premium_reduced)

#6b. Oversample Check

nrow(oversampled_data)
## [1] 2900

Question: How many rows are present in the oversampled training & validation data frame?

Answer: There are 2900 rows in the oversampled training & validation data frame.

#7. Data Partition

sample <- sample.int(n = nrow(oversampled_data), size = 0.8 * nrow(oversampled_data), replace = FALSE)

train_data <- oversampled_data[sample, ]
val_data <- oversampled_data[-sample, ]

#8. Training Naive Bayes Model

nb_model <- naive_bayes(premium_member ~ . -runner_id, data = train_data)

#9. Producing Prob Predictions on Validation & Test

val_probs <- predict(nb_model, val_data, type = "prob")
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
val_data <- cbind(val_data, val_probs)
names(val_data)[names(val_data) == "1"] <- "prob_1"
names(val_data)[names(val_data) == "0"] <- "prob_0"

test_probs <- predict(nb_model, appdata_test, type = "prob")
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
appdata_test <- cbind(appdata_test, test_probs)
names(appdata_test)[names(appdata_test) == "1"] <- "prob_1"
names(appdata_test)[names(appdata_test) == "0"] <- "prob_0"

#10. Obtaining Test Data AUC

roc_test <- roc(appdata_test$premium_member, appdata_test$prob_1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_test)
## Area under the curve: 0.9781

Question: What is the test AUC? Is this a good AUC? Answer: The test AUC is 0.9781. This is a very good AUC, indicating that the model does an excellent job distinguishing between premium and non-premium members.

#11. Selecting Probability Threshold Using the Test ROC Curve

roc_data <- coords(roc_test, ret = c("threshold", "sensitivity", "accuracy", "precision", "fpr"), transpose = FALSE)

ideal_thresholds <- roc_data[which(roc_data$sensitivity >= 0.80 & roc_data$precision >= 0.23), ]
ideal_thresholds
##       threshold sensitivity  accuracy precision        fpr
## 12237 0.9245141   0.8650307 0.9615534 0.2300163 0.03720637
## 12238 0.9247427   0.8650307 0.9616313 0.2303922 0.03712754
## 12239 0.9249939   0.8650307 0.9617091 0.2307692 0.03704872
## 12240 0.9252473   0.8650307 0.9617869 0.2311475 0.03696989
## 12241 0.9261298   0.8650307 0.9618647 0.2315271 0.03689106
## 12242 0.9268788   0.8650307 0.9619426 0.2319079 0.03681223
## 12243 0.9272860   0.8650307 0.9620204 0.2322900 0.03673341
## 12244 0.9279579   0.8650307 0.9620982 0.2326733 0.03665458
## 12245 0.9282870   0.8588957 0.9620204 0.2314050 0.03665458
## 12246 0.9285281   0.8588957 0.9620982 0.2317881 0.03657575
## 12247 0.9288395   0.8588957 0.9621760 0.2321725 0.03649693
## 12248 0.9292764   0.8588957 0.9622539 0.2325581 0.03641810
## 12249 0.9296921   0.8588957 0.9623317 0.2329451 0.03633927
## 12250 0.9301996   0.8588957 0.9624095 0.2333333 0.03626044
## 12251 0.9311751   0.8588957 0.9624874 0.2337229 0.03618162
## 12252 0.9322571   0.8588957 0.9625652 0.2341137 0.03610279
## 12253 0.9327820   0.8588957 0.9626430 0.2345059 0.03602396
## 12254 0.9329307   0.8588957 0.9627208 0.2348993 0.03594514
## 12255 0.9331263   0.8588957 0.9627987 0.2352941 0.03586631
## 12256 0.9333953   0.8588957 0.9628765 0.2356902 0.03578748
## 12257 0.9338283   0.8527607 0.9627987 0.2344013 0.03578748
## 12258 0.9340792   0.8527607 0.9628765 0.2347973 0.03570866
## 12259 0.9342640   0.8527607 0.9629543 0.2351946 0.03562983
## 12260 0.9348643   0.8527607 0.9630321 0.2355932 0.03555100
## 12261 0.9353174   0.8527607 0.9631100 0.2359932 0.03547217
## 12262 0.9354304   0.8527607 0.9631878 0.2363946 0.03539335
## 12263 0.9356503   0.8527607 0.9632656 0.2367973 0.03531452
## 12264 0.9359503   0.8527607 0.9633435 0.2372014 0.03523569
## 12265 0.9365303   0.8527607 0.9634213 0.2376068 0.03515687
## 12266 0.9369696   0.8527607 0.9634991 0.2380137 0.03507804
## 12267 0.9373715   0.8527607 0.9635769 0.2384220 0.03499921
## 12268 0.9378210   0.8527607 0.9636548 0.2388316 0.03492038
## 12269 0.9382492   0.8527607 0.9637326 0.2392427 0.03484156
## 12270 0.9387892   0.8527607 0.9638104 0.2396552 0.03476273
## 12271 0.9390030   0.8527607 0.9638882 0.2400691 0.03468390
## 12272 0.9394906   0.8527607 0.9639661 0.2404844 0.03460508
## 12273 0.9402415   0.8527607 0.9640439 0.2409012 0.03452625
## 12274 0.9407780   0.8527607 0.9641217 0.2413194 0.03444742
## 12275 0.9413035   0.8527607 0.9641995 0.2417391 0.03436860
## 12276 0.9416951   0.8527607 0.9642774 0.2421603 0.03428977
## 12277 0.9420588   0.8527607 0.9643552 0.2425829 0.03421094
## 12278 0.9427354   0.8527607 0.9644330 0.2430070 0.03413211
## 12279 0.9434263   0.8527607 0.9645109 0.2434326 0.03405329
## 12280 0.9439373   0.8527607 0.9645887 0.2438596 0.03397446
## 12281 0.9443567   0.8527607 0.9646665 0.2442882 0.03389563
## 12282 0.9447184   0.8527607 0.9647443 0.2447183 0.03381681
## 12283 0.9451473   0.8466258 0.9646665 0.2433862 0.03381681
## 12284 0.9462030   0.8466258 0.9647443 0.2438163 0.03373798
## 12285 0.9471326   0.8466258 0.9648222 0.2442478 0.03365915
## 12286 0.9473646   0.8466258 0.9649000 0.2446809 0.03358032
## 12287 0.9475095   0.8466258 0.9649778 0.2451155 0.03350150
## 12288 0.9478038   0.8404908 0.9649000 0.2437722 0.03350150
## 12289 0.9480921   0.8404908 0.9649778 0.2442068 0.03342267
## 12290 0.9480973   0.8404908 0.9650556 0.2446429 0.03334384
## 12291 0.9484095   0.8404908 0.9651335 0.2450805 0.03326502
## 12292 0.9489669   0.8404908 0.9652113 0.2455197 0.03318619
## 12293 0.9493174   0.8404908 0.9652891 0.2459605 0.03310736
## 12294 0.9494820   0.8404908 0.9653670 0.2464029 0.03302854
## 12295 0.9497806   0.8404908 0.9654448 0.2468468 0.03294971
## 12296 0.9500329   0.8404908 0.9655226 0.2472924 0.03287088
## 12297 0.9501295   0.8404908 0.9656004 0.2477396 0.03279205
## 12298 0.9504265   0.8404908 0.9656783 0.2481884 0.03271323
## 12299 0.9512209   0.8343558 0.9656004 0.2468240 0.03271323
## 12300 0.9518060   0.8343558 0.9656783 0.2472727 0.03263440
## 12301 0.9519895   0.8343558 0.9657561 0.2477231 0.03255557
## 12302 0.9522542   0.8343558 0.9658339 0.2481752 0.03247675
## 12303 0.9524846   0.8343558 0.9659117 0.2486289 0.03239792
## 12304 0.9526480   0.8343558 0.9659896 0.2490842 0.03231909
## 12305 0.9527341   0.8282209 0.9659117 0.2477064 0.03231909
## 12306 0.9528780   0.8282209 0.9659896 0.2481618 0.03224026
## 12307 0.9529656   0.8282209 0.9660674 0.2486188 0.03216144
## 12308 0.9531498   0.8282209 0.9661452 0.2490775 0.03208261
## 12309 0.9534977   0.8282209 0.9662231 0.2495379 0.03200378
## 12310 0.9537400   0.8282209 0.9663009 0.2500000 0.03192496
## 12311 0.9538276   0.8220859 0.9662231 0.2486085 0.03192496
## 12312 0.9539645   0.8220859 0.9663009 0.2490706 0.03184613
## 12313 0.9545193   0.8220859 0.9663787 0.2495345 0.03176730
## 12314 0.9550560   0.8220859 0.9664565 0.2500000 0.03168848
## 12315 0.9554577   0.8220859 0.9665344 0.2504673 0.03160965
## 12316 0.9557905   0.8159509 0.9664565 0.2490637 0.03160965
## 12317 0.9559240   0.8159509 0.9665344 0.2495310 0.03153082
## 12318 0.9562626   0.8159509 0.9666122 0.2500000 0.03145199
## 12319 0.9566036   0.8159509 0.9666900 0.2504708 0.03137317
## 12320 0.9567964   0.8159509 0.9667678 0.2509434 0.03129434
## 12321 0.9568950   0.8098160 0.9666900 0.2495274 0.03129434
## 12322 0.9569808   0.8098160 0.9667678 0.2500000 0.03121551
## 12323 0.9574000   0.8098160 0.9668457 0.2504744 0.03113669
## 12324 0.9580700   0.8036810 0.9667678 0.2490494 0.03113669
## 12325 0.9584651   0.8036810 0.9668457 0.2495238 0.03105786
## 12326 0.9586621   0.8036810 0.9669235 0.2500000 0.03097903
## 12327 0.9588362   0.8036810 0.9670013 0.2504780 0.03090020
## 12328 0.9589667   0.8036810 0.9670792 0.2509579 0.03082138
## 12329 0.9592128   0.8036810 0.9671570 0.2514395 0.03074255
## 12330 0.9597386   0.8036810 0.9672348 0.2519231 0.03066372
## 12331 0.9604268   0.8036810 0.9673126 0.2524085 0.03058490
## 12332 0.9608060   0.8036810 0.9673905 0.2528958 0.03050607
## 12333 0.9609745   0.8036810 0.9674683 0.2533849 0.03042724
## 12334 0.9611790   0.8036810 0.9675461 0.2538760 0.03034842
## 12335 0.9612707   0.8036810 0.9676239 0.2543689 0.03026959
## 12336 0.9613414   0.8036810 0.9677018 0.2548638 0.03019076
## 12337 0.9614728   0.8036810 0.9677796 0.2553606 0.03011193
## 12338 0.9615637   0.8036810 0.9678574 0.2558594 0.03003311
## 12339 0.9617035   0.8036810 0.9679352 0.2563601 0.02995428
## 12340 0.9618264   0.8036810 0.9680131 0.2568627 0.02987545
## 12341 0.9618952   0.8036810 0.9680909 0.2573674 0.02979663
## 12342 0.9619589   0.8036810 0.9681687 0.2578740 0.02971780
## 12343 0.9619813   0.8036810 0.9682466 0.2583826 0.02963897
## 12344 0.9620297   0.8036810 0.9683244 0.2588933 0.02956015
## 12345 0.9621345   0.8036810 0.9684022 0.2594059 0.02948132
## 12346 0.9622851   0.8036810 0.9684800 0.2599206 0.02940249
## 12347 0.9623937   0.8036810 0.9685579 0.2604374 0.02932366

Question: What probability threshold should be set to ensure that the sensitivity is greater than or equal to 0.80 and the precision is greater than or equal to 0.23? Choose a probability threshold that maximizes precision under these constraints.

Answer: The best threshold that meets the criteria is 0.9624.

At this threshold:

Sensitivity = 80.4%

Precision = 26.04%

Accuracy = 96.86%

This is the best choice because it maximizes precision while still keeping sensitivity above 0.80.

#12. OPTIONAL

# Step 1: Remove any duplicate column names (like duplicate prob_1)
appdata_test <- appdata_test[, !duplicated(names(appdata_test))]

# Step 2: Safely remove CLASSIFICATION column if it already exists
if ("CLASSIFICATION" %in% names(appdata_test)) {
  appdata_test <- appdata_test %>% select(-CLASSIFICATION)
}

# Step 3: Set your chosen threshold
chosen_threshold <- 0.9624

# Step 4: Add classification column
appdata_test <- appdata_test %>%
  mutate(CLASSIFICATION = 1 * (prob_1 >= chosen_threshold))

# Step 5: Generate confusion matrix
test_performance <- confusionMatrix(
  data = as.factor(appdata_test$CLASSIFICATION),
  reference = as.factor(appdata_test$premium_member),
  positive = "1"
)

# Step 6: Display it
test_performance
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction     0     1
##          0 12314    32
##          1   372   131
##                                           
##                Accuracy : 0.9686          
##                  95% CI : (0.9654, 0.9715)
##     No Information Rate : 0.9873          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.3815          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.80368         
##             Specificity : 0.97068         
##          Pos Pred Value : 0.26044         
##          Neg Pred Value : 0.99741         
##              Prevalence : 0.01269         
##          Detection Rate : 0.01020         
##    Detection Prevalence : 0.03915         
##       Balanced Accuracy : 0.88718         
##                                           
##        'Positive' Class : 1               
## 

#12a. Accuracy on Test Data? Accuracy: 0.9686

#12b. Precision on Test Data? Precision (Positive Predictive Value): 0.2604

#12c. Interpreting the metrics. Accuracy (96.86%) indicates the model performs very well overall, correctly predicting most user statuses.

Precision (26.04%) tells us that among predicted premium members, roughly 1 in 4 are truly premium — a significant improvement compared to random guessing in an imbalanced dataset.

Sensitivity (80.37%) shows the model captures most true premium users, which is important for not missing potential customers.

The balance of high accuracy, good precision, and strong sensitivity suggests this model is well-calibrated and useful for outreach or targeting premium upgrade offers.