##1. Loading packages:

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.4.3
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/

##2. Importing data: Import 2 data sets. 1 training + validation, 1 test. T&V is ~5700 rows; test set is ~900

training_val <- read.csv("C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Exercise Files/Week 7/Social Media Advertising Analytics, Training & Validation.csv") 

test <- read.csv("C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Exercise Files/Week 7/Social Media Advertising Analytics Test.csv") 

##3. Exploring data:

head(test) ##shows first few rows of dataframe
##   Consumer.ID Clicked.on.Ad Is.Female Private.Account Public.Account
## 1       51784             0         0               0              1
## 2      275714             0         1               1              0
## 3       39040             0         1               0              1
## 4      151266             0         0               1              0
## 5       97207             0         1               1              0
## 6       32188             0         0               0              1
##   Public.Restricted.Account Accounts.Followed Followers
## 1                         0               759    247778
## 2                         0              1375    238369
## 3                         0               951    100012
## 4                         0               989    316622
## 5                         0               422    341174
## 6                         0               519     91828
##   Contribution.Frequency.Index Video.Consumption.Index Luxury.Goods Books
## 1                            0                       2            0     1
## 2                            8                       9            0     2
## 3                            3                       9            0     1
## 4                            5                       9            0     2
## 5                            9                       9            0     1
## 6                            9                       9            0     1
##   Religion Gardening Culinary Health...Fitness DIY Finance
## 1        0         0        0                1   0       0
## 2        0         0        0                1   0       0
## 3        0         0        0                1   0       0
## 4        0         0        0                1   0       0
## 5        0         0        0                1   0       0
## 6        0         0        0                1   0       0
nrow(test) ##shows number of rows in dataframe
## [1] 967

##4. Getting data ready for classification

training_val$Clicked.on.Ad <- factor(training_val$Clicked.on.Ad) ##Sets outcome variable as categorical
test$OUTCOME_VARIABLE <- factor(test$Clicked.on.Ad) ##Sets outcome variable as categorical

##5. Assessing outcome variable balance

table(training_val$Clicked.on.Ad) ##Shows how many records are in each possible value of the outcome variable
## 
##    0    1 
## 5247  490
table(test$Clicked.on.Ad) ##Shows how many records are in each possible value of the outcome variable
## 
##   0   1 
## 881  86

##6. Setting random seed: ensures reproducible results when using randomization

set.seed(1234)

##7. Oversampling an imbalanced dataset (specifically when the outcome variable ==1 is least common)

tv_1 <- training_val[which(training_val$Clicked.on.Ad==1), ] ##Extracts all rows of the data frame where the outcome variable = the least common class

tv_0 <- training_val[which(training_val$Clicked.on.Ad==0), ] ##Extracts all rows of the data frame where the outcome variable = the most common class

We are going to oversample our tv dataset to make it a 50-50 split data between between people who clicked (1) and how didn’t (0) Because there are 490 people who clicked this means we need to randomy select 490 people who did not click to be in oversampled dataset.

sample <- sample.int(n = nrow(tv_0), size = 490, replace = F) ##extracts random subsample of rows from most common class subset that is the same size as the entire subset of the least common class
tv_0_reduced <- tv_0[sample,] ##Extract subsample of NUM_ROWS size from most common class dataframe

tv_oversampled <- rbind(tv_1, tv_0_reduced) ##Yields oversampled data

##8. Partitioning oversampled data between training and validation

set.seed(1234) ##Setting random seed

sample <- sample.int(n = nrow(tv_oversampled), size = nrow(tv_oversampled)*0.80, replace = F)
ad_training <- tv_oversampled[sample, ] ##Yields training dataset
ad_validation <- tv_oversampled[-sample, ] ##Yields validation portion

##9. Train Naive Bayes model on oversampled data

naivebayes_model <- naive_bayes(Clicked.on.Ad ~ . -Consumer.ID, data = ad_training)

##10. Produce probability predictions on validation & test data

VALIDATION_PREDICTIONS <- predict(naivebayes_model, ad_validation, type="prob") ##Produce predictions
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
VALIDATION_SET <- (cbind(ad_validation, VALIDATION_PREDICTIONS)) ##Save validation predictions into dataframe

names(ad_validation)[names(ad_validation) == "1"] <- "prob_1" ##Rename column appropriately

names(ad_validation)[names(ad_validation) == "0"] <- "prob_0" ##Rename column appropriately
TEST_PREDICTIONS <- predict(naivebayes_model, test, type="prob") ##Produce predictions
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
TEST_SET <- (cbind(test, TEST_PREDICTIONS)) ##Save test predictions into dataframe

names(TEST_SET)[names(TEST_SET) == "1"] <- "prob_1" ##Rename column appropriately

names(TEST_SET)[names(TEST_SET) == "0"] <- "prob_0" ##Rename column appropriately

##11. Generating ROC curve from test predictions AUC is 0.9245 out of 1.0; model appears very good

myroc <- roc(TEST_SET$Clicked.on.Ad, TEST_SET$prob_1) ##ROC curve data
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(myroc) ##Print out AUC
## Area under the curve: 0.9245

##12. Selecting the probability threshold using the ROC curve Min sensitivity/recall of 50%; 50% of actual people who will click on ad are captured in model Min precision of 50%; at least 50% of people we end up classifying as clicking on ad actually will Min accuracy of 80%; at least 80% _________

rocdata<-coords(myroc,ret=c("threshold", "sensitivity", "accuracy", "precision","fpr")) ##Displays the ROC curve data; sensitivity, precision, and accuracy displayed with each probability threshold

ideal_thresholds <- rocdata[which(rocdata$sensitivity>=0.50 & rocdata$precision>=0.50 & rocdata$accuracy>=0.80), ] ##Extracts the ROC curve data that meets the specified constraints
ideal_thresholds
##      threshold sensitivity  accuracy precision        fpr
## 850 0.01212538   0.6860465 0.9110651 0.5000000 0.06696935
## 851 0.01919889   0.6860465 0.9120993 0.5042735 0.06583428
## 852 0.02215210   0.6744186 0.9110651 0.5000000 0.06583428
## 853 0.02404217   0.6744186 0.9120993 0.5043478 0.06469921
## 854 0.02903211   0.6744186 0.9131334 0.5087719 0.06356413
## 855 0.04443534   0.6627907 0.9120993 0.5044248 0.06356413
## 856 0.06223614   0.6627907 0.9131334 0.5089286 0.06242906
## 857 0.07035555   0.6511628 0.9120993 0.5045045 0.06242906
## 858 0.07407301   0.6511628 0.9131334 0.5090909 0.06129398
## 859 0.08917231   0.6511628 0.9141675 0.5137615 0.06015891
## 860 0.12481590   0.6395349 0.9131334 0.5092593 0.06015891
## 861 0.17773235   0.6395349 0.9141675 0.5140187 0.05902384
## 862 0.23844446   0.6395349 0.9152017 0.5188679 0.05788876
## 863 0.45034269   0.6279070 0.9141675 0.5142857 0.05788876
## 864 0.68637076   0.6279070 0.9152017 0.5192308 0.05675369
## 865 0.76712383   0.6279070 0.9162358 0.5242718 0.05561862
## 866 0.83749096   0.6279070 0.9172699 0.5294118 0.05448354
## 867 0.91009971   0.6279070 0.9183040 0.5346535 0.05334847
## 868 0.94019540   0.6279070 0.9193382 0.5400000 0.05221339
## 869 0.94209117   0.6279070 0.9203723 0.5454545 0.05107832
## 870 0.94403924   0.6279070 0.9214064 0.5510204 0.04994325
## 871 0.94640821   0.6279070 0.9224405 0.5567010 0.04880817
## 872 0.94997433   0.6279070 0.9234747 0.5625000 0.04767310
## 873 0.95295190   0.6279070 0.9245088 0.5684211 0.04653802
## 874 0.95445595   0.6279070 0.9255429 0.5744681 0.04540295
## 875 0.95585918   0.6279070 0.9265770 0.5806452 0.04426788
## 876 0.96079240   0.6279070 0.9276112 0.5869565 0.04313280
## 877 0.96677313   0.6279070 0.9286453 0.5934066 0.04199773
## 878 0.96951773   0.6279070 0.9296794 0.6000000 0.04086266
## 879 0.97283772   0.6279070 0.9307135 0.6067416 0.03972758
## 880 0.97607512   0.6279070 0.9317477 0.6136364 0.03859251
## 881 0.97893701   0.6279070 0.9327818 0.6206897 0.03745743
## 882 0.98105478   0.6279070 0.9338159 0.6279070 0.03632236
## 883 0.98159059   0.6279070 0.9348501 0.6352941 0.03518729
## 884 0.98215981   0.6279070 0.9358842 0.6428571 0.03405221
## 885 0.98449688   0.6279070 0.9369183 0.6506024 0.03291714
## 886 0.98809118   0.6162791 0.9358842 0.6463415 0.03291714
## 887 0.98977810   0.6162791 0.9369183 0.6543210 0.03178207
## 888 0.99017198   0.6162791 0.9379524 0.6625000 0.03064699
## 889 0.99052529   0.6162791 0.9389866 0.6708861 0.02951192
## 890 0.99080802   0.6162791 0.9400207 0.6794872 0.02837684
## 891 0.99118267   0.6162791 0.9410548 0.6883117 0.02724177
## 892 0.99189216   0.6162791 0.9420889 0.6973684 0.02610670
## 893 0.99287409   0.6162791 0.9431231 0.7066667 0.02497162
## 894 0.99446060   0.6162791 0.9441572 0.7162162 0.02383655
## 895 0.99551773   0.6162791 0.9451913 0.7260274 0.02270148
## 896 0.99566426   0.6046512 0.9441572 0.7222222 0.02270148
## 897 0.99590741   0.5930233 0.9431231 0.7183099 0.02270148
## 898 0.99616060   0.5930233 0.9441572 0.7285714 0.02156640
## 899 0.99646755   0.5813953 0.9431231 0.7246377 0.02156640
## 900 0.99682928   0.5697674 0.9420889 0.7205882 0.02156640
## 901 0.99736834   0.5697674 0.9431231 0.7313433 0.02043133
## 902 0.99792406   0.5581395 0.9420889 0.7272727 0.02043133
## 903 0.99836426   0.5581395 0.9431231 0.7384615 0.01929625
## 904 0.99883261   0.5465116 0.9420889 0.7343750 0.01929625
## 905 0.99907914   0.5348837 0.9410548 0.7301587 0.01929625
## 906 0.99913853   0.5232558 0.9400207 0.7258065 0.01929625
## 907 0.99929819   0.5116279 0.9389866 0.7213115 0.01929625
## 908 0.99944247   0.5000000 0.9379524 0.7166667 0.01929625

To maximize sensitivity I would set prob threshold to 0.01212538

To maximize precision I would set prob threshold to 0.99836426

#13. Classifying validation & test records according to a chosen probability threshold

ad_validation <- ad_validation  %>% mutate(CLASSIFICATION = 1*("prob_1" >= 0.99836426))

test <- test  %>% mutate(CLASSIFICATION = 1*("prob_1" >= 0.99836426))

#14. Evaluating classification performance on validation & test using confusion matrices

validation_performance <- confusionMatrix(data=as.factor(ad_validation$CLASSIFICATION), reference = as.factor(ad_validation$Clicked.on.Ad),positive="1") ##Generate confusion matrix (based on probability cutoff)
## Warning in confusionMatrix.default(data =
## as.factor(ad_validation$CLASSIFICATION), : Levels are not in the same order for
## reference and data. Refactoring data to match.
validation_performance
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   0   0
##          1  96 100
##                                          
##                Accuracy : 0.5102         
##                  95% CI : (0.438, 0.5821)
##     No Information Rate : 0.5102         
##     P-Value [Acc > NIR] : 0.5287         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 1.0000         
##             Specificity : 0.0000         
##          Pos Pred Value : 0.5102         
##          Neg Pred Value :    NaN         
##              Prevalence : 0.5102         
##          Detection Rate : 0.5102         
##    Detection Prevalence : 1.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : 1              
## 
test_performance <- confusionMatrix(data=as.factor(test$CLASSIFICATION), reference = as.factor(test$Clicked.on.Ad),positive="1") ##Generate confusion matrix (based on probability cutoff)
## Warning in confusionMatrix.default(data = as.factor(test$CLASSIFICATION), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
test_performance
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0   0   0
##          1 881  86
##                                           
##                Accuracy : 0.0889          
##                  95% CI : (0.0718, 0.1087)
##     No Information Rate : 0.9111          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.00000         
##             Specificity : 0.00000         
##          Pos Pred Value : 0.08893         
##          Neg Pred Value :     NaN         
##              Prevalence : 0.08893         
##          Detection Rate : 0.08893         
##    Detection Prevalence : 1.00000         
##       Balanced Accuracy : 0.50000         
##                                           
##        'Positive' Class : 1               
##