##1. Loading packages:
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(naivebayes)
## Warning: package 'naivebayes' was built under R version 4.4.3
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
##2. Importing data: Import 2 data sets. 1 training + validation, 1 test. T&V is ~5700 rows; test set is ~900
training_val <- read.csv("C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Exercise Files/Week 7/Social Media Advertising Analytics, Training & Validation.csv")
test <- read.csv("C:/Users/ajpay/OneDrive/Documents/MSBA/SCH-MGMT 655 - Machine Learning/Exercise Files/Week 7/Social Media Advertising Analytics Test.csv")
##3. Exploring data:
head(test) ##shows first few rows of dataframe
## Consumer.ID Clicked.on.Ad Is.Female Private.Account Public.Account
## 1 51784 0 0 0 1
## 2 275714 0 1 1 0
## 3 39040 0 1 0 1
## 4 151266 0 0 1 0
## 5 97207 0 1 1 0
## 6 32188 0 0 0 1
## Public.Restricted.Account Accounts.Followed Followers
## 1 0 759 247778
## 2 0 1375 238369
## 3 0 951 100012
## 4 0 989 316622
## 5 0 422 341174
## 6 0 519 91828
## Contribution.Frequency.Index Video.Consumption.Index Luxury.Goods Books
## 1 0 2 0 1
## 2 8 9 0 2
## 3 3 9 0 1
## 4 5 9 0 2
## 5 9 9 0 1
## 6 9 9 0 1
## Religion Gardening Culinary Health...Fitness DIY Finance
## 1 0 0 0 1 0 0
## 2 0 0 0 1 0 0
## 3 0 0 0 1 0 0
## 4 0 0 0 1 0 0
## 5 0 0 0 1 0 0
## 6 0 0 0 1 0 0
nrow(test) ##shows number of rows in dataframe
## [1] 967
##4. Getting data ready for classification
training_val$Clicked.on.Ad <- factor(training_val$Clicked.on.Ad) ##Sets outcome variable as categorical
test$OUTCOME_VARIABLE <- factor(test$Clicked.on.Ad) ##Sets outcome variable as categorical
##5. Assessing outcome variable balance
table(training_val$Clicked.on.Ad) ##Shows how many records are in each possible value of the outcome variable
##
## 0 1
## 5247 490
table(test$Clicked.on.Ad) ##Shows how many records are in each possible value of the outcome variable
##
## 0 1
## 881 86
##6. Setting random seed: ensures reproducible results when using randomization
set.seed(1234)
##7. Oversampling an imbalanced dataset (specifically when the outcome variable ==1 is least common)
tv_1 <- training_val[which(training_val$Clicked.on.Ad==1), ] ##Extracts all rows of the data frame where the outcome variable = the least common class
tv_0 <- training_val[which(training_val$Clicked.on.Ad==0), ] ##Extracts all rows of the data frame where the outcome variable = the most common class
We are going to oversample our tv dataset to make it a 50-50 split data between between people who clicked (1) and how didn’t (0) Because there are 490 people who clicked this means we need to randomy select 490 people who did not click to be in oversampled dataset.
sample <- sample.int(n = nrow(tv_0), size = 490, replace = F) ##extracts random subsample of rows from most common class subset that is the same size as the entire subset of the least common class
tv_0_reduced <- tv_0[sample,] ##Extract subsample of NUM_ROWS size from most common class dataframe
tv_oversampled <- rbind(tv_1, tv_0_reduced) ##Yields oversampled data
##8. Partitioning oversampled data between training and validation
set.seed(1234) ##Setting random seed
sample <- sample.int(n = nrow(tv_oversampled), size = nrow(tv_oversampled)*0.80, replace = F)
ad_training <- tv_oversampled[sample, ] ##Yields training dataset
ad_validation <- tv_oversampled[-sample, ] ##Yields validation portion
##9. Train Naive Bayes model on oversampled data
naivebayes_model <- naive_bayes(Clicked.on.Ad ~ . -Consumer.ID, data = ad_training)
##10. Produce probability predictions on validation & test data
VALIDATION_PREDICTIONS <- predict(naivebayes_model, ad_validation, type="prob") ##Produce predictions
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
VALIDATION_SET <- (cbind(ad_validation, VALIDATION_PREDICTIONS)) ##Save validation predictions into dataframe
names(ad_validation)[names(ad_validation) == "1"] <- "prob_1" ##Rename column appropriately
names(ad_validation)[names(ad_validation) == "0"] <- "prob_0" ##Rename column appropriately
TEST_PREDICTIONS <- predict(naivebayes_model, test, type="prob") ##Produce predictions
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
TEST_SET <- (cbind(test, TEST_PREDICTIONS)) ##Save test predictions into dataframe
names(TEST_SET)[names(TEST_SET) == "1"] <- "prob_1" ##Rename column appropriately
names(TEST_SET)[names(TEST_SET) == "0"] <- "prob_0" ##Rename column appropriately
##11. Generating ROC curve from test predictions AUC is 0.9245 out of 1.0; model appears very good
myroc <- roc(TEST_SET$Clicked.on.Ad, TEST_SET$prob_1) ##ROC curve data
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(myroc) ##Print out AUC
## Area under the curve: 0.9245
##12. Selecting the probability threshold using the ROC curve Min sensitivity/recall of 50%; 50% of actual people who will click on ad are captured in model Min precision of 50%; at least 50% of people we end up classifying as clicking on ad actually will Min accuracy of 80%; at least 80% _________
rocdata<-coords(myroc,ret=c("threshold", "sensitivity", "accuracy", "precision","fpr")) ##Displays the ROC curve data; sensitivity, precision, and accuracy displayed with each probability threshold
ideal_thresholds <- rocdata[which(rocdata$sensitivity>=0.50 & rocdata$precision>=0.50 & rocdata$accuracy>=0.80), ] ##Extracts the ROC curve data that meets the specified constraints
ideal_thresholds
## threshold sensitivity accuracy precision fpr
## 850 0.01212538 0.6860465 0.9110651 0.5000000 0.06696935
## 851 0.01919889 0.6860465 0.9120993 0.5042735 0.06583428
## 852 0.02215210 0.6744186 0.9110651 0.5000000 0.06583428
## 853 0.02404217 0.6744186 0.9120993 0.5043478 0.06469921
## 854 0.02903211 0.6744186 0.9131334 0.5087719 0.06356413
## 855 0.04443534 0.6627907 0.9120993 0.5044248 0.06356413
## 856 0.06223614 0.6627907 0.9131334 0.5089286 0.06242906
## 857 0.07035555 0.6511628 0.9120993 0.5045045 0.06242906
## 858 0.07407301 0.6511628 0.9131334 0.5090909 0.06129398
## 859 0.08917231 0.6511628 0.9141675 0.5137615 0.06015891
## 860 0.12481590 0.6395349 0.9131334 0.5092593 0.06015891
## 861 0.17773235 0.6395349 0.9141675 0.5140187 0.05902384
## 862 0.23844446 0.6395349 0.9152017 0.5188679 0.05788876
## 863 0.45034269 0.6279070 0.9141675 0.5142857 0.05788876
## 864 0.68637076 0.6279070 0.9152017 0.5192308 0.05675369
## 865 0.76712383 0.6279070 0.9162358 0.5242718 0.05561862
## 866 0.83749096 0.6279070 0.9172699 0.5294118 0.05448354
## 867 0.91009971 0.6279070 0.9183040 0.5346535 0.05334847
## 868 0.94019540 0.6279070 0.9193382 0.5400000 0.05221339
## 869 0.94209117 0.6279070 0.9203723 0.5454545 0.05107832
## 870 0.94403924 0.6279070 0.9214064 0.5510204 0.04994325
## 871 0.94640821 0.6279070 0.9224405 0.5567010 0.04880817
## 872 0.94997433 0.6279070 0.9234747 0.5625000 0.04767310
## 873 0.95295190 0.6279070 0.9245088 0.5684211 0.04653802
## 874 0.95445595 0.6279070 0.9255429 0.5744681 0.04540295
## 875 0.95585918 0.6279070 0.9265770 0.5806452 0.04426788
## 876 0.96079240 0.6279070 0.9276112 0.5869565 0.04313280
## 877 0.96677313 0.6279070 0.9286453 0.5934066 0.04199773
## 878 0.96951773 0.6279070 0.9296794 0.6000000 0.04086266
## 879 0.97283772 0.6279070 0.9307135 0.6067416 0.03972758
## 880 0.97607512 0.6279070 0.9317477 0.6136364 0.03859251
## 881 0.97893701 0.6279070 0.9327818 0.6206897 0.03745743
## 882 0.98105478 0.6279070 0.9338159 0.6279070 0.03632236
## 883 0.98159059 0.6279070 0.9348501 0.6352941 0.03518729
## 884 0.98215981 0.6279070 0.9358842 0.6428571 0.03405221
## 885 0.98449688 0.6279070 0.9369183 0.6506024 0.03291714
## 886 0.98809118 0.6162791 0.9358842 0.6463415 0.03291714
## 887 0.98977810 0.6162791 0.9369183 0.6543210 0.03178207
## 888 0.99017198 0.6162791 0.9379524 0.6625000 0.03064699
## 889 0.99052529 0.6162791 0.9389866 0.6708861 0.02951192
## 890 0.99080802 0.6162791 0.9400207 0.6794872 0.02837684
## 891 0.99118267 0.6162791 0.9410548 0.6883117 0.02724177
## 892 0.99189216 0.6162791 0.9420889 0.6973684 0.02610670
## 893 0.99287409 0.6162791 0.9431231 0.7066667 0.02497162
## 894 0.99446060 0.6162791 0.9441572 0.7162162 0.02383655
## 895 0.99551773 0.6162791 0.9451913 0.7260274 0.02270148
## 896 0.99566426 0.6046512 0.9441572 0.7222222 0.02270148
## 897 0.99590741 0.5930233 0.9431231 0.7183099 0.02270148
## 898 0.99616060 0.5930233 0.9441572 0.7285714 0.02156640
## 899 0.99646755 0.5813953 0.9431231 0.7246377 0.02156640
## 900 0.99682928 0.5697674 0.9420889 0.7205882 0.02156640
## 901 0.99736834 0.5697674 0.9431231 0.7313433 0.02043133
## 902 0.99792406 0.5581395 0.9420889 0.7272727 0.02043133
## 903 0.99836426 0.5581395 0.9431231 0.7384615 0.01929625
## 904 0.99883261 0.5465116 0.9420889 0.7343750 0.01929625
## 905 0.99907914 0.5348837 0.9410548 0.7301587 0.01929625
## 906 0.99913853 0.5232558 0.9400207 0.7258065 0.01929625
## 907 0.99929819 0.5116279 0.9389866 0.7213115 0.01929625
## 908 0.99944247 0.5000000 0.9379524 0.7166667 0.01929625
To maximize sensitivity I would set prob threshold to 0.01212538
To maximize precision I would set prob threshold to 0.99836426
#13. Classifying validation & test records according to a chosen probability threshold
ad_validation <- ad_validation %>% mutate(CLASSIFICATION = 1*("prob_1" >= 0.99836426))
test <- test %>% mutate(CLASSIFICATION = 1*("prob_1" >= 0.99836426))
#14. Evaluating classification performance on validation & test using confusion matrices
validation_performance <- confusionMatrix(data=as.factor(ad_validation$CLASSIFICATION), reference = as.factor(ad_validation$Clicked.on.Ad),positive="1") ##Generate confusion matrix (based on probability cutoff)
## Warning in confusionMatrix.default(data =
## as.factor(ad_validation$CLASSIFICATION), : Levels are not in the same order for
## reference and data. Refactoring data to match.
validation_performance
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 0 0
## 1 96 100
##
## Accuracy : 0.5102
## 95% CI : (0.438, 0.5821)
## No Information Rate : 0.5102
## P-Value [Acc > NIR] : 0.5287
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.5102
## Neg Pred Value : NaN
## Prevalence : 0.5102
## Detection Rate : 0.5102
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
test_performance <- confusionMatrix(data=as.factor(test$CLASSIFICATION), reference = as.factor(test$Clicked.on.Ad),positive="1") ##Generate confusion matrix (based on probability cutoff)
## Warning in confusionMatrix.default(data = as.factor(test$CLASSIFICATION), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
test_performance
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 0 0
## 1 881 86
##
## Accuracy : 0.0889
## 95% CI : (0.0718, 0.1087)
## No Information Rate : 0.9111
## P-Value [Acc > NIR] : 1
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.00000
## Specificity : 0.00000
## Pos Pred Value : 0.08893
## Neg Pred Value : NaN
## Prevalence : 0.08893
## Detection Rate : 0.08893
## Detection Prevalence : 1.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##