# Build a volatile corpus with one document per element of email_df$text.
corpus <- VCorpus(VectorSource(email_df$text))

# Normalise the text: lower-case it, drop digits, English stop words and
# punctuation, then collapse the whitespace left behind by the removals.
# Stop words are removed before punctuation so that contractions such as
# "don't" still match the stop-word list.
corpus_clean <- corpus %>%
  tm_map(content_transformer(tolower)) %>%
  tm_map(removeNumbers) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(removePunctuation) %>%
  tm_map(stripWhitespace)

# Creating the DTM
dtm <- DocumentTermMatrix(corpus_clean)
dtm_filtered <- removeSparseTerms(dtm, 0.99) # Filtering out rarely used words

# Recode term counts as presence/absence labels.
#
# x: numeric vector or matrix of term counts.
# Returns a character object of the same shape as x holding "Yes" where the
# count is positive and "No" otherwise.
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")
}

# Convert the sparse DTM to a dense matrix and recode it in one vectorised
# call. ifelse() keeps the dim/dimnames of its test, so the result stays a
# documents x terms matrix even when there is only a single document.
dtm_binary <- convert_counts(as.matrix(dtm_filtered))

# Store every feature as a factor with the same two fixed levels, so each
# column carries both "No" and "Yes" even if only one of them happens to occur
# in a given subset of rows (e.g. after a train/test split).
labeled_data <- as.data.frame(dtm_binary, stringsAsFactors = FALSE)
labeled_data[] <- lapply(labeled_data, factor, levels = c("No", "Yes"))
labeled_data$class_label <- as.factor(email_df$label)
Training a Naive Bayes model with Laplace smoothing
# Fix the RNG state so the random split below is reproducible.
set.seed(67)

# Keep 70% of the rows for training; with list = FALSE the row indices come
# back as a matrix that can be used directly for subsetting.
train_index <- createDataPartition(
  y = labeled_data$class_label,
  p = 0.7,
  list = FALSE
)
train_set <- labeled_data[train_index, ]
test_set <- labeled_data[-train_index, ]

# Laplace = 1 prevents the "zero probability" trap
model <- naiveBayes(class_label ~ ., data = train_set, laplace = 1)

# 6. Evaluation
# NOTE(review): no `positive` class is passed, so the reported sensitivity /
# specificity are relative to the first factor level of class_label.
predictions <- predict(model, newdata = test_set)
confusionMatrix(data = predictions, reference = test_set$class_label)
Confusion Matrix and Statistics
Reference
Prediction ham spam
ham 706 2
spam 59 148
Accuracy : 0.9333
95% CI : (0.9152, 0.9486)
No Information Rate : 0.8361
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.789
Mcnemar's Test P-Value : 7.496e-13
Sensitivity : 0.9229
Specificity : 0.9867
Pos Pred Value : 0.9972
Neg Pred Value : 0.7150
Prevalence : 0.8361
Detection Rate : 0.7716
Detection Prevalence : 0.7738
Balanced Accuracy : 0.9548
'Positive' Class : ham