# Loading the required packages
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.2
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tm)
## Warning: package 'tm' was built under R version 4.3.2
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Reading the csv file
# The location can be overridden through the SPAMHAM_CSV environment
# variable; when it is unset the original hard-coded path is used.
emails_path <- Sys.getenv(
  "SPAMHAM_CSV",
  unset = "/Users/ursulapodosenin/Desktop/spamham.csv"
)
emails <- read.csv(emails_path)
# Fail early if the columns this script relies on are missing
stopifnot(all(c("text", "label_num") %in% names(emails)))
# Split data into training and testing sets
set.seed(19754) # for reproducibility
# The label is passed as a factor so that createDataPartition() samples
# within each class. A numeric outcome is binned by percentiles instead,
# which for a 0/1 label can collapse into a single bin, i.e. no
# stratification at all.
# NOTE: this changes which rows land in each set, so output recorded from
# runs made before this change will not be reproduced exactly.
spl <- createDataPartition(factor(emails$label_num), p = 0.95, list = FALSE)
train <- emails[spl, ]
test <- emails[-spl, ]
# Pre-processing the data for the training set:
# lower-case the text, strip punctuation, then drop English stop words.
# In the recorded run each tm_map() call on this corpus emitted the warning
# "transformation drops documents".
train_source <- VectorSource(train$text)
trainc <- Corpus(train_source)
trainc <- tm_map(trainc, FUN = content_transformer(tolower))
trainc <- tm_map(trainc, FUN = removePunctuation)
trainc <- tm_map(trainc, FUN = removeWords, stopwords(kind = "en"))
# Build the document-term matrix and keep only terms whose sparsity is
# below the 0.75 threshold
training <- removeSparseTerms(DocumentTermMatrix(trainc), sparse = 0.75)
# One row per training email, one column per retained term, plus the
# class label as a factor in `spam`
train_term_counts <- as.matrix(training)
train_df <- as.data.frame(train_term_counts)
train_df[["spam"]] <- as.factor(train$label_num)
# Training the model using the caret package:
# a glm with a binomial family, predicting `spam` from every other column
# of train_df. The call is namespace-qualified because `train` is also the
# name of the training data frame in this script.
spamm <- caret::train(
  spam ~ .,
  data = train_df,
  method = "glm",
  family = binomial
)
# Recorded run: this call emitted the following warning 26 times
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Making predictions using the training data
train_predictions <- predict(spamm, newdata = train_df)
# Evaluate with spam (label 1) as the positive class. Without `positive`,
# confusionMatrix() used level 0 as the positive class, so Sensitivity and
# Pos Pred Value described the non-spam class instead.
confusionMatrix(
  data = train_predictions,
  reference = train_df$spam,
  positive = "1"
)
# Recorded output: the table and the overall statistics are from the
# recorded run; the class-specific rows (Sensitivity through Detection
# Prevalence) were re-derived from that table for positive class 1.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2547 207
## 1 928 1231
##
## Accuracy : 0.769
## 95% CI : (0.7569, 0.7807)
## No Information Rate : 0.7073
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5135
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8561
## Specificity : 0.7329
## Pos Pred Value : 0.5702
## Neg Pred Value : 0.9248
## Prevalence : 0.2927
## Detection Rate : 0.2506
## Detection Prevalence : 0.4394
## Balanced Accuracy : 0.7945
##
## 'Positive' Class : 1
##
# Pre-processing the data for the test set:
# lower-case the text, strip punctuation, then drop English stop words.
# In the recorded run each tm_map() call on this corpus emitted the warning
# "transformation drops documents".
test_source <- VectorSource(test$text)
testc <- Corpus(test_source)
testc <- tm_map(testc, FUN = content_transformer(tolower))
testc <- tm_map(testc, FUN = removePunctuation)
testc <- tm_map(testc, FUN = removeWords, stopwords(kind = "en"))
# Build the document-term matrix using the terms of the training matrix as
# the dictionary
training_terms <- Terms(training)
ttest <- DocumentTermMatrix(testc, control = list(dictionary = training_terms))
# Convert to a data frame and attach the class label as a factor in `spam`
test_term_counts <- as.matrix(ttest)
test_df <- as.data.frame(test_term_counts)
test_df[["spam"]] <- as.factor(test$label_num)
# Making predictions based on the test data
test_predictions <- predict(spamm, newdata = test_df)
# Evaluate with spam (label 1) as the positive class. Without `positive`,
# confusionMatrix() used level 0 as the positive class, so Sensitivity and
# Pos Pred Value described the non-spam class instead.
confusionMatrix(
  data = test_predictions,
  reference = test_df$spam,
  positive = "1"
)
# Recorded output: the table and the overall statistics are from the
# recorded run; the class-specific rows (Sensitivity through Detection
# Prevalence) were re-derived from that table for positive class 1.
# NOTE(review): the recorded accuracy (0.7171) is below the no-information
# rate (0.7636), i.e. no better than always predicting the majority class.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 134 10
## 1 63 51
##
## Accuracy : 0.7171
## 95% CI : (0.6579, 0.7712)
## No Information Rate : 0.7636
## P-Value [Acc > NIR] : 0.9644
##
## Kappa : 0.3972
##
## Mcnemar's Test P-Value : 1.157e-09
##
## Sensitivity : 0.8361
## Specificity : 0.6802
## Pos Pred Value : 0.4474
## Neg Pred Value : 0.9306
## Prevalence : 0.2364
## Detection Rate : 0.1977
## Detection Prevalence : 0.4419
## Balanced Accuracy : 0.7581
##
## 'Positive' Class : 1
##