# Loading the required packages
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.3.2
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(tm)
## Warning: package 'tm' was built under R version 4.3.2
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Reading the csv file
emails <- read.csv("/Users/ursulapodosenin/Desktop/spamham.csv")
# Fail fast if the two columns this script relies on are missing.
stopifnot(all(c("text", "label_num") %in% names(emails)))

# Split data into training (95%) and testing (5%) sets.
# The label is passed as a factor so that createDataPartition() samples
# within each class. With a numeric `y` caret groups by percentiles instead,
# which does not reliably separate a 0/1 label; the earlier recorded run shows
# the resulting imbalance (class-0 prevalence 0.7073 in train vs 0.7636 in
# test).
# NOTE: this changes which rows land in each set compared with earlier runs
# using the same seed, so previously recorded metrics will differ on re-run.
set.seed(19754)  # for reproducibility
spl <- createDataPartition(
  as.factor(emails$label_num),
  p = 0.95,
  list = FALSE
)
train <- emails[spl, ]
test <- emails[-spl, ]

# Build the training corpus and normalise it in three steps: lower-case the
# text, strip punctuation, then drop English stop words.
# The rendered run logged a "transformation drops documents" warning from
# tm_map.SimpleCorpus for each of the three tm_map() steps.
# NOTE(review): punctuation is stripped before stop words are removed, so
# contractions such as "don't" become "dont" and may no longer match
# stopwords("en") -- confirm this ordering is intended.
trainc <- VectorSource(train$text) |>
  Corpus() |>
  tm_map(content_transformer(tolower)) |>
  tm_map(removePunctuation) |>
  tm_map(removeWords, stopwords("en"))

# Turn the cleaned corpus into a document-term matrix, then drop the terms
# that exceed the 0.75 sparsity threshold.
training <- trainc |>
  DocumentTermMatrix() |>
  removeSparseTerms(0.75)

# Densify the training document-term matrix into a data frame and attach the
# class label as a factor outcome column named `spam`.
train_df <- training |>
  as.matrix() |>
  as.data.frame()
train_df[["spam"]] <- as.factor(train$label_num)

# Fit a binomial GLM (logistic regression) on every term column through caret.
# The function is namespaced because a data frame called `train` also exists
# in this script; R would still resolve the call to the function, but the
# explicit form removes the ambiguity for readers.
# The rendered run logged "glm.fit: fitted probabilities numerically 0 or 1
# occurred" repeatedly while fitting.
spamm <- caret::train(
  spam ~ .,
  data = train_df,
  method = "glm",
  family = binomial
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Score the model on the same data it was fitted to and print the confusion
# matrix (in-sample performance, not a held-out estimate).
# No `positive` class is supplied; the recorded output reports
# 'Positive' Class : 0, so sensitivity/specificity there refer to class 0.
train_predictions <- predict(object = spamm, newdata = train_df)
confusionMatrix(data = train_predictions, reference = train_df[["spam"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2547  207
##          1  928 1231
##                                           
##                Accuracy : 0.769           
##                  95% CI : (0.7569, 0.7807)
##     No Information Rate : 0.7073          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5135          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7329          
##             Specificity : 0.8561          
##          Pos Pred Value : 0.9248          
##          Neg Pred Value : 0.5702          
##              Prevalence : 0.7073          
##          Detection Rate : 0.5184          
##    Detection Prevalence : 0.5606          
##       Balanced Accuracy : 0.7945          
##                                           
##        'Positive' Class : 0               
## 
# Pre-processing the data for the test set with the same steps used for the
# training set: lower-case, strip punctuation, drop English stop words.
# The rendered run logged a "transformation drops documents" warning from
# tm_map.SimpleCorpus for each tm_map() step.
testc <- Corpus(VectorSource(test$text))
testc <- tm_map(testc, content_transformer(tolower))
testc <- tm_map(testc, removePunctuation)
testc <- tm_map(testc, removeWords, stopwords("en"))

# Converting the data into a document-term matrix restricted to the terms kept
# in the training matrix, so the test columns line up with the model's
# predictors.
ttest <- DocumentTermMatrix(testc, control = list(dictionary = Terms(training)))

# Converting the data into a data frame
test_df <- as.data.frame(as.matrix(ttest))
# Reuse the training outcome's levels instead of deriving them from the test
# split alone. If the small test split happened to contain a single class,
# as.factor() would yield a one-level factor that no longer matches the
# levels of the model's predictions. Labels unseen in training become NA.
test_df$spam <- factor(test$label_num, levels = levels(train_df$spam))
# Score the held-out test data and print its confusion matrix.
# No `positive` class is supplied; the recorded output reports
# 'Positive' Class : 0, so sensitivity/specificity there refer to class 0.
test_predictions <- predict(object = spamm, newdata = test_df)
confusionMatrix(data = test_predictions, reference = test_df[["spam"]])
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 134  10
##          1  63  51
##                                           
##                Accuracy : 0.7171          
##                  95% CI : (0.6579, 0.7712)
##     No Information Rate : 0.7636          
##     P-Value [Acc > NIR] : 0.9644          
##                                           
##                   Kappa : 0.3972          
##                                           
##  Mcnemar's Test P-Value : 1.157e-09       
##                                           
##             Sensitivity : 0.6802          
##             Specificity : 0.8361          
##          Pos Pred Value : 0.9306          
##          Neg Pred Value : 0.4474          
##              Prevalence : 0.7636          
##          Detection Rate : 0.5194          
##    Detection Prevalence : 0.5581          
##       Balanced Accuracy : 0.7581          
##                                           
##        'Positive' Class : 0               
##