UP_Project

# Loading the required packages
library(tidytext)

## Warning: package 'tidytext' was built under R version 4.3.2

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(tm)

## Warning: package 'tm' was built under R version 4.3.2

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

# Reading the csv file
emails <- read.csv("/Users/ursulapodosenin/Desktop/spamham.csv")

# Fail early if the columns used throughout the analysis are missing
stopifnot(all(c("text", "label_num") %in% names(emails)))

# Split data into training (95%) and testing (5%) sets
set.seed(19754)  # for reproducibility
# Stratify on the label as a factor. For a numeric y, createDataPartition()
# groups observations by quantiles; with an imbalanced 0/1 label the quantile
# breaks collapse into a single bin, so the split is effectively unstratified
# and class proportions can drift between train and test.
# NOTE: this changes which rows land in each set, so any previously recorded
# results for this script will differ after a re-run.
spl <- createDataPartition(factor(emails$label_num), p = 0.95, list = FALSE)
train <- emails[spl, ]
test <- emails[-spl, ]

# Pre-processing the data for the training set.
# Each element of train$text becomes one document in a tm corpus; the corpus
# is then cleaned in three passes: lower-casing, punctuation removal, and
# English stop-word removal. `trainc` is overwritten at every step.
trainc <- Corpus(VectorSource(train$text))
# tolower is a plain character function, so it is wrapped in
# content_transformer() before being handed to tm_map()
trainc <- tm_map(trainc, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(trainc, content_transformer(tolower)):
## transformation drops documents

# Strip punctuation from every document
trainc <- tm_map(trainc, removePunctuation)

## Warning in tm_map.SimpleCorpus(trainc, removePunctuation): transformation drops
## documents

# Remove English stop words.
# NOTE(review): this runs after punctuation removal; stop words that contain
# an apostrophe presumably no longer match the stripped text. Consider removing
# stop words first, and keep the test-set pipeline in the same order.
trainc <- tm_map(trainc, removeWords, stopwords("en"))

## Warning in tm_map.SimpleCorpus(trainc, removeWords, stopwords("en")):
## transformation drops documents

# Build the document-term matrix from the cleaned corpus and prune it with a
# sparsity threshold of 0.75 to drop rarely used terms
training <- trainc |>
  DocumentTermMatrix() |>
  removeSparseTerms(0.75)

# Densify the matrix into a data frame (one column per retained term) and
# attach the class label as a factor column named `spam`
train_df <- training |>
  as.matrix() |>
  as.data.frame()
train_df[["spam"]] <- as.factor(train$label_num)

# Fit a binomial GLM (logistic regression) through caret, predicting `spam`
# from every other column of train_df
spamm <- train(
  spam ~ .,
  data = train_df,
  method = "glm",
  family = binomial
)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Score the training rows with the fitted model, then cross-tabulate the
# predicted labels against the true ones (in-sample performance)
train_predictions <- predict(object = spamm, newdata = train_df)
confusionMatrix(data = train_predictions, reference = train_df$spam)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2547  207
##          1  928 1231
##                                           
##                Accuracy : 0.769           
##                  95% CI : (0.7569, 0.7807)
##     No Information Rate : 0.7073          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5135          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7329          
##             Specificity : 0.8561          
##          Pos Pred Value : 0.9248          
##          Neg Pred Value : 0.5702          
##              Prevalence : 0.7073          
##          Detection Rate : 0.5184          
##    Detection Prevalence : 0.5606          
##       Balanced Accuracy : 0.7945          
##                                           
##        'Positive' Class : 0               
##

# Pre-processing the data for the test set.
# Applies the same three cleaning passes used for the training corpus
# (lower-casing, punctuation removal, English stop-word removal), in the same
# order. `testc` is overwritten at every step.
testc <- Corpus(VectorSource(test$text))
# tolower is a plain character function, so it is wrapped in
# content_transformer() before being handed to tm_map()
testc <- tm_map(testc, content_transformer(tolower))

## Warning in tm_map.SimpleCorpus(testc, content_transformer(tolower)):
## transformation drops documents

# Strip punctuation from every document
testc <- tm_map(testc, removePunctuation)

## Warning in tm_map.SimpleCorpus(testc, removePunctuation): transformation drops
## documents

# Remove English stop words.
# NOTE(review): this runs after punctuation removal; stop words that contain
# an apostrophe presumably no longer match the stripped text. If the order is
# changed here, change it for the training corpus as well.
testc <- tm_map(testc, removeWords, stopwords("en"))

## Warning in tm_map.SimpleCorpus(testc, removeWords, stopwords("en")):
## transformation drops documents

# Build the test document-term matrix, restricting its vocabulary to the terms
# retained in the training matrix
ttest <- DocumentTermMatrix(
  testc,
  control = list(dictionary = Terms(training))
)

# Densify the matrix into a data frame and attach the class label as a factor
# column named `spam`
test_df <- ttest |>
  as.matrix() |>
  as.data.frame()
test_df[["spam"]] <- as.factor(test$label_num)

# Score the held-out test rows with the fitted model, then cross-tabulate the
# predicted labels against the true ones (out-of-sample performance)
test_predictions <- predict(object = spamm, newdata = test_df)
confusionMatrix(data = test_predictions, reference = test_df$spam)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 134  10
##          1  63  51
##                                           
##                Accuracy : 0.7171          
##                  95% CI : (0.6579, 0.7712)
##     No Information Rate : 0.7636          
##     P-Value [Acc > NIR] : 0.9644          
##                                           
##                   Kappa : 0.3972          
##                                           
##  Mcnemar's Test P-Value : 1.157e-09       
##                                           
##             Sensitivity : 0.6802          
##             Specificity : 0.8361          
##          Pos Pred Value : 0.9306          
##          Neg Pred Value : 0.4474          
##              Prevalence : 0.7636          
##          Detection Rate : 0.5194          
##    Detection Prevalence : 0.5581          
##       Balanced Accuracy : 0.7581          
##                                           
##        'Positive' Class : 0               
##

UP_Project_Four

Ursula Podosenin

2024-04-08