The dataset needed for this project was obtained from Kaggle.com.

It is available at https://www.kaggle.com/datasets/venky73/spam-mails-dataset?resource=download.

Load libraries

# The NLP package provides the natural language processing infrastructure that tm builds on
#install.packages("NLP")
#library(NLP)
# The tm package is used to build and transform a corpus, i.e. a collection of text documents
#install.packages("tm")
#install.packages("caTools")
library(tm)
## Loading required package: NLP

This loads the data from GitHub, where a copy of the dataset is hosted.

emails<- read.csv("https://raw.githubusercontent.com/deepasharma06/Data-607/main/spam_ham_dataset.csv")
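
As a quick sanity check (an optional addition, not in the original write-up), we can confirm the file downloaded correctly and has the expected shape:

# Should report 5171 rows of 4 variables, as str() confirms below
dim(emails)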

This block was used at the beginning, when the data lived on my local machine. Now that the data has been moved to GitHub, it is no longer needed and is kept commented out.

#getwd()
#setwd("C:/Users/dkbs0/OneDrive/Desktop/Class 607/Project 4/")
# File location = "C:\Users\dkbs0\OneDrive\Desktop\Class 607\Project 4\spam_ham_dataset.csv"
# This is to read the data file
#emails = read.csv("spam_ham_dataset.csv", stringsAsFactors = TRUE)

str() shows the structure of the data frame: the number of observations and the type and first few values of each variable.

str(emails)
## 'data.frame':    5171 obs. of  4 variables:
##  $ X        : int  605 2349 3624 4685 2030 2949 2793 4185 2641 1870 ...
##  $ label    : chr  "ham" "ham" "ham" "spam" ...
##  $ text     : chr  "Subject: enron methanol ; meter # : 988291\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 {"| __truncated__ "Subject: hpl nom for january 9 , 2001\n( see attached file : hplnol 09 . xls )\n- hplnol 09 . xls" "Subject: neon retreat\nho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders ret"| __truncated__ "Subject: photoshop , windows , office . cheap . main trending\nabasements darer prudently fortuitous undergone\"| __truncated__ ...
##  $ label_num: int  0 0 0 1 0 0 0 1 0 0 ...
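
As a consistency check (an optional sketch, not part of the original analysis), label and label_num should encode the same information:

# Every "ham" row should have label_num 0 and every "spam" row label_num 1
table(emails$label, emails$label_num)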

This shows the number of records categorized as spam (1) and ham, i.e. non-spam (0).

table(emails$label_num)
## 
##    0    1 
## 3672 1499
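
Before modeling, the class balance gives a useful baseline (a small sketch added for context): always predicting the majority class, ham, would already be about 71% accurate, so any useful model must beat that.

# Majority-class baseline: 3672 ham out of 5171 emails, roughly 0.71
max(table(emails$label_num)) / nrow(emails)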
# This creates a volatile (in-memory) corpus from the email text
corpus = VCorpus(VectorSource(emails$text))
# This converts all text to lowercase
corpus = tm_map(corpus, content_transformer(tolower))
# This restores the documents to plain text after the transformation
corpus = tm_map(corpus, PlainTextDocument)
# This removes all punctuation
corpus = tm_map(corpus, removePunctuation)
# This removes English stop words (e.g. a, an, the)
corpus = tm_map(corpus, removeWords, stopwords("en"))
# This builds a DocumentTermMatrix: one row per document, one column per term,
# with each cell holding the term's frequency in that document
dtm = DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 5171, terms: 49690)>>
## Non-/sparse entries: 332421/256614569
## Sparsity           : 100%
## Maximal term length: 24
## Weighting          : term frequency (tf)
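
With almost 50,000 terms the matrix is unwieldy. As an optional exploration (not in the original analysis), tm's findFreqTerms() lists the terms that occur at least a given number of times, which helps gauge where a frequency cutoff might land:

# Terms appearing at least 1000 times across all emails
findFreqTerms(dtm, lowfreq = 1000)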

This removes sparse terms. A cutoff of 0.75 keeps only the terms that appear in at least 25% of the documents, which shrinks the vocabulary from almost 50,000 terms to a handful.

spdtm = removeSparseTerms(dtm, 0.75)
spdtm
## <<DocumentTermMatrix (documents: 5171, terms: 6)>>
## Non-/sparse entries: 13158/17868
## Sparsity           : 58%
## Maximal term length: 7
## Weighting          : term frequency (tf)
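
The 0.75 cutoff is aggressive, leaving only six terms. The sketch below (not in the original run) shows how the cutoff controls vocabulary size; the exact counts depend on the corpus:

# Higher cutoffs tolerate sparser terms and so keep more columns
for (s in c(0.75, 0.90, 0.95, 0.99)) {
  cat(sprintf("cutoff %.2f -> %d terms\n", s, ncol(removeSparseTerms(dtm, s))))
}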

This converts the processed matrix into a data frame.

emailsSparse = as.data.frame(as.matrix(spdtm))
# colSums() returns a named vector with the total frequency of each term;
# sort() below orders those frequencies and shows the term names
sort(colSums(emailsSparse))
##  thanks  please    will    2000   enron subject 
##    1898    3198    4132    4386    6555    8060
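# Optional visual sketch (not part of the original run): a bar chart of the
# six surviving term frequencies
barplot(sort(colSums(emailsSparse)), las = 2, main = "Term frequencies")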
# Attach the outcome column: 1 = spam, 0 = ham
emailsSparse$spam = emails$label_num
#Term frequencies within the ham emails (rows where spam == 0)
sort(colSums(subset(emailsSparse, spam == 0)))
#Term frequencies within the spam emails (rows where spam == 1)
sort(colSums(subset(emailsSparse, spam == 1)))
# Convert the outcome to a factor for classification
emailsSparse$spam = as.factor(emailsSparse$spam)

Below, caTools' sample.split() splits the data into training and test sets, preserving the spam/ham ratio in each subset.

library(caTools)
set.seed(123)
spl = sample.split(emailsSparse$spam, .95)
#This creates the training set (95% of the rows)
train = subset(emailsSparse, spl == TRUE)
#This creates the test set (the remaining 5%)
test = subset(emailsSparse, spl == FALSE)
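# Optional check (not in the original run): sample.split is stratified, so the
# spam proportion should be nearly identical in the two subsets
prop.table(table(train$spam))
prop.table(table(test$spam))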
#Fit a logistic regression (a binomial GLM) predicting spam from all six term counts
spamlog = glm(spam~., data = train, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(spamlog)
## 
## Call:
## glm(formula = spam ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4196  -0.7415   0.0000   1.0784   4.1237  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.16044    0.10258   1.564   0.1178    
## `2000`       -1.22525    0.11542 -10.616   <2e-16 ***
## enron       -17.10993  268.90246  -0.064   0.9493    
## please       -0.16300    0.05135  -3.175   0.0015 ** 
## subject       0.07698    0.08814   0.873   0.3825    
## thanks       -1.89713    0.12623 -15.029   <2e-16 ***
## will         -0.04864    0.02815  -1.728   0.0840 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5914.7  on 4911  degrees of freedom
## Residual deviance: 4028.6  on 4905  degrees of freedom
## AIC: 4042.6
## 
## Number of Fisher Scoring iterations: 21

This generates predicted probabilities on the training set and classifies each email at a 0.5 threshold. (The earlier glm warning about fitted probabilities of 0 or 1 indicates quasi-complete separation, most likely from the enron term, whose large coefficient and enormous standard error suggest it almost perfectly identifies ham in this corpus.)

predTrainLog = predict(spamlog, type = "response")
table(train$spam, predTrainLog > 0.5)
##    
##     FALSE TRUE
##   0  2560  928
##   1   214 1210
# Accuracy: correctly classified rows (from the table above) over all training rows
(2560+1210)/nrow(train)
## [1] 0.7675081

This shows that the model is about 76.8% accurate on the training data.
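
Accuracy alone hides the two kinds of error. Using the confusion matrix above, sensitivity (spam correctly flagged) and specificity (ham correctly passed) can be read off directly; this is an added sketch, not part of the original analysis:

# From the training confusion matrix above
sensitivity = 1210 / (214 + 1210)  # about 0.85 of spam is caught
specificity = 2560 / (2560 + 928)  # about 0.73 of ham passes through
c(sensitivity = sensitivity, specificity = specificity)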

predTestLog = predict(spamlog, newdata = test, type = "response")
table(test$spam, predTestLog > 0.5)
##    
##     FALSE TRUE
##   0   129   55
##   1     7   68
# Accuracy: correctly classified rows (from the table above) over all test rows
(129+68)/nrow(test)
## [1] 0.7606178

Conclusion: This shows that the model predicts whether an email is spam with an accuracy of about 76% on held-out test data, roughly matching its training accuracy but only modestly better than the 71% all-ham baseline. The parameters above, such as the sparsity cutoff passed to removeSparseTerms() or the 0.5 classification threshold, can be adjusted to trade off vocabulary size, accuracy, and the balance between the two error types.
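
One concrete way to explore those parameters (a sketch assuming predTestLog and test are still in scope) is to vary the classification threshold and watch the test accuracy move:

# Test-set accuracy at several classification thresholds
for (t in c(0.3, 0.5, 0.7)) {
  pred = factor(predTestLog > t, levels = c(FALSE, TRUE))  # keep both columns
  acc = sum(diag(table(test$spam, pred))) / nrow(test)
  cat(sprintf("threshold %.1f: accuracy %.3f\n", t, acc))
}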