# The dataset is available at https://www.kaggle.com/datasets/venky73/spam-mails-dataset?resource=download.
# NLP library is used to do natural language processing
#install.packages("NLP")
#library(NLP)
# The tm library is used to work with corpus which is a collection of data
#install.packages("tm")
#install.packages("caTools")
library(tm)
## Loading required package: NLP
# Load the labelled email dataset (one row per message) from the GitHub copy
# of the Kaggle file.
emails <- read.csv(
  file = "https://raw.githubusercontent.com/deepasharma06/Data-607/main/spam_ham_dataset.csv"
)
# Alternative kept for reference: read a local copy of the same CSV instead.
#getwd()
##setwd("C:/Users/dkbs0/OneDrive/Desktop/Class 607/Project 4/")
# File location = "C:\Users\dkbs0\OneDrive\Desktop\Class 607\Project 4\spam_ham_dataset.csv"
##emails = read.csv("spam_ham_dataset.csv", stringsAsFactors = TRUE)
# Inspect the column types: `text` holds the message, `label` / `label_num`
# hold the class ("ham"/"spam" and 0/1 respectively).
str(emails)
## 'data.frame': 5171 obs. of 4 variables:
## $ X : int 605 2349 3624 4685 2030 2949 2793 4185 2641 1870 ...
## $ label : chr "ham" "ham" "ham" "spam" ...
## $ text : chr "Subject: enron methanol ; meter # : 988291\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 {"| __truncated__ "Subject: hpl nom for january 9 , 2001\n( see attached file : hplnol 09 . xls )\n- hplnol 09 . xls" "Subject: neon retreat\nho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders ret"| __truncated__ "Subject: photoshop , windows , office . cheap . main trending\nabasements darer prudently fortuitous undergone\"| __truncated__ ...
## $ label_num: int 0 0 0 1 0 0 0 1 0 0 ...
# Class balance: how many messages carry each numeric label.
table(emails$label_num)
##
## 0 1
## 3672 1499
# Build a volatile (in-memory) corpus with one document per email text.
corpus <- VCorpus(VectorSource(emails$text))
# Lower-case every document so terms are matched case-insensitively.
corpus <- tm_map(corpus, FUN = content_transformer(tolower))
# Re-wrap each document as a PlainTextDocument.
corpus <- tm_map(corpus, FUN = PlainTextDocument)
# Strip punctuation characters.
corpus <- tm_map(corpus, FUN = removePunctuation)
# Drop common English stop words (e.g. "a", "an", "the").
corpus <- tm_map(corpus, FUN = removeWords, words = stopwords("en"))
# Count term occurrences: one row per document, one column per term.
dtm <- DocumentTermMatrix(corpus)
dtm
## <<DocumentTermMatrix (documents: 5171, terms: 49690)>>
## Non-/sparse entries: 332421/256614569
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
# Keep only the terms whose sparsity is below 0.75, i.e. terms that occur in
# more than 25% of the documents; everything rarer is dropped.
spdtm <- removeSparseTerms(dtm, 0.75)
spdtm
## <<DocumentTermMatrix (documents: 5171, terms: 6)>>
## Non-/sparse entries: 13158/17868
## Sparsity : 58%
## Maximal term length: 7
## Weighting : term frequency (tf)
# Convert the reduced matrix to a data frame: one row per email, one column
# per retained term, cell = number of occurrences.
emailsSparse <- as.data.frame(as.matrix(spdtm))
# List the retained terms. colSums() returns a *named vector*, so the terms
# are read with names(); colnames() on a vector is always NULL.
names(colSums(emailsSparse))
## [1] "2000" "enron" "please" "subject" "thanks" "will"
# Total occurrences of each retained term across all emails, ascending.
sort(colSums(emailsSparse))
## thanks please will 2000 enron subject
## 1898 3198 4132 4386 6555 8060
# Attach the numeric class label (0 = ham, 1 = spam) as the response column.
emailsSparse$spam <- emails$label_num
# Term frequencies within the non-spam (ham) emails.
# The filter has to reference the `spam` column as a bare name. The previous
# condition `"label_num" == 0` compared a string constant with a number, was
# FALSE for every row, and therefore always printed all-zero sums.
# `select = -spam` keeps the label itself out of the term totals.
# (Re-run to regenerate the printed output for the two calls below.)
sort(colSums(subset(emailsSparse, spam == 0, select = -spam)))
# Term frequencies within the spam emails.
sort(colSums(subset(emailsSparse, spam == 1, select = -spam)))
# glm() with a binomial family expects a two-level factor (or 0/1) response;
# convert once the numeric comparisons above are done.
emailsSparse$spam <- as.factor(emailsSparse$spam)
library(caTools)
# Fix the RNG seed so the random split below is reproducible.
set.seed(123)
# Logical vector over the rows: TRUE for roughly 95% of them, chosen by
# sample.split() from the class labels.
spl <- sample.split(emailsSparse$spam, SplitRatio = .95)
# Training set: the rows flagged TRUE.
train <- emailsSparse[spl, ]
# Test set: the remaining (held-out) rows.
test <- emailsSparse[!spl, ]
# Fit a logistic regression of the spam label on every term-count column of
# the training set.
spamlog <- glm(formula = spam ~ ., family = "binomial", data = train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Coefficient table, deviances, AIC and Fisher-scoring iteration count.
summary(spamlog)
##
## Call:
## glm(formula = spam ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4196 -0.7415 0.0000 1.0784 4.1237
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.16044 0.10258 1.564 0.1178
## `2000` -1.22525 0.11542 -10.616 <2e-16 ***
## enron -17.10993 268.90246 -0.064 0.9493
## please -0.16300 0.05135 -3.175 0.0015 **
## subject 0.07698 0.08814 0.873 0.3825
## thanks -1.89713 0.12623 -15.029 <2e-16 ***
## will -0.04864 0.02815 -1.728 0.0840 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5914.7 on 4911 degrees of freedom
## Residual deviance: 4028.6 on 4905 degrees of freedom
## AIC: 4042.6
##
## Number of Fisher Scoring iterations: 21
# Predicted spam probabilities for the rows the model was fitted on.
predTrainLog <- predict(spamlog, type = "response")
# Confusion matrix at a 0.5 threshold: rows = actual label,
# columns = whether the email was predicted to be spam.
table(train$spam, predTrainLog > 0.5)
##
## FALSE TRUE
## 0 2560 928
## 1 214 1210
# Training accuracy = share of emails whose thresholded prediction matches the
# actual label. It is computed from the predictions themselves rather than
# from hand-typed cell counts, which had drifted from the matrix above
# (2547 + 1214 was typed; the matrix shows 2560 + 1210).
mean((predTrainLog > 0.5) == (train$spam == "1"))
## [1] 0.7675081
# Predicted spam probabilities for the held-out test rows.
predTestLog <- predict(spamlog, newdata = test, type = "response")
# Confusion matrix at a 0.5 threshold: rows = actual label,
# columns = whether the email was predicted to be spam.
table(test$spam, predTestLog > 0.5)
##
## FALSE TRUE
## 0 129 55
## 1 7 68
# Test accuracy = share of held-out emails whose thresholded prediction matches
# the actual label. It is computed from the predictions themselves rather than
# from hand-typed cell counts, which had drifted from the matrix above
# (139 + 68 was typed; the matrix shows 129 + 68).
mean((predTestLog > 0.5) == (test$spam == "1"))
## [1] 0.7606178