library(tm)
## Loading required package: NLP
library(caTools)
sh = read.csv("spam_ham_dataset.csv")
After loading the data set into R, let's look at its first few rows and its structure
head(sh)
str(sh)
## 'data.frame': 5171 obs. of 4 variables:
## $ X : int 605 2349 3624 4685 2030 2949 2793 4185 2641 1870 ...
## $ label : chr "ham" "ham" "ham" "spam" ...
## $ text : chr "Subject: enron methanol ; meter # : 988291\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 {"| __truncated__ "Subject: hpl nom for january 9 , 2001\n( see attached file : hplnol 09 . xls )\n- hplnol 09 . xls" "Subject: neon retreat\nho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders ret"| __truncated__ "Subject: photoshop , windows , office . cheap . main trending\nabasements darer prudently fortuitous undergone\"| __truncated__ ...
## $ label_num: int 0 0 0 1 0 0 0 1 0 0 ...
Now, let's check how many emails are labeled as ham (0) and how many as spam (1)
table(sh$label_num)
##
## 0 1
## 3672 1499
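For a quick sanity check on the class balance, the same counts can be viewed as proportions (roughly 71% ham and 29% spam); this optional step just uses base R's 'prop.table':
prop.table(table(sh$label_num))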
Now, let's create the corpus and clean the text: convert everything to lowercase, remove punctuation, and remove English stop words. Then we turn the cleaned corpus into a document-term matrix called "doc_tm"
corpus = VCorpus(VectorSource(sh$text))
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, PlainTextDocument)
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeWords, stopwords("en"))
doc_tm = DocumentTermMatrix(corpus)
doc_tm
## <<DocumentTermMatrix (documents: 5171, terms: 49690)>>
## Non-/sparse entries: 332421/256614569
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
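Before trimming this matrix, it can be useful to peek at which terms are common. As an optional step, tm's 'findFreqTerms' lists every term that appears at least a given number of times across the corpus (the 1000 threshold below is an arbitrary choice, not something required by the analysis):
findFreqTerms(doc_tm, lowfreq = 1000)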
Now let's remove the sparse terms from the matrix (keeping only terms that appear in at least 15% of the documents) and convert the result into a data frame called "sh1"
rem = removeSparseTerms(doc_tm, 0.85)
rem
## <<DocumentTermMatrix (documents: 5171, terms: 24)>>
## Non-/sparse entries: 30446/93658
## Sparsity : 75%
## Maximal term length: 9
## Weighting : term frequency (tf)
sh1 = as.data.frame(as.matrix(rem))
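As an optional sanity check, the new data frame should have one row per email and one column per retained term:
dim(sh1) # expect 5171 rows and 24 columns, matching the trimmed matrix above
colnames(sh1) # the 24 retained terms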
Now, we will use the 'colSums' function to see the total frequency of each word across the data set
sort(colSums(sh1))
## attached let see get forwarded need know corp
## 1097 1159 1200 1275 1297 1480 1588 1776
## thanks daren can 2001 000 hpl deal gas
## 1898 1901 2021 2028 2127 2318 2827 3034
## please com will 2000 enron hou subject ect
## 3198 3709 4132 4386 6555 7289 8060 13900
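If a visual summary is preferred, the same frequencies can be plotted with base graphics; this is just an optional sketch:
barplot(sort(colSums(sh1), decreasing = TRUE), las = 2, main = "Term frequencies")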
sh1$spam = sh$label_num
With the label attached, we can compare how often each word appears in ham (0) versus spam (1) emails. Note that the label column we just created in sh1 is named "spam", so that is the name the subset condition has to use:
sort(colSums(subset(sh1, spam == 0)))
sort(colSums(subset(sh1, spam == 1)))
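The same per-class totals can also be computed in a single call with base R's 'rowsum', grouping the term columns by the label; 'term_cols' below is just a helper name introduced for this sketch:
term_cols = setdiff(colnames(sh1), "spam") # every column except the label
rowsum(as.matrix(sh1[, term_cols]), group = sh1$spam) # one row of totals per class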
sh1$spam = as.factor(sh1$spam)
Now, let's split the data into training (80%) and test (20%) sets and fit a logistic regression model that predicts whether an email is spam
set.seed(123)
split = sample.split(sh1$spam, SplitRatio = 0.8)
train_set = subset(sh1, split == TRUE)
test_set = subset(sh1, split == FALSE)
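Because 'sample.split' stratifies on the outcome, the spam/ham ratio should be preserved in both sets; this optional check confirms it:
prop.table(table(train_set$spam))
prop.table(table(test_set$spam))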
spams = glm(spam~., data = train_set, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Now, let's generate predictions, first on the training set and then on the test set, using a 0.5 probability cutoff. (The glm warning above just means some training emails are fitted with probabilities essentially equal to 0 or 1, which is common when a few terms separate the classes very strongly; it does not prevent prediction.)
pred = predict(spams, type = "response")
table(train_set$spam, pred > 0.5)
##
## FALSE TRUE
## 0 2571 367
## 1 117 1082
accuracy1 = (2571+1082)/nrow(train_set)
pred2 = predict(spams, newdata = test_set, type = "response")
table(test_set$spam, pred2 > 0.5)
##
## FALSE TRUE
## 0 662 72
## 1 22 278
accuracy2 = (662+278)/nrow(test_set)
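Rather than typing the cell counts by hand, the same accuracies can be computed directly from the confusion matrices; 'conf_test' is just a helper name for this sketch:
conf_test = table(test_set$spam, pred2 > 0.5)
sum(diag(conf_test)) / sum(conf_test) # correct predictions / all predictions, same value as accuracy2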
accuracy1
## [1] 0.883007
accuracy2
## [1] 0.9090909
The model reaches about 88% accuracy on the training set and about 91% on the held-out test set.
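For context, a baseline model that always predicts "ham" would be correct for only 734 of the 1,034 test emails (about 71%), so the logistic regression is a substantial improvement; this optional check computes that baseline:
max(table(test_set$spam)) / nrow(test_set) # accuracy of always predicting the majority class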