library(tm)
## Loading required package: NLP
library(caTools)
sh = read.csv("spam_ham_dataset.csv")
After loading the data set into R, let's look at its first few rows and its structure
head(sh)
str(sh)
## 'data.frame': 5171 obs. of 4 variables:
## $ X : int 605 2349 3624 4685 2030 2949 2793 4185 2641 1870 ...
## $ label : chr "ham" "ham" "ham" "spam" ...
## $ text : chr "Subject: enron methanol ; meter # : 988291\nthis is a follow up to the note i gave you on monday , 4 / 3 / 00 {"| __truncated__ "Subject: hpl nom for january 9 , 2001\n( see attached file : hplnol 09 . xls )\n- hplnol 09 . xls" "Subject: neon retreat\nho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders ret"| __truncated__ "Subject: photoshop , windows , office . cheap . main trending\nabasements darer prudently fortuitous undergone\"| __truncated__ ...
## $ label_num: int 0 0 0 1 0 0 0 1 0 0 ...
Now, let's check how many emails are labeled as ham (0) and how many as spam (1)
table(sh$label_num)
##
## 0 1
## 3672 1499
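For a quick sanity check on the class balance, the same counts can be viewed as proportions (roughly 71% ham and 29% spam); this optional step just uses base R's 'prop.table':
prop.table(table(sh$label_num))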
Now, let's create the corpus and clean the text: convert everything to lowercase, remove punctuation, and remove English stop words. Then we turn the cleaned corpus into a document-term matrix called "doc_tm"
corpus = VCorpus(VectorSource(sh$text))
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, PlainTextDocument)
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeWords, stopwords("en"))
doc_tm = DocumentTermMatrix(corpus)
doc_tm
## <<DocumentTermMatrix (documents: 5171, terms: 49690)>>
## Non-/sparse entries: 332421/256614569
## Sparsity : 100%
## Maximal term length: 24
## Weighting : term frequency (tf)
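Before trimming this matrix, it can be useful to peek at which terms are common. As an optional step, tm's 'findFreqTerms' lists every term that appears at least a given number of times across the corpus (the 1000 threshold below is an arbitrary choice, not something required by the analysis):
findFreqTerms(doc_tm, lowfreq = 1000)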
Now let's remove the sparse terms from the matrix (keeping only terms that appear in at least 15% of the documents) and convert the result into a data frame called "sh1"
rem = removeSparseTerms(doc_tm, 0.85)
rem
## <<DocumentTermMatrix (documents: 5171, terms: 24)>>
## Non-/sparse entries: 30446/93658
## Sparsity : 75%
## Maximal term length: 9
## Weighting : term frequency (tf)
sh1 = as.data.frame(as.matrix(rem))
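As an optional sanity check, the new data frame should have one row per email and one column per retained term:
dim(sh1) # expect 5171 rows and 24 columns, matching the trimmed matrix above
colnames(sh1) # the 24 retained terms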
Now, we will use the 'colSums' function to see the total frequency of each word across the data set
sort(colSums(sh1))
## attached let see get forwarded need know corp
## 1097 1159 1200 1275 1297 1480 1588 1776
## thanks daren can 2001 000 hpl deal gas
## 1898 1901 2021 2028 2127 2318 2827 3034
## please com will 2000 enron hou subject ect
## 3198 3709 4132 4386 6555 7289 8060 13900
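If a visual summary is preferred, the same frequencies can be plotted with base graphics; this is just an optional sketch:
barplot(sort(colSums(sh1), decreasing = TRUE), las = 2, main = "Term frequencies")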
sh1$spam = sh$label_num
With the label attached, we can compare how often each word appears in ham (0) versus spam (1) emails. Note that the label column we just created in sh1 is named "spam", so that is the name the subset condition has to use:
sort(colSums(subset(sh1, spam == 0)))
sort(colSums(subset(sh1, spam == 1)))
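The same per-class totals can also be computed in a single call with base R's 'rowsum', grouping the term columns by the label; 'term_cols' below is just a helper name introduced for this sketch:
term_cols = setdiff(colnames(sh1), "spam") # every column except the label
rowsum(as.matrix(sh1[, term_cols]), group = sh1$spam) # one row of totals per class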
sh1$spam = as.factor(sh1$spam)
Now, let's split the data into training (80%) and test (20%) sets and fit a logistic regression model that predicts whether an email is spam
set.seed(123)
split = sample.split(sh1$spam, SplitRatio = 0.8)
train_set = subset(sh1, split == TRUE)
test_set = subset(sh1, split == FALSE)
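Because 'sample.split' stratifies on the outcome, the spam/ham ratio should be preserved in both sets; this optional check confirms it:
prop.table(table(train_set$spam))
prop.table(table(test_set$spam))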
spams = glm(spam~., data = train_set, family = "binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
Now, let's generate predictions, first on the training set and then on the test set, using a 0.5 probability cutoff. (The glm warning above just means some training emails are fitted with probabilities essentially equal to 0 or 1, which is common when a few terms separate the classes very strongly; it does not prevent prediction.)
pred = predict(spams, type = "response")
table(train_set$spam, pred > 0.5)
##
## FALSE TRUE
## 0 2571 367
## 1 117 1082
accuracy1 = (2571+1082)/nrow(train_set)
pred2 = predict(spams, newdata = test_set, type = "response")
table(test_set$spam, pred2 > 0.5)
##
## FALSE TRUE
## 0 662 72
## 1 22 278
accuracy2 = (662+278)/nrow(test_set)
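Rather than typing the cell counts by hand, the same accuracies can be computed directly from the confusion matrices; 'conf_test' is just a helper name for this sketch:
conf_test = table(test_set$spam, pred2 > 0.5)
sum(diag(conf_test)) / sum(conf_test) # correct predictions / all predictions, same value as accuracy2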
accuracy1
## [1] 0.883007
accuracy2
## [1] 0.9090909
The model reaches about 88% accuracy on the training set and about 91% on the held-out test set.
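For context, a baseline model that always predicts "ham" would be correct for only 734 of the 1,034 test emails (about 71%), so the logistic regression is a substantial improvement; this optional check computes that baseline:
max(table(test_set$spam)) / nrow(test_set) # accuracy of always predicting the majority class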