DATA 607 Project 4 - Document Classification

The goal of this project is to use a sample of known spam files and known ham (non-spam) files to be able to determine whether an unknown email is spam or ham.

Load Libraries

suppressWarnings(suppressMessages(library(readr)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(tidytext)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(RCurl)))

Read in Spam Data and Create a Data Frame of Data to Test and a Data Frame of Withheld Data

# Absolute folder holding the raw spam messages. The trailing slash matters:
# file names are appended to this string directly when each message is read.
spam.path <- "C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_spam/spam/"

# Make the spam folder the working directory.
# NOTE(review): this path is machine-specific; the script only runs as-is on
# a computer with the same directory layout.
setwd("C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_spam/spam")

#The code from get.msg through all.spam is code taken from Drew Conway and John Myles White to eliminate the headers from emails
#https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fqualityandinnovation.files.wordpress.com%2F2012%2F09%2Ftext-analysis-75-925.doc

# Return the body of a raw email file as a single string.
#
# The header block is assumed to end at the first completely empty line;
# everything after that line is the body, with lines joined by "\n".
#
# path: path to one raw email file.
# Returns a length-one character vector. It is "" when the file has no
# blank separator line or when nothing follows the separator.
get.msg <- function(path) {
  con <- file(path, open = "rt")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)

  text <- readLines(con)
  blank <- which(text == "")

  # Without this guard, a missing separator makes seq() fail on NA, and a
  # separator on the last line makes seq() count downwards and index past
  # the end of the file.
  if (length(blank) == 0 || blank[1] >= length(text)) {
    return("")
  }

  msg <- text[seq(blank[1] + 1, length(text))]
  paste(msg, collapse = "\n")
}

# Every file in the spam folder except the "cmds" file.
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]

# Body text of each message (headers stripped by get.msg), named by file.
all.spam <- vapply(
  spam.docs,
  function(p) get.msg(paste0(spam.path, p)),
  character(1)
)

# get.msg() already returns the message text, so it is used directly.
# Passing that text through readr::read_file() is unnecessary, and fails for
# any body without a newline because readr then treats it as a file path.
spam_list <- unname(all.spam)

# Shuffle and keep at most 500 messages; capping at the number available
# keeps sample() from failing when the folder holds fewer files.
spam.n <- min(500L, length(spam_list))
spam_df <- data.frame(
  emails = sample(spam_list, spam.n, replace = FALSE),
  stringsAsFactors = FALSE
)

# Hold back 20% of the sample (100 of 500) for evaluating the classifier.
spam.n.hold <- round(0.2 * spam.n)
spam.n.test <- spam.n - spam.n.hold

# The placeholder column names are kept exactly as data.frame(rep(NA, 400))
# and data.frame(rep(NA, 100)) would generate them, because later code
# refers to spamholddata$rep.NA..100.
spamtestdata <- data.frame(rep.NA..400. = rep(NA, spam.n.test))
spamholddata <- data.frame(rep.NA..100. = rep(NA, spam.n.hold))

spamtestdata$emails <- head(spam_df$emails, spam.n.test)
spamholddata$emails <- tail(spam_df$emails, spam.n.hold)

Read in Ham Data and Create a Data Frame to Test and a Data Frame of Withheld Data

# Absolute folder holding the raw ham messages. The trailing slash matters:
# file names are appended to this string directly when each message is read.
ham.path <- "C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_easy_ham/easy_ham/"

# Make the ham folder the working directory.
# NOTE(review): this path is machine-specific; the script only runs as-is on
# a computer with the same directory layout.
setwd("C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_easy_ham/easy_ham")

# Every file in the ham folder except the "cmds" file.
ham.docs <- dir(ham.path)
ham.docs <- ham.docs[ham.docs != "cmds"]

# Body text of each message (headers stripped by get.msg), named by file.
all.ham <- vapply(
  ham.docs,
  function(p) get.msg(paste0(ham.path, p)),
  character(1)
)

# get.msg() already returns the message text, so it is used directly.
# Passing that text through readr::read_file() is unnecessary, and fails for
# any body without a newline because readr then treats it as a file path.
ham_list <- unname(all.ham)

# Shuffle and keep at most 2551 messages; capping at the number available
# keeps sample() from failing when the folder holds fewer files.
ham.n <- min(2551L, length(ham_list))
ham_df <- data.frame(
  emails = sample(ham_list, ham.n, replace = FALSE),
  stringsAsFactors = FALSE
)

# Hold back the same share as the original 102-of-2551 split.
ham.n.hold <- round(ham.n * 102 / 2551)
ham.n.test <- ham.n - ham.n.hold

# The placeholder column names are kept exactly as data.frame(rep(NA, 2449))
# and data.frame(rep(NA, 102)) would generate them, because later code
# refers to hamwithholddata$rep.NA..102.
hamtestdata <- data.frame(rep.NA..2449. = rep(NA, ham.n.test))
hamtestdata$emails <- head(ham_df$emails, ham.n.test)

hamwithholddata <- data.frame(rep.NA..102. = rep(NA, ham.n.hold))
hamwithholddata$emails <- tail(ham_df$emails, ham.n.hold)

# The emails columns are already character (stringsAsFactors = FALSE), so
# no separate as.character() conversion is needed here.

Tidying Spam and Ham Emails

Words are separated, stop words are removed, only words 3 letters and longer are kept, and words are counted

# Make sure the training spam text is character before splitting it.
spamtestdata$emails <- as.character(spamtestdata$emails)

# Per-email size: the number of pieces left after splitting each message on
# runs of word characters, used here as an approximate word count.
wordnumspam <- lengths(strsplit(spamtestdata$emails, "\\w+"))
summary(wordnumspam)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    11.0   136.8   362.0   663.5   803.5 11660.0

# Tokenise the training spam into one word per row, discard stop words, and
# keep only tokens that contain a run of at least three letters.
spamtidy_df <- filter(
  anti_join(unnest_tokens(spamtestdata, word, emails), stop_words),
  str_detect(word, "[[:alpha:]]{3,}")
)

# Word frequencies, most common first; printed for inspection.
spamwords <- count(spamtidy_df, word, sort = TRUE)
spamwords

## # A tibble: 27,148 x 2
##      word     n
##     <chr> <int>
##  1   font  7262
##  2   size  2462
##  3  width  2323
##  4   http  2280
##  5  color  1714
##  6  align  1360
##  7 height  1342
##  8   nbsp  1151
##  9 center  1145
## 10  table  1136
## # ... with 27,138 more rows

# Make sure the training ham text is character before splitting it.
hamtestdata$emails <- as.character(hamtestdata$emails)

# Per-email size: the number of pieces left after splitting each message on
# runs of word characters, used here as an approximate word count.
wordnumham <- lengths(strsplit(hamtestdata$emails, "\\w+"))
summary(wordnumham)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     7.0    74.0   144.0   239.7   250.0 13303.0

# Tokenise the training ham into one word per row, discard stop words, and
# keep only tokens that contain a run of at least three letters.
hamtidy_df <- filter(
  anti_join(unnest_tokens(hamtestdata, word, emails), stop_words),
  str_detect(word, "[[:alpha:]]{3,}")
)

# Word frequencies, most common first; printed for inspection, not stored.
count(hamtidy_df, word, sort = TRUE)

## # A tibble: 27,176 x 2
##            word     n
##           <chr> <int>
##  1         http  3930
##  2         list  2195
##  3          rpm  1213
##  4     listinfo   979
##  5 spamassassin   971
##  6         exmh   942
##  7        wrote   915
##  8         time   908
##  9        users   900
## 10       people   883
## # ... with 27,166 more rows

Spam emails tend to have more words than ham emails. The spam emails have a median word count of 362, while ham emails have a median word count of 144.

Finding Sentiment of Spam and Ham Emails

# Count how many spam tokens the bing lexicon marks as positive or negative.
spamsentiment <- spamtidy_df %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment)

# Net sentiment score: (positive - negative) / (positive + negative).
# Counts are looked up by label rather than by row position, so the score
# stays correct even if one of the two sentiment classes is absent.
spamsentimentpercentage <- with(spamsentiment, {
  positive <- sum(n[sentiment == "positive"])
  negative <- sum(n[sentiment == "negative"])
  (positive - negative) / (positive + negative)
})
spamsentimentpercentage

## [1] 0.3351675

# Count how many ham tokens the bing lexicon marks as positive or negative.
hamsentiment <- hamtidy_df %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment)

# Net sentiment score: (positive - negative) / (positive + negative).
# Counts are looked up by label rather than by row position, so the score
# stays correct even if one of the two sentiment classes is absent.
hamsentimentpercentage <- with(hamsentiment, {
  positive <- sum(n[sentiment == "positive"])
  negative <- sum(n[sentiment == "negative"])
  (positive - negative) / (positive + negative)
})
hamsentimentpercentage

## [1] -0.1075175

Spam emails tend to be positive while ham emails tend to be negative. The sentiment score used here is (positive words - negative words) / (positive words + negative words). For spam emails the score is about 34%; for ham emails it is about -11%. The negative sign indicates that ham emails contain more negative words than positive words.

Predicting whether an Email is Spam or Ham

To predict whether an email is spam or ham, the number of words in the email is calculated and the sentiment of the email is determined. The test data are emails from the original collection of spam and ham emails that were withheld from the previous analysis. The most definitive way to determine whether an email is spam or ham is the sentiment analysis. I chose to test whether the sentiment score, (positive words - negative words) / (positive words + negative words), is greater than 0.25. If so, the email is categorized as spam. If not, the email undergoes another check based on its length: if it is less than 400 words, it is classified as ham. Otherwise it is classified as spam.

# Classify every held-out spam email and report the share labelled "spam".
#
# Rule: an email is "spam" when its bing sentiment score,
# (positive - negative) / (positive + negative), exceeds 0.25. Otherwise it
# is "ham" if it has fewer than 400 words and "spam" if it is longer.

# The lexicon is the same for every email, so fetch it once.
bing_lexicon <- get_sentiments("bing")

# One label per held-out email. Iterating over the emails themselves also
# behaves correctly when the hold-out set is empty.
decision <- vapply(
  as.character(spamholddata$emails),
  function(email) {
    tidy_df <- data.frame(emails = email, stringsAsFactors = FALSE) %>%
      unnest_tokens(word, emails) %>%
      anti_join(stop_words, by = "word") %>%
      filter(str_detect(word, "[[:alpha:]]{3,}"))

    # Count words rather than space characters, so the 400-word cut-off is
    # measured in the same unit as the exploratory word-count summary.
    wordnum <- str_count(email, "\\w+")

    # Tally sentiment words by label. Indexing count() output by position
    # gives NA whenever an email lacks one of the two classes, and such
    # emails would otherwise receive no decision at all.
    matched <- inner_join(tidy_df, bing_lexicon, by = "word")
    positive <- sum(matched$sentiment == "positive")
    negative <- sum(matched$sentiment == "negative")

    # No sentiment words at all: treat the score as neutral so the email
    # falls through to the length check.
    sentimentpercentage <- if (positive + negative > 0) {
      (positive - negative) / (positive + negative)
    } else {
      0
    }

    if (sentimentpercentage > .25 || wordnum >= 400) "spam" else "ham"
  },
  character(1),
  USE.NAMES = FALSE
)

# Share of held-out spam emails correctly labelled "spam".
mean(decision == "spam")

## [1] 0.852459

# Classify every held-out ham email and report the share labelled "ham".
#
# Rule: an email is "spam" when its bing sentiment score,
# (positive - negative) / (positive + negative), exceeds 0.25. Otherwise it
# is "ham" if it has fewer than 400 words and "spam" if it is longer.

# The lexicon is the same for every email, so fetch it once.
bing_lexicon <- get_sentiments("bing")

# One label per held-out email. Iterating over the emails themselves also
# behaves correctly when the hold-out set is empty.
decisionh <- vapply(
  as.character(hamwithholddata$emails),
  function(email) {
    tidy_df <- data.frame(emails = email, stringsAsFactors = FALSE) %>%
      unnest_tokens(word, emails) %>%
      anti_join(stop_words, by = "word") %>%
      filter(str_detect(word, "[[:alpha:]]{3,}"))

    # Count words rather than space characters, so the 400-word cut-off is
    # measured in the same unit as the exploratory word-count summary.
    wordnum <- str_count(email, "\\w+")

    # Tally sentiment words by label. Indexing count() output by position
    # gives NA whenever an email lacks one of the two classes, and such
    # emails would otherwise receive no decision at all.
    matched <- inner_join(tidy_df, bing_lexicon, by = "word")
    positive <- sum(matched$sentiment == "positive")
    negative <- sum(matched$sentiment == "negative")

    # No sentiment words at all: treat the score as neutral so the email
    # falls through to the length check.
    sentimentpercentage <- if (positive + negative > 0) {
      (positive - negative) / (positive + negative)
    } else {
      0
    }

    if (sentimentpercentage > .25 || wordnum >= 400) "spam" else "ham"
  },
  character(1),
  USE.NAMES = FALSE
)

# Share of held-out ham emails correctly labelled "ham".
mean(decisionh == "ham")

## [1] 0.6349206

The algorithm correctly identified spam emails about 85% of the time and correctly identified ham emails about 63% of the time. The greater the accuracy I am able to get predicting one type of email, the lower the accuracy I am able to get in predicting the other type of email.