The goal of this project is to use a sample of known spam files and known ham (non-spam) files to be able to determine whether an unknown email is spam or ham.
Load Libraries
suppressWarnings(suppressMessages(library(readr)))
suppressWarnings(suppressMessages(library(knitr)))
suppressWarnings(suppressMessages(library(tidytext)))
suppressWarnings(suppressMessages(library(tidyr)))
suppressWarnings(suppressMessages(library(dplyr)))
suppressWarnings(suppressMessages(library(tm)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(RCurl)))
Read in Spam Data and Create a Data Frame of Data to Test and a Data Frame of Withheld Data
# Folder holding the raw spam messages. The trailing slash matters because
# file names are pasted directly onto this string when the files are read.
spam.path <- "C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_spam/spam/"
# Make the spam folder the working directory.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another computer.
setwd("C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_spam/spam")
#The code from get.msg through all.spam is code taken from Drew Conway and John Myles White to eliminate the headers from emails
#https://view.officeapps.live.com/op/view.aspx?src=https%3A%2F%2Fqualityandinnovation.files.wordpress.com%2F2012%2F09%2Ftext-analysis-75-925.doc
# Read one raw email file and return its body as a single string.
#
# The body is taken to be every line after the first empty line (the
# separator between headers and body); lines are joined with "\n".
#
# path: path to a single email file.
# Returns a length-one character vector. The result is "" when the file has
# no empty separator line or when nothing follows the separator.
get.msg <- function(path) {
  con <- file(path, open = "rt")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  text <- readLines(con)
  first.blank <- which(text == "")[1]
  # Without this guard a missing separator makes seq() fail on NA, and a
  # separator on the last line makes seq() count backwards and return "NA\n".
  if (is.na(first.blank) || first.blank >= length(text)) {
    return("")
  }
  msg <- text[seq(first.blank + 1, length(text))]
  paste(msg, collapse = "\n")
}
# Every entry in the spam folder except the one named "cmds" is read as an
# email.
spam.docs <- dir(spam.path)
spam.docs <- spam.docs[spam.docs != "cmds"]
# Header-stripped body of each message, named by file. vapply() enforces that
# get.msg() yields exactly one string per file.
all.spam <- vapply(
  spam.docs,
  function(p) get.msg(paste0(spam.path, p)),
  character(1)
)
# get.msg() has already read the files, so the bodies are used as they are.
# Passing them through read_file() handed it message text where it expects a
# path, which only works while readr interprets the string as literal data.
# as.matrix() keeps the previous shape: one column, rows named by file.
spam_list <- as.matrix(all.spam)
# Fail with a clear message if the folder holds fewer messages than the
# fixed 400/100 split below needs.
stopifnot(length(spam_list) >= 500)
# Shuffle the messages: the first 400 are analysed, the last 100 are held out.
spam_df <- data.frame(emails = sample(spam_list, 500, replace = FALSE))
# The placeholder columns are kept so the column layout of both data frames
# is unchanged.
spamtestdata <- data.frame(rep(NA, 400))
spamholddata <- data.frame(rep(NA, 100))
spamtestdata$emails <- spam_df$emails[1:400]
spamholddata$emails <- spam_df$emails[401:500]
Read in Ham Data and Create a Data Frame to Test and a Data Frame of Withheld Data
# Folder holding the raw ham (non-spam) messages. The trailing slash matters
# because file names are pasted directly onto this string when the files are
# read.
ham.path <- "C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_easy_ham/easy_ham/"
# Make the ham folder the working directory.
# NOTE(review): absolute, machine-specific path -- adjust before running on
# another computer.
setwd("C:/Users/Swigo/OneDrive/Documents/GitHub/DATA-607/spamham/20021010_easy_ham/easy_ham")
# Every entry in the ham folder except the one named "cmds" is read as an
# email.
ham.docs <- dir(ham.path)
ham.docs <- ham.docs[ham.docs != "cmds"]
# Header-stripped body of each message, named by file. vapply() enforces that
# get.msg() yields exactly one string per file.
all.ham <- vapply(
  ham.docs,
  function(p) get.msg(paste0(ham.path, p)),
  character(1)
)
# get.msg() has already read the files, so the bodies are used as they are.
# Passing them through read_file() handed it message text where it expects a
# path, which only works while readr interprets the string as literal data.
# as.matrix() keeps the previous shape: one column, rows named by file.
ham_list <- as.matrix(all.ham)
# Fail with a clear message if the folder holds fewer messages than the
# fixed 2449/102 split below needs.
stopifnot(length(ham_list) >= 2551)
# Shuffle the messages: the first 2449 are analysed, the last 102 are held
# out.
ham_df <- data.frame(emails = sample(ham_list, 2551, replace = FALSE))
# The placeholder columns are kept so the column layout of both data frames
# is unchanged.
hamtestdata <- data.frame(rep(NA, 2449))
hamtestdata$emails <- ham_df$emails[1:2449]
hamwithholddata <- data.frame(rep(NA, 102))
hamwithholddata$emails <- ham_df$emails[2450:2551]
# The text helpers used later need character input, not factors.
hamtestdata$emails <- as.character(hamtestdata$emails)
Tidying Spam and Ham Emails
Words are separated, stop words are removed, only words 3 letters and longer are kept, and words are counted
# The text helpers below need character input, not factors.
spamtestdata$emails <- as.character(spamtestdata$emails)
# Words per email: count the runs of word characters. Splitting on "\\w+"
# returned the pieces between the words, so it counted gaps, not words.
wordnumspam <- str_count(spamtestdata$emails, "\\w+")
summary(wordnumspam)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11.0 136.8 362.0 663.5 803.5 11660.0
# One row per token. Stop words are dropped and only tokens containing a run
# of at least three letters are kept.
spamtidy_df <- spamtestdata %>%
  unnest_tokens(word, emails) %>%
  # Name the join key instead of letting dplyr guess it from shared columns.
  anti_join(stop_words, by = "word") %>%
  filter(str_detect(word, "[[:alpha:]]{3,}"))
# Token frequencies, most common first.
spamwords <- spamtidy_df %>%
  count(word, sort = TRUE)
spamwords
## # A tibble: 27,148 x 2
## word n
## <chr> <int>
## 1 font 7262
## 2 size 2462
## 3 width 2323
## 4 http 2280
## 5 color 1714
## 6 align 1360
## 7 height 1342
## 8 nbsp 1151
## 9 center 1145
## 10 table 1136
## # ... with 27,138 more rows
# The text helpers below need character input, not factors.
hamtestdata$emails <- as.character(hamtestdata$emails)
# Words per email: count the runs of word characters. Splitting on "\\w+"
# returned the pieces between the words, so it counted gaps, not words.
wordnumham <- str_count(hamtestdata$emails, "\\w+")
summary(wordnumham)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.0 74.0 144.0 239.7 250.0 13303.0
# One row per token. Stop words are dropped and only tokens containing a run
# of at least three letters are kept.
hamtidy_df <- hamtestdata %>%
  unnest_tokens(word, emails) %>%
  # Name the join key instead of letting dplyr guess it from shared columns.
  anti_join(stop_words, by = "word") %>%
  filter(str_detect(word, "[[:alpha:]]{3,}"))
# Token frequencies, most common first (printed, not stored).
hamtidy_df %>%
  count(word, sort = TRUE)
## # A tibble: 27,176 x 2
## word n
## <chr> <int>
## 1 http 3930
## 2 list 2195
## 3 rpm 1213
## 4 listinfo 979
## 5 spamassassin 971
## 6 exmh 942
## 7 wrote 915
## 8 time 908
## 9 users 900
## 10 people 883
## # ... with 27,166 more rows
Spam emails tend to have more words than ham emails. The spam emails have a median word count of 362 while ham emails have a median word count of 144.
Finding Sentiment of Spam and Ham Emails
# Number of spam tokens labelled positive / negative by the Bing lexicon.
spamsentiment <- spamtidy_df %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment)
# Look the counts up by label. Indexing n[1] / n[2] by position silently
# gives NA or the wrong sign when a label is missing.
spam_positive <- sum(spamsentiment$n[spamsentiment$sentiment == "positive"])
spam_negative <- sum(spamsentiment$n[spamsentiment$sentiment == "negative"])
# Net sentiment: (positive - negative) / (positive + negative), in [-1, 1].
spamsentimentpercentage <- (spam_positive - spam_negative) /
  (spam_positive + spam_negative)
spamsentimentpercentage
## [1] 0.3351675
# Number of ham tokens labelled positive / negative by the Bing lexicon.
hamsentiment <- hamtidy_df %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(sentiment)
# Look the counts up by label. Indexing n[1] / n[2] by position silently
# gives NA or the wrong sign when a label is missing.
ham_positive <- sum(hamsentiment$n[hamsentiment$sentiment == "positive"])
ham_negative <- sum(hamsentiment$n[hamsentiment$sentiment == "negative"])
# Net sentiment: (positive - negative) / (positive + negative), in [-1, 1].
hamsentimentpercentage <- (ham_positive - ham_negative) /
  (ham_positive + ham_negative)
hamsentimentpercentage
## [1] -0.1075175
Spam emails tend to be positive while ham emails tend to be negative. The sentiment score is the number of positive words minus the number of negative words, divided by the total number of positive and negative words. For spam emails the score is about 34%; for ham emails it is about -11%. The negative sign indicates that ham emails contain more negative words than positive words.
Predicting whether an Email is Spam or Ham
To predict whether an email is spam or ham, the number of words in the email is calculated and the sentiment of the email is determined. The test data are emails from the original collection of spam and ham emails that were withheld from the previous analysis. The most definitive way to determine whether an email is spam or ham is the sentiment analysis. I chose to test whether the sentiment score, (positive words - negative words) / (positive words + negative words), is greater than 0.25. If so, the email is categorized as spam. If not, the email undergoes a second check based on its length: if it is less than 400 words, it is classified as ham; otherwise it is classified as spam.
# Classify every held-out spam email and report the share labelled "spam".
#
# Rule: net sentiment above .25 -> "spam"; otherwise fewer than 400 words ->
# "ham", else "spam".
#
# Every email now receives a label. Previously an email with only positive
# words, only negative words, or no lexicon words produced an NA score,
# ifelse() then evaluated neither branch, and the email was silently left out
# of the accuracy figure.
bing_lexicon <- get_sentiments("bing") # loop-invariant, fetched once
# One slot per email; kept as a list, as before.
decision <- vector("list", nrow(spamholddata))
for (i in seq_len(nrow(spamholddata))) {
  unknown <- data.frame(rep(NA, 1))
  unknown$emails <- as.character(spamholddata$emails[i])
  tidy_df <- unknown %>%
    unnest_tokens(word, emails) %>%
    anti_join(stop_words, by = "word") %>%
    filter(str_detect(word, "[[:alpha:]]{3,}"))
  # Rough word count: number of space matches plus one (unchanged measure).
  wordnum <- length(gregexpr(" ", unknown$emails, fixed = TRUE)[[1]]) + 1
  unknownsentiment <- tidy_df %>%
    inner_join(bing_lexicon, by = "word") %>%
    count(sentiment)
  # Counts by label; a label that does not occur contributes 0.
  n_positive <- sum(unknownsentiment$n[unknownsentiment$sentiment == "positive"])
  n_negative <- sum(unknownsentiment$n[unknownsentiment$sentiment == "negative"])
  # An email with no lexicon words is scored as neutral (0).
  sentimentpercentage <- if (n_positive + n_negative > 0) {
    (n_positive - n_negative) / (n_positive + n_negative)
  } else {
    0
  }
  decision[[i]] <- if (sentimentpercentage > .25 || wordnum >= 400) {
    "spam"
  } else {
    "ham"
  }
}
# Share of held-out spam emails correctly labelled "spam".
mean(decision == "spam")
## [1] 0.852459
# Classify every held-out ham email and report the share labelled "ham".
#
# Rule: net sentiment above .25 -> "spam"; otherwise fewer than 400 words ->
# "ham", else "spam".
#
# Every email now receives a label. Previously an email with only positive
# words, only negative words, or no lexicon words produced an NA score,
# ifelse() then evaluated neither branch, and the email was silently left out
# of the accuracy figure.
bing_lexicon <- get_sentiments("bing") # loop-invariant, fetched once
# One slot per email; kept as a list, as before.
decisionh <- vector("list", nrow(hamwithholddata))
for (i in seq_len(nrow(hamwithholddata))) {
  unknown <- data.frame(rep(NA, 1))
  unknown$emails <- as.character(hamwithholddata$emails[i])
  tidy_df <- unknown %>%
    unnest_tokens(word, emails) %>%
    anti_join(stop_words, by = "word") %>%
    filter(str_detect(word, "[[:alpha:]]{3,}"))
  # Rough word count: number of space matches plus one (unchanged measure).
  wordnum <- length(gregexpr(" ", unknown$emails, fixed = TRUE)[[1]]) + 1
  unknownsentiment <- tidy_df %>%
    inner_join(bing_lexicon, by = "word") %>%
    count(sentiment)
  # Counts by label; a label that does not occur contributes 0.
  n_positive <- sum(unknownsentiment$n[unknownsentiment$sentiment == "positive"])
  n_negative <- sum(unknownsentiment$n[unknownsentiment$sentiment == "negative"])
  # An email with no lexicon words is scored as neutral (0).
  sentimentpercentage <- if (n_positive + n_negative > 0) {
    (n_positive - n_negative) / (n_positive + n_negative)
  } else {
    0
  }
  decisionh[[i]] <- if (sentimentpercentage > .25 || wordnum >= 400) {
    "spam"
  } else {
    "ham"
  }
}
# Share of held-out ham emails correctly labelled "ham".
mean(decisionh == "ham")
## [1] 0.6349206
The algorithm correctly identified spam emails about 85% of the time and correctly identified ham emails about 63% of the time. The greater the accuracy I am able to get in predicting one type of email, the lower the accuracy I am able to get in predicting the other type of email.