Document Classification

Loading Packages

library(stringr)
library(tm)
## Loading required package: NLP
library(SnowballC)
library(stopwords)
## 
## Attaching package: 'stopwords'
## The following object is masked from 'package:tm':
## 
##     stopwords
library(corpus)
library(plyr)

Access the Data (Ignoring the cmds files in each folder)

ham_folder <- "easy_ham\\"
spam_folder <- "spam_2\\"
Spam_Files <- list.files(path = spam_folder, full.names = TRUE)
Ham_Files <- list.files(path = ham_folder, full.names = TRUE)

# list.files() returns full paths, so filter the cmds file on its base name
Spam_Files <- Spam_Files[basename(Spam_Files) != "cmds"]
Ham_Files <- Ham_Files[basename(Ham_Files) != "cmds"]

n_spam <- length(Spam_Files)
n_ham <- length(Ham_Files)
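A quick sanity check (a sketch, not part of the original run) confirms the cmds files are gone and shows how many messages remain in each class:

any(basename(Spam_Files) == "cmds")  # expect FALSE
any(basename(Ham_Files) == "cmds")   # expect FALSE
c(spam = n_spam, ham = n_ham)        # message counts per class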

Fill a corpus with the data

Spam_Dir <- DirSource("spam_2")
Spam_c <- VCorpus(Spam_Dir)       # one plain-text document per file
meta(Spam_c, "type") <- "Spam"    # tag every document's class in the metadata

# Note: DirSource() still reads the cmds file here, but terms unique to it
# are dropped by removeSparseTerms() below
Ham_Dir <- DirSource("easy_ham")
Ham_c <- VCorpus(Ham_Dir)
meta(Ham_c, "type") <- "Ham"
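A brief look at what was loaded (again a sketch, not part of the original run):

length(Spam_c)      # number of documents read from spam_2
length(Ham_c)       # number of documents read from easy_ham
head(meta(Spam_c))  # indexed metadata; the type column should read "Spam"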

Create Term Document Matrices

# Control options for both matrices. Note that tm expects the word-length
# bounds as wordLengths = c(min, max) rather than separate min/max entries.
tdm_control <- list(removeNumbers = TRUE, removePunctuation = TRUE,
                    tolower = TRUE, stopwords = TRUE, stripWhitespace = TRUE,
                    wordLengths = c(2, 25))

# Build a term-document matrix per class, then drop terms appearing in
# fewer than roughly 10 documents of that class
TDM_spam <- TermDocumentMatrix(Spam_c, control = tdm_control)
TDM_spam <- removeSparseTerms(TDM_spam, 1 - (10 / length(Spam_c)))
TDM_Ham <- TermDocumentMatrix(Ham_c, control = tdm_control)
TDM_Ham <- removeSparseTerms(TDM_Ham, 1 - (10 / length(Ham_c)))
# Flatten each matrix to (term, document, frequency) rows, then collapse to
# one total frequency per term and class
term_Spam_df <- as.data.frame(as.table(TDM_spam))
names(term_Spam_df) <- c("Term", "Spam_Doc", "Spam_Freq")
term_Spam_df$Spam_Doc <- "Spam"
term_Spam_df$Spam_Freq[is.na(term_Spam_df$Spam_Freq)] <- 0
term_Spam_df <- ddply(term_Spam_df, .(Term, Spam_Doc), summarize,
                      Spam_Freq = sum(as.numeric(Spam_Freq)))

term_Ham_df <- as.data.frame(as.table(TDM_Ham))
names(term_Ham_df) <- c("Term", "Ham_Doc", "Ham_Freq")
term_Ham_df$Ham_Doc <- "Ham"
term_Ham_df$Ham_Freq[is.na(term_Ham_df$Ham_Freq)] <- 0
term_Ham_df <- ddply(term_Ham_df, .(Term, Ham_Doc), summarize,
                     Ham_Freq = sum(as.numeric(Ham_Freq)))
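Printing a trimmed matrix reports its dimensions and sparsity, and findFreqTerms() lists high-frequency terms, both useful checks (a sketch) that the sparse-term trimming behaved as intended:

TDM_spam                                      # dimensions and sparsity after trimming
head(findFreqTerms(TDM_spam, lowfreq = 500))  # very common spam-side terms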

Combine the two data frames and calculate, for each term, the difference between its frequency in the ham files and its frequency in the spam files. Limit the final scoring data frame to the terms found substantially more often in spam documents (specifically, those with more than 100 more occurrences in spam than in ham).

Combo_df <- merge(term_Ham_df, term_Spam_df, by = "Term")
# Negative Delta = the term shows up more often in spam than in ham
Combo_df$Delta <- Combo_df$Ham_Freq - Combo_df$Spam_Freq
Scoring_df <- Combo_df[order(Combo_df$Delta), ]
Scoring_df <- Scoring_df[as.numeric(Scoring_df$Delta) < -100, ]
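Since Scoring_df is sorted by Delta in ascending order, its head shows the most spam-indicative terms (a quick sketch for inspection):

head(Scoring_df, 10)  # largest spam-vs-ham frequency gaps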

Develop a scoring scheme to determine the likelihood of a document being spam.

Is_It_Spam <- function(filename){
  # Read the message as one string and split it into lowercase words
  text <- paste(readLines(filename), collapse = ' ')
  word_df <- data.frame(Text = tolower(unlist(strsplit(text, "\\W+"))),
                        stringsAsFactors = FALSE)
  word_df$Delta <- 0

  # Give each word the Delta of its matching term in the scoring table;
  # words that are not in the table keep a Delta of 0
  for (i in seq_len(nrow(word_df))){
    wordRow <- which(Scoring_df$Term == word_df[i, "Text"])
    if (length(wordRow) > 0){
      word_df[i, "Delta"] <- Scoring_df[wordRow, "Delta"]
    }
  }

  # The document's score is the mean Delta over all of its words
  print(mean(word_df$Delta))
}

Run several tests to determine a threshold between spam and ham for this score.

Is_It_Spam("spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589")
## [1] -99.78102
Is_It_Spam("spam_2/00011.bd8c904d9f7b161a813d222230214d50")
## [1] -108.2053
Is_It_Spam("spam_2/01237.245ac1766016b756a3ddb3b463cc9645")
## [1] -41.63551
Is_It_Spam("spam_2/00014.13574737e55e51fe6737a475b88b5052")
## [1] -24.72703
Is_It_Spam("spam_2/00663.4baa9521293a04306b038be1f65d4471")
## [1] -5.683544
Is_It_Spam("spam_2/01040.24856bbcaedd4d7b28eae47d8f89a62f")
## [1] -61.06009
Is_It_Spam("easy_ham/00010.145d22c053c1a0c410242e46c01635b3")
## [1] -12.45862
Is_It_Spam("easy_ham/00916.1ea7a40e892220d43795fee49ab4849e")
## [1] -7.327492
Is_It_Spam("easy_ham/02462.4f93bc374730a117b103d5c3a2d699f6")
## [1] -7.111332
Is_It_Spam("easy_ham/02467.b05b3925df99da2a9426f07833ce5a9d")
## [1] -7.660173
Is_It_Spam("easy_ham/02475.9277ee243e3f51fa53ed6be55798d360")
## [1] -3.173554
Is_It_Spam("easy_ham/01183.36c29b5d6d19a328c9928a157019a49c")
## [1] -14.01753
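The spot checks above suggest a cutoff around -25, although two of the spam samples score above it. A less anecdotal way to choose the cutoff (a sketch; Spam_Score is a hypothetical variant of Is_It_Spam that returns the mean score instead of printing it) is to compare the score distributions over a sample of each class:

Spam_Score <- function(filename){
  text <- paste(readLines(filename), collapse = ' ')
  words <- tolower(unlist(strsplit(text, "\\W+")))
  # Look up each word's Delta; words missing from the scoring table count as 0
  deltas <- Scoring_df$Delta[match(words, Scoring_df$Term)]
  deltas[is.na(deltas)] <- 0
  mean(deltas)
}
summary(sapply(head(list.files("spam_2", full.names = TRUE), 50), Spam_Score))
summary(sapply(head(list.files("easy_ham", full.names = TRUE), 50), Spam_Score))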

Using the threshold settled on above, adjust the scoring function to state whether a document is determined to be spam.

Is_It_Spam <- function(filename){
  # sapply() may hand in a factor, so coerce to character before reading
  if (is.factor(filename)){
    filename <- as.character(filename)
  }
  # Read the message as one string and split it into lowercase words
  text <- paste(readLines(filename), collapse = ' ')
  word_df <- data.frame(Text = tolower(unlist(strsplit(text, "\\W+"))),
                        stringsAsFactors = FALSE)
  word_df$Delta <- 0

  # Give each word the Delta of its matching term in the scoring table
  for (i in seq_len(nrow(word_df))){
    wordRow <- which(Scoring_df$Term == word_df[i, "Text"])
    if (length(wordRow) > 0){
      word_df[i, "Delta"] <- Scoring_df[wordRow, "Delta"]
    }
  }

  # Classify the document against the threshold chosen above
  if (mean(word_df$Delta) < -25) {
    return("SPAM")
  } else {
    return("HAM")
  }
}
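As a quick check, the first spam message tested above scored roughly -99.8, well below the -25 cutoff, so the revised function should label it as spam:

Is_It_Spam("spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589")
## [1] "SPAM"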

Test on all of the Spam and Ham documents.

testing_Spam <- data.frame(Spam_Files, stringsAsFactors = FALSE)
# Replace the Windows-style backslash in the stored paths with a forward
# slash so readLines() can open them cleanly
testing_Spam$Spam_Files <- str_replace(testing_Spam$Spam_Files, "\\\\", "/")
testing_Ham <- data.frame(Ham_Files, stringsAsFactors = FALSE)
testing_Ham$Ham_Files <- str_replace(testing_Ham$Ham_Files, "\\\\", "/")
testing_Spam$Test <- sapply(testing_Spam$Spam_Files, Is_It_Spam)
testing_Ham$Test <- sapply(testing_Ham$Ham_Files, Is_It_Spam)

Summarize the results for the spam and ham files.

print("Marking of the Spam files")
## [1] "Marking of the Spam files"
count(testing_Spam$Test)
##      x freq
## 1  HAM   80
## 2 SPAM 1317
print("Marking of the Ham Files")
## [1] "Marking of the Ham Files"
count(testing_Ham$Test)
##      x freq
## 1  HAM 2434
## 2 SPAM   67
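The same results can also be viewed as one confusion matrix (a sketch; rows are the true class, columns the predicted label):

table(True = c(rep("Spam", nrow(testing_Spam)), rep("Ham", nrow(testing_Ham))),
      Predicted = c(testing_Spam$Test, testing_Ham$Test))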

Based on these counts, it is evident that the scoring scheme still needs adjustment. While missing 80 out of 1397 spam emails (5.7%) and letting them through to a customer's inbox may be acceptable (though this is still fairly high), wrongly labeling 67 out of 2501 good emails (2.7%) as spam is not. To build a more reliable scheme, I would suggest looking at pairings of words in addition to individual word frequencies. The scoring should also be weighted in favor of clean email in order to avoid mistakenly labeling a legitimate message as spam. Finally, adding a function that examines the sender would help greatly with the filtering of spam and ham.
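As an illustration of the word-pairings idea (a sketch only, not something run in the analysis above), the term-document matrices could be rebuilt over bigrams with a custom tokenizer built on the NLP package, which tm already loads:

# Hypothetical follow-up: score two-word phrases instead of single terms
BigramTokenizer <- function(x) {
  unlist(lapply(ngrams(words(x), 2), paste, collapse = " "), use.names = FALSE)
}
TDM_spam_2g <- TermDocumentMatrix(Spam_c, control = list(tokenize = BigramTokenizer))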