Assignment: Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.

Required Packages

if (!require('tm')) install.packages('tm')
if (!require('stopwords')) install.packages('stopwords')
if (!require('plyr')) install.packages('plyr')
if (!require('knitr')) install.packages('knitr')
if (!require('wordcloud')) install.packages('wordcloud')
if (!require('SnowballC')) install.packages('SnowballC')
library("RColorBrewer")
library(e1071)
library(dplyr)
library('ISOcodes')

Imports:

Function to Load Corpus:

# My computer crashes when I go higher than MAX_FILES...
MAX_FILES <- 40
# How many table rows to show for knitr tables
NUM_TABLE_ROWS <- 10
# how many terms to return on analysis
TOP_X_TERMS <- 999
# get the corpus given the directory
get_corpus <- function(the_dir){
  file_contents <- c()
  the_files <- list.files(path=the_dir, full.names = TRUE)
  head(the_files)
  i <- 0
  for (cur_file in the_files){
    if(i < MAX_FILES){
      current_content <- readLines(cur_file)
      file_contents <- c(file_contents, current_content)
      i <- (i+1)
    }
  }
  the_corpus <- Corpus(VectorSource(file_contents))
  return (the_corpus)
}

Create the SPAM/HAM Corpuses:

# Get the Ham and Spam Corpuses:
ham_corpus <- get_corpus("easy_ham/")
length(ham_corpus)
## [1] 3334
ham_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3334
spam_corpus <- get_corpus("spam/")
length(spam_corpus)
## [1] 4951
spam_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4951
cleanfunction<- function(corpus) {
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  tmp <- tm_map(corpus, toSpace, "\\n")
  tmp <- tm_map(tmp, toSpace, "<.*?>")
  tmp <- tm_map(tmp, toSpace, "^.*Subject: ")
  tmp <- tm_map(tmp, content_transformer(tolower))
  tmp <- tm_map(tmp, content_transformer(removePunctuation)) 
  tmp <- tm_map(tmp, content_transformer(removeNumbers))
  tmp <- tm_map(tmp,  removeWords, c(stopwords("english")))
  tmp <- tm_map(tmp, content_transformer(stemDocument))  
  tmp <- tm_map(tmp, content_transformer(stripWhitespace))
}

ham_corpus<-cleanfunction(ham_corpus)


spam_corpus<-cleanfunction(spam_corpus)

Filter the 2 Corpuses, and create Term Document Matrices:

# general filtering opts:
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)

tdm_dtm_opts
## $removePunctuation
## [1] TRUE
## 
## $removeNumbers
## [1] TRUE
## 
## $stripWhitespace
## [1] TRUE
## 
## $tolower
## [1] TRUE
## 
## $stopwords
## [1] TRUE
## 
## $minWordLength
## [1] 2
# create the TDMs
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm
## <<TermDocumentMatrix (terms: 2725, documents: 4951)>>
## Non-/sparse entries: 9595/13481880
## Sparsity           : 100%
## Maximal term length: 76
## Weighting          : term frequency (tf)
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm
## <<TermDocumentMatrix (terms: 2162, documents: 3334)>>
## Non-/sparse entries: 8706/7199402
## Sparsity           : 100%
## Maximal term length: 54
## Weighting          : term frequency (tf)
spam_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4951

Create Spam and Ham Data Frames:

spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spam_df, n = NUM_TABLE_ROWS))
TERM TYPE_SPAM SPAM_FREQ
amailbotwebd SPAM 2
aug SPAM 237
thu SPAM 89
returnpath SPAM 34
deliveredto SPAM 36
zzzzlocalhostspamassassintaintorg SPAM 34
localhost SPAM 107
receiv SPAM 238
esmtp SPAM 93
phoboslabsspamassassintaintorg SPAM 34
spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
TERM TYPE_HAM HAM_FREQ
aug HAM 394
exmhworkersadminredhatcom HAM 2
thu HAM 257
returnpath HAM 40
deliveredto HAM 63
zzzzlocalhostnetnoteinccom HAM 40
localhost HAM 139
receiv HAM 302
dec HAM 1
esmtp HAM 142
ham_count <- nrow(ham_df)

Merge the Spam and Ham Data Frames:

# now hopefully merge them with no memory issues..
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# since this is like an outer join, fill the nulls with Zeros...
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'

Take a look at the SpamHam DataFrame sorted by HAM_FREQ desc, then SPAM_FREQ desc

all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
TERM TYPE_HAM HAM_FREQ TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
1 aug HAM 394 SPAM 237 -157
8 receiv HAM 302 SPAM 238 -64
3 thu HAM 257 SPAM 89 -168
10 esmtp HAM 142 SPAM 93 -49
7 localhost HAM 139 SPAM 107 -32
12 postfix HAM 72 SPAM 48 -24
72 list HAM 69 SPAM 36 -33
236 group HAM 67 SPAM 16 -51
5 deliveredto HAM 63 SPAM 36 -27
1050 linux HAM 60 SPAM 14 -46
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
TERM TYPE_HAM HAM_FREQ TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
8 receiv HAM 302 SPAM 238 -64
1 aug HAM 394 SPAM 237 -157
926 option HAM 3 SPAM 149 146
1568 fri HAM 55 SPAM 140 85
2515 valu HAM 0 SPAM 108 108
7 localhost HAM 139 SPAM 107 -32
10 esmtp HAM 142 SPAM 93 -49
3 thu HAM 257 SPAM 89 -168
244 email HAM 31 SPAM 66 35
284 report HAM 5 SPAM 57 52

HAM CLOUD

wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('green'))

SPAM CLOUD

wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))

Function to calculate the spam score (positive means more likely to be spam…):