Assignment: Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.

Imports:

library(tm)
library(knitr)
library(plyr)
library(wordcloud)

Function to Load Corpus:

# My computer crashes when I go higher than MAX_FILES...
MAX_FILES <- 40
# How many table rows to show for knitr tables
NUM_TABLE_ROWS <- 10
# how many terms to return on analysis
TOP_X_TERMS <- 999

# get the corpus given the directory
get_corpus <- function(the_dir){
  file_contents <- c()
  the_files <- list.files(path=the_dir, full.names = TRUE)
  head(the_files)
  i <- 0
  for (cur_file in the_files){
    if(i < MAX_FILES){
      current_content <- readLines(cur_file)
      file_contents <- c(file_contents, current_content)
      i <- (i+1)
    }
  }
  the_corpus <- Corpus(VectorSource(file_contents))
  return (the_corpus)
}

Create the SPAM/HAM Corpuses:

# Get the Ham and Spam Corpuses:
ham_corpus <- get_corpus("hw11/easy_ham/")
length(ham_corpus)
## [1] 3334
ham_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3334
spam_corpus <- get_corpus("hw11/spam_2/")
length(spam_corpus)
## [1] 6276
spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 6276

Filter the 2 Corpuses, and create Term Document Matrices:

# general filtering opts:
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)

# create the TDMs
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm
## <<TermDocumentMatrix (terms: 5963, documents: 6276)>>
## Non-/sparse entries: 16009/37407779
## Sparsity           : 100%
## Maximal term length: 251
## Weighting          : term frequency (tf)
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm
## <<TermDocumentMatrix (terms: 2607, documents: 3334)>>
## Non-/sparse entries: 9308/8682430
## Sparsity           : 100%
## Maximal term length: 67
## Weighting          : term frequency (tf)
spam_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 6276

Create Spam and Ham Data Frames:

spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spam_df, n = NUM_TABLE_ROWS))
TERM TYPE_SPAM SPAM_FREQ
“ecomog” SPAM 1
“on SPAM 2
aabaabhaceadbdc SPAM 1
aaf SPAM 1
aafcf SPAM 1
ababbuoption SPAM 4
abandoned SPAM 1
abaqbv SPAM 1
abbuoption SPAM 2
abd SPAM 1
spam_count <- nrow(spam_df)

ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
TERM TYPE_HAM HAM_FREQ
aaf HAM 1
abef HAM 1
abilities HAM 1
able HAM 3
acc HAM 1
access HAM 2
accomplish HAM 1
according HAM 1
account HAM 1
accounts HAM 1
ham_count <- nrow(ham_df)

Merge the Spam and Ham Data Frames:

# now hopefully merge them with no memory issues..
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# since this is like an outer join, fill the nulls with Zeros...
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'

Take a look at the SpamHam DataFrame sorted by HAM_FREQ desc, then SPAM_FREQ desc

all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
TERM TYPE_HAM HAM_FREQ TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
149 aug HAM 394 SPAM 28 -366
1916 received HAM 299 SPAM 144 -155
2335 thu HAM 257 SPAM 49 -208
716 esmtp HAM 142 SPAM 65 -77
1369 localhost HAM 139 SPAM 23 -116
2602 zzzzlocalhost HAM 80 SPAM 0 -80
1816 postfix HAM 72 SPAM 39 -33
1169 iluglinuxie HAM 69 SPAM 5 -64
1347 list HAM 66 SPAM 30 -36
547 deliveredto HAM 63 SPAM 34 -29
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
TERM TYPE_HAM HAM_FREQ TYPE_SPAM SPAM_FREQ SPAM_WEIGHT
4261 faceverdana HAM 0 SPAM 274 274
2837 arial HAM 0 SPAM 272 272
4610 helvetica HAM 0 SPAM 271 271
3537 colorfont HAM 0 SPAM 270 270
4465 geneva HAM 0 SPAM 270 270
6405 sansseriffont HAM 0 SPAM 270 270
1916 received HAM 299 SPAM 144 -155
4345 font HAM 0 SPAM 120 120
2511 will HAM 20 SPAM 107 87
3161 brbr HAM 0 SPAM 106 106

HAM CLOUD

wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('green'))

SPAM CLOUD

wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))

Function to calculate the spam score (positive means more likely to be spam…):

get_spam_score <- function(file_path){
  content <- readLines(file_path)
  one_string <- paste(content, collapse = ' ')
  word_list <- strsplit(one_string, "\\W+")
  dfx <- as.data.frame(word_list)
  colnames(dfx) <- c("WORD")
  dfx$WORD <- tolower(dfx$WORD)
  
  total_score <- sum(all_df$SPAM_WEIGHT[dfx$WORD == all_df$TERM])
  # Thought this should have given a sum of the SPAM_WEIGHT...some pos, some neg...but not sure why some NAs come back and the rest are 0s and 1s...
  print(total_score)
}

Test with some HAM:

get_spam_score("hw11/easy_ham/01451.b5a50ca35f50e38d37a2eba47399f57d")
## [1] 0
get_spam_score("hw11/easy_ham/00677.b957e34b4dd0d9263b56bf71b1168d8a")
## [1] 0
get_spam_score("hw11/easy_ham/00765.ea01c46568902b1338c9685b55d77f6c")
## [1] 6
get_spam_score("hw11/easy_ham/01554.0aed12846b3981a2a13adf793083e4f0")
## [1] 0
get_spam_score("hw11/easy_ham/00831.dfa70bbdaef79d5863917ba90097ba7a")
## [1] 0

Test with some SPAM:

get_spam_score("hw11/spam_2/00028.60393e49c90f750226bee6381eb3e69d")
## [1] 45
get_spam_score("hw11/spam_2/00077.6e13224e39fae4b94bcbe0f5ae9f4939")
## [1] 5
get_spam_score("hw11/spam_2/00081.4c7fbdca38b8def54e276e75ec56682e")
## [1] 0
get_spam_score("hw11/spam_2/01326.32e7912cae22a40e7b27f7d020de08fe")
## [1] 0
get_spam_score("hw11/spam_2/01188.67d69a8d6e5c899914556488c8cbd2c9")
## [1] 0