Assignment: Document Classification
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.
- Read in the Data
- Count occurrences of each word (Train)
- Classify each Test Mail (Test)
- Print Results
Required Packages
if (!require('tm')) install.packages('tm')
if (!require('stopwords')) install.packages('stopwords')
if (!require('plyr')) install.packages('plyr')
if (!require('knitr')) install.packages('knitr')
if (!require('wordcloud')) install.packages('wordcloud')
if (!require('SnowballC')) install.packages('SnowballC')
library("RColorBrewer")
library(e1071)
library(dplyr)
library('ISOcodes')
Imports:
Function to Load Corpus:
# My computer crashes when I go higher than MAX_FILES...
MAX_FILES <- 40
# How many table rows to show for knitr tables
NUM_TABLE_ROWS <- 10
# how many terms to return on analysis
TOP_X_TERMS <- 999
# get the corpus given the directory
get_corpus <- function(the_dir){
file_contents <- c()
the_files <- list.files(path=the_dir, full.names = TRUE)
head(the_files)
i <- 0
for (cur_file in the_files){
if(i < MAX_FILES){
current_content <- readLines(cur_file)
file_contents <- c(file_contents, current_content)
i <- (i+1)
}
}
the_corpus <- Corpus(VectorSource(file_contents))
return (the_corpus)
}
Create the SPAM/HAM Corpuses:
# Get the Ham and Spam Corpuses:
ham_corpus <- get_corpus("easy_ham/")
length(ham_corpus)
## [1] 3334
ham_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3334
spam_corpus <- get_corpus("spam/")
length(spam_corpus)
## [1] 4951
spam_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4951
cleanfunction<- function(corpus) {
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
tmp <- tm_map(corpus, toSpace, "\\n")
tmp <- tm_map(tmp, toSpace, "<.*?>")
tmp <- tm_map(tmp, toSpace, "^.*Subject: ")
tmp <- tm_map(tmp, content_transformer(tolower))
tmp <- tm_map(tmp, content_transformer(removePunctuation))
tmp <- tm_map(tmp, content_transformer(removeNumbers))
tmp <- tm_map(tmp, removeWords, c(stopwords("english")))
tmp <- tm_map(tmp, content_transformer(stemDocument))
tmp <- tm_map(tmp, content_transformer(stripWhitespace))
}
ham_corpus<-cleanfunction(ham_corpus)
spam_corpus<-cleanfunction(spam_corpus)
Filter the 2 Corpuses, and create Term Document Matrices:
# general filtering opts:
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)
tdm_dtm_opts
## $removePunctuation
## [1] TRUE
##
## $removeNumbers
## [1] TRUE
##
## $stripWhitespace
## [1] TRUE
##
## $tolower
## [1] TRUE
##
## $stopwords
## [1] TRUE
##
## $minWordLength
## [1] 2
# create the TDMs
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm
## <<TermDocumentMatrix (terms: 2725, documents: 4951)>>
## Non-/sparse entries: 9595/13481880
## Sparsity : 100%
## Maximal term length: 76
## Weighting : term frequency (tf)
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm
## <<TermDocumentMatrix (terms: 2162, documents: 3334)>>
## Non-/sparse entries: 8706/7199402
## Sparsity : 100%
## Maximal term length: 54
## Weighting : term frequency (tf)
spam_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 4951
Create Spam and Ham Data Frames:
spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spam_df, n = NUM_TABLE_ROWS))
| amailbotwebd |
SPAM |
2 |
| aug |
SPAM |
237 |
| thu |
SPAM |
89 |
| returnpath |
SPAM |
34 |
| deliveredto |
SPAM |
36 |
| zzzzlocalhostspamassassintaintorg |
SPAM |
34 |
| localhost |
SPAM |
107 |
| receiv |
SPAM |
238 |
| esmtp |
SPAM |
93 |
| phoboslabsspamassassintaintorg |
SPAM |
34 |
spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
| aug |
HAM |
394 |
| exmhworkersadminredhatcom |
HAM |
2 |
| thu |
HAM |
257 |
| returnpath |
HAM |
40 |
| deliveredto |
HAM |
63 |
| zzzzlocalhostnetnoteinccom |
HAM |
40 |
| localhost |
HAM |
139 |
| receiv |
HAM |
302 |
| dec |
HAM |
1 |
| esmtp |
HAM |
142 |
ham_count <- nrow(ham_df)
Merge the Spam and Ham Data Frames:
# now hopefully merge them with no memory issues..
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# since this is like an outer join, fill the nulls with Zeros...
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'
Take a look at the SpamHam DataFrame sorted by HAM_FREQ desc, then SPAM_FREQ desc
all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
| 1 |
aug |
HAM |
394 |
SPAM |
237 |
-157 |
| 8 |
receiv |
HAM |
302 |
SPAM |
238 |
-64 |
| 3 |
thu |
HAM |
257 |
SPAM |
89 |
-168 |
| 10 |
esmtp |
HAM |
142 |
SPAM |
93 |
-49 |
| 7 |
localhost |
HAM |
139 |
SPAM |
107 |
-32 |
| 12 |
postfix |
HAM |
72 |
SPAM |
48 |
-24 |
| 72 |
list |
HAM |
69 |
SPAM |
36 |
-33 |
| 236 |
group |
HAM |
67 |
SPAM |
16 |
-51 |
| 5 |
deliveredto |
HAM |
63 |
SPAM |
36 |
-27 |
| 1050 |
linux |
HAM |
60 |
SPAM |
14 |
-46 |
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
| 8 |
receiv |
HAM |
302 |
SPAM |
238 |
-64 |
| 1 |
aug |
HAM |
394 |
SPAM |
237 |
-157 |
| 926 |
option |
HAM |
3 |
SPAM |
149 |
146 |
| 1568 |
fri |
HAM |
55 |
SPAM |
140 |
85 |
| 2515 |
valu |
HAM |
0 |
SPAM |
108 |
108 |
| 7 |
localhost |
HAM |
139 |
SPAM |
107 |
-32 |
| 10 |
esmtp |
HAM |
142 |
SPAM |
93 |
-49 |
| 3 |
thu |
HAM |
257 |
SPAM |
89 |
-168 |
| 244 |
email |
HAM |
31 |
SPAM |
66 |
35 |
| 284 |
report |
HAM |
5 |
SPAM |
57 |
52 |
HAM CLOUD
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('green'))

SPAM CLOUD
wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))

Function to calculate the spam score (positive means more likely to be spam…):