Assignment: Document Classification
It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.
Imports:
library(tm)
library(knitr)
library(plyr)
library(wordcloud)
Function to Load Corpus:
# My computer crashes when I go higher than MAX_FILES...
MAX_FILES <- 40
# How many table rows to show for knitr tables
NUM_TABLE_ROWS <- 10
# how many terms to return on analysis
TOP_X_TERMS <- 999
# get the corpus given the directory
get_corpus <- function(the_dir){
file_contents <- c()
the_files <- list.files(path=the_dir, full.names = TRUE)
head(the_files)
i <- 0
for (cur_file in the_files){
if(i < MAX_FILES){
current_content <- readLines(cur_file)
file_contents <- c(file_contents, current_content)
i <- (i+1)
}
}
the_corpus <- Corpus(VectorSource(file_contents))
return (the_corpus)
}
Create the SPAM/HAM Corpuses:
# Get the Ham and Spam Corpuses:
ham_corpus <- get_corpus("hw11/easy_ham/")
length(ham_corpus)
## [1] 3334
ham_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3334
spam_corpus <- get_corpus("hw11/spam_2/")
length(spam_corpus)
## [1] 6276
spam_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 6276
Filter the 2 Corpuses, and create Term Document Matrices:
# general filtering opts:
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)
# create the TDMs
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm
## <<TermDocumentMatrix (terms: 5963, documents: 6276)>>
## Non-/sparse entries: 16009/37407779
## Sparsity : 100%
## Maximal term length: 251
## Weighting : term frequency (tf)
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm
## <<TermDocumentMatrix (terms: 2607, documents: 3334)>>
## Non-/sparse entries: 9308/8682430
## Sparsity : 100%
## Maximal term length: 67
## Weighting : term frequency (tf)
spam_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 6276
Create Spam and Ham Data Frames:
spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spam_df, n = NUM_TABLE_ROWS))
| “ecomog” |
SPAM |
1 |
| “on |
SPAM |
2 |
| aabaabhaceadbdc |
SPAM |
1 |
| aaf |
SPAM |
1 |
| aafcf |
SPAM |
1 |
| ababbuoption |
SPAM |
4 |
| abandoned |
SPAM |
1 |
| abaqbv |
SPAM |
1 |
| abbuoption |
SPAM |
2 |
| abd |
SPAM |
1 |
spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
| aaf |
HAM |
1 |
| abef |
HAM |
1 |
| abilities |
HAM |
1 |
| able |
HAM |
3 |
| acc |
HAM |
1 |
| access |
HAM |
2 |
| accomplish |
HAM |
1 |
| according |
HAM |
1 |
| account |
HAM |
1 |
| accounts |
HAM |
1 |
ham_count <- nrow(ham_df)
Merge the Spam and Ham Data Frames:
# now hopefully merge them with no memory issues..
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# since this is like an outer join, fill the nulls with Zeros...
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'
Take a look at the SpamHam DataFrame sorted by HAM_FREQ desc, then SPAM_FREQ desc
all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
| 149 |
aug |
HAM |
394 |
SPAM |
28 |
-366 |
| 1916 |
received |
HAM |
299 |
SPAM |
144 |
-155 |
| 2335 |
thu |
HAM |
257 |
SPAM |
49 |
-208 |
| 716 |
esmtp |
HAM |
142 |
SPAM |
65 |
-77 |
| 1369 |
localhost |
HAM |
139 |
SPAM |
23 |
-116 |
| 2602 |
zzzzlocalhost |
HAM |
80 |
SPAM |
0 |
-80 |
| 1816 |
postfix |
HAM |
72 |
SPAM |
39 |
-33 |
| 1169 |
iluglinuxie |
HAM |
69 |
SPAM |
5 |
-64 |
| 1347 |
list |
HAM |
66 |
SPAM |
30 |
-36 |
| 547 |
deliveredto |
HAM |
63 |
SPAM |
34 |
-29 |
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
| 4261 |
faceverdana |
HAM |
0 |
SPAM |
274 |
274 |
| 2837 |
arial |
HAM |
0 |
SPAM |
272 |
272 |
| 4610 |
helvetica |
HAM |
0 |
SPAM |
271 |
271 |
| 3537 |
colorfont |
HAM |
0 |
SPAM |
270 |
270 |
| 4465 |
geneva |
HAM |
0 |
SPAM |
270 |
270 |
| 6405 |
sansseriffont |
HAM |
0 |
SPAM |
270 |
270 |
| 1916 |
received |
HAM |
299 |
SPAM |
144 |
-155 |
| 4345 |
font |
HAM |
0 |
SPAM |
120 |
120 |
| 2511 |
will |
HAM |
20 |
SPAM |
107 |
87 |
| 3161 |
brbr |
HAM |
0 |
SPAM |
106 |
106 |
HAM CLOUD
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('green'))

SPAM CLOUD
wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))

Function to calculate the spam score (positive means more likely to be spam…):
get_spam_score <- function(file_path){
content <- readLines(file_path)
one_string <- paste(content, collapse = ' ')
word_list <- strsplit(one_string, "\\W+")
dfx <- as.data.frame(word_list)
colnames(dfx) <- c("WORD")
dfx$WORD <- tolower(dfx$WORD)
total_score <- sum(all_df$SPAM_WEIGHT[dfx$WORD == all_df$TERM])
# Thought this should have given a sum of the SPAM_WEIGHT...some pos, some neg...but not sure why some NAs come back and the rest are 0s and 1s...
print(total_score)
}
Test with some HAM:
get_spam_score("hw11/easy_ham/01451.b5a50ca35f50e38d37a2eba47399f57d")
## [1] 0
get_spam_score("hw11/easy_ham/00677.b957e34b4dd0d9263b56bf71b1168d8a")
## [1] 0
get_spam_score("hw11/easy_ham/00765.ea01c46568902b1338c9685b55d77f6c")
## [1] 6
get_spam_score("hw11/easy_ham/01554.0aed12846b3981a2a13adf793083e4f0")
## [1] 0
get_spam_score("hw11/easy_ham/00831.dfa70bbdaef79d5863917ba90097ba7a")
## [1] 0
Test with some SPAM:
get_spam_score("hw11/spam_2/00028.60393e49c90f750226bee6381eb3e69d")
## [1] 45
get_spam_score("hw11/spam_2/00077.6e13224e39fae4b94bcbe0f5ae9f4939")
## [1] 5
get_spam_score("hw11/spam_2/00081.4c7fbdca38b8def54e276e75ec56682e")
## [1] 0
get_spam_score("hw11/spam_2/01326.32e7912cae22a40e7b27f7d020de08fe")
## [1] 0
get_spam_score("hw11/spam_2/01188.67d69a8d6e5c899914556488c8cbd2c9")
## [1] 0