Assignment: Document Classification

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.

Imports:

library(tm)
library(knitr)
library(plyr)
library(wordcloud)

Function to Load Corpus:

# Number of rows shown in each knitr table.
NUM_TABLE_ROWS <- 10
# Cap on how many terms the analysis returns.
TOP_X_TERMS <- 999
# Upper bound on the number of files read per corpus directory; the author's
# machine crashed with anything larger.
MAX_FILES <- 40

# Build a tm corpus from the text files in a directory.
#
# Reads at most MAX_FILES files, in the order returned by list.files(), and
# passes every line that was read to VectorSource(). Each LINE therefore
# becomes one element of the resulting corpus.
#
# Args:
#   the_dir: path of the directory holding the message files.
#
# Returns:
#   The object produced by Corpus(VectorSource(...)) over all lines read.
get_corpus <- function(the_dir) {
  the_files <- list.files(path = the_dir, full.names = TRUE)
  # Trim the listing up front rather than walking every file and skipping the
  # ones past the limit.
  n_files <- min(length(the_files), max(MAX_FILES, 0))
  the_files <- the_files[seq_len(n_files)]
  # Read everything in one pass; growing a vector with c() inside a loop
  # re-copies the accumulated lines on every iteration.
  file_contents <- unlist(lapply(the_files, readLines), use.names = FALSE)
  # NOTE(review): one corpus element per line (not per file) is kept on
  # purpose so downstream results do not change; confirm that is the intent.
  Corpus(VectorSource(file_contents))
}

Create the spam and ham corpora:

# Build the ham and spam corpora from their directories.
# get_corpus() stores every line it reads as a separate corpus element, so the
# lengths printed below count lines, not files.
ham_corpus <- get_corpus("hw11/easy_ham/")
length(ham_corpus)

## [1] 3334

# Print the ham corpus summary.
ham_corpus

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3334

# Same steps for the spam directory.
spam_corpus <- get_corpus("hw11/spam_2/")
length(spam_corpus)

## [1] 6276

# Print the spam corpus summary.
spam_corpus

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 6276

Filter the two corpora and create term-document matrices:

# Preprocessing options handed to TermDocumentMatrix() as its `control` list;
# the same list is used for both corpora.
# NOTE(review): `stripWhitespace` and `minWordLength` may not be control
# options the installed tm version recognises (newer releases use
# `wordLengths`) - confirm they are not being silently ignored.
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)

# Build the spam term-document matrix and print its summary.
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm

## <<TermDocumentMatrix (terms: 5963, documents: 6276)>>
## Non-/sparse entries: 16009/37407779
## Sparsity           : 100%
## Maximal term length: 251
## Weighting          : term frequency (tf)

# Build the ham term-document matrix with the same options and print it.
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm

## <<TermDocumentMatrix (terms: 2607, documents: 3334)>>
## Non-/sparse entries: 9308/8682430
## Sparsity           : 100%
## Maximal term length: 67
## Weighting          : term frequency (tf)

# Print the spam corpus summary.
spam_corpus

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 6276

Create Spam and Ham Data Frames:

# Per-term frequency table for the spam corpus, with columns TERM, TYPE_SPAM
# and SPAM_FREQ (the term's total count over all documents).
#
# The totals are summed straight from the sparse triplet representation of
# the matrix (`i` = term row index, `v` = count). This avoids expanding the
# matrix into one row per term/document pair, which is what makes the table
# grow past available memory as the corpus gets larger.
spam_terms <- Terms(spam_tdm)
spam_term_totals <- tapply(
  spam_tdm$v,
  factor(spam_tdm$i, levels = seq_along(spam_terms)),
  sum
)
# tapply() leaves NA for a term with no stored entries; that is a zero count.
spam_term_totals[is.na(spam_term_totals)] <- 0
spam_df <- data.frame(
  # Keep TERM as a factor whose level order is the matrix's term order.
  TERM = factor(spam_terms, levels = spam_terms),
  TYPE_SPAM = "SPAM",
  SPAM_FREQ = as.numeric(spam_term_totals),
  stringsAsFactors = FALSE
)
kable(head(spam_df, n = NUM_TABLE_ROWS))

TERM	TYPE_SPAM	SPAM_FREQ
“ecomog”	SPAM	1
“on	SPAM	2
aabaabhaceadbdc	SPAM	1
aaf	SPAM	1
aafcf	SPAM	1
ababbuoption	SPAM	4
abandoned	SPAM	1
abaqbv	SPAM	1
abbuoption	SPAM	2
abd	SPAM	1

# Number of rows in spam_df.
spam_count <- nrow(spam_df)

# Per-term frequency table for the ham corpus, with columns TERM, TYPE_HAM
# and HAM_FREQ (the term's total count over all documents).
#
# The totals are summed straight from the sparse triplet representation of
# the matrix (`i` = term row index, `v` = count). This avoids expanding the
# matrix into one row per term/document pair, which is what makes the table
# grow past available memory as the corpus gets larger.
ham_terms <- Terms(ham_tdm)
ham_term_totals <- tapply(
  ham_tdm$v,
  factor(ham_tdm$i, levels = seq_along(ham_terms)),
  sum
)
# tapply() leaves NA for a term with no stored entries; that is a zero count.
ham_term_totals[is.na(ham_term_totals)] <- 0
ham_df <- data.frame(
  # Keep TERM as a factor whose level order is the matrix's term order.
  TERM = factor(ham_terms, levels = ham_terms),
  TYPE_HAM = "HAM",
  HAM_FREQ = as.numeric(ham_term_totals),
  stringsAsFactors = FALSE
)
kable(head(ham_df, n = NUM_TABLE_ROWS))

TERM	TYPE_HAM	HAM_FREQ
aaf	HAM	1
abef	HAM	1
abilities	HAM	1
able	HAM	3
acc	HAM	1
access	HAM	2
accomplish	HAM	1
according	HAM	1
account	HAM	1
accounts	HAM	1

# Number of rows in ham_df.
ham_count <- nrow(ham_df)

Merge the Spam and Ham Data Frames:

# Outer-join the ham and spam term tables on TERM so that a term seen in only
# one of them still gets a row.
all_df <- merge(x = ham_df, y = spam_df, by = "TERM", all = TRUE)
# A term missing from one side comes back as NA. Fill frequencies with a
# numeric zero (not the string "0") so the columns stay numeric, and restore
# the constant type labels.
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- 0
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- "SPAM"
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- 0
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- "HAM"
# No catch-all fill is needed: TERM is the join key and the four columns above
# are the only ones the outer join can leave empty.

Take a look at the merged spam/ham data frame, sorted first by HAM_FREQ descending and then, in a second table, by SPAM_FREQ descending:

# Net weight per term: spam count minus ham count, so a positive value marks a
# term seen more often in spam than in ham.
all_df$SPAM_WEIGHT <- with(
  all_df,
  as.numeric(SPAM_FREQ) - as.numeric(HAM_FREQ)
)
# Show the NUM_TABLE_ROWS rows with the largest HAM_FREQ values.
kable(head(
  all_df[order(-as.numeric(all_df[["HAM_FREQ"]])), ],
  n = NUM_TABLE_ROWS
))

	TERM	TYPE_HAM	HAM_FREQ	TYPE_SPAM	SPAM_FREQ	SPAM_WEIGHT
149	aug	HAM	394	SPAM	28	-366
1916	received	HAM	299	SPAM	144	-155
2335	thu	HAM	257	SPAM	49	-208
716	esmtp	HAM	142	SPAM	65	-77
1369	localhost	HAM	139	SPAM	23	-116
2602	zzzzlocalhost	HAM	80	SPAM	0	-80
1816	postfix	HAM	72	SPAM	39	-33
1169	iluglinuxie	HAM	69	SPAM	5	-64
1347	list	HAM	66	SPAM	30	-36
547	deliveredto	HAM	63	SPAM	34	-29

# Show the NUM_TABLE_ROWS rows of all_df with the largest SPAM_FREQ values.
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))

	TERM	TYPE_HAM	HAM_FREQ	TYPE_SPAM	SPAM_FREQ	SPAM_WEIGHT
4261	faceverdana	HAM	0	SPAM	274	274
2837	arial	HAM	0	SPAM	272	272
4610	helvetica	HAM	0	SPAM	271	271
3537	colorfont	HAM	0	SPAM	270	270
4465	geneva	HAM	0	SPAM	270	270
6405	sansseriffont	HAM	0	SPAM	270	270
1916	received	HAM	299	SPAM	144	-155
4345	font	HAM	0	SPAM	120	120
2511	will	HAM	20	SPAM	107	87
3161	brbr	HAM	0	SPAM	106	106

HAM CLOUD

# Word cloud of the ham corpus: at most 200 words, drawn in green, with
# random placement order turned off.
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('green'))

SPAM CLOUD

# Word cloud of the spam corpus: at most 200 words, drawn in red, with
# random placement order turned off.
wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))

Function to calculate the spam score (positive means more likely to be spam…):

# Score a message file against the term weights in all_df.
#
# The file's text is lower-cased and split on runs of non-word characters.
# Every token found in all_df$TERM adds that term's SPAM_WEIGHT to the total,
# once per occurrence; tokens that are not in all_df add nothing.
#
# Args:
#   file_path: path of the message file to score.
#
# Returns:
#   The total score, which is also printed. A positive total means the
#   message leans towards spam, a negative one towards ham.
get_spam_score <- function(file_path) {
  content <- readLines(file_path)
  one_string <- paste(content, collapse = " ")
  words <- tolower(strsplit(one_string, "\\W+")[[1]])

  # match() looks each token up by value. Comparing with `==` would instead
  # pair the two vectors position by position (recycling the shorter one),
  # which yields stray hits and NAs rather than a lookup.
  term_idx <- match(words, all_df$TERM)
  # Tokens with no entry in all_df come back as NA and are dropped here.
  total_score <- sum(all_df$SPAM_WEIGHT[term_idx], na.rm = TRUE)
  print(total_score)
}

Test with some HAM:

# Score five messages taken from the ham directory; each call prints its total.
get_spam_score("hw11/easy_ham/01451.b5a50ca35f50e38d37a2eba47399f57d")

## [1] 0

get_spam_score("hw11/easy_ham/00677.b957e34b4dd0d9263b56bf71b1168d8a")

## [1] 0

get_spam_score("hw11/easy_ham/00765.ea01c46568902b1338c9685b55d77f6c")

## [1] 6

get_spam_score("hw11/easy_ham/01554.0aed12846b3981a2a13adf793083e4f0")

## [1] 0

get_spam_score("hw11/easy_ham/00831.dfa70bbdaef79d5863917ba90097ba7a")

## [1] 0

Test with some SPAM:

# Score five messages taken from the spam directory; each call prints its total.
get_spam_score("hw11/spam_2/00028.60393e49c90f750226bee6381eb3e69d")

## [1] 45

get_spam_score("hw11/spam_2/00077.6e13224e39fae4b94bcbe0f5ae9f4939")

## [1] 5

get_spam_score("hw11/spam_2/00081.4c7fbdca38b8def54e276e75ec56682e")

## [1] 0

get_spam_score("hw11/spam_2/01326.32e7912cae22a40e7b27f7d020de08fe")

## [1] 0

get_spam_score("hw11/spam_2/01188.67d69a8d6e5c899914556488c8cbd2c9")

## [1] 0

DA607 Week 11 Assignment

Dan Fanelli