Required Packages

if (!require('tm')) install.packages('tm')
if (!require('stopwords')) install.packages('stopwords')
if (!require('plyr')) install.packages('plyr')
if (!require('knitr')) install.packages('knitr')
if (!require('wordcloud')) install.packages('wordcloud')
if (!require('SnowballC')) install.packages('SnowballC')
library("RColorBrewer")
library(e1071)
library(dplyr)
library('ISOcodes')

Imports:

Function to Load Corpus:

# My computer crashes when I go higher than MAX_FILES...
MAX_FILES <- 40
# How many table rows to show for knitr tables
NUM_TABLE_ROWS <- 10
# how many terms to return on analysis
TOP_X_TERMS <- 999
# get the corpus given the directory
get_corpus <- function(the_dir){
  file_contents <- c()
  the_files <- list.files(path=the_dir, full.names = TRUE)
  head(the_files)
  i <- 0
  for (cur_file in the_files){
    if(i < MAX_FILES){
      current_content <- readLines(cur_file)
      file_contents <- c(file_contents, current_content)
      i <- (i+1)
    }
  }
  the_corpus <- Corpus(VectorSource(file_contents))
  return (the_corpus)
}

Create the SPAM/HAM Corpuses:

# Get the Ham and Spam Corpuses:
ham_corpus <- get_corpus("easy_ham/")
length(ham_corpus)

## [1] 3334

ham_corpus

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 3334

spam_corpus <- get_corpus("spam/")
length(spam_corpus)

## [1] 4951

spam_corpus

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4951

cleanfunction<- function(corpus) {
  toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
  tmp <- tm_map(corpus, toSpace, "\\n")
  tmp <- tm_map(tmp, toSpace, "<.*?>")
  tmp <- tm_map(tmp, toSpace, "^.*Subject: ")
  tmp <- tm_map(tmp, content_transformer(tolower))
  tmp <- tm_map(tmp, content_transformer(removePunctuation)) 
  tmp <- tm_map(tmp, content_transformer(removeNumbers))
  tmp <- tm_map(tmp,  removeWords, c(stopwords("english")))
  tmp <- tm_map(tmp, content_transformer(stemDocument))  
  tmp <- tm_map(tmp, content_transformer(stripWhitespace))
}

ham_corpus<-cleanfunction(ham_corpus)


spam_corpus<-cleanfunction(spam_corpus)

Filter the 2 Corpuses, and create Term Document Matrices:

# general filtering opts:
tdm_dtm_opts <- list(removePunctuation=TRUE, removeNumbers=TRUE, stripWhitespace=TRUE, tolower=TRUE, stopwords=TRUE, minWordLength = 2)

tdm_dtm_opts

## $removePunctuation
## [1] TRUE
## 
## $removeNumbers
## [1] TRUE
## 
## $stripWhitespace
## [1] TRUE
## 
## $tolower
## [1] TRUE
## 
## $stopwords
## [1] TRUE
## 
## $minWordLength
## [1] 2

# create the TDMs
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm

## <<TermDocumentMatrix (terms: 2725, documents: 4951)>>
## Non-/sparse entries: 9595/13481880
## Sparsity           : 100%
## Maximal term length: 76
## Weighting          : term frequency (tf)

ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm

## <<TermDocumentMatrix (terms: 2162, documents: 3334)>>
## Non-/sparse entries: 8706/7199402
## Sparsity           : 100%
## Maximal term length: 54
## Weighting          : term frequency (tf)

spam_corpus

## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 4951

Create Spam and Ham Data Frames:

spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spam_df, n = NUM_TABLE_ROWS))

TERM	TYPE_SPAM	SPAM_FREQ
amailbotwebd	SPAM	2
aug	SPAM	237
thu	SPAM	89
returnpath	SPAM	34
deliveredto	SPAM	36
zzzzlocalhostspamassassintaintorg	SPAM	34
localhost	SPAM	107
receiv	SPAM	238
esmtp	SPAM	93
phoboslabsspamassassintaintorg	SPAM	34

spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))

TERM	TYPE_HAM	HAM_FREQ
aug	HAM	394
exmhworkersadminredhatcom	HAM	2
thu	HAM	257
returnpath	HAM	40
deliveredto	HAM	63
zzzzlocalhostnetnoteinccom	HAM	40
localhost	HAM	139
receiv	HAM	302
dec	HAM	1
esmtp	HAM	142

ham_count <- nrow(ham_df)

Merge the Spam and Ham Data Frames:

# now hopefully merge them with no memory issues..
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# since this is like an outer join, fill the nulls with Zeros...
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'

Take a look at the SpamHam DataFrame sorted by HAM_FREQ desc, then SPAM_FREQ desc

all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))

	TERM	TYPE_HAM	HAM_FREQ	TYPE_SPAM	SPAM_FREQ	SPAM_WEIGHT
1	aug	HAM	394	SPAM	237	-157
8	receiv	HAM	302	SPAM	238	-64
3	thu	HAM	257	SPAM	89	-168
10	esmtp	HAM	142	SPAM	93	-49
7	localhost	HAM	139	SPAM	107	-32
12	postfix	HAM	72	SPAM	48	-24
72	list	HAM	69	SPAM	36	-33
236	group	HAM	67	SPAM	16	-51
5	deliveredto	HAM	63	SPAM	36	-27
1050	linux	HAM	60	SPAM	14	-46

kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))

	TERM	TYPE_HAM	HAM_FREQ	TYPE_SPAM	SPAM_FREQ	SPAM_WEIGHT
8	receiv	HAM	302	SPAM	238	-64
1	aug	HAM	394	SPAM	237	-157
926	option	HAM	3	SPAM	149	146
1568	fri	HAM	55	SPAM	140	85
2515	valu	HAM	0	SPAM	108	108
7	localhost	HAM	139	SPAM	107	-32
10	esmtp	HAM	142	SPAM	93	-49
3	thu	HAM	257	SPAM	89	-168
244	email	HAM	31	SPAM	66	35
284	report	HAM	5	SPAM	57	52

HAM CLOUD

wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('green'))

SPAM CLOUD

wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))

Untitled

VINAYAK PATEL

November 4, 2018

Assignment: Document Classification

Required Packages

Imports:

Function to Load Corpus:

Create the SPAM/HAM Corpuses:

Filter the 2 Corpuses, and create Term Document Matrices:

Create Spam and Ham Data Frames:

Merge the Spam and Ham Data Frames:

Take a look at the SpamHam DataFrame sorted by HAM_FREQ desc, then SPAM_FREQ desc

HAM CLOUD

SPAM CLOUD

Function to calculate the spam score (positive means more likely to be spam…):