It is often useful to classify new “test” documents using already-classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

library(tm)         # corpus handling and text cleaning
library(knitr)      # kable tables
library(plyr)       # ddply aggregation
library(wordcloud)  # word clouds

Set up the data storage directory

#getwd()
setwd("../data/")

Load the data directly from the URLs

spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_spam.tar.bz2"
spam2_url <- "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
hard_ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"
easy_ham_url <- "https://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"

#Download tarballs (the source files are bzip2-compressed, so name them accordingly)
download.file(spam_url, destfile="spam.tar.bz2")
download.file(spam2_url, destfile="spam2.tar.bz2")
download.file(hard_ham_url, destfile="hardham.tar.bz2")
download.file(easy_ham_url, destfile="easyham.tar.bz2")

#Extract tarballs
untar("spam.tar.bz2")
untar("spam2.tar.bz2")
untar("hardham.tar.bz2")
untar("easyham.tar.bz2")

#Create spam and ham corpus
spam <- Corpus(DirSource("spam"))
spam2 <- Corpus(DirSource("spam_2"))
easy_ham <- Corpus(DirSource("easy_ham"))
hard_ham <- Corpus(DirSource("hard_ham"))
# length(spam)
# length(spam2)
# length(easy_ham)

Cleaning and filtering the text files

tdm_dtm_opts <- list(removePunctuation = TRUE, removeNumbers = TRUE, stripWhitespace = TRUE,
                     tolower = TRUE, stopwords = TRUE, minWordLength = 2)
# removePunctuation = TRUE already strips ASCII punctuation (including quotes, pipes, and backticks),
# so no extra rules are needed; non-ASCII "curly" quotes are not covered, which is why a few such
# terms survive in the ham table below
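
The same cleaning could instead be applied to a corpus directly with tm_map before building the matrices; a rough equivalent of the control options above (clean_corpus is just an illustrative helper):

clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))       # lower-case all text
  corpus <- tm_map(corpus, removePunctuation)                  # drop punctuation
  corpus <- tm_map(corpus, removeNumbers)                      # drop digits
  corpus <- tm_map(corpus, removeWords, stopwords("english"))  # drop common English stopwords
  tm_map(corpus, stripWhitespace)                              # collapse runs of whitespace
}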

#Remove cmds file
if (file.exists("easy_ham/cmds")) file.remove("easy_ham/cmds")
## [1] TRUE
if (file.exists("hard_ham/cmds")) file.remove("hard_ham/cmds")
## [1] TRUE
if (file.exists("spam/cmds")) file.remove("spam/cmds")
## [1] TRUE
#Add meta labels
meta(spam, tag = "type") <- "spam"
meta(easy_ham, tag = "type") <- "easy_ham"
meta(hard_ham, tag = "type") <- "hard_ham"

Creating the term-document matrix

# Reload the spam corpus now that the cmds index file has been removed
spam <- Corpus(DirSource("spam"))


#Combine corpus objects.

spam_corpus <- c(spam, recursive=T)
ham_corpus <- c(easy_ham, recursive=T)
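
Note that spam2 and hard_ham are downloaded and loaded above but not used below; if desired, they could be folded into the combined corpora the same way. An optional sketch:

# Optional: include the second spam set and the harder ham set as well
# spam_corpus <- c(spam, spam2, recursive = TRUE)
# ham_corpus  <- c(easy_ham, hard_ham, recursive = TRUE)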


#Create reduced and randomized corpus

spam_corpus_sample <- sample(spam_corpus, 500)
ham_corpus_sample <- sample(ham_corpus, 500)
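
sample() draws a different 500 documents on every knit, so the tables below change from run to run. Setting a seed first (the value is arbitrary) makes the samples reproducible; a sketch:

set.seed(1234)  # arbitrary fixed seed so the 500-document samples are repeatable
spam_corpus_sample <- sample(spam_corpus, 500)
ham_corpus_sample <- sample(ham_corpus, 500)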


#Build term-document matrices. 

# spam_tdm <- DocumentTermMatrix(spam_corpus_sample)
# ham_tdm <- DocumentTermMatrix(ham_corpus_sample)


spam_tdm <- TermDocumentMatrix(spam_corpus_sample, control = tdm_dtm_opts)
ham_tdm <- TermDocumentMatrix(ham_corpus_sample, control = tdm_dtm_opts)
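
A quick sanity check on the matrices helps confirm the cleaning worked; for example, tm's findFreqTerms() lists terms above a frequency threshold (the 200 cutoff here is arbitrary):

dim(spam_tdm)                           # terms x documents
findFreqTerms(spam_tdm, lowfreq = 200)  # terms appearing at least 200 times across the spam sample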

Create Spam and Ham Data Frames:

spam_df <- as.data.frame(as.table(spam_tdm))
spam_df$spam_ham <- "SPAM"
colnames(spam_df) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
spam_df <- subset(spam_df, select = -c(2) )
spam_df$SPAM_FREQ[is.na(spam_df$SPAM_FREQ)] <- '0'
spam_df <- ddply(spam_df, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
NUM_TABLE_ROWS <-10
kable(head(spam_df, n = NUM_TABLE_ROWS))
TERM                      TYPE_SPAM    SPAM_FREQ
------------------------  ----------  ----------
bb                        SPAM                 3
balbvipmail               SPAM                 3
bckkonshbptbwlshyb        SPAM                 3
bckonshkfclrdhbbcbb       SPAM                 3
bdbvipmailbkbgo           SPAM                 3
benpitmwjjgddruklkjonrb   SPAM                 3
benpitmwjjonrb            SPAM                 3
bfmanaknib                SPAM                 6
bgawjb                    SPAM                 6
bgnpqhkkmb                SPAM                 3
spam_count <- nrow(spam_df)
ham_df <- as.data.frame(as.table(ham_tdm))
ham_df$spam_ham <- "HAM"
colnames(ham_df) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
ham_df <- subset(ham_df, select = -c(2) )
ham_df$HAM_FREQ[is.na(ham_df$HAM_FREQ)] <- '0'
ham_df <- ddply(ham_df, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(ham_df, n = NUM_TABLE_ROWS))
TERM             TYPE_HAM    HAM_FREQ
---------------  ---------  ---------
“big             HAM                1
“doggy”          HAM                1
“during          HAM                1
“lynx            HAM                1
aaa              HAM                1
aaaaaccbef       HAM                1
aaaaaccfbbdaeb   HAM                1
aaaaaddeecb      HAM                1
aaaacecacae      HAM                1
aaaadbcdc        HAM                1
ham_count <- nrow(ham_df)
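
As an aside, the same per-term totals could also be read straight off each term-document matrix without the ddply step, e.g.:

spam_totals <- rowSums(as.matrix(spam_tdm))      # total frequency of each term across the spam sample
head(sort(spam_totals, decreasing = TRUE), 10)   # most frequent spam terms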

Combine both data frames on the common TERM column

# Merge the two data frames on TERM (a full outer join)
all_df <- merge(x = ham_df, y = spam_df, by="TERM", all = TRUE)
# The outer join leaves NAs for terms seen in only one corpus; fill them with zeros and default labels
all_df$SPAM_FREQ[is.na(all_df$SPAM_FREQ)] <- '0'
all_df$TYPE_SPAM[is.na(all_df$TYPE_SPAM)] <- 'SPAM'
all_df$HAM_FREQ[is.na(all_df$HAM_FREQ)] <- '0'
all_df$TYPE_HAM[is.na(all_df$TYPE_HAM)] <- 'HAM'
all_df[is.na(all_df)] <- '0'
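
With both frequency columns in one frame, a smoothed per-term spam ratio is one simple way to express how spam-leaning each term is. A sketch with add-one smoothing (SPAM_RATIO is not part of the original analysis):

# Values near 1 suggest a term appears mostly in spam, near 0 mostly in ham
all_df$SPAM_RATIO <- (as.numeric(all_df$SPAM_FREQ) + 1) /
  (as.numeric(all_df$SPAM_FREQ) + as.numeric(all_df$HAM_FREQ) + 2)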

Show the combined data frame (top terms by ham frequency, then by spam frequency)

all_df$SPAM_WEIGHT <- as.numeric(all_df$SPAM_FREQ) - as.numeric(all_df$HAM_FREQ)
kable(head(all_df[order(-as.numeric(all_df$HAM_FREQ)), ], n=NUM_TABLE_ROWS))
         TERM            TYPE_HAM    HAM_FREQ  TYPE_SPAM    SPAM_FREQ   SPAM_WEIGHT
-------  --------------  ---------  ---------  ----------  ----------  ------------
16364    received        HAM             2772  SPAM              2490          -282
17304    sep             HAM             1797  SPAM              2128           331
8275     esmtp           HAM             1667  SPAM              1118          -549
13501    localhost       HAM             1489  SPAM              1178          -311
14907    oct             HAM             1156  SPAM                56         -1100
1619     aug             HAM              933  SPAM              1173           240
15743    postfix         HAM              912  SPAM               595          -317
12692    ist             HAM              848  SPAM               740          -108
14296    mon             HAM              838  SPAM               674          -164
12812    jmlocalhost     HAM              818  SPAM                22          -796
kable(head(all_df[order(-as.numeric(all_df$SPAM_FREQ)), ], n=NUM_TABLE_ROWS))
         TERM             TYPE_HAM    HAM_FREQ  TYPE_SPAM    SPAM_FREQ   SPAM_WEIGHT
-------  ---------------  ---------  ---------  ----------  ----------  ------------
16364    received         HAM             2772  SPAM              2490          -282
17304    sep              HAM             1797  SPAM              2128           331
46903    widthd           HAM                0  SPAM              1523          1523
18401    table            HAM                5  SPAM              1196          1191
13501    localhost        HAM             1489  SPAM              1178          -311
1619     aug              HAM              933  SPAM              1173           240
19782    width            HAM                1  SPAM              1157          1156
8275     esmtp            HAM             1667  SPAM              1118          -549
20273    zzzzlocalhost    HAM              122  SPAM               908           786
9715     font             HAM                1  SPAM               899           898
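
These per-term weights are enough to sketch a naive scorer for a new message: sum the SPAM_WEIGHT of the terms it contains and treat a positive total as spam. This is only an illustration of the idea, not a fitted classifier, and score_message is a hypothetical helper:

score_message <- function(text, weights = all_df) {
  # split the message into lower-case word tokens, roughly matching the cleaning above
  tokens <- unlist(strsplit(tolower(text), "[^a-z]+"))
  tokens <- tokens[nchar(tokens) > 1]
  # look up each token's SPAM_WEIGHT; unseen tokens contribute nothing
  w <- weights$SPAM_WEIGHT[match(tokens, weights$TERM)]
  sum(w, na.rm = TRUE)
}
# score_message("click here for a free font table width offer")  # positive score suggests spam
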
### HAM Word Cloud
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('blue'))
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <mailto:fork-request@xent.com?subject=subscribe> could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <mailto:fork-request@xent.com?subject=unsubscribe> could not be fit on
## page. It will not be plotted.
## Warning in wordcloud(ham_corpus, max.words = 200, random.order = FALSE, :
## <http://lists.freshrpms.net/mailman/listinfo/rpm-zzzlist>, could not be fit
## on page. It will not be plotted.

### SPAM Word Cloud

wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))
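
The two clouds can also be drawn as a single comparison cloud from the merged frequencies, which makes the ham-vs-spam contrast easier to see. A sketch using wordcloud's comparison.cloud(); the frequencies come from the 500-document samples above:

freq_matrix <- cbind(HAM = as.numeric(all_df$HAM_FREQ),
                     SPAM = as.numeric(all_df$SPAM_FREQ))
rownames(freq_matrix) <- as.character(all_df$TERM)
comparison.cloud(freq_matrix, max.words = 100, colors = c("blue", "red"))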

summary(all_df)
##          TERM         TYPE_HAM           HAM_FREQ        
##  <U+0093>big      :    1   Length:48847       Length:48847      
##  <U+0093>doggy<U+0094>   :    1   Class :character   Class :character  
##  <U+0093>during   :    1   Mode  :character   Mode  :character  
##  <U+0093>lynx     :    1                                        
##  aaa       :    1                                        
##  aaaaaccbef:    1                                        
##  (Other)   :48841                                        
##   TYPE_SPAM          SPAM_FREQ          SPAM_WEIGHT       
##  Length:48847       Length:48847       Min.   :-1100.000  
##  Class :character   Class :character   1st Qu.:   -1.000  
##  Mode  :character   Mode  :character   Median :    1.000  
##                                        Mean   :    1.462  
##                                        3rd Qu.:    1.000  
##                                        Max.   : 1523.000  
##