Assignment:
It can be useful to be able to classify new “test” documents using already classified “training” documents.
A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).
One example corpus: https://spamassassin.apache.org/publiccorpus/
For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.
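For reference, the SpamAssassin archives can be downloaded and unpacked directly from R. A minimal sketch; the archive file name below is an assumption, so check the index page for the current listing:
# Hypothetical archive name -- verify against https://spamassassin.apache.org/publiccorpus/
url <- "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2"
download.file(url, destfile = "spam.tar.bz2")
untar("spam.tar.bz2", exdir = ".") # extracts the individual message files into ./spam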
library(tm)
## Loading required package: NLP
library(knitr)
library(plyr)
library(wordcloud)
## Loading required package: RColorBrewer
dir_spam <- "/Users/ashishsm1986/git/Cuny-Assignments/week10-assignment/spam/"
names_spam <- list.files(dir_spam)
head(names_spam)
## [1] "0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1"
## [2] "0001.bfc8d64d12b325ff385cca8d07b84288"
## [3] "0002.24b47bb3ce90708ae29d0aec1da08610"
## [4] "0003.4b3d943b8df71af248d12f8b2e7a224a"
## [5] "0004.1874ab60c71f0b31b580f313a3f6e777"
## [6] "0005.1f42bb885de0ef7fc5cd09d34dc2ba54"
lspam <- length(names_spam)
lspam
## [1] 501
dir_ham <- "/Users/ashishsm1986/git/Cuny-Assignments/week10-assignment/ham/"
names_ham <- list.files(dir_ham)
head(names_ham)
## [1] "0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "0004.e8d5727378ddde5c3be181df593f1712"
## [5] "0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "0006.ee8b0dba12856155222be180ba122058"
lham <- length(names_ham)
lham
## [1] 2551
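Since the assignment asks for predictions on withheld documents, one option is to hold out a slice of each folder's file names before building the corpora. A minimal sketch of an 80/20 split on the spam files (the proportion and seed are arbitrary choices, not part of the assignment):
set.seed(123)                        # make the split reproducible
train_idx  <- sample(seq_along(names_spam), floor(0.8 * lspam))
spam_train <- names_spam[train_idx]  # files used to build the training corpus
spam_test  <- names_spam[-train_idx] # files withheld for prediction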
Creating the Corpora:
#Function to create corpora
getcorpus <- function(dir, maxfiles){
  content <- c()
  the_files <- list.files(path = dir, full.names = TRUE)
  i <- 0
  for (cur_file in the_files){
    # The condition should be if (i < maxfiles), but my computer crashes when the
    # limit goes above 200, so the value is hardcoded (maxfiles is effectively unused)
    if (i <= 200){
      current_content <- readLines(cur_file)
      content <- c(content, current_content)
      i <- i + 1
    }
  }
  the_corpus <- Corpus(VectorSource(content))
  return(the_corpus)
}
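Note that readLines splits each message into individual lines and the corpus is built from the combined line vector, so every line becomes its own document; this is why the corpus lengths below are much larger than the file counts. A minimal alternative sketch that keeps one document per file, under the same directory layout:
getcorpus_by_file <- function(dir, maxfiles){
  the_files <- head(list.files(path = dir, full.names = TRUE), maxfiles)
  # Collapse each file's lines into one string so that one file = one document
  docs <- sapply(the_files, function(f) paste(readLines(f), collapse = "\n"))
  Corpus(VectorSource(docs))
}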
# Creating Spam Corpus
spam_corpus <- getcorpus(dir_spam,lspam)
length(spam_corpus)
## [1] 21043
spam_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 21043
# Creating Ham Corpus
ham_corpus <- getcorpus(dir_ham,lham)
length(ham_corpus)
## [1] 16823
ham_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 16823
Creating TermDocumentMatrices for the two corpora:
Sys.setlocale("LC_ALL", "C") # Was getting an encoding error and found this fix online. Note this sets the locale to "C"; it does not convert the text to UTF-8
## [1] "C/C/C/C/C/en_US.UTF-8"
tdm_dtm_opts <- list(removePunctuation = TRUE,
                     removeNumbers = TRUE,
                     stripWhitespace = TRUE,
                     tolower = TRUE,
                     stopwords = TRUE,
                     minWordLength = 2, # newer tm versions use wordLengths = c(2, Inf) instead
                     fileEncoding = "latin1")
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm
## <<TermDocumentMatrix (terms: 8163, documents: 21043)>>
## Non-/sparse entries: 71568/171702441
## Sparsity : 100%
## Maximal term length: 184
## Weighting : term frequency (tf)
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm
## <<TermDocumentMatrix (terms: 7554, documents: 16823)>>
## Non-/sparse entries: 65945/127014997
## Sparsity : 100%
## Maximal term length: 76
## Weighting : term frequency (tf)
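Before flattening the matrices into data frames, tm's findFreqTerms offers a quick sanity check by listing terms above a frequency threshold (the cutoff of 50 here is arbitrary):
head(findFreqTerms(spam_tdm, lowfreq = 50)) # terms appearing at least 50 times in spam
head(findFreqTerms(ham_tdm, lowfreq = 50))  # and in ham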
Creating Dataframes:
spamdf <- as.data.frame(as.table(spam_tdm))
spamdf$spam_ham <- "SPAM"
colnames(spamdf) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
head(spamdf)
## TERM SPAM_DOCS SPAM_FREQ TYPE_SPAM
## 1 bfcddbffccadb 1 1 SPAM
## 2 ffbcebefcdceaaa 1 0 SPAM
## 3 cdcbcfabae 1 0 SPAM
## 4 dfdeeaebdceaefa 1 0 SPAM
## 5 eddeaedb 1 0 SPAM
## 6 cbfedfedfbfa 1 0 SPAM
spamdf$SPAM_FREQ[is.na(spamdf$SPAM_FREQ)] <- 0 # numeric 0, not the string '0', which would coerce the column to character
spamdf <- ddply(spamdf, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spamdf))
TERM            | TYPE_SPAM | SPAM_FREQ
----------------|-----------|----------
bfcddbffccadb   | SPAM      | 1
ffbcebefcdceaaa | SPAM      | 1
cdcbcfabae      | SPAM      | 1
dfdeeaebdceaefa | SPAM      | 1
eddeaedb        | SPAM      | 1
cbfedfedfbfa    | SPAM      | 1
hamdf <- as.data.frame(as.table(ham_tdm))
hamdf$spam_ham <- "HAM"
colnames(hamdf) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
hamdf$HAM_FREQ[is.na(hamdf$HAM_FREQ)] <- 0 # numeric 0, not '0', to keep the column numeric
hamdf <- ddply(hamdf, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(hamdf))
TERM   | TYPE_HAM | HAM_FREQ
-------|----------|---------
admin  | HAM      | 536
aug    | HAM      | 1004
com    | HAM      | 3687
exmh   | HAM      | 539
redhat | HAM      | 271
thu    | HAM      | 564
Combining Dataframes:
combdf <- merge(x = hamdf, y = spamdf, by="TERM", all = TRUE)
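The merged table pairs each term's ham and spam counts, with NA where a term appears in only one class, which is the raw material for a per-term spam score. A minimal naive Bayes-style sketch; the add-one smoothing is an assumption, not something the assignment prescribes:
combdf$HAM_FREQ[is.na(combdf$HAM_FREQ)]   <- 0
combdf$SPAM_FREQ[is.na(combdf$SPAM_FREQ)] <- 0
# Smoothed log-likelihood ratio: positive leans spam, negative leans ham
combdf$SPAM_SCORE <- log((combdf$SPAM_FREQ + 1) / (sum(combdf$SPAM_FREQ) + nrow(combdf))) -
  log((combdf$HAM_FREQ + 1) / (sum(combdf$HAM_FREQ) + nrow(combdf)))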
Ham Wordcloud:
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('purple'))
Spam Wordcloud:
wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))
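To classify a withheld message, one rough approach is to sum the scores of its terms. The helper below is hypothetical and assumes the SPAM_SCORE column from the sketch above:
score_message <- function(path){
  text  <- tolower(paste(readLines(path), collapse = " "))
  words <- unlist(strsplit(text, "[^a-z]+")) # crude tokenization on non-letters
  sum(combdf$SPAM_SCORE[match(words, combdf$TERM)], na.rm = TRUE) # > 0 suggests spam
}
# Example call on the first spam file (path is illustrative):
# score_message(file.path(dir_spam, names_spam[1]))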