Assignment:
It can be useful to be able to classify new “test” documents using already classified “training” documents.
A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).
One example corpus: https://spamassassin.apache.org/publiccorpus/
For more adventurous students, you are welcome (encouraged!) to come up with a different set of documents (including scraped web pages!?) that have already been classified (e.g. tagged), then analyze these documents to predict how new documents should be classified.
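For reference, the SpamAssassin archives can be downloaded and unpacked directly from R. A minimal sketch; the archive file name below is an assumption, so check the index page for the current listing:
# Hypothetical archive name -- verify against https://spamassassin.apache.org/publiccorpus/
url <- "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2"
download.file(url, destfile = "spam.tar.bz2")
untar("spam.tar.bz2", exdir = ".") # extracts the individual message files into ./spam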
library(tm)
## Loading required package: NLP
library(knitr)
library(plyr)
library(wordcloud)
## Loading required package: RColorBrewer
dir_spam <- "/Users/ashishsm1986/git/Cuny-Assignments/week10-assignment/spam/"
names_spam <- list.files(dir_spam)
head(names_spam)
## [1] "0000.7b1b73cf36cf9dbc3d64e3f2ee2b91f1"
## [2] "0001.bfc8d64d12b325ff385cca8d07b84288"
## [3] "0002.24b47bb3ce90708ae29d0aec1da08610"
## [4] "0003.4b3d943b8df71af248d12f8b2e7a224a"
## [5] "0004.1874ab60c71f0b31b580f313a3f6e777"
## [6] "0005.1f42bb885de0ef7fc5cd09d34dc2ba54"
lspam <- length(names_spam)
lspam
## [1] 501
dir_ham <- "/Users/ashishsm1986/git/Cuny-Assignments/week10-assignment/ham/"
names_ham <- list.files(dir_ham)
head(names_ham)
## [1] "0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "0004.e8d5727378ddde5c3be181df593f1712"
## [5] "0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "0006.ee8b0dba12856155222be180ba122058"
lham <- length(names_ham)
lham
## [1] 2551
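Since the assignment asks for predictions on withheld documents, one option is to hold out a slice of each folder's file names before building the corpora. A minimal sketch of an 80/20 split on the spam files (the proportion and seed are arbitrary choices, not part of the assignment):
set.seed(123)                        # make the split reproducible
train_idx  <- sample(seq_along(names_spam), floor(0.8 * lspam))
spam_train <- names_spam[train_idx]  # files used to build the training corpus
spam_test  <- names_spam[-train_idx] # files withheld for prediction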
Creating the Corpora:
#Function to create corpora
getcorpus <- function(dir, maxfiles){
  content <- c()
  the_files <- list.files(path = dir, full.names = TRUE)
  i <- 0
  for (cur_file in the_files){
    # The condition should be if (i < maxfiles), but my computer crashes when the
    # limit goes above 200, so the value is hardcoded (maxfiles is effectively unused)
    if (i <= 200){
      current_content <- readLines(cur_file)
      content <- c(content, current_content)
      i <- i + 1
    }
  }
  the_corpus <- Corpus(VectorSource(content))
  return(the_corpus)
}
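Note that readLines splits each message into individual lines and the corpus is built from the combined line vector, so every line becomes its own document; this is why the corpus lengths below are much larger than the file counts. A minimal alternative sketch that keeps one document per file, under the same directory layout:
getcorpus_by_file <- function(dir, maxfiles){
  the_files <- head(list.files(path = dir, full.names = TRUE), maxfiles)
  # Collapse each file's lines into one string so that one file = one document
  docs <- sapply(the_files, function(f) paste(readLines(f), collapse = "\n"))
  Corpus(VectorSource(docs))
}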
# Creating Spam Corpus
spam_corpus <- getcorpus(dir_spam,lspam)
length(spam_corpus)
## [1] 21043
spam_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 21043
# Creating Ham Corpus
ham_corpus <- getcorpus(dir_ham,lham)
length(ham_corpus)
## [1] 16823
ham_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 16823
Creating TermDocumentMatrices for the two corpora:
Sys.setlocale("LC_ALL", "C") # Was getting an encoding error and found this fix online. Note this sets the locale to "C"; it does not convert the text to UTF-8
## [1] "C/C/C/C/C/en_US.UTF-8"
tdm_dtm_opts <- list(removePunctuation = TRUE,
                     removeNumbers = TRUE,
                     stripWhitespace = TRUE,
                     tolower = TRUE,
                     stopwords = TRUE,
                     minWordLength = 2, # newer tm versions use wordLengths = c(2, Inf) instead
                     fileEncoding = "latin1")
spam_tdm <- TermDocumentMatrix(spam_corpus,control=tdm_dtm_opts)
spam_tdm
## <<TermDocumentMatrix (terms: 8163, documents: 21043)>>
## Non-/sparse entries: 71568/171702441
## Sparsity : 100%
## Maximal term length: 184
## Weighting : term frequency (tf)
ham_tdm <- TermDocumentMatrix(ham_corpus,control=tdm_dtm_opts)
ham_tdm
## <<TermDocumentMatrix (terms: 7554, documents: 16823)>>
## Non-/sparse entries: 65945/127014997
## Sparsity : 100%
## Maximal term length: 76
## Weighting : term frequency (tf)
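Before flattening the matrices into data frames, tm's findFreqTerms offers a quick sanity check by listing terms above a frequency threshold (the cutoff of 50 here is arbitrary):
head(findFreqTerms(spam_tdm, lowfreq = 50)) # terms appearing at least 50 times in spam
head(findFreqTerms(ham_tdm, lowfreq = 50))  # and in ham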
Creating Dataframes:
spamdf <- as.data.frame(as.table(spam_tdm))
spamdf$spam_ham <- "SPAM"
colnames(spamdf) <- c('TERM', 'SPAM_DOCS', 'SPAM_FREQ', 'TYPE_SPAM')
head(spamdf)
## TERM SPAM_DOCS SPAM_FREQ TYPE_SPAM
## 1 bfcddbffccadb 1 1 SPAM
## 2 ffbcebefcdceaaa 1 0 SPAM
## 3 cdcbcfabae 1 0 SPAM
## 4 dfdeeaebdceaefa 1 0 SPAM
## 5 eddeaedb 1 0 SPAM
## 6 cbfedfedfbfa 1 0 SPAM
spamdf$SPAM_FREQ[is.na(spamdf$SPAM_FREQ)] <- 0 # numeric 0, not the string '0', which would coerce the column to character
spamdf <- ddply(spamdf, .(TERM, TYPE_SPAM), summarize, SPAM_FREQ = sum(as.numeric(SPAM_FREQ)))
kable(head(spamdf))
TERM            | TYPE_SPAM | SPAM_FREQ
----------------|-----------|----------
bfcddbffccadb   | SPAM      | 1
ffbcebefcdceaaa | SPAM      | 1
cdcbcfabae      | SPAM      | 1
dfdeeaebdceaefa | SPAM      | 1
eddeaedb        | SPAM      | 1
cbfedfedfbfa    | SPAM      | 1
hamdf <- as.data.frame(as.table(ham_tdm))
hamdf$spam_ham <- "HAM"
colnames(hamdf) <- c('TERM', 'HAM_DOCS', 'HAM_FREQ', 'TYPE_HAM')
hamdf$HAM_FREQ[is.na(hamdf$HAM_FREQ)] <- 0 # numeric 0, not '0', to keep the column numeric
hamdf <- ddply(hamdf, .(TERM, TYPE_HAM), summarize, HAM_FREQ = sum(as.numeric(HAM_FREQ)))
kable(head(hamdf))
TERM   | TYPE_HAM | HAM_FREQ
-------|----------|---------
admin  | HAM      | 536
aug    | HAM      | 1004
com    | HAM      | 3687
exmh   | HAM      | 539
redhat | HAM      | 271
thu    | HAM      | 564
Combining Dataframes:
combdf <- merge(x = hamdf, y = spamdf, by="TERM", all = TRUE)
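The merged table pairs each term's ham and spam counts, with NA where a term appears in only one class, which is the raw material for a per-term spam score. A minimal naive Bayes-style sketch; the add-one smoothing is an assumption, not something the assignment prescribes:
combdf$HAM_FREQ[is.na(combdf$HAM_FREQ)]   <- 0
combdf$SPAM_FREQ[is.na(combdf$SPAM_FREQ)] <- 0
# Smoothed log-likelihood ratio: positive leans spam, negative leans ham
combdf$SPAM_SCORE <- log((combdf$SPAM_FREQ + 1) / (sum(combdf$SPAM_FREQ) + nrow(combdf))) -
  log((combdf$HAM_FREQ + 1) / (sum(combdf$HAM_FREQ) + nrow(combdf)))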
Ham Wordcloud:
wordcloud(ham_corpus, max.words = 200, random.order = FALSE, colors=c('purple'))
Spam Wordcloud:
wordcloud(spam_corpus, max.words = 200, random.order = FALSE, colors=c('red'))
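To classify a withheld message, one rough approach is to sum the scores of its terms. The helper below is hypothetical and assumes the SPAM_SCORE column from the sketch above:
score_message <- function(path){
  text  <- tolower(paste(readLines(path), collapse = " "))
  words <- unlist(strsplit(text, "[^a-z]+")) # crude tokenization on non-letters
  sum(combdf$SPAM_SCORE[match(words, combdf$TERM)], na.rm = TRUE) # > 0 suggests spam
}
# Example call on the first spam file (path is illustrative):
# score_message(file.path(dir_spam, names_spam[1]))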