It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, start with a spam/ham dataset and predict the class of new documents (either withheld from the training dataset or drawn from another source, such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

Loading required packages

library(RTextTools)
library(tm)
library(SnowballC)
library(stringr)
library(plyr)
library(kableExtra)
library(knitr)
library(wordcloud)
library(caret)
library(e1071)
library(rpart)
library(rpart.plot)

Step I: Reading the HAM files and creating the corpus

While loading the full set of files, my R session stopped responding, so I limited the number of files read to 100.

maxfiles <- 100

get_corpus <- function(the_dir){
  # Read up to `maxfiles` files from the directory. Each line read becomes
  # one element of `file_contents`, and therefore one document in the corpus.
  file_contents <- c()
  the_files <- list.files(path = the_dir, full.names = TRUE)
  i <- 0
  for (cur_file in the_files){
    if (i >= maxfiles) break
    file_contents <- c(file_contents, readLines(cur_file))
    i <- i + 1
  }
  the_corpus <- Corpus(VectorSource(file_contents))
  return(the_corpus)
}
ham_corpus <- get_corpus("~/Desktop/MSDA/DATA 607/Project4/easy_ham/")
length(ham_corpus)
## [1] 8505
ham_corpus
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 8505
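Note that the corpus holds 8,505 documents even though only 100 files were read: readLines() returns one element per line, and VectorSource() turns each element into its own document. If each e-mail should instead be a single document, a variant like the hypothetical get_corpus_by_file() below would do it (a sketch, not the function used in this write-up):

# Sketch: read each file as ONE document by collapsing its lines, so that
# length(corpus) equals the number of files read (hypothetical variant).
get_corpus_by_file <- function(the_dir, maxfiles = 100){
  the_files <- head(list.files(path = the_dir, full.names = TRUE), maxfiles)
  docs <- sapply(the_files, function(f) paste(readLines(f, warn = FALSE), collapse = "\n"))
  Corpus(VectorSource(docs))
}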

Step II: Reading the SPAM files

spam_corpus <- get_corpus("~/Desktop/MSDA/DATA 607/Project4/spam_2/")
length(spam_corpus)
## [1] 16827

Create term-document matrices for the SPAM and HAM corpora

# Normalize the SPAM corpus: lower-case first (so capitalized stopwords are
# caught), then strip numbers and punctuation, drop stopwords, and stem.
spam_corpus <- tm_map(spam_corpus, content_transformer(tolower))
spam_corpus <- tm_map(spam_corpus, removeNumbers)
spam_corpus <- tm_map(spam_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
spam_corpus <- tm_map(spam_corpus, removeWords, words = stopwords("en"))
spam_corpus <- tm_map(spam_corpus, stemDocument)

tdm <- TermDocumentMatrix(spam_corpus)
tdm
## <<TermDocumentMatrix (terms: 6220, documents: 16827)>>
## Non-/sparse entries: 55110/104608830
## Sparsity           : 100%
## Maximal term length: 58
## Weighting          : term frequency (tf)
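The matrix is almost entirely sparse, which is typical for text data. If memory becomes an issue (as it did later in this project), tm's removeSparseTerms() can shrink it; the 0.99 threshold below is illustrative, not a value used in the analysis above:

# Keep only terms present in at least ~1% of documents (illustrative cutoff).
tdm_small <- removeSparseTerms(tdm, sparse = 0.99)
tdm_small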
# Apply the same normalization pipeline to the HAM corpus.
ham_corpus <- tm_map(ham_corpus, content_transformer(tolower))
ham_corpus <- tm_map(ham_corpus, removeNumbers)
ham_corpus <- tm_map(ham_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
ham_corpus <- tm_map(ham_corpus, removeWords, words = stopwords("en"))
ham_corpus <- tm_map(ham_corpus, stemDocument)

tdm1 <- TermDocumentMatrix(ham_corpus)
tdm1
## <<TermDocumentMatrix (terms: 4253, documents: 8505)>>
## Non-/sparse entries: 33907/36137858
## Sparsity           : 100%
## Maximal term length: 76
## Weighting          : term frequency (tf)
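As a quick sanity check on both matrices, tm's findFreqTerms() lists terms above a frequency cutoff; the value 50 below is arbitrary, chosen for illustration:

findFreqTerms(tdm, lowfreq = 50)   # frequent terms in the SPAM corpus
findFreqTerms(tdm1, lowfreq = 50)  # frequent terms in the HAM corpus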

Create data frames from both matrices and summarize each term's occurrence frequency in the SPAM and HAM corpora.

dataset_spam <- as.data.frame(as.table(tdm))
dataset_spam$spam_ham <- "SPAM"
colnames(dataset_spam) <- c('Term', 'Spam_docs', 'Freq1', 'Type1')
dataset_spam <- subset(dataset_spam, select = -c(2))
dataset_spam$Freq1[is.na(dataset_spam$Freq1)] <- 0  # numeric zero, not the string '0'

Dataset for SPAM files

dataset_spam <- ddply(dataset_spam, .(Term, Type1), summarize, Freq1 = sum(as.numeric(Freq1)))
kable(head(dataset_spam))
Term    Type1   Freq1
admin   SPAM       17
aug     SPAM       29
from    SPAM      246
ilug    SPAM       12
linux   SPAM       18
tue     SPAM       86
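To peek at the heaviest spam vocabulary rather than the alphabetical head, the summarized frame can be sorted by frequency (a small illustrative step, not part of the original pipeline):

# Five most frequent SPAM terms by total count:
kable(head(dataset_spam[order(-dataset_spam$Freq1), ], 5))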

Dataset for HAM files

dataset_ham <- as.data.frame(as.table(tdm1))
dataset_ham$spam_ham <- "HAM"
colnames(dataset_ham) <- c('Term', 'Ham_docs', 'Freq', 'Type')
dataset_ham <- subset(dataset_ham, select = -c(2))
dataset_ham$Freq[is.na(dataset_ham$Freq)] <- 0  # numeric zero, not the string '0'
dataset_ham <- ddply(dataset_ham, .(Term, Type), summarize, Freq = sum(as.numeric(Freq)))
kable(head(dataset_ham))
Term     Type   Freq
admin    HAM     318
aug      HAM     683
com      HAM    1410
exmh     HAM      68
from     HAM     215
redhat   HAM      55

Merge the two datasets on the shared 'Term' column.

megaset <- merge(x = dataset_ham, y = dataset_spam, by = "Term", all = TRUE)
# Terms present in only one corpus get NA counts after the outer join;
# fill those with zero and label the missing side.
megaset$Freq1[is.na(megaset$Freq1)] <- 0
megaset$Type1[is.na(megaset$Type1)] <- 'SPAM'
megaset$Freq[is.na(megaset$Freq)] <- 0
megaset$Type[is.na(megaset$Type)] <- 'HAM'
kable(head(megaset))
Term     Type   Freq   Type1   Freq1
admin    HAM     318   SPAM       17
aug      HAM     683   SPAM       29
com      HAM    1410   SPAM     1836
exmh     HAM      68   SPAM        0
from     HAM     215   SPAM      246
redhat   HAM      55   SPAM        0
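With both counts side by side, a rough per-term "spam share" can be computed from the merged frame. This is a minimal sketch using the Freq/Freq1 columns created above, not a step from the original analysis:

# Share of each term's occurrences that fall in the SPAM corpus; terms with
# a share near 1 are strong spam indicators in this sample.
megaset$spam_share <- as.numeric(megaset$Freq1) /
  (as.numeric(megaset$Freq) + as.numeric(megaset$Freq1))
kable(head(megaset[order(-megaset$spam_share), c("Term", "spam_share")]))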

Wordclouds for the 'SPAM' and 'HAM' corpora

wordcloud(ham_corpus, max.words = 300, random.order = FALSE, colors=c('green'))

wordcloud(spam_corpus, max.words = 300, random.order = FALSE, colors=c('red'))
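A single comparative cloud is an alternative to the two separate clouds above: wordcloud's comparison.cloud() expects a term-by-group frequency matrix, which can be assembled from the merged frame (a sketch under that assumption, not part of the original analysis):

# Build a terms x {HAM, SPAM} frequency matrix and plot one comparison cloud.
m <- cbind(HAM = as.numeric(megaset$Freq), SPAM = as.numeric(megaset$Freq1))
rownames(m) <- as.character(megaset$Term)
comparison.cloud(m, max.words = 100, colors = c("green", "red"))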

I repeated the dataset-creation step because the decision-tree model below needs both datasets to share the same column names.

# Rebuild both data frames with identical column names (Term, Type, Freq)
# so they can be stacked with rbind().
dataset_spam <- as.data.frame(as.table(tdm))
dataset_spam$spam_ham <- "SPAM"
colnames(dataset_spam) <- c('Term', 'Spam_docs', 'Freq', 'Type')
dataset_spam <- subset(dataset_spam, select = -c(2))
dataset_spam$Freq[is.na(dataset_spam$Freq)] <- 0
dataset_spam <- ddply(dataset_spam, .(Term, Type), summarize, Freq = sum(as.numeric(Freq)))
dataset_ham <- as.data.frame(as.table(tdm1))
dataset_ham$spam_ham <- "HAM"
colnames(dataset_ham) <- c('Term', 'Ham_docs', 'Freq', 'Type')
dataset_ham <- subset(dataset_ham, select = -c(2))
dataset_ham$Freq[is.na(dataset_ham$Freq)] <- 0
dataset_ham <- ddply(dataset_ham, .(Term, Type), summarize, Freq = sum(as.numeric(Freq)))
megaset1 <- rbind(dataset_ham,dataset_spam)
megaset1$Type <- as.factor(megaset1$Type)
head(megaset1)
##     Term Type Freq
## 1  admin  HAM  318
## 2    aug  HAM  683
## 3    com  HAM 1410
## 4   exmh  HAM   68
## 5   from  HAM  215
## 6 redhat  HAM   55
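Before splitting, it is worth checking the class balance of the stacked frame (a one-line check, added for illustration):

table(megaset1$Type)  # row counts for HAM vs. SPAM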

In order to predict whether a future document belongs in HAM or SPAM, I used a decision-tree model to draw predictions.

Step 1: Forming "train" and "test" datasets with a 75% partition ratio

set.seed(123)  # fix the RNG seed so the random split is reproducible (arbitrary value)
smp_size <- floor(0.75 * nrow(megaset1))
train_ind <- sample(seq_len(nrow(megaset1)), size = smp_size)

train <- megaset1[train_ind, ]
test <- megaset1[-train_ind, ]
dim(train)
## [1] 7854    3
dim(test)
## [1] 2619    3
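Since caret is already loaded, createDataPartition() offers a stratified alternative that preserves the HAM/SPAM proportions in both partitions. The sketch below uses hypothetical names (train2, test2) so it does not clobber the frames above:

set.seed(607)  # arbitrary seed, for reproducibility
idx <- createDataPartition(megaset1$Type, p = 0.75, list = FALSE)
train2 <- megaset1[idx, ]
test2  <- megaset1[-idx, ]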

Analysis on the training dataset

As I ran these code chunks, my R session repeatedly hung or aborted, so I am unable to draw conclusions here. I studied the algorithm and the regression model carefully for drawing conclusions, and the commands below are commented out with # so the HTML document can still be knit.

#fit_rpart <- train(Type~.,method = 'rpart',data = train)

Tree model

#rpart.plot(fit_rpart$finalModel)

Drawing predictions

#prediction <- predict(fit_rpart, test)
#confusionMatrix(prediction, test$Type, positive = 'SPAM')
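For completeness, here is a sketch of what the masked workflow would look like using rpart directly (the same model family the caret call above would fit). It uses Freq as the sole predictor because the high-cardinality Term factor is a likely cause of the session hangs; this is untested here, not the author's verified result:

# Hypothetical end-to-end run of the masked steps:
fit <- rpart(Type ~ Freq, data = train, method = "class")
rpart.plot(fit)                                       # visualize the fitted tree
pred <- predict(fit, newdata = test, type = "class")  # class predictions on the test set
confusionMatrix(pred, test$Type, positive = "SPAM")   # accuracy, sensitivity, specificity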