It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
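A hedged sketch of how the corpus could be downloaded and unpacked (the archive file names below are assumptions; check the directory listing at the URL above for the current names):
base_url <- "https://spamassassin.apache.org/publiccorpus/"
archives <- c("20030228_easy_ham.tar.bz2", "20050311_spam_2.tar.bz2")  # assumed names
for (f in archives) {
  download.file(paste0(base_url, f), destfile = f)
  untar(f)  # extracts the easy_ham/ and spam_2/ directories used below
}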
Loading required packages
library(RTextTools)  # text classification utilities
library(tm)          # corpora and term-document matrices
library(SnowballC)   # stemming, used by stemDocument()
library(stringr)     # string manipulation
library(plyr)        # ddply() aggregation
library(kableExtra)  # table styling
library(knitr)       # kable() tables
library(wordcloud)   # word clouds
library(caret)       # data partitioning and model training
library(e1071)       # model back ends used by caret
library(rpart)       # decision trees
library(rpart.plot)  # decision-tree plots
Step I: Reading the HAM files and creating the corpus
While loading the full set of files my R session stopped responding, so I limited the number of files read to 100.
maxfiles <- 100
get_corpus <- function(the_dir){
  file_contents <- c()
  the_files <- list.files(path = the_dir, full.names = TRUE)
  i <- 0
  for (cur_file in the_files){
    if (i >= maxfiles) break  # stop once maxfiles files have been read
    # readLines() returns one element per line, so every line of every file
    # becomes its own document in the corpus built below
    current_content <- readLines(cur_file)
    file_contents <- c(file_contents, current_content)
    i <- i + 1
  }
  the_corpus <- Corpus(VectorSource(file_contents))
  return(the_corpus)
}
ham_corpus <- get_corpus("~/Desktop/MSDA/DATA 607/Project4/easy_ham/")
length(ham_corpus)
## [1] 8505
ham_corpus
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 8505
Step II: Reading the SPAM files and creating the corpus
spam_corpus <- get_corpus("~/Desktop/MSDA/DATA 607/Project4/spam_2/")
length(spam_corpus)
## [1] 16827
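Because get_corpus() stores each line as a separate document, the corpus lengths above (8505 and 16827) count lines rather than e-mail messages. A variant that treats each file as one document might look like the sketch below (an illustration only, not the function used for the counts above):
get_corpus_by_file <- function(the_dir){
  the_files <- head(list.files(path = the_dir, full.names = TRUE), maxfiles)
  # collapse each file's lines into a single string: one document per message
  docs <- sapply(the_files, function(f) paste(readLines(f), collapse = "\n"))
  Corpus(VectorSource(docs))
}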
Create term-document matrices for the SPAM and HAM corpora
spam_corpus <- tm_map(spam_corpus, removeNumbers)  # drop digits
spam_corpus <- tm_map(spam_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")  # replace punctuation with spaces
spam_corpus <- tm_map(spam_corpus, removeWords, words = stopwords("en"))  # drop English stopwords (runs before tolower, so capitalized stopwords survive)
spam_corpus <- tm_map(spam_corpus, content_transformer(tolower))  # lowercase
spam_corpus <- tm_map(spam_corpus, stemDocument)  # reduce words to stems
tdm <- TermDocumentMatrix(spam_corpus)
tdm
## <<TermDocumentMatrix (terms: 6220, documents: 16827)>>
## Non-/sparse entries: 55110/104608830
## Sparsity : 100%
## Maximal term length: 58
## Weighting : term frequency (tf)
ham_corpus <- tm_map(ham_corpus, removeNumbers)
ham_corpus <- tm_map(ham_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
ham_corpus <- tm_map(ham_corpus, removeWords, words = stopwords("en"))
ham_corpus <- tm_map(ham_corpus, content_transformer(tolower))
ham_corpus <- tm_map(ham_corpus, stemDocument)
tdm1 <- TermDocumentMatrix(ham_corpus)
tdm1
## <<TermDocumentMatrix (terms: 4253, documents: 8505)>>
## Non-/sparse entries: 33907/36137858
## Sparsity : 100%
## Maximal term length: 76
## Weighting : term frequency (tf)
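Both matrices are extremely sparse. As a quick spot check, tm's findFreqTerms() lists the terms above a frequency threshold (the cutoff of 100 below is an arbitrary choice for illustration):
findFreqTerms(tdm, lowfreq = 100)   # frequent terms in the SPAM matrix
findFreqTerms(tdm1, lowfreq = 100)  # frequent terms in the HAM matrix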
Create datasets from both matrices and summarize words with respect to their occurrence frequency in the SPAM and HAM corpora.
dataset_spam <- as.data.frame(as.table(tdm))  # long format: one row per term/document pair
dataset_spam$spam_ham <- "SPAM"
colnames(dataset_spam) <- c('Term','Spam_docs', 'Freq1', 'Type1')
dataset_spam <- subset(dataset_spam, select = -c(2) )  # drop the document index column
dataset_spam$Freq1[is.na(dataset_spam$Freq1)] <- '0'
Dataset for SPAM files
dataset_spam <- ddply(dataset_spam, .(Term, Type1), summarize, Freq1 = sum(as.numeric(Freq1)))
kable(head(dataset_spam))
| Term  | Type1 | Freq1 |
|:------|:------|------:|
| admin | SPAM  |    17 |
| aug   | SPAM  |    29 |
| from  | SPAM  |   246 |
| ilug  | SPAM  |    12 |
| linux | SPAM  |    18 |
| tue   | SPAM  |    86 |
Dataset for HAM files
dataset_ham <- as.data.frame(as.table(tdm1))
dataset_ham$spam_ham <- "HAM"
colnames(dataset_ham) <- c('Term', 'Ham_docs', 'Freq', 'Type')
dataset_ham <- subset(dataset_ham, select = -c(2) )
dataset_ham$Freq[is.na(dataset_ham$Freq)] <- '0'
dataset_ham <- ddply(dataset_ham, .(Term, Type), summarize, Freq = sum(as.numeric(Freq)))
kable(head(dataset_ham))
| Term   | Type | Freq |
|:-------|:-----|-----:|
| admin  | HAM  |  318 |
| aug    | HAM  |  683 |
| com    | HAM  | 1410 |
| exmh   | HAM  |   68 |
| from   | HAM  |  215 |
| redhat | HAM  |   55 |
Merge the two datasets on the 'Term' column.
megaset <- merge(x = dataset_ham, y = dataset_spam, by="Term", all = TRUE)
megaset$Freq1[is.na(megaset$Freq1)] <- '0'
megaset$Type1[is.na(megaset$Type1)] <- 'SPAM'
megaset$Freq[is.na(megaset$Freq)] <- '0'
megaset$Type[is.na(megaset$Type)] <- 'HAM'
megaset[is.na(megaset)] <- '0'
kable(head(megaset))
| Term   | Type | Freq | Type1 | Freq1 |
|:-------|:-----|-----:|:------|------:|
| admin  | HAM  |  318 | SPAM  |    17 |
| aug    | HAM  |  683 | SPAM  |    29 |
| com    | HAM  | 1410 | SPAM  |  1836 |
| exmh   | HAM  |   68 | SPAM  |     0 |
| from   | HAM  |  215 | SPAM  |   246 |
| redhat | HAM  |   55 | SPAM  |     0 |
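With the HAM and SPAM counts side by side, one illustrative use of the merged table is a per-term spam share (a suggested computation, not part of the original analysis):
megaset$Freq  <- as.numeric(megaset$Freq)
megaset$Freq1 <- as.numeric(megaset$Freq1)
# fraction of each term's total occurrences that fall in SPAM documents
megaset$spam_share <- megaset$Freq1 / (megaset$Freq + megaset$Freq1)
head(megaset[order(-megaset$spam_share), ])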
Wordclouds for the 'HAM' and 'SPAM' corpora
wordcloud(ham_corpus, max.words = 300, random.order = FALSE, colors=c('green'))
(Word cloud of the HAM corpus: up to 300 of the most frequent terms, in green.)
wordcloud(spam_corpus, max.words = 300, random.order = FALSE, colors=c('red'))
(Word cloud of the SPAM corpus: up to 300 of the most frequent terms, in red.)
I repeated the dataset-creation steps because the decision-tree model requires both datasets to share the same column names before they can be stacked with rbind().
dataset_spam <- as.data.frame(as.table(tdm))
dataset_spam$spam_ham <- "SPAM"
colnames(dataset_spam) <- c('Term','Spam_docs', 'Freq', 'Type')
dataset_spam <- subset(dataset_spam, select = -c(2) )
dataset_spam$Freq[is.na(dataset_spam$Freq)] <- '0'
dataset_spam <- ddply(dataset_spam, .(Term, Type), summarize, Freq = sum(as.numeric(Freq)))
dataset_ham <- as.data.frame(as.table(tdm1))
dataset_ham$spam_ham <- "HAM"
colnames(dataset_ham) <- c('Term', 'Ham_docs', 'Freq', 'Type')
dataset_ham <- subset(dataset_ham, select = -c(2) )
dataset_ham$Freq[is.na(dataset_ham$Freq)] <- '0'
dataset_ham <- ddply(dataset_ham, .(Term, Type), summarize, Freq = sum(as.numeric(Freq)))
megaset1 <- rbind(dataset_ham,dataset_spam)
megaset1$Type <- as.factor(megaset1$Type)
head(megaset1)
## Term Type Freq
## 1 admin HAM 318
## 2 aug HAM 683
## 3 com HAM 1410
## 4 exmh HAM 68
## 5 from HAM 215
## 6 redhat HAM 55
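Before modeling, a quick check of class balance in the stacked dataset can be useful (a suggested step, not in the original):
table(megaset1$Type)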
In order to predict whether a future document belongs to HAM or SPAM, I used a decision-tree model to draw predictions.
Analysis on the training dataset
*** As I ran these code chunks, the R session repeatedly aborted or hung, so I am unable to draw conclusions here. A likely cause is that Term is a factor with thousands of levels, which caret expands into thousands of dummy variables when fitting the tree. The algorithm and model were nevertheless studied carefully with drawing conclusions in mind. To allow the HTML document to knit, the commands below are commented out with #. ***
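The commented commands below also assume train and test data frames that were never created above. A minimal sketch of that split, using caret's createDataPartition() (the 70/30 proportion and the seed are my assumptions):
set.seed(607)  # assumed seed, for reproducibility
in_train <- createDataPartition(megaset1$Type, p = 0.7, list = FALSE)
train <- megaset1[in_train, ]
test  <- megaset1[-in_train, ]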
#fit_rpart <- train(Type ~ ., method = 'rpart', data = train)
Tree model
#rpart.plot(fit_rpart$finalModel)  # finalModel, not finalmodel
Drawing predictions
#prediction <- predict(fit_rpart, test)  # predict(), not Predict()
#confusionMatrix(prediction, test$Type, positive = 'SPAM')  # confusionMatrix(); the factor level is 'SPAM'
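As an alternative that sidesteps the enormous Term factor, a common layout is one row per document with term counts as columns. The sketch below is my suggestion rather than the author's method; it assumes the installed tm version supports c() on these corpora, and it uses e1071's naiveBayes(), already loaded above:
combined <- c(spam_corpus, ham_corpus)  # concatenate the two cleaned corpora
labels <- factor(c(rep("SPAM", length(spam_corpus)),
                   rep("HAM", length(ham_corpus))))
dtm <- removeSparseTerms(DocumentTermMatrix(combined), 0.99)  # drop very rare terms
m <- as.data.frame(as.matrix(dtm))
set.seed(607)
in_train <- createDataPartition(labels, p = 0.7, list = FALSE)
fit_nb <- naiveBayes(m[in_train, ], labels[in_train])
pred_nb <- predict(fit_nb, m[-in_train, ])
confusionMatrix(pred_nb, labels[-in_train], positive = "SPAM")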