Document Classification (Text Mining)

The task of this assignment is to classify documents using a set of already-classified training documents. For this purpose, the spam and ham corpora published at https://spamassassin.apache.org/publiccorpus/ are used.
(1) Load Libraries :
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(stringr)
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.3.2
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 3.3.2
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.3.2
## 
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
## 
##     getStemLanguages, wordStem
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.2
## Loading required package: RColorBrewer
(2) Download Documents and Load Them into a Corpus :
# Set Options
options(stringsAsFactors = FALSE)
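The corpus archives are assumed to already be unpacked into easy_ham, spam and spam_2 folders under the working directory. A minimal sketch of that download step (the archive file names are taken from the corpus listing and may change; check the site for current versions):
baseUrl  = "https://spamassassin.apache.org/publiccorpus/"
archives = c("20021010_easy_ham.tar.bz2",   # -> easy_ham/
             "20021010_spam.tar.bz2",       # -> spam/
             "20050311_spam_2.tar.bz2")     # -> spam_2/
for (archive in archives) {
  download.file(paste0(baseUrl, archive), archive)
  untar(archive, exdir = getwd())   # untar() auto-detects bzip2 compression
}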
(2.a) Create Corpus function :
# create a corpus from all numerically named files in a sub-directory
createCorpus = function(directoryName) {
  directory = file.path(getwd(), directoryName)
  corpus = Corpus(DirSource(directory=directory, pattern="\\d+"))
  print(length(corpus))   # number of documents read
  return(corpus)
}
(2.b) Clean Corpus function :
# clean corpus: lower-case first so that the case-sensitive stop-word removal
# also catches capitalised words such as "The"
cleanCorpus = function(corpus) {
  tempCorpus = corpus
  tempCorpus = tm_map(tempCorpus, tolower)
  tempCorpus = tm_map(tempCorpus, removeNumbers)
  tempCorpus = tm_map(tempCorpus, str_replace_all, pattern="[.]", replacement=" ")
  tempCorpus = tm_map(tempCorpus, removeWords, words=stopwords("en"))
  tempCorpus = tm_map(tempCorpus, stemDocument)
  tempCorpus = tm_map(tempCorpus, PlainTextDocument)   # restore document objects
  print(length(tempCorpus))   # number of documents cleaned
  return(tempCorpus)
}
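A side note on the tm API: plain string functions such as tolower and str_replace_all return bare character vectors when passed to tm_map, which is why the final PlainTextDocument step is needed to turn the results back into documents; the cost is that the original document IDs are lost (hence the character(0) row names in the matrix output further below). Wrapping each such function in content_transformer() is the alternative that keeps the metadata, e.g. (sketch, not run here):
tempCorpus = tm_map(tempCorpus, content_transformer(tolower))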
(2.c) Generate DocumentTermMatrix function :
# build a Document-Term Matrix; removeSparseTerms(dtm, 0.9) drops every term
# that is missing from more than 90% of the documents
generateDTM = function(corpus) {
  dtm = DocumentTermMatrix(corpus)
  dtm = removeSparseTerms(dtm, 0.9)
  inspect(dtm[1:10, 1:2])   # peek at the first ten documents and two terms
  return(dtm)
}
(2.d) Create Corpus :
hamCorpus = createCorpus("easy_ham")
## [1] 2500
spamCorpus = createCorpus("spam")
## [1] 500
meta(hamCorpus,  tag="type") = "ham"
meta(spamCorpus, tag="type") = "spam"

totalCorpus = c(spamCorpus, hamCorpus)
totalCorpus = cleanCorpus(totalCorpus)
## [1] 3000
metaList = factor(unlist(meta(totalCorpus, "type")))
table(metaList)
## metaList
##  ham spam 
## 2500  500
(2.e) Generate DocumentTermMatrix :
dtm = generateDTM(totalCorpus)
## <<DocumentTermMatrix (documents: 10, terms: 2)>>
## Non-/sparse entries: 10/10
## Sparsity           : 50%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           (debian)) (edt)
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     2
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     3
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     1
print(hamCorpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 2500
print(spamCorpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 500
metaListLength = length(metaList)
trainPartition = round(.75 * metaListLength)
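Because totalCorpus was built spam-first, this purely sequential 75/25 split puts every spam message into the training slice and leaves a test slice that contains no spam at all. A minimal sketch of the usual fix (not run here; the seed and index variable are my own), applied before building the container below, would shuffle the rows first:
set.seed(42)                            # assumed seed, for reproducibility only
shuffledIndex = sample(metaListLength)  # random permutation of the documents
dtm      = dtm[shuffledIndex, ]         # reorder the document-term matrix rows
metaList = metaList[shuffledIndex]      # keep the labels aligned with the rows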
(3) Train Model Using SVM and Classify :
(3.a) Make Container and Train Model :
container = create_container(dtm,
                             labels=as.numeric(metaList),
                             trainSize=1:trainPartition,
                             testSize=(trainPartition+1):metaListLength,
                             virgin=FALSE)

svmModel   = train_model(container, "SVM")
svmOut     = classify_model(container, svmModel)
svmSummary = create_analytics(container, svmOut)
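The analytics object is created above but never examined; a quick look at it with the standard RTextTools accessors (not run here) could be:
summary(svmSummary)   # precision, recall and F-score per label
head(svmOut)          # predicted SVM label and probability per test document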
(3.b) Test the Model on a Corpus That Is 100% Spam :
testCorpus = createCorpus("spam_2")
## [1] 1396
testCorpus = cleanCorpus(testCorpus)
## [1] 1396
testDtm = DocumentTermMatrix(testCorpus, control=list(dictionary=findFreqTerms(dtm)))
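This builds the spam_2 matrix over the training vocabulary (findFreqTerms(dtm) returns the terms kept in dtm), but the trained model is never actually applied to it. A minimal sketch of scoring it, assuming the variable names below (they are not part of the original), could be:
testSize      = length(testCorpus)
testContainer = create_container(testDtm, labels=rep(0, testSize),
                                 testSize=1:testSize, virgin=TRUE)
testOut       = classify_model(testContainer, svmModel)
table(testOut$SVM_LABEL)   # label 2 is "spam" in metaList's factor coding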
(4) Analyze Results
(4.a) Word Cloud (totalCorpus) :
wordcloud(totalCorpus, max.words = 100)

(4.b) Word Cloud (spamCorpus) :
wordcloud(spamCorpus, max.words = 100)

(4.c) Word Cloud (hamCorpus) :
wordcloud(hamCorpus,  max.words = 100)

(4.d) Word Cloud (testCorpus) :
wordcloud(testCorpus, max.words = 100)

(4.e) Frequency Plot (totalCorpus) :
wordMatrix1 = as.data.frame(t(as.matrix(dtm)))   # terms as rows, documents as columns
term_frequency1 = rowSums(wordMatrix1)           # total frequency of each term
term_frequency1 = sort(term_frequency1, decreasing = TRUE)
barplot(term_frequency1[1:10], col="blue", las = 2)   # ten most frequent terms

(4.f) Frequency Plot (testCorpus) :
wordMatrix2 = as.data.frame(t(as.matrix(testDtm)))   # terms as rows, documents as columns
term_frequency2 = rowSums(wordMatrix2)               # total frequency of each term
term_frequency2 = sort(term_frequency2, decreasing = TRUE)
barplot(term_frequency2[1:10], col="blue", las = 2)   # ten most frequent terms

(5) Conclusion :
(5.a) In this project, spam and ham messages from the SpamAssassin public corpus were used to train a Support Vector Machine (SVM) classifier, which was then tested against a second, all-spam data set. Text mining is an interesting field, and techniques such as those used here make it possible to extract useful information from unstructured documents.