Document Classification (Text Mining)

The task of this assignment is to classify documents using a set of already-classified training documents. For this purpose, the spam and ham corpora published at https://spamassassin.apache.org/publiccorpus/ are used.
(1) Load Libraries :
library(tm)
## Warning: package 'tm' was built under R version 3.3.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.2
library(stringr)
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.3.2
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 3.3.2
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.3.2
## 
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
## 
##     getStemLanguages, wordStem
library(caret)
## Warning: package 'caret' was built under R version 3.3.2
## Loading required package: lattice
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.2
## Loading required package: RColorBrewer
(2) Download Documents and Load Them into a Corpus :
# Set Options
options(stringsAsFactors = FALSE)
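The corpus archives are assumed to already be unpacked into easy_ham, spam and spam_2 folders under the working directory. A minimal sketch of that download step (the archive file names are taken from the corpus listing and may change; check the site for current versions):
baseUrl  = "https://spamassassin.apache.org/publiccorpus/"
archives = c("20021010_easy_ham.tar.bz2",   # -> easy_ham/
             "20021010_spam.tar.bz2",       # -> spam/
             "20050311_spam_2.tar.bz2")     # -> spam_2/
for (archive in archives) {
  download.file(paste0(baseUrl, archive), archive)
  untar(archive, exdir = getwd())   # untar() auto-detects bzip2 compression
}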
(2.a) Create Corpus function :
# create a corpus from all numerically named files in a sub-directory
createCorpus = function(directoryName) {
  directory = file.path(getwd(), directoryName)
  corpus = Corpus(DirSource(directory=directory, pattern="\\d+"))
  print(length(corpus))   # number of documents read
  return(corpus)
}
(2.b) Clean Corpus function :
# clean corpus: lower-case first so that the case-sensitive stop-word removal
# also catches capitalised words such as "The"
cleanCorpus = function(corpus) {
  tempCorpus = corpus
  tempCorpus = tm_map(tempCorpus, tolower)
  tempCorpus = tm_map(tempCorpus, removeNumbers)
  tempCorpus = tm_map(tempCorpus, str_replace_all, pattern="[.]", replacement=" ")
  tempCorpus = tm_map(tempCorpus, removeWords, words=stopwords("en"))
  tempCorpus = tm_map(tempCorpus, stemDocument)
  tempCorpus = tm_map(tempCorpus, PlainTextDocument)   # restore document objects
  print(length(tempCorpus))   # number of documents cleaned
  return(tempCorpus)
}
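A side note on the tm API: plain string functions such as tolower and str_replace_all return bare character vectors when passed to tm_map, which is why the final PlainTextDocument step is needed to turn the results back into documents; the cost is that the original document IDs are lost (hence the character(0) row names in the matrix output further below). Wrapping each such function in content_transformer() is the alternative that keeps the metadata, e.g. (sketch, not run here):
tempCorpus = tm_map(tempCorpus, content_transformer(tolower))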
(2.c) Generate DocumentTermMatrix function :
# build a Document-Term Matrix; removeSparseTerms(dtm, 0.9) drops every term
# that is missing from more than 90% of the documents
generateDTM = function(corpus) {
  dtm = DocumentTermMatrix(corpus)
  dtm = removeSparseTerms(dtm, 0.9)
  inspect(dtm[1:10, 1:2])   # peek at the first ten documents and two terms
  return(dtm)
}
(2.d) Create Corpus :
hamCorpus = createCorpus("easy_ham")
## [1] 2500
spamCorpus = createCorpus("spam")
## [1] 500
meta(hamCorpus,  tag="type") = "ham"
meta(spamCorpus, tag="type") = "spam"

totalCorpus = c(spamCorpus, hamCorpus)
totalCorpus = cleanCorpus(totalCorpus)
## [1] 3000
metaList = factor(unlist(meta(totalCorpus, "type")))
table(metaList)
## metaList
##  ham spam 
## 2500  500
(2.e) Generate DocumentTermMatrix :
dtm = generateDTM(totalCorpus)
## <<DocumentTermMatrix (documents: 10, terms: 2)>>
## Non-/sparse entries: 10/10
## Sparsity           : 50%
## Maximal term length: 9
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           (debian)) (edt)
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     2
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     3
##   character(0)         0     1
##   character(0)         0     1
##   character(0)         0     1
print(hamCorpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 2500
print(spamCorpus)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 1
## Content:  documents: 500
metaListLength = length(metaList)
trainPartition = round(.75 * metaListLength)
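Because totalCorpus was built spam-first, this purely sequential 75/25 split puts every spam message into the training slice and leaves a test slice that contains no spam at all. A minimal sketch of the usual fix (not run here; the seed and index variable are my own), applied before building the container below, would shuffle the rows first:
set.seed(42)                            # assumed seed, for reproducibility only
shuffledIndex = sample(metaListLength)  # random permutation of the documents
dtm      = dtm[shuffledIndex, ]         # reorder the document-term matrix rows
metaList = metaList[shuffledIndex]      # keep the labels aligned with the rows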
(3) Train Model Using SVM and Classify :
(3.a) Make Container and Train Model :
container = create_container(dtm,
                             labels=as.numeric(metaList),
                             trainSize=1:trainPartition,
                             testSize=(trainPartition+1):metaListLength,
                             virgin=FALSE)

svmModel   = train_model(container, "SVM")
svmOut     = classify_model(container, svmModel)
svmSummary = create_analytics(container, svmOut)
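The analytics object is created above but never examined; a quick look at it with the standard RTextTools accessors (not run here) could be:
summary(svmSummary)   # precision, recall and F-score per label
head(svmOut)          # predicted SVM label and probability per test document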
(3.b) Test the Model on a Corpus That Is 100% Spam :
testCorpus = createCorpus("spam_2")
## [1] 1396
testCorpus = cleanCorpus(testCorpus)
## [1] 1396
testDtm = DocumentTermMatrix(testCorpus, control=list(dictionary=findFreqTerms(dtm)))
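This builds the spam_2 matrix over the training vocabulary (findFreqTerms(dtm) returns the terms kept in dtm), but the trained model is never actually applied to it. A minimal sketch of scoring it, assuming the variable names below (they are not part of the original), could be:
testSize      = length(testCorpus)
testContainer = create_container(testDtm, labels=rep(0, testSize),
                                 testSize=1:testSize, virgin=TRUE)
testOut       = classify_model(testContainer, svmModel)
table(testOut$SVM_LABEL)   # label 2 is "spam" in metaList's factor coding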
(4) Analyze Results
(4.a) Word Cloud (totalCorpus) :
wordcloud(totalCorpus, max.words = 100)

(4.b) Word Cloud (spamCorpus) :
wordcloud(spamCorpus, max.words = 100)

(4.c) Word Cloud (hamCorpus) :
wordcloud(hamCorpus,  max.words = 100)

(4.d) Word Cloud (testCorpus) :
wordcloud(testCorpus, max.words = 100)

(4.e) Frequency Plot (totalCorpus) :
wordMatrix1 = as.data.frame(t(as.matrix(dtm)))   # terms as rows, documents as columns
term_frequency1 = rowSums(wordMatrix1)           # total frequency of each term
term_frequency1 = sort(term_frequency1, decreasing = TRUE)
barplot(term_frequency1[1:10], col="blue", las = 2)   # ten most frequent terms

(4.f) Frequency Plot (testCorpus) :
wordMatrix2 = as.data.frame(t(as.matrix(testDtm)))   # terms as rows, documents as columns
term_frequency2 = rowSums(wordMatrix2)               # total frequency of each term
term_frequency2 = sort(term_frequency2, decreasing = TRUE)
barplot(term_frequency2[1:10], col="blue", las = 2)   # ten most frequent terms

(5) Conclusion :
(5.a) In this project, spam and ham messages from the SpamAssassin public corpus were used to train a Support Vector Machine (SVM) classifier, which was then tested against a second, all-spam data set. Text mining is an interesting field, and techniques such as those used here make it possible to extract useful information from unstructured documents.