1 Source and Reference

Automated Data Collection With R, Chapter 10, S. Munzert, C. Rubba, et. al.

2 Include R libraries

library(tm)

## Loading required package: NLP

library(stringr)
library(SnowballC)
library(RTextTools)

## Loading required package: SparseM

## 
## Attaching package: 'SparseM'

## The following object is masked from 'package:base':
## 
##     backsolve

## 
## Attaching package: 'RTextTools'

## The following objects are masked from 'package:SnowballC':
## 
##     getStemLanguages, wordStem

3 Create Text Corpus For Both Ham and Spam Files (Emails)

The spam and ham files were downloaded, and unzipped from https://spamassassin.apache.org/publiccorpus/.

# Build absolute paths to the unzipped SpamAssassin folders instead of calling
# setwd(), so the session's working directory is left untouched.
dataDir <- "C:/data"
pathToHam <- file.path(dataDir, "easy_ham")
pathToSpam <- file.path(dataDir, "spam_2")

# Load the ham emails: every file in the ham folder whose name matches "\\d+"
# is read as plain text.
hamSource <- DirSource(directory = pathToHam, pattern = "\\d+")
corpusHam <- Corpus(hamSource, readerControl = list(reader = readPlain))

# Number of ham documents loaded.
length(corpusHam)

## [1] 2500

# Load the spam emails: every file in the spam folder whose name matches
# "\\d+" is read as plain text.
spamSource <- DirSource(directory = pathToSpam, pattern = "\\d+")
corpusSpam <- Corpus(spamSource, readerControl = list(reader = readPlain))

# Number of spam documents loaded.
length(corpusSpam)

## [1] 1396

4 Create Meta Tag “emailtype”

Differentiate the email files by type and combine the two corpora (ham and spam emails).

# Tag every document with its class before the two corpora are merged.
meta(corpusHam, tag = "emailtype") <- "Ham"
meta(corpusSpam, tag = "emailtype") <- "Spam"

# Stack the corpora: ham documents first, then spam.
corpusHS <- c(corpusHam, corpusSpam)

# Combined document count.
length(corpusHS)

## [1] 3896

5 Transform Corpus To A Term Document Matrix

Apply text-cleansing transformations to the combined corpus with tm_map.

# Clean the combined corpus. Plain character functions (tolower,
# str_replace_all) are wrapped in content_transformer() so that tm_map() keeps
# every element a text document instead of a bare character vector.

# Lower-case first: stopwords("en") is all lower case, so removing stop words
# before lower-casing would leave capitalised ones ("The", "And") behind.
corpusHS <- tm_map(corpusHS, content_transformer(tolower))

corpusHS <- tm_map(corpusHS, removeNumbers)

# Replace punctuation with a space so adjoining tokens are not glued together.
corpusHS <- tm_map(
  corpusHS,
  content_transformer(str_replace_all),
  pattern = "[[:punct:]]",
  replacement = " "
)

corpusHS <- tm_map(corpusHS, removeWords, words = stopwords("en"))

corpusHS <- tm_map(corpusHS, stemDocument)

# Optional: inspect a term-document matrix of the cleaned corpus.
# HS_tdm <- TermDocumentMatrix(corpusHS)
# HS_tdm <- removeSparseTerms(HS_tdm, 1 - (10 / length(corpusHS)))
# HS_tdm

6 Transform Corpus To A Document Term Matrix

Convert the cleaned documents to plain text documents, shuffle them, and build a document-term matrix with sparse terms removed.

# Make sure every element is a PlainTextDocument before building the matrix.
corpusHS <- tm_map(corpusHS, PlainTextDocument)

# Shuffle the documents so ham and spam are interleaved before the positional
# train/test split made later. The seed is fixed so that the split, and
# therefore every accuracy figure reported below, is reproducible.
set.seed(607)
corpusHSR <- sample(corpusHS)

HS_dtm <- DocumentTermMatrix(corpusHSR)

# Drop very rare terms: the sparsity threshold is chosen so that, roughly,
# only terms occurring in more than 10 documents are kept.
HS_dtm <- removeSparseTerms(HS_dtm, 1 - (10 / length(corpusHSR)))
HS_dtm

## <<DocumentTermMatrix (documents: 3896, terms: 6961)>>
## Non-/sparse entries: 613091/26506965
## Sparsity           : 98%
## Maximal term length: 70
## Weighting          : term frequency (tf)

7 Create A Container From The Document Term Matrix

Divide the corpus into training and test datasets. Create training models for classification algorithms. In addition, create corresponding test datasets for each of the classification algorithms.

# Preview the class tags of the first shuffled documents.
head(meta(corpusHSR), n = 6L)

##      emailtype
## 1034       Ham
## 2259       Ham
## 153        Ham
## 3004      Spam
## 983        Ham
## 1845       Ham

# Pull the "emailtype" tag of every shuffled document and store it as a factor.
emailTypeMeta <- meta(corpusHSR, "emailtype")
HS_labels <- as.factor(unlist(emailTypeMeta[, 1]))

# Confirm the labels are a factor.
class(HS_labels)

## [1] "factor"

# Total number of documents in the shuffled corpus.
N <- nrow(meta(corpusHSR))

# Rows 1-3000 of the matrix train the models; rows 3001-N are held out for
# testing. virgin = FALSE because the test rows have known labels.
container <- create_container(
  HS_dtm,
  labels = HS_labels,
  trainSize = 1:3000,
  testSize = 3001:N,
  virgin = FALSE
)

# List the slots of the container object.
slotNames(container)

## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"

# Fit one model per algorithm on the training portion of the container.
svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")
boost_model <- train_model(container, "BOOSTING")
# Not run:
# bagg_model <- train_model(container, "BAGGING")
# nnet_model <- train_model(container, "NNET")

# Predict the held-out portion of the container with each fitted model.
svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
boost_out <- classify_model(container, boost_model)
# Not run:
# bagg_out <- classify_model(container, bagg_model)
# nnet_out <- classify_model(container, nnet_model)

8 Analyze Results For The Classification Algorithms

Determine which of the classification algorithms is the most accurate.

# First SVM predictions: predicted label and its probability.
head(svm_out, n = 6L)

##   SVM_LABEL  SVM_PROB
## 1      Spam 0.9099099
## 2      Spam 0.9674704
## 3       Ham 0.9983343
## 4      Spam 1.0000000
## 5      Spam 1.0000000
## 6       Ham 0.9998960

# First tree predictions: predicted label and its probability.
head(tree_out, n = 6L)

##   TREE_LABEL TREE_PROB
## 1       Spam 0.9969789
## 2       Spam 0.9969789
## 3        Ham 1.0000000
## 4       Spam 0.9969789
## 5       Spam 0.9969789
## 6        Ham 1.0000000

# First maximum-entropy predictions: predicted label and its probability.
head(maxent_out, n = 6L)

##   MAXENTROPY_LABEL MAXENTROPY_PROB
## 1             Spam               1
## 2             Spam               1
## 3              Ham               1
## 4             Spam               1
## 5             Spam               1
## 6              Ham               1

# First boosting predictions: predicted label and its probability.
head(boost_out, n = 6L)

##   LOGITBOOST_LABEL LOGITBOOST_PROB
## 1             Spam       1.0000000
## 2             Spam       1.0000000
## 3              Ham       1.0000000
## 4             Spam       0.9999999
## 5             Spam       0.9999999
## 6              Ham       1.0000000

# Line up the true labels of the held-out emails (documents 3001..N) with the
# label predicted by each classifier (first column of each *_out data frame).
# Column order matters: column 1 is the truth, columns 2-5 are the models.
labels_out <- data.frame(
  correct_label = HS_labels[3001:N],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  boost = as.character(boost_out[, 1]),
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  stringsAsFactors = FALSE
)

8.1 SVM Performance

# Count of SVM predictions that miss (FALSE) or match (TRUE) the true label.
table(labels_out[["correct_label"]] == labels_out[["svm"]])

## 
## FALSE  TRUE 
##     6   890

# Same comparison for SVM, expressed as proportions of the test set.
prop.table(table(labels_out[["correct_label"]] == labels_out[["svm"]]))

## 
##       FALSE        TRUE 
## 0.006696429 0.993303571

8.2 Decision Tree Performance

# Count of tree predictions that miss (FALSE) or match (TRUE) the true label.
table(labels_out[["correct_label"]] == labels_out[["tree"]])

## 
## FALSE  TRUE 
##    15   881

# Same comparison for the tree model, expressed as proportions of the test set.
prop.table(table(labels_out[["correct_label"]] == labels_out[["tree"]]))

## 
##      FALSE       TRUE 
## 0.01674107 0.98325893

8.3 Maximum Entropy Performance

# Count of maximum-entropy predictions that miss (FALSE) or match (TRUE) the
# true label.
table(labels_out[["correct_label"]] == labels_out[["maxent"]])

## 
## FALSE  TRUE 
##     5   891

# Same comparison for maximum entropy, expressed as proportions of the test set.
prop.table(table(labels_out[["correct_label"]] == labels_out[["maxent"]]))

## 
##       FALSE        TRUE 
## 0.005580357 0.994419643

8.4 Boosting Performance

# Count of boosting predictions that miss (FALSE) or match (TRUE) the true
# label.
table(labels_out[["correct_label"]] == labels_out[["boost"]])

## 
## FALSE  TRUE 
##     9   887

# Same comparison for boosting, expressed as proportions of the test set.
prop.table(table(labels_out[["correct_label"]] == labels_out[["boost"]]))

## 
##      FALSE       TRUE 
## 0.01004464 0.98995536

9 Conclusion

All 4 supervised machine learning algorithms performed well on the given training and testing datasets. In the run shown above, the Maximum Entropy algorithm performed best, misclassifying only 5 of the 896 test emails, followed by SVM (6 of 896) and Boosting (9 of 896). The tree-based classifier (RTextTools "TREE") was worst, misclassifying 15 of the 896 emails. The exact counts depend on the random shuffle used to form the training/test split.

Data 607 Week 11 HW Text Mining

Antonio J Bayquen

April 10, 2016