This code trains classifiers on spam/ham email files; each model's accuracy is then tested on the held-out 30 percent of the data, a typical train/test split for classification models.
library(tm)
library(SnowballC)
library(RTextTools)
spam <- Corpus(DirSource("./spam_2", pattern = "[[:digit:]]"))
ham <- Corpus(DirSource("./easy_ham", pattern = "[[:digit:]]"))
#spam_test <- Corpus(DirSource("./spam", pattern = "[[:digit:]]"))
#ham_test <- Corpus(DirSource("./easy_ham_2", pattern = "[[:digit:]]"))
meta(spam, tag = "type") <- "spam"
meta(ham, tag = "type") <- "ham"
combined<-c(spam,ham,recursive = T)
head(summary(combined))
## Length Class Mode
## 00001.317e78fa8ee2f54cd4890fdc09ba8176 2 PlainTextDocument list
## 00002.9438920e9a55591b18e60d1ed37d992b 2 PlainTextDocument list
## 00003.590eff932f8704d8b0fcbe69d023b54d 2 PlainTextDocument list
## 00004.bdcc075fa4beb5157b5dd6cd41d8887b 2 PlainTextDocument list
## 00005.ed0aba4d386c5e62bc737cf3f0ed9589 2 PlainTextDocument list
## 00006.3ca1f399ccda5d897fecb8c57669a283 2 PlainTextDocument list
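As a quick sanity check (my addition, assuming the same tm version as above), you can confirm the corpus sizes and that the type tag was attached:
length(spam);length(ham);length(combined) # combined should equal spam + ham
head(meta(combined,"type")) # the first documents should carry the "spam" tag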
## Now clean a little
cleaned<-tm_map(combined,content_transformer(tolower)) # content_transformer preserves the corpus class in tm >= 0.6
cleaned<-tm_map(cleaned,removePunctuation)
cleaned<-tm_map(cleaned, stripWhitespace)
cleaned<-tm_map(cleaned,stemDocument)
cleaned<-tm_map(cleaned,PlainTextDocument)
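Two optional steps that are often worth adding (my suggestion, and ideally placed before the stemming call so stop words are still in dictionary form) are number and stop-word removal:
cleaned<-tm_map(cleaned,removeNumbers) # drop digit runs left over from mail headers
cleaned<-tm_map(cleaned,removeWords,stopwords("english")) # drop very common English words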
## To Document Term Matrix
dtm<-DocumentTermMatrix(cleaned)
head.matrix(dtm)
## <<DocumentTermMatrix (documents: 6, terms: 145463)>>
## Non-/sparse entries: 2113/870665
## Sparsity : 100%
## Maximal term length: 880
## Weighting : term frequency (tf)
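Before pruning, it can help to peek at the dominant vocabulary; findFreqTerms() lists terms above a frequency threshold (200 here is an arbitrary cutoff of mine):
findFreqTerms(dtm,200) # terms occurring at least 200 times across all documents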
## Remove sparse terms
dtm<-removeSparseTerms(dtm, .999)
head.matrix(dtm)
## <<DocumentTermMatrix (documents: 6, terms: 18726)>>
## Non-/sparse entries: 1585/110771
## Sparsity : 99%
## Maximal term length: 128
## Weighting : term frequency (tf)
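To quantify what the pruning did (a check I added, not part of the original output), compare the dimensions and density; a DocumentTermMatrix is a simple triplet matrix, so its $v slot holds only the non-zero entries:
dim(dtm) # documents x remaining terms
length(dtm$v)/(nrow(dtm)*ncol(dtm)) # fraction of non-zero cells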
## Extract each document's class label into a vector
types<-unlist(meta(cleaned, "type")[,1])
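It is worth verifying that the label vector lines up with the matrix (my addition):
table(types) # counts should match the spam and ham corpus sizes
length(types)==nrow(dtm) # labels must align one-to-one with the DTM rows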
Now, with RTextTools, we can run several models and compare their accuracy. But first we split the data into training and test sets, using index ranges derived from the combined length of spam + ham: the first 70 percent of documents train, the last 30 percent test. Note that combined is ordered spam first and ham second, so this positional split is not random; see the sketch after the container is built.
trmin<-1
trmax<-round(.7*(length(types)),digits=0)
temin<-trmax+1
temax<-length(types)
model_cntr<-create_container(dtm,labels=types,trainSize = trmin:trmax,testSize = temin:temax,virgin = FALSE)
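Because of that ordering, the test slice here is entirely ham. A minimal sketch of a randomized alternative (my suggestion, not part of the original run) shuffles a common index before building the container:
set.seed(42) # make the shuffle reproducible
shuffle<-sample(length(types)) # random permutation of document indices
model_cntr_s<-create_container(dtm[shuffle,],labels=types[shuffle],trainSize=trmin:trmax,testSize=temin:temax,virgin=FALSE)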
### Now run a couple of models on the training data (I was getting stack-overflow warnings on the RF and TREE models)
model_SVM<-train_model(model_cntr,algorithm = "SVM")
model_MAXENT<-train_model(model_cntr,algorithm = "MAXENT")
#model_TREE<-train_model(model_cntr,algorithm = "TREE")
## Model outputs
svm <- classify_model(model_cntr, model_SVM)
ent <- classify_model(model_cntr, model_MAXENT)
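RTextTools also provides a summary helper: create_analytics() cross-tabulates the container's true labels against one or more sets of classification results:
analytics<-create_analytics(model_cntr,cbind(svm,ent)) # combine both models' output
summary(analytics) # per-algorithm precision, recall, and F-scores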
Now we compare the true labels in the test sample against each model's predicted labels.
results<-data.frame("Test"=types[temin:temax],"SVM"=svm$SVM_LABEL,"MAXENT"=ent$MAXENTROPY_LABEL,stringsAsFactors = FALSE)
head(results)
## Test SVM MAXENT
## 1 ham ham ham
## 2 ham ham ham
## 3 ham ham ham
## 4 ham spam ham
## 5 ham ham ham
## 6 ham ham ham
Class_Acc_SVM<-sum(results$Test==results$SVM)/nrow(results)
Class_Acc_ENT<-sum(results$Test==results$MAXENT)/nrow(results)
Acc<-data.frame("SVM_Accuracy"=Class_Acc_SVM,"MAXENT_Accuracy"=Class_Acc_ENT)
Acc
## SVM_Accuracy MAXENT_Accuracy
## 1 0.849444 0.9760479
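Accuracy alone hides the direction of the errors; a confusion matrix per model (my addition) makes it explicit:
table(Actual=results$Test,Predicted=results$SVM) # SVM confusion matrix
table(Actual=results$Test,Predicted=results$MAXENT) # MAXENT confusion matrix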
So SVM reaches about 85 percent accuracy on this test data while MAXENT reaches nearly 98 percent, a big difference! This was a challenging but rewarding project: these packages give you a streamlined way to classify documents, once you learn them, of course.