This code trains classifiers on spam/ham email files; each model's accuracy is then tested on the held-out 30 percent of the data, a typical train/test split for classification models.
library(tm)
library(SnowballC)
library(RTextTools)
spam <- Corpus(DirSource("./spam_2", pattern = "[[:digit:]]"))
ham <- Corpus(DirSource("./easy_ham", pattern = "[[:digit:]]"))
#spam_test <- Corpus(DirSource("./spam", pattern = "[[:digit:]]"))
#ham_test <- Corpus(DirSource("./easy_ham_2", pattern = "[[:digit:]]"))
meta(spam, tag = "type") <- "spam"
meta(ham, tag = "type") <- "ham"
combined<-c(spam,ham,recursive = T)
head(summary(combined))
## Length Class Mode
## 00001.317e78fa8ee2f54cd4890fdc09ba8176 2 PlainTextDocument list
## 00002.9438920e9a55591b18e60d1ed37d992b 2 PlainTextDocument list
## 00003.590eff932f8704d8b0fcbe69d023b54d 2 PlainTextDocument list
## 00004.bdcc075fa4beb5157b5dd6cd41d8887b 2 PlainTextDocument list
## 00005.ed0aba4d386c5e62bc737cf3f0ed9589 2 PlainTextDocument list
## 00006.3ca1f399ccda5d897fecb8c57669a283 2 PlainTextDocument list
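As a quick sanity check (my addition, assuming the same tm version as above), you can confirm the corpus sizes and that the type tag was attached:
length(spam);length(ham);length(combined) # combined should equal spam + ham
head(meta(combined,"type")) # the first documents should carry the "spam" tag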
## Now clean a little
cleaned<-tm_map(combined,content_transformer(tolower)) # content_transformer preserves the corpus class in tm >= 0.6
cleaned<-tm_map(cleaned,removePunctuation)
cleaned<-tm_map(cleaned, stripWhitespace)
cleaned<-tm_map(cleaned,stemDocument)
cleaned<-tm_map(cleaned,PlainTextDocument)
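Two optional steps that are often worth adding (my suggestion, and ideally placed before the stemming call so stop words are still in dictionary form) are number and stop-word removal:
cleaned<-tm_map(cleaned,removeNumbers) # drop digit runs left over from mail headers
cleaned<-tm_map(cleaned,removeWords,stopwords("english")) # drop very common English words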
## To Document Term Matrix
dtm<-DocumentTermMatrix(cleaned)
head.matrix(dtm)
## <<DocumentTermMatrix (documents: 6, terms: 145463)>>
## Non-/sparse entries: 2113/870665
## Sparsity : 100%
## Maximal term length: 880
## Weighting : term frequency (tf)
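Before pruning, it can help to peek at the dominant vocabulary; findFreqTerms() lists terms above a frequency threshold (200 here is an arbitrary cutoff of mine):
findFreqTerms(dtm,200) # terms occurring at least 200 times across all documents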
## Remove sparse terms
dtm<-removeSparseTerms(dtm, .999)
head.matrix(dtm)
## <<DocumentTermMatrix (documents: 6, terms: 18726)>>
## Non-/sparse entries: 1585/110771
## Sparsity : 99%
## Maximal term length: 128
## Weighting : term frequency (tf)
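To quantify what the pruning did (a check I added, not part of the original output), compare the dimensions and density; a DocumentTermMatrix is a simple triplet matrix, so its $v slot holds only the non-zero entries:
dim(dtm) # documents x remaining terms
length(dtm$v)/(nrow(dtm)*ncol(dtm)) # fraction of non-zero cells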
## Extract each document's class label into a vector
types<-unlist(meta(cleaned, "type")[,1])
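It is worth verifying that the label vector lines up with the matrix (my addition):
table(types) # counts should match the spam and ham corpus sizes
length(types)==nrow(dtm) # labels must align one-to-one with the DTM rows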
Now, with RTextTools, we can run several models and compare their accuracy. But first we split the data into training and test sets, using index ranges derived from the combined length of spam + ham: the first 70 percent of documents train, the last 30 percent test. Note that combined is ordered spam first and ham second, so this positional split is not random; see the sketch after the container is built.
trmin<-1
trmax<-round(.7*(length(types)),digits=0)
temin<-trmax+1
temax<-length(types)
model_cntr<-create_container(dtm,labels=types,trainSize = trmin:trmax,testSize = temin:temax,virgin = FALSE)
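Because of that ordering, the test slice here is entirely ham. A minimal sketch of a randomized alternative (my suggestion, not part of the original run) shuffles a common index before building the container:
set.seed(42) # make the shuffle reproducible
shuffle<-sample(length(types)) # random permutation of document indices
model_cntr_s<-create_container(dtm[shuffle,],labels=types[shuffle],trainSize=trmin:trmax,testSize=temin:temax,virgin=FALSE)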
### Now run a couple of models on the training data (I was getting stack-overflow warnings on the RF and TREE models)
model_SVM<-train_model(model_cntr,algorithm = "SVM")
model_MAXENT<-train_model(model_cntr,algorithm = "MAXENT")
#model_TREE<-train_model(model_cntr,algorithm = "TREE")
## Model outputs
svm <- classify_model(model_cntr, model_SVM)
ent <- classify_model(model_cntr, model_MAXENT)
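RTextTools also provides a summary helper: create_analytics() cross-tabulates the container's true labels against one or more sets of classification results:
analytics<-create_analytics(model_cntr,cbind(svm,ent)) # combine both models' output
summary(analytics) # per-algorithm precision, recall, and F-scores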
Now we compare the true labels in the test sample against each model's predicted labels.
results<-data.frame("Test"=types[temin:temax],"SVM"=svm$SVM_LABEL,"MAXENT"=ent$MAXENTROPY_LABEL,stringsAsFactors = FALSE)
head(results)
## Test SVM MAXENT
## 1 ham ham ham
## 2 ham ham ham
## 3 ham ham ham
## 4 ham spam ham
## 5 ham ham ham
## 6 ham ham ham
Class_Acc_SVM<-sum(results$Test==results$SVM)/nrow(results)
Class_Acc_ENT<-sum(results$Test==results$MAXENT)/nrow(results)
Acc<-data.frame("SVM_Accuracy"=Class_Acc_SVM,"MAXENT_Accuracy"=Class_Acc_ENT)
Acc
## SVM_Accuracy MAXENT_Accuracy
## 1 0.849444 0.9760479
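Accuracy alone hides the direction of the errors; a confusion matrix per model (my addition) makes it explicit:
table(Actual=results$Test,Predicted=results$SVM) # SVM confusion matrix
table(Actual=results$Test,Predicted=results$MAXENT) # MAXENT confusion matrix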
So SVM reaches about 85 percent accuracy on this test data while MAXENT reaches nearly 98 percent, a big difference! This was a challenging but rewarding project: these packages give you a streamlined way to classify documents, once you learn them, of course.