library(tm)
## Warning: package 'tm' was built under R version 3.2.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.3
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 3.2.4
## Loading required package: SparseM
## Warning: package 'SparseM' was built under R version 3.2.3
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.2.3
## 
## Attaching package: 'SnowballC'
## The following objects are masked from 'package:RTextTools':
## 
##     getStemLanguages, wordStem
# Read every file under paste0(basicdir, fileloc) into a tm corpus and
# normalise the text for bag-of-words modelling.
#
# Arguments:
#   basicdir  root directory that holds the data-set folders (no trailing "/").
#   fileloc   sub-directory to read, written with a leading "/" (e.g. "/spam").
#
# Returns:
#   The cleaned corpus, with every document passed through PlainTextDocument.
#
# The order of the cleaning steps is deliberate:
#   1. lower-case first, because stopwords("english") is an all-lower-case
#      list and would otherwise miss "The", "And", ...
#   2. drop stop words before punctuation is stripped, so contractions in the
#      list (e.g. "don't") still match the text;
#   3. stem only after stop words are gone, because stemming rewrites some of
#      them (e.g. "this" -> "thi") so they would no longer match the list.
preprocess <- function(basicdir, fileloc) {
  full <- paste0(basicdir, fileloc)
  # Fail early with a readable message instead of a downstream DirSource error.
  if (!dir.exists(full)) {
    stop("Directory not found: ", full, call. = FALSE)
  }
  everything <- DirSource(full, encoding = "UTF-8", recursive = TRUE)
  corpus <- Corpus(everything, readerControl = list(reader = readPlain, language = "en"))
  corpus <- tm_map(corpus, content_transformer(tolower)) # fold to lower case
  corpus <- tm_map(corpus, removeWords, stopwords("english")) # remove stopwords
  corpus <- tm_map(corpus, removePunctuation) # remove punctuation marks
  corpus <- tm_map(corpus, removeNumbers) # remove numbers
  corpus <- tm_map(corpus, stemDocument) # reduce words to their stems
  corpus <- tm_map(corpus, stripWhitespace) # collapse runs of white space
  corpus <- tm_map(corpus, PlainTextDocument)
  corpus
}
# Root folder that contains the data-set sub-folders.
basicdir <- "C:/Users/Yadu/Downloads"

# Sub-folders, relative to basicdir, one per data set.
fileloc1 <- "/spam"
fileloc2 <- "/spam_2"
fileloc3 <- "/hard_ham"
fileloc4 <- "/easy_ham"
fileloc5 <- "/easy_ham_2"

# Preprocessing: build one cleaned corpus per data set.
corpusspam  <- preprocess(basicdir, fileloc1)
corpusspam2 <- preprocess(basicdir, fileloc2)
corpushard  <- preprocess(basicdir, fileloc3)
corpuseasy  <- preprocess(basicdir, fileloc4)
corpuseasy2 <- preprocess(basicdir, fileloc5)

# Convert every document to a PlainTextDocument once more. Per the original
# author's note, omitting this step gave wrong probabilities for the SVM, tree
# and maximum-entropy models (the SVM and tree probabilities all came out the
# same).
corpusspam  <- tm_map(corpusspam, PlainTextDocument)
corpusspam2 <- tm_map(corpusspam2, PlainTextDocument)
corpushard  <- tm_map(corpushard, PlainTextDocument)
corpuseasy  <- tm_map(corpuseasy, PlainTextDocument)
corpuseasy2 <- tm_map(corpuseasy2, PlainTextDocument)

# Attach the class label to each corpus under the "type" metadata tag.
meta(corpusspam, tag = "type")  <- "spam"
meta(corpusspam2, tag = "type") <- "spam"
meta(corpushard, tag = "type")  <- "ham"
meta(corpuseasy, tag = "type")  <- "ham"
meta(corpuseasy2, tag = "type") <- "ham"
# Merge the five labelled corpora into a single corpus.
combinedtrainingset <- c(corpusspam, corpusspam2, corpushard, corpuseasy, corpuseasy2, recursive = TRUE) #combine all data

# Seed the RNG *before* shuffling: the shuffle decides which documents land in
# the training and test ranges, so it has to be reproducible as well.
set.seed(2000)
combinedtrainingsample <- sample(combinedtrainingset) #randomize all data

documentemail <- DocumentTermMatrix(combinedtrainingsample)
# 0.002 is the maximal allowed sparsity per term: terms whose share of empty
# (zero) entries reaches 0.2 percent are removed, i.e. only terms present in
# almost every document are kept.
# NOTE(review): this is a very strict threshold; values close to 1 (e.g. 0.99)
# are more usual - confirm that 0.002 is intended.
documentemail <- removeSparseTerms(documentemail, 0.002)

# Class labels in the same (shuffled) order as the rows of documentemail.
kind <- unlist(meta(combinedtrainingsample, "type")[,1])
head(kind)
## [1] "ham" "ham" "ham" "ham" "ham" "ham"
# (recorded output comes from a run made before the shuffle was seeded)

# Re-seed so the model-fitting code below starts from the same RNG state no
# matter how many random draws the shuffle consumed.
set.seed(2000) #reproducible results

#prepare container
N <- length(kind)
# Size of the training split (first 2 percent of the documents) as a whole
# number. Using 0.02*N directly gives fractional bounds whenever N is not a
# multiple of 50: the test range then starts at e.g. 124.4 and stops short of
# N, which silently drops the last document.
ntrain <- floor(0.02 * N)
# Both splits must be non-empty, otherwise the ranges below are meaningless.
stopifnot(ntrain >= 1, ntrain < N)
container <- create_container(documentemail, labels = kind, trainSize = seq_len(ntrain), testSize = (ntrain + 1):N, virgin = FALSE)
slotNames(container)
## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"
# Fit one model per algorithm on the training part of the container.
maxent_model <- train_model(container, "MAXENT")
tree_model   <- train_model(container, "TREE")
svm_model    <- train_model(container, "SVM")

# Classify with each fitted model; each result holds a predicted label and
# its probability (see the recorded output below).
maxent_out <- classify_model(container, maxent_model)
tree_out   <- classify_model(container, tree_model)
svm_out    <- classify_model(container, svm_model)

# First rows of the maximum-entropy predictions.
head(maxent_out)
##   MAXENTROPY_LABEL MAXENTROPY_PROB
## 1              ham       0.8643172
## 2              ham       0.8552287
## 3              ham       0.6945703
## 4              ham       0.6945703
## 5              ham       0.6091364
## 6              ham       0.6945703

# First rows of the tree predictions.
head(tree_out)
##   TREE_LABEL TREE_PROB
## 1        ham 1.0000000
## 2        ham 0.8888889
## 3        ham 0.6891892
## 4        ham 0.6891892
## 5       spam 0.5714286
## 6        ham 0.6891892

# First rows of the SVM predictions.
head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1       ham 0.7650212
## 2       ham 0.7234249
## 3       ham 0.7170200
## 4       ham 0.7170200
## 5       ham 0.5979873
## 6       ham 0.7170200