Overview: Spam Email Filtering
library(RCurl)
## Loading required package: bitops
library(XML)
library(stringr)
library(tm)
## Loading required package: NLP
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
#training files
ehamfiles_train<-list.files("easy_ham_20021010")
hhamfiles_train<-list.files("hard_ham_20021010")
spamfiles_train<-list.files("spam_20021010")
#test files
ehamfiles_test<-list.files("easy_ham_20030228")
hhamfiles_test<-list.files("hard_ham_20030228")
spamfiles_test<-list.files("spam_20030228")
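As a quick optional check (not part of the original run), the number of messages found in each directory can be confirmed before building the corpora; the list names below are just illustrative labels.
#optional: count the files found in each directory
sapply(list(easy_ham_train = ehamfiles_train,
            hard_ham_train = hhamfiles_train,
            spam_train     = spamfiles_train,
            easy_ham_test  = ehamfiles_test,
            hard_ham_test  = hhamfiles_test,
            spam_test      = spamfiles_test),
       length)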
#write function
#takes as arguments a directory, a list of file names, and a ham/spam label (1 = ham, 0 = spam)
makecorpus <- function(dir, flist, input)
{
  for (i in seq_along(flist))
  {
    txt <- readLines(file.path(dir, flist[i]))
    #strip runs of underscores and asterisks used as separator lines
    txt <- str_replace_all(txt, "[[___]]+", "")
    txt <- str_replace_all(txt, "[[***]]+", "")
    txt <- str_trim(txt, side = "both")
    txt <- str_to_lower(txt)
    txt <- unlist(txt)
    #remove dates (e.g. "10 oct 2002")
    txt <- str_replace_all(txt, "\\d{2}\\s[:alpha:]{3}\\s\\d{4}", "")
    #remove times (e.g. "12:34:56 -0400")
    txt <- str_replace_all(txt, "\\s?(\\d{2}\\:\\d{2}\\:\\d{2})(\\s[-|+]\\d{4})?", "")
    #remove IP addresses (e.g. "[192.168.0.1]")
    txt <- str_replace_all(txt, "\\[\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\]", "")
    #collapse the lines of the message into a single string
    txt <- str_c(txt, collapse = " ")
    #wrap the message as a one-document corpus and append it
    doc <- Corpus(VectorSource(txt))
    if (i == 1) {
      corp <- doc
    } else {
      corp <- c(corp, doc)
    }
    #attach document-level metadata: an ID and the ham/spam label
    meta(corp[[i]], "ID") <- i
    meta(corp[[i]], "ham") <- input
  }
  return(corp)
}
###end function
#build all the corpora (1 = ham, 0 = spam)
eham<-makecorpus("easy_ham_20021010",ehamfiles_train,1)
hham<-makecorpus("hard_ham_20021010",hhamfiles_train,1)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'hard_ham_20021010/0231.7c6cc716ce3f3bfad7130dd3c8d7b072'
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'hard_ham_20021010/0250.7c6cc716ce3f3bfad7130dd3c8d7b072'
spam<-makecorpus("spam_20021010",spamfiles_train,0)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'spam_20021010/0143.260a940290dcb61f9327b224a368d4af'
eham_test<-makecorpus("easy_ham_20030228",ehamfiles_test,1)
hham_test<-makecorpus("hard_ham_20030228",hhamfiles_test,1)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'hard_ham_20030228/00228.0eaef7857bbbf3ebf5edbbdae2b30493'
spam_test<-makecorpus("spam_20030228",spamfiles_test,0)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'spam_20030228/00136.faa39d8e816c70f23b4bb8758d8a74f0'
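An optional spot check (a sketch, not part of the original run) confirms the corpora were built and that the ham/spam metadata tag was attached as expected.
#optional: corpus sizes and the stored ham/spam tag on one document from each set
length(eham); length(hham); length(spam)
meta(eham[[1]], "ham")  #expected 1 (ham)
meta(spam[[1]], "ham")  #expected 0 (spam)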
#combine all the corpora into one final corpus
finalcorpus<-c(eham,hham,spam,eham_test,hham_test,spam_test)
#build the term-document matrix
tdm<-TermDocumentMatrix(finalcorpus)
#drop sparse terms: keep only terms that appear in roughly 10 or more documents
tdm<-removeSparseTerms(tdm,1-(10/length(finalcorpus)))
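An optional look at the reduced matrix (a sketch using standard tm helpers) shows how many terms survive the sparsity cutoff and which ones occur often; the frequency threshold of 50 is arbitrary.
#optional: terms kept (rows) by documents (columns), plus a sample of frequent terms
dim(tdm)
head(findFreqTerms(tdm, lowfreq = 50))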
#build the RTextTools container
ham_spam<-unlist(meta(finalcorpus,type="local",tag="ham"))
N<-length(ham_spam)
#training documents come first in finalcorpus, followed by the test documents
tsize<-length(eham)+length(hham)+length(spam)
container<-create_container(tdm,ham_spam,trainSize=1:tsize,testSize=(tsize+1):N,virgin=FALSE)
#slot names
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
#different models to estimate (commented out; see the error note below)
#svm_model<-train_model(container,"SVM")
#tree_model<-train_model(container,"TREE")
#boost_model<-train_model(container,"BOOSTING")
#maxent_model<-train_model(container,"MAXENT")
#getting a column error here when training the models above
#model output
#svm_out<-classify_model(container,svm_model)
#tree_out<-classify_model(container,tree_model)
#boost_out<-classify_model(container,boost_model)
#maxent_out<-classify_model(container,maxent_model)
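One likely cause of the column error is that create_container expects documents as rows (a document-term matrix), while TermDocumentMatrix stores terms as rows. The following is a minimal, untested sketch of that fix, reusing the objects defined above; the document-term matrix, the re-created container, and the choice of models are assumptions, not the original workflow.
#sketch of a possible fix: build a matrix with documents as rows
dtm <- DocumentTermMatrix(finalcorpus)
dtm <- removeSparseTerms(dtm, 1 - (10/length(finalcorpus)))
container <- create_container(dtm, ham_spam,
                              trainSize = 1:tsize,
                              testSize  = (tsize + 1):N,
                              virgin    = FALSE)
#re-run two of the models and summarize their accuracy
svm_model  <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
svm_out    <- classify_model(container, svm_model)
tree_out   <- classify_model(container, tree_model)
analytics  <- create_analytics(container, cbind(svm_out, tree_out))
summary(analytics)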