Overview: Spam Email Filtering

The code below builds ham and spam corpora from the 2002-10-10 training archives and the 2003-02-28 test archives, cleans the message text with stringr, builds a term-document matrix with tm, and sets up an RTextTools container for classification.

library(RCurl)
## Loading required package: bitops
library(XML)
library(stringr)
library(tm)
## Loading required package: NLP
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
#training files
ehamfiles_train<-list.files("easy_ham_20021010")
hhamfiles_train<-list.files("hard_ham_20021010")
spamfiles_train<-list.files("spam_20021010")
#test files
ehamfiles_test<-list.files("easy_ham_20030228")
hhamfiles_test<-list.files("hard_ham_20030228")
spamfiles_test<-list.files("spam_20030228")
#function to build a corpus from a directory of email files
#takes as arguments a directory, a list of files, and a ham/spam label (1 = ham, 0 = spam)
makecorpus<-function(dir,flist,input)
{
  for(i in 1:length(flist))
  {
    #read the message and strip runs of underscores and asterisks used as separators
    tmp<-readLines(file.path(dir,flist[i]))
    tmp<-str_replace_all(tmp,"_+","")
    tmp<-str_replace_all(tmp,"\\*+","")
    tmp<-str_trim(tmp,side="both")
    tmp<-str_to_lower(tmp)

    #remove dates (e.g. "10 oct 2002")
    tmp<-str_replace_all(tmp,"\\d{2}\\s[:alpha:]{3}\\s\\d{4}","")
    #remove times with an optional timezone offset (e.g. "12:30:45 -0400")
    tmp<-str_replace_all(tmp,"\\s?(\\d{2}:\\d{2}:\\d{2})(\\s[-+]\\d{4})?","")
    #remove bracketed IP addresses
    tmp<-str_replace_all(tmp,"\\[\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\]","")

    #collapse the lines into a single string and wrap it in a one-document corpus
    tmp<-str_c(tmp,collapse=" ")
    tmp<-Corpus(VectorSource(tmp))

    #the first file starts the corpus; every later file is appended to it
    if(i==1){
      corp<-tmp
    } else {
      corp<-c(corp,tmp)
    }

    #tag each document with its ID and its ham/spam label
    meta(corp[[i]],"ID")<-i
    meta(corp[[i]],"ham")<-input
  }

  return(corp)
}
###end function
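#design note: building a one-document corpus per file and growing it with c()
#keeps the ID/ham metadata tied to each message. A rough sketch of the same idea
#with tm's directory reader (assuming the cleaning would then be applied with
#tm_map() and content_transformer() rather than per line with stringr):
#  dircorp<-VCorpus(DirSource("easy_ham_20021010"))
#  dircorp<-tm_map(dircorp,content_transformer(str_to_lower))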
#build all the corpora
eham<-makecorpus("easy_ham_20021010",ehamfiles_train,1)  
hham<-makecorpus("hard_ham_20021010",hhamfiles_train,1)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'hard_ham_20021010/0231.7c6cc716ce3f3bfad7130dd3c8d7b072'
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'hard_ham_20021010/0250.7c6cc716ce3f3bfad7130dd3c8d7b072'
spam<-makecorpus("spam_20021010",spamfiles_train,0)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'spam_20021010/0143.260a940290dcb61f9327b224a368d4af'
eham_test<-makecorpus("easy_ham_20030228",ehamfiles_test,1)
hham_test<-makecorpus("hard_ham_20030228",hhamfiles_test,1)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'hard_ham_20030228/00228.0eaef7857bbbf3ebf5edbbdae2b30493'
spam_test<-makecorpus("spam_20030228",spamfiles_test,0)
## Warning in readLines(file.path(dir, flist[i])): incomplete final line found
## on 'spam_20030228/00136.faa39d8e816c70f23b4bb8758d8a74f0'
#combine all the files
#final corpus
finalcorpus<-c(eham,hham,spam,eham_test,hham_test,spam_test)
#develop term document matrix
tdm<-TermDocumentMatrix(finalcorpus)
tdm<-removeSparseTerms(tdm,1-(10/length(finalcorpus)))
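#the sparsity cut-off 1-(10/length(finalcorpus)) keeps only terms that appear
#in roughly 10 or more of the documents; a quick size check (terms x documents)
#shows how far the matrix was trimmed
dim(tdm)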

#develop container
ham_spam<-unlist(meta(finalcorpus,type="local",tag="ham"))
N<-length(ham_spam)
tsize<-length(eham)+length(hham)+length(spam)
container<-create_container(tdm,ham_spam,trainSize=1:tsize,testSize=(tsize+1):N,virgin=FALSE)
#slot names
slotNames(container)
## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"
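#sanity check (optional): the ham/spam codes should split across the training
#and test sets exactly as trainSize and testSize specify
table(container@training_codes)
table(container@testing_codes)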
#different models to be estimated
#svm_model<-train_model(container,"SVM")
#tree_model<-train_model(container,"TREE")
#boost_model<-train_model(container,"BOOSTING")
#maxent_model<-train_model(container,"MAXENT")

#getting a column error here
#model output
#svm_out<-classify_model(container,svm_model)
#tree_out<-classify_model(container,tree_model)
#boost_out<-classify_model(container,boost_model)
#maxent_out<-classify_model(container,maxent_model)
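#the column error above most likely comes from passing a TermDocumentMatrix
#(terms as rows) to create_container(), which expects documents as rows, as
#produced by tm's DocumentTermMatrix() or RTextTools' create_matrix(). A
#minimal sketch of the likely fix, assuming that orientation is the cause:
dtm<-DocumentTermMatrix(finalcorpus)
dtm<-removeSparseTerms(dtm,1-(10/length(finalcorpus)))
container<-create_container(dtm,ham_spam,trainSize=1:tsize,testSize=(tsize+1):N,virgin=FALSE)
svm_model<-train_model(container,"SVM")
svm_out<-classify_model(container,svm_model)
#precision/recall summary on the held-out 2003 test messages
analytics<-create_analytics(container,svm_out)
summary(analytics)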