Downloading and extracting the desired files. The packages used throughout are loaded first.
library(R.utils)     # bunzip2
library(tm)          # VCorpus, tm_map, DocumentTermMatrix
library(wordcloud)   # wordcloud
library(RTextTools)  # create_container, train_model, classify_model
library(knitr)       # kable
library(magrittr)    # the %>% pipe
url.spam <- "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
file.spam <- "20050311_spam_2.tar.bz2"
file.spam2 <- "20050311_spam_2.tar"
url.ham <- "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"
file.ham <- "20030228_easy_ham.tar.bz2"
file.ham2 <- "20030228_easy_ham.tar"
download.file(url.spam, destfile = file.spam)
download.file(url.ham, destfile = file.ham)
bunzip2(file.spam)   # decompress the .bz2 archives to .tar
bunzip2(file.ham)
untar(file.ham2, exdir = "spamham")    # extract into the spamham folder
untar(file.spam2, exdir = "spamham")
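When re-running the script, the download and extraction can be skipped if the archives have already been unpacked. A minimal guard along these lines (a sketch, not part of the original workflow; it assumes the same file names and the spamham folder used above):
if (!dir.exists("spamham/spam_2")) {            # only fetch when the extracted folder is absent
  download.file(url.spam, destfile = file.spam)
  bunzip2(file.spam)                            # produces the .tar and removes the .bz2
  untar(file.spam2, exdir = "spamham")
}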
# Listing the message files in each folder
spam.dir <- "spamham/spam_2/"
ham.dir <- "spamham/easy_ham/"
spam.docs <- list.files(spam.dir)
ham.docs <- list.files(ham.dir)
Removing unneeded files and writing functions for corpus creation, cleaning, and tagging.
# Remove the cmds files that ship with each folder.
spam.docs <- spam.docs[spam.docs != "cmds"]
ham.docs <- ham.docs[ham.docs != "cmds"]
toVCorpus <- function(file_path) {
  corpus <- file_path %>%
    paste(., list.files(.), sep = "/") %>%  # Build the full path to each file
    lapply(readLines) %>%                   # Read the text in each file
    VectorSource() %>%                      # Turn into a VectorSource
    VCorpus()                               # Turn into a VCorpus
  return(corpus)
}
docClean <- function(corpus) {
  corpus <- corpus %>%
    tm_map(removeNumbers) %>%                 # Remove numbers
    tm_map(removePunctuation) %>%             # Remove punctuation symbols
    tm_map(content_transformer(tolower)) %>%  # Transform to lowercase (keeps PlainTextDocument class)
    tm_map(removeWords, stopwords("en")) %>%  # Remove stopwords
    tm_map(stripWhitespace) %>%               # Remove extra white space
    tm_map(stemDocument)                      # Reduce words to their stems
  return(corpus)
}
addTag <- function(corpus, tag, value) {
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value  # Add the value to the specified tag
  }
  return(corpus)
}
# Create ham corpus
ham_corpus <- ham.dir %>%
  toVCorpus %>%
  docClean %>%
  addTag(tag = "ham_spam", value = "ham")
# Create spam corpus
spam_corpus <- spam.dir %>%
  toVCorpus %>%
  docClean %>%
  addTag(tag = "ham_spam", value = "spam")
spamassassin_corpus <- c(ham_corpus, spam_corpus)
# Scramble the order
spamassassin_corpus <- spamassassin_corpus[sample(length(spamassassin_corpus))]
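Note that sample() produces a different ordering (and therefore a different train/test split later on) each run. If reproducibility matters, a seed can be fixed before shuffling; a minimal sketch with an arbitrary seed value:
set.seed(2017)  # hypothetical seed; any fixed number makes the shuffle reproducible
spamassassin_corpus <- spamassassin_corpus[sample(length(spamassassin_corpus))]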
# Check ham/spam proportion
spamassassin_corpus_prop <- spamassassin_corpus %>%
  meta(tag = "ham_spam") %>%
  unlist() %>%
  table()
spamassassin_corpus_prop
## .
## ham spam
## 2501 1397
The document counts in the corpus match the number of physical files in each folder.
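The counts reported above can be cross-checked against the file listings created earlier; a quick sanity check (a sketch using the existing vectors):
length(ham.docs)   # should equal the ham count in the table above
length(spam.docs)  # should equal the spam count in the table above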
wordcloud(spamassassin_corpus, max.words = 70, random.order = FALSE, min.freq = 1000)  # Word cloud of the most frequent terms
Now it is time to create the document-term matrix and remove sparse terms. Terms appearing in fewer than 10 documents will be left out. The spam/ham labels are also retrieved.
spamassassin_dtm <- spamassassin_corpus %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(1 - (10 / length(spamassassin_corpus)))
spamassassin_labels <- unlist(meta(spamassassin_corpus, "ham_spam"))
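To see how much the sparsity filter trims the vocabulary, the matrix dimensions can be compared with and without the filter; a brief check (a sketch, not part of the original analysis):
full_dtm <- DocumentTermMatrix(spamassassin_corpus)
dim(full_dtm)           # documents x all terms
dim(spamassassin_dtm)   # documents x terms kept (appearing in at least 10 documents)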
In order to use the train_model function, I had to create a container. Here, I also did an 80/20 split of the dataset: about 80% of the data were used for training, and the remaining 20% were used for testing.
N <- length(spamassassin_labels)
split <- round(0.8 * N)
container <- create_container(
  spamassassin_dtm,
  labels = spamassassin_labels,
  trainSize = 1:split,
  testSize = (split + 1):N,
  virgin = FALSE
)
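Because the corpus was shuffled before splitting, the training and test portions should contain roughly the same ham/spam mix. A quick sanity check along these lines (a sketch, not part of the original analysis):
prop.table(table(spamassassin_labels[1:split]))         # ham/spam share in the training set
prop.table(table(spamassassin_labels[(split + 1):N]))   # ham/spam share in the test set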
Three different algorithms are used:
- Support Vector Machines (SVM)
- Decision Tree
- Maximum Entropy
svm_model_spamassassin <- train_model(container, "SVM")
tree_model_spamassassin <- train_model(container, "TREE")
maxent_model_spamassassin <- train_model(container, "MAXENT")
# Classifying using the trained models
svm_out_spamassassin <- classify_model(container, svm_model_spamassassin)
tree_out_spamassassin <- classify_model(container, tree_model_spamassassin)
maxent_out_spamassassin <- classify_model(container, maxent_model_spamassassin)
# Collect the classification results into a table
labels_out_spamassassin <- data.frame(
  correct_label = spamassassin_labels[(split + 1):N],
  svm = as.character(svm_out_spamassassin[, 1]),
  tree = as.character(tree_out_spamassassin[, 1]),
  maxent = as.character(maxent_out_spamassassin[, 1]))
# Print results
for (i in 2:4) {
  print(names(labels_out_spamassassin)[i])
  table(labels_out_spamassassin[, 1] == labels_out_spamassassin[, i]) %>%
    print() %>%
    prop.table() %>%
    round(2) %>%
    print()
}
## [1] "svm"
##
## FALSE TRUE
## 1 779
##
## FALSE TRUE
## 0 1
## [1] "tree"
##
## FALSE TRUE
## 6 774
##
## FALSE TRUE
## 0.01 0.99
## [1] "maxent"
##
## FALSE TRUE
## 2 778
##
## FALSE TRUE
## 0 1
final <- summary(labels_out_spamassassin)
kable(final)
| correct_label | svm      | tree     | maxent   |
|:--------------|:---------|:---------|:---------|
| ham :503      | ham :504 | ham :507 | ham :505 |
| spam:277      | spam:276 | spam:273 | spam:275 |
## Conclusion
As you can see, the tree classification algorithm performed slightly less accurately than the other two. The SVM and MaxEnt classifiers were over 99% accurate.
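To put exact numbers on that comparison, the accuracy of each classifier can be computed directly from the results table; a short sketch using the objects defined above:
accuracy <- sapply(c("svm", "tree", "maxent"), function(m) {
  mean(as.character(labels_out_spamassassin[[m]]) ==
       as.character(labels_out_spamassassin$correct_label))
})
round(accuracy, 3)  # share of correctly classified test messages per model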