# Source archives from the SpamAssassin public corpus and their local names.
# The *2 names are the plain .tar files expected once the .bz2 layer is removed.
url.spam <- "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
file.spam <- "20050311_spam_2.tar.bz2"
file.spam2 <- "20050311_spam_2.tar"
url.ham <- "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham.tar.bz2"
file.ham <- "20030228_easy_ham.tar.bz2"
file.ham2 <- "20030228_easy_ham.tar"

# The archives are binary, so ask for a binary transfer explicitly instead of
# depending on download.file()'s default mode ("w").
download.file(url.spam, destfile = file.spam, mode = "wb")
download.file(url.ham, destfile = file.ham, mode = "wb")

# bunzip2() is not a base R function; presumably R.utils::bunzip2 attached
# outside this view -- TODO confirm.
bunzip2(file.spam)
bunzip2(file.ham)

# Unpack both tarballs under C:\Rdata\
untar(file.ham2, exdir = "C:\\Rdata\\")
untar(file.spam2, exdir = "C:\\Rdata\\")
Adding the names of the extracted files to lists
# Directories that hold the extracted messages.
spam.dir <- "C:\\Rdata\\spam_2\\"
ham.dir <- "C:\\Rdata\\easy_ham\\"

# File names found in each directory, leaving out the entry named "cmds".
spam.docs <- list.files(spam.dir)
spam.docs <- spam.docs[spam.docs != "cmds"]
ham.docs <- list.files(ham.dir)
ham.docs <- ham.docs[ham.docs != "cmds"]
#' Build a VCorpus from every file in a directory
#'
#' @param file_path Path of the directory whose files are read.
#' @return The result of VCorpus() over a VectorSource of the per-file line
#'   vectors (one element per file, each read with readLines()).
toVCorpus <- function(file_path) {
  # full.names = TRUE gives one path per file and an empty vector for an empty
  # directory; pasting the directory onto a zero-length listing would instead
  # produce the single bogus path "<dir>/".
  paths <- list.files(file_path, full.names = TRUE)
  texts <- lapply(paths, readLines) # Read the text in each file
  VCorpus(VectorSource(texts)) # Wrap as VectorSource, then as VCorpus
}
#' Normalise the text of every document in a corpus
#'
#' Applies, in order: number removal, punctuation removal, lower-casing,
#' English stopword removal, whitespace stripping and stemming.
#'
#' @param corpus A tm corpus.
#' @return The transformed corpus.
docClean <- function(corpus) {
  corpus %>%
    tm_map(removeNumbers) %>% # Remove numbers
    tm_map(removePunctuation) %>% # Remove punctuation symbols
    # tolower() is a base function that returns a plain character vector, not
    # a tm transformation. content_transformer() applies it to the document
    # content only, so no PlainTextDocument re-wrap is needed afterwards.
    tm_map(content_transformer(tolower)) %>% # Transform to lowercase
    tm_map(removeWords, stopwords("en")) %>% # Remove stopwords
    tm_map(stripWhitespace) %>% # Remove white spaces
    tm_map(stemDocument) # Reduce to stems
}
#' Set one metadata tag on every document of a corpus
#'
#' @param corpus Corpus whose documents are tagged.
#' @param tag Name of the metadata tag to set.
#' @param value Value stored under `tag` for each document.
#' @return The corpus with the tag set on every document.
addTag <- function(corpus, tag, value) {
  # seq_along() is empty for an empty corpus, whereas 1:length(corpus) would
  # count 1, 0 and index documents that do not exist.
  for (i in seq_along(corpus)) {
    meta(corpus[[i]], tag) <- value # Add the value to the specified tag
  }
  corpus
}
# Create ham corpus: read, clean, then label every document "ham"
ham_corpus <- ham.dir %>%
  toVCorpus() %>%
  docClean() %>%
  addTag(tag = "ham_spam", value = "ham")
# Create spam corpus: same steps, labelled "spam"
spam_corpus <- spam.dir %>%
  toVCorpus() %>%
  docClean() %>%
  addTag(tag = "ham_spam", value = "spam")
# Combine both corpora into one
spamassassin_corpus <- c(ham_corpus, spam_corpus)
# Scramble the order. seq_along() keeps this safe for an empty corpus, where
# 1:length() would yield the indices 1 and 0.
# NOTE(review): no set.seed() call is visible, so the order (and any later
# train/test split based on it) presumably differs between runs -- confirm.
spamassassin_corpus <- spamassassin_corpus[sample(seq_along(spamassassin_corpus))]
# Count how many documents carry each ham_spam label.
# dnn = "." keeps the "." header that the piped table(.) call produced.
spamassassin_corpus_prop <- table(
  unlist(meta(spamassassin_corpus, tag = "ham_spam")),
  dnn = "."
)
spamassassin_corpus_prop
## .
## ham spam
## 2501 1397
###Corpus and physical files match
library(wordcloud)
## Loading required package: RColorBrewer
# Header wordcloud: at most 70 words, minimum frequency 1000, not in random order
wordcloud(
  spamassassin_corpus,
  min.freq = 1000,
  max.words = 70,
  random.order = FALSE
)
## Training: create the document-term matrix and remove sparse terms.
## Terms appearing in fewer than 10 documents will be left out.
spamassassin_dtm <- spamassassin_corpus %>%
  DocumentTermMatrix() %>%
  removeSparseTerms(1 - (10 / length(spamassassin_corpus)))
# Labels in the same order as the documents in the corpus
spamassassin_labels <- unlist(meta(spamassassin_corpus, "ham_spam"))
N <- length(spamassassin_labels)
# The first 80% of the documents are used for training, the rest for testing
split <- round(0.8 * N)
container <- create_container(
  spamassassin_dtm,
  labels = spamassassin_labels,
  trainSize = 1:split,
  testSize = (split + 1):N,
  # Spelled out: F is an ordinary variable that can be reassigned, FALSE cannot
  virgin = FALSE
)
Using two different algorithms: Support Vector Machines (SVM) and Decision Tree
# Train one model per algorithm on the container
svm_model_spamassassin <- train_model(container, "SVM")
tree_model_spamassassin <- train_model(container, "TREE")
# Classifying using the trained models
svm_out_spamassassin <- classify_model(container, svm_model_spamassassin)
tree_out_spamassassin <- classify_model(container, tree_model_spamassassin)
# Collect the classification results into a table: the true labels of
# documents (split+1)..N next to the first column of each classifier's output
# (presumably the predicted label -- TODO confirm against classify_model())
labels_out_spamassassin <- data.frame(
correct_label = spamassassin_labels[(split+1):N],
svm = as.character(svm_out_spamassassin[,1]),
tree = as.character(tree_out_spamassassin[,1]))
# Print results: for each classifier column (2 and 3), show its name, the
# counts of wrong/right predictions, and the same counts as rounded proportions
for (model in names(labels_out_spamassassin)[2:3]) {
  print(model)
  hits <- table(labels_out_spamassassin[, 1] == labels_out_spamassassin[, model])
  print(hits)
  print(round(prop.table(hits), 2))
}
## [1] "svm"
##
## FALSE TRUE
## 3 777
##
## FALSE TRUE
## 0 1
## [1] "tree"
##
## FALSE TRUE
## 9 771
##
## FALSE TRUE
## 0.01 0.99
library(knitr)
# Summarise the results table and render it with kable. All three columns are
# character, so the summary reports only Length, Class and Mode for each.
final <- summary(object = labels_out_spamassassin)
knitr::kable(x = final)
| correct_label | svm | tree | |
|---|---|---|---|
| Length:780 | Length:780 | Length:780 | |
| Class :character | Class :character | Class :character | |
| Mode :character | Mode :character | Mode :character |
The decision tree classifier was less accurate than the SVM classifier. The SVM classified 777 of the 780 test messages correctly (about 99.6%), while the tree classified 771 of 780 correctly (about 98.8%).