# This script downloads spam and ham email messages from the SpamAssassin public corpus (https://spamassassin.apache.org/publiccorpus/) and trains text classifiers on them.
library(tm)
## Loading required package: NLP
library(RTextTools)
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
# Work inside the data directory so that the relative paths used below for the
# downloaded archives and the extracted folders resolve there.
# The location can be overridden with the SPAM_DATA_DIR environment variable.
# If the directory does not exist (e.g. on another machine), stay in the
# current working directory instead of aborting the whole script.
spam_data_dir <- Sys.getenv(
  "SPAM_DATA_DIR",
  unset = "C:/Users/Andrew/Desktop/Cuny/Data Acquisition/Week 11/spam"
)
if (dir.exists(spam_data_dir)) {
  setwd(spam_data_dir)
} else {
  message(
    "Data directory '", spam_data_dir, "' not found; using ", getwd()
  )
}
#Label links
spamlink <- "https://spamassassin.apache.org/publiccorpus/20030228_spam.tar.bz2"
hardhamlink <- "https://spamassassin.apache.org/publiccorpus/20030228_hard_ham.tar.bz2"
easyhamlink <- "https://spamassassin.apache.org/publiccorpus/20030228_easy_ham.tar.bz2"
#Download tarballs
# The archives are bzip2-compressed, so the local copies keep the .tar.bz2
# extension rather than a misleading .tar.gz one.
# mode = "wb" requests a binary transfer explicitly; a text-mode transfer
# would corrupt the archives on Windows.
download.file(spamlink, destfile = "spam.tar.bz2", mode = "wb")
download.file(hardhamlink, destfile = "hardham.tar.bz2", mode = "wb")
download.file(easyhamlink, destfile = "easyham.tar.bz2", mode = "wb")
#Extract tarballs
# Extraction happens in the working directory; the corpus-building step
# further down reads the "spam", "easy_ham" and "hard_ham" folders from there.
untar("spam.tar.bz2")
untar("hardham.tar.bz2")
untar("easyham.tar.bz2")
#Create corpus objects and remove the cmds file
# Reads every file of `path` into a tm corpus and keeps only the first `keep`
# documents. The trimming is how the non-message "cmds" file is dropped
# (presumably it is listed after the message files -- verify if the corpus
# layout changes).
read_messages <- function(path, keep) {
  full_corpus <- Corpus(
    DirSource(path),
    readerControl = list(language = "lat")
  )
  full_corpus[seq_len(keep)]
}
spamcorpdat <- read_messages("spam", 500)
hamcorpdat <- read_messages("easy_ham", 2500)
hardhamcorpdat <- read_messages("hard_ham", 250)
#Sample the corpora
# Fix the random seed so that the random subsets drawn here (and every later
# random step that follows in the same session) are identical on each run.
set.seed(1234)
# Draw 250 spam, 250 easy-ham and 100 hard-ham messages without replacement.
spamcorpdat <- sample(spamcorpdat, 250)
hamcorpdat <- sample(hamcorpdat, 250)
hardhamcorpdat <- sample(hardhamcorpdat, 100)
#Add meta labels
# Tag every document with the class of the corpus it came from; the "type"
# tag is read back later to build the label vector for the classifiers.
meta(spamcorpdat, "type") <- "spam"
meta(hamcorpdat, "type") <- "ham"
meta(hardhamcorpdat, "type") <- "hardham"
#Combine corpus objects
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
combinedspamcorpus <- c(
  spamcorpdat, hamcorpdat, hardhamcorpdat,
  recursive = TRUE
)
#Create randomized corpus
# sample() without a size returns a random permutation, so the three classes
# are interleaved before the positional train/test split made further down.
spamcorpusR <- sample(combinedspamcorpus)
spamcorpusR
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 1
## Content: documents: 600
#Build document-term matrix.
# The matrix is built from the raw shuffled corpus with no extra term
# simplification (the original author was wary of simplifying spam text).
spamdtm <- DocumentTermMatrix(spamcorpusR)
#Collect meta labels
# Pull the "type" tag back out as a plain vector, in the same (shuffled)
# document order as the rows of the matrix above.
type_meta <- meta(spamcorpusR, "type")
spamtype <- unlist(type_meta[, 1])
head(spamtype, 5)
## [1] "ham" "spam" "spam" "spam" "spam"
#Prepare container
# The first 400 shuffled documents are used for training and the remaining
# ones (401 up to N) for testing. virgin = FALSE because the test documents
# carry labels as well.
N <- length(spamtype)
container <- create_container(
  spamdtm,
  labels    = spamtype,
  trainSize = seq_len(400),
  testSize  = 401:N,
  virgin    = FALSE
)
slotNames(container)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
#Training models
# Fit three classifiers on the training part of the container:
# "SVM", "RF" and "MAXENT".
svm_model    <- train_model(container, algorithm = "SVM")
rf_model     <- train_model(container, algorithm = "RF")
maxent_model <- train_model(container, algorithm = "MAXENT")
#Classifying data
# Apply each fitted model to the test part of the container. As the printed
# heads below show, each result has one label column and one probability
# column per test document.
svm_out    <- classify_model(container, model = svm_model)
rf_out     <- classify_model(container, model = rf_model)
maxent_out <- classify_model(container, model = maxent_model)
# Preview the first predictions of every model.
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 spam 0.9289945
## 2 spam 0.9143803
## 3 ham 0.9763898
## 4 spam 0.6549566
## 5 ham 0.9443492
## 6 ham 0.9438663
head(rf_out)
## FORESTS_LABEL FORESTS_PROB
## 1 spam 0.790
## 2 spam 0.935
## 3 ham 0.960
## 4 spam 0.495
## 5 ham 0.880
## 6 ham 0.910
head(maxent_out)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 spam 1.0000000
## 2 spam 0.9999992
## 3 ham 1.0000000
## 4 spam 0.9066385
## 5 ham 1.0000000
## 6 ham 1.0000000