For this project, I start with a spam/ham dataset, train on part of it, and then predict the class of documents withheld from training. Models from the RTextTools package are used to predict whether or not a new document is spam.
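
The code below relies on the tm, stringr, and RTextTools packages (plus knitr for the final table). A minimal setup chunk, assuming those packages are already installed:

library(tm)          # corpora, tm_map, DocumentTermMatrix
library(stringr)     # str_c for building paths and collapsing lines
library(RTextTools)  # create_container, train_model, classify_model
library(knitr)       # kable for the results table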

Get the training and testing data from the SpamAssassin public corpus site.

### Training Data

# Used 20021010_easy_ham.tar.bz2 and 20021010_spam.tar.bz2 for the training data, and 20021010_hard_ham.tar.bz2 and 20030228_spam.tar.bz2 for testing. The data was downloaded to the desktop rather than read directly from the site for performance reasons.
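
For reference, one way to unpack the downloaded archives into local folders (a sketch; the archive names come from the comment above, but the target paths are illustrative and should match where the files were saved):

# Sketch only: unpack the downloaded SpamAssassin archives (each expands into
# its own subfolder); target paths are illustrative
untar("20021010_easy_ham.tar.bz2", exdir = "files/train_ham")
untar("20021010_spam.tar.bz2",     exdir = "files/train_spam")
untar("20021010_hard_ham.tar.bz2", exdir = "files/test_ham")
untar("20030228_spam.tar.bz2",     exdir = "files/test_spam")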


train_spam_dir <- "C:/Users/eptrs/Desktop/CUNY/Data607_DataAcquisition/week10/files/train_spam/"
train_spam_files <- list.files(train_spam_dir)
length(train_spam_files)
## [1] 501
train_ham_dir <- "C:/Users/eptrs/Desktop/CUNY/Data607_DataAcquisition/week10/files/train_ham/"
train_ham_files <- list.files(train_ham_dir)
length(train_ham_files)
## [1] 2551
### Testing Data

test_spam_dir <- "C:/Users/eptrs/Desktop/CUNY/Data607_DataAcquisition/week10/files/test_spam/"
test_spam_files <- list.files(test_spam_dir)
length(test_spam_files)
## [1] 501
test_ham_dir <- "C:/Users/eptrs/Desktop/CUNY/Data607_DataAcquisition/week10/files/test_ham/"
test_ham_files <- list.files(test_ham_dir)
length(test_ham_files)
## [1] 250

Create four corpora: spam and ham for training, and spam and ham for testing.

Training Corpora

##########corpus

###### Train Spam
# Seed the corpus with the first spam file, collapsed into a single string
tmp <- readLines(str_c(train_spam_dir, train_spam_files[1]))
tmp <- str_c(tmp, collapse = "")

stxt_corpus <- Corpus(VectorSource(tmp))

# Read each spam file, collapse its lines into one document, and append it to the corpus
for (i in 1:length(train_spam_files)) {
  tmp <- readLines(str_c(train_spam_dir, train_spam_files[i]))
  tmp <- str_c(tmp, collapse = "")

  tmp_corpus <- VCorpus(VectorSource(tmp))
  stxt_corpus <- c(stxt_corpus, tmp_corpus)
  stxt_corpus <- VCorpus(VectorSource(stxt_corpus))
}

# Label every training spam document: 0 = spam
meta(stxt_corpus, "classification") <- 0
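
As an aside, each of these corpora can also be built in a single pass instead of growing the corpus inside the loop; a sketch of an equivalent construction for the training spam set (the same pattern applies to the other three):

# Sketch: read every spam file, collapse each to one string, and build the corpus once
spam_texts <- sapply(train_spam_files, function(f) {
  str_c(readLines(str_c(train_spam_dir, f)), collapse = "")
})
stxt_corpus_alt <- VCorpus(VectorSource(spam_texts))
meta(stxt_corpus_alt, "classification") <- 0   # 0 = spam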


################ Train Ham
# Seed the corpus with the first ham file, collapsed into a single string
htmp <- readLines(str_c(train_ham_dir, train_ham_files[1]))
htmp <- str_c(htmp, collapse = "")

htxt_corpus <- Corpus(VectorSource(htmp))

# Read each ham file, collapse its lines into one document, and append it to the corpus
for (i in 1:length(train_ham_files)) {
  htmp <- readLines(str_c(train_ham_dir, train_ham_files[i]))
  htmp <- str_c(htmp, collapse = "")

  htmp_corpus <- VCorpus(VectorSource(htmp))
  htxt_corpus <- c(htxt_corpus, htmp_corpus)
  htxt_corpus <- VCorpus(VectorSource(htxt_corpus))
}

# Label every training ham document: 1 = ham
meta(htxt_corpus, "classification") <- 1

Testing Corpora

###### Test Spam
# Seed the corpus with the first test spam file, collapsed into a single string
tmp <- readLines(str_c(test_spam_dir, test_spam_files[1]))
tmp <- str_c(tmp, collapse = "")

testSpam_corpus <- Corpus(VectorSource(tmp))

# Read each test spam file, collapse its lines into one document, and append it to the corpus
for (i in 1:length(test_spam_files)) {
  tmp <- readLines(str_c(test_spam_dir, test_spam_files[i]))
  tmp <- str_c(tmp, collapse = "")

  tmp_corpus <- VCorpus(VectorSource(tmp))
  testSpam_corpus <- c(testSpam_corpus, tmp_corpus)
  testSpam_corpus <- VCorpus(VectorSource(testSpam_corpus))
}

# Label every test spam document: 0 = spam
meta(testSpam_corpus, "classification") <- 0


################ Test Ham
# Seed the corpus with the first test ham file, collapsed into a single string
htmp <- readLines(str_c(test_ham_dir, test_ham_files[1]))
htmp <- str_c(htmp, collapse = "")

testHam_corpus <- Corpus(VectorSource(htmp))

# Read each test ham file, collapse its lines into one document, and append it to the corpus
for (i in 1:length(test_ham_files)) {
  htmp <- readLines(str_c(test_ham_dir, test_ham_files[i]))
  htmp <- str_c(htmp, collapse = "")

  htmp_corpus <- VCorpus(VectorSource(htmp))
  testHam_corpus <- c(testHam_corpus, htmp_corpus)
  testHam_corpus <- VCorpus(VectorSource(testHam_corpus))
}

# Label every test ham document: 1 = ham
meta(testHam_corpus, "classification") <- 1

Combine all four corpora into one.

###########Combine

txt_corpus <- c(stxt_corpus, htxt_corpus, testSpam_corpus, testHam_corpus)
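
A quick sanity check that the combined corpus really contains all four subsets:

# The combined length should equal the sum of the four subset lengths
length(txt_corpus)
length(stxt_corpus) + length(htxt_corpus) + length(testSpam_corpus) + length(testHam_corpus)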

Perform Data Cleanup

######Clean up
txt_corpus <- tm_map(txt_corpus, removeNumbers)
txt_corpus <- tm_map(txt_corpus, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
txt_corpus <- tm_map(txt_corpus, removeWords, words = stopwords("en"))
txt_corpus <- tm_map(txt_corpus, content_transformer(tolower))
txt_corpus <- tm_map(txt_corpus, stemDocument)
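
To verify the transformations, it helps to glance at one cleaned document, for example:

# Peek at the first few hundred characters of the first cleaned document
substr(content(txt_corpus[[1]]), 1, 300)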

Build a Document Term Matrix

dtm <- DocumentTermMatrix(txt_corpus)
dtm <- removeSparseTerms(dtm, 1-(10/length(txt_corpus)))  # drop very sparse terms: keep only terms appearing in roughly 10 or more documents
dtm
## <<DocumentTermMatrix (documents: 3823, terms: 6786)>>
## Non-/sparse entries: 688947/25253931
## Sparsity           : 97%
## Maximal term length: 73
## Weighting          : term frequency (tf)
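
As a quick check on the vocabulary that survives the sparsity filter, tm's findFreqTerms lists terms above a frequency threshold (the cutoff of 200 below is arbitrary):

# Terms that appear at least 200 times across the corpus (cutoff is arbitrary)
head(findFreqTerms(dtm, lowfreq = 200), 20)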

Create a container with all the information needed by the estimation procedures. Specify that the first 75% of the documents are training data and that the rest are to be classified. Set the virgin attribute to FALSE, meaning that we have labels for all documents.

classification_labels <- unlist(meta(txt_corpus, "classification"))
N <- length(classification_labels)

N1 <- round(N * .75, 0)

N2 <- N1 + 1

container <- create_container(dtm,
                              labels = classification_labels,
                              trainSize = 1:N1,
                              testSize = N2:N,
                              virgin = FALSE)

slotNames(container)
## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"
#the container is a set of objects used by the estimation procedures of the supervised learning methods

Supply the information stored in the container to the models for training.

svm_model <- train_model(container, "SVM")
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

Use the trained model parameters to estimate the class membership of the remaining documents (testing phase).

svm_out <- classify_model(container, svm_model)
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)
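
RTextTools can also summarize the classification results directly; a sketch using create_analytics, which reports per-algorithm precision, recall, and ensemble agreement for the classified documents:

# Combine the three sets of results and compute summary analytics
analytics <- create_analytics(container, cbind(svm_out, tree_out, maxent_out))
summary(analytics)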

Evaluation of Performance: the percentage of documents classified correctly.

#construct a data frame containing the correct and the predicted labels 
labels_out <- data.frame(
  correct_label = classification_labels[N2:N],  # labels for the classified (test) portion only
  svm = as.character(svm_out[,1]),
  tree = as.character(tree_out[,1]),
  maxent = as.character(maxent_out[,1]),
  stringsAsFactors = FALSE)

#compare each model's predicted labels with the actual labels
perf <- data.frame(
  svm = prop.table(table(labels_out[,1] == labels_out[,2])),
  tree = prop.table(table(labels_out[,1] == labels_out[,3])),
  maxent = prop.table(table(labels_out[,1] == labels_out[,4]))
)

colnames(perf)<- c("","SVM","","Tree","","Maxent")
rownames(perf) <- c("Pct. Incorrect","Pct.Correct")
library(knitr)
kable(perf[,c(2,4,6)])
|               |       SVM|      Tree|    Maxent|
|:--------------|---------:|---------:|---------:|
|Pct. Incorrect | 0.2259414| 0.2405858| 0.2154812|
|Pct.Correct    | 0.7740586| 0.7594142| 0.7845188|

The SVM model classified an email correctly as spam or ham 77.4% of the time.

The Tree model classified an email correctly 75.9% of the time.

The Maxent model classified an email correctly 78.5% of the time.
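
Beyond overall accuracy, a confusion matrix shows the direction of each model's errors (spam flagged as ham versus ham flagged as spam); for example, for the SVM predictions:

# Cross-tabulate actual vs. predicted labels for the SVM model (0 = spam, 1 = ham)
table(actual = labels_out$correct_label, predicted = labels_out$svm)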