Classification

Prepare data

#' Clean the text of every document in a corpus
#'
#' Replaces punctuation with spaces, removes numbers, lower-cases the text,
#' removes English stop words, collapses runs of whitespace and stems the
#' remaining words.
#'
#' @param corpus A tm corpus, e.g. the result of `VCorpus()`.
#' @return The corpus with the transformations above applied to each document.
clean_corp <- function(corpus) {
  # str_replace_all() and tolower() are plain character functions rather than
  # tm transformations. Wrapping them in content_transformer() applies them to
  # each document's content and returns a document again, so no
  # PlainTextDocument() rebuild is needed afterwards.
  replace_punct <- content_transformer(
    function(x) str_replace_all(x, pattern = "[[:punct:]]", replacement = " ")
  )

  release_corpus <- tm_map(corpus, replace_punct)
  release_corpus <- tm_map(release_corpus, removeNumbers)
  release_corpus <- tm_map(release_corpus, content_transformer(tolower))
  release_corpus <- tm_map(release_corpus, removeWords, words = stopwords("en"))
  # Collapse whitespace only after the removals above, so the gaps they leave
  # behind are squeezed as well.
  release_corpus <- tm_map(release_corpus, stripWhitespace)

  tm_map(release_corpus, stemDocument)
}


#' Tag every document in a corpus with a classification label
#'
#' @param corpus A tm corpus.
#' @param cl The label to store; this script passes "spam" or "ham".
#' @return A copy of `corpus` in which each document carries `cl` under the
#'   "classification" metadata tag.
assign_class <- function(corpus, cl) {
  tmp <- corpus
  # seq_along() yields no iterations for an empty corpus, whereas
  # 1:length(tmp) would count 1, 0 and index documents that do not exist.
  for (i in seq_along(tmp)) {
    meta(tmp[[i]], "classification") <- cl
  }
  tmp
}


# Load files from the spam and ham directories into Vector Corpora.
# Each variable reads the directory that matches its name, so the labels
# assigned below describe the right messages. Plain directory names are used
# because a trailing "\\" only resolves on Windows.
spam <- VCorpus(DirSource("spam_2"))
ham <- VCorpus(DirSource("easy_ham"))

# Clean the text within each corpus
spam_cleaned <- clean_corp(spam)
ham_cleaned <- clean_corp(ham)

# Assign a class label to every document
spam_classified <- assign_class(spam_cleaned, "spam")
ham_classified <- assign_class(ham_cleaned, "ham")

# Combine the spam and ham corpora
full_corp <- c(spam_classified, ham_classified)

# Shuffle spam and ham documents within the combined corpus.
# The seed makes the shuffle, and therefore the train/test split that
# depends on document order, reproducible between runs.
set.seed(42)
full_corp <- sample(full_corp)

# Create a Document Term Matrix and drop terms that appear in fewer than
# roughly 10 documents (sparsity threshold of 1 - 10 / number of documents)
dtm <- DocumentTermMatrix(full_corp)
dtm <- removeSparseTerms(dtm, 1 - (10 / length(full_corp)))

Classification

# Create a vector with the class labels and record its length
class_labels <- unlist(meta(full_corp, type = "local", tag = "classification"))
n <- length(class_labels)

# The first n_train documents train the models and the remaining ones are
# held out for testing (about 75% / 25% for the data set used here).
# Without the guard, (n_train + 1):n would count backwards on a small corpus.
n_train <- 3000
stopifnot(n > n_train)

# Create a container with the training and testing data and their labels.
# virgin = FALSE because the labels of the test documents are known; they
# are compared against the predictions further down.
container <- create_container(dtm,
                              labels = class_labels,
                              trainSize = 1:n_train,
                              testSize = (n_train + 1):n,
                              virgin = FALSE)

# Train the model using the Support Vector Machine algorithm
svm_model <- train_model(container, "SVM")

# Classify each test document and show the first 6 results
svm_out <- classify_model(container, svm_model)
head(svm_out)

##   SVM_LABEL  SVM_PROB
## 1      spam 0.9903427
## 2       ham 0.9995498
## 3      spam 0.9999998
## 4       ham 0.9994660
## 5      spam 0.9999993
## 6       ham 0.9999818

# Train the Decision Tree and Maximum Entropy models on the same container
tree_model <- train_model(container, "TREE")
maxent_model <- train_model(container, "MAXENT")

# Classify the test documents with each model
tree_out <- classify_model(container, tree_model)
maxent_out <- classify_model(container, maxent_model)


# Performance test: put the true label of every test document next to the
# label predicted by each model (first column of each classifier's output).
# FALSE is spelled out because F is an ordinary variable that can be
# reassigned.
labels_out <- data.frame(correct_label = class_labels[3001:n],
                         svm = as.character(svm_out[, 1]),
                         tree = as.character(tree_out[, 1]),
                         maxent = as.character(maxent_out[, 1]),
                         stringsAsFactors = FALSE)



# SVM: proportion of test documents where the prediction differs from
# (FALSE) or matches (TRUE) the correct label
prop.table(
  table(labels_out[["correct_label"]] == labels_out[["svm"]])
)

## 
##       FALSE        TRUE 
## 0.006681514 0.993318486

# Decision Tree: proportion of test documents where the prediction differs
# from (FALSE) or matches (TRUE) the correct label
prop.table(
  table(labels_out[["correct_label"]] == labels_out[["tree"]])
)

## 
##      FALSE       TRUE 
## 0.01113586 0.98886414

# Maximum Entropy: proportion of test documents where the prediction differs
# from (FALSE) or matches (TRUE) the correct label
prop.table(
  table(labels_out[["correct_label"]] == labels_out[["maxent"]])
)

## 
##       FALSE        TRUE 
## 0.003340757 0.996659243

Accuracy

SVM is 99.3% accurate
Decision Tree is 98.9% accurate
Maxent is 99.7% accurate

Classification

Rafal Decowski

November 5, 2017