Project 4 - Document Classification

Read in data from the local computer

setwd("~/CUNY/Fall2017/Data-607-DataAcquisition/project4")

spam <- Corpus(DirSource("spam_2"), readerControl = list(language="english"))
email <- Corpus(DirSource("easy_ham_2"), readerControl = list(language="english"))
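
As a quick sanity check (illustrative; output not shown in the original run), the size of each corpus can be confirmed with length():

length(spam)    # number of spam messages read
length(email)   # number of ham messages read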

Process/Clean the Data

In this example, email refers to ham (legitimate, non-spam messages).

spam <- tm_map(spam, removeNumbers)
spam <- tm_map(spam, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
spam <- tm_map(spam, content_transformer(tolower))     # lowercase before removing stopwords, which are lowercase
spam <- tm_map(spam, removeWords, words = stopwords("en"))
spam <- tm_map(spam, stemDocument, language='english')
spam <- VCorpus(VectorSource(spam))                    # rebuild as a VCorpus so per-document metadata can be assigned


email <- tm_map(email, removeNumbers)
email <- tm_map(email, content_transformer(str_replace_all), pattern = "[[:punct:]]", replacement = " ")
email <- tm_map(email, content_transformer(tolower))   # lowercase before removing stopwords, which are lowercase
email <- tm_map(email, removeWords, words = stopwords("en"))
email <- tm_map(email, stemDocument, language='english')
email <- VCorpus(VectorSource(email))                  # rebuild as a VCorpus so per-document metadata can be assigned
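
To verify the cleaning (an illustrative check; output not shown in the original run), the first cleaned document can be inspected:

inspect(email[1])   # print the first cleaned ham document and its metadata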

Classify the Data

Label spam as ‘+1’ and email (ham) as ‘-1’, then print the first 10 values to confirm.

meta(spam, 'classification') <- '+1'
# confirm
meta(spam)[[1]][1:10]
##  [1] "+1" "+1" "+1" "+1" "+1" "+1" "+1" "+1" "+1" "+1"
meta(email, 'classification') <- '-1'
# confirm
meta(email)[[1]][1:10]
##  [1] "-1" "-1" "-1" "-1" "-1" "-1" "-1" "-1" "-1" "-1"

Combine the two corpora and convert the result into a DocumentTermMatrix object. The removeSparseTerms() call prunes rare terms, keeping only those that appear in at least 10 documents.

inbox <- c(email, spam, recursive = FALSE)    # ham documents first, then spam
inbox_dtm <- DocumentTermMatrix(inbox)
inbox_dtm <- removeSparseTerms(inbox_dtm, 1 - (10/length(inbox)))   # keep terms present in >= 10 documents
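
It can also help to see how large the pruned matrix is (illustrative; output not shown in the original run):

dim(inbox_dtm)                 # rows = documents, columns = retained terms
inspect(inbox_dtm[1:3, 1:5])   # a corner of the matrix: term counts per document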

Extract labels and create a container

Create a cutoff point to split the dataset into training (70%) and testing (30%) subsets, then extract the spam/ham class labels. Note that the combined corpus is ordered ham first and spam second, so a sequential split skews the test set toward spam; see the shuffled-split sketch after the container code below.

cutoff <- ceiling(length(inbox)*0.7)                 # 70% of documents for training
class <- unlist(meta(inbox, "classification")[,1])   # per-document +1/-1 labels

gmail <- create_container(inbox_dtm, labels = class, 
                              trainSize = 1:cutoff,
                              testSize = (cutoff+1):length(inbox), virgin = FALSE)
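
Because c(email, spam) keeps every ham message before every spam message, the sequential split above trains mostly on ham and tests almost entirely on spam. A minimal sketch of a shuffled split (a hypothetical variant, not part of the original run; the seed is arbitrary, and create_container is assumed to accept arbitrary index vectors for trainSize/testSize):

set.seed(607)                    # arbitrary seed for reproducibility
idx <- sample(length(inbox))     # random permutation of document indices
gmail_shuffled <- create_container(inbox_dtm, labels = class,
                                   trainSize = idx[1:cutoff],
                                   testSize = idx[(cutoff+1):length(inbox)],
                                   virgin = FALSE)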

slotNames(gmail)
## [1] "training_matrix"       "classification_matrix" "training_codes"       
## [4] "testing_codes"         "column_names"          "virgin"

Classification Models

Train three machine learning models from RTextTools (a support vector machine, a decision tree, and a maximum entropy model, i.e., multinomial logistic regression) to classify emails as spam (+1) or ham (-1).

svm_model <- train_model(gmail,"SVM")
tree_model <- train_model(gmail,"TREE")
maxent_model <- train_model(gmail,"MAXENT")

svm_out <- classify_model(gmail,svm_model)
tree_out <- classify_model(gmail,tree_model)
maxent_out <- classify_model(gmail,maxent_model)

Results

The SVM model is the best classifier, with an accuracy of about 77% on the test set; maximum entropy follows at about 72%. The worst is the decision tree, at under 49% accuracy: flipping a coin would do better.

SVM Performance:

head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1        +1 0.9955662
## 2        +1 1.0000000
## 3        +1 0.9970022
## 4        +1 0.9999981
## 5        +1 0.9924426
## 6        +1 0.9923491
prop.table(table(svm_out[,1] == class[(cutoff+1):length(class)]))   # TRUE = correct prediction
## 
##    FALSE     TRUE 
## 0.227652 0.772348

Decision Tree Performance:

head(tree_out)
##   TREE_LABEL TREE_PROB
## 1         +1 1.0000000
## 2         +1 0.8571429
## 3         +1 1.0000000
## 4         +1 1.0000000
## 5         +1 1.0000000
## 6         +1 1.0000000
prop.table(table(tree_out[,1] == class[(cutoff+1):length(class)]))
## 
##     FALSE      TRUE 
## 0.5137068 0.4862932

Maximum Entropy Performance:

head(maxent_out)
##   MAXENTROPY_LABEL MAXENTROPY_PROB
## 1               +1       1.0000000
## 2               +1       1.0000000
## 3               +1       1.0000000
## 4               +1       1.0000000
## 5               +1       0.9999920
## 6               +1       0.9999921
prop.table(table(maxent_out[,1] == class[(cutoff+1):length(class)]))
## 
##     FALSE      TRUE 
## 0.2765197 0.7234803
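
As a follow-up, RTextTools can summarize precision, recall, and F-scores for all three models at once with create_analytics(); a minimal sketch (not run in the original analysis):

analytics <- create_analytics(gmail, cbind(svm_out, tree_out, maxent_out))
summary(analytics)   # per-algorithm precision, recall, and F-scores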