Read in the data from the local computer. Each directory of raw messages is loaded as a tm corpus.
setwd("~/CUNY/Fall2017/Data-607-DataAcquisition/project4")
spam <- Corpus(DirSource("spam_2"), readerControl = list(language="english"))
email <- Corpus(DirSource("easy_ham_2"), readerControl = list(language="english"))
In this example, `email` stands for ham (legitimate, non-spam mail). Both corpora go through the same cleaning pipeline: lowercase, strip numbers and punctuation, drop English stopwords, and stem.
spam <- tm_map(spam, content_transformer(tolower))          # lowercase first so stopword removal matches
spam <- tm_map(spam, removeNumbers)                         # drop digits
spam <- tm_map(spam, content_transformer(str_replace_all),
               pattern = "[[:punct:]]", replacement = " ")  # replace punctuation with spaces
spam <- tm_map(spam, removeWords, words = stopwords("en"))  # remove English stopwords
spam <- tm_map(spam, stemDocument, language = "english")    # stem words to their roots
spam <- VCorpus(VectorSource(spam))                         # rebuild as a VCorpus for DocumentTermMatrix
email <- tm_map(email, content_transformer(tolower))
email <- tm_map(email, removeNumbers)
email <- tm_map(email, content_transformer(str_replace_all),
                pattern = "[[:punct:]]", replacement = " ")
email <- tm_map(email, removeWords, words = stopwords("en"))
email <- tm_map(email, stemDocument, language = "english")
email <- VCorpus(VectorSource(email))
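As a quick sanity check on the cleaning, you can peek at the first few characters of a cleaned document. A minimal sketch using tm's content() accessor (the 80-character width is an arbitrary choice):

substr(paste(content(spam[[1]]), collapse = " "), 1, 80)   # first 80 chars of cleaned spam doc 1
substr(paste(content(email[[1]]), collapse = " "), 1, 80)  # first 80 chars of cleaned ham doc 1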
Label spam documents as '+1' and ham as '-1' in the corpus metadata, then print the first 10 labels to confirm.
meta(spam, 'classification') <- '+1'
# confirm
meta(spam)[[1]][1:10]
## [1] "+1" "+1" "+1" "+1" "+1" "+1" "+1" "+1" "+1" "+1"
meta(email, 'classification') <- '-1'
# confirm
meta(email)[[1]][1:10]
## [1] "-1" "-1" "-1" "-1" "-1" "-1" "-1" "-1" "-1" "-1"
inbox <- c(email, spam, recursive = FALSE)    # combine ham and spam into one corpus
inbox_dtm <- DocumentTermMatrix(inbox)        # build the document-term matrix
inbox_dtm <- removeSparseTerms(inbox_dtm, 1 - (10/length(inbox)))  # keep terms in at least ~10 documents
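The sparsity argument 1 - (10/length(inbox)) keeps only terms that appear in at least about 10 documents. To see what survives, dim() and tm's findFreqTerms() are handy (the frequency floor of 100 is just an illustrative value):

dim(inbox_dtm)                           # documents x retained terms
findFreqTerms(inbox_dtm, lowfreq = 100)  # terms occurring at least 100 times in total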
Create a cutoff point to separate the dataset into training and testing subsets, then extract the spam/non-spam classifiers. Note that `inbox` lists all ham first and all spam second, so this sequential 70/30 split is not random; a shuffled alternative is sketched below.
cutoff <- ceiling(length(inbox) * 0.7)               # 70% train / 30% test
class <- unlist(meta(inbox, "classification")[, 1])  # pull the +1/-1 labels
gmail <- create_container(inbox_dtm, labels = class,
                          trainSize = 1:cutoff,
                          testSize = (cutoff + 1):length(inbox),
                          virgin = FALSE)
slotNames(gmail)
## [1] "training_matrix" "classification_matrix" "training_codes"
## [4] "testing_codes" "column_names" "virgin"
Use three machine learning models from RTextTools (a Support Vector Machine, a decision tree, and maximum entropy, which is equivalent to multinomial logistic regression) to classify emails as spam (+1) or ham (-1).
# Train each model on the training slice, then classify the test slice
svm_model <- train_model(gmail, "SVM")
tree_model <- train_model(gmail, "TREE")
maxent_model <- train_model(gmail, "MAXENT")

svm_out <- classify_model(gmail, svm_model)
tree_out <- classify_model(gmail, tree_model)
maxent_out <- classify_model(gmail, maxent_model)
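Given the non-random split noted above, k-fold cross-validation would give a steadier accuracy estimate. RTextTools provides cross_validate() for this; a minimal sketch, with an assumed fold count of 4:

svm_cv <- cross_validate(gmail, 4, "SVM")  # assumed 4 folds; reports per-fold SVM accuracy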
The SVM model is the best classifier, with a test-set accuracy of about 77%. Maximum entropy follows at roughly 72%. The worst is the decision tree at about 49%, below chance: flipping a coin would beat it on this split.
### SVM Performance:
head(svm_out)
## SVM_LABEL SVM_PROB
## 1 +1 0.9955662
## 2 +1 1.0000000
## 3 +1 0.9970022
## 4 +1 0.9999981
## 5 +1 0.9924426
## 6 +1 0.9923491
prop.table(table(svm_out[,1] == class[(cutoff+1):length(class)]))
##
## FALSE TRUE
## 0.227652 0.772348
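Overall accuracy hides where the errors fall, which matters here because the unshuffled split leaves the test slice dominated by spam. A full confusion matrix with base R's table() makes this visible (the dimension names are mine):

table(predicted = svm_out[, 1],
      actual = class[(cutoff + 1):length(class)])  # rows: SVM labels, columns: true labels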
### Decision Tree Performance:
head(tree_out)
## TREE_LABEL TREE_PROB
## 1 +1 1.0000000
## 2 +1 0.8571429
## 3 +1 1.0000000
## 4 +1 1.0000000
## 5 +1 1.0000000
## 6 +1 1.0000000
prop.table(table(tree_out[,1] == class[(cutoff+1):length(class)]))
##
## FALSE TRUE
## 0.5137068 0.4862932
### Maximum Entropy Performance:
head(maxent_out)
## MAXENTROPY_LABEL MAXENTROPY_PROB
## 1 +1 1.0000000
## 2 +1 1.0000000
## 3 +1 1.0000000
## 4 +1 1.0000000
## 5 +1 0.9999920
## 6 +1 0.9999921
prop.table(table(maxent_out[,1] == class[(cutoff+1):length(class)]))
##
## FALSE TRUE
## 0.2765197 0.7234803
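Finally, RTextTools can aggregate all three result sets into one precision/recall/F-score report via create_analytics(); a minimal sketch (the object names are mine):

results <- cbind(svm_out, tree_out, maxent_out)  # combine the three classifiers' outputs
analytics <- create_analytics(gmail, results)    # per-algorithm and per-label summaries
summary(analytics)                               # precision, recall, and F-scores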