library(tm)
## Loading required package: NLP
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 4.0.5
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
library(knitr)
library(tidyverse)
## Registered S3 method overwritten by 'cli':
##   method     from
##   print.tree tree
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.5
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter()     masks stats::filter()
## x dplyr::lag()        masks stats::lag()
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
#library(quanteda)

Spam Filter

In this project I build a simple spam filter. The dataset, downloaded from Kaggle, contains 5572 texts, 747 of which are spam. We will be using the RTextTools package.

(https://www.kaggle.com/uciml/sms-spam-collection-dataset/home?select=spam.csv)

We begin by reading the csv into a dataframe. RTextTools does not randomly select the train and test datasets (you need to specify the rows for each) - therefore we split the dataframes up into spam and ham and then rejoin them to ensure enough spam and ham in each group.

The final result is a corpus.

# Load the SMS spam collection. Column v1 holds the label ("spam"/"ham") and
# column v2 holds the message text.
dfTexts <- read.csv(
  "D:\\RStudio\\CUNY_607\\Projects\\Project 4\\spam.csv",
  row.names = NULL
)

# Number of messages of each class reserved for training (about 70% of the
# 747 spam and 4825 ham messages in the dataset).
n_spam_train <- 523
n_ham_train <- 3378

dfSpam <- dfTexts %>%
  filter(v1 == "spam")

dfHam <- dfTexts %>%
  filter(v1 == "ham")

# Training rows: the first n messages of each class.
dfSpam1 <- dfSpam %>%
  slice_head(n = n_spam_train)

dfHam1 <- dfHam %>%
  slice_head(n = n_ham_train)

# Test rows: everything AFTER the training rows. Taking slice_head() here as
# well would make the test set a subset of the training set, so the models
# would be evaluated on messages they were trained on.
# NOTE: any metrics produced with the overlapping split are optimistic and
# should be regenerated.
dfSpam2 <- dfSpam %>%
  slice(-seq_len(n_spam_train))

dfHam2 <- dfHam %>%
  slice(-seq_len(n_ham_train))

# One corpus per (class, split) combination, one document per message.
Corpus_s1 <- VCorpus(VectorSource(dfSpam1$v2))
Corpus_h1 <- VCorpus(VectorSource(dfHam1$v2))
Corpus_s2 <- VCorpus(VectorSource(dfSpam2$v2))
Corpus_h2 <- VCorpus(VectorSource(dfHam2$v2))

# Tag every document with its class so the labels can be recovered later.
meta(Corpus_s1, tag = "type") <- "spam"
meta(Corpus_h1, tag = "type") <- "ham"
meta(Corpus_s2, tag = "type") <- "spam"
meta(Corpus_h2, tag = "type") <- "ham"

# Order matters: training documents (spam, ham) come first, followed by the
# test documents (spam, ham), because the container is built from row ranges.
Corpus_All <- c(Corpus_s1, Corpus_h1, Corpus_s2, Corpus_h2)

From here we create a document-term matrix, removing sparse terms to speed up our process.

# Build the document-term matrix, keeping stopwords for the baseline run.
dtm <- DocumentTermMatrix(Corpus_All, control = list(stopwords = FALSE))

# Drop very rare terms to keep the matrix small: with this threshold, terms
# that appear in fewer than about 10 documents are removed.
dtm <- removeSparseTerms(dtm, 1 - (10 / length(Corpus_All)))

We create a container from the dtm. The labels must be recoded as numeric or RTextTools throws an error.

# One label per document, in the same order as the rows of the dtm.
Corpus_labels <- as.vector(unlist(meta(Corpus_All)))

# RTextTools needs numeric labels. The levels are spelled out so the coding is
# explicit rather than relying on alphabetical order: ham = 1, spam = 2.
label_codes <- as.numeric(factor(Corpus_labels, levels = c("ham", "spam")))

# The training documents are the first two corpora in Corpus_All; the rest
# are the held-out test documents.
n_train <- length(Corpus_s1) + length(Corpus_h1)
n_docs <- length(Corpus_All)

container <- create_container(dtm,
                              labels = label_codes,
                              trainSize = seq_len(n_train),
                              testSize = (n_train + 1):n_docs,
                              virgin = FALSE)

We run two models (support vector machine and a generalized linear model).

# Fit both classifiers on the training rows of the container.
models <- train_models(
  container = container,
  algorithms = c("GLMNET", "SVM")
)

# Score the held-out rows with every fitted model.
results <- classify_models(
  container = container,
  models = models
)

# Summarise precision, recall and F-score per label and per algorithm.
analytics <- create_analytics(
  container = container,
  classification_results = results
)
slot(analytics, "algorithm_summary")
##   SVM_PRECISION SVM_RECALL SVM_FSCORE GLMNET_PRECISION GLMNET_RECALL
## 1          0.99       1.00       0.99             0.97          1.00
## 2          1.00       0.94       0.97             0.98          0.81
##   GLMNET_FSCORE
## 1          0.98
## 2          0.89

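Because the labels were coded with `factor()`, whose levels are sorted alphabetically, 1 is ham and 2 is spam. Both models do an excellent job overall, except that a moderate share of spam is missed, i.e. classified as ham (spam recall is 0.94 for SVM and 0.81 for GLMNET). SVM is as good as or better than GLMNET across the board.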

We see if we can improve the model by removing stopwords:

# Rebuild the document-term matrix, this time dropping stopwords.
dtm <- DocumentTermMatrix(Corpus_All, control = list(stopwords = TRUE))

# Same sparsity threshold as the baseline run: terms that appear in fewer
# than about 10 documents are removed.
dtm <- removeSparseTerms(dtm, 1 - (10 / length(Corpus_All)))

# One label per document, in the same order as the rows of the dtm.
Corpus_labels <- as.vector(unlist(meta(Corpus_All)))

# RTextTools needs numeric labels. The levels are spelled out so the coding is
# explicit rather than relying on alphabetical order: ham = 1, spam = 2.
label_codes <- as.numeric(factor(Corpus_labels, levels = c("ham", "spam")))

# The training documents are the first two corpora in Corpus_All; the rest
# are the held-out test documents.
n_train <- length(Corpus_s1) + length(Corpus_h1)
n_docs <- length(Corpus_All)

container <- create_container(dtm,
                              labels = label_codes,
                              trainSize = seq_len(n_train),
                              testSize = (n_train + 1):n_docs,
                              virgin = FALSE)

# Fit, score and summarise exactly as in the baseline run.
models <- train_models(container, algorithms = c("GLMNET", "SVM"))
results <- classify_models(container, models)

analytics <- create_analytics(container, results)
analytics@algorithm_summary
##   SVM_PRECISION SVM_RECALL SVM_FSCORE GLMNET_PRECISION GLMNET_RECALL
## 1          0.99       1.00       0.99             0.97          1.00
## 2          1.00       0.93       0.96             0.98          0.79
##   GLMNET_FSCORE
## 1          0.98
## 2          0.87

In fact, both models now do a slightly worse job of catching spam: spam recall falls from 0.94 to 0.93 for SVM and from 0.81 to 0.79 for GLMNET. In other words, without certain stopwords we are more likely to misclassify spam as ham.

Conclusion

RTextTools provides a very simple and effective way to work with labeled data. Once we created a proper container, from running the models to getting analytics took only three lines.