library(tm)
## Loading required package: NLP
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 4.0.5
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
library(knitr)
library(tidyverse)
## Registered S3 method overwritten by 'cli':
## method from
## print.tree tree
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.5
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x ggplot2::annotate() masks NLP::annotate()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
#library(quanteda)
Data source: the SMS Spam Collection dataset on Kaggle (https://www.kaggle.com/uciml/sms-spam-collection-dataset/home?select=spam.csv).
We begin by reading the csv into a dataframe. RTextTools does not randomly select the train and test datasets (you need to specify the rows for each) - therefore we split the dataframes up into spam and ham and then rejoin them to ensure enough spam and ham in each group.
The final result is a corpus.
# Load the SMS data: column v1 holds the label ("spam" / "ham") and column v2
# holds the message text.
dfTexts <- read.csv(
  "D:\\RStudio\\CUNY_607\\Projects\\Project 4\\spam.csv",
  row.names = NULL
)

# Number of leading rows of each class that go into the training partition.
n_spam_train <- 523
n_ham_train <- 3378

dfSpam <- dfTexts %>%
  filter(v1 == "spam")
dfHam <- dfTexts %>%
  filter(v1 == "ham")

# Training partitions: the first rows of each class.
dfSpam1 <- dfSpam %>%
  slice_head(n = n_spam_train)
dfHam1 <- dfHam %>%
  slice_head(n = n_ham_train)

# Test partitions: the rows that FOLLOW the training rows. Calling
# slice_head() on the full class again would re-select training rows and
# leak them into the test set, so the training rows are dropped first.
dfSpam2 <- dfSpam %>%
  slice(-seq_len(n_spam_train)) %>%
  slice_head(n = 224)
dfHam2 <- dfHam %>%
  slice(-seq_len(n_ham_train)) %>%
  slice_head(n = 1447)
# Training corpora, each tagged with its class in the "type" metadata field.
Corpus_s1 <- VCorpus(VectorSource(dfSpam1[["v2"]]))
meta(Corpus_s1, tag = "type") <- "spam"
Corpus_h1 <- VCorpus(VectorSource(dfHam1[["v2"]]))
meta(Corpus_h1, tag = "type") <- "ham"

# Test corpora, tagged the same way.
Corpus_s2 <- VCorpus(VectorSource(dfSpam2[["v2"]]))
meta(Corpus_s2, tag = "type") <- "spam"
Corpus_h2 <- VCorpus(VectorSource(dfHam2[["v2"]]))
meta(Corpus_h2, tag = "type") <- "ham"

# Order matters: both training corpora come first, then both test corpora,
# because the train/test sets are later selected by row range.
Corpus_All <- c(
  Corpus_s1, Corpus_h1,
  Corpus_s2, Corpus_h2
)
From here we create a document-term matrix, removing sparse terms to speed up our process.
# Build the document-term matrix, keeping stopwords for this first run.
dtm <- DocumentTermMatrix(Corpus_All, control = list(stopwords = FALSE))
# Drop rare terms: with a maximum sparsity of 1 - 10/N (N = number of
# documents), only terms occurring in more than 10 documents are retained.
dtm <- removeSparseTerms(dtm, 1 - (10 / length(Corpus_All)))
We create a container from the dtm. The labels must be recoded as numeric or RTextTools throws an error.
# Flatten the per-document "type" metadata into a plain character vector of
# labels, in the same order as the documents in Corpus_All.
Corpus_labels <- as.vector(unlist(meta(Corpus_All)))
# factor() sorts its levels, so the numeric coding is ham = 1, spam = 2.
# Rows 1:3901 are the training documents (523 spam + 3378 ham); the
# remaining rows 3902:5572 are the test documents.
container <- create_container(
  dtm,
  labels = as.numeric(factor(Corpus_labels)),
  trainSize = 1:3901,
  testSize = 3902:5572,
  virgin = FALSE
)
We run two models (support vector machine and a generalized linear model).
# Fit both classifiers on the container's training rows.
models <- train_models(
  container,
  algorithms = c("GLMNET", "SVM")
)
# Score the test rows with each fitted model.
results <- classify_models(container, models)
# Collect the evaluation metrics and print the per-class summary
# (precision, recall and F-score for each algorithm).
analytics <- create_analytics(container, results)
slot(analytics, "algorithm_summary")
## SVM_PRECISION SVM_RECALL SVM_FSCORE GLMNET_PRECISION GLMNET_RECALL
## 1 0.99 1.00 0.99 0.97 1.00
## 2 1.00 0.94 0.97 0.98 0.81
## GLMNET_FSCORE
## 1 0.98
## 2 0.89
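1 is ham and 2 is spam (`factor()` sorts the labels alphabetically before they are converted to numbers). Both models do an excellent job overall, except that a moderate amount of spam is being classified as ham, as the lower recall for class 2 shows. SVM matches or slightly beats GLMNET across the board for all metrics.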
We see if we can improve the model by removing stopwords:
# Rebuild the document-term matrix, this time with stopwords removed.
dtm <- DocumentTermMatrix(Corpus_All, control = list(stopwords = TRUE))
# Drop rare terms: with a maximum sparsity of 1 - 10/N (N = number of
# documents), only terms occurring in more than 10 documents are retained.
dtm <- removeSparseTerms(dtm, 1 - (10 / length(Corpus_All)))
# Flatten the per-document "type" metadata into a plain character vector of
# labels, in the same order as the documents in Corpus_All.
Corpus_labels <- as.vector(unlist(meta(Corpus_All)))
# factor() sorts its levels, so the numeric coding is ham = 1, spam = 2.
# Rows 1:3901 are used for training and rows 3902:5572 for testing.
container <- create_container(
  dtm,
  labels = as.numeric(factor(Corpus_labels)),
  trainSize = 1:3901,
  testSize = 3902:5572,
  virgin = FALSE
)
# Refit both classifiers on the stopword-free container.
models <- train_models(
  container,
  algorithms = c("GLMNET", "SVM")
)
# Score the test rows with each fitted model.
results <- classify_models(container, models)
# Collect the evaluation metrics and print the per-class summary
# (precision, recall and F-score for each algorithm).
analytics <- create_analytics(container, results)
slot(analytics, "algorithm_summary")
## SVM_PRECISION SVM_RECALL SVM_FSCORE GLMNET_PRECISION GLMNET_RECALL
## 1 0.99 1.00 0.99 0.97 1.00
## 2 1.00 0.93 0.96 0.98 0.79
## GLMNET_FSCORE
## 1 0.98
## 2 0.87
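In fact, both models do slightly worse. Recall for class 2 drops for both SVM and GLMNET, and class 2 is spam, because `factor()` sorts the labels alphabetically (ham = 1, spam = 2). This tells us that without certain stopwords, we are more likely to misidentify spam as ham.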
RTextTools provides a very simple and effective way to work with labeled data. Once we created a proper container, from running the models to getting analytics took only three lines.