Overview

The objective of this project is to build a supervised learning model that predicts whether a text message is spam or a legitimate message, a.k.a. “ham”.

Packages

library(tm)         #text mining: corpus and document-term matrix
library(dplyr)      #data manipulation
library(SnowballC)  #word stemming
library(caret)      #data splitting and confusionMatrix (also attaches ggplot2)
library(xgboost)    #boosted trees (see Next Steps)
library(gmodels)    #CrossTable
library(wordcloud)  #word cloud visualizations
library(e1071)      #for naive bayes
library(gridExtra)  #arranging multiple plots

Read in Data

#read in data
sms_raw <- read.csv("spam.csv", stringsAsFactors = F) %>%
  rename("type" = "v1", "text" = "v2") %>%
  dplyr::select(type, text)

sms_raw$type <- factor(sms_raw$type)

#remove emojis and other non-ASCII characters from the text strings
sms_raw <- data.frame(type = sms_raw$type, text = gsub("[^\x01-\x7F]", "", sms_raw$text))

Create a corpus of the message text to use in the model

sms_corpus <- VCorpus(VectorSource(sms_raw$text))

Number of documents in corpus

sms_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 5572

Inspect a couple of the texts

lapply(sms_corpus[1:2], as.character)
## $`1`
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
## 
## $`2`
## [1] "Ok lar... Joking wif u oni..."

Clean Corpus

Clean the corpus using tm_map(), which applies a transformation to every document in the corpus. Because tolower() is not built into the tm package, it has to be wrapped in content_transformer() to convert the text to lower case.

clean_sms_corpus <- tm_map(sms_corpus, content_transformer(tolower))
#check
as.character(clean_sms_corpus[[1]])
## [1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."

Strip numbers; remove stop words, which are commonly used words like “for”, “the”, and “then”; remove punctuation; and stem the words, which reduces each word to its root. For example, “testing”, “tested”, and “test” all stem to “test”. Lastly, remove leftover white space and check the clean version.

#strip numbers from corpus
clean_sms_corpus <- tm_map(clean_sms_corpus, removeNumbers)
#remove stop words
clean_sms_corpus <- tm_map(clean_sms_corpus, removeWords, stopwords())
#remove punctuation
clean_sms_corpus <- tm_map(clean_sms_corpus, removePunctuation)
#stemming - wordStem() (from SnowballC) works on a character vector; stemDocument() (from tm) applies stemming to the entire corpus
clean_sms_corpus <- tm_map(clean_sms_corpus, stemDocument)
#remove leftover whitespace
clean_sms_corpus <- tm_map(clean_sms_corpus, stripWhitespace)
#check clean corpus
lapply(clean_sms_corpus[1:2], as.character)
## $`1`
## [1] "go jurong point crazi avail bugi n great world la e buffet cine got amor wat"
## 
## $`2`
## [1] "ok lar joke wif u oni"

Tokenize Data

Tokenize the text: each message is split into tokens (words) and converted into a DTM (document-term matrix), which records the frequency of each word in each document.

sms_dtm <- DocumentTermMatrix(clean_sms_corpus)

Inspect the DTM

inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 5572, terms: 6475)>>
## Non-/sparse entries: 42356/36036344
## Sparsity           : 100%
## Maximal term length: 40
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   call can come free get just know ltgt now will
##   1085    0   0    1    0   1    0    0    0   0   11
##   1579    0   0    0    0   0    0    0   18   0    0
##   1863    0   0    0    0   0    0    1    0   0    0
##   2158    0   0    0    0   0    0    0    0   0    0
##   2370    0   0    0    1   0    0    0    0   0    0
##   2380    0   1    0    0   0    0    0    1   0    0
##   2434    0   3    0    1   1    0    0    6   0    0
##   2848    0   0    0    0   0    0    0    0   0    0
##   3016    0   0    0    0   0    0    0    2   0    0
##   5105    0   0    0    1   0    0    0    0   0    0

Visualizing Spam and Ham word frequency

spam <- subset(sms_raw, type =="spam")
ham <- subset(sms_raw, type =="ham")
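
The word clouds below could be produced with the wordcloud package loaded above. A minimal sketch (the max.words and scale values are illustrative choices, not from the original analysis):

#word cloud of the most frequent terms in each class
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5), random.order = FALSE)
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5), random.order = FALSE)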

Spam word cloud

Ham word cloud

Create test and train data

#manual split since the messages are already in random order (see the createDataPartition() sketch below for an alternative)
sms_dtm_train <- sms_dtm[1:4457,]
sms_dtm_test <- sms_dtm[4458:5572,]
sms_train_labels <- sms_raw[1:4457,]$type
sms_test_labels <- sms_raw[4458:5572,]$type
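
As an aside, caret's createDataPartition() could also be used here; it returns row indices that work for subsetting both the DTM and the label vector. A sketch of that alternative split (not the split used for the results below; the seed is arbitrary):

#stratified 80/20 split on the class labels
set.seed(123)
train_idx <- as.vector(createDataPartition(sms_raw$type, p = 0.8, list = FALSE))
test_idx <- setdiff(seq_len(nrow(sms_dtm)), train_idx)
alt_dtm_train <- sms_dtm[train_idx, ]
alt_dtm_test <- sms_dtm[test_idx, ]
alt_train_labels <- sms_raw$type[train_idx]
alt_test_labels <- sms_raw$type[test_idx]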

start_time <- Sys.time()

#minimum word-frequency thresholds to test
word_thresh <- c(1, 5, 10)
Results_per_word_thresh <- NULL

for (i in word_thresh) {
  #keep only words appearing at least i times in the training DTM
  freq_words <- findFreqTerms(sms_dtm_train, i)

  #train and test filtered to the frequent words
  freq_sms_dtm_train <- sms_dtm_train[, freq_words]
  freq_sms_dtm_test <- sms_dtm_test[, freq_words]

  #convert counts to categorical ("Yes" if the word appears, "No" otherwise)
  convert_counts <- function(x) {
    ifelse(x > 0, "Yes", "No")
  }

  #apply() runs the function over the entire matrix
  #NOTE: MARGIN = 2 is for columns, MARGIN = 1 for rows
  sms_train <- apply(freq_sms_dtm_train, MARGIN = 2, convert_counts)
  sms_test <- apply(freq_sms_dtm_test, MARGIN = 2, convert_counts)

  #create Naive Bayes model with laplace = 0 (default)
  sms_classifier <- naiveBayes(sms_train, sms_train_labels)

  sms_test_predict <- predict(sms_classifier, sms_test)

  OVERALL_ACCURACY <- confusionMatrix(sms_test_predict, sms_test_labels)$overall["Accuracy"]
  Results_per_word_thresh <- rbind(Results_per_word_thresh, data.frame(i, OVERALL_ACCURACY))
}

end_time <- Sys.time()

#time to complete
end_time - start_time
## Time difference of 2.112193 mins
#accuracy for each minimum word-frequency threshold used in the model
freq_plot <- ggplot(Results_per_word_thresh, aes(i, OVERALL_ACCURACY)) +
  geom_line() +
  ylim(.85, 1)

Results

Below we can see that accuracy does not vary much between a minimum word-frequency threshold of 1 and a threshold of 10. The run time for this model is also quite slow.

end_time - start_time
freq_plot
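
For a more detailed view than a single accuracy figure, gmodels' CrossTable() (loaded above) can cross-tabulate predictions against the true labels. A minimal sketch, assuming sms_test_predict and sms_test_labels from the last loop iteration (threshold of 10) are still in the workspace:

#confusion matrix of predicted vs. actual labels
CrossTable(sms_test_predict, sms_test_labels,
           prop.chisq = FALSE, prop.t = FALSE,
           dnn = c("predicted", "actual"))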

Next Steps

Explore XGBoost and see whether a boosted tree model performs better and runs more efficiently.
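
A rough sketch of how XGBoost might be fit on the same frequent-word DTM, using numeric term counts rather than the "Yes"/"No" indicators used for Naive Bayes (nrounds and eta are placeholder values, not tuned, and the classic xgboost() interface is assumed):

#frequent-word DTMs from the last loop iteration, converted to numeric matrices
xgb_train <- xgb.DMatrix(as.matrix(freq_sms_dtm_train),
                         label = as.numeric(sms_train_labels == "spam"))
xgb_test <- as.matrix(freq_sms_dtm_test)

#boosted tree classifier with a logistic objective
xgb_fit <- xgboost(data = xgb_train, nrounds = 50, eta = 0.3,
                   objective = "binary:logistic", verbose = 0)

#convert predicted probabilities to class labels and score accuracy
xgb_pred <- factor(ifelse(predict(xgb_fit, xgb_test) > 0.5, "spam", "ham"),
                   levels = levels(sms_test_labels))
confusionMatrix(xgb_pred, sms_test_labels)$overall["Accuracy"]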