In this project the objective is to build a supervised learning model that predicts whether a text message is spam or a legitimate message ("ham").
library(tm)
library(dplyr)
library(SnowballC)
library(caret)
library(xgboost)
library(gmodels)
library(wordcloud)
library(e1071) #for naive bayes
library(gridExtra)
#read in data
sms_raw <- read.csv("spam.csv", stringsAsFactors = FALSE) %>%
  rename("type" = "v1", "text" = "v2") %>%
  dplyr::select(type, text)
sms_raw$type <- factor(sms_raw$type)
#remove emojis and other non-ASCII characters from the text strings
sms_raw <- data.frame(type = sms_raw$type, text = gsub("[^\x01-\x7F]", "", sms_raw$text))
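As a quick illustrative check of that pattern (the sample string here is made up), any character outside the ASCII range is dropped:
#illustrative only: accented letters and emojis fall outside \x01-\x7F and are removed
gsub("[^\x01-\x7F]", "", "caf\u00e9 at 5pm") #expected result: "caf at 5pm"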
sms_corpus <- VCorpus(VectorSource(sms_raw$text))
Number of documents in corpus
sms_corpus
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 5572
Inspect a couple of the texts
lapply(sms_corpus[1:2], as.character)
## $`1`
## [1] "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
##
## $`2`
## [1] "Ok lar... Joking wif u oni..."
Clean the corpus using tm_map(), which applies a transformation to every document in the corpus. Because tolower() is not one of tm's built-in transformations, it has to be wrapped in content_transformer() to convert the text to lower case.
clean_sms_corpus <- tm_map(sms_corpus, content_transformer(tolower))
#check
as.character(clean_sms_corpus[[1]])
## [1] "go until jurong point, crazy.. available only in bugis n great world la e buffet... cine there got amore wat..."
Strip numbers, remove stop words (commonly used words like "for", "the", and "then"), remove punctuation, and stem the words, which reduces each word to its root; for example, "testing", "tested", and "test" all stem to "test". Lastly, remove leftover white space and check the cleaned version. (A standalone wordStem() example follows the cleaned output below.)
#strip numbers from corpus
clean_sms_corpus <- tm_map(clean_sms_corpus, removeNumbers)
#remove stop words
clean_sms_corpus <- tm_map(clean_sms_corpus, removeWords, stopwords())
#remove punctuation
clean_sms_corpus <- tm_map(clean_sms_corpus, removePunctuation)
#stemming: wordStem() from SnowballC works on a character vector; stemDocument() from tm applies stemming to an entire corpus
clean_sms_corpus <- tm_map(clean_sms_corpus, stemDocument)
#remove leftover whitespace
clean_sms_corpus <- tm_map(clean_sms_corpus, stripWhitespace)
#check clean corpus
lapply(clean_sms_corpus[1:2], as.character)
## $`1`
## [1] "go jurong point crazi avail bugi n great world la e buffet cine got amor wat"
##
## $`2`
## [1] "ok lar joke wif u oni"
Tokenize the text: each message is split into tokens (words) and arranged into a DTM (document term matrix), where each row is a message, each column is a word, and each cell holds that word's frequency in that message.
sms_dtm <- DocumentTermMatrix(clean_sms_corpus)
Inspect the DTM
inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 5572, terms: 6475)>>
## Non-/sparse entries: 42356/36036344
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs call can come free get just know ltgt now will
## 1085 0 0 1 0 1 0 0 0 0 11
## 1579 0 0 0 0 0 0 0 18 0 0
## 1863 0 0 0 0 0 0 1 0 0 0
## 2158 0 0 0 0 0 0 0 0 0 0
## 2370 0 0 0 1 0 0 0 0 0 0
## 2380 0 1 0 0 0 0 0 1 0 0
## 2434 0 3 0 1 1 0 0 6 0 0
## 2848 0 0 0 0 0 0 0 0 0 0
## 3016 0 0 0 0 0 0 0 2 0 0
## 5105 0 0 0 1 0 0 0 0 0 0
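To make the row/column structure concrete, a small corner of the DTM can be converted to a dense matrix and printed (illustrative only; the full DTM is far too large and sparse to print whole):
#peek at the first few documents and terms as a dense matrix
as.matrix(sms_dtm[1:3, 1:6])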
#subset the raw data by class
spam <- subset(sms_raw, type == "spam")
ham <- subset(sms_raw, type == "ham")
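Since the wordcloud package was loaded above, the spam and ham subsets also lend themselves to a quick visual comparison; a minimal sketch (the max.words and scale values are illustrative choices, not tuned):
#word clouds of the most frequent terms in each class
wordcloud(spam$text, max.words = 40, scale = c(3, 0.5))
wordcloud(ham$text, max.words = 40, scale = c(3, 0.5))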
#manual 80/20 train/test split; the rows are already in random order (see the createDataPartition() sketch below for a stratified alternative)
sms_dtm_train <- sms_dtm[1:4457,]
sms_dtm_test <- sms_dtm[4458:5572,]
sms_train_labels <- sms_raw[1:4457,]$type
sms_test_labels <- sms_raw[4458:5572,]$type
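As an aside on the split above, caret's createDataPartition() could produce an equivalent stratified split by building the index on the labels and then subsetting the DTM by row; a minimal sketch, not run here (p = 0.8 and the seed are illustrative assumptions):
#hedged sketch of a stratified alternative to the manual split
#set.seed(123)
#train_idx <- createDataPartition(sms_raw$type, p = 0.8, list = FALSE)[, 1]
#sms_dtm_train <- sms_dtm[train_idx, ]
#sms_dtm_test <- sms_dtm[-train_idx, ]
#sms_train_labels <- sms_raw$type[train_idx]
#sms_test_labels <- sms_raw$type[-train_idx]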
start_time <- Sys.time()
word_thresh <- c(1,5,10)
Results_per_word_thresh=NULL
for (i in word_thresh) {
#i is the minimum number of times a word must appear in the training DTM to be kept
freq_words <- findFreqTerms(sms_dtm_train, i)
#train and test filtered to high frequency words
freq_sms_dtm_train <- sms_dtm_train[, freq_words]
freq_sms_dtm_test <- sms_dtm_test[, freq_words]
#Change frequency value to categorical and apply function to train and test DTMs
convert_counts <- function(x) {
ifelse(x>0, "Yes", "No")}
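#e.g. convert_counts(c(0, 2, 5)) returns "No" "Yes" "Yes"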
#apply() runs the function over the entire matrix
#NOTE: MARGIN = 2 applies it by column; MARGIN = 1 would apply it by row
sms_train <- apply(freq_sms_dtm_train, MARGIN = 2, convert_counts)
sms_test <- apply(freq_sms_dtm_test, MARGIN = 2, convert_counts)
#create Naive Bayes model with laplace = 0 (default)
sms_classifier <- naiveBayes(sms_train, sms_train_labels)
sms_test_predict <- predict(sms_classifier, sms_test)
OVERALL_ACCURACY <- confusionMatrix(sms_test_predict, sms_test_labels)$overall["Accuracy"]
Results_per_word_thresh = rbind(Results_per_word_thresh, data.frame(i, OVERALL_ACCURACY))
}
end_time <- Sys.time()
#time to complete
end_time - start_time
## Time difference of 2.112193 mins
#accuracy per minimum word-frequency threshold used in the model
freq_plot <- ggplot(Results_per_word_thresh, aes(i, OVERALL_ACCURACY)) +
  geom_line() +
  ylim(0.85, 1)
Below we can see that accuracy does not vary much between a minimum word-frequency threshold of 1 and a threshold of 10. The run time for this model is also quite slow.
end_time - start_time
freq_plot
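Before moving on, one parameter worth noting is the laplace argument of naiveBayes(), left at its default of 0 above; a minimal sketch (not run here) of refitting the last loop iteration's model with Laplace smoothing for comparison:
#hedged sketch: refit with Laplace smoothing and compare accuracy
#sms_classifier_lp <- naiveBayes(sms_train, sms_train_labels, laplace = 1)
#sms_test_predict_lp <- predict(sms_classifier_lp, sms_test)
#confusionMatrix(sms_test_predict_lp, sms_test_labels)$overall["Accuracy"]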
Next, explore XGBoost and see whether a boosted tree model performs better and runs more efficiently.