For this project, tidymodels was used to build an SMS classification system that flags text messages as ‘spam’ or ‘ham’. Naive Bayes, random forest, and XGBoost models were then run to compare classification results across models.
The dataset used for this project is located at: https://www.kaggle.com/datasets/uciml/sms-spam-collection-dataset. It consists of 5,572 SMS messages (rows) and five columns.
library(tidyverse)
library(tidymodels, quietly = TRUE)
library(tidytext, quietly = TRUE)
library(stopwords, quietly = TRUE)
library(janitor, quietly = TRUE)
library(wordcloud, quietly = TRUE)
library(tm, quietly = TRUE)
library(caret, quietly = TRUE)
library(textrecipes, quietly = TRUE)
library(discrim, quietly = TRUE)
library(naivebayes, quietly = TRUE)
spam_or_ham <- read.csv("https://raw.githubusercontent.com/greggmaloy/Data607_R/main/spam.csv")
glimpse(spam_or_ham)
## Rows: 5,572
## Columns: 5
## $ v1 <chr> "ham", "ham", "spam", "ham", "ham", "spam", "ham", "ham", "spam", …
## $ v2 <chr> "Go until jurong point, crazy.. Available only in bugis n great wo…
## $ X <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ X.1 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ X.2 <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", ""…
Of the 5,572 SMS messages, 4,825 (87%) were ham and 747 (13%) were
spam. Two of the five columns contained data. The columns that did not
contain data were deleted and the remaining columns renamed:
1.) ‘spam_or_ham’ - a flag which denotes whether an SMS message is spam
or ham
2.) ‘text’ - the full text of the SMS message
Spam vs ham breakdown in original dataset
df <- spam_or_ham
df <- df %>% rename(spam_or_ham = 1, text = 2) %>% select(1:2)
df <- df %>%
  mutate(flag = case_when(
    spam_or_ham == "spam" | spam_or_ham == "ham" ~ 1
  ))
spam_or_ham_agg <- count(df, spam_or_ham)
spam_or_ham_agg
Occurrence of Numbers in the Body of the SMS Messages
The occurrence of numbers is more common in spam messages; see Chart I
below. Of the 4,825 ham messages, 4,077 (84%) did not have a number
present in the message. Of the 747 spam messages, only 39 (5%) did not
have a number present in the message.
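As a quick sanity check on these percentages, the sketch below (assuming the ‘df’ built above) tallies the share of messages in each class that contain no digits:
#sanity check: share of messages without any digit, by class
df %>%
  mutate(has_number = str_detect(text, "\\d")) %>%
  group_by(spam_or_ham) %>%
  summarise(n_without_number = sum(!has_number),
            pct_without_number = round(100 * mean(!has_number)))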
spam_wc<-df%>%
filter(spam_or_ham=="spam")
ham_wc<-df%>%
filter(spam_or_ham=="ham")
#prep: message lengths; set the C locale to sidestep multibyte-string errors in nchar()/str_count() on this dataset
Sys.setlocale('LC_ALL', 'C')
## [1] "C"
sms_messages_spam_length <- spam_wc %>%
  mutate(num_characters = nchar(text),
         num_numbers = str_count(text, '\\d+'),
         mean = mean(num_characters),
         median = median(num_characters))
sms_messages_ham_length <- ham_wc %>%
  mutate(num_characters = nchar(text),
         num_numbers = str_count(text, '\\d+'),
         mean = mean(num_characters),
         median = median(num_characters))
length_spam_and_ham<-rbind(sms_messages_ham_length,sms_messages_spam_length)
ggplot(length_spam_and_ham, aes(x=num_numbers, fill=spam_or_ham)) +
geom_bar( alpha=.5, position="identity")+
labs(title="Chart I. Frequency of Numbers in SMS Messages",
x ="Frequency of Numbers", y = "Number of SMS Messages", fill="")
Table 1. Frequency of Numbers
Table_1 <- length_spam_and_ham %>%
  group_by(spam_or_ham, num_numbers) %>%
  summarise(freq = n(), .groups = 'drop') %>%
  pivot_wider(names_from = num_numbers, values_from = freq)
Table_1
Number of Characters in SMS messages
SMS message character length was also calculated; see Chart 2 below. A
majority of ham messages were under 100 characters, while a majority of
spam messages were over 100 characters.
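A sketch backing this claim, using the combined data frame created above: the median message length and the share of messages over 100 characters, by class.
#median length and share of messages over 100 characters, by class
length_spam_and_ham %>%
  group_by(spam_or_ham) %>%
  summarise(median_chars = median(num_characters),
            pct_over_100 = round(100 * mean(num_characters > 100)))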
ggplot(length_spam_and_ham, aes(x=num_characters, fill=spam_or_ham)) +
geom_bar( alpha=.5, position="identity")+
labs(title="Chart 2. Number of characters in SMS Messages",
x ="Number of Characters", y = "Number of SMS Messages", fill="")
Table 2. Number of characters
Table_2 <- length_spam_and_ham %>%
  group_by(spam_or_ham, num_characters) %>%
  summarise(freq = n(), .groups = 'drop') %>%
  pivot_wider(names_from = num_characters, values_from = freq)
Table_2
Initial Wordcloud and Frequency Table
Stop words from all available languages were removed from the dataset,
and word frequencies were calculated for both spam and ham messages.
Numbers and punctuation were allowed to remain in the data for the
moment, as I was interested in seeing how they impacted the wordclouds.
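For context, a one-liner (using the stopwords package loaded above) showing how many stop-word languages the snowball source provides:
#number of languages covered by the snowball stop-word source
length(stopwords_getlanguages(source = "snowball"))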
Wordclouds ham=left, spam=right
#prep for word clouds
stopword_langs <- stopwords_getlanguages(source = "snowball")
#stop words for all available languages
stops <- map_df(stopword_langs, function(x) get_stopwords(language = x))
#words from sms spam, remove stop words
suppressMessages(sms_words_spam <- spam_wc %>%
transmute(line = 1:nrow(spam_wc),
text = text) %>%
unnest_tokens(word, text) %>%
anti_join(stops) %>%
mutate(word = str_remove(word, "^\\w'")))
#Frequencies by grouping by word and group
tidy_spam <- sms_words_spam %>%
group_by(word) %>%
summarise(freq = n()) %>%
arrange(desc(freq))
# Get words from sms ham, remove stop words
suppressMessages(sms_words_ham <- ham_wc %>%
transmute(line = 1:nrow(ham_wc),
text = text) %>%
unnest_tokens(word, text) %>%
anti_join(stops) %>%
mutate(word = str_remove(word, "^\\w'")))
# Frequencies by grouping by word and group
tidy_ham <- sms_words_ham %>%
group_by(word) %>%
summarise(freq = n()) %>%
arrange(desc(freq))
par(mfrow=c(1,2))
wordcloud(tidy_ham$word, tidy_ham$freq, scale=c(2,0.5), max.words=200, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))
wordcloud(tidy_spam$word, tidy_spam$freq, scale=c(2,0.5), max.words=200, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))
Frequency Table
Below is a table listing the 50 most frequently occurring
words/characters in spam and ham messages respectively.
#table of the fifty most frequently used words per class
tidy_ham <- tidy_ham %>% mutate(ID = row_number()) %>% slice_min(ID, n = 50)
tidy_spam <- tidy_spam %>% mutate(ID = row_number()) %>% slice_min(ID, n = 50)
pre_freq <- cbind(tidy_ham, tidy_spam)
colnames(pre_freq)[1] = "Ham Words"
colnames(pre_freq)[2] = "Ham Word Frequency"
colnames(pre_freq)[3] = "Index Position"
colnames(pre_freq)[4] = "Spam Word"
colnames(pre_freq)[5] = "Spam Word Frequency"
pre_freq_final <- pre_freq %>% select(3, 1, 2, 4, 5)
pre_freq_final
The data was then tidied further via corpus creation, which included transforming characters to lowercase, removing punctuation, stemming, and removing stop words. Additional wordclouds were created for both spam and ham messages. Finally, a document-term matrix was created; the version inspected below has 5,572 documents (rows) and 6,884 terms (columns).
Word Clouds
spam=left, ham=right
sms_corpus = VCorpus(VectorSource(df$text))
corpus_clean = tm_map(sms_corpus, content_transformer(tolower)) #lowercase transformation
corpus_clean = tm_map(corpus_clean, removePunctuation) #remove punctuation
corpus_clean = tm_map(corpus_clean, stemDocument) #stem documents
corpus_clean = tm_map(corpus_clean, removeWords, stopwords("en")) #remove stop words (note: applied after stemming, so stemmed stop-word variants may survive)
#wordcloud prep
reverse_sms <- data.frame(text = sapply(corpus_clean, as.character), stringsAsFactors = FALSE, spam_or_ham = as.factor(df$spam_or_ham))
#spam wc
par(mfrow=c(1,2))
spam <- subset(reverse_sms, spam_or_ham == "spam")
wordcloud(spam$text, max.words =75,scale=c(3, .4), colors = brewer.pal(7, "Dark2"), random.order = FALSE)
#ham wc
ham <- subset(reverse_sms, spam_or_ham == "ham")
wordcloud(ham$text , max.words =75,scale=c(2, .2), colors = brewer.pal(7, "Dark2"), random.order = FALSE)
Below is a sample of the document-term matrix
#dtm: built from the raw corpus with cleaning handled via control options, so numbers are removed as well
suppressWarnings(
  sms_dtm <- DocumentTermMatrix(sms_corpus, control = list(
    tolower = TRUE,
    removeNumbers = TRUE,
    removePunctuation = TRUE,
    stemming = TRUE,
    stopwords = TRUE)))
tm::inspect(sms_dtm)
## <<DocumentTermMatrix (documents: 5572, terms: 6884)>>
## Non-/sparse entries: 43882/38313766
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs call can come dont free get just ltgt now will
## 1085 0 0 1 1 0 1 0 0 0 9
## 1579 0 0 0 0 0 0 0 18 0 0
## 1863 0 0 0 3 0 0 0 0 0 0
## 2158 0 0 0 0 0 0 0 0 0 0
## 2370 0 0 0 0 1 0 0 0 0 0
## 2380 0 1 0 0 0 0 0 1 0 0
## 2434 0 3 0 0 1 1 0 6 0 0
## 2848 0 0 0 0 0 0 0 0 0 0
## 3016 0 0 0 0 0 0 0 2 0 0
## 5105 0 0 0 0 1 0 0 0 0 0
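One way to probe the matrix beyond this sample is tm’s findFreqTerms(); a sketch listing every term appearing at least 200 times across the corpus:
#terms appearing at least 200 times in the document-term matrix
findFreqTerms(sms_dtm, lowfreq = 200)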
Next, the data was separated into training and testing partitions. By default, rsample’s initial_split() allocates 75% of the data to training and 25% to testing. The larger training share gives the model more data from which to learn patterns.
Initial split numbers
set.seed(1234) #set seed so the random split is reproducible
split <-initial_split(reverse_sms, strata=spam_or_ham)
split
## <Training/Testing/Total>
## <4178/1394/5572>
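The ratio can be changed through the prop argument; a sketch of an 80/20 split (not used below), still stratified so both partitions keep the same spam/ham mix:
#an 80/20 split instead of the 75/25 default (illustrative only)
set.seed(2024)
split_80 <- initial_split(reverse_sms, prop = 0.80, strata = spam_or_ham)
split_80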
Spam/ham break down in training partition
sms_train <- training(split)
spam_or_ham_agg_sms_train<-count(sms_train, spam_or_ham)
spam_or_ham_agg_sms_train
Spam/ham break down in testing partition
sms_test <- testing(split)
spam_or_ham_agg_sms_test<-count(sms_test, spam_or_ham)
spam_or_ham_agg_sms_test
tidymodels was used to construct and fit a Naive Bayes model. Both the training and testing partitions were fit in this step as a way of validating that each partition accurately represented the distribution of spam/ham messages. After rounding, the a priori probabilities in both partitions were equal: 87% ham and 13% spam.
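The same balance can be checked directly from the partitions; the proportions below should mirror the a priori probabilities printed by the fitted models:
#class balance in each partition (should match the model priors below)
bind_rows(train = count(sms_train, spam_or_ham),
          test = count(sms_test, spam_or_ham),
          .id = "partition") %>%
  group_by(partition) %>%
  mutate(prop = round(n / sum(n), 2))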
sms_recipe <- recipe(spam_or_ham ~ text, data = sms_train)
sms_recipe <- sms_recipe %>%
  step_tokenize(text) %>%
  step_tokenfilter(text, max_tokens = 10) %>% #keep only the 10 most frequent tokens
  step_tfidf(text)
sms_wf <-workflow() %>%
add_recipe(sms_recipe)
bayes<- naive_Bayes() %>%
set_mode("classification") %>%
set_engine("naivebayes")
bayes_fit <- sms_wf %>%
add_model(bayes) %>%
fit(data = sms_train)
bayes_fit_test <- sms_wf %>%
add_model(bayes) %>%
fit(data = sms_test)
Training Partition
bayes_fit
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: naive_Bayes()
##
## -- Preprocessor ----------------------------------------------------------------
## 3 Recipe Steps
##
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
##
## -- Model -----------------------------------------------------------------------
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## ham spam
## 0.8659646 0.1340354
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: tfidf_text_2::ham (KDE)
## ---------------------------------------------------------------------------------
##
## Call:
## density.default(x = x, na.rm = TRUE)
##
## Data: x (3618 obs.); Bandwidth 'bw' = 0.06845
##
## x y
## Min. :-0.2054 Min. :0.000009
## 1st Qu.: 0.5716 1st Qu.:0.005029
## Median : 1.3485 Median :0.015997
## Mean : 1.3485 Mean :0.321103
## 3rd Qu.: 2.1254 3rd Qu.:0.046965
## Max. : 2.9023 Max. :5.507688
##
## ---------------------------------------------------------------------------------
## ::: tfidf_text_2::spam (KDE)
## ---------------------------------------------------------------------------------
##
## Call:
## density.default(x = x, na.rm = TRUE)
##
## Data: x (560 obs.); Bandwidth 'bw' = 0.1607
##
## x y
## Min. :-0.4822 Min. :0.0009993
## 1st Qu.: 0.4331 1st Qu.:0.0402910
##
## ...
## and 147 more lines.
Test Partition
bayes_fit_test
## == Workflow [trained] ==========================================================
## Preprocessor: Recipe
## Model: naive_Bayes()
##
## -- Preprocessor ----------------------------------------------------------------
## 3 Recipe Steps
##
## * step_tokenize()
## * step_tokenfilter()
## * step_tfidf()
##
## -- Model -----------------------------------------------------------------------
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.default(x = maybe_data_frame(x), y = y, usekernel = TRUE)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## ham spam
## 0.8658537 0.1341463
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: tfidf_text_2::ham (KDE)
## ---------------------------------------------------------------------------------
##
## Call:
## density.default(x = x, na.rm = TRUE)
##
## Data: x (1207 obs.); Bandwidth 'bw' = 0.0874
##
## x y
## Min. :-0.2622 Min. :0.000000
## 1st Qu.: 0.5722 1st Qu.:0.004969
## Median : 1.4066 Median :0.014409
## Mean : 1.4066 Mean :0.298974
## 3rd Qu.: 2.2410 3rd Qu.:0.051355
## Max. : 3.0754 Max. :4.321754
##
## ---------------------------------------------------------------------------------
## ::: tfidf_text_2::spam (KDE)
## ---------------------------------------------------------------------------------
##
## Call:
## density.default(x = x, na.rm = TRUE)
##
## Data: x (187 obs.); Bandwidth 'bw' = 0.1672
##
## x y
## Min. :-0.5015 Min. :0.0005741
## 1st Qu.: 0.4526 1st Qu.:0.0234266
##
## ...
## and 147 more lines.
To further evaluate the model, the training partition was resampled into ten folds and the model was refit to each fold to determine how well it classified spam versus ham SMS messages. ROC AUC (82%) and accuracy (89%) were derived and plotted from the resampled data; both are reasonably close to 1, suggesting an acceptable model fit. A confusion matrix was also plotted to allow for inspection of the predicted values.
ROC and Accuracy Table
set.seed(1245)
sms_folds <- vfold_cv(sms_train) #10 folds by default (v = 10)
bayes_wf <- workflow() %>%
add_recipe(sms_recipe) %>%
add_model(bayes)
bayes_rs <- fit_resamples(bayes_wf, sms_folds,
control = control_resamples(save_pred = TRUE))
bayes_rs_metrics <- collect_metrics(bayes_rs)
bayes_rs_predictions <- collect_predictions(bayes_rs)
bayes_rs_metrics
bayes_rs_predictions %>%
  group_by(id) %>%
  roc_curve(truth = spam_or_ham, .pred_ham) %>% #"ham" is the first factor level, so .pred_ham is the event-class probability
  autoplot() +
  labs(
    color = NULL,
    title = "ROC",
    subtitle = "Each resample shown in a different color"
  )
Confusion Matrix
conf_mat_resampled(bayes_rs , tidy = FALSE) %>%
autoplot(type = "heatmap")
bayes_final<-
bayes_wf %>%
last_fit(split) %>%
collect_predictions() %>%
conf_mat(truth = spam_or_ham, estimate = .pred_class)
bayes_final
## Truth
## Prediction ham spam
## ham 1198 132
## spam 9 55
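Because the classes are imbalanced, raw accuracy can flatter a classifier; one way to unpack the confusion matrix above is yardstick’s summary() method, which expands a conf_mat into accuracy, sensitivity, specificity, and related per-class metrics (the same call works for the random forest and XGBoost matrices below):
#per-class metrics derived from the test-set confusion matrix
summary(bayes_final)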
Next, a random forest was run to see how it compared to Naive Bayes. The random forest was first scored on the training data itself (ROC AUC = 99.6%, accuracy = 97.6%) and subsequently on resampled training data, as was done with Naive Bayes. The resampled random forest results indicated a better fit than the Naive Bayes model by ROC AUC, even though its accuracy was lower (Random Forest ROC AUC = 90.7%, accuracy = 84.5%; Bayes ROC AUC = 82%, accuracy = 89%); with classes this imbalanced, accuracy alone can be misleading. The resampled estimates are also more trustworthy than the non-resampled results, which are optimistically biased because the model is scored on the same data it was fit to.
Non-resampled ROC
rf_mod<- # random forest
rand_forest() %>%
set_engine("ranger") %>%
set_mode("classification")
rf_fit<-rf_mod %>%
fit(spam_or_ham ~ ., data = sms_train)
rf_training_pred <-
predict(rf_fit, sms_train) %>%
bind_cols(predict(rf_fit, sms_train, type = "prob")) %>%
# Add the true outcome data back in
bind_cols(sms_train %>%
select(spam_or_ham))
rf_training_pred %>% # training set predictions
roc_auc(truth = spam_or_ham, .pred_ham)
Non-resampled Accuracy
rf_training_pred %>% # training set predictions
accuracy(truth = spam_or_ham, .pred_class)
Resampled ROC & Accuracy
#resample
rf_cv_folds <- vfold_cv(sms_train, strata = spam_or_ham) #10 folds by default
#workflow
rf_wf<-
workflow() %>%
add_recipe(sms_recipe) %>%
add_model(rf_mod)
#fit
rf_fit_rs<-
fit_resamples(
rf_wf,
rf_cv_folds,
metrics = metric_set(roc_auc, accuracy),
control = control_resamples(save_pred = TRUE)
)
collect_metrics(rf_fit_rs)
# resampling model eval
rf_rs_predictions <- collect_predictions(rf_fit_rs)
rf_rs_predictions %>%
  group_by(id) %>%
  roc_curve(truth = spam_or_ham, .pred_ham) %>% #"ham" is the first factor level, so .pred_ham is the event-class probability
  autoplot() +
  labs(
    color = NULL,
    title = "ROC",
    subtitle = "Each resample shown in a different color"
  )
Confusion Matrix
conf_mat_resampled(rf_fit_rs , tidy = FALSE) %>%
autoplot(type = "heatmap")
rf_final<-
rf_wf %>%
last_fit(split) %>%
collect_predictions() %>%
conf_mat(truth = spam_or_ham, estimate = .pred_class)
rf_final
## Truth
## Prediction ham spam
## ham 1184 106
## spam 23 81
Next, an XGBoost model was run to determine whether it could better classify spam/ham messages. Again, the training data was resampled into ten separate folds. The XGBoost resample results were very similar to the random forest resample results (XGBoost ROC AUC = 90.8%, accuracy = 84.3%; Random Forest ROC AUC = 90.7%, accuracy = 84.5%; Bayes ROC AUC = 82%, accuracy = 89%).
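The model below uses boost_tree() defaults; as a sketch, the key hyperparameters could instead be exposed for tuning with the tune package (not done in this report):
#a tunable XGBoost spec (illustrative only; the report uses defaults)
boost_tree(trees = 500, tree_depth = tune(), learn_rate = tune()) %>%
  set_engine("xgboost") %>%
  set_mode("classification")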
Resampled ROC & Accuracy
xgb_mod<-
boost_tree() %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_cv_folds <- vfold_cv(sms_train, strata = spam_or_ham) #10 folds by default
xgb_wf<-
workflow() %>%
add_recipe(sms_recipe) %>%
add_model(xgb_mod)
xgb_fit<-
fit_resamples(
xgb_wf,
xgb_cv_folds,
metrics = metric_set(roc_auc, accuracy),
control = control_resamples(save_pred = TRUE)
)
collect_metrics(xgb_fit)
# resampling model eval
xgb_rs_predictions <- collect_predictions(xgb_fit)
xgb_rs_predictions %>%
  group_by(id) %>%
  roc_curve(truth = spam_or_ham, .pred_ham) %>% #"ham" is the first factor level, so .pred_ham is the event-class probability
  autoplot() +
  labs(
    color = NULL,
    title = "ROC",
    subtitle = "Each resample shown in a different color"
  )
Resampled Confusion Matrix
conf_mat_resampled(xgb_fit , tidy = FALSE) %>%
autoplot(type = "heatmap")
xgb_final<-
xgb_wf %>%
last_fit(split) %>%
collect_predictions() %>%
conf_mat(truth = spam_or_ham, estimate = .pred_class)
xgb_final
## Truth
## Prediction ham spam
## ham 1182 105
## spam 25 82
The random forest and XGBoost classifiers performed slightly better than the Naive Bayes model by ROC AUC, although Naive Bayes posted the highest raw accuracy (XGBoost ROC AUC = 90.8%, accuracy = 84.3%; Random Forest ROC AUC = 90.7%, accuracy = 84.5%; Bayes ROC AUC = 82%, accuracy = 89%). Given the roughly 87/13 class imbalance, ROC AUC is the more informative of the two metrics here.
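For a side-by-side comparison, a sketch collecting the resampled metrics from all three workflows into a single table:
#resampled metrics from all three models in one table
bind_rows(
  `Naive Bayes` = collect_metrics(bayes_rs),
  `Random Forest` = collect_metrics(rf_fit_rs),
  XGBoost = collect_metrics(xgb_fit),
  .id = "model"
) %>%
  select(model, .metric, mean, std_err)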