This is a project to build a spam filter for SMS messages using the multinomial Naive Bayes algorithm.
library(readr)
library(stringr)
library(ggplot2)
library(dplyr)
library(purrr)
library(tidyverse)
Import the data as a data frame and rename the columns.
setwd("C:/Users/Ana/Desktop/Data Analytics/CSV Files")
data <- read_tsv("SMSSpamCollection.tsv", col_names = FALSE)
Parsed with column specification:
cols(
  X1 = col_character(),
  X2 = col_character()
)
39 parsing failures.
 row col           expected actual file
 283  X2 delimiter or quote      H 'SMSSpamCollection.tsv'
 454  X2 delimiter or quote      Y 'SMSSpamCollection.tsv'
 ... ... .................. ...... .......................
See problems(...) for more details.
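The failures all complain about a delimiter or quote, which suggests stray quote characters inside the message text. One possible fix (an assumption, not verified against this file) is to inspect the failing rows with problems() and re-read with quote handling disabled:
problems(data)                 # inspect the 39 failing rows
data <- read_tsv("SMSSpamCollection.tsv", col_names = FALSE, quote = "")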
colnames(data) <- c("label", "sms")
head(data, 100)
Explore the dataset.
nrow(data)
[1] 4837
ncol(data)
[1] 2
data %>% group_by(label) %>% summarise(Freq = n(), percent = n()/nrow(data)*100)
The dataset contains 4837 messages: 87% of them are ham and 13% are spam.
Need to create:
- a training set containing 80% of all the messages (at random)
- a cross-validation set containing 10% of all the messages (at random)
- a test set containing 10% of all the messages (at random)
0.8*nrow(data)
[1] 3869.6
0.1*nrow(data)
[1] 483.7
3870+483+484
[1] 4837
The training set will contain 3870 messages, the cross-validation set 483 messages, and the test set 484 messages.
set.seed(2)
random_set_of_numbers <- sample(1:4837, 4837)   # a random permutation of all row indices
training_index <- random_set_of_numbers[1:3870]
x_valid_index <- random_set_of_numbers[3871:4353]
test_index <- random_set_of_numbers[4354:4837]
training_data <- data[training_index,]
x_valid_data <- data[x_valid_index,]
test_data <- data[test_index,]
training_data %>% group_by(label) %>% summarise(Freq = n(), percent = n()/nrow(training_data)*100)
x_valid_data %>% group_by(label) %>% summarise(Freq = n(), percent = n()/nrow(x_valid_data)*100)
test_data %>% group_by(label) %>% summarise(Freq = n(), percent = n()/nrow(test_data)*100)
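As a quick sanity check, the three index vectors should partition the rows exactly, since they slice a single random permutation:
# TRUE if the three index sets are disjoint and together cover every row once
length(unique(c(training_index, x_valid_index, test_index))) == nrow(data)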
Clean the sms column by removing all punctuation and digits and converting all letters to lowercase.
head(training_data, 20)
training_data_2 <- training_data %>%
  mutate(sms = str_replace_all(sms, "[:punct:]", "")) %>%
  mutate(sms = str_replace_all(sms, "[:digit:]", "")) %>%
  mutate(sms = tolower(sms))
head(training_data_2, 20)
Create a vocabulary of all the unique words in the training set
all_vocab <- c()
for(i in 1:nrow(training_data_2)){
  indv_words <- unlist(str_split(training_data_2[i,2], "\\s+"))
  all_vocab <- c(all_vocab, indv_words)
}
vocab <- unique(all_vocab)
head(vocab, 100)
[1] "sunshine" "quiz" "win" "a" "super" "sony" "dvd" "recorder"
[9] "if" "you" "canname" "the" "capital" "of" "australia" "text"
[17] "mquiz" "to" "b" "asked" "mobile" "chatlines" "inclu" "in"
[25] "free" "mins" "india" "cust" "servs" "sed" "yes" "ler"
[33] "got" "mega" "bill" "dont" "giv" "shit" "bailiff" "due"
[41] "days" "i" "o" "£" "want" "meeting" "da" "will"
[49] "call" "havent" "eaten" "all" "day" "im" "sitting" "here"
[57] "staring" "at" "this" "juicy" "pizza" "and" "cant" "eat"
[65] "it" "these" "meds" "are" "ruining" "my" "life" ""
[73] "nightswe" "nt" "staying" "port" "step" "liaotoo" "ex" "night"
[81] "has" "ended" "for" "another" "morning" "come" "special" "way"
[89] "may" "smile" "like" "sunny" "rays" "leaves" "your" "worries"
[97] "blue" "bay" "thank" "princess"
Now, calculate the other parameters, which are constant for all calculations:
- P(Spam) and P(Ham), the prior probabilities of each class
- N_spam: the number of words in all the spam messages (not the number of spam messages, nor the number of unique words)
- N_ham: the number of words in all the ham messages (not the number of ham messages, nor the number of unique words)
- N_vocabulary: the number of unique words in the training data set
p_spam <- 506/(506+3364)   # 506 of the 3870 training messages are spam
p_ham <- 1-p_spam
n_vocab <- length(vocab)
p_spam
[1] 0.1307494
p_ham
[1] 0.8692506
n_vocab
[1] 7660
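Hardcoding the class counts is fragile if the seed or split changes; the priors can equally be computed from the training labels directly (an equivalent sketch):
# proportion of training messages labelled spam / ham
p_spam <- mean(training_data_2$label == "spam")
p_ham <- mean(training_data_2$label == "ham")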
Next, calculate n_spam and n_ham. To do this, create a vector containing all the words in all the spam messages; n_spam is the length of this vector. Then create a vector containing all the words in all the ham messages; n_ham is its length. First, split the data into two separate data frames, one for spam messages and one for ham messages.
spam_training <- training_data_2 %>%
filter(label == "spam")
ham_training <- training_data_2 %>%
filter(label == "ham")
alpha <- 1   # Laplace smoothing parameter
Then create the vector containing all the words in the spam messages, and do the same for the ham messages.
all_spam_words <- c()
for(i in 1:nrow(spam_training)) {
  split <- unlist(strsplit(spam_training$sms[i], "\\s+"))
  all_spam_words <- c(all_spam_words, split)
}
head(all_spam_words, 100)
[1] "sunshine" "quiz" "win" "a" "super" "sony" "dvd" "recorder"
[9] "if" "you" "canname" "the" "capital" "of" "australia" "text"
[17] "mquiz" "to" "b" "asked" "mobile" "if" "chatlines" "inclu"
[25] "in" "free" "mins" "india" "cust" "servs" "sed" "yes"
[33] "ler" "got" "mega" "bill" "dont" "giv" "a" "shit"
[41] "bailiff" "due" "in" "days" "i" "o" "£" "want"
[49] "£" "urgent" "your" "mobile" "was" "awarded" "a" "£"
[57] "bonus" "caller" "prize" "on" "our" "final" "attempt" "contact"
[65] "u" "call" "you" "are" "guaranteed" "the" "latest" "nokia"
[73] "phone" "a" "gb" "ipod" "mp" "player" "or" "a"
[81] "£" "prize" "txt" "word" "collect" "to" "no" "ibhltd"
[89] "ldnwh" "pmtmsgrcvd" "enjoy" "the" "jamster" "videosound" "gold" "club"
[97] "with" "your" "credits" "for"
n_spam <- length(all_spam_words)
n_spam
[1] 11045
all_ham_words <- c()
for(i in 1:nrow(ham_training)) {
  split <- unlist(strsplit(ham_training$sms[i], "\\s+"))
  all_ham_words <- c(all_ham_words, split)
}
head(all_ham_words, 100)
[1] "in" "meeting" "da" "i" "will" "call" "you" "havent"
[9] "eaten" "all" "day" "im" "sitting" "here" "staring" "at"
[17] "this" "juicy" "pizza" "and" "i" "cant" "eat" "it"
[25] "these" "meds" "are" "ruining" "my" "life" "" "nightswe"
[33] "nt" "staying" "at" "port" "step" "liaotoo" "ex" "night"
[41] "has" "ended" "for" "another" "day" "morning" "has" "come"
[49] "in" "a" "special" "way" "may" "you" "smile" "like"
[57] "the" "sunny" "rays" "and" "leaves" "your" "worries" "at"
[65] "the" "blue" "blue" "bay" "thank" "you" "princess" "you"
[73] "are" "so" "sexy" "mm" "you" "ask" "him" "to"
[81] "come" "its" "enough" "oh" "mr" "sheffield" "you" "wanna"
[89] "play" "that" "game" "okay" "youre" "the" "boss" "and"
[97] "im" "the" "nanny" "you"
n_ham <- length(all_ham_words)
n_ham
[1] 55740
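As with the vocabulary, both loops can be replaced by a single vectorized split per class (a sketch producing the same vectors):
all_spam_words <- unlist(str_split(spam_training$sms, "\\s+"))
all_ham_words <- unlist(str_split(ham_training$sms, "\\s+"))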
Next, create a function which, mapped over vocab (the vector containing all unique vocabulary), outputs a list pairing each word with its probability of occurring within the spam messages. The function first calculates how many times the word appears in the spam messages (n_occurances_in_spam), then uses this, together with the other parameters calculated above, to compute P(word|spam): the probability of a particular word occurring, given that a message is spam.
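In symbols, this is Laplace (additive) smoothing:

P(word|spam) = (n_occurances_in_spam + alpha) / (N_spam + alpha * N_vocabulary)

The alpha in the numerator and denominator keeps P(word|spam) from being exactly zero for vocabulary words that never appear in a spam message.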
find_word_in_spam <- function(word, alpha){
  logic_v <- all_spam_words == word       # TRUE wherever the word occurs in spam
  n_occurances_in_spam <- sum(logic_v)    # total occurrences in spam messages
  # Laplace-smoothed conditional probability P(word|spam)
  p_word_gvn_spam <- (n_occurances_in_spam+alpha)/(n_spam + alpha*n_vocab)
  return(p_word_gvn_spam)
}
p_list_spam <- map2(vocab, alpha, find_word_in_spam)
names(p_list_spam) <- vocab
head(p_list_spam, 10)
$sunshine
[1] 0.0004811548
$quiz
[1] 0.0006415397
$win
[1] 0.002566159
$a
[1] 0.01475541
$super
[1] 0.0001603849
$sony
[1] 0.0003742315
$dvd
[1] 0.0003207698
$recorder
[1] 0.0001069233
$`if`
[1] 0.001283079
$you
[1] 0.01079925
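One performance note: find_word_in_spam rescans the full all_spam_words vector once per vocabulary word, which costs O(|vocab| x n_spam) comparisons. A sketch of a faster equivalent (the _fast name is illustrative) counts every word once with table() and then looks counts up by name:
spam_counts <- table(all_spam_words)   # word -> count, computed once
p_list_spam_fast <- map(vocab, function(word) {
  n_occ <- if (word %in% names(spam_counts)) spam_counts[[word]] else 0
  (n_occ + alpha) / (n_spam + alpha * n_vocab)
})
names(p_list_spam_fast) <- vocab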
Do the same as above, but for ham messages: create a function which, mapped over vocab, outputs a list pairing each word with its probability of occurring within the ham messages. The function first calculates how many times the word appears in the ham messages (n_occurances_in_ham), then uses this, together with the other parameters calculated above, to compute P(word|ham): the probability of a particular word occurring, given that a message is ham.
find_word_in_ham <- function(word, alpha){
  logic_v <- all_ham_words == word        # TRUE wherever the word occurs in ham
  n_occurances_in_ham <- sum(logic_v)     # total occurrences in ham messages
  # Laplace-smoothed conditional probability P(word|ham)
  p_word_gvn_ham <- (n_occurances_in_ham+alpha)/(n_ham + alpha*n_vocab)
  return(p_word_gvn_ham)
}
p_list_ham <- map2(vocab, alpha, find_word_in_ham)
names(p_list_ham) <- vocab
head(p_list_ham, 10)
$sunshine
[1] 1.577287e-05
$quiz
[1] 1.577287e-05
$win
[1] 0.0002208202
$a
[1] 0.01364353
$super
[1] 6.309148e-05
$sony
[1] 3.154574e-05
$dvd
[1] 3.154574e-05
$recorder
[1] 1.577287e-05
$`if`
[1] 0.00455836
$you
[1] 0.02334385
Next, I do some testing here to see whether it's working…
test_message <- "hello im running a test sentance to check that this is working"
test_message
[1] "hello im running a test sentance to check that this is working"
vec_test_words <- unlist(strsplit(test_message, "\\s+"))
vec_test_words
[1] "hello" "im" "running" "a" "test" "sentance" "to" "check" "that"
[10] "this" "is" "working"
a <- unlist(p_list_ham[c(vec_test_words)])
a
hello im running a test to check that
0.0006388702 0.0067975082 0.0000902559 0.0152921814 0.0003203200 0.0232559374 0.0004265034 0.0069921778
this is working
0.0035589141 0.0101422858 0.0003911089
p_message_ham <- p_ham*prod(a)
p_message_ham
[1] 1.634016e-30
b <- unlist(p_list_spam[c(vec_test_words)])
b
hello im running a test to check that
2.624672e-04 6.858014e-04 8.466684e-06 2.329185e-02 8.466684e-06 4.047921e-02 9.313352e-05 1.701803e-03
this is working
5.427144e-03 9.237152e-03 8.466684e-06
p_message_spam <- p_spam*prod(b)
p_message_spam
[1] 1.070074e-37
if_else(p_message_ham>=p_message_spam, "ham", "spam")
[1] "ham"
That looks reasonable. Note that the misspelled "sentance" is not in the vocabulary: subsetting the probability lists returns NULL for it, and unlist() silently drops it, so only 11 of the 12 words contribute to each product.
Next, write a function which takes in a message and returns whether it deems it to be spam or ham.
classification_function <- function(message) {
  # apply the same cleaning steps as were applied to the training data
  cleaned_1 <- str_replace_all(message, "[:punct:]", "")
  cleaned_2 <- str_replace_all(cleaned_1, "[:digit:]", "")
  cleaned_3 <- tolower(cleaned_2)
  split_message <- unlist(strsplit(cleaned_3, "\\s+"))
  # P(ham)*prod(P(word|ham)); words missing from the vocabulary are dropped
  a <- unlist(p_list_ham[split_message])
  p_message_ham <- p_ham*prod(a)
  b <- unlist(p_list_spam[split_message])
  p_message_spam <- p_spam*prod(b)
  if_else(p_message_ham>=p_message_spam, "ham", "spam")
}
classification_function(test_message)
[1] "ham"
filter_output <- unlist(map(training_data$sms, classification_function))
comparison <- cbind(training_data, filter_output)
head(comparison)
sum(comparison$label == comparison$filter_output)/nrow(comparison)
[1] 0.9416021
The filter was 94% accurate! Note, though, that this is measured on the same training data used to estimate the probabilities; the cross-validation and test sets give a fairer measure.
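Accuracy alone hides which kind of error dominates. A confusion matrix on the same comparison data frame shows how many spam messages slip through versus how many ham messages are wrongly flagged:
table(actual = comparison$label, predicted = comparison$filter_output)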
Now assess the smoothing parameter. So far alpha has been set to 1; next, see how the accuracy on the cross-validation set varies with the value of alpha.
alpha_range <- seq(0.1, 1, by = 0.1)
for(alpha in alpha_range) {
  p_list_spam <- map2(vocab, alpha, find_word_in_spam)
  names(p_list_spam) <- vocab
  p_list_ham <- map2(vocab, alpha, find_word_in_ham)
  names(p_list_ham) <- vocab
  filter_output <- unlist(map(x_valid_data$sms, classification_function))
  comparison <- cbind(x_valid_data$label, filter_output)
  accuracy <- sum(comparison[,1] == comparison[,2])/nrow(comparison)
  cat("The accuracy of the spam filter with alpha =", alpha, "is", (accuracy*100), "%", "\n")
}
The accuracy of the spam filter with alpha = 0.1 is 95.85921 %
The accuracy of the spam filter with alpha = 0.2 is 95.03106 %
The accuracy of the spam filter with alpha = 0.3 is 95.2381 %
The accuracy of the spam filter with alpha = 0.4 is 95.03106 %
The accuracy of the spam filter with alpha = 0.5 is 95.03106 %
The accuracy of the spam filter with alpha = 0.6 is 94.82402 %
The accuracy of the spam filter with alpha = 0.7 is 94.61698 %
The accuracy of the spam filter with alpha = 0.8 is 94.61698 %
The accuracy of the spam filter with alpha = 0.9 is 94.61698 %
The accuracy of the spam filter with alpha = 1 is 94.40994 %
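Since ggplot2 is already loaded, the sweep can also be visualized by collecting the accuracies instead of only printing them (a sketch repeating the loop above):
accuracies <- c()
for(alpha in alpha_range) {
  p_list_spam <- map2(vocab, alpha, find_word_in_spam)
  names(p_list_spam) <- vocab
  p_list_ham <- map2(vocab, alpha, find_word_in_ham)
  names(p_list_ham) <- vocab
  filter_output <- unlist(map(x_valid_data$sms, classification_function))
  accuracies <- c(accuracies, mean(x_valid_data$label == filter_output))
}
ggplot(data.frame(alpha = alpha_range, accuracy = accuracies),
       aes(x = alpha, y = accuracy)) +
  geom_line() + geom_point()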
From this, it seems that lower alpha values produce a more accurate filter. Therefore, alpha = 0.1 will be used for the test set.
alpha <- 0.1
p_list_spam <- map2(vocab, alpha, find_word_in_spam)
names(p_list_spam) <- vocab
p_list_ham <- map2(vocab, alpha, find_word_in_ham)
names(p_list_ham) <- vocab
filter_output <- unlist(map(test_data$sms, classification_function))
comparison <- cbind(test_data, filter_output)
accuracy <- sum(comparison$label == comparison$filter_output)/nrow(comparison)
nrow(comparison)
[1] 484
print(accuracy)
[1] 0.9504132
cat("With alpha =", alpha, "the spam filter correctly predicted", (accuracy*100), "%", "of all messages as being spam or ham", "\n")
With alpha = 0.1 the spam filter correctly predicted 95.04132 % of all messages as being spam or ham
head(comparison)
The algorithm correctly classified 95% of the messages in the test set as either spam or ham.