Data: SMS Spam Collection Data Set from the UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/sms+spam+collection)
# Load the SMS data set (columns: type, Text) and inspect its structure.
# The data directory is machine-specific. It is passed to file.path() rather
# than setwd() so the session's working directory is left untouched.
data_dir <- "C:/Users/Owner/Desktop/MachineLearningR_sampleData"
# stringsAsFactors = TRUE keeps both columns as factors on R >= 4.0, where the
# read.csv() default changed to FALSE; on older versions it is a no-op.
text <- read.csv(file.path(data_dir, "sms.csv"), header = TRUE,
                 stringsAsFactors = TRUE)
str(text)
## 'data.frame': 5559 obs. of 2 variables:
## $ type: Factor w/ 2 levels "ham","spam": 1 1 1 2 2 1 1 1 2 1 ...
## $ Text: Factor w/ 5156 levels "'An Amazing Quote'' - Sometimes in life its difficult to decide whats wrong!! a lie that brings a smile or the "| __truncated__,..: 1651 2557 257 626 3308 190 357 3392 2726 1079 ...
# Per-column summary: class counts for `type`, most common messages for `Text`.
summary(object = text)
## type
## ham :4812
## spam: 747
##
##
##
##
##
## Text
## Sorry, I'll call later : 30
## I cant pick the phone right now. Pls send a message : 12
## Ok... : 10
## 7 wonders in My WORLD 7th You 6th Ur style 5th Ur smile 4th Ur Personality 3rd Ur Nature 2nd Ur SMS and 1st Ur Lovely Friendship... good morning dear: 4
## Ok : 4
## Ok. : 4
## (Other) :5495
# Preview the first ten rows.
head(x = text, n = 10)
## type
## 1 ham
## 2 ham
## 3 ham
## 4 spam
## 5 spam
## 6 ham
## 7 ham
## 8 ham
## 9 spam
## 10 ham
## Text
## 1 Hope you are having a good week. Just checking in
## 2 K..give back my thanks.
## 3 Am also doing in cbe only. But have to pay.
## 4 complimentary 4 STAR Ibiza Holiday or £10,000 cash needs your URGENT collection. 09066364349 NOW from Landline not to lose out! Box434SK38WP150PPM18+
## 5 okmail: Dear Dave this is your final notice to collect your 4* Tenerife Holiday or #5000 CASH award! Call 09061743806 from landline. TCs SAE Box326 CW25WX 150ppm
## 6 Aiya we discuss later lar... Pick u up at 4 is it?
## 7 Are you this much buzy
## 8 Please ask mummy to call father
## 9 Marvel Mobile Play the official Ultimate Spider-man game (£4.50) on ur mobile right now. Text SPIDER to 83338 for the game & we ll send u a FREE 8Ball wallpaper
## 10 fyi I'm at usf now, swing by the room whenever
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.4.2
# Bar chart of the number of messages in each class.
ggplot(data = text, mapping = aes(x = type)) +
  geom_bar() +
  labs(
    title = "Classification of sms messages",
    y = "Number of sms"
  ) +
  theme_bw() +
  # Must come after theme_bw() so the larger font size is not overridden.
  theme(text = element_text(size = 20))
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.2
## Loading required package: RColorBrewer
# Word cloud over all messages, restricted to words that occur at least
# 50 times; random.order = FALSE plots words in decreasing frequency.
wordcloud(
  words = text$Text,
  min.freq = 50,
  random.order = FALSE,
  colors = brewer.pal(n = 8, name = "Dark2")
)
## Loading required package: tm
## Warning: package 'tm' was built under R version 3.4.2
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Split the messages by class, then draw one word cloud per class, each
# limited to 50 words.
# NOTE(review): `main` is forwarded through wordcloud()'s `...`; confirm that
# it actually produces a plot title.
spam <- subset(text, subset = type == "spam")
ham <- subset(text, subset = type != "spam")
wordcloud(
  words = spam$Text,
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(n = 8, name = "Dark2"),
  main = "spam"
)
wordcloud(
  words = ham$Text,
  max.words = 50,
  random.order = FALSE,
  colors = brewer.pal(n = 8, name = "Dark2"),
  main = "ham"
)
Create a corpus from the text documents (the “Text” column of each row). This generates 5559 text documents.
# Build a corpus with one document per SMS message.
# The calls are namespace-qualified because this script never attaches tm
# itself; the transcript shows it being loaded only as a side effect of the
# earlier wordcloud() call.
text_corpus <- tm::VCorpus(tm::VectorSource(text$Text))
Data cleaning (conversion to lowercase, removal of numbers, stop words, punctuation and whitespace, and stemming) and generation of a DocumentTermMatrix containing the frequency of each word in each document.
# Clean every document and tabulate term frequencies in a single step.
# tm:: is spelled out so the call does not depend on tm having been attached
# by an earlier statement.
text_dtm <- tm::DocumentTermMatrix(
  text_corpus,
  control = list(
    tolower = TRUE,           # fold everything to lower case
    removeNumbers = TRUE,     # drop digits
    stopwords = TRUE,         # drop stop words
    removePunctuation = TRUE, # drop punctuation
    stemming = TRUE           # reduce words to their stems
  )
)
# Print the matrix summary (dimensions, sparsity, weighting).
text_dtm
## <<DocumentTermMatrix (documents: 5559, terms: 6965)>>
## Non-/sparse entries: 43231/38675204
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
Split the data into training (first 4500 messages) and testing (remaining 1059 messages) datasets, together with the associated labels for each dataset.
# Positional split: rows 1-4500 are used for training, rows 4501-5559 for
# testing. Labels are taken from the same row ranges of `text`.
train_rows <- 1:4500
test_rows <- 4501:5559
text_dtm_train <- text_dtm[train_rows, ]
text_dtm_test <- text_dtm[test_rows, ]
text_dtm_train_labels <- text$type[train_rows]
text_dtm_test_labels <- text$type[test_rows]
Restrict the vocabulary to words that appear at least 10 times in the entire dataset, and keep only those words in the training and testing matrices.
# Terms whose total count over the whole matrix is at least 10.
# tm:: is spelled out so the call does not depend on tm having been attached
# by an earlier statement.
# NOTE(review): the vocabulary is selected from text_dtm, which also contains
# the test rows; consider text_dtm_train if the test set should stay unseen.
freq_words <- tm::findFreqTerms(text_dtm, lowfreq = 10)
# Keep only the frequent-term columns in both partitions.
text_dtm_train_freq_words <- text_dtm_train[, freq_words]
text_dtm_test_freq_words <- text_dtm_test[, freq_words]
# Convert term counts to presence/absence labels.
#
# x: numeric vector (or matrix) of term counts.
# Returns a character object of the same shape as `x`, holding "Yes" where
# the count is positive and "No" otherwise.
convert_counts <- function(x) {
  # Return the value directly: ending on an assignment made the result
  # invisible, so interactive calls printed nothing.
  ifelse(x > 0, "Yes", "No")
}
# Recode every term column (MARGIN = 2) of both partitions from counts to
# "Yes"/"No" presence indicators.
text_dtm_train_ready <- apply(
  X = text_dtm_train_freq_words,
  MARGIN = 2,
  FUN = convert_counts
)
text_dtm_test_ready <- apply(
  X = text_dtm_test_freq_words,
  MARGIN = 2,
  FUN = convert_counts
)
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.2
# Fit a naive Bayes classifier on the training indicators (Laplace smoothing
# of 1), then classify the held-out messages.
model_text <- naiveBayes(
  x = text_dtm_train_ready,
  y = text_dtm_train_labels,
  laplace = 1
)
predict_test <- predict(object = model_text, newdata = text_dtm_test_ready)
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.4.2
# Confusion matrix of predicted vs. actual classes on the test set; the
# chi-square and table-proportion cells are switched off.
CrossTable(
  x = predict_test,
  y = text_dtm_test_labels,
  prop.chisq = FALSE,
  prop.t = FALSE,
  dnn = c("predicted", "actual")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 1059
##
##
## | actual
## predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 911 | 22 | 933 |
## | 0.976 | 0.024 | 0.881 |
## | 0.997 | 0.152 | |
## -------------|-----------|-----------|-----------|
## spam | 3 | 123 | 126 |
## | 0.024 | 0.976 | 0.119 |
## | 0.003 | 0.848 | |
## -------------|-----------|-----------|-----------|
## Column Total | 914 | 145 | 1059 |
## | 0.863 | 0.137 | |
## -------------|-----------|-----------|-----------|
##
##