It is often useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source, such as your own spam folder).
The SMS Spam Collection v.1 (hereafter the corpus) is a set of SMS messages collected for SMS spam research. It contains 5,574 SMS messages in English, each tagged as ham (legitimate) or spam.
Preview of data
## [1] "ham\tGo until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
## [2] "ham\tOk lar... Joking wif u oni..."
## [3] "spam\tFree entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
## [4] "ham\tU dun say so early hor... U c already then say..."
## [5] "ham\tNah I don't think he goes to usf, he lives around here though"
## [6] "spam\tFreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
## [7] "ham\tEven my brother is not like to speak with me. They treat me like aids patent."
## [8] "ham\tAs per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune"
## [9] "spam\tWINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
## [10] "spam\tHad your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030"
Loading packages and data
library(tm)
## Warning: package 'tm' was built under R version 4.2.3
## Loading required package: NLP
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.2.3
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.3
## Loading required package: RColorBrewer
library(RColorBrewer)
# Loading data
# data is in a CSV file with columns 'label' and 'text'
RawSMS <- read.csv("C:/Users/aleja/Desktop/SMSSpamCollection.csv", sep="\t", header=TRUE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec,
## : EOF within quoted string
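The “EOF within quoted string” warning is worth taking seriously: unescaped quote characters cut the import short, which is why only 3,183 of the corpus’s 5,574 messages appear below. A minimal sketch of a common fix, assuming the same file path, is to disable quote handling (re-running the analysis would then use the full corpus, so the counts shown below would change):
# Disable quote handling so stray quotes in messages don't truncate the import
RawSMS <- read.csv("C:/Users/aleja/Desktop/SMSSpamCollection.csv",
                   sep = "\t", header = TRUE, quote = "",
                   stringsAsFactors = FALSE)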
#Selecting & renaming appropriate columns of the dataset
RawSMS <- RawSMS[, 1:2]
colnames(RawSMS) <- c("Tag", "Msg")
str(RawSMS)
## 'data.frame': 3183 obs. of 2 variables:
## $ Tag: chr "ham" "spam" "ham" "ham" ...
## $ Msg: chr "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." "Nah I don't think he goes to usf, he lives around here though" ...
# Converting 'Tag' column to a factor
RawSMS$Tag <- factor(RawSMS$Tag)
# Creating the corpus
text_corpus <- Corpus(VectorSource(RawSMS$Msg))
# Viewing the content of the first five texts using lapply
lapply(text_corpus[1:5], as.character)
## [[1]]
## [1] "Ok lar... Joking wif u oni..."
##
## [[2]]
## [1] "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's"
##
## [[3]]
## [1] "U dun say so early hor... U c already then say..."
##
## [[4]]
## [1] "Nah I don't think he goes to usf, he lives around here though"
##
## [[5]]
## [1] "FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv"
# Cleaning the corpus
cleanCorpus <- tm_map(text_corpus, content_transformer(tolower)) # lowercase all texts
## Warning in tm_map.SimpleCorpus(text_corpus, content_transformer(tolower)):
## transformation drops documents
cleanCorpus <- tm_map(cleanCorpus, removeNumbers) # remove digits
## Warning in tm_map.SimpleCorpus(cleanCorpus, removeNumbers): transformation
## drops documents
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords('english')) # remove common English stopwords
## Warning in tm_map.SimpleCorpus(cleanCorpus, removeWords, stopwords("english")):
## transformation drops documents
cleanCorpus <- tm_map(cleanCorpus, removePunctuation) # remove punctuation
## Warning in tm_map.SimpleCorpus(cleanCorpus, removePunctuation): transformation
## drops documents
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace) # collapse extra whitespace
## Warning in tm_map.SimpleCorpus(cleanCorpus, stripWhitespace): transformation
## drops documents
# Create Document-Term Matrix
text_dtm <- DocumentTermMatrix(cleanCorpus)
# Inspect the Document-Term Matrix
inspect(text_dtm)
## <<DocumentTermMatrix (documents: 3183, terms: 7970)>>
## Non-/sparse entries: 34619/25333891
## Sparsity : 100%
## Maximal term length: 40
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs call can free get ham just ltgt now spam will
## 1517 0 5 1 1 23 0 8 5 4 1
## 1875 18 17 6 9 157 15 14 17 23 10
## 2452 15 8 9 23 185 6 15 20 26 11
## 2471 8 15 4 10 93 5 16 8 12 5
## 2550 30 9 9 12 145 10 9 21 33 12
## 2963 35 19 9 24 269 16 9 24 39 16
## 3183 53 38 27 36 434 30 25 36 58 45
## 472 0 0 0 1 0 0 0 0 0 11
## 670 15 13 12 21 247 27 9 29 30 17
## 98 70 45 33 37 528 37 25 50 85 40
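The matrix is almost entirely zeros. Later we keep only frequent terms with findFreqTerms(); an alternative worth knowing is tm’s removeSparseTerms(), which drops terms missing from most documents. A minimal sketch (the 0.999 threshold is an illustrative choice, not from the original analysis):
# Drop terms absent from more than 99.9% of documents
text_dtm_small <- removeSparseTerms(text_dtm, sparse = 0.999)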
A split of 75% for training and 25% for testing is a common choice, but the “right” proportion depends on the size of the dataset and the goals of the analysis.
# Creating train and test portions
train_size <- 0.75 # Proportion for training
total_rows <- nrow(text_dtm)
# Calculating the split index
split_index <- round(train_size * total_rows)
# Splitting the data
train_dtm <- text_dtm[1:split_index, ]
test_dtm <- text_dtm[(split_index + 1):total_rows, ]
# Extracting labels for training and testing sets
train_labels <- RawSMS$Tag[1:split_index]
test_labels <- RawSMS$Tag[(split_index + 1):total_rows]
# Verifying the proportions in training and testing sets
tbl_train <- prop.table(table(train_labels))
tbl_test <- prop.table(table(test_labels))
# Printing proportions
cat("Training set proportions:\n", tbl_train, "\n\n")## Training set proportions:
## 0.8638458 0.1361542
## Testing set proportions:
## 0.8580402 0.1419598
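Note that the split above takes the first 75% of rows in file order, which is only safe if the messages are not sorted by class. A more defensive sketch uses a random permutation of the row indices (the seed value is arbitrary):
set.seed(123) # arbitrary seed, for reproducibility
shuffled <- sample(nrow(text_dtm)) # random permutation of row indices
split_index <- round(0.75 * nrow(text_dtm))
train_dtm <- text_dtm[shuffled[1:split_index], ]
test_dtm <- text_dtm[shuffled[(split_index + 1):nrow(text_dtm)], ]
train_labels <- RawSMS$Tag[shuffled[1:split_index]]
test_labels <- RawSMS$Tag[shuffled[(split_index + 1):nrow(text_dtm)]]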
We’ll convert the sparse matrix into a format suitable for training the Naive Bayes model. This involves identifying the most frequent words in the texts using the findFreqTerms() function.
# Subsetting the spam messages
spamText <- subset(RawSMS, Tag == "spam")
# Defining a vector of darker colors
custom_colors <- c("#8B0000", "#006400", "#00008B", "#FF8C00", "#800080", "#FF1493", "#008B8B", "#8B4513", "#FFD700")
# Creating a word cloud with darker colors
wordcloud(words = spamText$Msg, max.words = 50, scale = c(5, 0.3),
          random.order = FALSE, rot.per = 0.15, colors = custom_colors)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
# Subsetting the ham messages
hamText <- subset(RawSMS, Tag == "ham")
# Define a vector of darker colors
custom_colors <- c("#8B0000", "#006400", "#00008B", "#FF8C00", "#800080", "#FF1493", "#008B8B", "#8B4513", "#FFD700")
# Creating a word cloud with darker colors for ham messages
wordcloud(words = hamText$Msg, max.words = 50, scale = c(5, 0.3),
          random.order = FALSE, rot.per = 0.15, colors = custom_colors)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
# Identifying words that appear frequently in the training data
# (a minimum frequency of 5 is assumed here; the threshold is not stated in the original)
freq_words <- findFreqTerms(train_dtm, 5)
str(freq_words)
## chr [1:1087] "lar" "wif" "apply" "comp" "cup" "entry" "final" "free" "may" ...
# Selecting only the frequent words from the train and test datasets
freq_words_train <- train_dtm[, freq_words]
freq_words_test <- test_dtm[, freq_words]
# Creating a function for conversion
convert <- function(x) {
  ifelse(x > 0, "y", "n") # mark a term as present ("y") or absent ("n")
}
# Applying the conversion function to train and test datasets
train <- apply(freq_words_train, MARGIN = 2, convert)
test <- apply(freq_words_test, MARGIN = 2, convert)
# Verifying the conversion
str(train)
## chr [1:2387, 1:1087] "y" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" "n" ...
## - attr(*, "dimnames")=List of 2
## ..$ Docs : chr [1:2387] "1" "2" "3" "4" ...
## ..$ Terms: chr [1:1087] "lar" "wif" "apply" "comp" ...
library(e1071) # provides naiveBayes()
## Warning: package 'e1071' was built under R version 4.2.3
# Creating a Naive Bayes classifier
sms_classifier <- naiveBayes(train, train_labels)
# Making predictions on the test set
test_predictions <- predict(sms_classifier, test)
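e1071’s naiveBayes() also accepts a laplace argument for additive smoothing, which keeps a term that never appears in one class of the training data from zeroing out that class’s posterior. A minimal sketch (laplace = 1 is an illustrative value, not part of the original model):
# Refit with Laplace smoothing, adding 1 to each term/class count
sms_classifier_smooth <- naiveBayes(train, train_labels, laplace = 1)
smooth_predictions <- predict(sms_classifier_smooth, test)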
# Evaluating the model
library(gmodels) # Load the gmodels library for CrossTable
## Warning: package 'gmodels' was built under R version 4.2.3
# Creating a confusion table
confusion_table <- CrossTable(test_predictions, test_labels,
                              prop.chisq = FALSE, prop.t = FALSE,
                              dnn = c('Predicted', 'Actual'))
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## |-------------------------|
##
##
## Total Observations in Table: 796
##
##
## | Actual
## Predicted | ham | spam | Row Total |
## -------------|-----------|-----------|-----------|
## ham | 680 | 26 | 706 |
## | 0.963 | 0.037 | 0.887 |
## | 0.996 | 0.230 | |
## -------------|-----------|-----------|-----------|
## spam | 3 | 87 | 90 |
## | 0.033 | 0.967 | 0.113 |
## | 0.004 | 0.770 | |
## -------------|-----------|-----------|-----------|
## Column Total | 683 | 113 | 796 |
## | 0.858 | 0.142 | |
## -------------|-----------|-----------|-----------|
##
##
# Inspecting the list of counts and proportions CrossTable returned
confusion_table
## $t
## y
## x ham spam
## ham 680 26
## spam 3 87
##
## $prop.row
## y
## x ham spam
## ham 0.96317280 0.03682720
## spam 0.03333333 0.96666667
##
## $prop.col
## y
## x ham spam
## ham 0.995607613 0.230088496
## spam 0.004392387 0.769911504
##
## $prop.tbl
## y
## x ham spam
## ham 0.854271357 0.032663317
## spam 0.003768844 0.109296482
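From the table, the classifier is correct on (680 + 87) of 796 test messages, roughly 96.4% accuracy, misclassifying 26 spam messages as ham and only 3 ham messages as spam. A small sketch pulling the overall accuracy from the list CrossTable returned:
# Overall accuracy from the confusion counts: (680 + 87) / 796 ~ 0.964
accuracy <- sum(diag(confusion_table$t)) / sum(confusion_table$t)
accuracy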