Notice:

This note uses the same data and the same approach as the post “Advanced classification _ Text classification”.

Package for natural language processing (NLP)

# One-time setup (uncomment if quanteda is not installed yet):
# install.packages("quanteda")
library(quanteda)

# Load the first 2000 labelled messages; the class label is in column `v1`.
raw_data <- read.csv(
  "/Users/lytran/Desktop/R_cheetsheet/Data for practice/spam_classificiation.csv",
  nrows = 2000
)

# Class balance of the loaded rows.
table(raw_data$v1)
## 
##  ham spam 
## 1720  280

# Shuffle the rows reproducibly (fixed seed), then give the two columns
# descriptive names.
set.seed(2012)
raw_data <- raw_data[sample.int(nrow(raw_data)), ]
colnames(raw_data) <- c('type', 'message')

# corpus() is given character input, so coerce the message column first
# (it may have been read in as a factor).
raw_data[["message"]] <- as.character(raw_data[["message"]])
msg.corpus <- corpus(raw_data[["message"]])

# Peek at the tenth document of the corpus.
msg.corpus[10]
##                                  text10 
## "Aight, I'll ask a few of my roommates"

# Attach the class label to every document. No field name is supplied here;
# the subsetting code further down refers to this variable as `docvar1`.
docvars(msg.corpus) <- raw_data[["type"]]

# Word frequencies for the spam messages only.
# Keep the documents labelled 'spam' and turn them into a document-feature
# matrix with dfm(): each row is a text, each column a word, and each cell the
# number of times that word occurs in that text. Text is lower-cased, and
# punctuation, Twitter characters and numbers are dropped.
spam.plot <- dfm(
  corpus_subset(msg.corpus, docvar1 == 'spam'),
  tolower = TRUE,
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE
)

# First 10 documents and first 5 features of the matrix.
spam.plot[1:10, 1:5]
## Document-feature matrix of: 10 documents, 5 features (84% sparse).
## 10 x 5 sparse Matrix of class "dfm"
##         features
## docs     today's offer claim ur worth
##   text11       1     1     1  1     1
##   text20       0     0     0  0     0
##   text27       0     0     0  0     0
##   text36       0     0     0  1     0
##   text39       0     0     0  0     0
##   text60       0     0     0  0     0
##   text70       0     0     0  0     0
##   text81       0     1     1  0     0
##   text95       0     0     0  0     0
##   text98       0     0     0  0     0

Graph word cloud for Spam messages

# `spam.plot` is already a lower-cased document-feature matrix at this point,
# so it is plotted directly instead of being passed through dfm() a second
# time. Only words that occur at least 10 times are drawn.
textplot_wordcloud(spam.plot, min_count = 10, color = 'red')
title("Spam WordCloud", col.main = 'Green')

Graph word cloud for Ham messages

# Same word-cloud workflow for the ham (non-spam) messages: subset the corpus
# by its class label, then build a lower-cased document-feature matrix.
ham.plot <- corpus_subset(msg.corpus, docvar1 == 'ham')
# NOTE(review): unlike the spam matrix, punctuation and numbers are not removed
# here, which is why "," shows up as a feature in the output below.
ham.plot <- dfm(ham.plot, tolower = TRUE)

# First 10 documents and first 5 features of the matrix.
ham.plot[1:10, 1:5]
## Document-feature matrix of: 10 documents, 5 features (82% sparse).
## 10 x 5 sparse Matrix of class "dfm"
##         features
## docs     ask g or iouri ,
##   text1    1 1  1     1 1
##   text2    0 0  0     0 0
##   text3    0 0  0     0 0
##   text4    0 0  0     0 0
##   text5    0 0  0     0 0
##   text6    0 0  0     0 1
##   text7    0 0  0     0 0
##   text8    0 0  0     0 0
##   text9    0 0  0     0 3
##   text10   1 0  0     0 1

# Draw only words occurring at least 30 times. The colour argument is spelled
# out in full (`color`) rather than relying on partial matching of `col`,
# which also keeps this call consistent with the spam word cloud.
textplot_wordcloud(ham.plot, min_count = 30, color = "blue",
                   min_size = 1, max_size = 7)
title("Ham WordCloud", col.main = 'Green')

Build a predictive model to compute the probabilities of a message being spam or ham

# Separate the shuffled data into training and test sets.
nrow(raw_data)
## [1] 2000

# The first 1000 shuffled rows are used for training and all remaining rows
# for testing. The two index sets are disjoint, so no message is both trained
# on and evaluated. (Previously row 1000 was included in both sets.)
# NOTE: the confusion matrix recorded further down in this file was produced
# with the old overlapping split (1001 test rows) and should be regenerated.
n.train <- 1000L
train.idx <- seq_len(n.train)
test.idx <- setdiff(seq_len(nrow(raw_data)), train.idx)
data.train <- raw_data[train.idx, ]
data.test <- raw_data[test.idx, ]

# Build the model features from msg.corpus (the messages were already
# converted to character and put into corpus format above).
msg.dfm <- dfm(msg.corpus, tolower = TRUE)

# Keep words that occur at least 5 times overall and in at least 3 texts.
# `min_termfreq` replaces the deprecated `min_count` argument, which only
# produced a deprecation warning pointing to `min_termfreq`.
msg.dfm <- dfm_trim(msg.dfm, min_termfreq = 5, min_docfreq = 3)
head(msg.dfm)
## Document-feature matrix of: 6 documents, 950 features (98.7% sparse).

# Split the feature matrix with the same row indices so that its rows stay
# aligned with data.train and data.test.
msg.dfm.train <- msg.dfm[train.idx, ]
msg.dfm.test <- msg.dfm[test.idx, ]

# Fit a Naive Bayes classifier on the training features, using the labels in
# the first column of data.train as the response.
nb.classifier <- textmodel_nb(x = msg.dfm.train, y = data.train[, 1])

# The summary lists the class priors and the estimated per-feature scores
# for ham and spam.
summary(nb.classifier)
## 
## Call:
## textmodel_nb.dfm(x = msg.dfm.train, y = data.train[, 1])
## 
## Class Priors:
## (showing first 2 elements)
##  ham spam 
##  0.5  0.5 
## 
## Estimated Feature Scores:
##         ask     g     or      ,   i've   told    the  story   like  times
## ham  0.7853 0.413 0.2543 0.5472 0.8183 0.7558 0.6338 0.5846 0.7885 0.6633
## spam 0.2147 0.587 0.7457 0.4528 0.1817 0.2442 0.3662 0.4154 0.2115 0.3367
##      already       i   your number     to      .   love    you     so
## ham   0.7558 0.91158 0.3471 0.3788 0.3903 0.6752 0.8609 0.6413 0.7976
## spam  0.2442 0.08842 0.6529 0.6212 0.6097 0.3248 0.1391 0.3587 0.2024
##        much    can     it   fuck    not      !   know      my      *
## ham  0.8798 0.7928 0.8213 0.5296 0.7715 0.3252 0.7112 0.91649 0.5364
## spam 0.1202 0.2072 0.1787 0.4704 0.2285 0.6748 0.2888 0.08351 0.4636
##         and   head
## ham  0.6972 0.5846
## spam 0.3028 0.4154
# Predict the class of every message in the test feature matrix.
pred <- predict(nb.classifier, msg.dfm.test)

# Confusion matrix: rows are the predicted classes, columns the true labels.
table(predicted = pred, actual = data.test[, 1])
##          actual
## predicted ham spam
##      ham  827    5
##      spam  30  139
# In this recorded run, 30 ham messages were wrongly flagged as spam and
# 5 spam messages were missed (classified as ham).

# Accuracy = share of test messages whose predicted class equals the true
# label. It is computed from the predictions rather than from hand-typed
# counts, so it cannot drift out of sync with the confusion matrix. For the
# matrix recorded above this is (827 + 139) / 1001, i.e. about 0.965.
mean(as.character(pred) == as.character(data.test[, 1]))

References: http://kenbenoit.net/assets/courses/essex2014qta/exercise2.pdf https://www.r-bloggers.com/text-message-classification/