Use the same data and approach as in the post “Advanced classification _ Text classification”.
#install.packages("quanteda")
library (quanteda)
# Read the first 2000 rows of the SMS data set.
csv_path <- "/Users/lytran/Desktop/R_cheetsheet/Data for practice/spam_classificiation.csv"
raw_data <- read.csv(file = csv_path, nrows = 2000)
# Class balance of the label column before any renaming.
table(raw_data$v1)
##
## ham spam
## 1720 280
# Shuffle the rows reproducibly: fix the seed, draw a random permutation of
# the row indices, then reorder the data frame with it.
set.seed(2012)
shuffled_rows <- sample(nrow(raw_data))
raw_data <- raw_data[shuffled_rows, ]
# Give the two columns descriptive names.
colnames(raw_data) <- c("type", "message")
# corpus() is fed a character vector here, so make sure the message column is
# character before building the corpus.
raw_data[["message"]] <- as.character(raw_data[["message"]])
msg.corpus <- corpus(raw_data[["message"]])
msg.corpus[10]
## text10
## "Aight, I'll ask a few of my roommates"
# Attach the class label to each document of the corpus; the later
# corpus_subset() calls refer to it as docvar1.
docvars(msg.corpus) <- raw_data[["type"]]
# Word cloud of the spam messages.
# Keep only the documents whose attached label (docvar1) is 'spam'.
spam.plot <- corpus_subset(msg.corpus, docvar1 == 'spam')
# The word cloud is driven by word frequencies, so build a
# **document-feature matrix** with dfm(): rows are the texts, columns are the
# words, and each cell counts how often the word occurs in the text.
spam.plot <- dfm(spam.plot, tolower = TRUE, remove_punct = TRUE,
                 remove_twitter = TRUE, remove_numbers = TRUE)
spam.plot[1:10, 1:5]
## Document-feature matrix of: 10 documents, 5 features (84% sparse).
## 10 x 5 sparse Matrix of class "dfm"
## features
## docs today's offer claim ur worth
## text11 1 1 1 1 1
## text20 0 0 0 0 0
## text27 0 0 0 0 0
## text36 0 0 0 1 0
## text39 0 0 0 0 0
## text60 0 0 0 0 0
## text70 0 0 0 0 0
## text81 0 1 1 0 0
## text95 0 0 0 0 0
## text98 0 0 0 0 0
# The dfm above was already built with tolower = TRUE, so it is plotted
# directly; a second dfm(spam.plot, tolower = TRUE) pass would not change it.
textplot_wordcloud(spam.plot, min_count = 10, color = 'red') # only show words that occur at least 10 times
title("Spam WordCloud", col.main = 'Green')
# Word cloud of the ham (non-spam) messages.
# Keep only the documents whose attached label (docvar1) is 'ham'.
ham.plot <- corpus_subset(msg.corpus, docvar1 == 'ham')
# NOTE(review): unlike the spam dfm, no remove_* options are passed here, so
# punctuation stays in as features (see the "," column in the output below).
# Confirm whether that is intended before comparing the two clouds.
ham.plot <- dfm(ham.plot, tolower = TRUE)
ham.plot[1:10, 1:5]
## Document-feature matrix of: 10 documents, 5 features (82% sparse).
## 10 x 5 sparse Matrix of class "dfm"
## features
## docs ask g or iouri ,
## text1 1 1 1 1 1
## text2 0 0 0 0 0
## text3 0 0 0 0 0
## text4 0 0 0 0 0
## text5 0 0 0 0 0
## text6 0 0 0 0 1
## text7 0 0 0 0 0
## text8 0 0 0 0 0
## text9 0 0 0 0 3
## text10 1 0 0 0 1
# Only words occurring at least 30 times are drawn. The `color` argument is
# spelled out in full (as in the spam plot) instead of relying on partial
# matching of `col`.
textplot_wordcloud(ham.plot, min_count = 30, color = "blue",
                   min_size = 1, max_size = 7)
title("Ham WordCloud", col.main = 'Green')
# Separate into train and test data
nrow(raw_data)
## [1] 2000
# The first n.train shuffled rows train the model and the remaining rows
# evaluate it. The test range starts at n.train + 1 so that row 1000 is not
# part of both sets (a 1000:nrow(...) range would leak it into the test set).
# NOTE: the confusion matrix captured further down in this file was produced
# with the overlapping split (1001 test rows); rerun to refresh it.
n.train <- 1000
data.train <- raw_data[seq_len(n.train), ]
data.test <- raw_data[(n.train + 1):nrow(raw_data), ]
# Build the features used to train the model
# The msg.corpus object built above is reused (messages were already converted
# to character and to corpus format).
msg.dfm <- dfm(msg.corpus, tolower = TRUE)
# Keep words occurring >= 5 times overall and in >= 3 texts. min_termfreq is
# the current name of the deprecated min_count argument.
msg.dfm <- dfm_trim(msg.dfm, min_termfreq = 5, min_docfreq = 3)
head(msg.dfm)
## Document-feature matrix of: 6 documents, 950 features (98.7% sparse).
# Split the dfm with exactly the same row ranges as the data frame so that
# documents and labels stay aligned.
msg.dfm.train <- msg.dfm[seq_len(n.train), ]
msg.dfm.test <- msg.dfm[(n.train + 1):nrow(msg.dfm), ]
# Train the Naive Bayes model
# The training dfm supplies the features and the first column of data.train
# (the 'type' label) supplies the class of each training document.
nb.classifier <- textmodel_nb(msg.dfm.train, data.train[,1])
# The summary prints the class priors (0.5 for each class in the captured
# output below) and, per feature, a score for ham and for spam.
summary(nb.classifier) # the model outputs the probabilities of a message being spam or ham
##
## Call:
## textmodel_nb.dfm(x = msg.dfm.train, y = data.train[, 1])
##
## Class Priors:
## (showing first 2 elements)
## ham spam
## 0.5 0.5
##
## Estimated Feature Scores:
## ask g or , i've told the story like times
## ham 0.7853 0.413 0.2543 0.5472 0.8183 0.7558 0.6338 0.5846 0.7885 0.6633
## spam 0.2147 0.587 0.7457 0.4528 0.1817 0.2442 0.3662 0.4154 0.2115 0.3367
## already i your number to . love you so
## ham 0.7558 0.91158 0.3471 0.3788 0.3903 0.6752 0.8609 0.6413 0.7976
## spam 0.2442 0.08842 0.6529 0.6212 0.6097 0.3248 0.1391 0.3587 0.2024
## much can it fuck not ! know my *
## ham 0.8798 0.7928 0.8213 0.5296 0.7715 0.3252 0.7112 0.91649 0.5364
## spam 0.1202 0.2072 0.1787 0.4704 0.2285 0.6748 0.2888 0.08351 0.4636
## and head
## ham 0.6972 0.5846
## spam 0.3028 0.4154
# Test the built model with data.test
pred <- predict(nb.classifier, msg.dfm.test)
# Confusion matrix to check the accuracy: rows are the predicted labels,
# columns are the actual labels from the first column of data.test.
table(predicted = pred, actual = data.test[,1])
## actual
## predicted ham spam
## ham 827 5
## spam 30 139
# In the matrix above, 30 ham texts were wrongly classified as spam and
# 5 spam texts were wrongly classified as ham.
# Accuracy = share of test documents whose predicted label equals the actual
# label. It is computed from the predictions themselves rather than from
# hand-copied counts, so it cannot drift out of sync with the matrix above.
# Both sides are converted to character so the comparison does not depend on
# factor level sets.
mean(as.character(pred) == as.character(data.test[, 1]))
## [1] 0.965035
References: http://kenbenoit.net/assets/courses/essex2014qta/exercise2.pdf https://www.r-bloggers.com/text-message-classification/