Spam Text Misclassification using Naive Bayes
The following libraries were imported
library(readr)
library(tidyr)
library(stringr)
library(quanteda)
library(RColorBrewer)
library(caret)
Importing File
# Read the raw SMS spam dataset from disk.
file1 <- read_csv("/Users/akul/Desktop/R Projects/spam.csv")

# Drop columns 3 through 5; the two names assigned below imply that only
# two columns are expected to remain.
file1 <- file1[, -(3:5)]
names(file1) <- c("label", "text")

# Lower-case the message text up front.
file1$text <- str_to_lower(file1$text)
Spam Wordcloud
# Build a corpus from the message text and attach the class labels as a
# document variable.
# NOTE(review): the label vector is assigned without a field name and the
# filter below refers to it as `docvar1` -- presumably the default name given
# by the installed quanteda version; confirm before upgrading quanteda.
corpus1 <- corpus(file1$text)
docvars(corpus1) <- file1$label

# Keep only the spam messages and turn them into a stemmed, lower-cased
# document-feature matrix with punctuation, Twitter characters, numbers and
# SMART stopwords removed.
spam_plot <- corpus_subset(corpus1, docvar1 == "spam")
spam_plot <- dfm(
  spam_plot,
  stem = TRUE,
  tolower = TRUE,
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

# Eight-colour "Dark2" palette for the cloud.
spam_col <- brewer.pal(8, "Dark2")

# Draw the word cloud for features occurring at least 18 times. The palette
# is passed as `colors`, spelled out in full (rather than relying on partial
# argument matching of `color`) and consistent with the ham word cloud.
spam_cloud <- textplot_wordcloud(spam_plot, min.freq = 18, colors = spam_col)
title("Spam Wordcloud")
Ham Wordcloud
# Restrict the corpus to ham messages, then build the document-feature
# matrix with the same preprocessing options used elsewhere in this script.
ham_plot <- corpus_subset(corpus1, docvar1 == "ham")
ham_plot <- dfm(
  ham_plot,
  stem = TRUE,
  tolower = TRUE,
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

# Eight-colour "Dark2" palette for the cloud.
ham.col <- brewer.pal(8, "Dark2")

# Plot features that occur at least 90 times; rot.per = .25 is forwarded to
# the word cloud call together with the palette.
ham.cloud <- textplot_wordcloud(
  ham_plot,
  min.freq = 90,
  colors = ham.col,
  rot.per = .25
)
title("Ham Wordcloud", col.main = "grey14")
Stratified sample using caret
# Fix the RNG seed so that the partition below is reproducible.
set.seed(90)

# Stratified 80/20 split on the class label; with list = FALSE the result is
# a one-column matrix of training row indices.
in.train <- createDataPartition(as.factor(file1$label), p = 0.8, list = FALSE)

# Document-feature matrix over the whole corpus with the same preprocessing
# used for the word clouds.
dfm.sms <- dfm(
  corpus1,
  stem = TRUE,
  tolower = TRUE,
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE,
  remove = stopwords("SMART")
)

# Frequency thresholds are not parameters of dfm(); passing min_docfreq /
# min_count there never trims anything. Apply them with dfm_trim() instead:
# keep features occurring at least 6 times overall and in at least 4
# documents. This shrinks the feature set, so downstream metrics may differ
# from runs made without trimming.
# NOTE(review): newer quanteda releases name the first threshold
# `min_termfreq`; `min_count` matches the API generation used in this script.
dfm.sms <- dfm_trim(dfm.sms, min_count = 6, min_docfreq = 4)

# Convert the index matrix to a plain integer vector once and reuse it.
train.rows <- as.integer(in.train)

# Split the raw data and the dfm with the same row indices so that labels
# and features stay aligned.
raw.data.train <- file1[train.rows, ]
raw.data.test <- file1[-train.rows, ]
spam.train <- dfm.sms[train.rows, ]
spam.test <- dfm.sms[-train.rows, ]
Naive Bayes Classifier
# Fit a Naive Bayes text model on the training dfm using the training labels.
spam.classifier <- textmodel_nb(spam.train, raw.data.train[["label"]])

# Predict on the held-out documents.
predictions.test <- predict(spam.classifier, newdata = spam.test)

# Cross-tabulate predicted classes (rows) against the true labels (columns).
table(
  predictions.test[["nb.predicted"]],
  raw.data.test[["label"]]
)
##
##        ham spam
##   ham  843    6
##   spam 122  143