This analysis builds a spam vs. ham classifier using Random Forest. The code is based on the approach described at https://kharshit.github.io/blog/2017/08/25/email-spam-filtering-text-analysis-in-r.
After the initial analysis, I was interested in testing whether random forest accuracy could be improved by giving the model more data. The first run uses one set of spam and one set of ham messages; the second run uses all of the spam and ham messages available in this corpus. The spam / ham data came from https://spamassassin.apache.org/old/publiccorpus/
#One randomly selected SPAM & HAM dataset from https://spamassassin.apache.org/old/publiccorpus/
spamdata <- readtext::readtext("/Users/johnnyrodriguez/spamham/spam", encoding='UTF-8')
hamdata <- readtext::readtext("/Users/johnnyrodriguez/spamham/ham", encoding='UTF-8')
#Label spam records
spamdata <- spamdata %>%
  add_column(label = "spam")
#Label ham records
hamdata <- hamdata %>%
  add_column(label = "ham")
#Combines the spam & ham data into a single table
emails <- rbind(hamdata, spamdata)
glimpse(emails)
## Rows: 3,947
## Columns: 3
## $ doc_id <chr> "0001.ea7e79d3153e7469e7a9c3e0af6a357e", "0002.b3120c4bcbf3101e…
## $ text <chr> "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002\n…
## $ label <chr> "ham", "ham", "ham", "ham", "ham", "ham", "ham", "ham", "ham", …
#For reproducibility, the email data was written to CSV and copied to github. The CSV is reloaded for the analysis.
write.csv(emails, "emails.csv", row.names=FALSE)
#Read the CSV from github
emails <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/data607_project4/main/emails.csv", na.strings=c("","NA"))
#checking the count of records for spam & ham
emails %>%
  group_by(label) %>%
  count(label)
## # A tibble: 2 × 2
## # Groups: label [2]
## label n
## <chr> <int>
## 1 ham 2551
## 2 spam 1396
#Clean up the email text using the tm (text mining) library
library(tm)
corpus = VCorpus(VectorSource(emails$text)) #Vectorizes corpus
corpus = tm_map(corpus, content_transformer(tolower)) #converts to lowercase
corpus = tm_map(corpus, PlainTextDocument) #makes all plain text
corpus = tm_map(corpus, removePunctuation) #removes punctuation
corpus = tm_map(corpus, removeWords, stopwords("en")) #removes stopwords
corpus = tm_map(corpus, stemDocument) #stems words
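As an added spot check (not part of the original run), the first cleaned document can be printed to confirm the transformations behaved as expected.
#Prints the cleaned, stemmed text of the first document (added spot check)
as.character(corpus[[1]])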
#Creates a Document Term Matrix from the corpus
dtm = DocumentTermMatrix(corpus)
#Removes terms that appear in fewer than 5% of the documents
sparsedtm = removeSparseTerms(dtm, 0.95)
sparsedtm
## <<DocumentTermMatrix (documents: 3947, terms: 489)>>
## Non-/sparse entries: 310356/1619727
## Sparsity : 84%
## Maximal term length: 49
## Weighting : term frequency (tf)
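As another optional check not in the original analysis, tm's findFreqTerms() lists which of the surviving terms are most common; the 500-occurrence cutoff below is an arbitrary assumption.
#Lists terms in the pruned matrix that occur at least 500 times overall (optional check)
findFreqTerms(sparsedtm, lowfreq = 500)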
#converts sparse document term matrix to a dataframe
emailsSparse = as.data.frame(as.matrix(sparsedtm))
#makes the variable names (the sparse terms) the column names
colnames(emailsSparse) = make.names(colnames(emailsSparse))
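A small illustration (added for clarity) of what make.names() does: terms that start with a digit get an "X" prefix so they become valid R column names, which is why the str() output below shows names like X0000 and X7bit.
#Illustration only: make.names() converts raw terms into valid column names
make.names(c("0000", "7bit"))  # returns "X0000" "X7bit"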
#converts the spam variable into a factor used in the model
emailsSparse$spam = as.factor(emailsSparse$spam)
str(emailsSparse)
## 'data.frame': 3947 obs. of 489 variables:
## $ X0000 : num 0 5 5 1 0 5 7 7 5 7 ...
## $ X0100 : num 2 5 4 2 2 4 4 4 4 4 ...
## $ X0200 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X0400 : num 6 1 1 5 6 1 1 1 1 1 ...
## $ X0500 : num 1 0 0 0 0 0 0 0 0 0 ...
## $ X0700 : num 3 0 0 0 0 0 0 0 0 0 ...
## $ X0800 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X100 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X1000 : num 0 0 0 0 3 0 0 0 0 0 ...
## $ X103113 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X127001 : num 4 2 2 3 3 2 2 2 2 2 ...
## $ X193120211219 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X19317254 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X2001 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X2002 : num 14 12 12 11 16 11 11 12 11 12 ...
## $ X201 : num 1 0 0 0 1 0 0 0 0 0 ...
## $ X2011 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X209sfnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X212173515 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X213105180140 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X216136171252 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X331vamm2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X6416122236 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X7bit : num 0 1 1 0 0 1 1 0 1 0 ...
## $ X81128112 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X81168116 : num 6 1 1 4 5 1 2 2 1 2 ...
## $ X81168116egwn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X893893 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X8bit : num 0 0 0 0 0 0 0 2 0 2 ...
## $ abl : num 1 0 0 0 0 0 0 0 0 0 ...
## $ access : num 0 0 0 0 0 0 0 0 0 0 ...
## $ account : num 0 0 0 0 0 0 0 0 0 0 ...
## $ actual : num 1 0 0 0 0 0 0 0 1 0 ...
## $ add : num 0 0 0 0 1 0 0 0 0 0 ...
## $ address : num 0 0 0 0 0 0 0 0 0 0 ...
## $ allow : num 0 0 0 0 0 0 0 0 0 0 ...
## $ alreadi : num 0 0 0 2 0 0 0 0 0 0 ...
## $ also : num 0 0 0 0 2 0 0 0 0 0 ...
## $ alway : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anoth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anyon : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anyth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ around : num 0 0 0 0 0 0 0 0 0 0 ...
## $ ask : num 0 0 0 0 0 1 1 0 0 0 ...
## $ aug : num 13 12 11 9 15 11 13 13 11 14 ...
## $ auth02nlegwnnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ avail : num 0 0 1 0 0 0 0 0 0 0 ...
## $ back : num 0 0 0 0 0 0 0 0 0 0 ...
## $ base : num 0 0 0 0 0 0 0 0 0 0 ...
## $ begin : num 0 0 0 0 0 0 0 0 0 0 ...
## $ best : num 0 0 0 0 0 0 0 0 0 0 ...
## $ better : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bill : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bodi : num 0 0 0 0 0 0 1 1 0 1 ...
## $ border0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ border3d0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ build : num 0 0 3 0 0 0 0 0 0 0 ...
## $ bulk : num 1 1 1 1 1 1 1 1 1 1 ...
## $ busi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bythinkgeek : num 0 0 0 0 0 0 0 0 0 0 ...
## $ call : num 0 0 0 0 0 0 0 0 0 0 ...
## $ can : num 0 0 0 1 1 0 0 0 0 0 ...
## $ cant : num 1 0 0 0 0 0 0 0 0 0 ...
## $ card : num 0 0 0 0 0 0 0 0 0 0 ...
## $ case : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cdt : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cellspacing3d0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ center : num 0 0 0 0 0 0 0 0 0 0 ...
## $ chang : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charsetiso88591 : num 0 0 0 0 0 0 0 1 0 1 ...
## $ charsetusascii : num 1 1 1 1 0 1 1 0 1 0 ...
## $ check : num 0 0 0 0 0 0 0 0 0 0 ...
## $ claim : num 0 0 1 0 0 0 0 0 0 0 ...
## $ clean : num 0 0 0 0 0 0 0 0 0 0 ...
## $ click : num 0 0 0 0 0 0 0 0 0 0 ...
## $ code : num 1 0 0 0 0 0 0 0 0 0 ...
## $ come : num 2 0 0 0 0 0 0 1 0 0 ...
## $ comment : num 0 0 0 0 0 0 0 0 0 0 ...
## $ communic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ compani : num 0 0 0 0 0 0 0 0 0 0 ...
## $ complet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ comput : num 0 0 0 0 0 0 0 0 0 0 ...
## $ contact : num 0 1 1 0 0 1 1 1 1 1 ...
## $ contentdisposit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ contenttransferencod : num 0 1 1 0 0 1 1 1 1 1 ...
## $ contenttyp : num 1 1 1 1 0 1 1 1 1 1 ...
## $ cost : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cours : num 0 0 0 0 0 0 0 0 0 0 ...
## $ creat : num 1 0 0 0 0 0 0 0 0 0 ...
## $ credit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ current : num 0 0 0 0 0 0 0 0 0 0 ...
## $ custom : num 0 0 0 0 0 0 0 0 0 0 ...
## $ data : num 0 0 0 1 0 0 0 0 0 0 ...
## $ date : num 2 1 1 1 1 1 1 1 1 1 ...
## $ day : num 1 0 0 0 0 0 0 0 0 0 ...
## $ debian : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deliveredto : num 2 2 2 1 2 2 2 2 2 2 ...
## $ deliveryd : num 0 0 0 0 0 0 0 0 0 0 ...
## [list output truncated]
#splits the data into training and test sets
library(caTools)
set.seed(123)
spl = sample.split(emailsSparse$spam, 0.7)
train = subset(emailsSparse, spl == TRUE)
test = subset(emailsSparse, spl == FALSE)
#Use Random Forest to train the model and predict the label using the training data
library(randomForest)
set.seed(123)
spamRF = randomForest(spam~., data=train)
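To see which terms drive the forest, the model's importance measures can be inspected. This is an optional addition, not something run in the original analysis.
#Plots the 15 terms with the largest mean decrease in Gini impurity (optional)
varImpPlot(spamRF, n.var = 15)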
#calculate the accuracy of the RF model using the training data
predTrainRF = predict(spamRF, type="prob")[,2]
table(train$spam, predTrainRF > 0.5)
##
## FALSE TRUE
## 0 2537 0
## 1 89 8
## 2 63 1
## 3 22 0
## 4 16 0
## 5 11 0
## 6 6 0
## 7 4 0
## 8 2 0
## 9 1 0
## 10 1 0
## 12 1 0
## 14 1 0
## 15 1 0
##Accuracy Calculation for Training Data
(2537+8)/nrow(train)
## [1] 0.920767
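The same figure can be computed without hand-copying cells from the confusion matrix. The sketch below uses the forest's out-of-bag class predictions, so the number may differ slightly from the threshold-based calculation above; it is an added alternative, not the original code.
#Accuracy from the out-of-bag confusion matrix (alternative sketch)
cm <- table(train$spam, predict(spamRF))
sum(diag(cm)) / sum(cm)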
#calculate the accuracy of the model using the testing data
predTestRF = predict(spamRF, newdata=test, type="prob")[,2]
table(test$spam, predTestRF > 0.5)
##
## FALSE TRUE
## 0 1087 0
## 1 37 4
## 2 27 0
## 3 9 0
## 4 7 0
## 5 5 0
## 6 3 0
## 7 2 0
## 8 1 0
## 9 0 0
## 10 0 0
## 12 0 0
## 14 0 0
## 15 1 0
##Accuracy Calculation for Test Data
(1087+4)/nrow(test)
## [1] 0.9222316
# All SPAM & HAM datasets from https://spamassassin.apache.org/old/publiccorpus/
spamdata1 <- readtext::readtext("/Users/johnnyrodriguez/spamham/spam1", encoding='UTF-8')
hamdata1 <- readtext::readtext("/Users/johnnyrodriguez/spamham/ham1", encoding='UTF-8')
#Label spam records
spamdata1 <- spamdata1 %>%
  add_column(label = "spam")
#Label ham records
hamdata1 <- hamdata1 %>%
  add_column(label = "ham")
#Combines the spam & ham data into a single table
emails1 <- rbind(hamdata1, spamdata1)
#For reproducibility, the email data was written to CSV and copied to github.
#Due to its size, the file was zipped. The zip is available at https://github.com/johnnydrodriguez/data607_project4/blob/main/emails1.csv.zip
write.csv(emails1, "emails1.csv", row.names=FALSE)
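The zipped CSV can be reloaded directly from GitHub by downloading it to a temp file and reading it with unz(). This is only a sketch: the raw-download URL and the file name inside the zip (emails1.csv) are assumed.
#Reloads the zipped CSV from GitHub (sketch; raw URL and inner file name assumed)
tmp <- tempfile(fileext = ".zip")
download.file("https://github.com/johnnydrodriguez/data607_project4/raw/main/emails1.csv.zip", tmp, mode = "wb")
emails1 <- read.csv(unz(tmp, "emails1.csv"), na.strings = c("", "NA"))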
#checking the count of records for spam & ham
emails1 %>%
  group_by(label) %>%
  count(label)
## # A tibble: 2 × 2
## # Groups: label [2]
## label n
## <chr> <int>
## 1 ham 6951
## 2 spam 2398
#Clean up the email text using the tm (text mining) library
library(tm)
corpus1 = VCorpus(VectorSource(emails1$text)) #Vectorizes corpus
corpus1 = tm_map(corpus1, content_transformer(tolower)) #converts to lowercase
corpus1 = tm_map(corpus1, PlainTextDocument) #makes all plain text
corpus1 = tm_map(corpus1, removePunctuation) #removes punctuation
corpus1 = tm_map(corpus1, removeWords, stopwords("en")) #removes stopwords
corpus1 = tm_map(corpus1, stemDocument) #stems words
#Creates a Document Term Matrix from the corpus
dtm1 = DocumentTermMatrix(corpus1)
#Removes terms that appear in fewer than 5% of the documents
sparsedtm1 = removeSparseTerms(dtm1, 0.95)
sparsedtm1
## <<DocumentTermMatrix (documents: 9349, terms: 550)>>
## Non-/sparse entries: 792547/4349403
## Sparsity : 85%
## Maximal term length: 49
## Weighting : term frequency (tf)
#converts sparse document term matrix to a dataframe
emailsSparse1 = as.data.frame(as.matrix(sparsedtm1))
#makes the variable names (the sparse terms) the column names
colnames(emailsSparse1) = make.names(colnames(emailsSparse1))
#converts the spam variable into a factor used in the model
emailsSparse1$spam = as.factor(emailsSparse1$spam)
str(emailsSparse1)
## 'data.frame': 9349 obs. of 550 variables:
## $ X0000 : num 0 0 1 4 5 5 4 0 5 4 ...
## $ X0100 : num 2 2 0 2 5 0 2 1 4 0 ...
## $ X0200 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X0400 : num 6 6 0 7 1 0 7 1 1 1 ...
## $ X0500 : num 1 1 1 2 0 1 2 1 0 5 ...
## $ X0700 : num 3 3 0 0 0 0 1 0 0 2 ...
## $ X0800 : num 0 0 3 0 0 0 0 0 0 0 ...
## $ X100 : num 0 0 0 0 0 2 0 0 0 1 ...
## $ X103113 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X127001 : num 4 4 0 4 2 0 4 0 2 1 ...
## $ X193120211219 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X19317254 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X19412514545 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X2000 : num 0 0 1 0 0 0 0 0 0 0 ...
## $ X2002 : num 12 14 8 16 12 10 16 5 12 16 ...
## $ X201 : num 1 1 0 1 0 0 1 0 0 0 ...
## $ X2011 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X209sfnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X212173515 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X213105180140 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X216136171252 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X331vamm2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X500 : num 0 0 0 0 0 1 0 0 0 0 ...
## $ X6416122236 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X7bit : num 0 0 0 1 1 0 1 0 1 0 ...
## $ X81168116 : num 6 6 0 4 1 0 4 1 1 0 ...
## $ X81168116egwn : num 0 0 0 0 0 0 0 0 0 0 ...
## $ X893893 : num 0 0 0 0 0 1 0 1 0 0 ...
## $ X8bit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ abl : num 0 1 0 0 0 1 0 0 0 0 ...
## $ access : num 0 0 0 0 0 2 0 0 0 0 ...
## $ account : num 0 0 3 0 0 3 0 0 0 0 ...
## $ actual : num 1 1 1 0 0 0 0 0 0 0 ...
## $ add : num 0 0 1 1 0 0 0 0 0 0 ...
## $ address : num 0 0 0 0 0 9 0 0 0 2 ...
## $ allow : num 0 0 0 0 0 1 0 0 0 0 ...
## $ alreadi : num 0 0 1 0 0 0 0 0 0 1 ...
## $ also : num 4 0 0 0 0 3 0 0 0 1 ...
## $ alway : num 1 0 0 0 0 0 1 0 0 1 ...
## $ anoth : num 0 0 0 0 0 1 0 1 0 0 ...
## $ anyon : num 0 0 0 0 0 0 0 0 0 0 ...
## $ anyth : num 0 0 0 0 0 0 0 0 0 0 ...
## $ applic : num 0 0 0 0 0 0 0 0 0 0 ...
## $ around : num 1 0 0 0 0 0 1 0 0 0 ...
## $ ask : num 0 0 4 0 0 0 0 0 0 0 ...
## $ aug : num 12 13 0 16 12 0 16 0 11 0 ...
## $ august : num 0 0 0 0 0 0 0 0 1 0 ...
## $ auth02nlegwnnet : num 0 0 0 0 0 0 0 0 0 0 ...
## $ avail : num 0 0 0 0 0 2 0 0 1 4 ...
## $ away : num 0 0 1 0 0 0 0 0 1 0 ...
## $ back : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bad : num 0 0 1 0 0 0 0 0 0 0 ...
## $ base : num 0 0 0 0 0 0 0 0 0 0 ...
## $ begin : num 0 0 0 1 0 1 1 0 0 2 ...
## $ believ : num 0 0 0 0 0 0 0 0 0 1 ...
## $ best : num 1 0 1 0 0 0 0 0 0 0 ...
## $ better : num 1 0 0 0 0 0 0 0 0 0 ...
## $ bgcolorffffff : num 0 0 0 0 0 0 0 0 0 0 ...
## $ big : num 0 0 3 1 0 0 0 0 0 0 ...
## $ bit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ bodi : num 3 0 0 0 0 0 0 0 0 0 ...
## $ border0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ box : num 1 0 0 1 0 2 0 0 0 0 ...
## $ build : num 0 0 0 0 0 0 0 0 3 0 ...
## $ bulk : num 1 1 0 1 1 0 1 0 1 2 ...
## $ busi : num 2 0 0 0 0 1 0 0 0 1 ...
## $ buy : num 0 0 3 0 0 0 0 0 0 0 ...
## $ call : num 0 0 2 0 0 0 0 0 0 0 ...
## $ can : num 2 0 9 0 0 1 1 0 0 3 ...
## $ cant : num 1 1 1 0 0 0 0 0 0 0 ...
## $ card : num 0 0 3 0 0 0 0 0 0 0 ...
## $ case : num 0 0 0 0 0 2 0 0 0 0 ...
## $ cdt : num 0 0 0 0 0 0 0 0 0 5 ...
## $ cellpadding0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ cellspacing0 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ center : num 0 0 1 0 0 1 0 0 0 0 ...
## $ chang : num 3 0 2 0 0 0 1 0 0 0 ...
## $ charsetiso88591 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ charsetusascii : num 1 1 1 2 1 0 1 0 1 1 ...
## $ check : num 0 0 1 1 0 1 0 0 0 0 ...
## $ claim : num 0 0 0 0 0 3 0 0 1 0 ...
## $ clean : num 0 0 0 1 0 0 1 0 0 0 ...
## $ click : num 0 0 0 0 0 2 0 0 0 0 ...
## $ code : num 0 1 0 0 0 0 0 0 0 0 ...
## $ come : num 0 2 0 0 0 0 0 0 0 0 ...
## $ comment : num 0 0 0 1 0 0 1 0 0 2 ...
## $ communic : num 0 0 0 0 0 1 0 0 0 0 ...
## $ compani : num 0 0 1 0 0 5 0 0 0 1 ...
## $ complet : num 0 0 0 0 0 8 0 0 0 0 ...
## $ comput : num 0 0 0 0 0 2 0 0 0 0 ...
## $ contact : num 0 0 1 0 1 0 0 1 1 3 ...
## $ content : num 0 0 0 0 1 0 0 0 0 0 ...
## $ contentdisposit : num 0 0 0 0 0 0 0 0 0 0 ...
## $ contenttransferencod : num 0 0 0 1 1 0 1 0 1 1 ...
## $ contenttyp : num 1 1 1 4 1 0 3 0 1 1 ...
## $ copi : num 0 0 0 0 0 0 0 0 0 0 ...
## $ copyright : num 0 0 1 0 0 0 0 0 0 2 ...
## $ cost : num 0 0 0 0 0 0 0 0 0 1 ...
## $ cours : num 0 0 2 0 0 0 0 0 0 0 ...
## [list output truncated]
#splits the data into training and test sets
library(caTools)
set.seed(123)
spl1 = sample.split(emailsSparse1$spam, 0.7)
train1 = subset(emailsSparse1, spl1 == TRUE)
test1 = subset(emailsSparse1, spl1 == FALSE)
#Use Random Forest to train the model and predict the label using the training data
library(randomForest)
set.seed(123)
spamRF1 = randomForest(spam~., data=train1)
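A compact way to compare the two fits on a common footing (an added sketch, not part of the original writeup) is the out-of-bag error rate each forest reports after its final tree.
#Out-of-bag error rate of each forest at its final tree (comparison sketch)
spamRF$err.rate[spamRF$ntree, "OOB"]
spamRF1$err.rate[spamRF1$ntree, "OOB"]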
#calculate the accuracy of the RF model using the training data
predTrainRF1 = predict(spamRF1, type="prob")[,2]
table(train1$spam, predTrainRF1 > 0.5)
##
## FALSE TRUE
## 0 5909 1
## 1 152 122
## 2 169 3
## 3 51 0
## 4 41 0
## 5 31 0
## 6 14 0
## 7 10 0
## 8 6 0
## 9 8 0
## 10 2 0
## 11 1 0
## 12 4 0
## 13 1 0
## 14 7 0
## 15 6 0
## 16 1 0
## 18 2 0
## 19 1 0
## 20 1 0
## 23 1 0
## 39 1 0
##Accuracy Calculation for Training Data
(5909+122)/nrow(train1)
## [1] 0.9214668
#calculate the accuracy of the model using the testing data
predTestRF1 = predict(spamRF1, newdata=test1, type="prob")[,2]
table(test1$spam, predTestRF1 > 0.5)
##
## FALSE TRUE
## 0 2533 0
## 1 53 65
## 2 73 0
## 3 22 0
## 4 18 0
## 5 13 0
## 6 6 0
## 7 4 0
## 8 3 0
## 9 3 0
## 10 1 0
## 11 0 0
## 12 1 0
## 13 0 0
## 14 3 0
## 15 3 0
## 16 0 0
## 18 1 0
## 19 1 0
## 20 0 0
## 23 0 0
## 39 1 0
##Accuracy Calculation for Test Data
(2533+65)/nrow(test1)
## [1] 0.9265335
Random forest accuracy did not improve for spam / ham classification when the model was given additional data (a larger n). Improving the model would likely require changes beyond simply adding more observations; one possible alternative is sketched at the end of this writeup.
For the smaller data set, accuracy against the training data was 92.07% and accuracy against the test data was 92.23%.

Type | Count
---- | -----
ham  | 2551
spam | 1396
For the larger data set, accuracy against the training data was 92.14% and accuracy against the test data was 92.65%.

Type | Count
---- | -----
ham  | 6951
spam | 2398
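One alternative to simply adding more data, sketched below but not run here, is to weight the document-term matrix by TF-IDF before pruning so that ubiquitous header terms count for less than distinctive ones.
#TF-IDF weighted document term matrix (sketch only; not run in this analysis)
dtm_tfidf <- DocumentTermMatrix(corpus1, control = list(weighting = weightTfIdf))
sparsedtm_tfidf <- removeSparseTerms(dtm_tfidf, 0.95)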