###Load Data
library(tidyr)
library(naivebayes)
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
library(tm)
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 4.0.3
Due to the size of the files, I had to load them locally, as they would be too unwieldy to upload to GitHub. They were obtained from the SpamAssassin website provided to us in the project guidelines.
# Local directories that hold the downloaded ham and spam message files.
ham_filepath <- "C:\\Users\\Oluwaseyi\\Documents\\Data 607\\spammy hammer\\20021010_easy_ham\\easy_ham"
spam_filepath <- "C:\\Users\\Oluwaseyi\\Documents\\Data 607\\spammy hammer\\20050311_spam_2\\spam_2"

# Every entry found in each directory.
ham_filenames <- list.files(ham_filepath)
spam_filenames <- list.files(spam_filepath)

# Keep everything except the entry named "cmds".
ham <- ham_filenames[ham_filenames != "cmds"]
spam <- spam_filenames[spam_filenames != "cmds"]

# Preview the first few file names of each set.
head(ham)
## [1] "0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "0004.e8d5727378ddde5c3be181df593f1712"
## [5] "0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "0006.ee8b0dba12856155222be180ba122058"
head(spam)
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
##Process Data
# Build a cleaned tm corpus from the message files in one directory.
#
# dir_path:   directory that contains the raw message files.
# file_names: names of the files inside `dir_path` to read.
#
# Returns a VCorpus with one document per file; each document's content is
# the character vector of lines returned by readLines(), after cleaning.
build_clean_corpus <- function(dir_path, file_names) {
  messages <- lapply(file.path(dir_path, file_names), readLines)
  corpus <- VCorpus(VectorSource(messages))

  # Lower-case first: removeWords() matches case-sensitively and stopwords()
  # is a lower-case list, so "The" or "AND" would otherwise be kept.
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Remove stop words while apostrophes are still present, so contractions
  # in the stop-word list can still match.
  corpus <- tm_map(corpus, removeWords, stopwords())
  # Drop punctuation and digits before stemming so tokens such as "offers,"
  # reach the stemmer as bare words.
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stemDocument)
  # Collapse whitespace last; the removals above leave gaps behind.
  tm_map(corpus, stripWhitespace)
}

# Read only the filtered file lists (`ham` / `spam`) so that the "cmds" entry
# excluded earlier is not pulled back in as if it were a message.
hamcorpus <- build_clean_corpus(ham_filepath, ham)
spamcorpus <- build_clean_corpus(spam_filepath, spam)

# Combined corpus holding the ham documents followed by the spam documents.
spammyhammer <- c(hamcorpus, spamcorpus)
##Creating dataframe
# Convert a tm corpus into a data frame with exactly one row per document.
#
# corpus: a tm corpus whose documents hold character content.
# label:  value written to the `spam` column of every row.
#
# Returns a data frame with columns `text` (the document's lines joined by a
# single space) and `spam` (the label).
corpus_to_df <- function(corpus, label) {
  # unlist() on a corpus would emit one element per line and per metadata
  # field; collapsing each document's content keeps one row per message.
  doc_text <- vapply(
    corpus,
    function(doc) paste(content(doc), collapse = " "),
    character(1),
    USE.NAMES = FALSE
  )
  data.frame(
    text = doc_text,
    spam = rep(label, length(doc_text)),
    stringsAsFactors = FALSE
  )
}

df_H <- corpus_to_df(hamcorpus, "ham")
df_S <- corpus_to_df(spamcorpus, "spam")
df_HS <- rbind(df_H, df_S)

# Recode only the label column (spam = 1, ham = 0). Matching against the
# whole data frame would also overwrite any message text equal to "spam"
# or "ham".
df_HS$spam <- as.integer(df_HS$spam == "spam")
##Training
# Row identifier used to separate the training rows from the test rows.
df_HS$id <- seq_len(nrow(df_HS))

# Fixed (arbitrary) seed so the random 50/50 split is reproducible.
set.seed(607)

# Use 50% of the dataset as the training set and the remaining 50% as the
# testing set: sample half of the rows, then keep every row whose id was not
# sampled.
train <- df_HS %>% dplyr::sample_frac(0.5)
test <- dplyr::anti_join(df_HS, train, by = "id")

train_corpus <- Corpus(VectorSource(train$text))
test_corpus <- Corpus(VectorSource(test$text))

# Full document-term matrices over each split's own vocabulary.
train_tdm <- DocumentTermMatrix(train_corpus)
test_tdm <- DocumentTermMatrix(test_corpus)

# Lower bound on a term's total count in the training matrix for it to be
# kept as a feature.
min_term_count <- 105
freq <- findFreqTerms(train_tdm, min_term_count)

# Rebuild both matrices against the same dictionary so the training and test
# features line up column for column.
train_tdm_2 <- DocumentTermMatrix(train_corpus, control = list(dictionary = freq))
test_tdm_2 <- DocumentTermMatrix(test_corpus, control = list(dictionary = freq))

# Dense data frames of term counts, as passed to the classifier below.
train_tdm_3 <- as.data.frame(as.matrix(train_tdm_2))
test_tdm_3 <- as.data.frame(as.matrix(test_tdm_2))

# NOTE(review): the feature columns are numeric counts, so naive_bayes()
# presumably models them as Gaussian; multinomial_naive_bayes() may suit
# term counts better. Confirm before changing.
pred <- naive_bayes(train_tdm_3, factor(train$spam))
##test
# Predict a class for every test document from its term-count features.
test_pred <- predict(pred, newdata = test_tdm_3)

# Confusion matrix of predictions against the true labels. The true labels
# live in test$spam; test_tdm_3[, 1] is only the count column of the first
# dictionary term.
# NOTE: the table recorded below this call came from an earlier run and will
# differ from what this call now prints.
table(predicted = test_pred, actual = test$spam)
## actual
## predicted 0 1
## 0 59448 2687
## 1 162644 542
This was tricky to get right because of the compute required, but this is the prediction I was able to generate.