Project 4 Oluwaseyi Jimoh

###Load Data

library(tidyr)
library(naivebayes)

## naivebayes 1.0.0 loaded

## For more information please visit:

## https://majkamichal.github.io/naivebayes/

library(tm)

## Loading required package: NLP

## Warning: package 'NLP' was built under R version 4.0.3

Due to the size of the files I had to load the files locally as they would be too unweildy to get into githib, They were obtained from the spamm assasin website provided to us in the project guidelines

ham_filepath <- "C:\\Users\\Oluwaseyi\\Documents\\Data 607\\spammy hammer\\20021010_easy_ham\\easy_ham"

spam_filepath <- "C:\\Users\\Oluwaseyi\\Documents\\Data 607\\spammy hammer\\20050311_spam_2\\spam_2"

spam_filenames <- list.files(spam_filepath)
ham_filenames <- list.files(ham_filepath)

spam <- spam_filenames[which(spam_filenames!="cmds")]
ham <- ham_filenames[which(ham_filenames!="cmds")]

head(ham)

## [1] "0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "0004.e8d5727378ddde5c3be181df593f1712"
## [5] "0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "0006.ee8b0dba12856155222be180ba122058"

head(spam)

## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"

##Process Data

hamcorpus<- ham_filepath %>%
  paste(., list.files(.), sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()%>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)%>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation)

spamcorpus<- spam_filepath %>%
  paste(., list.files(.), sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()%>%
  tm_map(removeWords, stopwords()) %>%
  tm_map(stripWhitespace) %>%
  tm_map(stemDocument)%>%
  tm_map(removeNumbers) %>%
  tm_map(removePunctuation)

spammyhammer <- c(hamcorpus,spamcorpus)

##Creating dataframe

df_H <- as.data.frame(unlist(hamcorpus), type = "ham" , stringsAsFactors = FALSE)
df_H$type <- "ham"
colnames(df_H)=c("text", "spam")

df_S <- as.data.frame(unlist(spamcorpus), type = "spam" , stringsAsFactors = FALSE)
df_S$type <- "spam"
colnames(df_S)=c("text", "spam")

df_HS <- rbind(df_H, df_S)
df_HS[df_HS == "spam"] <- 1
df_HS[df_HS == "ham"] <- 0

##Training

df_HS$id <- 1:nrow(df_HS)

#Use 50% of dataset as training set and remaining 50% as testing set 
train <- df_HS %>% dplyr::sample_frac(0.5)
test  <- dplyr::anti_join(df_HS, train, by = 'id')

train_corpus <- Corpus(VectorSource(train$text))
test_corpus <- Corpus(VectorSource(test$text))
train_tdm <- DocumentTermMatrix(train_corpus)
test_tdm<-DocumentTermMatrix(test_corpus)
freq<- findFreqTerms(train_tdm, 105)
train_tdm_2<- DocumentTermMatrix(train_corpus, control=list(dictionary = freq))
test_tdm_2<- DocumentTermMatrix(test_corpus, control=list(dictionary = freq))
train_tdm_3 <- as.data.frame(as.matrix(train_tdm_2))
test_tdm_3 <- as.data.frame(as.matrix(test_tdm_2))

Prediction

pred <- naive_bayes(train_tdm_3, factor(train$spam))
##test

test_pred <- predict(pred, newdata=test_tdm_3)
table(predicted=test_pred,actual=test_tdm_3[,1])

##          actual
## predicted      0      1
##         0  59448   2687
##         1 162644    542

This was tricky to get correct due to the compute required but this is the predicition i was able to generate.

Project 4 Oluwaseyi Jimoh

Oluwaseyi Jimoh

2024-04-13

Prediction