In this project, we will build a document classifier using the easy
ham and spam data from the SpamAssassin public corpus
(spamassassin.apache.org). To do that, we will read in the data, clean
it, and create a "bag of words" representation. We will then use that
representation to train a Naive Bayes model that classifies new
documents as spam or ham.
library(tm)
## Warning: package 'tm' was built under R version 4.2.2
## Loading required package: NLP
library(corpus)
## Warning: package 'corpus' was built under R version 4.2.2
library(readr)
library(readtext)
## Warning: package 'readtext' was built under R version 4.2.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ dplyr 1.0.10
## ✔ tibble 3.1.8 ✔ stringr 1.4.1
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::annotate() masks NLP::annotate()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(klaR)
## Warning: package 'klaR' was built under R version 4.2.2
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(caTools)
## Warning: package 'caTools' was built under R version 4.2.2
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.2.2
## Loading required package: RColorBrewer
library(e1071)
## Warning: package 'e1071' was built under R version 4.2.2
Loading the ham and spam files from SpamAssassin and removing the
needless cmds index files that ship with the corpus. A corpus is then
created out of the remaining files.
#Creating directories
hamdir <- "C:\\Users\\John Ledesma\\Desktop\\New folder (2)\\easy_ham"
spamdir <- "C:\\Users\\John Ledesma\\Desktop\\New folder (2)\\spam"
easy_ham_files <- list.files(hamdir)
spam_files <- list.files(spamdir)
#Removing the cmds index files
spam_files <- spam_files[which(spam_files != "cmds")]
easy_ham_files <- easy_ham_files[which(easy_ham_files != "cmds")]
#Reading Corpus, using the filtered file lists so cmds stays excluded
easy_ham_corpus <- hamdir %>%
  paste(easy_ham_files, sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()
spam_corpus <- spamdir %>%
  paste(spam_files, sep = "/") %>%
  lapply(readLines) %>%
  VectorSource() %>%
  VCorpus()
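As a quick sanity check, the corpus sizes should match the snapshot
counts used in the labeling loops below (2500 easy ham messages and 500
spam messages):
length(easy_ham_corpus)  #expect 2500 documents
length(spam_corpus)      #expect 500 documents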
Cleaning the corpus by removing numbers, punctuation, stop words, and
extra white space, stemming the remaining words, and then combining
both corpora into one.
easy_ham_corpus <- easy_ham_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace) %>%
tm_map(stemDocument)
spam_corpus <- spam_corpus %>%
tm_map(removeNumbers) %>%
tm_map(removePunctuation) %>%
tm_map(removeWords, stopwords()) %>%
tm_map(stripWhitespace) %>%
tm_map(stemDocument)
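Before combining, it is worth eyeballing the cleaned text. The snippet
below previews one message and draws a word cloud of the spam corpus
with the wordcloud package loaded above; the word limit is an arbitrary
choice.
#Preview the first 200 characters of one cleaned ham message
substr(paste(easy_ham_corpus[[1]]$content, collapse = " "), 1, 200)
#Most frequent spam terms after cleaning
wordcloud(spam_corpus, max.words = 40, random.order = FALSE)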
#Combining Corpus: spam first, then ham, so the row order matches the
#label dataframe built below
ham_or_spam_corpus <- c(spam_corpus, easy_ham_corpus)
spamdf <- data.frame()
hamdf <- data.frame()
for(i in 1:500){
  spamdf[i,1] = 'spam'
  spamdf[i,2] = paste(spam_corpus[[i]]$content, collapse = " ")
}
for(i in 1:2500){
  hamdf[i,1] = 'ham'
  hamdf[i,2] = paste(easy_ham_corpus[[i]]$content, collapse = " ")
}
#Rows 1-500 are spam, rows 501-3000 are ham, matching the corpus order
hamspamdf <- rbind(spamdf, hamdf)
colnames(hamspamdf) <- c('hamorspam', 'text')
hamspamdf$hamorspam <- factor(hamspamdf$hamorspam)
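A quick check that the labels line up with the corpus counts:
table(hamspamdf$hamorspam)  #expect 2500 ham and 500 spam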
Tokenizing the documents into a document-term matrix and subsetting it
into training and test matrices for fitting and evaluating the model.
Full_dtm <- DocumentTermMatrix(ham_or_spam_corpus)
#Dropping terms absent from 99.5% or more of documents
Full_dtm <- removeSparseTerms(Full_dtm, .995)
#Rows 300:700 straddle the spam/ham boundary at row 500, so the
#training window contains both classes
Train_dtm <- Full_dtm[300:700,]
Test_dtm <- Full_dtm[c(1:299,701:3000),]
#Keeping only terms that appear at least 5 times in the training data
frequentTerms <- findFreqTerms(Train_dtm, 5)
Train_Labels <- hamspamdf[300:700,]$hamorspam
Test_Labels <- hamspamdf[c(1:299,701:3000),]$hamorspam
Train_dtm_freq <- Train_dtm[, frequentTerms]
Test_dtm_freq <- Test_dtm[, frequentTerms]
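Checking the shapes of the filtered matrices (the column count depends
on how many terms clear the frequency cutoff, so it will vary between
corpus snapshots):
dim(Train_dtm_freq)  #401 documents by the number of frequent terms
dim(Test_dtm_freq)   #2599 documents by the same terms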
Splitting the label dataframe into training and test rows that match
the document-term matrix subsets above.
#split <- sample.split(hamspamdf, SplitRatio = .7)
traindata <- hamspamdf[300:700,]
testdata <- hamspamdf[c(1:299,701:3000),]
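Because the window is hand-picked rather than random, it is worth
confirming that both classes appear in the training rows:
table(traindata$hamorspam)  #expect 201 spam and 200 ham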
Converting the counts in the matrices into "Yes"/"No" strings:
naiveBayes() from e1071 models numeric features as Gaussian, which is a
poor fit for sparse word counts, so each count is recoded as a
categorical presence/absence indicator.
convert_counts <- function(x) {
  ifelse(x > 0, "Yes", "No")  #word present -> "Yes", absent -> "No"
}
mail_Train <- apply(Train_dtm_freq, MARGIN = 2, convert_counts)
mail_Test <- apply(Test_dtm_freq, MARGIN = 2, convert_counts)
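A glance at one corner of the converted training matrix (the term names
will vary with the corpus):
mail_Train[1:3, 1:5]  #rows are documents, entries are "Yes"/"No"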
Using the training matrix and its corresponding labels to train a
Naive Bayes classifier.
classifier <- naiveBayes(mail_Train, Train_Labels)
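The fitted model stores one conditional probability table per term, and
inspecting a single table shows how strongly that term separates the
classes. The term name below is a hypothetical example; pick any entry
of frequentTerms.
classifier$tables[["click"]]  #hypothetical term name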
Using our classifier to predict labels for the test data.
predicted <- predict(classifier, mail_Test)
summary(predicted)
## ham spam
## 2358 241
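To go beyond raw prediction counts, a confusion matrix against the true
test labels shows the error types directly, along with overall accuracy
(output omitted here, as it depends on the run):
table(Predicted = predicted, Actual = Test_Labels)
mean(predicted == Test_Labels)  #overall accuracy on the test set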
Conclusion:
Based on the prediction, 241 of the 2599 test documents (about 9.3%)
were classified as spam. This is off, as the test set is actually about
11.5% spam (299 of 2599) and the full data set is 16.67% spam (500 of
3000). It seems our model can be improved upon further. One suggestion
would be to remove words that are common to both classes, so the
remaining features discriminate between ham and spam more sharply.