Data607_Project4

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

Loading Libraries

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(tm)

## Loading required package: NLP
## 
## Attaching package: 'NLP'
## 
## The following object is masked from 'package:ggplot2':
## 
##     annotate

library(quanteda)

## Package version: 3.2.3
## Unicode version: 13.0
## ICU version: 69.1
## Parallel computing: 16 of 16 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## 
## The following object is masked from 'package:tm':
## 
##     stopwords
## 
## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-

library(quanteda.textmodels)
library(RColorBrewer)
library(knitr)
library(RTextTools)

## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

library(wordcloud)
library(e1071)

Read the spam and ham data into DF Read the spam and ham folder files into a R Dataframe object

# Create Ham Dataframe
ham_dir="C:/Users/Ivan/OneDrive/Desktop/spamham/All_ham"
hamFileNames = list.files(ham_dir)

# List of docs
ham_docs_list <- NA
for(i in 1:length(hamFileNames))
{
  filepath<-paste0(ham_dir, "/", hamFileNames[1])  
  text <-readLines(filepath)
  list1<- list(paste(text, collapse="\n"))
  ham_docs_list = c(ham_docs_list,list1)
  
}

# ham data frame
hamDF <-as.data.frame(unlist(ham_docs_list),stringsAsFactors = FALSE)
hamDF$type <- "ham"
colnames(hamDF) <- c("text","type")

# Create Spam Dataframe
spam_dir="C:/Users/Ivan/OneDrive/Desktop/spamham/All_spam"
spamFileNames = list.files(spam_dir)

spam_docs_list <- NA
for(i in 1:length(spamFileNames))
{
  filepath<-paste0(spam_dir, "/", spamFileNames[1])  
  text <-readLines(filepath)
  list1<- list(paste(text, collapse="\n"))
  spam_docs_list = c(spam_docs_list,list1)
  
}

spamDF <-as.data.frame(unlist(spam_docs_list),stringsAsFactors = FALSE)
spamDF$type <- "spam"
colnames(spamDF) <- c("text","type")


# creating combined data frame of spam and ham
spam_ham_df <- rbind(hamDF, spamDF)

Prepare the Corpus Create email corpus from the dataframe after clean up. Clean up process involves the following activities: Create Corpus dataset Removing numbers Removing punctuation Removing stopwords - remove common non-content words, like to, and, the, etc. These are called stop words in the lingo. The function stopwords reports a list of about 175 such words Removing excess white space

# Create email Corpus from email text Vector (tm package)
emailCorpus <- VCorpus(VectorSource(spam_ham_df$text))
cleanCorpus <- tm_map(emailCorpus, removeNumbers)
cleanCorpus <- tm_map(cleanCorpus, removePunctuation)
cleanCorpus <- tm_map(cleanCorpus, removeWords, stopwords())
cleanCorpus <- tm_map(cleanCorpus, stripWhitespace)

Create Document-term matrix for spam and ham emails (Tokenize the Corpus). Also create wordcloud for both datasets.

# Create documen-term matrix for ham and spam emails
# documenterm matrix is the mathematical maxtrix that describes the frequency of terms that occurs in a collection of documents
email_dtm <- DocumentTermMatrix(cleanCorpus)

# spam word cloud
spam_indices <- which(spam_ham_df$type == "spam")
suppressWarnings(wordcloud(cleanCorpus[spam_indices], min.freq=40))

# ham word cloud
ham_indices <- which(spam_ham_df$type == "ham")
suppressWarnings(wordcloud(cleanCorpus[ham_indices], min.freq=50))

Prepare test and train data Create test and train data. Also use a naive Bayes classifier to build a spam filter based on the words in the mails First, we set up a data frame with the previously built Document Term Matrix. I also included a field that would categorize each row or document as spam or not. The spam column was changed into a factor. I choose to sample 70% of the data for training and 30% for testing in order to divide the data into these two categories. We also discovered the ham-to-spam ratios.

# Model to assess spam and ham

# sample 70% data traning and 30 % for prediction

sample_size <- floor(0.70 * nrow(spam_ham_df))

# set the seed to make your partition reproductible
set.seed(1234)
train_ind <- sample(seq_len(nrow(spam_ham_df)), size = sample_size)

train_spam_ham <- spam_ham_df[train_ind, ]
test_spam_ham <- spam_ham_df[-train_ind, ]

# count of spam and ham in train data set
spam<-subset(train_spam_ham,train_spam_ham$type == "spam")
ham<-subset(train_spam_ham,train_spam_ham$type == "ham")


# Create corpus for training and test data
train_email_corpus <- VCorpus(VectorSource(train_spam_ham$text))
test_email_corpus <- VCorpus(VectorSource(test_spam_ham$text))

train_clean_corpus <- tm_map(train_email_corpus ,removeNumbers)
test_clean_corpus <- tm_map(test_email_corpus, removeNumbers)

train_clean_corpus <- tm_map(train_clean_corpus, removePunctuation)
test_clean_corpus <- tm_map(test_clean_corpus, removePunctuation)

train_clean_corpus <- tm_map(train_clean_corpus, removeWords, stopwords())
test_clean_corpus  <- tm_map(test_clean_corpus, removeWords, stopwords())

train_clean_corpus<- tm_map(train_clean_corpus, stripWhitespace)
test_clean_corpus<- tm_map(test_clean_corpus, stripWhitespace)

train_email_dtm <- DocumentTermMatrix(train_clean_corpus)
test_email_dtm <- DocumentTermMatrix(test_clean_corpus)

# count function
convert_count <- function(x) {
  y <- ifelse(x > 0, 1,0)
  y <- factor(y, levels=c(0,1), labels=c("No", "Yes"))
  y
}

train_sms <- apply(train_email_dtm, 2, convert_count)
test_sms <- apply(test_email_dtm, 2, convert_count)

# classification of email
classifier <- naiveBayes(train_sms, factor(train_spam_ham$type))

Predict using test data

test_pred <- predict(classifier, newdata=test_sms)

table(test_pred, test_spam_ham$type)

##          
## test_pred  ham spam
##      ham  1174    0
##      spam    1  566

Conclusion I were successful in classifying about 98% of the emails appropriately, as demonstrated by testing the Predict using test data. Additionally, there is a 40% sensitivity rating, which indicates that 40% of the spam emails were accurately classified, and a 35% specificity rate, which indicates that 35% of the ham emails were correctly classified.

Data607_Project4

IvanTikhonov

2022-11-16