Overview

This project builds a Naive Bayes spam filter in R: labeled emails from the spam_2 and easy_ham_2 folders are read from disk, stripped of HTML, turned into a document-term matrix with the tm package, and split into training and test sets to fit and evaluate the classifier.

Cleaning Data

#Load required libraries
library(tidyverse)     #dplyr/tidyr verbs and pipes
library(magrittr)      #set_colnames
library(tm)            #corpus and document-term matrix
library(SnowballC)     #stemDocument
library(wordcloud)     #word clouds
library(RColorBrewer)  #brewer.pal palettes
library(e1071)         #naiveBayes
library(gmodels)       #CrossTable

#Load data
spam_dir <- "C:/Users/jlixa/Desktop/CUNY/Fall22/DATA607_Rep/Project4/Data/spam_2"
filenames_spam <- list.files(spam_dir, full.names = TRUE) #list every file; "*.*" is a glob, but list.files() expects a regex, so the pattern is dropped
length(filenames_spam)
## [1] 1396
ham_dir <- "C:/Users/jlixa/Desktop/CUNY/Fall22/DATA607_Rep/Project4/Data/easy_ham_2"
filenames_ham <- list.files(ham_dir, full.names = TRUE) #list every file (same regex note as above)
length(filenames_ham)
## [1] 1400
#HTML clean up function---------------------------------------------------------
cleanFun <- function(htmlString) {
  return(gsub("<.*?>", "", htmlString))
}
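
A quick sanity check on the tag-stripping regex (the non-greedy .*? stops at the first closing bracket, so text between tags survives); the sample string below is made up for illustration:

#Example: strip tags from a toy HTML snippet
cleanFun("<p>Buy <b>now</b> and save!</p>") #returns "Buy now and save!"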
spam <- filenames_spam |>
  as.data.frame() |>
  magrittr::set_colnames("file") |>
  mutate(text = lapply(filenames_spam, readLines)) |>      #read emails
  mutate(text = lapply(text, cleanFun)) |>                 #clean html tags
  unnest(c(text)) |>                                       #expand text by line
  mutate(spam = "yes") |>                                #create column labeling as spam
  group_by(file) |>                                        
  mutate(text = paste(text, collapse = " ")) |>            #use group() and paste() to combine strings         
  ungroup() |>
  distinct()

ham <- filenames_ham |>
  as.data.frame() |>
  magrittr::set_colnames("file") |>
  mutate(text = lapply(filenames_ham, readLines)) |>       #read emails
  mutate(text = lapply(text, cleanFun)) |>                 #clean html tags
  unnest(c(text)) |>                                       #expand text by line
  mutate(spam = "no") |>                                 #create column labeling as ham
  group_by(file) |>                                        
  mutate(text = paste(text, collapse = " ")) |>            #use group() and paste() to combine strings         
  ungroup() |>
  distinct()
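
Note: readLines() can warn about incomplete final lines or non-text bytes in raw email files. If those warnings get noisy, a hedged variant of the reader step (the helper name read_quiet is my own):

#Hypothetical helper: silence end-of-line warnings and use a permissive encoding
read_quiet <- function(f) readLines(f, warn = FALSE, encoding = "latin1")
#swap into the pipelines above: mutate(text = lapply(filenames_spam, read_quiet))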

#Combine ham and spam
df <- rbind(spam, ham)

# shuffle the dataframe by rows; the seed value is arbitrary, but fixing it
# makes the shuffle (and the index-based split below) reproducible
set.seed(607)
df <- df[sample(nrow(df)), ]

df$spam <- factor(df$spam)
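
A minimal check that the combined, shuffled frame looks right (the counts follow from the file listings above):

nrow(df)       #2796 = 1396 spam + 1400 ham
table(df$spam) #class counts should match: no = 1400, yes = 1396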

#transform data into text corpus and clean
corpus <- VCorpus(VectorSource(df$text)) |>
  tm_map(content_transformer(tolower)) |>      #lowercase
  tm_map(removeNumbers) |>                     #drop digits
  tm_map(removeWords, stopwords("english")) |> #drop stopwords
  tm_map(removePunctuation) |>
  tm_map(stemDocument) |>                      #stem words (SnowballC)
  tm_map(stripWhitespace)                      #collapse runs of spaces

lapply(corpus[1:3], as.character) #inspect the first three cleaned documents
## $`1`
## [1] "forkadminxentcom thu jul returnpath deliv yyyylocalhostnetnoteinccom receiv localhost localhost phoboslabsnetnoteinccom postfix esmtp id ded thu jul edt receiv phobo localhost imap fetchmail jmlocalhost singledrop thu jul ist receiv xentcom dogmaslashnullorg esmtp id goina wed jul receiv lairxentcom localhost xentcom postfix esmtp id dd wed jul pdt deliv forkspamassassintaintorg receiv catsucscedu catsmxucscedu xentcom postfix esmtp id d wed jul pdt receiv tycho dhcpcseucscedu catsucscedu smtp id goiku wed jul pdt jim whitehead subject re usa usa number six messageid mimevers contenttyp textplain charsetusascii contenttransferencod bit xprioriti normal xmsmailprior normal xmailer microsoft outlook imo build import normal repli xmimeol produc microsoft mimeol v xucsccatsmailscann found clean sender forkadminxentcom error forkadminxentcom xbeenther forkspamassassintaintorg xmailmanvers preced bulk listhelp listpost listsubscrib listid friend rohit khare listunsubscrib listarch date wed jul around world grow sens democraci deliv develop sakiko fukudaparr un report author oppos form govern jim httpxentcommailmanlistinfofork"
## 
## $`2`
## [1] "forkadminxentcom fri jul returnpath deliv yyyylocalhostnetnoteinccom receiv localhost localhost phoboslabsnetnoteinccom postfix esmtp id eadc fri jul edt receiv dogmaslashnullorg localhost imap fetchmail jmlocalhost singledrop fri jul ist receiv xentcom dogmaslashnullorg esmtp id gjfukj fri jul receiv lairxentcom localhost xentcom postfix esmtp id abb fri jul pdt deliv forkspamassassintaintorg receiv jaspernet jaspernet xentcom postfix esmtp id fri jul pdt receiv toadhcpapplecom sdsldslscamegapathnet jaspernet esmtp id iaa fri jul date fri jul subject re crop becom king contenttyp textplain charsetusascii formatflow mimevers appl messag framework v cc flatwar road kill rodent unusu size bill humphri repli messageid contenttransferencod bit xmailer appl mail sender forkadminxentcom error forkadminxentcom xmailmanvers preced bulk listid friend rohit khare xbeenther forkspamassassintaintorg friday juli rodent unusu size wrote day nation near millionacr field corn roll across countrysid like second great lawn wholesom american imag obscur decid dubious realiti mention health problem loud kid glow eye take away field httpwwwangelfirecomtncotcfanpag bill humphri httpwwwwhumpcommorelikethi httpxentcommailmanlistinfofork"
## 
## $`3`
## [1] "ilugadminlinuxi mon jul returnpath deliv yyyylocalhostnetnoteinccom receiv localhost localhost phoboslabsnetnoteinccom postfix esmtp id c mon jul edt receiv phobo localhost imap fetchmail jmlocalhost singledrop mon jul ist receiv webnotenet mailwebnotenet dogmaslashnullorg esmtp id grawri sat jul receiv lughtuathaorg rootlughtuathaorg webnotenet esmtp id laa sat jul receiv lugh rootlocalhost lughtuathaorg esmtp id laa sat jul receiv weasel lughtuathaorg esmtp id laa sat jul xauthenticationwarn lughtuathaorg host claim weasel receiv helo weasel smtp exim id yozlm sat jul contenttyp textplain charsetiso robert synnott liam bedford iluglinuxi subject re ilug ot oceanfre dial number date sat jul xmailer kmail version refer repli mimevers contenttransferencod bit messageid sender ilugadminlinuxi error ilugadminlinuxi xmailmanvers preced bulk listid irish linux user group xbeenther iluglinuxi actual though case sensit seem troubl wtih papchap authent well might script friday juli liam bedford wrote fri jul hamiltondavid hpirelandex claim think hi tri find oceanfre isdn dialup number dublin httpiiutaintorg appear least get want pay oceanfre ¤ per second tech support usernam oceanfre password oceanfre think usernam password matter much though l irish linux user group iluglinuxi httpwwwlinuxiemailmanlistinfoilug unsubscript inform list maintain listmasterlinuxi"
dtm <- DocumentTermMatrix(corpus) #rows = documents, columns = term counts
inspect(dtm)
## <<DocumentTermMatrix (documents: 2796, terms: 58499)>>
## Non-/sparse entries: 445747/163117457
## Sparsity           : 100%
## Maximal term length: 868
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   aug email esmtp jul localhost mon postfix receiv tue wed
##   1517   0    38     4   8         1   8       0     34   0   0
##   166    0     2     5   9         4   0       5      7   9   0
##   1937  11     1     1   0         5   2       1      9   4   0
##   2066   0    37     4   8         1   0       0     35   7   1
##   2188   0     4     1   5         3   0       1     11   0   0
##   2718   0    16     2   0         0   0       2     10   0   0
##   396    0    24     2   0         0   0       2      4   0   0
##   619   12     5     2  16         3   0       2      7   6   0
##   689    0     1     4  10         1   0       0      9   0   0
##   958    0    20     6  10         3   0       6     12   0  10
#findFreqTerms(dtm, 5) #uncomment to list terms appearing at least 5 times

#Drop sparse terms: keep only terms present in more than 30% of documents
dtm <- removeSparseTerms(dtm, 0.7)
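
To see how aggressive this cut-off is, compare dimensions before and after pruning; a minimal sketch (the pre-pruning size matches the inspect() output above):

#Sketch: effect of the sparsity threshold
dtm_full <- DocumentTermMatrix(corpus)
dim(dtm_full)                          #2796 documents x 58499 terms
dim(removeSparseTerms(dtm_full, 0.7))  #only terms in >30% of documents remain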

Split data into Training and Test

#split data
train <- dtm[1:1958, ]   #first ~70% of rows for training
test <- dtm[1959:2796, ] #remaining ~30% for testing
train_type <- df$spam[1:1958]   #labels for the training rows
test_type <- df$spam[1959:2796] #labels for the test rows

#checking training proportions
tbl_train <- prop.table(table(train_type))
tbl_train
## train_type
##        no       yes 
## 0.5107252 0.4892748
#check testing proportions
tbl_test <- prop.table(table(test_type))
tbl_test
## test_type
##       no      yes 
## 0.477327 0.522673
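
The two splits are roughly balanced only because the rows were shuffled first. For a guaranteed class-balanced split, caret's createDataPartition() does stratified sampling; a sketch (caret is an extra dependency not otherwise used here, and the _s names are mine, to avoid clobbering the split above):

#Sketch: stratified 70/30 split that preserves the spam/ham ratio
library(caret)
idx <- as.vector(createDataPartition(df$spam, p = 0.7, list = FALSE))
train_s <- dtm[idx, ]
test_s <- dtm[-idx, ]
train_type_s <- df$spam[idx]
test_type_s <- df$spam[-idx]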

Building Word Clouds

spamText <- subset(df, spam == "yes") # selecting spam texts
wordcloud(spamText$text, max.words = 50, scale = c(5, 0.3),
          random.order = FALSE, rot.per = 0.15, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

hamText <- subset(df, spam =="no") # selecting ham texts
wordcloud(hamText$text, max.words = 50, scale = c(5, 0.3),
          random.order = FALSE, rot.per = 0.15, colors = brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

freq_words <- findFreqTerms(train, 5) #keep terms appearing at least 5 times in training

freq_words_train <- train[ , freq_words]
freq_words_test <- test[ , freq_words]

# convert counts to categorical y/n indicators (naiveBayes treats features as categorical)
convert <- function(x) ifelse(x > 0, "y", "n")
train <- apply(freq_words_train, MARGIN = 2, convert)
test <- apply(freq_words_test, MARGIN = 2, convert)
str(train) # verifying the conversion
##  chr [1:1958, 1:60] "n" "n" "n" "y" "n" "n" "n" "n" "y" "n" "y" "n" "n" "n" ...
##  - attr(*, "dimnames")=List of 2
##   ..$ Docs : chr [1:1958] "1" "2" "3" "4" ...
##   ..$ Terms: chr [1:60] "aug" "bit" "bulk" "can" ...
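
A quick check of the conversion logic on a made-up count vector:

convert(c(0, 2, 5, 0)) #returns "n" "y" "y" "n"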

Training and Evaluating the Model

# Creating a Naive Bayes classifier
email_classifier <- naiveBayes(train, train_type)
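
e1071's naiveBayes() also accepts a laplace argument; add-one smoothing keeps a term that never co-occurs with one class in training from zeroing out the whole posterior. A variant worth trying:

#Variant: Laplace (add-one) smoothing for unseen term/class pairs
email_classifier2 <- naiveBayes(train, train_type, laplace = 1)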

# Making prediction & evaluation with the classifier
test_prediction <- predict(email_classifier, test)
CrossTable(test_prediction, test_type, 
           prop.chisq = FALSE, prop.t = FALSE,
           dnn = c('predicted', 'actual'))
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  838 
## 
##  
##              | actual 
##    predicted |        no |       yes | Row Total | 
## -------------|-----------|-----------|-----------|
##           no |       379 |        49 |       428 | 
##              |     0.886 |     0.114 |     0.511 | 
##              |     0.948 |     0.112 |           | 
## -------------|-----------|-----------|-----------|
##          yes |        21 |       389 |       410 | 
##              |     0.051 |     0.949 |     0.489 | 
##              |     0.052 |     0.888 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |       400 |       438 |       838 | 
##              |     0.477 |     0.523 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
#(379 + 389) / 838 = 0.916, so the model is ~92% accurate on the test set
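
The confusion matrix also gives precision and recall for the spam class, computed directly from the counts above:

#Summary metrics taken straight from the confusion matrix
accuracy  <- (379 + 389) / 838 #0.916
precision <- 389 / (389 + 21)  #0.949 of predicted spam really is spam
recall    <- 389 / (389 + 49)  #0.888 of actual spam was caught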

Resources

https://www.pluralsight.com/guides/building-classification-models-in-r
https://www.linkedin.com/pulse/detailed-naive-bayes-spam-filter-r-leonardo-anello/
https://www.r-bloggers.com/2013/07/document-classification-using-r/