Project 4 - Email Classifer

This Project is to build a predictive classifer to classify an email a spam or ham. A machine Learning model needs to be build from some of the dataset posted in https://spamassassin.apache.org/old/publiccorpus/ and the model would be used to predict the new dataset whether it is a spam and ham.

Reading Spam and Ham email content

Lets read spam emails from the local directory. Spam emails are downloaded from 20030228_spam.tar.bz2 in https://spamassassin.apache.org/old/publiccorpus/

Read all the spam email contents from dump and prepare a dataframe with column msg as the message content and the class as an indicator whether the message is spam or ham

Class = 1 => SPAM Class = 0 => HAM

Define a function to extract the message body from the email content. Because we dont need to include all the words which comes in the header.

# define a function to extract the message body from the email content. 

get_email_body <- function(emailContent){
  msge <- str_split(emailContent,"\n\n") %>% unlist()
  body <- paste(msge[2:length(msge)], collapse=' ' )
  return(body)
}

Reading the spam emails

dir="C:/Users/Charls/Documents/CunyMSDS/607-Data Acquiction/project-4/dataset/spam/"
filename = list.files(dir)

msgContent<-NA
for(i in 1:length(filename))
{
  filepath<-paste0(dir,filename[i])  
  emailContent <-suppressWarnings(warning(readtext(filepath)))
  msg <- get_email_body(emailContent)
  msg <- gsub("<.*?>", " ", msg)
  eachMsg<- list(paste(msg, collapse="\n"))
  msgContent = c(msgContent,eachMsg)
  
}
spam<-data.frame()
spam<-as.data.frame(unlist(msgContent),stringsAsFactors = FALSE)
spam$class<-1
colnames(spam)<-c("msg","class")

Total number of observation for ‘SPAM’ emails:

nrow(spam)
## [1] 502

Read Ham emails in the similar way.

dir="C:/Users/Charls/Documents/CunyMSDS/607-Data Acquiction/project-4/dataset/easy_ham/"
filename = list.files(dir)

msgContent<-NA
for(i in 1:length(filename))
{
  filepath<-paste0(dir,filename[i])  
  emailContent <-suppressWarnings(warning(readtext(filepath)))
  msg <- get_email_body(emailContent)
  msg <- gsub("<.*?>", " ", msg)
  eachMsg<- list(paste(msg, collapse="\n"))
  msgContent = c(msgContent,eachMsg)
  
}

ham<-data.frame()
ham<-as.data.frame(unlist(msgContent),stringsAsFactors = FALSE)
ham$class<-0
colnames(ham)<-c("msg","class")

Total number of observation for ‘HAM’ emails:

nrow(ham)
## [1] 2502

Merge the ‘HAM’ and ‘SPAM’ dataset into a final dataframe.

mergeDataSet<-rbind(spam,ham)
nrow(mergeDataSet)
## [1] 3004

Creating Corpus and cleaning the message text

  1. Convert into lower.
  2. removing numbers
  3. remove Punctuation.
  4. remove stop words using snowballC package.
  5. Stemming the words into its root word.
  6. stripping off the whit space created by the removal of words, numbers.
#install.packages(c("tm", "SnowballC"))
library(tm)
## Loading required package: NLP
library(SnowballC)
corpus = VCorpus(VectorSource(mergeDataSet$msg))
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removeNumbers)
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeWords, stopwords())
corpus = tm_map(corpus, stemDocument)
corpus = tm_map(corpus, stripWhitespace)

Creating the Bag of Words model

# Creating the Bag of Words model
dtm = DocumentTermMatrix(corpus)
# removing the most sparse terms
dtm = removeSparseTerms(dtm, 0.98)
dataset = as.data.frame(as.matrix(dtm))
dataset$outputType = mergeDataSet$class

spamDF <- dataset %>% filter(`outputType` == "1" )
nrow(spamDF)
## [1] 502
hamDF <- dataset %>% filter(`outputType` == "0" )
nrow(hamDF)
## [1] 2502
head(dataset$outputType)
## [1] 1 1 1 1 1 1
nrow(dataset)
## [1] 3004

word cloud for SPAM emails.

This is the most frequent words in the spam mail dataset. You can see the word ‘free’, ‘money’ and the words with spelling mistakes coming up.

spam_word_frequency <- colSums(spamDF)
spam_word_frequency <- sort(spam_word_frequency, decreasing = TRUE)
spam_word_frequency[1:20]
##      email       will       nbsp       free     receiv        can 
##       1105        867        686        654        552        518 
##      money outputType       list      pleas        get       name 
##        509        502        473        472        469        427 
##      order       busi      click       make       mail    address 
##        427        410        409        408        398        387 
##     inform     report 
##        384        384
spam_words <- names(spam_word_frequency)
wordcloud(spam_words[1:50], spam_word_frequency[1:50])

word cloud for Ham emails.

This is the most frequent words in the spam mail dataset.

ham_word_frequency <- colSums(hamDF)
ham_word_frequency <- sort(ham_word_frequency, decreasing = TRUE)
ham_word_frequency[1:20]
##    use    can   will    get   list    one   mail   just   like messag 
##   2114   1523   1408   1402   1400   1336   1218   1211   1171   1096 
##   time   work  peopl  wrote   dont    new   date    now   make  email 
##   1076    990    950    923    901    895    869    816    800    783
ham_words <- names(ham_word_frequency)
wordcloud(ham_words[1:50], ham_word_frequency[1:50])

Shuffling the dataset

Also we need to shuffle the dataset before buildin the machine learning model.

shuffledDF <- dataset[sample(1:nrow(dataset)),]
nrow(shuffledDF)
## [1] 3004

Spliting the dataset into training and test data set

Split the complete data set into training set(80%) and test set(20%)

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(shuffledDF$outputType, SplitRatio = 0.8)
training_set = subset(shuffledDF, split == TRUE)
test_set = subset(shuffledDF, split == FALSE)

Total number of training set observations:

nrow(training_set)
## [1] 2404

Total number of test set observations:

nrow(test_set)
## [1] 600

The total number of observations in the bag of word matrix.

no_observation <- ncol(training_set) - 1
no_observation
## [1] 839

Modeling using Random forest.

Fitting Random Forest Classification to the Training set

# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
classifier = randomForest(x = training_set[-no_observation],
                          y = training_set$outputType,
                          ntree = 3)
## Warning in randomForest.default(x = training_set[-no_observation], y =
## training_set$outputType, : The response has five or fewer unique values.
## Are you sure you want to do regression?

Predicting the new dataset.

Here we are using the test data.

# Predicting the Test set results
y_pred = predict(classifier, newdata = test_set[-no_observation])


# Making the Confusion Matrix

cm <- table(y_pred>0,test_set$outputType)
cm
##        
##           0   1
##   FALSE 500   0
##   TRUE    0 100

The accuracy of the model is

success <- cm['TRUE', 2] + cm['FALSE', 1] 
accuracy <- success/nrow(test_set) * 100
accuracy
## [1] 100