This project builds a predictive classifier that labels an email as spam or ham. A machine learning model is trained on part of the corpus posted at https://spamassassin.apache.org/old/publiccorpus/ and is then used to predict whether new emails are spam or ham.
Let's read the spam emails from the local directory. The spam emails were downloaded from 20030228_spam.tar.bz2 at https://spamassassin.apache.org/old/publiccorpus/
Read all the spam email contents from the dump and prepare a data frame with a msg column holding the message content and a class column indicating whether the message is spam or ham:
Class = 1 => SPAM, Class = 0 => HAM
Define a function to extract the message body from the email content, because we don't want to include the words that appear in the header. In these files the header and body are separated by the first blank line.
# define a function to extract the message body from the email content:
# the header ends at the first blank line ("\n\n"), so everything
# after that split is treated as the body
library(stringr)   # str_split
library(dplyr)     # %>%
get_email_body <- function(emailContent){
  msge <- str_split(emailContent, "\n\n") %>% unlist()
  body <- paste(msge[2:length(msge)], collapse = ' ')
  return(body)
}
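A quick check on a toy message (sample_email is just an illustrative string): everything after the first blank line should come back as the body.
sample_email <- "From: a@b.com\nSubject: hello\n\nHi there,\nthis is the body."
get_email_body(sample_email)
# expected: "Hi there,\nthis is the body."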
Reading the spam emails
dir="C:/Users/Charls/Documents/CunyMSDS/607-Data Acquiction/project-4/dataset/spam/"
filename = list.files(dir)
msgContent<-NA
for(i in 1:length(filename))
{
filepath<-paste0(dir,filename[i])
emailContent <-suppressWarnings(warning(readtext(filepath)))
msg <- get_email_body(emailContent)
msg <- gsub("<.*?>", " ", msg)
eachMsg<- list(paste(msg, collapse="\n"))
msgContent = c(msgContent,eachMsg)
}
spam <- as.data.frame(unlist(msgContent), stringsAsFactors = FALSE)
spam$class <- 1
colnames(spam) <- c("msg", "class")
Total number of observations for ‘SPAM’ emails:
nrow(spam)
## [1] 502
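As an aside, the same read can be written without growing msgContent inside a loop; a sketch under the same assumptions (spam2 is a hypothetical name; it has no NA placeholder row, so it would hold 501 rows, one per file):
files <- list.files(dir, full.names = TRUE)
msgs <- sapply(files, function(f) {
  body <- get_email_body(suppressWarnings(readtext(f))$text)
  gsub("<.*?>", " ", body)   # strip HTML tags, as in the loop above
})
spam2 <- data.frame(msg = unname(msgs), class = 1, stringsAsFactors = FALSE)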
Read the ham emails in the same way.
dir="C:/Users/Charls/Documents/CunyMSDS/607-Data Acquiction/project-4/dataset/easy_ham/"
filename = list.files(dir)
msgContent<-NA
for(i in 1:length(filename))
{
filepath<-paste0(dir,filename[i])
emailContent <-suppressWarnings(warning(readtext(filepath)))
msg <- get_email_body(emailContent)
msg <- gsub("<.*?>", " ", msg)
eachMsg<- list(paste(msg, collapse="\n"))
msgContent = c(msgContent,eachMsg)
}
ham <- as.data.frame(unlist(msgContent), stringsAsFactors = FALSE)
ham$class <- 0
colnames(ham) <- c("msg", "class")
Total number of observations for ‘HAM’ emails:
nrow(ham)
## [1] 2502
Merge the ‘HAM’ and ‘SPAM’ datasets into a final data frame.
mergeDataSet<-rbind(spam,ham)
nrow(mergeDataSet)
## [1] 3004
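As a sanity check, the class counts in the merged data frame should match the two totals above:
table(mergeDataSet$class)
# expected: 2502 ham (0) and 502 spam (1)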
# install.packages(c("tm", "SnowballC"))
library(tm)
## Loading required package: NLP
library(SnowballC)
corpus = VCorpus(VectorSource(mergeDataSet$msg))
corpus = tm_map(corpus, content_transformer(tolower))  # lowercase everything
corpus = tm_map(corpus, removeNumbers)                 # drop digits
corpus = tm_map(corpus, removePunctuation)             # drop punctuation
corpus = tm_map(corpus, removeWords, stopwords())      # drop common stop words
corpus = tm_map(corpus, stemDocument)                  # Porter-stem each word
corpus = tm_map(corpus, stripWhitespace)               # collapse extra spaces
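To see why truncated tokens such as ‘receiv’ and ‘pleas’ show up in the frequency tables later, here is the Porter stemmer from SnowballC applied to a few raw words (a standalone illustration, not part of the pipeline above):
wordStem(c("receiving", "received", "messages", "pleased"))
# stems to something like: "receiv" "receiv" "messag" "pleas"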
# Creating the bag-of-words model
dtm = DocumentTermMatrix(corpus)
# drop terms absent from more than 98% of the documents,
# i.e. keep only terms that appear in at least ~2% of them
dtm = removeSparseTerms(dtm, 0.98)
dataset = as.data.frame(as.matrix(dtm))
dataset$outputType = mergeDataSet$class   # append the label column
spamDF <- dataset %>% filter(outputType == 1)
nrow(spamDF)
## [1] 502
hamDF <- dataset %>% filter(outputType == 0)
nrow(hamDF)
## [1] 2502
head(dataset$outputType)
## [1] 1 1 1 1 1 1
nrow(dataset)
## [1] 3004
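Before splitting the frequencies by class, tm's findFreqTerms gives a quick look at which terms survived the sparsity filter; the threshold of 500 is an arbitrary choice for illustration:
frequent_terms <- findFreqTerms(dtm, lowfreq = 500)
head(frequent_terms, 20)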
These are the most frequent words in the spam email dataset. Words such as ‘free’ and ‘money’ rank high; truncated forms like ‘receiv’ and ‘pleas’ are stems produced by stemDocument, not spelling mistakes. The label column is excluded before summing.
spam_word_frequency <- colSums(spamDF %>% select(-outputType))
spam_word_frequency <- sort(spam_word_frequency, decreasing = TRUE)
spam_word_frequency[1:20]
##   email    will    nbsp    free  receiv     can 
##    1105     867     686     654     552     518 
##   money    list   pleas     get    name   order 
##     509     473     472     469     427     427 
##    busi   click    make    mail address  inform 
##     410     409     408     398     387     384 
##  report 
##     384
library(wordcloud)   # for the word clouds below
spam_words <- names(spam_word_frequency)
wordcloud(spam_words[1:50], spam_word_frequency[1:50])
These are the most frequent words in the ham email dataset.
ham_word_frequency <- colSums(hamDF %>% select(-outputType))
ham_word_frequency <- sort(ham_word_frequency, decreasing = TRUE)
ham_word_frequency[1:20]
##    use    can   will    get   list    one   mail   just   like messag 
##   2114   1523   1408   1402   1400   1336   1218   1211   1171   1096 
##   time   work  peopl  wrote   dont    new   date    now   make  email 
##   1076    990    950    923    901    895    869    816    800    783
ham_words <- names(ham_word_frequency)
wordcloud(ham_words[1:50], ham_word_frequency[1:50])
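The wordcloud call also accepts scale and color arguments; a purely cosmetic variant (RColorBrewer is an extra dependency, assumed installed):
library(RColorBrewer)
wordcloud(ham_words[1:50], ham_word_frequency[1:50],
          scale = c(4, 0.5), random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))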
We also need to shuffle the dataset before building the machine learning model.
set.seed(123)   # make the shuffle reproducible
shuffledDF <- dataset[sample(1:nrow(dataset)), ]
nrow(shuffledDF)
## [1] 3004
Split the complete dataset into a training set (80%) and a test set (20%).
# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
set.seed(123)
split = sample.split(shuffledDF$outputType, SplitRatio = 0.8)
training_set = subset(shuffledDF, split == TRUE)
test_set = subset(shuffledDF, split == FALSE)
Total number of training set observations:
nrow(training_set)
## [1] 2404
Total number of test set observations:
nrow(test_set)
## [1] 600
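Because sample.split stratifies on the label, both sets should keep the roughly 5:1 ham-to-spam ratio of the full data; a quick check:
table(training_set$outputType)
table(test_set$outputType)
# the test set should show about 500 ham and 100 spam,
# matching the confusion matrix totals below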
The number of term features (columns other than the label) in the bag-of-words matrix:
no_features <- ncol(training_set) - 1
no_features
## [1] 839
Fit a random forest classifier to the training set. The outputType label must be excluded from the predictor columns and passed separately as a factor, so that randomForest performs classification rather than regression.
# Fitting Random Forest Classification to the Training set
# install.packages('randomForest')
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
# predictors: the term columns only; the label is excluded from x and the
# response is a factor, so randomForest does classification, not regression
classifier = randomForest(x = training_set[ , 1:no_features],
                          y = as.factor(training_set$outputType),
                          ntree = 3)
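To see which terms the forest actually leans on, randomForest's importance measures can be inspected; a minimal sketch, assuming the classifier fit above:
imp <- importance(classifier)            # MeanDecreaseGini per term
head(imp[order(imp[, 1], decreasing = TRUE), , drop = FALSE], 10)
varImpPlot(classifier, n.var = 20)       # the same information as a plot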
Now we evaluate the classifier on the test data.
# Predicting the test set results (again excluding the label column)
y_pred = predict(classifier, newdata = test_set[ , 1:no_features])
# Making the confusion matrix: rows are predictions, columns are true labels
cm <- table(y_pred, test_set$outputType)
cm
##       
## y_pred   0   1
##      0 500   0
##      1   0 100
The accuracy of the model on the test set:
success <- sum(diag(cm))   # correct predictions lie on the diagonal
accuracy <- success / nrow(test_set) * 100
accuracy
## [1] 100
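Accuracy alone can be flattering here because ham outnumbers spam five to one in the test set. Precision and recall for the spam class, computed from the cm table above (rows are predictions, columns are true labels), are worth reporting too:
precision <- cm["1", "1"] / sum(cm["1", ])   # TP / (TP + FP)
recall    <- cm["1", "1"] / sum(cm[, "1"])   # TP / (TP + FN)
f1        <- 2 * precision * recall / (precision + recall)
c(precision = precision, recall = recall, f1 = f1)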