Loading necessary Packages

suppressMessages(suppressWarnings(library(tm)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(SnowballC)))
suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(RTextTools)))
suppressMessages(suppressWarnings(library(R.utils)))
suppressMessages(suppressWarnings(library(knitr)))
suppressMessages(suppressWarnings(library(data.table)))
suppressMessages(suppressWarnings(library(stringr)))
suppressMessages(suppressWarnings(library(pROC)))
suppressMessages(suppressWarnings(library(ROCR)))

Now we set up the directories and clean the file lists used to create the model.

# Training corpora
spamdirec <- "C:/Users/DrKil/OneDrive/Documents/R/Data/spam"
hamdirec <- "C:/Users/DrKil/OneDrive/Documents/R/Data/easy_ham"
# Testing corpora
ham2directory <- "C:/Users/DrKil/OneDrive/Documents/R/Data/easy_ham_2"
spam2directory <- "C:/Users/DrKil/OneDrive/Documents/R/Data/spam_2"

# List the entries of each directory and drop the stray "cmds" entry that
# sits alongside the messages.
spam_files <- list.files(spamdirec)
spam_files <- spam_files[spam_files != "cmds"]

spam2_files <- list.files(spam2directory)
spam2_files <- spam2_files[spam2_files != "cmds"]

ham2_files <- list.files(ham2directory)
ham2_files <- ham2_files[ham2_files != "cmds"]

ham_files <- list.files(hamdirec)
ham_files <- ham_files[ham_files != "cmds"]

Function we will use to extract the text in all the emails.

# Read one raw e-mail file and return its body as a single string.
#
# The body is every line after the first empty line (the header/body
# separator); the body lines are re-joined with "\n".
#
# path: path of the message file to read.
# Returns a length-one character vector. It is "" when the file contains no
# empty separator line, or when nothing follows the separator.
extractText <- function(path) {
  con <- file(path, open = "rt", encoding = "native.enc")
  # Release the connection even if readLines() fails part-way through.
  on.exit(close(con), add = TRUE)
  # warn = FALSE: a missing final newline (or embedded nul) is not worth a
  # warning for every affected message.
  text <- readLines(con, warn = FALSE)

  blank <- which(text == "")
  # No separator at all, or the separator is the very last line: no body.
  if (length(blank) == 0L || blank[1] >= length(text)) {
    return("")
  }

  msg <- text[seq(blank[1] + 1L, length(text))]
  paste(msg, collapse = "\n")
}

Extract texts and place into Data Frames

# Read the body of every message into a character vector named by file name.
# vapply() pins the result to one string per file, so the result is a
# character vector even for an empty file list (sapply() would return a list).
spam <- vapply(
  spam_files,
  function(p) extractText(file.path(spamdirec, p)),
  character(1)
)
## Warning in readLines(con): incomplete final line found on 'C:/Users/DrKil/
## OneDrive/Documents/R/Data/spam/00136.faa39d8e816c70f23b4bb8758d8a74f0'
ham <- vapply(
  ham_files,
  function(p) extractText(file.path(hamdirec, p)),
  character(1)
)
spam2 <- vapply(
  spam2_files,
  function(p) extractText(file.path(spam2directory, p)),
  character(1)
)
ham2 <- vapply(
  ham2_files,
  function(p) extractText(file.path(ham2directory, p)),
  character(1)
)

# One single-column data frame per corpus (row names are the file names).
spam.df <- as.data.frame(spam)
ham.df <- as.data.frame(ham)
df.2spam <- as.data.frame(spam2)
df.2ham <- as.data.frame(ham2)

To make our SVM model work, we need to create the classification for emails as either “0” or “1” as this will help with accuracy (binary classification)

#Setting the outcomes as 1 for SPAM and 0 for HAM.
spam.df$Outcome=1
ham.df$Outcome=0
df.2spam$Outcome=1
df.2ham$Outcome=0
names(spam.df)=c("Text","Type")
names(ham.df)=c("Text","Type")
names(df.2spam)=c("Text","Type")
names(df.2ham)=c("Text","Type")

Differentiating between Test and Training and then combining them

# Training set: spam followed by ham.
trainframe <- rbind(spam.df, ham.df)
# Test set: spam_2 followed by easy_ham_2.
testframe <- rbind(df.2spam, df.2ham)

trainsize <- nrow(trainframe)
testsize <- nrow(testframe)

# Stack training rows first, then test rows, and pull out the two columns.
cmbnd_frame <- rbind(trainframe, testframe)
cmbnd_frame_Text <- cmbnd_frame[["Text"]]
cmbnd_frame_Type <- cmbnd_frame[["Type"]]

Document-term matrices are useful for applying statistical methods to text documents, so we create one here and then build the container for the corpus.

# Document-term matrix over all message bodies, weighted with weightTfIdf.
DTM <- create_matrix(
  cmbnd_frame_Text,
  language = "english",
  minWordLength = 3,
  removeNumbers = TRUE,
  removePunctuation = TRUE,
  stemWords = FALSE,
  weighting = weightTfIdf
)
## Warning in weighting(x): empty document(s): 
## ÐÏࡱ
# Container for the matrix and labels: rows 1..trainsize are passed as the
# training range, the remaining rows as the test range.
corpdoctm <- create_container(
  DTM,
  t(cmbnd_frame_Type),
  trainSize = 1:trainsize,
  testSize = (trainsize + 1):nrow(cmbnd_frame),
  virgin = FALSE
)

Now we will use the Support Vector Machine supervised learning model to classify emails in the test set as ham or spam. The model is great for datasets where the distribution is unknown.

# Train an SVM on the container, classify with it, and keep the
# per-document summary produced by create_analytics().
trainsvm <- train_model(corpdoctm, "SVM")
result <- classify_model(corpdoctm, trainsvm)
analytics <- create_analytics(corpdoctm, result)
trainsummary <- slot(analytics, "document_summary")

# Percentage (two decimals) of the rows of `whole` that are also in `part`.
share_pct <- function(part, whole) {
  round(nrow(part) / nrow(whole) * 100, 2)
}

# Split the summary by the manual code (1 = spam, 0 = ham).
svm_spam <- trainsummary[trainsummary$MANUAL_CODE == 1, ]
svm_ham <- trainsummary[trainsummary$MANUAL_CODE == 0, ]

# Spam rows whose consensus code is 1 (matches) or 0 (misses).
svm_pos <- share_pct(svm_spam[svm_spam$CONSENSUS_CODE == 1, ], svm_spam)
svm_falseneg <- share_pct(svm_spam[svm_spam$CONSENSUS_CODE == 0, ], svm_spam)
# Ham rows whose consensus code is 0 (matches) or 1 (misses).
svm_neg <- share_pct(svm_ham[svm_ham$CONSENSUS_CODE == 0, ], svm_ham)
svm_falspos <- share_pct(svm_ham[svm_ham$CONSENSUS_CODE == 1, ], svm_ham)

# Collect the per-class rates in a tibble and render it as a table.
svmpredictor <- tibble(
  Email = list("spam", "ham"),
  true = list(svm_pos, svm_neg),
  false = list(svm_falseneg, svm_falspos)
)
knitr::kable(svmpredictor)
Email true false
spam 93.27 6.73
ham 98.64 1.36

The best way to see the effectiveness of the SVM is to plot the Receiver Operating Characteristic (ROC) curve, which illustrates the diagnostic ability of a binary classifier.

# ROC curve for the SVM: a per-document spam score against the true label.
#
# NOTE(review): this relies on the RTextTools document-summary layout, where
# SVM_LABEL is the predicted class, SVM_PROB is the probability of that
# predicted class, and MANUAL_CODE is the true label -- confirm against the
# installed RTextTools version. The earlier call used SVM_PROB against
# PROBABILITY_CODE, i.e. a predicted label rather than the true one.
svm_says_spam <- as.character(trainsummary$SVM_LABEL) == "1"
# Turn "probability of the predicted class" into "probability of spam".
spam_score <- ifelse(
  svm_says_spam,
  trainsummary$SVM_PROB,
  1 - trainsummary$SVM_PROB
)
predictor <- prediction(spam_score, trainsummary$MANUAL_CODE)
ROC <- performance(predictor, measure = "tpr", x.measure = "fpr")
plot(ROC)

We have developed a model here that correctly identifies legitimate (ham) emails about 99% of the time and spam emails just over 93% of the time.