suppressMessages(suppressWarnings(library(tm)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(SnowballC)))
suppressMessages(suppressWarnings(library(tidyr)))
suppressMessages(suppressWarnings(library(RTextTools)))
suppressMessages(suppressWarnings(library(R.utils)))
suppressMessages(suppressWarnings(library(knitr)))
suppressMessages(suppressWarnings(library(data.table)))
suppressMessages(suppressWarnings(library(stringr)))
suppressMessages(suppressWarnings(library(pROC)))
suppressMessages(suppressWarnings(library(ROCR)))
# Training corpora: one directory of spam, one of ham.
spamdirec <- "C:/Users/DrKil/OneDrive/Documents/R/Data/spam"
hamdirec <- "C:/Users/DrKil/OneDrive/Documents/R/Data/easy_ham"
# Testing corpora.
ham2directory <- "C:/Users/DrKil/OneDrive/Documents/R/Data/easy_ham_2"
spam2directory <- "C:/Users/DrKil/OneDrive/Documents/R/Data/spam_2"

# List the message files in each directory, dropping the entry named "cmds"
# that sits alongside the messages and is not an e-mail.
spam_files <- list.files(spamdirec)
spam_files <- spam_files[spam_files != "cmds"]

ham_files <- list.files(hamdirec)
ham_files <- ham_files[ham_files != "cmds"]

spam2_files <- list.files(spam2directory)
spam2_files <- spam2_files[spam2_files != "cmds"]

ham2_files <- list.files(ham2directory)
ham2_files <- ham2_files[ham2_files != "cmds"]
# Return the body of a raw e-mail file as a single string.
#
# The body is everything after the first empty line (the header/body
# separator); body lines are joined with "\n".
#
# path: path to the message file.
# Returns a length-one character vector. If the file has no empty line, or the
# first empty line is the last line of the file, there is no body and "" is
# returned (previously these cases raised an error from seq()).
extractText <- function(path) {
  con <- file(path, open = "rt", encoding = "native.enc")
  # Close the connection even when readLines() fails part-way through.
  on.exit(close(con), add = TRUE)

  text <- readLines(con)

  # Position of the first empty line; NA when there is none.
  first_blank <- match("", text)
  if (is.na(first_blank) || first_blank >= length(text)) {
    return("")
  }

  paste(text[(first_blank + 1L):length(text)], collapse = "\n")
}
# Read the body of every message; each result is a character vector named by
# file name.
spam <- vapply(
  spam_files,
  function(p) extractText(file.path(spamdirec, p)),
  character(1)
)
## Warning in readLines(con): incomplete final line found on 'C:/Users/DrKil/
## OneDrive/Documents/R/Data/spam/00136.faa39d8e816c70f23b4bb8758d8a74f0'
ham <- vapply(
  ham_files,
  function(p) extractText(file.path(hamdirec, p)),
  character(1)
)
spam2 <- vapply(
  spam2_files,
  function(p) extractText(file.path(spam2directory, p)),
  character(1)
)
ham2 <- vapply(
  ham2_files,
  function(p) extractText(file.path(ham2directory, p)),
  character(1)
)

# One two-column frame per corpus: the message text and its class,
# coded 1 for SPAM and 0 for HAM.
spam.df <- as.data.frame(spam)
spam.df$Outcome <- 1
names(spam.df) <- c("Text", "Type")

ham.df <- as.data.frame(ham)
ham.df$Outcome <- 0
names(ham.df) <- c("Text", "Type")

df.2spam <- as.data.frame(spam2)
df.2spam$Outcome <- 1
names(df.2spam) <- c("Text", "Type")

df.2ham <- as.data.frame(ham2)
df.2ham$Outcome <- 0
names(df.2ham) <- c("Text", "Type")

# Stack training rows first and testing rows second, so the two sets can later
# be addressed by row range.
trainframe <- rbind(spam.df, ham.df)
trainsize <- nrow(trainframe)
testframe <- rbind(df.2spam, df.2ham)
testsize <- nrow(testframe)
cmbnd_frame <- rbind(trainframe, testframe)
cmbnd_frame_Type <- cmbnd_frame$Type
cmbnd_frame_Text <- cmbnd_frame$Text

# TF-IDF weighted document-term matrix over all messages.
DTM <- create_matrix(
  cmbnd_frame_Text,
  language = "english",
  minWordLength = 3,
  removeNumbers = TRUE,
  stemWords = FALSE,
  removePunctuation = TRUE,
  weighting = weightTfIdf
)
## Warning in weighting(x): empty document(s):
## ÐÏࡱ
# Wrap the matrix and labels in an RTextTools container: the first `trainsize`
# rows are used for training, the remaining rows for testing.
corpdoctm <- create_container(
  DTM,
  t(cmbnd_frame_Type),
  trainSize = seq_len(trainsize),
  testSize = (trainsize + 1):nrow(cmbnd_frame),
  virgin = FALSE
)

# Fit the SVM, classify the test rows and collect the per-document summary.
trainsvm <- train_model(corpdoctm, "SVM")
result <- classify_model(corpdoctm, trainsvm)
analytics <- create_analytics(corpdoctm, result)
trainsummary <- analytics@document_summary

# Split the summary by the manually assigned class (1 = spam, 0 = ham).
svm_spam <- trainsummary[trainsummary$MANUAL_CODE == 1, ]
svm_ham <- trainsummary[trainsummary$MANUAL_CODE == 0, ]

# Percentage of each class that was classified correctly / incorrectly.
svm_pos <- round(
  nrow(svm_spam[svm_spam$CONSENSUS_CODE == 1, ]) / nrow(svm_spam) * 100, 2
)
svm_falseneg <- round(
  nrow(svm_spam[svm_spam$CONSENSUS_CODE == 0, ]) / nrow(svm_spam) * 100, 2
)
svm_neg <- round(
  nrow(svm_ham[svm_ham$CONSENSUS_CODE == 0, ]) / nrow(svm_ham) * 100, 2
)
svm_falspos <- round(
  nrow(svm_ham[svm_ham$CONSENSUS_CODE == 1, ]) / nrow(svm_ham) * 100, 2
)

# Summarise the rates in a small table: one row per class, with the share
# classified correctly ("true") and incorrectly ("false").
svmpredictor <- tibble(
  Email = list("spam", "ham"),
  true = list(svm_pos, svm_neg),
  false = list(svm_falseneg, svm_falspos)
)
knitr::kable(svmpredictor)
## | Email | true | false |
## |---|---|---|
## | spam | 93.27 | 6.73 |
## | ham | 98.64 | 1.36 |
# ROC curve for the SVM on the test messages.
#
# An ROC curve needs the TRUE class of each document and a score that rises
# with the likelihood of the positive class (spam = 1). The previous call used
# PROBABILITY_CODE (a predicted label) as the truth, so the curve compared the
# classifier with itself.
#
# NOTE(review): per the RTextTools documentation SVM_PROB is the probability of
# the *predicted* label (SVM_LABEL), not of class 1, so it is converted to a
# spam score below -- confirm against the installed RTextTools version.
spam_score <- ifelse(
  as.character(trainsummary$SVM_LABEL) == "1",
  trainsummary$SVM_PROB,
  1 - trainsummary$SVM_PROB
)
# label.ordering = c(negative, positive): 0 is ham, 1 is spam.
predictor <- prediction(
  spam_score,
  trainsummary$MANUAL_CODE,
  label.ordering = c(0, 1)
)
ROC <- performance(predictor, measure = "tpr", x.measure = "fpr")
plot(ROC)