# Builds `spamhamdocs`: one row per email file, with the raw message text in
# `body` and the label in `is_spam` (TRUE for files from spamdir, FALSE for
# files from hamdir).

# Reads every file in `files` (names relative to `dir`) and returns the message
# texts as a character vector, in the same order as `files`.
# file.path() replaces the hard-coded "\\" separator so the script is not tied
# to Windows paths.
read_email_bodies <- function(dir, files) {
  vapply(
    files,
    function(file) {
      # The second column of the readtext() result holds the document text;
      # warnings raised while reading are suppressed, as before.
      suppressWarnings(readtext(file.path(dir, file)))[[2]]
    },
    character(1),
    USE.NAMES = FALSE
  )
}

# File names of all the spam emails, then their contents
filenames <- list.files(spamdir)
spam_bodies <- read_email_bodies(spamdir, filenames)

# Same again for the ham emails
filenames <- list.files(hamdir)
ham_bodies <- read_email_bodies(hamdir, filenames)

# Assemble the data frame in one step instead of growing it row by row:
# spam rows first (labelled TRUE), then ham rows (labelled FALSE)
spamhamdocs <- data.frame(
  body = c(spam_bodies, ham_bodies),
  is_spam = rep(
    c(TRUE, FALSE),
    times = c(length(spam_bodies), length(ham_bodies))
  ),
  stringsAsFactors = FALSE
)

# Show the first document together with its label
spamhamdocs[1, 1:2]
## body
## 1 From 12a1mailbot1@web.de Thu Aug 22 13:17:22 2002\nReturn-Path: <12a1mailbot1@web.de>\nDelivered-To: zzzz@localhost.example.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.example.com (Postfix) with ESMTP id 136B943C32\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)\nReceived: from mail.webnote.net [193.120.211.219]\n\tby localhost with POP3 (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)\nReceived: from dd_it7 ([210.97.77.167])\n\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\n\tfor <zzzz@example.com>; Thu, 22 Aug 2002 13:09:41 +0100\nFrom: 12a1mailbot1@web.de\nReceived: from r-smtp.korea.com - 203.122.2.197 by dd_it7 with Microsoft SMTPSVC(5.5.1775.675.6);\n\t Sat, 24 Aug 2002 09:42:10 +0900\nTo: <dcek1a1@netsgo.com>\nSubject: Life Insurance - Why Pay More?\nDate: Wed, 21 Aug 2002 20:31:57 -1600\nMIME-Version: 1.0\nMessage-ID: <0103c1042001882DD_IT7@dd_it7>\nContent-Type: text/html; charset="iso-8859-1"\nContent-Transfer-Encoding: quoted-printable\n\n<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">\n<HTML><HEAD>\n<META content=3D"text/html; charset=3Dwindows-1252" http-equiv=3DContent-T=\nype>\n<META content=3D"MSHTML 5.00.2314.1000" name=3DGENERATOR></HEAD>\n<BODY><!-- Inserted by Calypso -->\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\nules=3Dnone \nstyle=3D"COLOR: black; DISPLAY: none" width=3D"100%">\n <TBODY>\n <TR>\n <TD colSpan=3D3>\n <HR color=3Dblack noShade SIZE=3D1>\n </TD></TR></TD></TR>\n <TR>\n <TD colSpan=3D3>\n <HR color=3Dblack noShade SIZE=3D1>\n </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\n --><FONT \ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 \nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=\n0000 
\nface=3D"Copperplate Gothic Bold" size=3D5 PTSIZE=3D"10">\n<CENTER>Why Spend More Than You Have To?\n<CENTER><FONT color=3D#ff0000 face=3D"Copperplate Gothic Bold" size=3D5 PT=\nSIZE=3D"10">\n<CENTER>Life Quote Savings\n<CENTER>\n<P align=3Dleft></P>\n<P align=3Dleft></P></FONT></U></I></B><BR></FONT></U></B></U></I>\n<P></P>\n<CENTER>\n<TABLE border=3D0 borderColor=3D#111111 cellPadding=3D0 cellSpacing=3D0 wi=\ndth=3D650>\n <TBODY></TBODY></TABLE>\n<TABLE border=3D0 borderColor=3D#111111 cellPadding=3D5 cellSpacing=3D0 wi=\ndth=3D650>\n <TBODY>\n <TR>\n <TD colSpan=3D2 width=3D"35%"><B><FONT face=3DVerdana size=3D4>Ensurin=\ng your \n family's financial security is very important. Life Quote Savings ma=\nkes \n buying life insurance simple and affordable. We Provide FREE Access =\nto The \n Very Best Companies and The Lowest Rates.</FONT></B></TD></TR>\n <TR>\n <TD align=3Dmiddle vAlign=3Dtop width=3D"18%">\n <TABLE borderColor=3D#111111 width=3D"100%">\n <TBODY>\n <TR>\n <TD style=3D"PADDING-LEFT: 5px; PADDING-RIGHT: 5px" width=3D"100=\n%"><FONT \n face=3DVerdana size=3D4><B>Life Quote Savings</B> is FAST, EAS=\nY and \n SAVES you money! Let us help you get started with the best val=\nues in \n the country on new coverage. You can SAVE hundreds or even tho=\nusands \n of dollars by requesting a FREE quote from Lifequote Savings. =\nOur \n service will take you less than 5 minutes to complete. Shop an=\nd \n compare. SAVE up to 70% on all types of Life insurance! 
\n</FONT></TD></TR>\n <TR><BR><BR>\n <TD height=3D50 style=3D"PADDING-LEFT: 5px; PADDING-RIGHT: 5px" \n width=3D"100%">\n <P align=3Dcenter><B><FONT face=3DVerdana size=3D5><A \n href=3D"http://website.e365.cc/savequote/">Click Here For Your=\n \n Free Quote!</A></FONT></B></P></TD>\n <P><FONT face=3DVerdana size=3D4><STRONG>\n <CENTER>Protecting your family is the best investment you'll eve=\nr \n make!<BR></B></TD></TR>\n <TR><BR><BR></STRONG></FONT></TD></TR></TD></TR>\n <TR></TR></TBODY></TABLE>\n <P align=3Dleft><FONT face=3D"Arial, Helvetica, sans-serif" size=3D2=\n></FONT></P>\n <P></P>\n <CENTER><BR><BR><BR>\n <P></P>\n <P align=3Dleft><BR></B><BR><BR><BR><BR></P>\n <P align=3Dcenter><BR></P>\n <P align=3Dleft><BR></B><BR><BR></FONT>If you are in receipt of this=\n email \n in error and/or wish to be removed from our list, <A \n href=3D"mailto:coins@btamail.net.cn">PLEASE CLICK HERE</A> AND TYPE =\nREMOVE. If you \n reside in any state which prohibits e-mail solicitations for insuran=\nce, \n please disregard this \n email.<BR></FONT><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR=\n><BR><BR><BR></FONT></P></CENTER></CENTER></TR></TBODY></TABLE></CENTER></=\nCENTER></CENTER></CENTER></CENTER></BODY></HTML>\n\n\n
## is_spam
## 1 TRUE
# The email texts have to live in a corpus object. The first column of
# spamhamdocs (the message bodies) is treated as a vector of documents by
# VectorSource(), and Corpus() loads that source as a corpus.
spamhamcorp <- Corpus(VectorSource(spamhamdocs[, 1]))

# Cleaning. The raw messages carry a lot of email-format noise that looks much
# the same in spam and ham, so it is stripped out one transformation at a time.
# Lower-case every character
spamhamcorp <- tm_map(spamhamcorp, content_transformer(tolower))
# Drop the English stop-words
spamhamcorp <- tm_map(spamhamcorp, removeWords, stopwords(kind = "en"))
# Porter stemming: strips endings (e.g. created, creates -> create)
spamhamcorp <- tm_map(spamhamcorp, stemDocument)
# Strip punctuation
spamhamcorp <- tm_map(spamhamcorp, removePunctuation)
# Strip digits; this removes a lot of the header-related noise
spamhamcorp <- tm_map(spamhamcorp, removeNumbers)
## Warning in tm_map.SimpleCorpus(., content_transformer(tolower)): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(., removeWords, stopwords(kind = "en")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(., stemDocument): transformation drops documents
## Warning in tm_map.SimpleCorpus(., removePunctuation): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(., removeNumbers): transformation drops documents
# Build the document-term matrix and trim the sparse terms.
# Many words (and other noise) hardly ever occur; removeSparseTerms() drops
# the infrequent ones. Thresholds closer to 1 let sparser terms through. Even
# at 0.99 roughly 150,000 terms (mostly noise) are filtered out.
spamhamDTM <- removeSparseTerms(DocumentTermMatrix(spamhamcorp), 0.99)

# Convert the matrix to a data frame with one column per term
spamham <- as.data.frame(as.matrix(spamhamDTM))

# Collapse word frequencies to word occurrence (e.g. 4 -> 1, 0 -> 0, 1 -> 1)
spamham[spamham > 1] <- 1

# Prepend the spam label as the first column, named "is.spam"
spamham <- cbind(is.spam = spamhamdocs[, 2], spamham)

# Turn every variable into a factor
spamham <- as.data.frame(lapply(spamham, as.factor))
# Train/test split (80/20).
# caTools::sample.split() returns a logical vector marking which observations
# go into the training set. It must be given the label vector, not the whole
# data frame: passing the data frame yields one entry per *column*, and that
# short mask is then silently recycled over the rows. Using the labels gives
# one entry per row and keeps the spam/ham ratio similar in both subsets.
set.seed("1234567890")
set <- sample.split(spamham$is.spam, SplitRatio = 0.80)
train <- spamham[set, ]
test <- spamham[!set, ]
# Plots the 30 most frequent terms in the spam and ham training documents.

# Separates out spam and ham from train
spam <- subset(train, is.spam == TRUE)
ham <- subset(train, is.spam == FALSE)

# Returns the `n` terms occurring in the most documents of `docs`, as a data
# frame with columns `value` (number of documents containing the term) and
# `terms` (the term itself), sorted by descending `value`.
# `docs` holds the label in its first column and one 0/1 factor per term in
# the remaining columns.
top_terms <- function(docs, n = 30) {
  # Count the "1" level explicitly: as.numeric() on a factor returns level
  # codes (1, 2, ...) rather than the 0/1 values, which inflates the sums.
  counts <- vapply(docs[-1], function(term) sum(term == "1"), numeric(1))
  ranked <- data.frame(
    value = counts,
    terms = names(counts),
    stringsAsFactors = FALSE
  )
  head(ranked[order(-ranked$value), ], n)
}

# Horizontal bar chart of term occurrences, most frequent term at the top.
# Columns are referenced by bare name inside aes() so ggplot2 reads them from
# `term_counts` (avoids the "Use of `df$...` is discouraged" warnings).
plot_top_terms <- function(term_counts, title) {
  ggplot(term_counts, aes(x = reorder(terms, value), y = value)) +
    geom_bar(stat = "identity") +
    xlab("Term") +
    ylab("Occurances") +
    labs(title = title) +
    coord_flip()
}

# Each plot is built from its own ranking; previously the ham table was
# ordered by, and plotted against, the spam table.
df <- top_terms(spam)
spamplot <- plot_top_terms(df, "Spam")

df1 <- top_terms(ham)
hamplot <- plot_top_terms(df1, "Ham")

grid.arrange(spamplot, hamplot)
## Warning: Use of `df$terms` is discouraged. Use `terms` instead.
## Warning: Use of `df$value` is discouraged. Use `value` instead.
## Warning: Use of `df$value` is discouraged. Use `value` instead.
##
## Call:
## randomForest(x = train[, -1], y = train[, 1], ntree = 200)
## Type of random forest: classification
## Number of trees: 200
## No. of variables tried at each split: 44
##
## OOB estimate of error rate: 0.12%
## Confusion matrix:
## FALSE TRUE class.error
## FALSE 2001 2 0.0009985022
## TRUE 1 399 0.0025000000
# Score the held-out documents with the fitted forest (label column removed)
# and compare the predicted classes with the true labels.
set.seed("1234567890")
prediction <- predict(RFmodel, newdata = test[, -1])
confusionMatrix(data = prediction, reference = test$is.spam)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 496 0
## TRUE 2 100
##
## Accuracy : 0.9967
## 95% CI : (0.988, 0.9996)
## No Information Rate : 0.8328
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9881
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9960
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9804
## Prevalence : 0.8328
## Detection Rate : 0.8294
## Detection Prevalence : 0.8294
## Balanced Accuracy : 0.9980
##
## 'Positive' Class : FALSE
##