Loading the dataset for the “HAM” messages.
## [1] "00001.1a31cc283af0060967a233d26548a6ce"
## [2] "00002.5a587ae61666c5aa097c8e866aedcc59"
## [3] "00003.19be8acd739ad589cd00d8425bac7115"
## [4] "00004.b2ed6c3c62bbdfab7683d60e214d1445"
## [5] "00005.07b9d4aa9e6c596440295a5170111392"
## [6] "00006.654c4ec7c059531accf388a807064363"
## [7] "00007.2e086b13730b68a21ee715db145522b9"
## [8] "00008.6b73027e1e56131377941ff1db17ff12"
## [9] "00009.13c349859b09264fa131872ed4fb6e4e"
## [10] "00010.d1b4dbbad797c5c0537c5a0670c373fd"
## [11] "00011.bc1aa4dca14300a8eec8b7658e568f29"
## [12] "00012.3c1ff7380f10a806321027fc0ad09560"
## [13] "00013.245fc5b9e5719b033d5d740c51af92e0"
## [14] "00014.8e21078a89bd9c57255d302f346551e8"
## [15] "00015.d5c8f360cf052b222819718165db24c6"
## [16] "00016.bc1f434b566619637a0de033cd3380d1"
## [17] "00017.8b965080dfffada165a54c041c27e33f"
## [18] "00018.3b6a8c5da4043f2a6a63a1ae12bd9824"
## [19] "00019.c6b272a04ec32252f7c685f464ae3942"
## [20] "00020.83ef024f76cc42b8245a683ed9b38406"
We need to strip out the header information as the function below only needs the body of the email. Using the function identified above, we are able to extract just the meat of the message and save into a Dataframe.
ham_docs_list <- NA
for(i in 1:length(hamFileNames))
{
filepath<-paste0(ham_dir, "/", hamFileNames[1])
Content <-suppressWarnings(warning(readtext(filepath)))
message <- get_email_body (Content)
message <- gsub("<.*?>", " ", message)
list1<- list(paste(message, collapse="\n"))
ham_docs_list = c(ham_docs_list,list1)
}
hamDF <-as.data.frame(unlist(ham_docs_list),stringsAsFactors = FALSE)
hamDF$type <- "ham"
colnames(hamDF) <- c("text","type")Loading the dataset for the “SPAM” messages.
## [1] "00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "00006.3ca1f399ccda5d897fecb8c57669a283"
## [7] "00007.acefeee792b5298f8fee175f9f65c453"
## [8] "00008.ccf927a6aec028f5472ca7b9db9eee20"
## [9] "00009.1e1a8cb4b57532ab38aa23287523659d"
## [10] "00010.2558d935f6439cb40d3acb8b8569aa9b"
## [11] "00011.bd8c904d9f7b161a813d222230214d50"
## [12] "00012.cb9c9f2a25196f5b16512338625a85b4"
## [13] "00013.372ec9dc663418ca71f7d880a76f117a"
## [14] "00014.13574737e55e51fe6737a475b88b5052"
## [15] "00015.206d5a5d1d34272ae32fc286788fdf55"
## [16] "00016.4fb07c8dff1a5a2b4889dc5024c55023"
## [17] "00017.6430f3b8dedf51ba3c3fcb9304e722e7"
## [18] "00018.336cb9e7b0358594cf002e7bf669eaf5"
## [19] "00019.86ce6f6c2e9f4ae0415860fecdf055db"
## [20] "00020.7d36d16fd2be07c4f6a5616590cdea07"
Again, just as in the ‘HAM’ messages, we only need the meat of the email saved to a Dataframe.
spam_docs_list <- NA
for(i in 1:length(spamFileNames))
{
filepath<-paste0(spam_dir, "/", spamFileNames[1])
Content <-suppressWarnings(warning(readtext(filepath)))
message <- get_email_body (Content)
message <- gsub("<.*?>", " ", message)
list1<- list(paste(message, collapse="\n"))
spam_docs_list = c(spam_docs_list,list1)
}
spamDF <-as.data.frame(unlist(spam_docs_list),stringsAsFactors = FALSE)
spamDF$type <- "spam"
colnames(spamDF) <- c("text","type")Combining the two dataframes into one DF.
Using the TM function, we need to create a ‘corpus’ from the text which was extracted in the earlier section.
corpus_ham = VCorpus(VectorSource(hamDF$text))
corpus_ham = tm_map(corpus_ham, content_transformer(tolower))
corpus_ham = tm_map(corpus_ham, removeNumbers)
corpus_ham = tm_map(corpus_ham, removePunctuation)
corpus_ham = tm_map(corpus_ham, removeWords, stopwords())
#corpus_ham = tm_map(corpus_ham, stemDocument) # I left the "Stem" function in however I didn't like what it did to the clouds below
corpus_ham = tm_map(corpus_ham, stripWhitespace)
corpus_ham = tm_map(corpus_ham, removeWords, c("lbrace", "rbrace"))
corpus_spam = VCorpus(VectorSource(spamDF$text))
corpus_spam = tm_map(corpus_spam, content_transformer(tolower))
corpus_spam = tm_map(corpus_spam, removeNumbers)
corpus_spam = tm_map(corpus_spam, removePunctuation)
corpus_spam = tm_map(corpus_spam, removeWords, stopwords())
#corpus_spam = tm_map(corpus_spam, stemDocument) # I left the "Stem" function in however I didn't like what it did to the clouds below
corpus_spam = tm_map(corpus_spam, stripWhitespace)
corpus_spam = tm_map(corpus_spam, removeWords, c("lbrace", "rbrace"))
corpus_all <- c(corpus_ham, corpus_spam)
for(i in 1:length(corpus_ham)){
meta(corpus_all[[i]],"classification") <- "Ham"}
for(i in (length(corpus_ham)+1):(length(corpus_spam)+length(corpus_ham))){
meta(corpus_all[[i]],"classification") <- "Spam"}What does this data now look like? Creating a word cloud, I can see at quick glance what the data now looks like. There are a few “non” words which in the future I would want to go abck and clean up. They come from the footer of the email. This would become a problem in companies who use email disclaimers in their post signature section. Luckily, most of those are the same text so it can be filtered out.
SPAM word cloud
HAM word cloud
Next, I wanted to play with getting the Sparsity of the messages down to 0%. What words start to emerge from the data?
HAM Sparsity:0%
ham_subset <- corpus_ham
ham_subset_dtm <- DocumentTermMatrix(ham_subset)
ham_subset_tdm <- TermDocumentMatrix (ham_subset)
dtm_ham_Sparse <- removeSparseTerms(ham_subset_dtm, 0.2)
inspect(dtm_ham_Sparse)## <<DocumentTermMatrix (documents: 1402, terms: 291)>>
## Non-/sparse entries: 407691/291
## Sparsity : 0%
## Maximal term length: 59
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs exmh folder invoked line list message procedure sequences window
## 10 7 9 25 23 10 8 18 13 8
## 11 7 9 25 23 10 8 18 13 8
## 2 7 9 25 23 10 8 18 13 8
## 3 7 9 25 23 10 8 18 13 8
## 4 7 9 25 23 10 8 18 13 8
## 5 7 9 25 23 10 8 18 13 8
## 6 7 9 25 23 10 8 18 13 8
## 7 7 9 25 23 10 8 18 13 8
## 8 7 9 25 23 10 8 18 13 8
## 9 7 9 25 23 10 8 18 13 8
## Terms
## Docs within
## 10 25
## 11 25
## 2 25
## 3 25
## 4 25
## 5 25
## 6 25
## 7 25
## 8 25
## 9 25
## invoked within line procedure
## 35025 35025 32223 25218
## sequences list folder message
## 18213 14010 12609 11208
## window exmh one just
## 11208 9807 9807 8406
## new sequence show unseen
## 8406 8406 8406 8406
## args folderchange msgchange msgshow
## 7005 7005 7005 7005
## now pick also background
## 7005 7005 5604 5604
## menu msgid uplevel body
## 5604 5604 5604 4203
## busycursorhack cmd didnt display
## 4203 4203 4203 4203
## eval inbox name pickinner
## 4203 4203 4203 4203
## time use useful black
## 4203 4203 4203 2802
## bottom busy busycursorinner can
## 2802 2802 2802 2802
## chance changes cur current
## 2802 2802 2802 2802
## cursor defined
## 2802 2802
SPAM Sparsity:0%
spam_subset <- corpus_spam
spam_subset_dtm <- DocumentTermMatrix(spam_subset)
spam_subset_tdm <- TermDocumentMatrix (spam_subset)
dtm_spam_Sparse <- removeSparseTerms(spam_subset_dtm, 0.2)
inspect(dtm_spam_Sparse)## <<DocumentTermMatrix (documents: 1398, terms: 182)>>
## Non-/sparse entries: 254254/182
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs email free information letter list mlm people receiving send will
## 10 4 3 6 4 4 9 4 4 4 6
## 11 4 3 6 4 4 9 4 4 4 6
## 2 4 3 6 4 4 9 4 4 4 6
## 3 4 3 6 4 4 9 4 4 4 6
## 4 4 3 6 4 4 9 4 4 4 6
## 5 4 3 6 4 4 9 4 4 4 6
## 6 4 3 6 4 4 9 4 4 4 6
## 7 4 3 6 4 4 9 4 4 4 6
## 8 4 3 6 4 4 9 4 4 4 6
## 9 4 3 6 4 4 9 4 4 4 6
dtm_spam_freq <- sort(colSums(as.matrix(dtm_spam_Sparse)), decreasing = TRUE)
head(dtm_spam_freq, 50)## mlm information will email letter list
## 12573 8382 8382 5588 5588 5588
## people receiving send free marketing one
## 5588 5588 5588 4191 4191 4191
## tell use youve also big ever
## 4191 4191 4191 2794 2794 2794
## havent inbox multilevel online please read
## 2794 2794 2794 2794 2794 2794
## receive sent signed someone spam systems
## 2794 2794 2794 2794 2794 2794
## work works abandoned accept address agree
## 2794 2794 1397 1397 1397 1397
## agreed alternative altra apology backstabbed believe
## 1397 1397 1397 1397 1397 1397
## beneficial betrayed box brief burned business
## 1397 1397 1397 1397 1397 1397
## call click
## 1397 1397
For the actual model, I wanted more words than the Sparsity O% would give me. Setting the removal back up to 0.90 gives us a sparsity of 47%, meaning only terms in 47% of the documents were returned. I also wanted the terms to be in a dataframe which can then be shuffled. Shuffling gives us less chance of a huge block of HAM and a huge block of SPAM when setting the training/test sets.
terms_matrix <- DocumentTermMatrix(corpus_all)
dtm_training <- removeSparseTerms(terms_matrix, 0.90)
inspect(dtm_training)## <<DocumentTermMatrix (documents: 2800, terms: 446)>>
## Non-/sparse entries: 661945/586855
## Sparsity : 47%
## Maximal term length: 59
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs folder invoked line list message mlm one procedure sequences within
## 10 9 25 23 10 8 0 7 18 13 25
## 11 9 25 23 10 8 0 7 18 13 25
## 2 9 25 23 10 8 0 7 18 13 25
## 3 9 25 23 10 8 0 7 18 13 25
## 4 9 25 23 10 8 0 7 18 13 25
## 5 9 25 23 10 8 0 7 18 13 25
## 6 9 25 23 10 8 0 7 18 13 25
## 7 9 25 23 10 8 0 7 18 13 25
## 8 9 25 23 10 8 0 7 18 13 25
## 9 9 25 23 10 8 0 7 18 13 25
training_DF <- as.data.frame(as.matrix(dtm_training))
training_DF$class = spam_ham_comb$type
#Shuffling the data
training_DF <- training_DF[sample(1:nrow(training_DF)),]In order for the next functions to really work, I had to make the entire dataframe as numberic.
# In order for the next functions to really work, I had to make the entire dataframe as numberic.
training_DF <- training_DF %>% mutate(class = replace(class, class == "ham", 0))
training_DF <- training_DF %>% mutate(class = replace(class, class == "spam", 1))
training_DF$class <- as.numeric(as.character(training_DF$class))
training_DF <- as.data.frame(apply(training_DF, 2, as.numeric))
# splitting the dataframe into Training and Test
set.seed(500)
split = sample.split(training_DF$class, SplitRatio = 0.75)
training_set = subset(training_DF, split == TRUE)
test_set = subset(training_DF, split == FALSE)
# QC of the Sets
nrow(training_set)## [1] 2100
## [1] 700
# setting up the total number of observations
num_observation <- ncol(training_set) - 1
num_observation## [1] 446
I set up a Random Forest model to randomly pull values and be sampled independently. I grow 5 trees.
## Warning in randomForest.default(x = training_set[-num_observation], y =
## training_set$class, : The response has five or fewer unique values. Are you
## sure you want to do regression?
predicting_y = predict(rand_class, newdata = test_set[-num_observation])
test_matrix <- table(predicting_y>0,test_set$class)
cm## function (x)
## 2.54 * x
## <bytecode: 0x000000000e74d648>
## <environment: namespace:grDevices>
Is my model accurate?
success <- test_matrix['TRUE', 2] + test_matrix['FALSE', 1]
accuracy <- success/nrow(test_set) * 100
accuracy## [1] 100