In chapter 3 of the book Data Smart [http://www.wiley.com/WileyCDA/WileyTitle/productCd-111866146X.html], by John Foreman, the author develops a Naive Bayes classifier in Excel to determine whether tweets containing the word ‘mandrill’ are related to Mailchimp's Mandrill transactional email app.
Whereas the author used Excel, here we use R's text mining package, tm, to take advantage of its automated text-processing tools.
The book Machine Learning for Hackers [https://github.com/johnmyleswhite/ML_for_Hackers], by Drew Conway and John Myles White, is also a useful resource; we borrow elements of its chapter 3 approach to email spam classification with the tm package.
# load necessary packages
library(tm) # an R text mining package
library(dplyr) # used for easy manipulation of data frames
# Write a function to create a document corpus and a Term Document Matrix. The preparation involves converting
# all characters to lower case, removing numbers, removing punctuation and removing stopwords.
createTDM <- function(doc){
  corpus <- Corpus(VectorSource(doc)) # create a corpus from the source document
  # text preparation steps to be applied when building the term-document matrix
  control <- list(stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE)
  tdm <- TermDocumentMatrix(corpus, control = control)
  # terms are converted to lower case and the document is tokenized into words by default;
  # stopwords are removed (type 'stopwords()' in the R console to view the list)
  return(tdm)
}
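# A quick illustration of the preparation steps on a made-up sentence (not part of the tweet data):
# the number, the punctuation and the stopword are dropped, and the remaining terms are lowercased.
exampleTDM <- createTDM("The Mandrill API sent 100 emails!")
exampleTDM$dimnames$Terms # expect terms such as "api", "emails", "mandrill", "sent"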
# calculate the proportional frequency of occurrence of each word. Arrange information in a data frame
genProb <- function(record){
  # pull the individual words and their counts out of the term-document matrix
  counts <- data.frame(word = record$dimnames$Terms, count = record$v)
  # add a 'prob' column of proportional frequencies using the mutate verb from dplyr
  counts <- mutate(counts, prob = count/sum(count))
  return(counts)
}
# function to return a score for each test tweet based on word frequencies in a training corpus;
# it relies on the global variables 'testTweets' and 'notInValue' defined below
genTweetScores <- function(trainingCounts){
  testTweetScore <- numeric(length(testTweets))
  for (i in 1:length(testTweets)){
    score <- 0
    test <- createTDM(testTweets[i])
    # words in the test tweet that also appear in the training corpus ...
    wordsIn <- intersect(test$dimnames$Terms, trainingCounts$word)
    # ... and those that do not
    wordsNotIn <- setdiff(test$dimnames$Terms, wordsIn)
    # unseen words each contribute log(notInValue); seen words contribute the log of their training frequency
    score <- score + log(notInValue)*length(wordsNotIn)
    score <- score + sum(log(trainingCounts$prob[match(wordsIn, trainingCounts$word)]))
    testTweetScore[i] <- score
  }
  return(testTweetScore)
}
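# The score is a sum of log probabilities rather than a product of the probabilities themselves:
# multiplying many small word frequencies underflows towards zero, while summing their logs keeps
# the comparison between the two training corpora numerically stable. A tiny made-up illustration:
probsApp <- c(0.065, 0.018, 0.00005)    # invented per-word frequencies under the 'App' corpus
probsOther <- c(0.112, 0.0002, 0.00005) # invented per-word frequencies under the 'Other' corpus
sum(log(probsApp))   # roughly -16.7, less negative ...
sum(log(probsOther)) # ... than roughly -20.6, so this hypothetical tweet would be scored as 'App'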
# read in and concatenate sets of training tweets into a single character variable
mApp <- paste(readLines("MandrillApp.csv"), collapse = "\n")
other <- paste(readLines("Other.csv"), collapse = "\n")
testTweets <- readLines("testTweets.csv") # this body of individual tweets is not concatenated. Each is analyzed as an individual document
# Assign a small fixed probability to words in the test tweets that do not appear in the training corpus,
# following the approach in ML for Hackers. Data Smart instead used an additive smoothing approach.
notInValue <- 0.00005
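# A sketch (not used below) of what an additive-smoothing variant of genProb might look like,
# in the spirit of Data Smart's approach: add 1 to every observed count before normalising.
# A full implementation would also reserve probability mass for vocabulary words with zero counts.
genProbSmoothed <- function(record){
  counts <- data.frame(word = record$dimnames$Terms, count = record$v)
  mutate(counts, prob = (count + 1)/sum(count + 1))
}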
# inspect the stopwords
stopwords()
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very"
# Generate the term-document-matrices for the training corpora using the createTDM function
mTDM <- createTDM(mApp)
oTDM <- createTDM(other)
# generate the counts and proportional frequencies for words in the training corpora
mCounts <- genProb(mTDM)
oCounts <- genProb(oTDM)
# arrange the rows in descending order of occurrence. Inspect the top 20 rows
head(arrange(mCounts, -prob), 20)
## word count prob
## 1 mandrill 100 0.065703022
## 2 email 28 0.018396846
## 3 httphelpmandrillcom 22 0.014454665
## 4 can 20 0.013140604
## 5 mailchimp 20 0.013140604
## 6 sendgrid 18 0.011826544
## 7 request 16 0.010512484
## 8 mandrillapp 14 0.009198423
## 9 details 13 0.008541393
## 10 emails 13 0.008541393
## 11 send 13 0.008541393
## 12 transactional 12 0.007884363
## 13 just 11 0.007227332
## 14 mind 11 0.007227332
## 15 newsletters 11 0.007227332
## 16 service 11 0.007227332
## 17 use 11 0.007227332
## 18 using 11 0.007227332
## 19 via 11 0.007227332
## 20 api 10 0.006570302
head(arrange(oCounts, -prob), 20)
## word count prob
## 1 mandrill 138 0.112837285
## 2 spark 25 0.020441537
## 3 youtube 17 0.013900245
## 4 megaman 15 0.012264922
## 5 acapella 14 0.011447261
## 6 get 12 0.009811938
## 7 httpyoutubehyxkwyjdia 9 0.007358953
## 8 man 9 0.007358953
## 9 smoothmcgroove 9 0.007358953
## 10 vídeo 9 0.007358953
## 11 gostei 7 0.005723630
## 12 just 6 0.004905969
## 13 mega 6 0.004905969
## 14 can 5 0.004088307
## 15 like 5 0.004088307
## 16 new 5 0.004088307
## 17 que 5 0.004088307
## 18 via 5 0.004088307
## 19 ccpgames 4 0.003270646
## 20 freebooted 4 0.003270646
# Classify the tweets using their word content probabilities for each of the training corpora
classResults <- data.frame(cbind(genTweetScores(mCounts), genTweetScores(oCounts)))
names(classResults) <- c("mScores", "oScores")
classResults <- mutate(classResults, Classification = ifelse(mScores > oScores, "App", "Other"))
classResults
## mScores oScores Classification
## 1 -60.77438 -95.27575 App
## 2 -60.71495 -97.76066 App
## 3 -37.97522 -48.90482 App
## 4 -114.52122 -167.74667 App
## 5 -83.35133 -118.75262 App
## 6 -48.28697 -71.50622 App
## 7 -29.46878 -39.00133 App
## 8 -35.94576 -39.34240 App
## 9 -95.49885 -123.72886 App
## 10 -54.37375 -79.22790 App
## 11 -32.43307 -24.51166 Other
## 12 -62.14354 -47.63061 Other
## 13 -49.66434 -51.69925 App
## 14 -36.76512 -37.39190 App
## 15 -32.43307 -29.09785 Other
## 16 -52.24005 -24.94163 Other
## 17 -88.17968 -80.89603 Other
## 18 -86.55568 -60.70636 Other
## 19 -69.47132 -71.50622 App
## 20 -91.85400 -85.03120 Other
# The classifier classifies all ten Mandrill-App-related tweets correctly (items 1-10 in the classResults list).
# Three of the ten unrelated tweets (items 11-20 in the classResults list) are misclassified as 'App', so there is room for improvement.
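# Since the true labels are known (the first ten test tweets relate to the app, the last ten do not),
# the result can be summarised as a small confusion table:
actual <- rep(c("App", "Other"), each = 10)
table(actual, predicted = classResults$Classification)
# 17 of the 20 tweets are classified correctly: all ten 'App' tweets and seven of the ten 'Other' tweets.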