Executive Summary

In chapter 3 of the book, Data Smart [http://www.wiley.com/WileyCDA/WileyTitle/productCd-111866146X.html], by John Foreman, the author develops a Naive Bayes classifier in Excel to determine whether tweets containing the word ‘mandrill’ are related to MailChimp’s Mandrill transactional email app or not.

Whereas the author worked in Excel, we use R’s text mining package, tm, to take advantage of its automated text processing tools.

The book Machine Learning for Hackers [https://github.com/johnmyleswhite/ML_for_Hackers] by Drew Conway and John Myles White is also a useful resource; we borrow elements of its approach to email spam classification (chapter 3), which likewise uses the tm package.
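
Under the hood, the approach scores each test tweet against both training corpora by summing the log probabilities of the tweet’s words, then assigns the class with the higher total. Logs are used because multiplying many small probabilities underflows double-precision arithmetic; a minimal sketch with made-up numbers:

# toy illustration (made-up numbers): the product of many small probabilities
# underflows to zero, but the equivalent sum of logs remains a usable score
probs <- rep(0.001, 200)
prod(probs)     # 0 -- underflow
sum(log(probs)) # -1381.551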

# load necessary packages
library(tm) # an R text mining package
library(dplyr) # used for easy manipulation of data frames

# Write a function to create a document corpus and a Term Document Matrix. The preparation involves converting 
# all characters to lower case, removing numbers, removing punctuation and removing stopwords.
createTDM <- function(doc){
  corpus <- Corpus(VectorSource(doc)) # creates a corpus from the source document
  control <- list(stopwords = TRUE, removePunctuation = TRUE, removeNumbers = TRUE) # text preparation steps to be applied
  tdm <- TermDocumentMatrix(corpus, control = control)
    # converts the terms to lowercase by default
    # tokenizes the document into words by default
    # removes stopwords. Type 'stopwords()' in the R console to view the list of stopwords
  return(tdm)
}
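
# As a quick illustration (a made-up string, not part of the data), inspect()
# shows the effect of the preparation steps: stopwords and punctuation are
# dropped, and terms are lowercased before counting
inspect(createTDM("Sending email with the Mandrill API is easy, easy!"))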

# calculate the proportional frequency of occurrence of each word. Arrange information in a data frame
genProb <- function(record){
  # pull the words and their counts out of the tdm; record$i indexes each count
  # in record$v back to its term, keeping words and counts correctly aligned
  counts <- data.frame(word = record$dimnames$Terms[record$i], count = record$v)
  # create a new column, 'prob', using the mutate command from the dplyr package
  counts <- mutate(counts, prob = count/sum(count))
  return(counts)
}
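
# Sanity check on a toy document (made-up text): since 'prob' is each word's
# share of the total count, the column should sum to 1
sum(genProb(createTDM("ship email fast email"))$prob)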

# function to score each test tweet against a training corpus's word frequencies
genTweetScores <- function(trainingCounts){
  # note: testTweets and notInValue are taken from the global environment
  testTweetScore <- numeric(length(testTweets))
  for (i in seq_along(testTweets)){
    score <- 0
    test <- createTDM(testTweets[i])
    # words in the test tweet that also appear in the training counts
    wordsIn <- intersect(test$dimnames$Terms, trainingCounts$word)
    # words in the test tweet that the training corpus has never seen
    wordsNotIn <- setdiff(test$dimnames$Terms, wordsIn)
    score <- score + log(notInValue) * length(wordsNotIn)
    score <- score + sum(log(trainingCounts$prob[match(wordsIn, trainingCounts$word)]))
    testTweetScore[i] <- score
  }
  return(testTweetScore)
}
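
# Hand-worked toy case (hypothetical numbers): a test tweet with two words
# found in the training counts (probs 0.01 and 0.002) plus one unseen word
# would score log(0.01) + log(0.002) + log(0.00005)
log(0.01) + log(0.002) + log(0.00005) # approximately -20.72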

# read in and concatenate sets of training tweets into a single character variable
mApp <- paste(readLines("MandrillApp.csv"), collapse = "\n")
other <- paste(readLines("Other.csv"), collapse = "\n")
testTweets <- readLines("testTweets.csv") # these tweets are not concatenated; each is scored as an individual document

# Assign a fixed proportional-frequency value to words in a test tweet that do not appear in the
# training corpus, following the approach in ML for Hackers. Data Smart instead used additive smoothing.
notInValue <- 0.00005
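
# For comparison, a sketch of Data Smart's additive (Laplace) smoothing
# alternative: add a pseudo-count of 1 to every word and enlarge the
# denominator by the vocabulary size, so unseen words would get a small
# nonzero probability rather than a fixed floor value. Not used below.
genProbSmoothed <- function(record){
  counts <- data.frame(word = record$dimnames$Terms[record$i], count = record$v)
  counts <- mutate(counts, prob = (count + 1) / (sum(count) + n()))
  return(counts)
}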

# inspect the stopwords
stopwords()
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"
# Generate the term-document matrices for the training corpora using the createTDM function
mTDM <- createTDM(mApp)
oTDM <- createTDM(other)

# generate the counts and proportional frequencies for words in the training corpora
mCounts <- genProb(mTDM)
oCounts <- genProb(oTDM)

# arrange the rows in descending order of occurrence. Inspect the top 20 rows
head(arrange(mCounts, -prob), 20) 
##                   word count        prob
## 1             mandrill   100 0.065703022
## 2                email    28 0.018396846
## 3  httphelpmandrillcom    22 0.014454665
## 4                  can    20 0.013140604
## 5            mailchimp    20 0.013140604
## 6             sendgrid    18 0.011826544
## 7              request    16 0.010512484
## 8          mandrillapp    14 0.009198423
## 9              details    13 0.008541393
## 10              emails    13 0.008541393
## 11                send    13 0.008541393
## 12       transactional    12 0.007884363
## 13                just    11 0.007227332
## 14                mind    11 0.007227332
## 15         newsletters    11 0.007227332
## 16             service    11 0.007227332
## 17                 use    11 0.007227332
## 18               using    11 0.007227332
## 19                 via    11 0.007227332
## 20                 api    10 0.006570302
head(arrange(oCounts, -prob), 20) 
##                     word count        prob
## 1               mandrill   138 0.112837285
## 2                  spark    25 0.020441537
## 3                youtube    17 0.013900245
## 4                megaman    15 0.012264922
## 5               acapella    14 0.011447261
## 6                    get    12 0.009811938
## 7  httpyoutubehyxkwyjdia     9 0.007358953
## 8                    man     9 0.007358953
## 9         smoothmcgroove     9 0.007358953
## 10                 vídeo     9 0.007358953
## 11                gostei     7 0.005723630
## 12                  just     6 0.004905969
## 13                  mega     6 0.004905969
## 14                   can     5 0.004088307
## 15                  like     5 0.004088307
## 16                   new     5 0.004088307
## 17                   que     5 0.004088307
## 18                   via     5 0.004088307
## 19              ccpgames     4 0.003270646
## 20            freebooted     4 0.003270646
# Classify the tweets using their word content probabilities for each of the training corpora
classResults <- data.frame(mScores = genTweetScores(mCounts), oScores = genTweetScores(oCounts))
classResults <- mutate(classResults, Classification = ifelse(mScores > oScores, "App", "Other"))
classResults
##       mScores    oScores Classification
## 1   -60.77438  -95.27575            App
## 2   -60.71495  -97.76066            App
## 3   -37.97522  -48.90482            App
## 4  -114.52122 -167.74667            App
## 5   -83.35133 -118.75262            App
## 6   -48.28697  -71.50622            App
## 7   -29.46878  -39.00133            App
## 8   -35.94576  -39.34240            App
## 9   -95.49885 -123.72886            App
## 10  -54.37375  -79.22790            App
## 11  -32.43307  -24.51166          Other
## 12  -62.14354  -47.63061          Other
## 13  -49.66434  -51.69925            App
## 14  -36.76512  -37.39190            App
## 15  -32.43307  -29.09785          Other
## 16  -52.24005  -24.94163          Other
## 17  -88.17968  -80.89603          Other
## 18  -86.55568  -60.70636          Other
## 19  -69.47132  -71.50622            App
## 20  -91.85400  -85.03120          Other
# The classifier labels all of the Mandrill-app-related tweets correctly (items 1-10 in the classResults list).
# Three of the unrelated tweets (items 13, 14 and 19) are misclassified as 'App', so there is room for improvement.
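
# Since the ground truth is known (the first ten test tweets are App-related,
# the last ten are not), overall accuracy can be checked directly:
truth <- rep(c("App", "Other"), each = 10) # known labels for the 20 test tweets
mean(classResults$Classification == truth) # 0.85, i.e. 17 of 20 correct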