It is often useful to classify new “test” documents using already-classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/

Load libraries

library(tm)
library(knitr)
library(plyr)
library(wordcloud)
library(ggplot2)
library(lattice)

library(e1071)
library(caret)
library(quanteda)
library(irlba)
library(randomForest)

Load Data

# Get the Ham and Spam data
dsHS <- read.csv(file="SpamHam.csv", header=TRUE, sep=",", stringsAsFactors=FALSE)

str(dsHS)
## 'data.frame':    5572 obs. of  5 variables:
##  $ v1 : chr  "ham" "ham" "spam" "ham" ...
##  $ v2 : chr  "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
##  $ X  : chr  "" "" "" "" ...
##  $ X.1: chr  "" "" "" "" ...
##  $ X.2: chr  "" "" "" "" ...
#Check whether there are any missing values in the columns
which(!complete.cases(dsHS))
## integer(0)
#Count number of ham/spam messages in our dataset
table(dsHS$v1)
## 
##  ham spam 
## 4825  747
#Find the proportion of ham vs spam messages in our dataset
prop.table(table(dsHS$v1))
## 
##       ham      spam 
## 0.8659368 0.1340632
#dsHS$v2 <- as.character(dsHS$v2)
dsHS$v2Len <- nchar(dsHS$v2)
hist(dsHS$v2Len)

ggplot(dsHS, aes(v2Len, fill=v1)) + geom_histogram(binwidth = 6) + facet_wrap(~v1)

histogram(~v2Len|v1, data=dsHS)
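
To put rough numbers behind the plots, the character counts computed above can be summarized by class. This is a small sketch using only the v2Len and v1 columns created earlier; the output should show whether spam messages tend to be longer than ham.

#Summarize message length separately for ham and spam
tapply(dsHS$v2Len, dsHS$v1, summary)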

Build Corpus

corpus_hs <- Corpus(VectorSource(dsHS$v2))
corpus_hs
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5572
#Inspect corpus data
inspect(corpus_hs[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...                                            
## [2] Ok lar... Joking wif u oni...                                                                                                                              
## [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
## [4] U dun say so early hor... U c already then say...                                                                                                          
## [5] Nah I don't think he goes to usf, he lives around here though
#Clean up the corpus: lowercase, then strip numbers, punctuation, stopwords and extra whitespace
corpus_hs_clean <- tm_map(corpus_hs, content_transformer(tolower))
corpus_hs_clean <- tm_map(corpus_hs_clean, removeNumbers)
corpus_hs_clean <- tm_map(corpus_hs_clean, removePunctuation)
corpus_hs_clean <- tm_map(corpus_hs_clean, removeWords, stopwords())
corpus_hs_clean <- tm_map(corpus_hs_clean, stripWhitespace)

inspect(corpus_hs_clean[1:5])
## <<SimpleCorpus>>
## Metadata:  corpus specific: 1, document level (indexed): 0
## Content:  documents: 5
## 
## [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat                         
## [2] ok lar joking wif u oni                                                                                    
## [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply overs
## [4] u dun say early hor u c already say                                                                        
## [5] nah dont think goes usf lives around though
corpus_dtm <- DocumentTermMatrix(corpus_hs_clean)
inspect(corpus_dtm[1:10, 10:15])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 6/54
## Sparsity           : 90%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs joking lar oni point wat world
##   1       0   0   0     1   1     1
##   10      0   0   0     0   0     0
##   2       1   1   1     0   0     0
##   3       0   0   0     0   0     0
##   4       0   0   0     0   0     0
##   5       0   0   0     0   0     0
##   6       0   0   0     0   0     0
##   7       0   0   0     0   0     0
##   8       0   0   0     0   0     0
##   9       0   0   0     0   0     0
#Alternatively, the same cleanup steps can be passed as a control list when building the DTM
corpus_dtm_1 <- DocumentTermMatrix(corpus_hs_clean, control = list(tolower=TRUE, removeNumbers=TRUE, stopwords=TRUE, removePunctuation=TRUE, stripWhitespace=TRUE))
inspect(corpus_dtm_1[1:10, 10:15])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 6/54
## Sparsity           : 90%
## Maximal term length: 6
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs joking lar oni point wat world
##   1       0   0   0     1   1     1
##   10      0   0   0     0   0     0
##   2       1   1   1     0   0     0
##   3       0   0   0     0   0     0
##   4       0   0   0     0   0     0
##   5       0   0   0     0   0     0
##   6       0   0   0     0   0     0
##   7       0   0   0     0   0     0
##   8       0   0   0     0   0     0
##   9       0   0   0     0   0     0
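
Since the corpus was already cleaned with tm_map(), the control list used for corpus_dtm_1 is largely redundant here; as a quick sketch, both routes should yield a document-term matrix of the same shape.

#Compare the dimensions (documents x terms) of the two DTMs
dim(corpus_dtm)
dim(corpus_dtm_1)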

Training and Test Sets

#Set the random seed so the split is reproducible
set.seed(32984)

#Create a 70/30 split index, stratified on the class label
idx <- createDataPartition(dsHS$v1, times = 1, p=0.7, list = FALSE)

#70% Split
train <- dsHS[idx,]

#30% Split
test <- dsHS[-idx,]

#Verify that the class proportions match those of the original dataset
prop.table(table(train$v1))
## 
##       ham      spam 
## 0.8659318 0.1340682
prop.table(table(test$v1))
## 
##       ham      spam 
## 0.8659485 0.1340515

Cleanup

#Tokenize the training messages, dropping numbers, punctuation, symbols, separators and hyphens
train_tkns <- tokens(train$v2, what="word", remove_numbers = TRUE, remove_punct = TRUE,
  remove_symbols = TRUE, remove_separators = TRUE, remove_hyphens = TRUE)

#Convert tokens to lowercase
train_tkns <- tokens_tolower(train_tkns)

#Take a look at the stopwords before removing
stopwords()
##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"       "will"
#Remove stopwords
train_tkns <- tokens_select(train_tkns, stopwords(), selection="remove")

#Stemming
train_tkns <- tokens_wordstem(train_tkns, language="english")

#View sample data
#sample(train_tkns, size = 10)
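
As a spot check of the pipeline (the sample() call above is left commented out), the cleaned and stemmed tokens of the first couple of training messages can be printed; this is just a sketch using the train_tkns object built above.

#Peek at the tokens of the first two training documents
as.list(train_tkns)[1:2]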

Bag of words model

#Create bag of words model
train_tkns_dfm <- dfm(train_tkns, tolower=FALSE)

train_tkns_matrix <- as.matrix(train_tkns_dfm)
#head(train_tkns_matrix)
dim(train_tkns_matrix)
## [1] 3901 5742
#Combine the document features with the class labels
train_tkns_df <- cbind(labels=train$v1, convert(train_tkns_dfm, to = "data.frame"))
#train_tkns_df <- cbind(labels=train$v1, as.data.frame(train_tkns_dfm))

#Make syntactically valid column names to avoid errors with many R functions, e.g. 4txt is not a valid name
names(train_tkns_df) <- make.names(names(train_tkns_df))

Cross Validation

set.seed(48743)

#Since there is a class imbalance, let's create stratified folds for 10-fold cross-validation repeated 3 times. This yields 30 random stratified samples.
cv_folds <- createMultiFolds(train$v1, k=10, times=3)
cv_cntrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, index = cv_folds)
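
A quick sanity check, sketched here, is to confirm that the stratification worked: the training indices in the first fold should preserve roughly the same ham/spam ratio as the full training set.

#Class proportions within the first of the 30 resamples
prop.table(table(train$v1[cv_folds[[1]]]))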

Run Model (Parallel Processing)

#Use parallel processing to reduce training time
library(doSNOW)

start_time <- Sys.time()

#Run with 3 cores
clstr <- makeCluster(3, type="SOCK")
registerDoSNOW(clstr)

model_cv_1 <- train(labels ~ ., data=train_tkns_df, method="rpart", trControl=cv_cntrl, tuneLength=7)

stopCluster(clstr)

total_time <- Sys.time() - start_time
total_time
## Time difference of 22.7641 mins
model_cv_1
## CART 
## 
## 3901 samples
## 5743 predictors
##    2 classes: 'ham', 'spam' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 3511, 3510, 3511, 3511, 3511, 3511, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.02103250  0.9429179  0.7134030
##   0.02294455  0.9402689  0.6957674
##   0.02868069  0.9356553  0.6665729
##   0.03059273  0.9333472  0.6508580
##   0.03824092  0.9302709  0.6281573
##   0.05098789  0.9154072  0.5141434
##   0.32504780  0.8796808  0.1483505
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0210325.
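
The model above is only assessed by cross-validation on the training split; the 30% hold-out created earlier is never scored. Below is a minimal sketch of how that evaluation might look, assuming the test messages go through the identical tokenization pipeline and that dfm_match() is used so the test matrix contains exactly the training features. To keep the sketch self-contained it refits a single rpart() tree with the cross-validated cp instead of reusing the caret object; test_tkns, test_dfm, train_x, test_x, fit_cart and test_pred are illustrative names, not part of the original analysis.

library(rpart)

#Apply the same cleanup steps to the test messages
test_tkns <- tokens(test$v2, what="word", remove_numbers = TRUE, remove_punct = TRUE,
  remove_symbols = TRUE, remove_separators = TRUE, remove_hyphens = TRUE)
test_tkns <- tokens_tolower(test_tkns)
test_tkns <- tokens_select(test_tkns, stopwords(), selection="remove")
test_tkns <- tokens_wordstem(test_tkns, language="english")

#Keep only the features seen in training so the columns line up with the training dfm
test_dfm <- dfm_match(dfm(test_tkns, tolower=FALSE), features = featnames(train_tkns_dfm))

#Plain data frames with syntactically valid, matching column names
train_x <- as.data.frame(as.matrix(train_tkns_dfm))
test_x  <- as.data.frame(as.matrix(test_dfm))
names(train_x) <- make.names(names(train_x))
names(test_x)  <- make.names(names(test_x))

#Single CART fit using the cp value selected by cross-validation above
fit_cart <- rpart(labels ~ ., data = cbind(labels = factor(train$v1), train_x), cp = 0.021)

#Score the hold-out messages and compare predictions against the true labels
test_pred <- predict(fit_cart, newdata = test_x, type = "class")
confusionMatrix(test_pred, factor(test$v1))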

Alternate Methods for Data Cleaning

#Custom Function for Data Cleanup
corpusCleanData <- function(col) {

  corpusData <- col

  #Remove punctuation
  corpusData <- gsub(pattern="\\W", replacement=" ", corpusData)

  #Remove digits
  corpusData <- gsub(pattern="\\d", replacement=" ", corpusData)

  #Lower case
  corpusData <- tolower(corpusData)

  #Remove stopwords
  corpusData <- removeWords(corpusData, stopwords("english"))

  #Remove single characters left over from the substitutions above
  corpusData <- gsub(pattern="\\b[a-z]\\b", replacement=" ", corpusData)

  #Collapse extra whitespace
  corpusData <- stripWhitespace(corpusData)

  #Return the cleaned text vector
  corpusData
}

Word Cloud - Ham and Spam

#Split the data by class so separate word clouds can be built for ham and spam
dsHamSpam <- split(dsHS, dsHS$v1)
dsHam <- dsHamSpam[[1]]
dsSpam <- dsHamSpam[[2]]

corpusSpam <- corpusCleanData(dsSpam$v2)
corpusHam <- corpusCleanData(dsHam$v2)

head(corpusSpam)
## [1] "free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate apply " 
## [2] "freemsg hey darling week now word back like fun still tb ok xxx std chgs send å rcv"                  
## [3] "winner valued network customer selected receivea å prize reward claim call claim code kl valid hours "
## [4] " mobile months entitled update latest colour mobiles camera free call mobile update co free "         
## [5] "six chances win cash pounds txt csh send cost day days tsandcs apply reply hl info"                   
## [6] "urgent won week free membership å prize jackpot txt word claim www dbuk net lccltd pobox ldnw rw "
head(corpusHam)
## [1] "go jurong point crazy available bugis great world la buffet cine got amore wat "                                 
## [2] "ok lar joking wif oni "                                                                                          
## [3] " dun say early hor already say "                                                                                 
## [4] "nah don think goes usf lives around though"                                                                      
## [5] "even brother like speak treat like aids patent "                                                                 
## [6] " per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune"
wordcloud(corpusSpam, max.words = 200, random.order = FALSE, col=rainbow(3))

wordcloud(corpusHam, max.words = 200, random.order = FALSE, col=rainbow(3))

Source: https://www.youtube.com/watch?v=Y7385dGRNLM / https://www.youtube.com/watch?v=jCrQYOsAcv4

Data: https://www.kaggle.com/uciml/sms-spam-collection-dataset/version/1#spam.csv