It can be useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
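If you start from the SpamAssassin public corpus instead of a pre-built CSV, each message arrives as a separate raw file. Below is a minimal sketch of turning two extracted folders into the same label/text layout used in this analysis; the folder names easy_ham and spam and the helper read_folder are assumptions, not part of the original workflow.
#Sketch only: build a label/text data frame from extracted SpamAssassin folders
#(folder names are assumptions; adjust to whichever archives you download)
read_folder <- function(path, label) {
  files <- list.files(path, full.names = TRUE)
  texts <- vapply(files,
                  function(f) paste(readLines(f, warn = FALSE), collapse = " "),
                  character(1))
  data.frame(v1 = label, v2 = texts, stringsAsFactors = FALSE)
}
#dsSA <- rbind(read_folder("easy_ham", "ham"), read_folder("spam", "spam"))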
library(tm)
library(knitr)
library(plyr)
library(wordcloud)
library(ggplot2)
library(lattice)
library(e1071)
library(caret)
library(quanteda)
library(irlba)
library(randomForest)
# Get the Ham and Spam data
dsHS <- read.csv(file="SpamHam.csv", header=TRUE, sep=",", stringsAsFactors=FALSE)
str(dsHS)
## 'data.frame': 5572 obs. of 5 variables:
## $ v1 : chr "ham" "ham" "spam" "ham" ...
## $ v2 : chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
## $ X : chr "" "" "" "" ...
## $ X.1: chr "" "" "" "" ...
## $ X.2: chr "" "" "" "" ...
#Check whether any rows have missing values
which(!complete.cases(dsHS))
## integer(0)
#Count number of ham/spam messages in our dataset
table(dsHS$v1)
##
## ham spam
## 4825 747
#Find the proportion of ham vs spam messages in our dataset
prop.table(table(dsHS$v1))
##
## ham spam
## 0.8659368 0.1340632
#dsHS$v2 <- as.character(dsHS$v2)
#Add message length (character count) as an exploratory feature
dsHS$v2Len <- nchar(dsHS$v2)
#Base R histogram of message length
hist(dsHS$v2Len)
#ggplot2 histogram of message length, faceted by class
ggplot(dsHS, aes(v2Len, fill=v1)) + geom_histogram(binwidth = 6) + facet_wrap(~v1)
#lattice histogram of message length by class
histogram(~v2Len|v1, data=dsHS)
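As a quick numeric complement to the histograms (this summary is an addition, not part of the original output), the length distributions can also be compared directly by class:
#Summary of message length (characters) by class
tapply(dsHS$v2Len, dsHS$v1, summary)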
#Build a tm corpus from the message text
corpus_hs <- Corpus(VectorSource(dsHS$v2))
corpus_hs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5572
#Inspect corpus data
inspect(corpus_hs[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## [2] Ok lar... Joking wif u oni...
## [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
## [4] U dun say so early hor... U c already then say...
## [5] Nah I don't think he goes to usf, he lives around here though
#Clean up the corpus: lowercase, remove numbers, punctuation, stopwords, and extra whitespace
corpus_hs_clean <- tm_map(corpus_hs, tolower)
corpus_hs_clean <- tm_map(corpus_hs_clean, removeNumbers)
corpus_hs_clean <- tm_map(corpus_hs_clean, removePunctuation)
corpus_hs_clean <- tm_map(corpus_hs_clean, removeWords, stopwords())
corpus_hs_clean <- tm_map(corpus_hs_clean, stripWhitespace)
inspect(corpus_hs_clean[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat
## [2] ok lar joking wif u oni
## [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply overs
## [4] u dun say early hor u c already say
## [5] nah dont think goes usf lives around though
#Build a document-term matrix from the cleaned corpus
corpus_dtm <- DocumentTermMatrix(corpus_hs_clean)
inspect(corpus_dtm[1:10, 10:15])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 6/54
## Sparsity : 90%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs joking lar oni point wat world
## 1 0 0 0 1 1 1
## 10 0 0 0 0 0 0
## 2 1 1 1 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## 9 0 0 0 0 0 0
#Alternatively, apply the same cleanup through the control list when building the DTM; the result matches the matrix above
corpus_dtm_1 <- DocumentTermMatrix(corpus_hs_clean, control = list(tolower=TRUE, removeNumbers=TRUE, stopwords=TRUE, removePunctuation=TRUE, stripWhitespace=TRUE ))
inspect(corpus_dtm_1[1:10, 10:15])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 6/54
## Sparsity : 90%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs joking lar oni point wat world
## 1 0 0 0 1 1 1
## 10 0 0 0 0 0 0
## 2 1 1 1 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## 9 0 0 0 0 0 0
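Both document-term matrices are over 90% sparse even on this ten-document slice, and far sparser over the full corpus. The modelling below uses a quanteda pipeline instead, but if the tm matrix were used directly, a common optional step is to drop very rare terms; the call below is only a sketch and is not used later in this analysis.
#Sketch: keep only terms that appear in at least ~1% of documents
corpus_dtm_small <- removeSparseTerms(corpus_dtm, sparse = 0.99)
dim(corpus_dtm_small)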
#Set the random seed for reproducibility
set.seed(32984)
#Stratified 70/30 split of the data on the class label
idx <- createDataPartition(dsHS$v1, times = 1, p=0.7, list = FALSE)
#70% training set
train <- dsHS[idx,]
#30% test set
test <- dsHS[-idx,]
#Verify that the class proportions in both splits match the original dataset
prop.table(table(train$v1))
##
## ham spam
## 0.8659318 0.1340682
prop.table(table(test$v1))
##
## ham spam
## 0.8659485 0.1340515
#Tokenize the training messages, dropping numbers, punctuation, symbols, separators, and hyphens
train_tkns <- tokens(train$v2, what="word", remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_separators = TRUE, remove_hyphens = TRUE)
#Convert tokens to lowercase
train_tkns <- tokens_tolower(train_tkns)
#Take a look at the stopword list before removing them
stopwords()
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very" "will"
#Remove stopwords
train_tkns <- tokens_select(train_tkns, stopwords(), selection="remove")
#Stemming
train_tkns <- tokens_wordstem(train_tkns, language="english")
#View sample data
#sample(train_tkns, size = 10)
#Create the bag-of-words model (document-feature matrix)
train_tkns_dfm <- dfm(train_tkns, tolower=FALSE)
train_tkns_matrix <- as.matrix(train_tkns_dfm)
#head(train_tkns_matrix)
dim(train_tkns_matrix)
## [1] 3901 5742
#Combine the class labels with the bag-of-words features (convert() also adds a document identifier column)
train_tkns_df <- cbind(labels=train$v1, convert(train_tkns_dfm, to = "data.frame"))
#train_tkns_df <- cbind(labels=train$v1, as.data.frame(train_tkns_dfm))
#Make syntactically valid column names to avoid errors with many R functions (e.g. 4txt is not a valid name)
names(train_tkns_df) <- make.names(names(train_tkns_df))
set.seed(48743)
#Since there is a class imbalance, let's create stratified folds for 10-fold cross-validation repeated 3 times (30 stratified resamples in total)
cv_folds <- createMultiFolds(train$v1, k=10, times=3)
cv_cntrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, index = cv_folds)
#Use parallel processing to reduce training time
library(doSNOW)
start_time <- Sys.time()
#Run with 3 cores
clstr <- makeCluster(3, type="SOCK")
registerDoSNOW(clstr)
#Train a single CART decision tree (rpart), tuning 7 values of the complexity parameter
model_cv_1 <- train(labels ~ ., data=train_tkns_df, method="rpart", trControl=cv_cntrl, tuneLength=7)
stopCluster(clstr)
total_time <- Sys.time() - start_time
total_time
## Time difference of 22.7641 mins
model_cv_1
## CART
##
## 3901 samples
## 5743 predictors
## 2 classes: 'ham', 'spam'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3511, 3510, 3511, 3511, 3511, 3511, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.02103250 0.9429179 0.7134030
## 0.02294455 0.9402689 0.6957674
## 0.02868069 0.9356553 0.6665729
## 0.03059273 0.9333472 0.6508580
## 0.03824092 0.9302709 0.6281573
## 0.05098789 0.9154072 0.5141434
## 0.32504780 0.8796808 0.1483505
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0210325.
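The introduction promised predictions on withheld documents, so here is a sketch of how the 30% test split could be scored with the same quanteda pipeline. It assumes the document identifier column that convert() adds is dropped (from train_tkns_df before fitting as well, since it is not a term feature); it is a sketch, not output from the original run.
#Sketch only: tokenize the test messages exactly as the training messages
test_tkns <- tokens(test$v2, what="word", remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_separators = TRUE, remove_hyphens = TRUE)
test_tkns <- tokens_tolower(test_tkns)
test_tkns <- tokens_select(test_tkns, stopwords(), selection="remove")
test_tkns <- tokens_wordstem(test_tkns, language="english")
test_tkns_dfm <- dfm(test_tkns, tolower=FALSE)
#Align the test features to the training vocabulary
test_tkns_dfm <- dfm_match(test_tkns_dfm, featnames(train_tkns_dfm))
test_tkns_df <- convert(test_tkns_dfm, to = "data.frame")
#Drop the document identifier column; it is not a feature
test_tkns_df <- test_tkns_df[, !(names(test_tkns_df) %in% c("doc_id", "document"))]
names(test_tkns_df) <- make.names(names(test_tkns_df))
#Predict and compare against the true labels of the hold-out set
preds <- predict(model_cv_1, newdata = test_tkns_df)
confusionMatrix(preds, as.factor(test$v1), positive = "spam")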
#Custom function for regex-based text cleanup (used for the word clouds below)
corpusCleanData <- function(col) {
  corpusData <- col
  #Replace non-word characters (punctuation) with spaces
  corpusData <- gsub(pattern="\\W", replacement=" ", corpusData)
  #Remove digits
  corpusData <- gsub(pattern="\\d", replacement=" ", corpusData)
  #Lower case
  corpusData <- tolower(corpusData)
  #Remove stopwords
  corpusData <- removeWords(corpusData, stopwords("english"))
  #Remove single characters left over from the substitutions above
  corpusData <- gsub(pattern="\\b[A-Za-z]\\b", replacement=" ", corpusData)
  #Collapse extra whitespace and return the cleaned text
  corpusData <- stripWhitespace(corpusData)
  corpusData
}
#Split the dataset by class so spam and ham can be cleaned and visualized separately
dsHamSpam <- split(dsHS, dsHS$v1)
dsHam <- dsHamSpam[[1]]
dsSpam <- dsHamSpam[[2]]
corpusSpam <- corpusCleanData(dsSpam$v2)
corpusHam <- corpusCleanData(dsHam$v2)
head(corpusSpam)
## [1] "free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate apply "
## [2] "freemsg hey darling week now word back like fun still tb ok xxx std chgs send å rcv"
## [3] "winner valued network customer selected receivea å prize reward claim call claim code kl valid hours "
## [4] " mobile months entitled update latest colour mobiles camera free call mobile update co free "
## [5] "six chances win cash pounds txt csh send cost day days tsandcs apply reply hl info"
## [6] "urgent won week free membership å prize jackpot txt word claim www dbuk net lccltd pobox ldnw rw "
head(corpusHam)
## [1] "go jurong point crazy available bugis great world la buffet cine got amore wat "
## [2] "ok lar joking wif oni "
## [3] " dun say early hor already say "
## [4] "nah don think goes usf lives around though"
## [5] "even brother like speak treat like aids patent "
## [6] " per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune"
#Word cloud of the 200 most frequent spam terms
wordcloud(corpusSpam, max.words = 200, random.order = FALSE, col=rainbow(3))
#Word cloud of the 200 most frequent ham terms
wordcloud(corpusHam, max.words = 200, random.order = FALSE, col=rainbow(3))
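The two clouds can also be combined into a single comparison cloud with wordcloud::comparison.cloud. The sketch below builds a two-column term matrix (one column per class) first; it is an addition and was not part of the original output.
#Sketch: side-by-side spam vs ham comparison cloud
tdm_both <- TermDocumentMatrix(Corpus(VectorSource(
  c(paste(corpusSpam, collapse = " "), paste(corpusHam, collapse = " ")))))
m_both <- as.matrix(tdm_both)
colnames(m_both) <- c("spam", "ham")
comparison.cloud(m_both, max.words = 100, random.order = FALSE)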
Sources: https://www.youtube.com/watch?v=Y7385dGRNLM and https://www.youtube.com/watch?v=jCrQYOsAcv4
Data: https://www.kaggle.com/uciml/sms-spam-collection-dataset/version/1#spam.csv