It can be useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/publiccorpus/
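If you start from the SpamAssassin public corpus instead of a pre-built CSV, each message arrives as a separate raw file. Below is a minimal sketch of turning two extracted folders into the same label/text layout used in this analysis; the folder names easy_ham and spam and the helper read_folder are assumptions, not part of the original workflow.
#Sketch only: build a label/text data frame from extracted SpamAssassin folders
#(folder names are assumptions; adjust to whichever archives you download)
read_folder <- function(path, label) {
  files <- list.files(path, full.names = TRUE)
  texts <- vapply(files,
                  function(f) paste(readLines(f, warn = FALSE), collapse = " "),
                  character(1))
  data.frame(v1 = label, v2 = texts, stringsAsFactors = FALSE)
}
#dsSA <- rbind(read_folder("easy_ham", "ham"), read_folder("spam", "spam"))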
library(tm)
library(knitr)
library(plyr)
library(wordcloud)
library(ggplot2)
library(lattice)
library(e1071)
library(caret)
library(quanteda)
library(irlba)
library(randomForest)
# Get the Ham and Spam data
dsHS <- read.csv(file="SpamHam.csv", header=TRUE, sep=",", stringsAsFactors=FALSE)
str(dsHS)
## 'data.frame': 5572 obs. of 5 variables:
## $ v1 : chr "ham" "ham" "spam" "ham" ...
## $ v2 : chr "Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..." "Ok lar... Joking wif u oni..." "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question("| __truncated__ "U dun say so early hor... U c already then say..." ...
## $ X : chr "" "" "" "" ...
## $ X.1: chr "" "" "" "" ...
## $ X.2: chr "" "" "" "" ...
#Check whether any rows have missing values
which(!complete.cases(dsHS))
## integer(0)
#Count number of ham/spam messages in our dataset
table(dsHS$v1)
##
## ham spam
## 4825 747
#Find the proportion of ham vs spam messages in our dataset
prop.table(table(dsHS$v1))
##
## ham spam
## 0.8659368 0.1340632
#dsHS$v2 <- as.character(dsHS$v2)
#Add message length (character count) as an exploratory feature
dsHS$v2Len <- nchar(dsHS$v2)
#Base R histogram of message length
hist(dsHS$v2Len)
#ggplot2 histogram of message length, faceted by class
ggplot(dsHS, aes(v2Len, fill=v1)) + geom_histogram(binwidth = 6) + facet_wrap(~v1)
#lattice histogram of message length by class
histogram(~v2Len|v1, data=dsHS)
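As a quick numeric complement to the histograms (this summary is an addition, not part of the original output), the length distributions can also be compared directly by class:
#Summary of message length (characters) by class
tapply(dsHS$v2Len, dsHS$v1, summary)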
#Build a tm corpus from the message text
corpus_hs <- Corpus(VectorSource(dsHS$v2))
corpus_hs
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5572
#Inspect corpus data
inspect(corpus_hs[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
## [2] Ok lar... Joking wif u oni...
## [3] Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
## [4] U dun say so early hor... U c already then say...
## [5] Nah I don't think he goes to usf, he lives around here though
#Clean up the corpus: lowercase, remove numbers, punctuation, stopwords, and extra whitespace
corpus_hs_clean <- tm_map(corpus_hs, tolower)
corpus_hs_clean <- tm_map(corpus_hs_clean, removeNumbers)
corpus_hs_clean <- tm_map(corpus_hs_clean, removePunctuation)
corpus_hs_clean <- tm_map(corpus_hs_clean, removeWords, stopwords())
corpus_hs_clean <- tm_map(corpus_hs_clean, stripWhitespace)
inspect(corpus_hs_clean[1:5])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 5
##
## [1] go jurong point crazy available bugis n great world la e buffet cine got amore wat
## [2] ok lar joking wif u oni
## [3] free entry wkly comp win fa cup final tkts st may text fa receive entry questionstd txt ratetcs apply overs
## [4] u dun say early hor u c already say
## [5] nah dont think goes usf lives around though
#Build a document-term matrix from the cleaned corpus
corpus_dtm <- DocumentTermMatrix(corpus_hs_clean)
inspect(corpus_dtm[1:10, 10:15])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 6/54
## Sparsity : 90%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs joking lar oni point wat world
## 1 0 0 0 1 1 1
## 10 0 0 0 0 0 0
## 2 1 1 1 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## 9 0 0 0 0 0 0
#Alternatively, apply the same cleanup through the control list when building the DTM; the result matches the matrix above
corpus_dtm_1 <- DocumentTermMatrix(corpus_hs_clean, control = list(tolower=TRUE, removeNumbers=TRUE, stopwords=TRUE, removePunctuation=TRUE, stripWhitespace=TRUE ))
inspect(corpus_dtm_1[1:10, 10:15])
## <<DocumentTermMatrix (documents: 10, terms: 6)>>
## Non-/sparse entries: 6/54
## Sparsity : 90%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs joking lar oni point wat world
## 1 0 0 0 1 1 1
## 10 0 0 0 0 0 0
## 2 1 1 1 0 0 0
## 3 0 0 0 0 0 0
## 4 0 0 0 0 0 0
## 5 0 0 0 0 0 0
## 6 0 0 0 0 0 0
## 7 0 0 0 0 0 0
## 8 0 0 0 0 0 0
## 9 0 0 0 0 0 0
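Both document-term matrices are over 90% sparse even on this ten-document slice, and far sparser over the full corpus. The modelling below uses a quanteda pipeline instead, but if the tm matrix were used directly, a common optional step is to drop very rare terms; the call below is only a sketch and is not used later in this analysis.
#Sketch: keep only terms that appear in at least ~1% of documents
corpus_dtm_small <- removeSparseTerms(corpus_dtm, sparse = 0.99)
dim(corpus_dtm_small)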
#Set the random seed for reproducibility
set.seed(32984)
#Stratified 70/30 split of the data on the class label
idx <- createDataPartition(dsHS$v1, times = 1, p=0.7, list = FALSE)
#70% training set
train <- dsHS[idx,]
#30% test set
test <- dsHS[-idx,]
#Verify that the class proportions in both splits match the original dataset
prop.table(table(train$v1))
##
## ham spam
## 0.8659318 0.1340682
prop.table(table(test$v1))
##
## ham spam
## 0.8659485 0.1340515
#Tokenize the training messages, dropping numbers, punctuation, symbols, separators, and hyphens
train_tkns <- tokens(train$v2, what="word", remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_separators = TRUE, remove_hyphens = TRUE)
#Convert tokens to lowercase
train_tkns <- tokens_tolower(train_tkns)
#Take a look at the stopword list before removing them
stopwords()
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very" "will"
#Remove stopwords
train_tkns <- tokens_select(train_tkns, stopwords(), selection="remove")
#Stemming
train_tkns <- tokens_wordstem(train_tkns, language="english")
#View sample data
#sample(train_tkns, size = 10)
#Create the bag-of-words model (document-feature matrix)
train_tkns_dfm <- dfm(train_tkns, tolower=FALSE)
train_tkns_matrix <- as.matrix(train_tkns_dfm)
#head(train_tkns_matrix)
dim(train_tkns_matrix)
## [1] 3901 5742
#Combine the class labels with the bag-of-words features (convert() also adds a document identifier column)
train_tkns_df <- cbind(labels=train$v1, convert(train_tkns_dfm, to = "data.frame"))
#train_tkns_df <- cbind(labels=train$v1, as.data.frame(train_tkns_dfm))
#Make syntactically valid column names to avoid errors with many R functions (e.g. 4txt is not a valid name)
names(train_tkns_df) <- make.names(names(train_tkns_df))
set.seed(48743)
#Since there is a class imbalance, let's create stratified folds for 10-fold cross-validation repeated 3 times (30 stratified resamples in total)
cv_folds <- createMultiFolds(train$v1, k=10, times=3)
cv_cntrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3, index = cv_folds)
#Use parallel processing to reduce training time
library(doSNOW)
start_time <- Sys.time()
#Run with 3 cores
clstr <- makeCluster(3, type="SOCK")
registerDoSNOW(clstr)
#Train a single CART decision tree (rpart), tuning 7 values of the complexity parameter
model_cv_1 <- train(labels ~ ., data=train_tkns_df, method="rpart", trControl=cv_cntrl, tuneLength=7)
stopCluster(clstr)
total_time <- Sys.time() - start_time
total_time
## Time difference of 22.7641 mins
model_cv_1
## CART
##
## 3901 samples
## 5743 predictors
## 2 classes: 'ham', 'spam'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 3511, 3510, 3511, 3511, 3511, 3511, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.02103250 0.9429179 0.7134030
## 0.02294455 0.9402689 0.6957674
## 0.02868069 0.9356553 0.6665729
## 0.03059273 0.9333472 0.6508580
## 0.03824092 0.9302709 0.6281573
## 0.05098789 0.9154072 0.5141434
## 0.32504780 0.8796808 0.1483505
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.0210325.
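The introduction promised predictions on withheld documents, so here is a sketch of how the 30% test split could be scored with the same quanteda pipeline. It assumes the document identifier column that convert() adds is dropped (from train_tkns_df before fitting as well, since it is not a term feature); it is a sketch, not output from the original run.
#Sketch only: tokenize the test messages exactly as the training messages
test_tkns <- tokens(test$v2, what="word", remove_numbers = TRUE, remove_punct = TRUE,
remove_symbols = TRUE, remove_separators = TRUE, remove_hyphens = TRUE)
test_tkns <- tokens_tolower(test_tkns)
test_tkns <- tokens_select(test_tkns, stopwords(), selection="remove")
test_tkns <- tokens_wordstem(test_tkns, language="english")
test_tkns_dfm <- dfm(test_tkns, tolower=FALSE)
#Align the test features to the training vocabulary
test_tkns_dfm <- dfm_match(test_tkns_dfm, featnames(train_tkns_dfm))
test_tkns_df <- convert(test_tkns_dfm, to = "data.frame")
#Drop the document identifier column; it is not a feature
test_tkns_df <- test_tkns_df[, !(names(test_tkns_df) %in% c("doc_id", "document"))]
names(test_tkns_df) <- make.names(names(test_tkns_df))
#Predict and compare against the true labels of the hold-out set
preds <- predict(model_cv_1, newdata = test_tkns_df)
confusionMatrix(preds, as.factor(test$v1), positive = "spam")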
#Custom function for regex-based text cleanup (used for the word clouds below)
corpusCleanData <- function(col) {
  corpusData <- col
  #Replace non-word characters (punctuation) with spaces
  corpusData <- gsub(pattern="\\W", replacement=" ", corpusData)
  #Remove digits
  corpusData <- gsub(pattern="\\d", replacement=" ", corpusData)
  #Lower case
  corpusData <- tolower(corpusData)
  #Remove stopwords
  corpusData <- removeWords(corpusData, stopwords("english"))
  #Remove single characters left over from the substitutions above
  corpusData <- gsub(pattern="\\b[A-Za-z]\\b", replacement=" ", corpusData)
  #Collapse extra whitespace and return the cleaned text
  corpusData <- stripWhitespace(corpusData)
  corpusData
}
#Split the dataset by class so spam and ham can be cleaned and visualized separately
dsHamSpam <- split(dsHS, dsHS$v1)
dsHam <- dsHamSpam[[1]]
dsSpam <- dsHamSpam[[2]]
corpusSpam <- corpusCleanData(dsSpam$v2)
corpusHam <- corpusCleanData(dsHam$v2)
head(corpusSpam)
## [1] "free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate apply "
## [2] "freemsg hey darling week now word back like fun still tb ok xxx std chgs send å rcv"
## [3] "winner valued network customer selected receivea å prize reward claim call claim code kl valid hours "
## [4] " mobile months entitled update latest colour mobiles camera free call mobile update co free "
## [5] "six chances win cash pounds txt csh send cost day days tsandcs apply reply hl info"
## [6] "urgent won week free membership å prize jackpot txt word claim www dbuk net lccltd pobox ldnw rw "
head(corpusHam)
## [1] "go jurong point crazy available bugis great world la buffet cine got amore wat "
## [2] "ok lar joking wif oni "
## [3] " dun say early hor already say "
## [4] "nah don think goes usf lives around though"
## [5] "even brother like speak treat like aids patent "
## [6] " per request melle melle oru minnaminunginte nurungu vettam set callertune callers press copy friends callertune"
#Word cloud of the 200 most frequent spam terms
wordcloud(corpusSpam, max.words = 200, random.order = FALSE, col=rainbow(3))
#Word cloud of the 200 most frequent ham terms
wordcloud(corpusHam, max.words = 200, random.order = FALSE, col=rainbow(3))
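The two clouds can also be combined into a single comparison cloud with wordcloud::comparison.cloud. The sketch below builds a two-column term matrix (one column per class) first; it is an addition and was not part of the original output.
#Sketch: side-by-side spam vs ham comparison cloud
tdm_both <- TermDocumentMatrix(Corpus(VectorSource(
  c(paste(corpusSpam, collapse = " "), paste(corpusHam, collapse = " ")))))
m_both <- as.matrix(tdm_both)
colnames(m_both) <- c("spam", "ham")
comparison.cloud(m_both, max.words = 100, random.order = FALSE)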
Sources: https://www.youtube.com/watch?v=Y7385dGRNLM and https://www.youtube.com/watch?v=jCrQYOsAcv4
Data: https://www.kaggle.com/uciml/sms-spam-collection-dataset/version/1#spam.csv