suppressWarnings(library(tm))
suppressWarnings(library(RTextTools))
I downloaded the emails from here and unzipped them twice using 7-Zip.
# list.files(dir) lists the files in a directory/folder, e.g.
# list.files("easy_ham") or list.files("spam").
# With full.names = TRUE the directory is prepended to every name, giving
# paths such as "easy_ham/2542.1a940f58e68fa8a84a3f83e30a624e8d".
# Unlike paste0("easy_ham/", list.files("easy_ham")), this yields
# character(0) -- not the bogus path "easy_ham/" -- when a folder is empty.
easy_ham_files <- list.files("easy_ham", full.names = TRUE)
spam_files <- list.files("spam", full.names = TRUE)
hard_ham_files <- list.files("hard_ham", full.names = TRUE)
# Peek at the first two paths to see how they are organized.
head(easy_ham_files, 2)
## [1] "easy_ham/0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "easy_ham/0002.b3120c4bcbf3101e661161ee7efcb8bf"
# Read one email file and return its whole content as a single string.
#
# x: a file path (or connection) accepted by readLines().
# Returns a length-one character vector: the lines of the file joined by a
# single space.
read_emails <- function(x) {
  # warn = FALSE: some corpus files have no trailing newline. readLines()
  # still returns their last line, so the "incomplete final line" warning
  # is only noise here.
  paste(readLines(x, warn = FALSE), collapse = " ")
}
# *_files hold the file paths; *_corp hold the full text of each email,
# one string per file.
# vapply() checks that read_emails() returns exactly one string per file and
# gives character(0) (rather than NULL) for an empty file list.
# USE.NAMES = FALSE keeps the vectors unnamed, as unlist(lapply(...)) did.
easy_ham_corp <- vapply(
  easy_ham_files, read_emails, character(1), USE.NAMES = FALSE
)
spam_corp <- vapply(
  spam_files, read_emails, character(1), USE.NAMES = FALSE
)
hard_ham_corp <- vapply(
  hard_ham_files, read_emails, character(1), USE.NAMES = FALSE
)
## Warning in readLines(x): incomplete final line found on 'hard_ham/
## 0231.7c6cc716ce3f3bfad7130dd3c8d7b072'
## Warning in readLines(x): incomplete final line found on 'hard_ham/
## 0250.7c6cc716ce3f3bfad7130dd3c8d7b072'
# One data frame per corpus, with the email text in a `text` column.
# stringsAsFactors = FALSE keeps `text` a character column on every R version
# (before R 4.0 data.frame() turned strings into factors by default) and
# matches the data.frame() calls used for the model results further down.
easy_ham_frame <- data.frame(text = easy_ham_corp, stringsAsFactors = FALSE)
spam_frame <- data.frame(text = spam_corp, stringsAsFactors = FALSE)
hard_ham_frame <- data.frame(text = hard_ham_corp, stringsAsFactors = FALSE)
Store the actual classification of each email (so we can compare against it later): 0 is spam, 1 is not spam.
# Attach the known class to every row: 1 for ham (easy and hard), 0 for spam.
# The scalar on the right-hand side is recycled over all rows of the frame.
easy_ham_frame[["outcome"]] <- 1
hard_ham_frame[["outcome"]] <- 1
spam_frame[["outcome"]] <- 0
Look at the statements below closely (we will need to revisit them later). Here we learn from the easy ham emails (plus spam) and then validate the model against the hard ham emails.
# Training records first (easy ham + spam), then the records to be validated
# (hard ham). Row order matters: the rows are later split into training and
# test sets by position.
classified_emails <- rbind(easy_ham_frame, spam_frame)
emails <- rbind(classified_emails, hard_ham_frame)

# Normalize the text. Every step below is vectorized over the whole column,
# so no per-element lapply()/unlist() round trip is needed and the stop-word
# list is built once instead of once per email.
# sub = "": without it iconv() returns NA for any email containing bytes it
# cannot convert, discarding the whole message; with it only the offending
# bytes are dropped.
emails$text <- iconv(emails$text, to = "UTF-8", sub = "")
emails$text <- tolower(emails$text)
emails$text <- removePunctuation(emails$text)
emails$text <- removeWords(emails$text, stopwords("english"))
emails$text <- stemDocument(emails$text)
Make the corpus and the document-term matrix.
# VectorSource() treats every element of the character vector as one document;
# the resulting corpus is then turned into a document-term matrix.
email_corp <- Corpus(VectorSource(emails[["text"]]))
document_term_matrix <- DocumentTermMatrix(x = email_corp)
Sparsity here is a threshold on how often a term may be absent: a term is removed when the fraction of documents that do not contain it is above the threshold. Still confused? (I was.) Then read the simple explanation here.
Here we use 97%, so a term is kept only if it appears in at least roughly 3% of the emails; terms rarer than that are dropped.
# Keep only the terms whose sparsity is below 0.97.
document_term_matrix <- removeSparseTerms(document_term_matrix, sparse = 0.97)
# Uncomment to inspect the dense matrix:
#as.matrix(document_term_matrix)
# About the `virgin` flag of create_container():
#   FALSE - all data in the training and testing sets have corresponding
#           labels.
#   TRUE  - the testing set is unclassified data with no known true values.
#document_term_matrix <- rbind(document_term_matrix, hard_ham_frame)
# Number of labelled training rows; rows after this position form the test set.
classfiedThresold <- dim(classified_emails)[1L]
Create the container and do the actual classification work.
# Split the rows by position: the first classfiedThresold rows are the
# training emails, all remaining rows are the emails to classify.
# Guard first: if either part were empty, 1:n or (n + 1):m would silently
# count backwards and pick the wrong rows.
stopifnot(classfiedThresold >= 1, classfiedThresold < nrow(emails))
test_rows <- (classfiedThresold + 1):nrow(emails)
container <- create_container(
  document_term_matrix,
  labels = emails$outcome,
  trainSize = seq_len(classfiedThresold),
  testSize = test_rows,
  virgin = FALSE  # the test rows have known labels as well
)
# Train the "TREE" model on the training rows, then classify the test rows.
tree_trainer <- train_model(container, "TREE")
tree_output <- classify_model(container, tree_trainer)
# Put the true label of each test email next to the predicted one (the first
# column of the classifier output, converted to character).
model_performance <- data.frame(
  correct_label = emails$outcome[test_rows],
  tree = as.character(tree_output[, 1]),
  stringsAsFactors = FALSE
)
# Share of test emails predicted wrongly (FALSE) and correctly (TRUE).
prop.table(table(model_performance$correct_label == model_performance$tree))
##
## FALSE TRUE
## 0.748 0.252
Why was the result so bad? Where did things go wrong? As set up in the “Mix them up” section, we used the easy emails as our training data. So, basically, we trained our model the easy way and then put it into practice on hard data. As someone said, “The more you sweat in training, the less you bleed in war”. If we do it the other way around, i.e. train on the hard emails and then validate against the easy emails, the result is different.
# Second experiment: train on hard ham + spam, validate on easy ham.
# Training rows come first; the rows are later split into training and test
# sets by position.
classified_emails <- rbind(hard_ham_frame, spam_frame)
emailsnew <- rbind(classified_emails, easy_ham_frame)

# Normalize the text. Every step below is vectorized over the whole column,
# so no per-element lapply()/unlist() round trip is needed and the stop-word
# list is built once instead of once per email.
# sub = "": without it iconv() returns NA for any email containing bytes it
# cannot convert, discarding the whole message; with it only the offending
# bytes are dropped.
emailsnew$text <- iconv(emailsnew$text, to = "UTF-8", sub = "")
emailsnew$text <- tolower(emailsnew$text)
emailsnew$text <- removePunctuation(emailsnew$text)
emailsnew$text <- removeWords(emailsnew$text, stopwords("english"))
emailsnew$text <- stemDocument(emailsnew$text)
# Rebuild the corpus (one document per email) and the document-term matrix
# for the re-ordered data, again keeping only terms with sparsity below 0.97.
email_corp <- Corpus(VectorSource(emailsnew[["text"]]))
document_term_matrix <- removeSparseTerms(
  DocumentTermMatrix(x = email_corp),
  sparse = 0.97
)
# Number of labelled training rows; rows after this position form the test set.
classfiedNewThresold <- dim(classified_emails)[1L]
# Split the rows by position: the first classfiedNewThresold rows are the
# training emails, all remaining rows are the emails to classify.
# Guard first: if either part were empty, 1:n or (n + 1):m would silently
# count backwards and pick the wrong rows.
stopifnot(classfiedNewThresold >= 1, classfiedNewThresold < nrow(emailsnew))
test_rows <- (classfiedNewThresold + 1):nrow(emailsnew)
container <- create_container(
  document_term_matrix,
  labels = emailsnew$outcome,
  trainSize = seq_len(classfiedNewThresold),
  testSize = test_rows,
  virgin = FALSE  # the test rows have known labels as well
)
# Train the "TREE" model on the training rows, then classify the test rows.
tree_trainer <- train_model(container, "TREE")
tree_output <- classify_model(container, tree_trainer)
# Put the true label of each test email next to the predicted one (the first
# column of the classifier output, converted to character).
model_performance <- data.frame(
  correct_label = emailsnew$outcome[test_rows],
  tree = as.character(tree_output[, 1]),
  stringsAsFactors = FALSE
)
# Share of test emails predicted wrongly (FALSE) and correctly (TRUE).
prop.table(table(model_performance$correct_label == model_performance$tree))
##
## FALSE TRUE
## 0.1348491 0.8651509