It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).
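One example corpus: https://spamassassin.apache.org/publiccorpus/ (the download code below fetches the archives from https://spamassassin.apache.org/old/publiccorpus/)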
# Project directory: the corpus archives are downloaded and unpacked here, and
# the corpus folders are later read from below this path.
local_path <- "/Users/oohanne@us.ibm.com/Documents/CUNY/DATA607"
# setwd() returns the *previous* working directory, so its return value is not
# used as the project path; the path is stored explicitly above.
setwd(local_path)
# Download one archive from the SpamAssassin public corpus and unpack it.
#
# file:     archive name below the corpus URL, e.g. "20030228_easy_ham.tar.bz2"
# destfile: local path the archive is written to before it is unpacked
#
# Returns the value of untar(destfile). Stops with an error when
# download.file() reports a non-zero status.
download_untar <- function(file, destfile) {
  url <- paste0("https://spamassassin.apache.org/old/publiccorpus/", file)
  # The archives are binary, so request a binary transfer explicitly instead
  # of relying on platform defaults.
  status <- download.file(url, destfile = destfile, mode = "wb")
  if (status != 0) {
    stop("Download failed (status ", status, "): ", url, call. = FALSE)
  }
  untar(destfile)
}
# Fetch and unpack the spam archive first, then the ham archive.
# First vector: archive names on the server; second: local file names.
invisible(Map(
  download_untar,
  c("20050311_spam_2.tar.bz2", "20030228_easy_ham.tar.bz2"),
  c("spam_2.tar.gz", "easy_ham.tar.gz")
))
Functions:
# Cleaning steps that filter_files() iterates over, in the order listed.
# NOTE(review): `tolower` is the base function, not a tm transformation; it may
# need wrapping (e.g. content_transformer) depending on the corpus class —
# confirm against the installed tm version.
corpusfiles_cleaning_actions <- list(
  removePunctuation,
  stripWhitespace,
  tolower,
  removeNumbers
)
# Apply every entry of `corpusfiles_cleaning_actions` to a corpus, in order.
#
# corpus_files: the corpus to clean (passed to tm_map()).
#
# Returns the corpus after all cleaning actions have been applied. Each action
# receives the output of the previous one, so the steps accumulate; with an
# empty action list the corpus is returned unchanged.
filter_files <- function(corpus_files) {
  cleaned <- corpus_files
  for (action in corpusfiles_cleaning_actions) {
    # Feed the running result back in; re-using `corpus_files` here would
    # discard every step except the last.
    cleaned <- tm_map(cleaned, action)
  }
  cleaned
}
# Build a term-document matrix for one corpus folder and drop sparse terms.
#
# type:   name of the corpus sub-directory (e.g. "easy_ham")
# path:   directory that contains the corpus sub-directory
# sparse: threshold forwarded to removeSparseTerms(); defaults to 0.7, the
#         value this function previously hard-coded
#
# Returns the result of removeSparseTerms() on the TermDocumentMatrix built
# from the cleaned corpus.
setTmd <- function(type, path, sparse = 0.7) {
  corpus_dir <- file.path(path, type)
  corpus_files <- Corpus(DirSource(directory = corpus_dir))
  cleaned_files <- filter_files(corpus_files)
  term_doc_matrix <- TermDocumentMatrix(cleaned_files)
  removeSparseTerms(term_doc_matrix, sparse)
}
# Global placeholder that later top-level code reads. df_vector() never
# assigns to it; request the held-out rows with flag = "testing" instead.
df_testing <- c()

# Randomly split a long-format term/document frequency table by document and
# return one side of the split.
#
# df_data: data frame with columns Terms, Docs and Freq
# type:    label stored in the Type column of the training result
# flag:    "testing" returns the held-out rows of df_data unchanged; any other
#          value (default "training") returns the aggregated training rows
#
# About 66% of the documents (rounded) go to training, the rest are held out.
# The shuffle is redone on every call, so call set.seed() with the same seed
# before a "training" and a "testing" call to get complementary partitions.
#
# Returns, for training, one row per term with the summed Freq over the
# training documents plus a Type column; for testing, the held-out rows.
df_vector <- function(df_data, type, flag = "training") {
  # Factor levels are what the split was originally drawn from; fall back to
  # the distinct values when Docs is not a factor.
  all_docs <- if (is.factor(df_data$Docs)) {
    levels(df_data$Docs)
  } else {
    unique(as.character(df_data$Docs))
  }
  doc_ids <- sample(all_docs)

  n_train <- round(0.66 * length(doc_ids))
  # head()/setdiff() stay correct when n_train is 0 or equals the number of
  # documents, where index ranges such as (n_train + 1):n would run backwards.
  training_ids <- head(doc_ids, n_train)
  testing_ids <- setdiff(doc_ids, training_ids)

  if (identical(flag, "testing")) {
    return(df_data %>% filter(Docs %in% testing_ids))
  }

  df_data %>%
    filter(Docs %in% training_ids) %>%
    select(Terms, Freq) %>%
    group_by(Terms) %>%
    summarise(Freq = sum(Freq)) %>%
    mutate(Type = type)
}
# Ham: term-document matrix -> long (Terms, Docs, Freq) table -> training rows.
ham <- setTmd(type = "easy_ham", path = local_path)
ham_data <- as.data.frame(as.table(ham))
ham_training <- df_vector(df_data = ham_data, type = "ham", flag = "training")
# NOTE(review): df_vector() does not assign to the global `df_testing`, so this
# copies whatever the global currently holds — confirm intent.
ham_testing <- df_testing

# Spam: same steps as for ham.
spam <- setTmd(type = "spam_2", path = local_path)
spam_data <- as.data.frame(as.table(spam))
spam_training <- df_vector(df_data = spam_data, type = "spam", flag = "training")
spam_testing <- df_testing
# Stack the per-class term counts, then widen to one row per term with a
# `ham` and a `spam` count column.
ham_spam_combined_training <- rbind(ham_training, spam_training) %>%
  spread(key = Type, value = Freq)
# Cells left NA by the reshape are set to 0.
ham_spam_combined_training[is.na(ham_spam_combined_training)] <- 0
kable(head(ham_spam_combined_training))
Terms | ham | spam |
---|---|---|
admin | 4076 | 0 |
and | 6972 | 6754 |
archive | 990 | 0 |
ascii | 748 | 0 |
beenthere | 988 | 0 |
bulk | 1093 | 0 |
# Per-term share of the combined count that comes from each class.
ham_spam_training <- mutate(
  ham_spam_combined_training,
  hamness = ham / (ham + spam),
  spamness = spam / (ham + spam)
)
kable(head(ham_spam_training))
Terms | ham | spam | hamness | spamness |
---|---|---|---|---|
admin | 4076 | 0 | 1.0000000 | 0.0000000 |
and | 6972 | 6754 | 0.5079411 | 0.4920589 |
archive | 990 | 0 | 1.0000000 | 0.0000000 |
ascii | 748 | 0 | 1.0000000 | 0.0000000 |
beenthere | 988 | 0 | 1.0000000 | 0.0000000 |
bulk | 1093 | 0 | 1.0000000 | 0.0000000 |
# Scatter of per-term spamness against hamness. Columns are referenced by bare
# name inside aes() so they are looked up in the plot data, and the layer
# inherits the plot-level data and mapping instead of repeating them.
ggplot(ham_spam_training, aes(x = spamness, y = hamness)) +
  geom_point()
# Label every row of the full ham and spam tables with its true class.
ham_testing <- mutate(ham_data, Actual_Type = "ham")
spam_testing <- mutate(spam_data, Actual_Type = "spam")

# Stack both classes into one table and set any NA cells to 0.
ham_spam_testing <- rbind(ham_testing, spam_testing)
ham_spam_testing[is.na(ham_spam_testing)] <- 0
kable(head(ham_spam_testing))
Terms | Docs | Freq | Actual_Type |
---|---|---|---|
admin | 1 | 4 | ham |
and | 1 | 2 | ham |
archive | 1 | 1 | ham |
ascii | 1 | 1 | ham |
beenthere | 1 | 1 | ham |
bulk | 1 | 1 | ham |
# Term frequency per (term, document) row of the testing table. Columns are
# referenced by bare name inside aes(), and the layer inherits the plot-level
# data and mapping instead of repeating them.
ggplot(ham_spam_testing, aes(x = Terms, y = Freq)) +
  geom_point()
# Attach the per-term hamness/spamness scores to every testing row. all.x keeps
# testing rows whose term has no training score. TRUE is spelled out because
# `T` is an ordinary variable that can be reassigned.
ham_spam_merged <- merge(
  x = ham_spam_testing,
  y = ham_spam_training,
  by = "Terms",
  all.x = TRUE
) %>%
  arrange(Docs) %>%
  select(-ham, -spam) %>%
  # Weight each term's scores by how often the term occurs in the document.
  mutate(hamness_sum = Freq * hamness, spamness_sum = Freq * spamness)
kable(head(ham_spam_merged))
Terms | Docs | Freq | Actual_Type | hamness | spamness | hamness_sum | spamness_sum |
---|---|---|---|---|---|---|---|
about | 1 | 0 | ham | 1.0000000 | 0.0000000 | 0.000000 | 0.0000000 |
admin | 1 | 4 | ham | 1.0000000 | 0.0000000 | 4.000000 | 0.0000000 |
align | 1 | 0 | spam | 0.0000000 | 1.0000000 | 0.000000 | 0.0000000 |
all | 1 | 0 | ham | 0.4442069 | 0.5557931 | 0.000000 | 0.0000000 |
all | 1 | 0 | spam | 0.4442069 | 0.5557931 | 0.000000 | 0.0000000 |
and | 1 | 2 | ham | 0.5079411 | 0.4920589 | 1.015882 | 0.9841177 |
# Frequency-weighted hamness against spamness for every merged row. The
# plot-level x aesthetic is the hamness_sum column (it was previously mapped
# to the whole ham_spam_testing data frame), and the layer inherits the
# plot-level data and mapping.
ggplot(ham_spam_merged, aes(x = hamness_sum, y = spamness_sum)) +
  geom_point()