It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).

One example corpus: https://spamassassin.apache.org/old/publiccorpus/

# Project directory: the archives are downloaded and unpacked here, and
# setTmd() is later pointed at it to read the extracted corpus folders.
# The path is stored first and then used for setwd(); setwd() returns the
# *previous* working directory, so assigning its result is not reliable.
local_path <- "/Users/oohanne@us.ibm.com/Documents/CUNY/DATA607"
setwd(local_path)

#' Download one archive from the SpamAssassin public corpus and unpack it.
#'
#' @param file Archive file name; appended to the corpus base URL.
#' @param destfile Local path the archive is written to before extraction.
#' @return The value of `untar()`. The archive is extracted into the current
#'   working directory.
download_untar <- function(file, destfile) {
  url <- paste0("https://spamassassin.apache.org/old/publiccorpus/", file)
  # Request binary transfer explicitly so the compressed archive is written
  # byte-for-byte whatever the file name looks like.
  status <- download.file(url, destfile = destfile, mode = "wb")
  # download.file() reports failure through a non-zero status for some methods.
  if (!identical(as.integer(status), 0L)) {
    stop("Download failed for ", url, call. = FALSE)
  }
  untar(destfile)
}

# Fetch and unpack the spam and the ham archive into the working directory.
# NOTE(review): the local copies are named *.tar.gz although the remote
# archives are *.tar.bz2 - confirm whether the names should match.
download_untar(file = "20050311_spam_2.tar.bz2", destfile = "spam_2.tar.gz")
download_untar(file = "20030228_easy_ham.tar.bz2", destfile = "easy_ham.tar.gz")

Functions:

# Cleaning steps applied, in this order, to every corpus.
# tolower is a plain character function, so it is wrapped in
# content_transformer() to keep the documents valid tm text documents.
corpusfiles_cleaning_actions <- c(
  removePunctuation,
  stripWhitespace,
  content_transformer(tolower),
  removeNumbers
)

#' Apply every cleaning action to a corpus.
#'
#' @param corpus_files A tm corpus.
#' @return The corpus after all actions in `corpusfiles_cleaning_actions`
#'   have been applied one after another.
filter_files <- function(corpus_files) {
  cleaned <- corpus_files
  for (action in corpusfiles_cleaning_actions) {
    # Feed the result of the previous step into the next one; applying each
    # action to the untouched input would keep only the last action's effect.
    cleaned <- tm_map(cleaned, action)
  }
  cleaned
}

#' Build a term-document matrix for one corpus folder.
#'
#' @param type Name of the sub-directory of `path` holding the documents
#'   (the script uses "easy_ham" and "spam_2").
#' @param path Directory that contains the corpus sub-directories.
#' @param sparse Sparsity threshold handed to `removeSparseTerms()`.
#'   Defaults to 0.7, the value that used to be hard-coded.
#' @return The term-document matrix of the cleaned corpus with sparse terms
#'   removed.
setTmd <- function(type, path, sparse = 0.7) {
  corpus_dir <- file.path(path, type)
  corpus_files <- Corpus(DirSource(directory = corpus_dir))
  cleaned_files <- filter_files(corpus_files)
  tdm <- TermDocumentMatrix(cleaned_files)
  removeSparseTerms(tdm, sparse)
}

# Kept because script code further down reads `df_testing`. df_vector() never
# assigns to this global, so it stays NULL; request the held-out rows with
# flag = "testing" instead.
df_testing <- c()

#' Split a long-format term/document frequency table by document.
#'
#' The document ids (levels of `Docs`) are shuffled; the first 66% form the
#' training part and the remainder the testing part.
#'
#' @param df_data Data frame with columns `Terms`, `Docs` (a factor) and
#'   `Freq`.
#' @param type Class label written to the `Type` column of the training
#'   result.
#' @param flag "training" (default) returns the per-term frequency totals of
#'   the training documents; "testing" returns the unaggregated rows of the
#'   held-out documents. Any other value behaves like "training".
#' @param seed Optional seed passed to `set.seed()` before shuffling. Use the
#'   same seed for a "training" and a "testing" call to get matching halves of
#'   one split. `NULL` (default) leaves the RNG state alone.
#' @return For training: one row per term with columns `Terms`, `Freq`,
#'   `Type`. For testing: the rows of `df_data` for the held-out documents.
df_vector <- function(df_data, type, flag = "training", seed = NULL) {
  if (!is.null(seed)) {
    set.seed(seed)
  }
  doc_ids <- sample(levels(df_data$Docs))
  n_train <- round(0.66 * length(doc_ids))
  # seq_len() and setdiff() avoid a backwards `(k + 1):n` range, which would
  # pull training ids (and an NA) into the testing part when k equals n.
  train_ids <- doc_ids[seq_len(n_train)]
  test_ids <- setdiff(doc_ids, train_ids)

  if (identical(flag, "testing")) {
    return(filter(df_data, Docs %in% test_ids))
  }

  df_data %>%
    filter(Docs %in% train_ids) %>%
    select(Terms, Freq) %>%
    group_by(Terms) %>%
    summarise(Freq = sum(Freq)) %>%
    mutate(Type = type)
}
# Term-document matrices for both corpora, read from the extracted folders.
ham <- setTmd("easy_ham", local_path)
spam <- setTmd("spam_2", local_path)

# Long format: one row per (Terms, Docs) pair with its frequency.
ham_data <- as.data.frame(as.table(ham))
spam_data <- as.data.frame(as.table(spam))

# df_vector() shuffles the documents with sample(); fixing the seed makes the
# split, and every table derived from it, reproducible between runs.
set.seed(607)
ham_training <- df_vector(ham_data, "ham", "training")
spam_training <- df_vector(spam_data, "spam", "training")
# ham_testing / spam_testing are built further down from ham_data / spam_data.
# The global df_testing is never filled by df_vector(), so it is not copied
# here.

ham_spam_combined_training <- rbind(ham_training, spam_training)
# One row per term with a `ham` and a `spam` column; a term missing from one
# class gets a frequency of 0 instead of NA.
ham_spam_combined_training <- spread(
  ham_spam_combined_training,
  Type,
  Freq,
  fill = 0
)
kable(head(ham_spam_combined_training))
Terms ham spam
admin 4076 0
and 6972 6754
archive 990 0
ascii 748 0
beenthere 988 0
bulk 1093 0
# Share of each term's combined training frequency that comes from ham
# (hamness) and from spam (spamness).
# NOTE(review): a term whose ham and spam totals are both 0 yields NaN here.
ham_spam_training <- mutate(
  ham_spam_combined_training,
  hamness = ham / (ham + spam),
  spamness = spam / (ham + spam)
)

kable(head(ham_spam_training))
Terms ham spam hamness spamness
admin 4076 0 1.0000000 0.0000000
and 6972 6754 0.5079411 0.4920589
archive 990 0 1.0000000 0.0000000
ascii 748 0 1.0000000 0.0000000
beenthere 988 0 1.0000000 0.0000000
bulk 1093 0 1.0000000 0.0000000
# One point per term: spam share on x against ham share on y.
# Columns are referenced by bare name; the layer inherits data and mapping
# from ggplot(), so neither is repeated in geom_point().
ggplot(ham_spam_training, aes(x = spamness, y = hamness)) +
  geom_point()

# Tag every (Terms, Docs) row with the class of the corpus it came from.
# NOTE(review): these frames hold all rows of ham_data / spam_data, so they
# also contain the documents df_vector() sampled for training - confirm that
# this overlap is intended.
ham_testing <- mutate(ham_data, Actual_Type = "ham")
spam_testing <- mutate(spam_data, Actual_Type = "spam")

# Stack both classes and turn any missing value into 0.
ham_spam_testing <- rbind(ham_testing, spam_testing)
ham_spam_testing <- replace(ham_spam_testing, is.na(ham_spam_testing), 0)

kable(head(ham_spam_testing))
Terms Docs Freq Actual_Type
admin 1 4 ham
and 1 2 ham
archive 1 1 ham
ascii 1 1 ham
beenthere 1 1 ham
bulk 1 1 ham
# Frequency of every term in every testing row.
# Columns are referenced by bare name; the layer inherits data and mapping
# from ggplot(), so neither is repeated in geom_point().
ggplot(ham_spam_testing, aes(x = Terms, y = Freq)) +
  geom_point()

# Attach the per-term hamness / spamness learned from the training data to
# every testing row. all.x = TRUE keeps testing rows whose term is missing
# from the training table (their hamness / spamness are NA).
ham_spam_merged <- merge(
  x = ham_spam_testing,
  y = ham_spam_training,
  by = "Terms",
  all.x = TRUE
)
# Weight each term's hamness / spamness by its frequency in the row.
ham_spam_merged <- ham_spam_merged %>%
  arrange(Docs) %>%
  select(-ham, -spam) %>%
  mutate(
    hamness_sum = Freq * hamness,
    spamness_sum = Freq * spamness
  )

kable(head(ham_spam_merged))
Terms Docs Freq Actual_Type hamness spamness hamness_sum spamness_sum
about 1 0 ham 1.0000000 0.0000000 0.000000 0.0000000
admin 1 4 ham 1.0000000 0.0000000 4.000000 0.0000000
align 1 0 spam 0.0000000 1.0000000 0.000000 0.0000000
all 1 0 ham 0.4442069 0.5557931 0.000000 0.0000000
all 1 0 spam 0.4442069 0.5557931 0.000000 0.0000000
and 1 2 ham 0.5079411 0.4920589 1.015882 0.9841177
# Frequency-weighted hamness (x) against frequency-weighted spamness (y) for
# every merged row. The plot-level x used to be mapped to the whole
# ham_spam_testing data frame; both axes now map to columns of
# ham_spam_merged.
ggplot(ham_spam_merged, aes(x = hamness_sum, y = spamness_sum)) +
  geom_point()