Project 4 - Data 607

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.

For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder).

One example corpus: https://spamassassin.apache.org/publiccorpus/

Setting a local working directory on the machine
Downloading the easy_ham and spam_2 corpora
Saving and untarring them for use.

# Project directory that holds the downloaded corpora.
# The path is assigned directly: setwd() returns the *previous* working
# directory, so capturing its return value only yields this path when the
# directory had already been set by an earlier call.
local_path <- "/Users/oohanne@us.ibm.com/Documents/CUNY/DATA607"
setwd(local_path)

#' Download one archive from the SpamAssassin public corpus and unpack it.
#'
#' @param file Archive file name; it is appended to the corpus base URL.
#' @param destfile Local file name the downloaded archive is saved under.
#' @return Whatever `untar()` returns for the saved archive.
download_untar <- function(file, destfile) {
  base_url <- "https://spamassassin.apache.org/old/publiccorpus/"
  archive_url <- paste0(base_url, file)
  download.file(archive_url, destfile = destfile)
  # No exdir is given, so untar() extracts relative to the working directory.
  untar(destfile)
}

# Fetch and unpack the spam_2 and easy_ham archives.
download_untar(
  file = "20050311_spam_2.tar.bz2",
  destfile = "spam_2.tar.gz"
)
download_untar(
  file = "20030228_easy_ham.tar.bz2",
  destfile = "easy_ham.tar.gz"
)

Functions:

Cleaning and filtering the ham and spam datasets
Building and processing term document matrix
Preparing the training and testing datasets
The split will be: 66% training and 34% testing

# Text transformations applied, in order, to every corpus.
# tolower is a plain character function, so it is wrapped in
# content_transformer() to keep the documents as tm text documents.
corpusfiles_cleaning_actions <- c(
  removePunctuation,
  stripWhitespace,
  content_transformer(tolower),
  removeNumbers
)

#' Apply every cleaning action to a corpus.
#'
#' Each action is applied to the result of the previous one, so all
#' transformations in `corpusfiles_cleaning_actions` take effect.
#'
#' @param corpus_files A tm corpus.
#' @return The corpus after all cleaning actions; the input corpus unchanged
#'   when the action list is empty.
filter_files <- function(corpus_files) {
  data <- corpus_files
  for (action in corpusfiles_cleaning_actions) {
    # Feed the accumulated result forward; re-reading corpus_files here would
    # throw away every transformation except the last one.
    data <- tm_map(data, action)
  }
  data
}

#' Build a term-document matrix for one corpus folder.
#'
#' Reads every file in `<path>/<type>`, cleans the corpus with
#' `filter_files()`, builds a TermDocumentMatrix and trims it with
#' `removeSparseTerms(..., 0.7)`.
#'
#' @param type Name of the corpus sub-directory (e.g. "easy_ham").
#' @param path Directory that contains the corpus sub-directory.
#' @return The trimmed term-document matrix, returned invisibly.
setTmd <- function(type, path) {
  corpus_dir <- sprintf("%s/%s", path, type)
  cleaned <- filter_files(Corpus(DirSource(directory = corpus_dir)))
  full_matrix <- TermDocumentMatrix(cleaned)
  invisible(removeSparseTerms(full_matrix, 0.7))
}

# Global placeholder kept for existing callers; df_vector() does not modify it.
df_testing <- c()

#' Split a long-form term/document table by document and summarise the
#' training part.
#'
#' Documents are shuffled; the first 66% (rounded) form the training set and
#' the remainder the testing set. The training rows are collapsed to one row
#' per term with the summed frequency and a constant `Type` column.
#'
#' @param df_data Data frame with columns `Terms`, `Docs` and `Freq`.
#' @param type Label written to the `Type` column of the result.
#' @param flag Unused; kept so existing calls keep working.
#' @return The summarised training table (`Terms`, `Freq`, `Type`). The
#'   held-out testing rows (unsummarised) are attached as attribute
#'   `"testing"` and can be read with `attr(result, "testing")`.
df_vector <- function(df_data, type, flag) {
  docs <- df_data$Docs
  doc_ids <- if (is.factor(docs)) levels(docs) else unique(as.character(docs))
  doc_ids <- sample(doc_ids)

  # seq_len() and setdiff() stay correct when the training share is empty or
  # covers every document; 1:k and (k + 1):n would count backwards there and
  # leak training documents (or NA) into the other split.
  n_train <- round(0.66 * length(doc_ids))
  training_ids <- doc_ids[seq_len(n_train)]
  testing_ids <- setdiff(doc_ids, training_ids)

  df_testing <- df_data %>%
    filter(Docs %in% testing_ids)

  df_training <- df_data %>%
    filter(Docs %in% training_ids) %>%
    select(Terms, Freq) %>%
    group_by(Terms) %>%
    summarise(Freq = sum(Freq)) %>%
    mutate(Type = type)

  # The assignment to df_testing above is local to this function, so the split
  # is exposed on the return value instead of being silently discarded.
  attr(df_training, "testing") <- df_testing
  df_training
}

Training ham and spam datasets
Add type column for both datasets
Combining them into one dataset
Replace NAs with 0s

# Term-document matrix for each corpus folder under local_path.
ham <- setTmd(type = "easy_ham", path = local_path)
spam <- setTmd(type = "spam_2", path = local_path)

# Long form: one row per Terms/Docs pair with its Freq.
ham_data <- as.data.frame(as.table(ham))
spam_data <- as.data.frame(as.table(spam))

# Per-term training frequencies for each class.
# NOTE: df_testing here is the global placeholder, not a held-out split;
# both *_testing objects are reassigned further down.
ham_training <- df_vector(ham_data, "ham", "training")
ham_testing <- df_testing
spam_training <- df_vector(spam_data, "spam", "training")
spam_testing <- df_testing

# One row per term with a ham column and a spam column; terms missing from one
# class come out of spread() as NA and are set to 0.
ham_spam_combined_training <- rbind(ham_training, spam_training) %>%
  spread(Type, Freq)
ham_spam_combined_training[is.na(ham_spam_combined_training)] <- 0
kable(head(ham_spam_combined_training))

Terms	ham	spam
admin	4076	0
and	6972	6754
archive	990	0
ascii	748	0
beenthere	988	0
bulk	1093	0

Rating spamness and hamness in the training dataset
Plot the results

# Share of each term's training occurrences that came from ham and from spam.
ham_spam_training <- ham_spam_combined_training %>%
  mutate(hamness = ham / (ham + spam)) %>%
  mutate(spamness = spam / (ham + spam))

kable(head(ham_spam_training))

Terms	ham	spam	hamness	spamness
admin	4076	0	1.0000000	0.0000000
and	6972	6754	0.5079411	0.4920589
archive	990	0	1.0000000	0.0000000
ascii	748	0	1.0000000	0.0000000
beenthere	988	0	1.0000000	0.0000000
bulk	1093	0	1.0000000	0.0000000

# Scatter plot of per-term spamness against hamness.
# Columns are named directly in aes(): `df$col` bypasses the layer's data and
# is discouraged by ggplot2. geom_point() inherits the mapping, so it is not
# repeated there.
ggplot(ham_spam_training, aes(x = spamness, y = hamness)) +
  geom_point()

Preparing for testing ham and spam datasets
Add type column for both datasets
Combining them into one dataset
Replace NAs with 0s
Plot the results

# Label every row of each corpus with its true class.
# NOTE(review): ham_data and spam_data contain all documents, so this set also
# includes the documents that went into training.
ham_testing <- mutate(ham_data, Actual_Type = "ham")
spam_testing <- mutate(spam_data, Actual_Type = "spam")

# Stack both classes and zero out any missing values.
ham_spam_testing <- rbind(ham_testing, spam_testing)
ham_spam_testing[is.na(ham_spam_testing)] <- 0

kable(head(ham_spam_testing))

Terms	Docs	Freq	Actual_Type
admin	1	4	ham
and	1	2	ham
archive	1	1	ham
ascii	1	1	ham
beenthere	1	1	ham
bulk	1	1	ham

# Term frequency per term across the labelled documents.
# Columns are named directly in aes() rather than via `df$col`, and the
# mapping is declared once and inherited by geom_point().
ggplot(ham_spam_testing, aes(x = Terms, y = Freq)) +
  geom_point()

Merging training and testing datasets into one dataset
Sort them by Docs
Plot the results

# Left-join the per-term hamness/spamness scores onto every labelled
# term/document row. TRUE is spelled out because `T` is an ordinary variable
# that can be reassigned.
ham_spam_merged <- merge(
  x = ham_spam_testing,
  y = ham_spam_training,
  by = "Terms",
  all.x = TRUE
)

# Order by document, drop the raw class counts, and weight each term's scores
# by how often the term occurs in the document.
ham_spam_merged <- ham_spam_merged %>%
  arrange(Docs) %>%
  select(-ham, -spam) %>%
  mutate(
    hamness_sum = Freq * hamness,
    spamness_sum = Freq * spamness
  )

kable(head(ham_spam_merged))

Terms	Docs	Freq	Actual_Type	hamness	spamness	hamness_sum	spamness_sum
about	1	0	ham	1.0000000	0.0000000	0.000000	0.0000000
admin	1	4	ham	1.0000000	0.0000000	4.000000	0.0000000
align	1	0	spam	0.0000000	1.0000000	0.000000	0.0000000
all	1	0	ham	0.4442069	0.5557931	0.000000	0.0000000
all	1	0	spam	0.4442069	0.5557931	0.000000	0.0000000
and	1	2	ham	0.5079411	0.4920589	1.015882	0.9841177

# Frequency-weighted hamness against spamness for every term/document row.
# The original top-level mapping passed the whole ham_spam_testing data frame
# as the x aesthetic; the columns that were actually plotted are mapped here.
ggplot(ham_spam_merged, aes(x = hamness_sum, y = spamness_sum)) +
  geom_point()

Project 4 - Data 607

Ohannes (Hovig) Ohannessian

4/13/2018