Project 4: Document Classification

Overview

Setup the environment

Selecting the datasets

Overview

It can be useful to be able to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/

Setup the environment

library(stringr)
library(tm)
library(RTextTools)
library(tidyverse)
library(SnowballC)
library(knitr)
library(tidytext)
library(wordcloud)
library(caret)
library(gbm)
library(e1071)
library(naivebayes)

Selecting the datasets

# Remote archive used for the spam class (SpamAssassin public corpus).
url_spam <- "https://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"


# Remote archive used for the ham (non-spam) class, from the same corpus.
url_ham <- "https://spamassassin.apache.org/old/publiccorpus/20030228_hard_ham.tar.bz2"

Download and set up the datasets

# Download the spam archive into the working directory and list its members.
# The local file name is defined once and matches the remote name exactly
# (no stray space before ".tar.bz2"); mode = "wb" requests a binary transfer
# so the compressed archive is written unchanged on every platform.
spam_archive <- "20050311_spam_2.tar.bz2"
download.file(url_spam, destfile = spam_archive, mode = "wb")

# list = TRUE returns the member paths as a character vector; nothing is
# extracted by this call.
spam_file <- untar(spam_archive, list = TRUE)

length(spam_file)

## [1] 1398

# Fetch the ham archive into the working directory, then list (not extract)
# the paths it contains.
ham_archive <- "20030228_hard_ham.tar.bz2"
download.file(url = url_ham, destfile = ham_archive)

ham_file <- untar(tarfile = ham_archive, list = TRUE)

length(ham_file)

## [1] 252

Get the files from the local drive

# Directories on the local drive that the tm directory sources read from.
# NOTE(review): these absolute paths are machine-specific -- confirm or
# parameterise them before running the report elsewhere.
spam_dir <- "/Users/karimh/Documents/Google Drive/607 - Data Acquisition and Management/607 - Project4/emails/spam"
ham_dir <- "/Users/karimh/Documents/Google Drive/607 - Data Acquisition and Management/607 - Project4/emails/ham"

file_spam <- DirSource(directory = spam_dir)

file_ham <- DirSource(directory = ham_dir)

Create the Corpus for both spam and ham

# Build the spam corpus from the directory source using the plain-text reader,
# then report how many documents were loaded.
spam_corpus1 <- Corpus(
  file_spam,
  readerControl = list(reader = readPlain)
)
length(spam_corpus1)

## [1] 1397

# Build the ham corpus from the directory source using the plain-text reader,
# then report how many documents were loaded.
ham_corpus1 <- Corpus(
  file_ham,
  readerControl = list(reader = readPlain)
)
length(ham_corpus1)

## [1] 251

Spam Corpus Cleaning

# Normalise the spam corpus: lower-case the text, drop digits and punctuation,
# collapse runs of whitespace, then stem each word.
spam_corpus1 <- spam_corpus1 %>%
 tm_map(content_transformer(tolower)) %>% 
 tm_map(removeNumbers) %>% 
 tm_map(removePunctuation) %>%
 tm_map(stripWhitespace) %>%
 tm_map(stemDocument) %>% 
 # NOTE(review): this final step is base tolower() applied to the corpus
 # object itself, not through tm_map(). Base tolower() coerces its argument
 # with as.character(), so the value assigned here is a character vector
 # rather than a corpus -- confirm that the per-document boundaries are what
 # the later term counts expect.
 tolower()

# Re-wrap the character vector as a corpus so tm_map() can be applied again.
spam_corpus1 <- Corpus(VectorSource(spam_corpus1))
# Remove stop words (tm's default stopwords() list).
# NOTE(review): this runs after stemming, so a stop word whose stem differs
# from its listed form may not be removed -- confirm the intended order.
spam_corpus1 <- tm_map(spam_corpus1, removeWords, stopwords())

## Warning in tm_map.SimpleCorpus(spam_corpus1, removeWords, stopwords()):
## transformation drops documents

# Spam term frequencies: build the term-document matrix, sum each term's
# counts over all documents, and order the terms from most to least frequent.
x1 <- TermDocumentMatrix(spam_corpus1)
x2 <- as.matrix(x1)
x3 <- x2 %>%
  rowSums() %>%
  sort(decreasing = TRUE)
x <- data.frame(
  word = names(x3),
  frequency = x3
)
# Show the ten most frequent spam terms.
head(x, n = 10)

##          word frequency
## receiv receiv      7196
## size     size      4595
## jul       jul      4382
## font     font      3664
## widthd widthd      3547
## email   email      3260
## esmtp   esmtp      3139
## tabl     tabl      3117
## width   width      2862
## will     will      2617

Ham Corpus Cleaning

# Normalise the ham corpus: lower-case the text, drop digits and punctuation,
# collapse runs of whitespace, then stem each word.
ham_corpus1 <- ham_corpus1 %>%
 tm_map(content_transformer(tolower)) %>% 
 tm_map(removeNumbers) %>% 
 tm_map(removePunctuation) %>%
 tm_map(stripWhitespace) %>%
 tm_map(stemDocument) %>% 
 # NOTE(review): this final step is base tolower() applied to the corpus
 # object itself, not through tm_map(). Base tolower() coerces its argument
 # with as.character(), so the value assigned here is a character vector
 # rather than a corpus -- confirm that the per-document boundaries are what
 # the later term counts expect.
 tolower()

# Re-wrap the character vector as a corpus so tm_map() can be applied again.
ham_corpus1 <- Corpus(VectorSource(ham_corpus1))
# Remove stop words (tm's default stopwords() list).
# NOTE(review): this runs after stemming, so a stop word whose stem differs
# from its listed form may not be removed -- confirm the intended order.
ham_corpus1 <- tm_map(ham_corpus1, removeWords, stopwords())

## Warning in tm_map.SimpleCorpus(ham_corpus1, removeWords, stopwords()):
## transformation drops documents

# Ham term frequencies: build the term-document matrix, sum each term's
# counts over all documents, and order the terms from most to least frequent.
y1 <- TermDocumentMatrix(ham_corpus1)
# The dense matrix must come from the ham matrix y1. Using the spam matrix x1
# here would make the ham table an exact copy of the spam counts.
y2 <- as.matrix(y1)
y3 <- sort(rowSums(y2), decreasing = TRUE)
y <- data.frame(word = names(y3), frequency = y3)
# Keep only the 100 most frequent ham terms.
y4 <- head(y, 100)

Word Cloud for Spam

# Word cloud of the spam terms, sized by their corpus frequency.
# The frequencies are passed explicitly: when `freq` is omitted, wordcloud()
# treats `words` as raw text and re-tokenises it, so the counts computed in
# `x` would be ignored. `colors` is spelled out in full rather than relying on
# partial matching of `color`.
wordcloud(
  words = x$word,
  freq = x$frequency,
  max.words = 100,
  min.freq = 100,
  scale = c(1, .1),
  random.order = FALSE,
  rot.per = .5,
  colors = brewer.pal(8, "Dark2")
)

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

Word Cloud for Ham

# Word cloud of the top ham terms, sized by their corpus frequency.
# The frequencies are passed explicitly: when `freq` is omitted, wordcloud()
# treats `words` as raw text and re-tokenises it, so the counts computed in
# `y4` would be ignored. `colors` is spelled out in full rather than relying on
# partial matching of `color`.
wordcloud(
  words = y4$word,
  freq = y4$frequency,
  max.words = 100,
  min.freq = 100,
  scale = c(1, .1),
  random.order = FALSE,
  rot.per = .5,
  colors = brewer.pal(8, "Dark2")
)

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents

## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

Project 4: Document Classification

Karim Hammoud

2020-11-14

Overview

Setup the environment

Selecting the datasets