Project 4:Document Classification

The purpose of this project is to prepare document data for classification algorithms.For this project, I will be classifying emails using the ham/spam dataset. I am going to use random fores as machine learning algorithm. After creating the model I will compare the accuracy and predictive abilities.

Loading Packages

library(tm)
library(SnowballC)
library(wordcloud)
library(tidyverse)
library(caret)
library(dplyr)
library(stringr)
library(tidyr)
library(corpus)
library(caTools)
library(randomForest)
library(kableExtra)
library(e1071)
library(NLP)

Reading and Preparing the data

ham_dir <- 'E:/ERINDA/projects/Project 4/easy_ham/'
spam_dir <- 'E:/ERINDA/projects/Project 4/spam_2/'

#Create 2 lists for spam and ham in order to create dataframes
Ham_Files <- list.files(ham_dir)
length_ham <- length(Ham_Files)
Spam_Files <- list.files(spam_dir)
length_spam <- length(Spam_Files)


ham_list <- NA
for(i in 1:length(Ham_Files))
{
  filepath<-paste0(ham_dir, "/", Ham_Files[1])  
  text <-readLines(filepath)
  list1<- list(paste(text, collapse="\n"))
  ham_list = c(ham_list,list1)
  
}

ham_df <-as.data.frame(unlist(ham_list),stringsAsFactors = FALSE)
ham_df$type <- "ham"
colnames(ham_df) <- c("text","type")




spam_list <- NA
for(i in 1:length(Spam_Files))
{
  filepath<-paste0(spam_dir, "/", Spam_Files[1])  
  text <-readLines(filepath)
  list1<- list(paste(text, collapse="\n"))
  spam_list = c(spam_list,list1)
  
}

spam_df<-as.data.frame(unlist(spam_list),stringsAsFactors = FALSE)
spam_df$type <- "spam"
colnames(spam_df) <- c("text","type")

Combine the two dataframes into one

combined_df <- rbind(ham_df, spam_df)

Creating a corpus and cleaning the data

corpus_ham = VCorpus(VectorSource(ham_df$text))
corpus_spam = VCorpus(VectorSource(spam_df$text))
corpus_all = VCorpus(VectorSource(combined_df$text))

corpus_ham_clean <- tm_map(corpus_ham, removePunctuation)
corpus_ham_clean <- tm_map(corpus_ham_clean, removeNumbers)
corpus_ham_clean <- tm_map(corpus_ham_clean, removeWords, stopwords())
corpus_ham_clean <- tm_map(corpus_ham_clean, stripWhitespace)


corpus_spam_clean <- tm_map(corpus_spam, removePunctuation)
corpus_spam_clean <- tm_map(corpus_spam_clean, removeNumbers)
corpus_spam_clean <- tm_map(corpus_spam_clean, removeWords, stopwords())
corpus_spam_clean <- tm_map(corpus_spam_clean, stripWhitespace)


corpus_all_clean <- tm_map(corpus_all, removePunctuation)
corpus_all_clean <- tm_map(corpus_all_clean, removeNumbers)
corpus_all_clean <- tm_map(corpus_all_clean, removeWords, stopwords())
corpus_all_clean <- tm_map(corpus_all_clean, stripWhitespace)

Create Document-term matrix and wordcloud

wordcloud(corpus_ham_clean, max.words = 30)

wordcloud(corpus_spam_clean, max.words = 30)

spam_subset_dtm <- corpus_spam_clean %>% DocumentTermMatrix()
spam_subset_dtm <- spam_subset_dtm %>% removeSparseTerms(1-(10/length(corpus_spam_clean)))
spam_subset_dtm

## <<DocumentTermMatrix (documents: 1399, terms: 249)>>
## Non-/sparse entries: 348102/249
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)

ham_subset_dtm <- corpus_ham_clean %>% DocumentTermMatrix()
ham_subset_dtm <- ham_subset_dtm %>% removeSparseTerms(1-(10/length(corpus_ham_clean)))
ham_subset_dtm

## <<DocumentTermMatrix (documents: 2552, terms: 173)>>
## Non-/sparse entries: 441323/173
## Sparsity           : 0%
## Maximal term length: 51
## Weighting          : term frequency (tf)

corpus_all_dtm <- DocumentTermMatrix(corpus_all_clean)
corpus_all_dtm <- removeSparseTerms(corpus_all_dtm, 0.90)
inspect(corpus_all_dtm)

## <<DocumentTermMatrix (documents: 3951, terms: 375)>>
## Non-/sparse entries: 789425/692200
## Sparsity           : 47%
## Maximal term length: 51
## Weighting          : term frequency (tf)
## Sample             :
##       Terms
## Docs   aug esmtp lbrace list localhost pick rbrace received thu
##   2554   8     4      0    4         3    0      0        7   0
##   2555   8     4      0    4         3    0      0        7   0
##   2556   8     4      0    4         3    0      0        7   0
##   2557   8     4      0    4         3    0      0        7   0
##   2558   8     4      0    4         3    0      0        7   0
##   2559   8     4      0    4         3    0      0        7   0
##   2560   8     4      0    4         3    0      0        7   0
##   2561   8     4      0    4         3    0      0        7   0
##   2562   8     4      0    4         3    0      0        7   0
##   2563   8     4      0    4         3    0      0        7   0
##       Terms
## Docs   tmdadeepeddyvirciocom
##   2554                     0
##   2555                     0
##   2556                     0
##   2557                     0
##   2558                     0
##   2559                     0
##   2560                     0
##   2561                     0
##   2562                     0
##   2563                     0

dtm_spam_freq <- sort(colSums(as.matrix(spam_subset_dtm)), decreasing = TRUE)
head(dtm_spam_freq, 20)

##              mlm              aug         received      information 
##            13980            11184             9786             8388 
##             will              fri      iluglinuxie            email 
##             8388             6990             6990             5592 
##            esmtp ilugadminlinuxie           letter             list 
##             5592             5592             5592             5592 
##    lughtuathaorg           people        receiving             send 
##             5592             5592             5592             5592 
##              the bettyjagessarcom             free        localhost 
##             5592             4194             4194             4194

dtm_ham_freq <- sort(colSums(as.matrix(ham_subset_dtm)), decreasing = TRUE)
head(dtm_ham_freq, 20)

##                        aug                        thu 
##                      33163                      30612 
##                   received                       pick 
##                      25510                      22959 
##                      esmtp                     lbrace 
##                      15306                      15306 
##                       list                     rbrace 
##                      15306                      15306 
##      tmdadeepeddyvirciocom      exmhworkersexamplecom 
##                      15306                      10204 
##                  localhost                   sequence 
##                      10204                      10204 
##                    subject                    command 
##                      10204                       7653 
##                      delta                        edt 
##                       7653                       7653 
## exmhworkersadminexamplecom       exmhworkersredhatcom 
##                       7653                       7653 
##                       from                        ftp 
##                       7653                       7653

#Machine Learning Modeling

model_train <- as.data.frame(as.matrix(corpus_all_dtm))
model_train$class = combined_df$type
#Shuffling the data and convert the dataframe into numeric
model_train <- model_train[sample(1:nrow(model_train)),]
model_train <- model_train %>% mutate(class = replace(class, class == "ham", 0))
model_train <- model_train %>% mutate(class = replace(class, class == "spam", 1))
model_train$class <- as.numeric(as.character(model_train$class))
model_train <- as.data.frame(apply(model_train, 2, as.numeric))

# splitting the dataframe into Training and Test
set.seed(200)
split = sample.split(model_train$class, SplitRatio = 0.75)
training = subset(model_train, split == TRUE)
testing = subset(model_train, split == FALSE)

nrow(training)

## [1] 2963

nrow(testing)

## [1] 988

num_obs <- ncol(training) - 1
num_obs

## [1] 375

I used Random Forest model to pull random values*

randomForest = randomForest(x = training[-num_obs],
                          y = training$class,
                          ntree = 3)

## Warning in randomForest.default(x = training[-num_obs], y =
## training$class, : The response has five or fewer unique values. Are you
## sure you want to do regression?

Prediction of the results

prediction = predict(randomForest, newdata = testing[-num_obs])
test_matrix <- table(prediction>0,testing$class)
test_matrix

##        
##           0   1
##   FALSE 638   0
##   TRUE    0 350

Define the accuracy of the model

validate <- test_matrix['TRUE', 2] + test_matrix['FALSE', 1] 
accuracy_model <- validate/nrow(testing) * 100
accuracy_model

## [1] 100

Project 4:Document Classification

Erinda Budo

11/17/2019