The purpose of this project is to prepare document data for classification algorithms.For this project, I will be classifying emails using the ham/spam dataset. I am going to use random fores as machine learning algorithm. After creating the model I will compare the accuracy and predictive abilities.
Loading Packages
library(tm)
library(SnowballC)
library(wordcloud)
library(tidyverse)
library(caret)
library(dplyr)
library(stringr)
library(tidyr)
library(corpus)
library(caTools)
library(randomForest)
library(kableExtra)
library(e1071)
library(NLP)
Reading and Preparing the data
ham_dir <- 'E:/ERINDA/projects/Project 4/easy_ham/'
spam_dir <- 'E:/ERINDA/projects/Project 4/spam_2/'
#Create 2 lists for spam and ham in order to create dataframes
Ham_Files <- list.files(ham_dir)
length_ham <- length(Ham_Files)
Spam_Files <- list.files(spam_dir)
length_spam <- length(Spam_Files)
ham_list <- NA
for(i in 1:length(Ham_Files))
{
filepath<-paste0(ham_dir, "/", Ham_Files[1])
text <-readLines(filepath)
list1<- list(paste(text, collapse="\n"))
ham_list = c(ham_list,list1)
}
ham_df <-as.data.frame(unlist(ham_list),stringsAsFactors = FALSE)
ham_df$type <- "ham"
colnames(ham_df) <- c("text","type")
spam_list <- NA
for(i in 1:length(Spam_Files))
{
filepath<-paste0(spam_dir, "/", Spam_Files[1])
text <-readLines(filepath)
list1<- list(paste(text, collapse="\n"))
spam_list = c(spam_list,list1)
}
spam_df<-as.data.frame(unlist(spam_list),stringsAsFactors = FALSE)
spam_df$type <- "spam"
colnames(spam_df) <- c("text","type")
Combine the two dataframes into one
combined_df <- rbind(ham_df, spam_df)
Creating a corpus and cleaning the data
corpus_ham = VCorpus(VectorSource(ham_df$text))
corpus_spam = VCorpus(VectorSource(spam_df$text))
corpus_all = VCorpus(VectorSource(combined_df$text))
corpus_ham_clean <- tm_map(corpus_ham, removePunctuation)
corpus_ham_clean <- tm_map(corpus_ham_clean, removeNumbers)
corpus_ham_clean <- tm_map(corpus_ham_clean, removeWords, stopwords())
corpus_ham_clean <- tm_map(corpus_ham_clean, stripWhitespace)
corpus_spam_clean <- tm_map(corpus_spam, removePunctuation)
corpus_spam_clean <- tm_map(corpus_spam_clean, removeNumbers)
corpus_spam_clean <- tm_map(corpus_spam_clean, removeWords, stopwords())
corpus_spam_clean <- tm_map(corpus_spam_clean, stripWhitespace)
corpus_all_clean <- tm_map(corpus_all, removePunctuation)
corpus_all_clean <- tm_map(corpus_all_clean, removeNumbers)
corpus_all_clean <- tm_map(corpus_all_clean, removeWords, stopwords())
corpus_all_clean <- tm_map(corpus_all_clean, stripWhitespace)
Create Document-term matrix and wordcloud
wordcloud(corpus_ham_clean, max.words = 30)
wordcloud(corpus_spam_clean, max.words = 30)
spam_subset_dtm <- corpus_spam_clean %>% DocumentTermMatrix()
spam_subset_dtm <- spam_subset_dtm %>% removeSparseTerms(1-(10/length(corpus_spam_clean)))
spam_subset_dtm
## <<DocumentTermMatrix (documents: 1399, terms: 249)>>
## Non-/sparse entries: 348102/249
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
ham_subset_dtm <- corpus_ham_clean %>% DocumentTermMatrix()
ham_subset_dtm <- ham_subset_dtm %>% removeSparseTerms(1-(10/length(corpus_ham_clean)))
ham_subset_dtm
## <<DocumentTermMatrix (documents: 2552, terms: 173)>>
## Non-/sparse entries: 441323/173
## Sparsity : 0%
## Maximal term length: 51
## Weighting : term frequency (tf)
corpus_all_dtm <- DocumentTermMatrix(corpus_all_clean)
corpus_all_dtm <- removeSparseTerms(corpus_all_dtm, 0.90)
inspect(corpus_all_dtm)
## <<DocumentTermMatrix (documents: 3951, terms: 375)>>
## Non-/sparse entries: 789425/692200
## Sparsity : 47%
## Maximal term length: 51
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug esmtp lbrace list localhost pick rbrace received thu
## 2554 8 4 0 4 3 0 0 7 0
## 2555 8 4 0 4 3 0 0 7 0
## 2556 8 4 0 4 3 0 0 7 0
## 2557 8 4 0 4 3 0 0 7 0
## 2558 8 4 0 4 3 0 0 7 0
## 2559 8 4 0 4 3 0 0 7 0
## 2560 8 4 0 4 3 0 0 7 0
## 2561 8 4 0 4 3 0 0 7 0
## 2562 8 4 0 4 3 0 0 7 0
## 2563 8 4 0 4 3 0 0 7 0
## Terms
## Docs tmdadeepeddyvirciocom
## 2554 0
## 2555 0
## 2556 0
## 2557 0
## 2558 0
## 2559 0
## 2560 0
## 2561 0
## 2562 0
## 2563 0
dtm_spam_freq <- sort(colSums(as.matrix(spam_subset_dtm)), decreasing = TRUE)
head(dtm_spam_freq, 20)
## mlm aug received information
## 13980 11184 9786 8388
## will fri iluglinuxie email
## 8388 6990 6990 5592
## esmtp ilugadminlinuxie letter list
## 5592 5592 5592 5592
## lughtuathaorg people receiving send
## 5592 5592 5592 5592
## the bettyjagessarcom free localhost
## 5592 4194 4194 4194
dtm_ham_freq <- sort(colSums(as.matrix(ham_subset_dtm)), decreasing = TRUE)
head(dtm_ham_freq, 20)
## aug thu
## 33163 30612
## received pick
## 25510 22959
## esmtp lbrace
## 15306 15306
## list rbrace
## 15306 15306
## tmdadeepeddyvirciocom exmhworkersexamplecom
## 15306 10204
## localhost sequence
## 10204 10204
## subject command
## 10204 7653
## delta edt
## 7653 7653
## exmhworkersadminexamplecom exmhworkersredhatcom
## 7653 7653
## from ftp
## 7653 7653
#Machine Learning Modeling
model_train <- as.data.frame(as.matrix(corpus_all_dtm))
model_train$class = combined_df$type
#Shuffling the data and convert the dataframe into numeric
model_train <- model_train[sample(1:nrow(model_train)),]
model_train <- model_train %>% mutate(class = replace(class, class == "ham", 0))
model_train <- model_train %>% mutate(class = replace(class, class == "spam", 1))
model_train$class <- as.numeric(as.character(model_train$class))
model_train <- as.data.frame(apply(model_train, 2, as.numeric))
# splitting the dataframe into Training and Test
set.seed(200)
split = sample.split(model_train$class, SplitRatio = 0.75)
training = subset(model_train, split == TRUE)
testing = subset(model_train, split == FALSE)
nrow(training)
## [1] 2963
nrow(testing)
## [1] 988
num_obs <- ncol(training) - 1
num_obs
## [1] 375
randomForest = randomForest(x = training[-num_obs],
y = training$class,
ntree = 3)
## Warning in randomForest.default(x = training[-num_obs], y =
## training$class, : The response has five or fewer unique values. Are you
## sure you want to do regression?
Prediction of the results
prediction = predict(randomForest, newdata = testing[-num_obs])
test_matrix <- table(prediction>0,testing$class)
test_matrix
##
## 0 1
## FALSE 638 0
## TRUE 0 350
Define the accuracy of the model
validate <- test_matrix['TRUE', 2] + test_matrix['FALSE', 1]
accuracy_model <- validate/nrow(testing) * 100
accuracy_model
## [1] 100