# Loading all libraries needed for this assignment
# These libraries are already in my local downloaded_packages; if not, I can install each one, e.g.:
# install.packages("rtweet")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## Warning: package 'tibble' was built under R version 4.0.3
## Warning: package 'readr' was built under R version 4.0.3
## Warning: package 'dplyr' was built under R version 4.0.3
library(DT)
## Warning: package 'DT' was built under R version 4.0.3
library(knitr)
#library(plyr)
library(XML)
library(RCurl)
library(jsonlite)
## Warning: package 'jsonlite' was built under R version 4.0.3
library(httr)
library(tidytext)
## Warning: package 'tidytext' was built under R version 4.0.3
library(tidyr)
library(janeaustenr)
## Warning: package 'janeaustenr' was built under R version 4.0.3
library(textdata) # https://rdrr.io/cran/textdata/f/README.md
## Warning: package 'textdata' was built under R version 4.0.3
#get_sentiments("afinn") #general purpose lexions from Finn Arup Nielsen, AFINN is a lexicon of English words rated for valence with an integer between minus five (negative) and plus five (positive).
library(stopwords)
## Warning: package 'stopwords' was built under R version 4.0.3
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 4.0.3
library(tm)
## Warning: package 'tm' was built under R version 4.0.3
## Warning: package 'NLP' was built under R version 4.0.3
library(reshape2)
library(syuzhet)
## Warning: package 'syuzhet' was built under R version 4.0.3
library(corpus)
## Warning: package 'corpus' was built under R version 4.0.3
library(R.utils)
library(class)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.0.3
library(caret)
## Warning: package 'caret' was built under R version 4.0.3
library(party)
## Warning: package 'party' was built under R version 4.0.3
## Warning: package 'mvtnorm' was built under R version 4.0.3
## Warning: package 'modeltools' was built under R version 4.0.3
## Warning: package 'strucchange' was built under R version 4.0.3
## Warning: package 'zoo' was built under R version 4.0.3
## Warning: package 'sandwich' was built under R version 4.0.3
library(TAR)
## Warning: package 'TAR' was built under R version 4.0.3
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 4.0.3
library(readr)
library(RTextTools)
## Warning: package 'RTextTools' was built under R version 4.0.3
This week 12 assignment is about document classification. The previous assignment covered sentiment analysis, which relates to document classification since both are forms of text analysis. This is a machine learning setting in which it is useful to classify new “test” documents using already classified “training” documents. A common example is using a corpus of labeled spam and ham (non-spam) e-mails to predict whether or not a new document is spam.
For this project, you can start with a spam/ham dataset, then predict the class of new documents (either withheld from the training dataset or from another source such as your own spam folder). One example corpus: https://spamassassin.apache.org/old/publiccorpus/
Here are a few short videos that you may find helpful.
We will first download two compressed archives, easy_ham and spam, to a local directory, then load them into RStudio. Since these files are .tar.bz2 archives, we will unpack the data files both manually and automatically. Document classification (or document categorization) means assigning documents to one or more classes/categories, either manually or algorithmically, and it falls under supervised machine learning. Technically speaking, we create a machine learning model using a number of text documents (called a corpus) as input and their corresponding classes/categories (called labels) as output. The resulting model will be able to assign a class when a new text is supplied. The steps are:
Creation of Corpus
Preprocessing of Corpus
Creation of Term Document Matrix
Preparing Features & Labels for Model
Creating Train & test data
Running the model
Testing the model
I want to acknowledge George and Ataky for helping debug my code, even though at the end the knit process spoiled my enthusiasm. Machine learning is not that easy, and it does require a design phase. I had some difficulties getting the clean corpus to run… and with reading multiple files at once (I think a for loop should do this; see the sketch after the download code below).
# Automatic unzip of tar.bz2
# Using the party package to read the tar.bz2, which automatically unzipped the tar file
#create a df
# view(X20021010_easy_ham_tar)
# view('20021010_spam.tar')
# hamSpam_df <- c(X20021010_easy_ham_tar, '20021010_spam.tar')
# view(hamSpam_df)
# str(hamSpam_df)
# Manually unzipped tar.bz2
# Another way is to use 7-Zip to manually unzip the tar file and read it from the GitHub repo or a local drive
# We can also build a script to import the tar file from the original source and manually unpack it
#ham <- read.csv("https://raw.githubusercontent.com/asmozo24/DATA607_Project4_Document_Classification/tree/master/20021010_easy_ham.tar",header=TRUE,stringsAsFactors=FALSE)
# Reading from original source
# spam2 <- "https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2"
# ham2 <- "https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"
# #downloading the file
# download.file(spam2, destfile="tmp1.tar.bz2")
# download.file(ham2, destfile="tmp2.tar.bz2")
#
# # check content
# bunzip2("tmp1.tar.bz2")
# #untar("tmp1.tar.bz2")
# # untar("tmp2.tar.bz2")
# # extraction
# spam2 <- read.csv("tmp1.tar", stringsAsFactors=FALSE)
# view(spam2)
# ham2 <- read.csv("tmp2.tar.bz2", stringsAsFactors=FALSE)
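As a hedged alternative to the commented-out attempts above, and to the “read multiple files at once” question raised earlier, the archives can be downloaded, unpacked, and read entirely in code. This is only a sketch: untar() handles .tar.bz2 directly, the extracted folder names match the ones used later in this report, and the read_folder() helper is purely illustrative, not part of the original workflow.
# Sketch: download, unpack, and read every message in each SpamAssassin archive
spam_url <- "https://spamassassin.apache.org/old/publiccorpus/20021010_spam.tar.bz2"
ham_url  <- "https://spamassassin.apache.org/old/publiccorpus/20021010_easy_ham.tar.bz2"
download.file(spam_url, destfile = "spam.tar.bz2")
download.file(ham_url,  destfile = "ham.tar.bz2")
# untar() understands bzip2-compressed tarballs, so no separate bunzip2() step is needed
untar("spam.tar.bz2", exdir = "data")   # extracts to data/spam
untar("ham.tar.bz2",  exdir = "data")   # extracts to data/easy_ham
# Illustrative helper: read each message file as a single text string
read_folder <- function(path) {
  files <- list.files(path, full.names = TRUE)
  vapply(files, function(f) paste(readLines(f, warn = FALSE), collapse = "\n"), character(1))
}
spam_raw <- read_folder("data/spam")
ham_raw  <- read_folder("data/easy_ham")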
# checking to see if we are in the correct working directory
getwd()
## [1] "C:/Users/Petit Mandela/Documents/R/DATA607_Project4_Document_Classification"
# Used 7-Zip to open the tar file, which contains many individual message files
spam_files <- list.files("spam")
# Reading one sample file from the spam folder
spam0 <- read.csv("0002.24b47bb3ce90708ae29d0aec1da08610")
# let's take a look at this sample
str(spam0)
## 'data.frame': 64 obs. of 1 variable:
## $ From.ilug.admin.linux.ie..Thu.Aug.22.13.27.39.2002: chr "Return-Path: <ilug-admin@linux.ie>" "Delivered-To: zzzz@localhost.example.com" "Received: from localhost (localhost [127.0.0.1])" "\tby phobos.labs.example.com (Postfix) with ESMTP id A7FD7454F6" ...
# Now let's read the tar files with read.csv() since that worked before
# Since we already used 7-Zip on the files, we will just read them in (already renamed)
spam1 <- "spam.tar"
ham1 <- "easy_ham.tar"
spam1 <- read.csv(spam1)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 1 appears to contain embedded nulls
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
## embedded nul(s) found in input
ham1 <- read.csv(ham1)
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## line 1 appears to contain embedded nulls
## Warning in read.table(file = file, header = header, sep = sep, quote = quote, :
## embedded nul(s) found in input
view(spam1)
#str(spam1)
# Turn each file into a data frame of text lines, tagged with its email type
df_H <- data.frame(text = sapply(ham1$easy_ham., as.character), type = "ham" , stringsAsFactors = FALSE)
df_S <- data.frame(text = sapply(spam1$spam., as.character), type = "spam" , stringsAsFactors = FALSE)
# Merge the two data frames... still not sure if we need it.
df_HS <- rbind(df_H, df_S)
str(df_HS)
## 'data.frame': 131223 obs. of 2 variables:
## $ text: chr "Return-Path: <fork-admin@xent.com>" "Delivered-To: yyyy@localhost.netnoteinc.com" "Received: from localhost (localhost [127.0.0.1])" "\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id CBC6A44156" ...
## $ type: chr "ham" "ham" "ham" "ham" ...
A corpus is a large, structured set of texts used for analysis.
# Define a corpus for each email type, then differentiate them (I am a bit confused here): add a tag to distinguish the two email types, then combine the two
# We have two email collections (spam and ham); we want to create a classifier that scans an incoming email and decides which folder (spam or ham) it belongs in
# Since I already have the files in R, I don't need to point a file path to a directory
corpus_H = Corpus(VectorSource(df_H$text))
corpus_S = Corpus(VectorSource(df_S$text))
# I don't know if I should create a corpus for the merged email folder
corpus_HS = Corpus(VectorSource(df_HS$text))
# The code below works, but there is a problem at the end
ham_corpus = Corpus(VectorSource(ham1$easy_ham.))
spam_corpus = Corpus(VectorSource(spam1$spam.))
#meta(ham_corpus, tag="type") = "ham"
#meta(spam_corpus, tag="type") = "spam"
corpus_SH = tm:::c.VCorpus(ham_corpus, spam_corpus)
inspect(corpus_SH[1:4])
## <<VCorpus>>
## Metadata: corpus specific: 2, document level (indexed): 0
## Content: documents: 4
##
## [1] Return-Path: <fork-admin@xent.com>
## [2] Delivered-To: yyyy@localhost.netnoteinc.com
## [3] Received: from localhost (localhost [127.0.0.1])
## [4] \tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id CBC6A44156
inspect(ham_corpus[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] Return-Path: <fork-admin@xent.com>
## [2] Delivered-To: yyyy@localhost.netnoteinc.com
## [3] Received: from localhost (localhost [127.0.0.1])
Tidy_corpus_SH <- function(x){
  Lcorpus <- x
  # remove the message header block
  Lcorpus <- tm_map(Lcorpus, str_replace, pattern = "^(.+\\n)+\\n", replacement = "")
  # attempt to strip URLs, then drop angle brackets
  Lcorpus <- tm_map(Lcorpus, str_replace, pattern = "http^\\s\\s*", replacement = "")
  Lcorpus <- tm_map(Lcorpus, str_replace, pattern = "<", replacement = "")
  Lcorpus <- tm_map(Lcorpus, str_replace, pattern = ">", replacement = "")
  # convert to lowercase
  Lcorpus <- tm_map(Lcorpus, content_transformer(tolower))
  # remove SMART stopwords (disabled)
  #Lcorpus <- tm_map(Lcorpus, removeWords, stopwords("SMART"))
  # remove common English stopwords
  Lcorpus <- tm_map(Lcorpus, removeWords, stopwords("english"))
  Lcorpus <- tm_map(Lcorpus, removeWords, stopwords())
  # remove header/domain tokens that dominate this corpus
  Lcorpus <- tm_map(Lcorpus, removeWords, c("com", "localhost","net", "esmtp", "www", "org", "freshrpms", "http", "xent", "rpm", "https", "sep", "sep", " zzzlist", "egwn", "will", "ilug", "zzzz", "thu", "exmh", "yyyy" , "a", "but", "x"))
  # remove punctuation
  Lcorpus <- tm_map(Lcorpus, str_replace_all, pattern = "[:punct:]", replacement = " ")
  # remove numbers
  Lcorpus <- tm_map(Lcorpus, removeNumbers)
  # apply stemDocument() to reduce words to their roots (disabled)
  #Lcorpus <- tm_map(Lcorpus, stemDocument)
  # eliminate extra white space
  Lcorpus <- tm_map(Lcorpus, stripWhitespace)
  return(Lcorpus)
}
ham_corpusL <- Tidy_corpus_SH(ham_corpus)
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "^(.+\\n)+\\n", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "http^\\s\\s*", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "<", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = ">", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords()):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, c("com", "localhost", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace_all, pattern =
## "[:punct:]", : transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(Lcorpus, stripWhitespace): transformation drops
## documents
spam_corpusL <- Tidy_corpus_SH(spam_corpus)
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "^(.+\\n)+\\n", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "http^\\s\\s*", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "<", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = ">", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords()):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, c("com", "localhost", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace_all, pattern =
## "[:punct:]", : transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(Lcorpus, stripWhitespace): transformation drops
## documents
inspect(ham_corpusL[1:10])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
##
## [1] return path fork admin
## [2] delivered netnoteinc
## [3] received
## [4] phobos labs netnoteinc postfix id cbca
## [5] jm wed
## [6] aug edt
## [7] received phobos
## [8] imap fetchmail
## [9] jm single drop wed
## [10] aug + ist
inspect(spam_corpusL[1:10])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
##
## [1] return path george vccomputers ie delivered example
## [3] received phobos labs example postfix id cbfb
## [5] mon aug edt
## [7] received phobos imap fetchmail
## [9] single drop mon aug + ist
df1 <- data.frame(text = sapply(ham_corpusL, as.character), type = "ham" , stringsAsFactors = FALSE)
df2 <- data.frame(text = sapply(spam_corpusL, as.character), type = "spam" , stringsAsFactors = FALSE)
main_df <- rbind(df1, df2)
str(main_df)
## 'data.frame': 131223 obs. of 2 variables:
## $ text: chr "return path fork admin " "delivered netnoteinc " "received " " phobos labs netnoteinc postfix id cbca" ...
## $ type: chr "ham" "ham" "ham" "ham" ...
main_corpus <- Corpus(VectorSource(main_df$text))
# emails_corpusL <- rbind.data.frame(ham_corpusL, spam_corpusL)
inspect(main_corpus[1:10])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 10
##
## [1] return path fork admin
## [2] delivered netnoteinc
## [3] received
## [4] phobos labs netnoteinc postfix id cbca
## [5] jm wed
## [6] aug edt
## [7] received phobos
## [8] imap fetchmail
## [9] jm single drop wed
## [10] aug + ist
We need to clean the dataset before analysis.
# I already built a cleaning function... just call it here
corpus_cleanH <- Tidy_corpus_SH(corpus_H)
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "^(.+\\n)+\\n", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "http^\\s\\s*", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "<", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = ">", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords()):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, c("com", "localhost", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace_all, pattern =
## "[:punct:]", : transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(Lcorpus, stripWhitespace): transformation drops
## documents
corpus_cleanS <- Tidy_corpus_SH(corpus_S)
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "^(.+\\n)+\\n", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "http^\\s\\s*", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = "<", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace, pattern = ">", replacement
## = ""): transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, content_transformer(tolower)):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords("english")):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, stopwords()):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeWords, c("com", "localhost", :
## transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, str_replace_all, pattern =
## "[:punct:]", : transformation drops documents
## Warning in tm_map.SimpleCorpus(Lcorpus, removeNumbers): transformation drops
## documents
## Warning in tm_map.SimpleCorpus(Lcorpus, stripWhitespace): transformation drops
## documents
#corpus_cleanHS <- Tidy_corpus_SH(corpus_HS)
inspect(corpus_cleanH[1:3])
## <<SimpleCorpus>>
## Metadata: corpus specific: 1, document level (indexed): 0
## Content: documents: 3
##
## [1] return path fork admin delivered netnoteinc received
Creation of the term-document matrix. This matrix records the frequency of terms occurring across the collection of documents.
# making doc matrix for each email type
# term frequency (tf) weighting, or a simple count of how frequently a word occurs in a document
corpus_dtmH<- TermDocumentMatrix(corpus_cleanH)
corpus_dtmS<- TermDocumentMatrix(corpus_cleanS)
# making doc matrix for merge email
# corpus_dtmHS<- TermDocumentMatrix(corpus_cleanHS)
# let's inspect this dtm
inspect(corpus_dtmH)
## <<TermDocumentMatrix (terms: 27822, documents: 130346)>>
## Non-/sparse entries: 409176/3626077236
## Sparsity : 100%
## Maximal term length: 80
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 10280 112500 112530 11466 84345 84351 84421 84434 91967 9811
## admin 0 0 0 0 0 0 0 0 0 0
## example 1 0 0 1 0 0 0 0 2 0
## fork 0 0 0 0 0 0 0 0 0 0
## list 1 0 5 0 0 0 0 0 0 0
## mailman 0 0 0 0 0 0 0 0 0 0
## mailto 0 0 0 0 0 0 0 0 1 0
## postfix 0 0 0 0 0 0 0 0 0 0
## received 0 1 1 0 2 0 2 0 0 0
## sourceforge 0 0 0 0 0 0 0 0 0 0
## spam 0 0 0 0 0 0 0 0 0 0
The document-term matrix weights each term (word) by its frequency in the dataset. One approach to reducing model complexity is to remove sparse terms from the model, i.e. to drop tokens that appear in very few documents from the DTM.
# the removeSparseTerms() function takes two arguments: the first is the dtm, the second is the maximum sparsity allowed.
# meaning sparse = .99 would remove any tokens which are missing from more than 99% of the documents in the corpus (i.e. the token must appear in at least 1% of the documents to be retained)
corpus_dtmH <- removeSparseTerms(corpus_dtmH, sparse = .99)
corpus_dtmS <- removeSparseTerms(corpus_dtmS, sparse = .99)
#corpus_dtmHS <- removeSparseTerms(corpus_dtmHS, sparse = .99)
inspect(corpus_dtmH)
## <<TermDocumentMatrix (terms: 43, documents: 130346)>>
## Non-/sparse entries: 109778/5495100
## Sparsity : 98%
## Maximal term length: 12
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 18850 24769 24770 26910 40556 40708 41368 41608 50812 5108
## admin 0 0 0 0 0 0 0 0 0 2
## example 1 0 0 0 0 1 0 0 0 0
## fork 0 0 0 0 0 0 0 0 0 0
## list 4 1 2 2 2 2 1 1 4 4
## mailman 1 1 2 0 2 2 1 1 2 4
## mailto 0 0 0 0 0 0 0 0 1 2
## postfix 0 0 0 0 0 0 0 0 0 0
## received 0 0 0 0 0 0 0 0 0 0
## sourceforge 0 0 0 4 0 0 0 0 0 1
## spam 0 0 0 0 0 0 0 0 0 0
# So, we can see a significant reduction, from 27,822 terms down to 43
Create train and test dataset for the classifier
set.seed(222)
# Make test and train matrices of identical length (find intersection)
#train.df <- data.frame(train.dtm[,intersect(colnames(train.dtm), colnames(test.dtm))])
#test.df <- data.frame(test.dtm[,intersect(colnames(test.dtm), colnames(train.dtm))])
# I really need to understand what happens here for training and testing
# Training data (70%) & test data (30%) before we feed them into our model (see the stratified-split sketch after this chunk)
#train <- sample( nrow (corpus_dtmH), ceiling(nrow(corpus_dtmH) * 0.7) )
#test <- (1:nrow(corpus_dtmH))[-train]
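A minimal sketch of what the commented-out lines above are aiming at, using createDataPartition() from caret (already loaded) to draw a stratified random split so that ham and spam keep their proportions in both sets; the object names are illustrative only, not part of the original workflow.
# Sketch: stratified 70/30 split of the labeled data frame built earlier
set.seed(222)
in_train <- createDataPartition(factor(main_df$type), p = 0.70, list = FALSE)
train_df <- main_df[in_train, ]
test_df  <- main_df[-in_train, ]
table(train_df$type)  # both classes should be represented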
set.seed(343443)
wordclouds <- function (x){
x <- wordcloud(words = x , min.freq = 5,
max.words=100, random.order=FALSE, rot.per=0.40,
)
return(x)
}
#wordcloud for ham email type (df1)
df1_cloud <- wordclouds(df1$text)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
#d1cloud <- which(main_df$type == "ham")
#wordcloud for spam email type (df2)
df2_cloud <- wordclouds(df2$text)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
#wordcloud for the main email
main_df_cloud <- wordclouds(main_df$text)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
# making doc matrix
main_corpusX <- TermDocumentMatrix(main_corpus[1:6000])
main_corpusDTM <- DocumentTermMatrix(main_corpus)
# let's inspect this dtm
inspect(main_corpusDTM)
## <<DocumentTermMatrix (documents: 131223, terms: 28102)>>
## Non-/sparse entries: 412040/3687216706
## Sparsity : 100%
## Maximal term length: 80
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs admin aug example fork list mailman mailto postfix received spam
## 10280 0 0 1 0 1 0 0 0 0 0
## 112500 0 0 0 0 0 0 0 0 1 0
## 112530 0 0 0 0 5 0 0 0 1 0
## 11466 0 0 1 0 0 0 0 0 0 0
## 84345 0 0 0 0 0 0 0 0 2 0
## 84351 0 0 0 0 0 0 0 0 0 0
## 84421 0 0 0 0 0 0 0 0 2 0
## 84434 0 0 0 0 0 0 0 0 0 0
## 91967 0 0 2 0 0 0 1 0 0 0
## 9811 0 0 0 0 0 0 0 0 0 0
inspect(main_corpusX[1:5, 18:45])
## <<TermDocumentMatrix (terms: 5, documents: 28)>>
## Non-/sparse entries: 10/130
## Sparsity : 93%
## Maximal term length: 9
## Weighting : term frequency (tf)
## Sample :
## Docs
## Terms 18 19 20 21 22 27 40 41 42 45
## admin 0 0 0 0 0 0 1 1 0 0
## delivered 1 0 0 0 0 0 0 0 0 0
## fork 1 0 1 0 0 1 1 1 1 1
## path 0 0 0 0 0 0 0 0 0 0
## return 0 0 0 0 0 0 0 0 0 0
# Error: we need to reduce main_corpus because the full dense matrix would be too big (above 27.5 GB), so we limit it to the first 6,000 documents (a sparse alternative is sketched after the frequency table below)
main_corpusX1 <- as.matrix(main_corpusX)
# Sort by decreasing frequency
main_corpusX1 <- sort(rowSums(main_corpusX1),decreasing=TRUE)
main_corpusX1 <- data.frame(word = names(main_corpusX1),freq=main_corpusX1)
# let's view the top 20 most frequent words
head(main_corpusX1, 20)
## word freq
## list list 543
## received received 409
## example example 387
## fork fork 295
## sourceforge sourceforge 228
## mailto mailto 201
## linux linux 173
## spam spam 169
## oct oct 168
## admin admin 158
## mailman mailman 152
## spamassassin spamassassin 146
## postfix postfix 139
## talk talk 139
## request request 136
## content content 133
## lists lists 132
## ist ist 127
## listinfo listinfo 127
## date date 120
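The memory problem noted above comes from converting the term-document matrix to a dense matrix with as.matrix(). As a sketch, assuming the slam package (a dependency of tm) is installed, the same frequencies can be computed over the whole corpus without a dense conversion:
# Sketch: term frequencies for the full corpus using sparse column sums
library(slam)
term_freq <- sort(slam::col_sums(main_corpusDTM), decreasing = TRUE)
term_freq_df <- data.frame(word = names(term_freq), freq = term_freq)
head(term_freq_df, 20)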
# let's plot the most frequent words
barplot(main_corpusX1[1:20,]$freq, las = 2, names.arg = main_corpusX1[1:20,]$word,
        col = rainbow(25), main = "Top 20 Words in Spam-Ham Email",
        ylab = "Word Occurrences")
nrow(main_df)
## [1] 131223
# So, let's use 75% for training and 25% for testing. Note that main_df stacks all ham rows before the spam rows, so this sequential cut puts no spam in the training set (see the classifier sketch at the end of this section for a stratified alternative).
Train_main_df <- main_df[1:98417,]
Test_main_df <- main_df[98418:131223,]
#let's do same thing with dtm
Train_main_corpusDTM <- main_corpusDTM[1:98417,]
Test_main_corpusDTM<- main_corpusDTM[98418:131223,]
# What are the first 10 terms that appear at least 10 times?
Freq_word <- findFreqTerms(Train_main_corpusDTM, 10)
Freq_word[1:10]
## [1] "admin" "fork" "path" "return" "delivered"
## [6] "netnoteinc" "received" "labs" "phobos" "postfix"
# Train_mainEmail <- Train_main_corpusDTM[, Freq_word]
# Test_mainEmail  <- Test_main_corpusDTM[, Freq_word]
#
# # recode counts as "Yes"/"No" indicators
# convert_count <- function(x) {
#   ifelse(x > 0, "Yes", "No")
# }
# # Convert the document-term matrices:
# Train_mainEmail <- apply(Train_mainEmail, 2, convert_count)
# Test_mainEmail  <- apply(Test_mainEmail, 2, convert_count)
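The pipeline above stops before the “Running the model” and “Testing the model” steps. Below is a hedged sketch only, not code that was run for this assignment: it assumes the e1071 package is installed (it is not loaded above), it draws a stratified split with caret because the sequential 75/25 cut leaves no spam in the training rows, and all object names are illustrative.
# Sketch: naive Bayes spam classifier on the labeled line-level data frame
library(e1071)   # assumption: e1071 is installed; it provides naiveBayes()

set.seed(222)
in_train <- as.vector(createDataPartition(factor(main_df$type), p = 0.75, list = FALSE))
train_labels <- factor(main_df$type[in_train])
test_labels  <- factor(main_df$type[-in_train])

# Keep only terms seen at least 10 times in the training portion
train_dtm <- main_corpusDTM[in_train, ]
test_dtm  <- main_corpusDTM[-in_train, ]
freq_words <- findFreqTerms(train_dtm, 10)
train_dtm <- train_dtm[, freq_words]
test_dtm  <- test_dtm[, freq_words]

# Naive Bayes works better with categorical features, so recode counts as Yes/No
# (apply() builds a dense matrix; raise the frequency cutoff if memory is tight)
convert_count <- function(x) ifelse(x > 0, "Yes", "No")
train_x <- apply(train_dtm, 2, convert_count)
test_x  <- apply(test_dtm, 2, convert_count)

# Train, predict, and evaluate with caret's confusionMatrix()
nb_model <- naiveBayes(train_x, train_labels, laplace = 1)
nb_pred  <- predict(nb_model, test_x)
confusionMatrix(nb_pred, test_labels, positive = "spam")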
References
Supervised classification with text data https://cfss.uchicago.edu/notes/supervised-text-classification/
Document Classification using R https://www.r-bloggers.com/2013/07/document-classification-using-r/
https://www.rdocumentation.org/packages/stopwords/versions/2.0
https://www.r-bloggers.com/2018/01/how-to-implement-random-forests-in-r/