#load required packages
suppressMessages(suppressWarnings(library(tm)))
suppressMessages(suppressWarnings(library(RCurl)))
suppressMessages(suppressWarnings(library(stringr)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(XML)))
suppressMessages(suppressWarnings(library(tidytext)))
suppressMessages(suppressWarnings(library(ggplot2)))
I uploaded the first 30 files from each of the spam and easy_ham folders as my data. Why 30 files each? GitHub limits the number of files that can be uploaded to a folder at once to 100, and my raw data folders contain thousands of files. Thirty is simply a convenient number: it saves upload time, but it reduces prediction accuracy (more files would be better).
Spam variable: a binary (0/1) label; Spam = 1 if the text comes from the spam files and Spam = 0 if it comes from the easy_ham files.
All source files come from http://spamassassin.apache.org/old/publiccorpus/ .
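As a minimal illustration of this labeling scheme (using made-up message text rather than a real corpus file), each message becomes one row of a data frame carrying its Spam label:
#hypothetical one-row example of the Spam label; the loops below build one such row per downloaded message
example_msg <- data_frame(Spam = 1, text = "Subject: Claim your free prize now ...")
str(example_msg)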
#Get the 30 easy_ham file names
easy_ham_list<-"https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/easy_ham/cmds"
easy_ham_df<- readLines(easy_ham_list)
easy_ham_fl <- sapply(strsplit(easy_ham_df," "),"[[",2)
easy_ham_30<-head(easy_ham_fl,30)
ham_path <- "https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/easy_ham/"
easy_ham_url_30=c()
for(i in 1:30){
easy_ham_url_30[i] <- paste0(ham_path,easy_ham_30[i])
}
#head(easy_ham_url_30)
#Get the 30 spam2 file names
spam2_list<-"https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/spam_2/cmds"
spam2_df<- readLines(spam2_list)
spam2_fl <- sapply(strsplit(spam2_df," "),"[[",2)
spam2_30<-head(spam2_fl,30)
#correct the 7th file name because the entry in the cmds file does not match the actual file name
spam2_30[7] <-"00007.acefeee792b5298f8fee175f9f65c453"
spam_path <- "https://raw.githubusercontent.com/ada2802/607-Project-4-Text-Mining/master/spam_2/"
spam2_url_30=c()
for(i in 1:30){
spam2_url_30[i] <- paste0(spam_path,spam2_30[i])
}
#head(spam2_url_30)
#create empty objects for the easy_ham training data set
easy_ham_train_set=c()
ham=c()
#read each ham file URL and append it to the easy_ham training data set
for(i in 1:30) {
  lines <- readLines(easy_ham_url_30[i])
  ham <- paste(lines, collapse = ' ')
  easy_ham_text <- data_frame(Spam=0, text=ham)
  #combine all easy_ham files, one row per file
  easy_ham_train_set <- rbind(easy_ham_train_set, easy_ham_text)
}
str(easy_ham_train_set)
## Classes 'tbl_df', 'tbl' and 'data.frame': 30 obs. of 2 variables:
## $ Spam: num 0 0 0 0 0 0 0 0 0 0 ...
## $ text: chr "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002 Return-Path: <exmh-workers-admin@spamassassin.tain"| __truncated__ "From Steve_Burt@cursor-system.com Thu Aug 22 12:46:39 2002 Return-Path: <Steve_Burt@cursor-system.com> Deliver"| __truncated__ "From timc@2ubh.com Thu Aug 22 13:52:59 2002 Return-Path: <timc@2ubh.com> Delivered-To: zzzz@localhost.netnotei"| __truncated__ "From irregulars-admin@tb.tf Thu Aug 22 14:23:39 2002 Return-Path: <irregulars-admin@tb.tf> Delivered-To: zzzz@"| __truncated__ ...
#create empty objects for the spam training data set
spam_train_set=c()
spam=c()
#read each spam file URL and append it to the spam training data set
for(i in 1:30) {
  lines <- readLines(spam2_url_30[i])
  spam <- paste(lines, collapse = ' ')
  spam_text <- data_frame(Spam=1, text=spam)
  #combine all spam files, one row per file
  spam_train_set <- rbind(spam_train_set, spam_text)
}
str(spam_train_set)
## Classes 'tbl_df', 'tbl' and 'data.frame': 30 obs. of 2 variables:
## $ Spam: num 1 1 1 1 1 1 1 1 1 1 ...
## $ text: chr "From ilug-admin@linux.ie Tue Aug 6 11:51:02 2002 Return-Path: <ilug-admin@linux.ie> Delivered-To: yyyy@localh"| __truncated__ "From lmrn@mailexcite.com Mon Jun 24 17:03:24 2002 Return-Path: merchantsworld2001@juno.com Delivery-Date: Mon "| __truncated__ "From amknight@mailexcite.com Mon Jun 24 17:03:49 2002 Return-Path: merchantsworld2001@juno.com Delivery-Date: "| __truncated__ "From jordan23@mailexcite.com Mon Jun 24 17:04:20 2002 Return-Path: merchantsworld2001@juno.com Delivery-Date: "| __truncated__ ...
raw_train_data =c()
raw_train_data <- rbind(easy_ham_train_set,spam_train_set)
str(raw_train_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 60 obs. of 2 variables:
## $ Spam: num 0 0 0 0 0 0 0 0 0 0 ...
## $ text: chr "From exmh-workers-admin@redhat.com Thu Aug 22 12:36:23 2002 Return-Path: <exmh-workers-admin@spamassassin.tain"| __truncated__ "From Steve_Burt@cursor-system.com Thu Aug 22 12:46:39 2002 Return-Path: <Steve_Burt@cursor-system.com> Deliver"| __truncated__ "From timc@2ubh.com Thu Aug 22 13:52:59 2002 Return-Path: <timc@2ubh.com> Delivered-To: zzzz@localhost.netnotei"| __truncated__ "From irregulars-admin@tb.tf Thu Aug 22 14:23:39 2002 Return-Path: <irregulars-admin@tb.tf> Delivered-To: zzzz@"| __truncated__ ...
#table(raw_data$Spam)
#tidy data
spam_text <- raw_train_data %>%
filter(Spam==1)%>%
unnest_tokens(word, text) %>%
anti_join(stop_words ) %>%
count(word, sort=TRUE )
## Joining, by = "word"
#data analysis
spam_text %>%
filter(n > 100) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
coord_flip()
A better way to clean the raw training messages is to process them with the tm package and build a document term matrix (tdm).
corpus <- Corpus(VectorSource(raw_train_data$text)) #create a new corpus variable
corpus.tmp <- tm_map(corpus,removePunctuation) #remove all punctuation
corpus.tmp <- tm_map(corpus.tmp, stripWhitespace) #remove all whitespace
corpus.tmp <- tm_map(corpus.tmp, content_transformer(tolower)) #convert text to lowercase
corpus.tmp <- tm_map(corpus.tmp, removeWords,stopwords("english")) #remove all English stopwords
corpus.tmp <- tm_map(corpus.tmp, stemDocument) #stem the words
#str(corpus.tmp)
tdm_text <- DocumentTermMatrix (corpus.tmp) #build a document term matrix - training data messages
#tdm_text
#remove sparse terms: keep only terms that appear in at least 5% of the messages
tdm_text_0.95 = removeSparseTerms(tdm_text, 0.95)
tdm_text_Sparse = as.data.frame(as.matrix(tdm_text_0.95))
colnames(tdm_text_Sparse) = make.names(colnames(tdm_text_Sparse))
sort(colSums(tdm_text_Sparse))
##   actual  discuss     hand     mark   search    ideal unverifi      arm 
##        4        4        4        4        4        4        4        4 
## ...
##     font localhost    esmtp    X0100      thu      aug   receiv    X2002 
##      111      118      157      158      247      341      367      402 
#dim(tdm_text_Sparse)
#head(tdm_text_Sparse)
#str(tdm_text_Sparse)
#Add spam variable in to the data frame
tdm_text_Sparse$Spam = raw_train_data$Spam
#head(tdm_text_Sparse)
#str(tdm_text_Sparse)
#Easy_Ham terms
head(sort(colSums(subset(tdm_text_Sparse, Spam == 0))))
## accept busi click easier financi follow
## 0 0 0 0 0 0
#Spam terms
head(sort(colSums(subset(tdm_text_Sparse, Spam == 1))))
## chris inreplyto localhostlocaldomain
## 0 0 0
## mercuri search zzzzlocalhost
## 0 0 0
#a simple fixed split: first 42 rows for training, last 18 for testing (replaced by the random split below)
train=c()
test=c()
train <- head(tdm_text_Sparse, 42)
test <- tail(tdm_text_Sparse, 18)
set.seed(2802)
#for sample.split function
library(caTools)
#split the data: 70% for training and 30% for testing
spl <- sample.split(tdm_text_Sparse$Spam, SplitRatio = 0.7)
train = as.matrix(subset(tdm_text_Sparse, spl == TRUE))
test = as.matrix(subset(tdm_text_Sparse, spl == FALSE))
#library(e1071)
#model <- naiveBayes(Spam ~ ., data = as.data.frame(train))
#class(model)
#preds <- predict(model, newdata = as.data.frame(test))
#Accuracy
#conf_matrix <- table(preds, as.data.frame(test)$Spam)
#library(RTextTools)
#container <- create_container(as.matrix(tdm_text_Sparse), tdm_text_Sparse$Spam, trainSize=1:42, testSize=43:60, virgin=FALSE)
#models <- train_models(container, algorithms=c("MAXENT", "SVM"))
#results <- classify_models(container, models)
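The commented-out code above sketches the intended classifiers but was never run. Below is a minimal sketch of the Naive Bayes step, assuming the e1071 package is installed and keeping the 70/30 split as data frames (rather than the matrices created above) so the formula interface works; it is an illustration, not a final tuned model.
library(e1071)
#keep the split as data frames so naiveBayes() can use the formula interface
train_df <- subset(tdm_text_Sparse, spl == TRUE)
test_df <- subset(tdm_text_Sparse, spl == FALSE)
#the class must be a factor for classification
train_df$Spam <- factor(train_df$Spam)
test_df$Spam <- factor(test_df$Spam)
model <- naiveBayes(Spam ~ ., data = train_df)
preds <- predict(model, newdata = test_df)
#confusion matrix and overall accuracy on the 30% test set
conf_matrix <- table(Predicted = preds, Actual = test_df$Spam)
conf_matrix
sum(diag(conf_matrix)) / sum(conf_matrix)
With only 60 messages in total, the accuracy estimate will be noisy, so the result should be read as a rough check rather than a final evaluation.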