Reading first set of spam files

library(stringr)

# Full paths of every entry directly inside the first spam folder.
spamFiles <- list.files(
  path = "C:/Data/spam",
  pattern = "*.*",
  full.names = TRUE,
  recursive = FALSE
)

# Discard the first path returned by list.files().
spamFiles <- spamFiles[-1]

# Read each remaining file whole, giving one character string per message.
spamList <- lapply(spamFiles, function(path) {
  readChar(path, file.info(path)$size)
})

Pulling email addresses

# First text run per message that starts at "From:" and contains an
# "@ ... ." address-like pattern (NA when nothing matches).
spamListAddress <- lapply(spamList, function(msg) {
  str_extract(msg, pattern = "From:.*@.*\\..*\\b")
})

# Reduce each "From:" match to lower-case address words:
#   1. skip the first six characters ("From: "),
#   2. keep the part from the word characters touching "@" onwards,
#   3. turn every non-letter into a space and squeeze runs of whitespace,
#   4. strip the final word together with the whitespace before it.
spamListAddress1 <- lapply(spamListAddress, function(from_line) {
  after_prefix <- str_sub(from_line, start = 7)
  address <- str_extract(after_prefix, pattern = "\\b\\w*@.*\\..*\\b")
  letters_only <- str_replace_all(tolower(address), "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(letters_only, "\\s{2,}", " ")
  gsub("\\s*\\w*$", "", squeezed)
})





# Full paths of every entry directly inside the easy-ham folder.
hamFiles <- list.files(
  path = "C:/Data/easy_ham",
  pattern = "*.*",
  full.names = TRUE,
  recursive = FALSE
)

head(hamFiles)
## [1] "C:/Data/easy_ham/0001.ea7e79d3153e7469e7a9c3e0af6a357e"
## [2] "C:/Data/easy_ham/0002.b3120c4bcbf3101e661161ee7efcb8bf"
## [3] "C:/Data/easy_ham/0003.acfc5ad94bbd27118a0d8685d18c89dd"
## [4] "C:/Data/easy_ham/0004.e8d5727378ddde5c3be181df593f1712"
## [5] "C:/Data/easy_ham/0005.8c3b9e9c0f3f183ddaf7592a11b99957"
## [6] "C:/Data/easy_ham/0006.ee8b0dba12856155222be180ba122058"
#hamFiles<-hamFiles[-1]

# Load every easy-ham message as one character string per file.
hamList <- lapply(hamFiles, function(path) {
  readChar(path, file.info(path)$size)
})

# First "From: ...@..." match of each message (NA when nothing matches).
hamListAddress <- lapply(hamList, function(msg) {
  str_extract(msg, pattern = "From:.*@.*\\..*\\b")
})

head(hamListAddress)
## [[1]]
## [1] "From: Robert Elz <kre@munnari.OZ.AU"
## 
## [[2]]
## [1] "From: Steve Burt <steve.burt@cursor-system.com"
## 
## [[3]]
## [1] "From: \"Tim Chapman\" <timc@2ubh.com"
## 
## [[4]]
## [1] "From: Monty Solomon <monty@roscom.com"
## 
## [[5]]
## [1] "From: Tony Nugent <tony@linuxworks.com.au"
## 
## [[6]]
## [1] "From: Stewart Smith <Stewart.Smith@ee.ed.ac.uk"
# Reduce each "From:" match to lower-case address words: skip "From: ",
# keep the part from the word characters touching "@" onwards, replace
# non-letters with spaces, squeeze whitespace, then strip the final word.
hamListAddress1 <- lapply(hamListAddress, function(from_line) {
  after_prefix <- str_sub(from_line, start = 7)
  address <- str_extract(after_prefix, pattern = "\\b\\w*@.*\\..*\\b")
  letters_only <- str_replace_all(tolower(address), "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(letters_only, "\\s{2,}", " ")
  gsub("\\s*\\w*$", "", squeezed)
})

head(hamListAddress1)
## [[1]]
## [1] "kre munnari oz"
## 
## [[2]]
## [1] "burt cursor system"
## 
## [[3]]
## [1] "timc ubh"
## 
## [[4]]
## [1] "monty roscom"
## 
## [[5]]
## [1] "tony linuxworks com"
## 
## [[6]]
## [1] "smith ee ed ac"

Doing statistical analyses

library(tm)
## Loading required package: NLP
# Flatten the cleaned spam addresses into a character vector.
spamListV<-unlist(spamListAddress1)

# Placeholder only; it is overwritten a few lines below.
email_corpus<-c()

# One-document corpus holding the first spam address.
tmp<-VCorpus(VectorSource(spamListV[1]))




  # The corpus starts out as that single document.
  email_corpus<-tmp
  
  # Label value written to the first document below.
  flag<-c("Yes")
  
  # Show the metadata of the first document before tagging it.
  meta(email_corpus[[1]])
##   author       : character(0)
##   datetimestamp: 2018-11-07 04:57:08
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
  inspect(email_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 14
  # Tag the first document as spam.
  meta(email_corpus[[1]],tag="spamFlag")<-flag
  


# Append the remaining spam addresses, one single-document corpus at a time,
# and tag each new document as spam. The first spam address is already in
# email_corpus, so iteration starts at the second element.
# seq_along(...)[-1] is empty for a list shorter than two elements, whereas
# 2:length(...) would count backwards (2, 1) in that case.
for (i in seq_along(spamList)[-1]) {
  tmp <- VCorpus(VectorSource(spamListAddress1[i]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "Yes"
}

# Append the ham addresses after the spam documents and tag them as ham.
# n walks hamListAddress1; i is the matching position inside email_corpus.
n <- 0

# Offsetting seq_along() keeps the loop empty when there is no ham at all;
# an a:b range would run backwards instead.
for (i in length(spamList) + seq_along(hamListAddress1)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(hamListAddress1[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "No"
}

email_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3051
# Shuffle the documents so spam and ham are mixed before the train/test split.
# No set.seed() call is visible before this point, so the order (and every
# result derived from it) presumably changes from run to run.
email_corpus1<-sample(email_corpus)

# Document-term matrix of the shuffled corpus.
dtm<-DocumentTermMatrix(email_corpus1)

dtm
## <<DocumentTermMatrix (documents: 3051, terms: 1354)>>
## Non-/sparse entries: 6747/4124307
## Sparsity           : 100%
## Maximal term length: 35
## Weighting          : term frequency (tf)
library(RTextTools)
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
# Collect the spamFlag tag of every document, in shuffled order; these are
# the labels handed to the classifiers.
flag<-unlist(meta(email_corpus1,"spamFlag"))

flag[1:100]
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No" "Yes"  "No"  "No" "Yes"  "No"  "No"  "No" "Yes"  "No"  "No"  "No" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No" "Yes"  "No"  "No" "Yes"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
## "Yes"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" "Yes" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No"  "No" "Yes"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No"  "No"  "No"  "No"  "No"  "No" "Yes" "Yes"  "No"  "No"  "No"  "No" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No"  "No"  "No"  "No" "Yes"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##     1     1     1     1     1     1     1     1     1     1     1     1 
##  "No"  "No"  "No" "Yes"  "No"  "No"  "No"  "No"  "No"  "No"  "No"  "No" 
##     1     1     1     1 
##  "No"  "No"  "No"  "No"
# Total number of labelled documents.
N<-length(flag)

N
## [1] 3051
# Rows 1:2000 of the shuffled matrix train the models; rows 2001:N are held out.
container<-create_container(dtm,labels=flag, trainSize=1:2000,testSize=2001:N,virgin=FALSE)

# NOTE(review): this inspects the unshuffled email_corpus, not email_corpus1 - confirm intent.
email_corpus[[2016]]
## <<PlainTextDocument>>
## Metadata:  8
## Content:  chars: 32
# Fit three classifiers on the container built above.
svm_model<-train_model(container,"SVM")

tree_model<-train_model(container,"TREE")

maxent_model<-train_model(container,"MAXENT")

# Classify the container's documents with each fitted model.
svm_out<-classify_model(container,svm_model)

tree_out<-classify_model(container,tree_model)

maxent_out<-classify_model(container,maxent_model)

# Peek at the SVM predictions (label and probability columns).
head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1        No 0.9571617
## 2        No 0.9571177
## 3        No 0.6980404
## 4        No 0.9571064
## 5       Yes 0.8443809
## 6        No 0.9571570
# Line up the true labels of rows 2001:N with the first column (the
# predicted label) of each model's output.
labels_out <- data.frame(
  correct_label = flag[2001:N],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  stringsAsFactors = FALSE
)

head(labels_out, 100)
##     correct_label svm tree maxent
## 1              No  No   No     No
## 2              No  No   No     No
## 3              No  No   No    Yes
## 4              No  No   No     No
## 5             Yes Yes   No    Yes
## 6              No  No   No     No
## 7             Yes Yes  Yes    Yes
## 8              No  No   No     No
## 9              No  No   No     No
## 10             No  No   No     No
## 11             No  No   No     No
## 12             No  No   No     No
## 13             No  No   No     No
## 14             No  No   No     No
## 15             No  No   No     No
## 16             No  No   No     No
## 17             No  No   No     No
## 18             No  No   No     No
## 19             No  No   No     No
## 20             No  No   No     No
## 21             No  No   No     No
## 22             No  No   No     No
## 23             No  No   No     No
## 24             No  No   No     No
## 25             No  No   No     No
## 26             No  No   No     No
## 27             No  No   No     No
## 28             No  No   No     No
## 29             No  No   No     No
## 30             No  No   No     No
## 31             No  No   No     No
## 32             No  No   No     No
## 33             No  No   No     No
## 34             No  No   No     No
## 35             No  No   No     No
## 36            Yes Yes   No    Yes
## 37             No  No   No     No
## 38             No  No   No     No
## 39             No  No   No     No
## 40             No  No   No     No
## 41            Yes Yes  Yes    Yes
## 42             No  No   No     No
## 43            Yes Yes   No    Yes
## 44             No  No   No     No
## 45            Yes Yes   No    Yes
## 46             No  No   No     No
## 47             No  No   No     No
## 48             No  No   No     No
## 49             No  No   No     No
## 50             No  No   No     No
## 51             No  No   No     No
## 52             No  No   No     No
## 53             No  No   No     No
## 54             No  No   No     No
## 55            Yes Yes  Yes    Yes
## 56             No  No   No     No
## 57             No  No   No     No
## 58             No  No   No     No
## 59             No  No   No     No
## 60             No  No   No     No
## 61             No  No   No     No
## 62             No  No   No     No
## 63             No  No   No     No
## 64            Yes  No   No    Yes
## 65             No  No   No     No
## 66             No  No   No     No
## 67             No  No   No     No
## 68             No  No   No     No
## 69             No  No   No     No
## 70             No  No   No     No
## 71             No  No   No     No
## 72             No  No   No     No
## 73             No  No   No     No
## 74            Yes  No   No     No
## 75             No  No   No     No
## 76             No  No   No     No
## 77             No  No   No     No
## 78             No  No   No     No
## 79             No  No   No     No
## 80             No  No   No     No
## 81             No  No   No     No
## 82             No  No   No     No
## 83             No  No   No     No
## 84             No  No   No     No
## 85             No  No   No     No
## 86             No  No   No     No
## 87             No  No   No     No
## 88            Yes  No   No     No
## 89             No  No   No     No
## 90             No  No   No     No
## 91             No  No   No     No
## 92             No  No   No     No
## 93             No  No   No     No
## 94            Yes Yes   No    Yes
## 95             No  No   No     No
## 96             No  No   No     No
## 97             No  No   No     No
## 98             No  No   No     No
## 99             No  No   No     No
## 100           Yes  No   No     No
#svm

table(labels_out[,1]==labels_out[,2])
## 
## FALSE  TRUE 
##    73   978
prop.table(table(labels_out[,1]==labels_out[,2]))
## 
##      FALSE       TRUE 
## 0.06945766 0.93054234
# SVM trained on the uncleansed email addresses: about 93% of the held-out labels are correct in the run shown above

# tree 

table(labels_out[,1]==labels_out[,3])
## 
## FALSE  TRUE 
##   113   938
prop.table(table(labels_out[,1]==labels_out[,3]))
## 
##     FALSE      TRUE 
## 0.1075167 0.8924833
# maximum entropy: about 95% correct in the run shown below

table(labels_out[,1]==labels_out[,4])
## 
## FALSE  TRUE 
##    52   999
prop.table(table(labels_out[,1]==labels_out[,4]))
## 
##      FALSE       TRUE 
## 0.04947669 0.95052331

Loading additional spam/ham files to see if our model works

#step 2 additional testing

# Full paths of every entry directly inside the second spam folder.
spamFiles1 <- list.files(
  path = "C:/Data/spam2/spam_2",
  pattern = "*.*",
  full.names = TRUE,
  recursive = FALSE
)

head(spamFiles1)
## [1] "C:/Data/spam2/spam_2/00001.317e78fa8ee2f54cd4890fdc09ba8176"
## [2] "C:/Data/spam2/spam_2/00002.9438920e9a55591b18e60d1ed37d992b"
## [3] "C:/Data/spam2/spam_2/00003.590eff932f8704d8b0fcbe69d023b54d"
## [4] "C:/Data/spam2/spam_2/00004.bdcc075fa4beb5157b5dd6cd41d8887b"
## [5] "C:/Data/spam2/spam_2/00005.ed0aba4d386c5e62bc737cf3f0ed9589"
## [6] "C:/Data/spam2/spam_2/00006.3ca1f399ccda5d897fecb8c57669a283"
#spamFiles1<-spamFiles1[-1]

# Load every message of the second spam set as one string per file.
spamList1 <- lapply(spamFiles1, function(path) {
  readChar(path, file.info(path)$size)
})

# First "From: ...@..." match of each message (NA when nothing matches).
spamListAddressA <- lapply(spamList1, function(msg) {
  str_extract(msg, pattern = "From:.*@.*\\..*\\b")
})

head(spamListAddressA)
## [[1]]
## [1] "From: \"Start Now\" <startnow2002@hotmail.com"
## 
## [[2]]
## [1] "From: lmrn@mailexcite.com"
## 
## [[3]]
## [1] "From: amknight@mailexcite.com"
## 
## [[4]]
## [1] "From: jordan23@mailexcite.com"
## 
## [[5]]
## [1] "From: yyyy@pluriproj.pt"
## 
## [[6]]
## [1] "From: 3b3fke@ms10.hinet.net"
# Reduce each "From:" match to lower-case address words: skip "From: ",
# keep the part from the word characters touching "@" onwards, replace
# non-letters with spaces, squeeze whitespace, then strip the final word.
spamListAddressA1 <- lapply(spamListAddressA, function(from_line) {
  after_prefix <- str_sub(from_line, start = 7)
  address <- str_extract(after_prefix, pattern = "\\b\\w*@.*\\..*\\b")
  letters_only <- str_replace_all(tolower(address), "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(letters_only, "\\s{2,}", " ")
  gsub("\\s*\\w*$", "", squeezed)
})

head(spamListAddressA1)
## [[1]]
## [1] "startnow hotmail"
## 
## [[2]]
## [1] "lmrn mailexcite"
## 
## [[3]]
## [1] "amknight mailexcite"
## 
## [[4]]
## [1] "jordan mailexcite"
## 
## [[5]]
## [1] "yyyy pluriproj"
## 
## [[6]]
## [1] " b fke ms hinet"
# Full paths of every entry directly inside the hard-ham folder.
hamFiles1 <- list.files(
  path = "C:/Data/ham1/hard_ham",
  pattern = "*.*",
  full.names = TRUE,
  recursive = FALSE
)

head(hamFiles1)
## [1] "C:/Data/ham1/hard_ham/00001.7c7d6921e671bbe18ebb5f893cd9bb35"
## [2] "C:/Data/ham1/hard_ham/00002.ca96f74042d05c1a1d29ca30467cfcd5"
## [3] "C:/Data/ham1/hard_ham/00003.268fd170a3fc73bee2739d8204856a53"
## [4] "C:/Data/ham1/hard_ham/00004.68819fc91d34c82433074d7bd3127dcc"
## [5] "C:/Data/ham1/hard_ham/00005.34bcaad58ad5f598f5d6af8cfa0c0465"
## [6] "C:/Data/ham1/hard_ham/00006.3409dec8ca4fcf2d6e0582554473b5c9"
# Load every hard-ham message as one character string per file.
hamList1 <- lapply(hamFiles1, function(path) {
  readChar(path, file.info(path)$size)
})

# First "From: ...@..." match of each message (NA when nothing matches).
hamListAddressA <- lapply(hamList1, function(msg) {
  str_extract(msg, pattern = "From:.*@.*\\..*\\b")
})

head(hamListAddressA)
## [[1]]
## [1] "From: The Motley Fool <Fool@motleyfool.com"
## 
## [[2]]
## [1] "From: malcolm-sweeps@mrichi.com"
## 
## [[3]]
## [1] "From: \"Starflung NIC\" <nic@starflung.com"
## 
## [[4]]
## [1] "From: \"John Levine\" <johnl@cauce.org"
## 
## [[5]]
## [1] "From: \"The ISO17799 Newsletter\" <iso17799@securityrisk.co.uk"
## 
## [[6]]
## [1] "From: \"jobfair24\" <newsletter@jobfair24.de"
# Reduce each "From:" match to lower-case address words: skip "From: ",
# keep the part from the word characters touching "@" onwards, replace
# non-letters with spaces, squeeze whitespace, then strip the final word.
hamListAddressA1 <- lapply(hamListAddressA, function(from_line) {
  after_prefix <- str_sub(from_line, start = 7)
  address <- str_extract(after_prefix, pattern = "\\b\\w*@.*\\..*\\b")
  letters_only <- str_replace_all(tolower(address), "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(letters_only, "\\s{2,}", " ")
  gsub("\\s*\\w*$", "", squeezed)
})

head(hamListAddressA1, 20)
## [[1]]
## [1] "fool motleyfool"
## 
## [[2]]
## [1] "sweeps mrichi"
## 
## [[3]]
## [1] "nic starflung"
## 
## [[4]]
## [1] "johnl cauce"
## 
## [[5]]
## [1] "iso securityrisk co"
## 
## [[6]]
## [1] "newsletter jobfair"
## 
## [[7]]
## [1] " a f c xmr"
## 
## [[8]]
## [1] "michaelr lindows"
## 
## [[9]]
## [1] "newsletter jobfair"
## 
## [[10]]
## [1] "michaelr lindows"
## 
## [[11]]
## [1] " newsletter online"
## 
## [[12]]
## [1] " newsletter online"
## 
## [[13]]
## [1] " newsletter online"
## 
## [[14]]
## [1] "update list theregister co"
## 
## [[15]]
## [1] "subscriptions lockergnome"
## 
## [[16]]
## [1] "subscriptions lockergnome"
## 
## [[17]]
## [1] " ummail unitedmedia"
## 
## [[18]]
## [1] " newsletter online"
## 
## [[19]]
## [1] "subscriptions lockergnome"
## 
## [[20]]
## [1] " newsletter online"
# Append the second spam set and the hard-ham set to email_corpus1.
#
# `start` is the corpus position of the first document of a set and `end`
# the position of its last one, so `end` is start + length - 1. The earlier
# start + length made each loop run once too often: the extra pass indexed
# one element past the end of the address list and appended a document with
# no usable content that still received a spam/ham flag.
# NOTE: recorded outputs further down (for example the 4701-document corpus)
# come from a run made before this correction.
start <- length(spamListAddress1) + length(hamListAddress1) + 1

start
## [1] 3052
end <- start + length(spamListAddressA1) - 1

end
## [1] 4448
n <- 0

# start - 1 + seq_along(...) is empty for an empty list, unlike start:end.
for (i in start - 1 + seq_along(spamListAddressA1)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(spamListAddressA1[n]))
  email_corpus1 <- c(email_corpus1, tmp)
  meta(email_corpus1[[i]], "spamFlag") <- "Yes"
}

# The hard-ham documents follow directly after the last spam document.
start <- end + 1

end <- start + length(hamListAddressA1) - 1

n <- 0
for (i in start - 1 + seq_along(hamListAddressA1)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(hamListAddressA1[n]))
  email_corpus1 <- c(email_corpus1, tmp)
  meta(email_corpus1[[i]], "spamFlag") <- "No"
}

email_corpus1
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 4701
# Reshuffle the enlarged corpus (first data set plus the appended files).
email_corpus2<-sample(email_corpus1)

# Document-term matrix of the reshuffled corpus.
dtm<-DocumentTermMatrix(email_corpus2)

dtm
## <<DocumentTermMatrix (documents: 4701, terms: 2974)>>
## Non-/sparse entries: 10292/13970482
## Sparsity           : 100%
## Maximal term length: 38
## Weighting          : term frequency (tf)
# Labels of the reshuffled documents, in matrix row order.
flag<-unlist(meta(email_corpus2,"spamFlag"))


N<-length(flag)

N
## [1] 4701
# NOTE(review): email_corpus2 is a reshuffle of all documents, so rows 1:3052
# are a random mix of the first and the additional files rather than the
# original training set - confirm this is the intended evaluation.
container<-create_container(dtm,labels=flag, trainSize=1:3052,testSize=3053:N,virgin=FALSE)


# Refit the three classifiers on the new container.
svm_model<-train_model(container,"SVM")

tree_model<-train_model(container,"TREE")

maxent_model<-train_model(container,"MAXENT")

# Classify the container's documents with each fitted model.
svm_out<-classify_model(container,svm_model)

tree_out<-classify_model(container,tree_model)

maxent_out<-classify_model(container,maxent_model)

# Peek at the SVM predictions (label and probability columns).
head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1       Yes 0.7534378
## 2        No 0.9892547
## 3       Yes 0.6189336
## 4        No 0.9892605
## 5       Yes 0.9912976
## 6        No 0.9892725
# Line up the true labels of rows 3053:N with the first column (the
# predicted label) of each model's output.
labels_out <- data.frame(
  correct_label = flag[3053:N],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  stringsAsFactors = FALSE
)

head(labels_out, 100)
##     correct_label svm tree maxent
## 1             Yes Yes   No    Yes
## 2              No  No   No     No
## 3              No Yes   No    Yes
## 4              No  No   No     No
## 5             Yes Yes  Yes    Yes
## 6              No  No   No     No
## 7             Yes Yes   No     No
## 8              No  No   No     No
## 9              No  No   No     No
## 10            Yes Yes  Yes    Yes
## 11             No  No   No     No
## 12            Yes Yes   No    Yes
## 13             No  No   No     No
## 14            Yes Yes  Yes    Yes
## 15             No  No   No     No
## 16             No  No   No     No
## 17             No  No   No     No
## 18             No  No   No     No
## 19             No  No   No     No
## 20             No  No   No     No
## 21             No Yes   No    Yes
## 22             No Yes   No     No
## 23             No  No   No     No
## 24             No  No   No     No
## 25             No  No   No     No
## 26            Yes Yes   No    Yes
## 27             No  No   No     No
## 28             No  No   No     No
## 29             No  No   No     No
## 30            Yes Yes   No    Yes
## 31             No Yes   No    Yes
## 32             No  No   No     No
## 33             No Yes   No    Yes
## 34            Yes Yes   No    Yes
## 35            Yes Yes   No    Yes
## 36            Yes Yes  Yes    Yes
## 37             No  No   No     No
## 38             No  No   No     No
## 39             No  No   No     No
## 40             No  No   No     No
## 41             No  No   No     No
## 42             No  No   No     No
## 43            Yes Yes   No    Yes
## 44            Yes Yes  Yes    Yes
## 45             No  No   No     No
## 46             No Yes   No    Yes
## 47             No  No   No     No
## 48            Yes Yes   No    Yes
## 49             No Yes   No    Yes
## 50             No  No   No     No
## 51             No  No   No     No
## 52            Yes Yes  Yes    Yes
## 53             No  No   No     No
## 54            Yes Yes   No    Yes
## 55             No  No   No     No
## 56            Yes Yes   No    Yes
## 57            Yes Yes   No    Yes
## 58             No  No   No     No
## 59             No  No   No     No
## 60             No  No   No     No
## 61             No  No   No     No
## 62            Yes Yes   No    Yes
## 63             No  No   No     No
## 64             No  No   No     No
## 65            Yes Yes   No    Yes
## 66             No  No   No     No
## 67            Yes Yes   No    Yes
## 68            Yes Yes   No    Yes
## 69            Yes Yes   No    Yes
## 70            Yes Yes   No    Yes
## 71            Yes Yes  Yes    Yes
## 72             No  No   No     No
## 73             No  No   No     No
## 74             No Yes   No    Yes
## 75            Yes Yes  Yes    Yes
## 76            Yes Yes   No    Yes
## 77            Yes Yes   No    Yes
## 78            Yes Yes  Yes    Yes
## 79             No  No   No     No
## 80            Yes Yes   No    Yes
## 81             No  No   No     No
## 82             No  No   No     No
## 83            Yes Yes   No    Yes
## 84             No  No   No     No
## 85            Yes Yes   No    Yes
## 86            Yes Yes   No    Yes
## 87            Yes Yes   No    Yes
## 88            Yes Yes  Yes    Yes
## 89             No  No   No     No
## 90             No  No   No     No
## 91             No  No   No     No
## 92            Yes Yes  Yes    Yes
## 93            Yes Yes   No    Yes
## 94            Yes Yes   No    Yes
## 95             No  No   No     No
## 96             No Yes   No    Yes
## 97            Yes Yes  Yes    Yes
## 98             No  No   No     No
## 99             No  No   No     No
## 100            No  No   No     No
#svm

table(labels_out[,1]==labels_out[,2])
## 
## FALSE  TRUE 
##   142  1507
prop.table(table(labels_out[,1]==labels_out[,2]))
## 
##     FALSE      TRUE 
## 0.0861128 0.9138872
# SVM trained on the uncleansed email addresses: about 91% of the held-out labels are correct in the run shown above

# tree 

table(labels_out[,1]==labels_out[,3])
## 
## FALSE  TRUE 
##   464  1185
prop.table(table(labels_out[,1]==labels_out[,3]))
## 
##     FALSE      TRUE 
## 0.2813827 0.7186173
# maximum entropy 93% correct rate

table(labels_out[,1]==labels_out[,4])
## 
## FALSE  TRUE 
##   108  1541
prop.table(table(labels_out[,1]==labels_out[,4]))
## 
##      FALSE       TRUE 
## 0.06549424 0.93450576
head(spamListAddress1)
## [[1]]
## [1] " a mailbot web"
## 
## [[2]]
## [1] "taylor s serveimage"
## 
## [[3]]
## [1] "sabrina mx premio"
## 
## [[4]]
## [1] "wsup playful"
## 
## [[5]]
## [1] "yenene mx premio"
## 
## [[6]]
## [1] "thecashsystem firemail"

Testing subject

Subject

# First text run per message that starts at "Subject:" and runs to the end
# of that line (NA when nothing matches).
spamListSubject <- lapply(spamList, function(msg) {
  str_extract(msg, pattern = "Subject:.*")
})

head(spamListSubject)
## [[1]]
## [1] "Subject: Life Insurance - Why Pay More?"
## 
## [[2]]
## [1] "Subject: [ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206"
## 
## [[3]]
## [1] "Subject: Guaranteed to lose 10-12 lbs in 30 days                          11.150"
## 
## [[4]]
## [1] "Subject: Re: Fw: User Name & Password to Membership To 5 Sites zzzz@example.com pviqg"
## 
## [[5]]
## [1] "Subject: [ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148"
## 
## [[6]]
## [1] "Subject: RE: Your Bank Account Information "
# Normalise each subject: skip the first nine characters ("Subject: "),
# replace non-letters with spaces, lower-case, squeeze whitespace runs, then
# drop one leading and one trailing whitespace character.
spamListSubject1 <- lapply(spamListSubject, function(subject_line) {
  text <- str_sub(subject_line, start = 10)
  letters_only <- str_replace_all(text, "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(tolower(letters_only), "\\s{2,}", " ")
  no_lead <- str_replace_all(squeezed, "^\\s", "")
  str_replace(no_lead, "\\s$", "")
})

head(spamListSubject1)
## [[1]]
## [1] "life insurance why pay more"
## 
## [[2]]
## [1] "ilug guaranteed to lose lbs in days"
## 
## [[3]]
## [1] "guaranteed to lose lbs in days"
## 
## [[4]]
## [1] "re fw user name password to membership to sites zzzz example com pviqg"
## 
## [[5]]
## [1] "ilug social re guaranteed to lose lbs in days"
## 
## [[6]]
## [1] "re your bank account information"
# First text run per message that starts at "Subject:" and runs to the end
# of that line (NA when nothing matches).
hamListSubject <- lapply(hamList, function(msg) {
  str_extract(msg, pattern = "Subject:.*")
})

head(hamListSubject)
## [[1]]
## [1] "Subject: Re: New Sequences Window"
## 
## [[2]]
## [1] "Subject: [zzzzteana] RE: Alexander"
## 
## [[3]]
## [1] "Subject: [zzzzteana] Moscow bomber"
## 
## [[4]]
## [1] "Subject: [IRR] Klez: The Virus That  Won't Die"
## 
## [[5]]
## [1] "Subject: Re: Insert signature"
## 
## [[6]]
## [1] "Subject: Re: [zzzzteana] Nothing like mama used to make"
# Normalise each subject: skip "Subject: ", replace non-letters with spaces,
# lower-case, squeeze whitespace runs, then drop one leading and one
# trailing whitespace character.
hamListSubject1 <- lapply(hamListSubject, function(subject_line) {
  text <- str_sub(subject_line, start = 10)
  letters_only <- str_replace_all(text, "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(tolower(letters_only), "\\s{2,}", " ")
  no_lead <- str_replace_all(squeezed, "^\\s", "")
  str_replace(no_lead, "\\s$", "")
})

head(hamListSubject1)
## [[1]]
## [1] "re new sequences window"
## 
## [[2]]
## [1] "zzzzteana re alexander"
## 
## [[3]]
## [1] "zzzzteana moscow bomber"
## 
## [[4]]
## [1] "irr klez the virus that won t die"
## 
## [[5]]
## [1] "re insert signature"
## 
## [[6]]
## [1] "re zzzzteana nothing like mama used to make"
# First text run per message of the second spam set that starts at
# "Subject:" and runs to the end of that line (NA when nothing matches).
spamListSubjectA <- lapply(spamList1, function(msg) {
  str_extract(msg, pattern = "Subject:.*")
})

head(spamListSubjectA)
## [[1]]
## [1] "Subject: [ILUG] STOP THE MLM INSANITY"
## 
## [[2]]
## [1] "Subject: Real Protection, Stun Guns!  Free Shipping! Time:2:01:35 PM"
## 
## [[3]]
## [1] "Subject: New Improved Fat Burners, Now With TV Fat Absorbers! Time:6:25:49 PM"
## 
## [[4]]
## [1] "Subject: New Improved Fat Burners, Now With TV Fat Absorbers! Time:7:20:54 AM"
## 
## [[5]]
## [1] "Subject: Never Repay Cash Grants, $500 - $50,000, Secret Revealed!"
## 
## [[6]]
## [1] "Subject: ÁÙ¦b¥Î20%ªº«H¥Î¥d´`Àô¶Ü??? Time:PM 05:36:34"
# Normalise each subject: skip "Subject: ", replace non-letters with spaces,
# lower-case, squeeze whitespace runs, then drop one leading and one
# trailing whitespace character.
spamListSubjectA1 <- lapply(spamListSubjectA, function(subject_line) {
  text <- str_sub(subject_line, start = 10)
  letters_only <- str_replace_all(text, "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(tolower(letters_only), "\\s{2,}", " ")
  no_lead <- str_replace_all(squeezed, "^\\s", "")
  str_replace(no_lead, "\\s$", "")
})

head(spamListSubjectA1)
## [[1]]
## [1] "ilug stop the mlm insanity"
## 
## [[2]]
## [1] "real protection stun guns free shipping time pm"
## 
## [[3]]
## [1] "new improved fat burners now with tv fat absorbers time pm"
## 
## [[4]]
## [1] "new improved fat burners now with tv fat absorbers time am"
## 
## [[5]]
## [1] "never repay cash grants secret revealed"
## 
## [[6]]
## [1] "b h d time pm"
# First text run per hard-ham message that starts at "Subject:" and runs to
# the end of that line (NA when nothing matches).
hamListSubjectA <- lapply(hamList1, function(msg) {
  str_extract(msg, pattern = "Subject:.*")
})

head(hamListSubjectA)
## [[1]]
## [1] "Subject: Personal Finance: Resolutions You Can Keep"
## 
## [[2]]
## [1] "Subject: Malcolm in the Middle Sweepstakes Prize Notification"
## 
## [[3]]
## [1] "Subject: Automated 30 day renewal reminder 2002-05-27"
## 
## [[4]]
## [1] "Subject: CAUCE NEWS, Vol 6, No 2, June 2002"
## 
## [[5]]
## [1] "Subject: The ISO17799 Newsletter - Issue 4"
## 
## [[6]]
## [1] "Subject: Virtueller Messetag der jobfair24 am Mittwoch, 03. Juli 2002"
# Normalise each subject: skip "Subject: ", replace non-letters with spaces,
# lower-case, squeeze whitespace runs, then drop one leading and one
# trailing whitespace character.
hamListSubjectA1 <- lapply(hamListSubjectA, function(subject_line) {
  text <- str_sub(subject_line, start = 10)
  letters_only <- str_replace_all(text, "[^a-zA-Z\\s]", " ")
  squeezed <- str_replace_all(tolower(letters_only), "\\s{2,}", " ")
  no_lead <- str_replace_all(squeezed, "^\\s", "")
  str_replace(no_lead, "\\s$", "")
})

head(hamListSubjectA1)
## [[1]]
## [1] "personal finance resolutions you can keep"
## 
## [[2]]
## [1] "malcolm in the middle sweepstakes prize notification"
## 
## [[3]]
## [1] "automated day renewal reminder"
## 
## [[4]]
## [1] "cauce news vol no june"
## 
## [[5]]
## [1] "the iso newsletter issue"
## 
## [[6]]
## [1] "virtueller messetag der jobfair am mittwoch juli"
# Flatten the cleaned spam subjects into a character vector.
spamListV<-unlist(spamListSubject1)

# Placeholder only; it is overwritten a few lines below.
email_corpus<-c()

# One-document corpus holding the first spam subject.
tmp<-VCorpus(VectorSource(spamListV[1]))




  # The corpus starts out as that single document.
  email_corpus<-tmp
  
  # Label value written to the first document below.
  flag<-c("Yes")
  
  # Show the metadata of the first document before tagging it.
  meta(email_corpus[[1]])
##   author       : character(0)
##   datetimestamp: 2018-11-07 04:57:32
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
  inspect(email_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 27
  # Tag the first document as spam.
  meta(email_corpus[[1]],tag="spamFlag")<-flag
  
  

# Grow the subject corpus: remaining first-set spam, easy ham, second-set
# spam, then hard ham. The first spam subject is already in email_corpus.
#
# For every set, `start` is the corpus position of its first document and
# `end` the position of its last one, so `end` is start + length - 1. The
# earlier start + length made the three later loops run once too often; each
# extra pass indexed one element past the end of its list and appended a
# document with no usable content that still received a spam/ham flag.
# NOTE: recorded outputs further down (for example the 4702-document corpus)
# come from a run made before this correction.
start <- 2

start
## [1] 2
end <- length(spamListSubject1)

end
## [1] 500
n <- 1

# seq_along(...)[-1] is empty for a list shorter than two elements, whereas
# start:end would count backwards in that case.
for (i in seq_along(spamListSubject1)[-1]) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(spamListSubject1[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "Yes"
}

start <- end + 1

end <- start + length(hamListSubject1) - 1

n <- 0

# start - 1 + seq_along(...) is empty for an empty list, unlike start:end.
for (i in start - 1 + seq_along(hamListSubject1)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(hamListSubject1[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "No"
}
start <- end + 1

start
## [1] 3052
end <- start + length(spamListSubjectA1) - 1

end
## [1] 4448
n <- 0

for (i in start - 1 + seq_along(spamListSubjectA1)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(spamListSubjectA1[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "Yes"
}

start <- end + 1

end <- start + length(hamListSubjectA1) - 1

n <- 0

for (i in start - 1 + seq_along(hamListSubjectA1)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(hamListSubjectA1[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "No"
}

email_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 4702
# Shuffle the subject documents before the train/test split.
email_corpus1<-sample(email_corpus)

# Document-term matrix of the shuffled subject corpus.
dtm<-DocumentTermMatrix(email_corpus1)

dtm
## <<DocumentTermMatrix (documents: 4702, terms: 5190)>>
## Non-/sparse entries: 21744/24381636
## Sparsity           : 100%
## Maximal term length: 31
## Weighting          : term frequency (tf)
# Labels of the shuffled subject documents, in matrix row order.
flag<-unlist(meta(email_corpus1,"spamFlag"))


N<-length(flag)

N
## [1] 4702
# Rows 1:3052 of the shuffled matrix train the models; rows 3053:N are held out.
container<-create_container(dtm,labels=flag, trainSize=1:3052,testSize=3053:N,virgin=FALSE)


# Fit the three classifiers on the subject container.
svm_model<-train_model(container,"SVM")

tree_model<-train_model(container,"TREE")

maxent_model<-train_model(container,"MAXENT")

# Classify the container's documents with each fitted model.
svm_out<-classify_model(container,svm_model)

tree_out<-classify_model(container,tree_model)

maxent_out<-classify_model(container,maxent_model)

# Peek at the SVM predictions (label and probability columns).
head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1        No 0.9694348
## 2        No 0.6200844
## 3        No 0.7448862
## 4        No 0.9024603
## 5        No 0.7249917
## 6       Yes 0.9722772
# Line up the true labels of rows 3053:N with the first column (the
# predicted label) of each model's output.
labels_out <- data.frame(
  correct_label = flag[3053:N],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  stringsAsFactors = FALSE
)

head(labels_out)
##   correct_label svm tree maxent
## 1            No  No   No     No
## 2            No  No   No    Yes
## 3            No  No   No     No
## 4            No  No   No     No
## 5            No  No   No     No
## 6           Yes Yes  Yes    Yes
#svm

table(labels_out[,1]==labels_out[,2])
## 
## FALSE  TRUE 
##   242  1408
prop.table(table(labels_out[,1]==labels_out[,2]))
## 
##     FALSE      TRUE 
## 0.1466667 0.8533333
# SVM model: about 85% correct in the run shown above

# tree 

table(labels_out[,1]==labels_out[,3])
## 
## FALSE  TRUE 
##   453  1197
prop.table(table(labels_out[,1]==labels_out[,3]))
## 
##     FALSE      TRUE 
## 0.2745455 0.7254545
# maximum entropy 88% correct rate

table(labels_out[,1]==labels_out[,4])
## 
## FALSE  TRUE 
##   196  1454
prop.table(table(labels_out[,1]==labels_out[,4]))
## 
##     FALSE      TRUE 
## 0.1187879 0.8812121

Testing Address + Subject

# email address + subject

# Stack both data sets: cleaned addresses first, then cleaned subjects,
# with spam and ham kept in separate lists.
spamListAddressC <- c(spamListAddress1, spamListAddressA1)
hamListAddressC <- c(hamListAddress1, hamListAddressA1)

spamListSubjectC <- c(spamListSubject1, spamListSubjectA1)
hamListSubjectC <- c(hamListSubject1, hamListSubjectA1)

# One "address words + space + subject words" string per spam message.
spamListAS <- str_c(
  spamListAddressC,
  " ",
  spamListSubjectC
)

head(spamListAS)
## [1] " a mailbot web life insurance why pay more"                                         
## [2] "taylor s serveimage ilug guaranteed to lose lbs in days"                            
## [3] "sabrina mx premio guaranteed to lose lbs in days"                                   
## [4] "wsup playful re fw user name password to membership to sites zzzz example com pviqg"
## [5] "yenene mx premio ilug social re guaranteed to lose lbs in days"                     
## [6] "thecashsystem firemail re your bank account information"
# One "address words + space + subject words" string per ham message.
hamListAS <- str_c(
  hamListAddressC,
  " ",
  hamListSubjectC
)

head(hamListAS)
## [1] "kre munnari oz re new sequences window"                    
## [2] "burt cursor system zzzzteana re alexander"                 
## [3] "timc ubh zzzzteana moscow bomber"                          
## [4] "monty roscom irr klez the virus that won t die"            
## [5] "tony linuxworks com re insert signature"                   
## [6] "smith ee ed ac re zzzzteana nothing like mama used to make"
# spamListAS as a plain character vector.
spamListV<-unlist(spamListAS)

# Placeholder only; it is overwritten a few lines below.
email_corpus<-c()

# One-document corpus holding the first spam address+subject string.
tmp<-VCorpus(VectorSource(spamListV[1]))




  # The corpus starts out as that single document.
  email_corpus<-tmp
  
  # Label value written to the first document below.
  flag<-c("Yes")
  
  # Show the metadata of the first document before tagging it.
  meta(email_corpus[[1]])
##   author       : character(0)
##   datetimestamp: 2018-11-07 04:57:53
##   description  : character(0)
##   heading      : character(0)
##   id           : 1
##   language     : en
##   origin       : character(0)
  inspect(email_corpus[1])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 42
  # Tag the first document as spam.
  meta(email_corpus[[1]],tag="spamFlag")<-flag
  
  

# Grow the address+subject corpus: remaining spam strings, then all ham
# strings. The first spam string is already in email_corpus.
#
# `start` is the corpus position of the first document of a set and `end`
# the position of its last one, so `end` is start + length - 1. The earlier
# start + length made the ham loop run once too often; the extra pass
# indexed one element past the end of hamListAS and appended a document with
# no usable content that was still flagged as ham.
# NOTE: recorded outputs further down (for example the 4700-document corpus)
# come from a run made before this correction.
start <- 2

start
## [1] 2
end <- length(spamListAS)

end
## [1] 1897
n <- 1

# seq_along(...)[-1] is empty for a vector shorter than two elements,
# whereas start:end would count backwards in that case.
for (i in seq_along(spamListAS)[-1]) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(spamListAS[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "Yes"
}

start <- end + 1

end <- start + length(hamListAS) - 1

n <- 0

# start - 1 + seq_along(...) is empty for an empty vector, unlike start:end.
for (i in start - 1 + seq_along(hamListAS)) {
  n <- n + 1
  tmp <- VCorpus(VectorSource(hamListAS[n]))
  email_corpus <- c(email_corpus, tmp)
  meta(email_corpus[[i]], "spamFlag") <- "No"
}

email_corpus
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 4700
# Shuffle the address+subject documents before the train/test split.
email_corpus1<-sample(email_corpus)

# Document-term matrix of the shuffled corpus.
dtm<-DocumentTermMatrix(email_corpus1)

dtm
## <<DocumentTermMatrix (documents: 4700, terms: 7769)>>
## Non-/sparse entries: 31759/36482541
## Sparsity           : 100%
## Maximal term length: 38
## Weighting          : term frequency (tf)
# Labels of the shuffled address+subject documents, in matrix row order.
flag<-unlist(meta(email_corpus1,"spamFlag"))


N<-length(flag)

N
## [1] 4700
# Rows 1:3052 of the shuffled matrix train the models; rows 3053:N are held out.
container<-create_container(dtm,labels=flag, trainSize=1:3052,testSize=3053:N,virgin=FALSE)


# Fit the three classifiers on the address+subject container.
svm_model<-train_model(container,"SVM")

tree_model<-train_model(container,"TREE")

maxent_model<-train_model(container,"MAXENT")

# Classify the container's documents with each fitted model.
svm_out<-classify_model(container,svm_model)

tree_out<-classify_model(container,tree_model)

maxent_out<-classify_model(container,maxent_model)

# Peek at the SVM predictions (label and probability columns).
head(svm_out)
##   SVM_LABEL  SVM_PROB
## 1        No 0.7371761
## 2       Yes 0.6101042
## 3        No 0.9993536
## 4       Yes 0.9998919
## 5        No 0.5100162
## 6        No 0.9949306
# Line up the true labels of rows 3053:N with the first column (the
# predicted label) of each model's output.
labels_out <- data.frame(
  correct_label = flag[3053:N],
  svm = as.character(svm_out[, 1]),
  tree = as.character(tree_out[, 1]),
  maxent = as.character(maxent_out[, 1]),
  stringsAsFactors = FALSE
)

head(labels_out)
##   correct_label svm tree maxent
## 1           Yes  No   No     No
## 2           Yes Yes   No    Yes
## 3            No  No   No     No
## 4           Yes Yes  Yes    Yes
## 5           Yes  No   No    Yes
## 6            No  No   No     No
# Accuracy on the test rows. For each model, TRUE counts the predictions
# that equal correct_label; prop.table() converts the counts into shares.

# SVM
table(labels_out$correct_label == labels_out$svm)
## 
## FALSE  TRUE 
##    86  1562
prop.table(table(labels_out$correct_label == labels_out$svm))
## 
##      FALSE       TRUE 
## 0.05218447 0.94781553
# SVM: 94.8% of the test labels predicted correctly - not bad

# Tree: 74.0% correct (output below) - bad

table(labels_out$correct_label == labels_out$tree)
## 
## FALSE  TRUE 
##   429  1219
prop.table(table(labels_out$correct_label == labels_out$tree))
## 
##     FALSE      TRUE 
## 0.2603155 0.7396845
# Maximum entropy: 95.6% correct (output below) - not bad

table(labels_out$correct_label == labels_out$maxent)
## 
## FALSE  TRUE 
##    72  1576
prop.table(table(labels_out$correct_label == labels_out$maxent))
## 
##      FALSE       TRUE 
## 0.04368932 0.95631068

The next step is to extract the body text of each email.

body of text

# Concatenate the raw message lists per class (spamList with spamList1,
# hamList with hamList1) so the bodies can be extracted in a single pass.
spamListC <- c(spamList, spamList1)

hamListC <- c(hamList, hamList1)

# Inspect the first few raw spam messages (headers and body together).
head(spamListC)
## [[1]]
## [1] "From 12a1mailbot1@web.de  Thu Aug 22 13:17:22 2002\r\nReturn-Path: <12a1mailbot1@web.de>\r\nDelivered-To: zzzz@localhost.example.com\r\nReceived: from localhost (localhost [127.0.0.1])\r\n\tby phobos.labs.example.com (Postfix) with ESMTP id 136B943C32\r\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:17:21 -0400 (EDT)\r\nReceived: from mail.webnote.net [193.120.211.219]\r\n\tby localhost with POP3 (fetchmail-5.9.0)\r\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:17:21 +0100 (IST)\r\nReceived: from dd_it7 ([210.97.77.167])\r\n\tby webnote.net (8.9.3/8.9.3) with ESMTP id NAA04623\r\n\tfor <zzzz@example.com>; Thu, 22 Aug 2002 13:09:41 +0100\r\nFrom: 12a1mailbot1@web.de\r\nReceived: from r-smtp.korea.com - 203.122.2.197 by dd_it7  with Microsoft SMTPSVC(5.5.1775.675.6);\r\n\t Sat, 24 Aug 2002 09:42:10 +0900\r\nTo: <dcek1a1@netsgo.com>\r\nSubject: Life Insurance - Why Pay More?\r\nDate: Wed, 21 Aug 2002 20:31:57 -1600\r\nMIME-Version: 1.0\r\nMessage-ID: <0103c1042001882DD_IT7@dd_it7>\r\nContent-Type: text/html; charset=\"iso-8859-1\"\r\nContent-Transfer-Encoding: quoted-printable\r\n\r\n<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\">\r\n<HTML><HEAD>\r\n<META content=3D\"text/html; charset=3Dwindows-1252\" http-equiv=3DContent-T=\r\nype>\r\n<META content=3D\"MSHTML 5.00.2314.1000\" name=3DGENERATOR></HEAD>\r\n<BODY><!-- Inserted by Calypso -->\r\n<TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r=\r\nules=3Dnone \r\nstyle=3D\"COLOR: black; DISPLAY: none\" width=3D\"100%\">\r\n  <TBODY>\r\n  <TR>\r\n    <TD colSpan=3D3>\r\n      <HR color=3Dblack noShade SIZE=3D1>\r\n    </TD></TR></TD></TR>\r\n  <TR>\r\n    <TD colSpan=3D3>\r\n      <HR color=3Dblack noShade SIZE=3D1>\r\n    </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=\r\n --><FONT \r\ncolor=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><=\r\n/TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000 
\r\nface=3D\"Copperplate Gothic Bold\" size=3D5 PTSIZE=3D\"10\">\r\n<CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff=\r\n0000 \r\nface=3D\"Copperplate Gothic Bold\" size=3D5 PTSIZE=3D\"10\">\r\n<CENTER>Why Spend More Than You Have To?\r\n<CENTER><FONT color=3D#ff0000 face=3D\"Copperplate Gothic Bold\" size=3D5 PT=\r\nSIZE=3D\"10\">\r\n<CENTER>Life Quote Savings\r\n<CENTER>\r\n<P align=3Dleft></P>\r\n<P align=3Dleft></P></FONT></U></I></B><BR></FONT></U></B></U></I>\r\n<P></P>\r\n<CENTER>\r\n<TABLE border=3D0 borderColor=3D#111111 cellPadding=3D0 cellSpacing=3D0 wi=\r\ndth=3D650>\r\n  <TBODY></TBODY></TABLE>\r\n<TABLE border=3D0 borderColor=3D#111111 cellPadding=3D5 cellSpacing=3D0 wi=\r\ndth=3D650>\r\n  <TBODY>\r\n  <TR>\r\n    <TD colSpan=3D2 width=3D\"35%\"><B><FONT face=3DVerdana size=3D4>Ensurin=\r\ng your \r\n      family's financial security is very important. Life Quote Savings ma=\r\nkes \r\n      buying life insurance simple and affordable. We Provide FREE Access =\r\nto The \r\n      Very Best Companies and The Lowest Rates.</FONT></B></TD></TR>\r\n  <TR>\r\n    <TD align=3Dmiddle vAlign=3Dtop width=3D\"18%\">\r\n      <TABLE borderColor=3D#111111 width=3D\"100%\">\r\n        <TBODY>\r\n        <TR>\r\n          <TD style=3D\"PADDING-LEFT: 5px; PADDING-RIGHT: 5px\" width=3D\"100=\r\n%\"><FONT \r\n            face=3DVerdana size=3D4><B>Life Quote Savings</B> is FAST, EAS=\r\nY and \r\n            SAVES you money! Let us help you get started with the best val=\r\nues in \r\n            the country on new coverage. You can SAVE hundreds or even tho=\r\nusands \r\n            of dollars by requesting a FREE quote from Lifequote Savings. =\r\nOur \r\n            service will take you less than 5 minutes to complete. Shop an=\r\nd \r\n            compare. SAVE up to 70% on all types of Life insurance! 
\r\n</FONT></TD></TR>\r\n        <TR><BR><BR>\r\n          <TD height=3D50 style=3D\"PADDING-LEFT: 5px; PADDING-RIGHT: 5px\" \r\n          width=3D\"100%\">\r\n            <P align=3Dcenter><B><FONT face=3DVerdana size=3D5><A \r\n            href=3D\"http://website.e365.cc/savequote/\">Click Here For Your=\r\n \r\n            Free Quote!</A></FONT></B></P></TD>\r\n          <P><FONT face=3DVerdana size=3D4><STRONG>\r\n          <CENTER>Protecting your family is the best investment you'll eve=\r\nr \r\n          make!<BR></B></TD></TR>\r\n        <TR><BR><BR></STRONG></FONT></TD></TR></TD></TR>\r\n        <TR></TR></TBODY></TABLE>\r\n      <P align=3Dleft><FONT face=3D\"Arial, Helvetica, sans-serif\" size=3D2=\r\n></FONT></P>\r\n      <P></P>\r\n      <CENTER><BR><BR><BR>\r\n      <P></P>\r\n      <P align=3Dleft><BR></B><BR><BR><BR><BR></P>\r\n      <P align=3Dcenter><BR></P>\r\n      <P align=3Dleft><BR></B><BR><BR></FONT>If you are in receipt of this=\r\n email \r\n      in error and/or wish to be removed from our list, <A \r\n      href=3D\"mailto:coins@btamail.net.cn\">PLEASE CLICK HERE</A> AND TYPE =\r\nREMOVE. If you \r\n      reside in any state which prohibits e-mail solicitations for insuran=\r\nce, \r\n      please disregard this \r\n      email.<BR></FONT><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR=\r\n><BR><BR><BR></FONT></P></CENTER></CENTER></TR></TBODY></TABLE></CENTER></=\r\nCENTER></CENTER></CENTER></CENTER></BODY></HTML>\r\n\r\n\r\n\r\n"
## 
## [[2]]
## [1] "From ilug-admin@linux.ie  Thu Aug 22 13:27:39 2002\r\nReturn-Path: <ilug-admin@linux.ie>\r\nDelivered-To: zzzz@localhost.example.com\r\nReceived: from localhost (localhost [127.0.0.1])\r\n\tby phobos.labs.example.com (Postfix) with ESMTP id A7FD7454F6\r\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:27:38 -0400 (EDT)\r\nReceived: from phobos [127.0.0.1]\r\n\tby localhost with IMAP (fetchmail-5.9.0)\r\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:27:38 +0100 (IST)\r\nReceived: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by\r\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MCJiZ06043 for\r\n    <zzzz-ilug@jmason.org>; Thu, 22 Aug 2002 13:19:44 +0100\r\nReceived: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org\r\n    (8.9.3/8.9.3) with ESMTP id NAA29323; Thu, 22 Aug 2002 13:18:52 +0100\r\nReceived: from email.qves.com ([67.104.83.251]) by lugh.tuatha.org\r\n    (8.9.3/8.9.3) with ESMTP id NAA29282 for <ilug@linux.ie>; Thu,\r\n    22 Aug 2002 13:18:37 +0100\r\nX-Authentication-Warning: lugh.tuatha.org: Host [67.104.83.251] claimed to\r\n    be email.qves.com\r\nReceived: from qvp0091 ([169.254.6.22]) by email.qves.com with Microsoft\r\n    SMTPSVC(5.0.2195.2966); Thu, 22 Aug 2002 06:18:18 -0600\r\nFrom: \"Slim Down\" <taylor@s3.serveimage.com>\r\nTo: <ilug@linux.ie>\r\nDate: Thu, 22 Aug 2002 06:18:18 -0600\r\nMessage-Id: <59e6301c249d5$ffb7ea20$1606fea9@freeyankeedom.com>\r\nMIME-Version: 1.0\r\nContent-Type: text/plain; charset=\"iso-8859-1\"\r\nContent-Transfer-Encoding: 7bit\r\nX-Mailer: Microsoft CDO for Windows 2000\r\nThread-Index: AcJJ1f+3FWdz11AmR6uWbmQN5gGxxw==\r\nContent-Class: urn:content-classes:message\r\nX-Mimeole: Produced By Microsoft MimeOLE V6.00.2462.0000\r\nX-Originalarrivaltime: 22 Aug 2002 12:18:18.0699 (UTC) FILETIME=[FFB949B0:01C249D5]\r\nSubject: [ILUG] Guaranteed to lose 10-12 lbs in 30 days 10.206\r\nSender: ilug-admin@linux.ie\r\nErrors-To: ilug-admin@linux.ie\r\nX-Mailman-Version: 
1.1\r\nPrecedence: bulk\r\nList-Id: Irish Linux Users' Group <ilug.linux.ie>\r\nX-Beenthere: ilug@linux.ie\r\n\r\n1) Fight The Risk of Cancer!\r\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\r\n\r\n2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\r\nhttp://www.adclick.ws/p.cfm?o=249&s=pk007\r\n\r\n3) Get the Child Support You Deserve - Free Legal Advice\r\nhttp://www.adclick.ws/p.cfm?o=245&s=pk002\r\n\r\n4) Join the Web's Fastest Growing Singles Community\r\nhttp://www.adclick.ws/p.cfm?o=259&s=pk007\r\n\r\n5) Start Your Private Photo Album Online!\r\nhttp://www.adclick.ws/p.cfm?o=283&s=pk007\r\n\r\nHave a Wonderful Day,\r\nOffer Manager\r\nPrizeMama\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nIf you wish to leave this list please use the link below.\r\nhttp://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258\r\n\r\n\r\n-- \r\nIrish Linux Users' Group: ilug@linux.ie\r\nhttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\r\nList maintainer: listmaster@linux.ie\r\n\r\n"
## 
## [[3]]
## [1] "From sabrina@mx3.1premio.com  Thu Aug 22 14:44:07 2002\r\nReturn-Path: <sabrina@mx3.1premio.com>\r\nDelivered-To: zzzz@localhost.example.com\r\nReceived: from localhost (localhost [127.0.0.1])\r\n\tby phobos.labs.example.com (Postfix) with ESMTP id 1E90847C66\r\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 09:44:02 -0400 (EDT)\r\nReceived: from mail.webnote.net [193.120.211.219]\r\n\tby localhost with POP3 (fetchmail-5.9.0)\r\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 14:44:03 +0100 (IST)\r\nReceived: from email.qves.com (email1.qves.net [209.63.151.251] (may be forged))\r\n\tby webnote.net (8.9.3/8.9.3) with ESMTP id OAA04953\r\n\tfor <zzzz@example.com>; Thu, 22 Aug 2002 14:37:23 +0100\r\nReceived: from qvp0086 ([169.254.6.17]) by email.qves.com with Microsoft SMTPSVC(5.0.2195.2966);\r\n\t Thu, 22 Aug 2002 07:36:20 -0600\r\nFrom: \"Slim Down\" <sabrina@mx3.1premio.com>\r\nTo: <zzzz@example.com>\r\nSubject: Guaranteed to lose 10-12 lbs in 30 days                          11.150\r\nDate: Thu, 22 Aug 2002 07:36:19 -0600\r\nMessage-ID: <9a63c01c249e0$e5a9d610$1106fea9@freeyankeedom.com>\r\nMIME-Version: 1.0\r\nContent-Type: text/plain;\r\n\tcharset=\"iso-8859-1\"\r\nContent-Transfer-Encoding: 7bit\r\nX-Mailer: Microsoft CDO for Windows 2000\r\nThread-Index: AcJJ4OWpowGq7rdNSwCz5HE3x9ZZDQ==\r\nContent-Class: urn:content-classes:message\r\nX-MimeOLE: Produced By Microsoft MimeOLE V6.00.2462.0000\r\nX-OriginalArrivalTime: 22 Aug 2002 13:36:20.0969 (UTC) FILETIME=[E692FD90:01C249E0]\r\n\r\n1) Fight The Risk of Cancer!\r\nhttp://www.adclick.ws/p.cfm?o=315&s=pk007\r\n\r\n2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\r\nhttp://www.adclick.ws/p.cfm?o=249&s=pk007\r\n\r\n3) Get the Child Support You Deserve - Free Legal Advice\r\nhttp://www.adclick.ws/p.cfm?o=245&s=pk002\r\n\r\n4) Join the Web's Fastest Growing Singles Community\r\nhttp://www.adclick.ws/p.cfm?o=259&s=pk007\r\n\r\n5) Start Your Private Photo Album 
Online!\r\nhttp://www.adclick.ws/p.cfm?o=283&s=pk007\r\n\r\nHave a Wonderful Day,\r\nOffer Manager\r\nPrizeMama\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nIf you wish to leave this list please use the link below.\r\nhttp://www.qves.com/trim/?zzzz@example.com%7C17%7C308417\r\n\r\n"
## 
## [[4]]
## [1] "From wsup@playful.com  Thu Aug 22 16:17:00 2002\r\nReturn-Path: <wsup@playful.com>\r\nDelivered-To: zzzz@localhost.example.com\r\nReceived: from localhost (localhost [127.0.0.1])\r\n\tby phobos.labs.example.com (Postfix) with ESMTP id B8E8D43F99\r\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 11:16:59 -0400 (EDT)\r\nReceived: from mail.webnote.net [193.120.211.219]\r\n\tby localhost with POP3 (fetchmail-5.9.0)\r\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 16:16:59 +0100 (IST)\r\nReceived: from smtp.easydns.com (smtp.easydns.com [205.210.42.30])\r\n\tby webnote.net (8.9.3/8.9.3) with ESMTP id QAA05397\r\n\tfor <zzzz@example.com>; Thu, 22 Aug 2002 16:13:20 +0100\r\nReceived: from 200.161.16.132 (unknown [210.19.113.130])\r\n\tby smtp.easydns.com (Postfix) with SMTP id 694632EE5A\r\n\tfor <zzzz@example.com>; Thu, 22 Aug 2002 11:12:57 -0400 (EDT)\r\nReceived: from unknown (52.127.142.42) by rly-xl04.mx.aol.com with smtp; Aug, 22 2002 8:02:13 AM +0400\r\nReceived: from [176.244.234.14] by smtp-server6.tampabay.rr.com with local; Aug, 22 2002 6:50:51 AM +1200\r\nReceived: from [106.226.127.61] by n7.groups.yahoo.com with local; Aug, 22 2002 5:50:53 AM +0300\r\nReceived: from 177.139.227.166 ([177.139.227.166]) by sparc.isl.net with QMQP; Aug, 22 2002 4:47:16 AM -0000\r\nFrom: Account Services <wsup@playful.com>\r\nTo: zzzz@example.com\r\nCc: \r\nSubject: Re: Fw: User Name & Password to Membership To 5 Sites zzzz@example.com pviqg\r\nSender: Account Services <wsup@playful.com>\r\nMime-Version: 1.0\r\nContent-Type: text/plain; charset=\"iso-8859-1\"\r\nDate: Thu, 22 Aug 2002 08:13:35 -0700\r\nX-Mailer: Microsoft Outlook Express 5.00.2615.200\r\nX-Priority: 1\r\nMessage-Id: <20020822151301.694632EE5A@smtp.easydns.com>\r\n\r\n##################################################\r\n#                                                #\r\n#                 Adult Club                     #\r\n#           Offers FREE Membership               #\r\n#                       
                         #\r\n##################################################\r\n\r\n>>>>>  INSTANT ACCESS TO ALL SITES NOW\r\n>>>>>  Your User Name And Password is.\r\n>>>>>  User Name: zzzz@example.com\r\n>>>>>  Password: 760382\r\n\r\n5 of the Best Adult Sites on the Internet for FREE!\r\n---------------------------------------\r\nNEWS 08/18/02\r\nWith just over 2.9 Million Members that signed up for FREE, Last month there were 721,184 New\r\nMembers. Are you one of them yet???\r\n---------------------------------------\r\nOur Membership FAQ\r\n\r\nQ. Why are you offering free access to 5 adult membership sites for free?\r\nA. I have advertisers that pay me for ad space so you don't have to pay for membership.\r\n\r\nQ. Is it true my membership is for life?\r\nA. Absolutely you'll never have to pay a cent the advertisers do.\r\n\r\nQ. Can I give my account to my friends and family?\r\nA. Yes, as long they are over the age of 18.\r\n\r\nQ. Do I have to sign up for all 5 membership sites?\r\nA. No just one to get access to all of them.\r\n\r\nQ. How do I get started?\r\nA. Click on one of the following links below to become a member.\r\n\r\n- These are multi million dollar operations with policies and rules.\r\n- Fill in the required info and they won't charge you for the Free pass!\r\n- If you don't believe us, just read their terms and conditions.\r\n\r\n---------------------------\r\n\r\n# 5. > Adults Farm\r\nhttp://80.71.66.8/farm/?aid=760382\r\nGirls and Animals Getting Freaky....FREE Lifetime Membership!!\r\n\r\n# 4. > Sexy Celebes\r\nhttp://80.71.66.8/celebst/?aid=760382\r\nThousands Of XXX Celebes doing it...FREE Lifetime Membership!!\r\n\r\n# 3. > Play House Porn\r\nhttp://80.71.66.8/mega/?aid=760382\r\nLive Feeds From 60 Sites And Web Cams...FREE Lifetime Membership!!\r\n\r\n# 2. > Asian Sex Fantasies\r\nhttp://80.71.66.8/asian/?aid=760382\r\nJapanese Schoolgirls, Live Sex Shows ...FREE Lifetime Membership!!\r\n\r\n# 1. 
> Lesbian Lace\r\nhttp://80.71.66.8/lesbian/?aid=760382\r\nGirls and Girls Getting Freaky! ...FREE Lifetime Membership!!\r\n\r\n--------------------------\r\n\r\nJennifer Simpson, Miami, FL\r\nYour FREE lifetime membership has entertained my boyffriend and I for\r\nthe last two years!  Your Adult Sites are the best on the net!\r\n\r\nJoe Morgan Manhattan, NY\r\nYour live sex shows and live sex cams are unbelievable. The best part\r\nabout your porn sites, is that they're absolutely FREE!\r\n\r\n--------------------------\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nRemoval Instructions:\r\n\r\nYou have received this advertisement because you have opted in to receive free adult internet\r\noffers and specials through our affiliated websites. If you do not wish to receive further emails\r\nor have received the email in error you may opt-out of our database here\r\nhttp://80.71.66.8/optout/ . Please allow 24 hours for removal.\r\n\r\nvonolmosatkirekpups\r\n\r\n"
## 
## [[5]]
## [1] "From social-admin@linux.ie  Thu Aug 22 16:37:34 2002\r\nReturn-Path: <social-admin@linux.ie>\r\nDelivered-To: zzzz@localhost.example.com\r\nReceived: from localhost (localhost [127.0.0.1])\r\n\tby phobos.labs.example.com (Postfix) with ESMTP id 30B2143F99\r\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 11:37:34 -0400 (EDT)\r\nReceived: from phobos [127.0.0.1]\r\n\tby localhost with IMAP (fetchmail-5.9.0)\r\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 16:37:34 +0100 (IST)\r\nReceived: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by\r\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MFYOZ12548 for\r\n    <zzzz+ilug-social@jmason.org>; Thu, 22 Aug 2002 16:34:25 +0100\r\nReceived: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org\r\n    (8.9.3/8.9.3) with ESMTP id QAA07692; Thu, 22 Aug 2002 16:33:43 +0100\r\nReceived: from email.qves.com ([67.104.83.251]) by lugh.tuatha.org\r\n    (8.9.3/8.9.3) with ESMTP id QAA07662 for <social@linux.ie>; Thu,\r\n    22 Aug 2002 16:33:37 +0100\r\nX-Authentication-Warning: lugh.tuatha.org: Host [67.104.83.251] claimed to\r\n    be email.qves.com\r\nReceived: from qvp0080 ([169.254.6.11]) by email.qves.com with Microsoft\r\n    SMTPSVC(5.0.2195.2966); Thu, 22 Aug 2002 09:33:08 -0600\r\nFrom: \"Slim n Trim\" <yenene@mx2.1premio.com>\r\nTo: <social@linux.ie>\r\nDate: Thu, 22 Aug 2002 09:33:07 -0600\r\nMessage-Id: <104c1101c249f1$36e098b0$0b06fea9@freeyankeedom.com>\r\nMIME-Version: 1.0\r\nContent-Type: text/plain; charset=\"iso-8859-1\"\r\nContent-Transfer-Encoding: 7bit\r\nX-Mailer: Microsoft CDO for Windows 2000\r\nThread-Index: AcJJ8TbZoOKEj0AtTsKxJ7ZmOA0e/w==\r\nContent-Class: urn:content-classes:message\r\nX-Mimeole: Produced By Microsoft MimeOLE V6.00.2462.0000\r\nX-Originalarrivaltime: 22 Aug 2002 15:33:08.0313 (UTC) FILETIME=[3746D490:01C249F1]\r\nSubject: [ILUG-Social] re: Guaranteed to lose 10-12 lbs in 30 days 10.148\r\nSender: social-admin@linux.ie\r\nErrors-To: 
social-admin@linux.ie\r\nX-Mailman-Version: 1.1\r\nPrecedence: bulk\r\nList-Id: Irish Linux Users' Group social events <social.linux.ie>\r\nX-Beenthere: social@linux.ie\r\n\r\nI thought you might like these:\r\n1) Slim Down - Guaranteed to lose 10-12 lbs in 30 days\r\nhttp://www.freeyankee.com/cgi/fy2/to.cgi?l=822slim1\r\n\r\n2) Fight The Risk of Cancer! \r\nhttp://www.freeyankee.com/cgi/fy2/to.cgi?l=822nic1 \r\n\r\n3) Get the Child Support You Deserve - Free Legal Advice \r\nhttp://www.freeyankee.com/cgi/fy2/to.cgi?l=822ppl1\r\n\r\nOffer Manager\r\nDaily-Deals\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\nIf you wish to leave this list please use the link below.\r\nhttp://www.qves.com/trim/?social@linux.ie%7C29%7C134077\r\n\r\n\r\n-- \r\nIrish Linux Users' Group Social Events: social@linux.ie\r\nhttp://www.linux.ie/mailman/listinfo/social for (un)subscription information.\r\nList maintainer: listmaster@linux.ie\r\n\r\n"
## 
## [[6]]
## [1] "From Thecashsystem@firemail.de  Thu Aug 22 16:58:24 2002\r\nReturn-Path: <Thecashsystem@firemail.de>\r\nDelivered-To: zzzz@localhost.example.com\r\nReceived: from localhost (localhost [127.0.0.1])\r\n\tby phobos.labs.example.com (Postfix) with ESMTP id 3453043F99\r\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 11:58:24 -0400 (EDT)\r\nReceived: from mail.webnote.net [193.120.211.219]\r\n\tby localhost with POP3 (fetchmail-5.9.0)\r\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 16:58:24 +0100 (IST)\r\nReceived: from mailbox-13.st1.spray.net (mailbox-13.st1.spray.net [212.78.202.113])\r\n\tby webnote.net (8.9.3/8.9.3) with ESMTP id QAA05573\r\n\tfor <zzzz@example.com>; Thu, 22 Aug 2002 16:55:29 +0100\r\nReceived: from freesource (user-24-214-168-210.knology.net [24.214.168.210])\r\n\tby mailbox-13.st1.spray.net (Postfix) with ESMTP\r\n\tid ADDD03E25C; Thu, 22 Aug 2002 17:50:55 +0200 (DST)\r\nMessage-ID: <413-220028422154219900@freesource>\r\nX-Priority: 1\r\nTo: \"1\" <thecashsystem@firemail.de>\r\nFrom: \"TheCashSystem\" <Thecashsystem@firemail.de>\r\nSubject: RE: Your Bank Account Information \r\nDate: Thu, 22 Aug 2002 10:42:19 -0500\r\nMIME-Version: 1.0\r\nContent-type: text/plain; charset=US-ASCII\r\nX-MIME-Autoconverted: from quoted-printable to 8bit by webnote.net id QAA05573\r\nContent-Transfer-Encoding: 8bit\r\n\r\nA POWERHOUSE GIFTING PROGRAM You Don't Want To Miss! \r\n \r\n  GET IN WITH THE FOUNDERS! 
\r\nThe MAJOR PLAYERS are on This ONE\r\nFor ONCE be where the PlayerS are\r\nThis is YOUR Private Invitation\r\n\r\nEXPERTS ARE CALLING THIS THE FASTEST WAY \r\nTO HUGE CASH FLOW EVER CONCEIVED\r\nLeverage $1,000 into $50,000 Over and Over Again\r\n\r\nTHE QUESTION HERE IS:\r\nYOU EITHER WANT TO BE WEALTHY \r\nOR YOU DON'T!!!\r\nWHICH ONE ARE YOU?\r\nI am tossing you a financial lifeline and for your sake I \r\nHope you GRAB onto it and hold on tight For the Ride of youR life!\r\n\r\nTestimonials\r\n\r\nHear what average people are doing their first few days:\r\n“We've received 8,000 in 1 day and we are doing that over and over again!' Q.S. in AL\r\n “I'm a single mother in FL and I've received 12,000 in the last 4 days.” D. S. in FL\r\n“I was not sure about this when I sent off my $1,000 pledge, but I got back $2,000 the very next day!” L.L. in KY\r\n“I didn't have the money, so I found myself a partner to work this with. We have received $4,000 over the last 2 days. \r\nI think I made the right decision; don't you?” K. C. in FL\r\n“I pick up $3,000 my first day and I  they gave me free leads and all the training, you can too!” J.W. in CA\r\n\r\nANNOUNCING: We will CLOSE your sales for YOU! And Help you get a Fax Blast IMMEDIATELY Upon Your Entry!!!    YOU Make the MONEY!!!\r\nFREE LEADS!!! TRAINING!!!\r\n\r\n$$DON'T WAIT!!! CALL NOW $$\r\nFAX BACK TO: 1-800-421-6318 OR Call 1-800-896-6568 \r\n\r\nName__________________________________Phone___________________________________________\r\n\r\nFax_____________________________________Email____________________________________________\r\n\r\nBest Time To Call_________________________Time Zone________________________________________\r\n\r\nThis message is sent in compliance of the new e-mail bill. \"Per Section 301, Paragraph (a)(2)(C) of S. 1618, further transmissions by the sender of this email may be stopped, at no cost to you, by sending a reply to this email address with the word \"REMOVE\" in the subject line. 
Errors, omissions, and exceptions excluded.\r\n \r\nThis is NOT spam! I have compiled this list from our Replicate Database, relative to Seattle Marketing Group, The Gigt, or Turbo Team for the sole purpose of these communications. Your continued inclusion is ONLY by your gracious permission. If you wish to not receive this mail from me, please send an email to tesrewinter@yahoo.com with \"Remove\" in the subject and you will be deleted immediately.\r\n\r\n\r\n\r\n"
# Extract the body of every spam message: the text after the first blank
# line (the first "\r\n\r\n"), with the remaining line breaks flattened to
# single spaces. Messages without a "\r\n\r\n" yield NA.
spamBody <- lapply(spamListC, function(x) {
  # Mark the first blank line with a sentinel token.
  marked <- str_replace(x, "\r\n\r\n", " zzzMGfirstblank ")
  # Put the whole message on one line so that `.*` below can run across
  # what used to be separate lines.
  flattened <- str_replace_all(marked, "\r\n", " ")
  # Keep everything from the sentinel onwards.
  from_marker <- str_extract(flattened, " zzzMGfirstblank.*")
  # The sentinel with its two surrounding spaces is 17 characters long, so
  # the body starts at character 18.
  str_sub(from_marker, start = 18)
})

head(spamBody)
## [[1]]
## [1] "<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\"> <HTML><HEAD> <META content=3D\"text/html; charset=3Dwindows-1252\" http-equiv=3DContent-T= ype> <META content=3D\"MSHTML 5.00.2314.1000\" name=3DGENERATOR></HEAD> <BODY><!-- Inserted by Calypso --> <TABLE border=3D0 cellPadding=3D0 cellSpacing=3D2 id=3D_CalyPrintHeader_ r= ules=3Dnone  style=3D\"COLOR: black; DISPLAY: none\" width=3D\"100%\">   <TBODY>   <TR>     <TD colSpan=3D3>       <HR color=3Dblack noShade SIZE=3D1>     </TD></TR></TD></TR>   <TR>     <TD colSpan=3D3>       <HR color=3Dblack noShade SIZE=3D1>     </TD></TR></TBODY></TABLE><!-- End Calypso --><!-- Inserted by Calypso=  --><FONT  color=3D#000000 face=3DVERDANA,ARIAL,HELVETICA size=3D-2><BR></FONT></TD><= /TR></TABLE><!-- End Calypso --><FONT color=3D#ff0000  face=3D\"Copperplate Gothic Bold\" size=3D5 PTSIZE=3D\"10\"> <CENTER>Save up to 70% on Life Insurance.</CENTER></FONT><FONT color=3D#ff= 0000  face=3D\"Copperplate Gothic Bold\" size=3D5 PTSIZE=3D\"10\"> <CENTER>Why Spend More Than You Have To? <CENTER><FONT color=3D#ff0000 face=3D\"Copperplate Gothic Bold\" size=3D5 PT= SIZE=3D\"10\"> <CENTER>Life Quote Savings <CENTER> <P align=3Dleft></P> <P align=3Dleft></P></FONT></U></I></B><BR></FONT></U></B></U></I> <P></P> <CENTER> <TABLE border=3D0 borderColor=3D#111111 cellPadding=3D0 cellSpacing=3D0 wi= dth=3D650>   <TBODY></TBODY></TABLE> <TABLE border=3D0 borderColor=3D#111111 cellPadding=3D5 cellSpacing=3D0 wi= dth=3D650>   <TBODY>   <TR>     <TD colSpan=3D2 width=3D\"35%\"><B><FONT face=3DVerdana size=3D4>Ensurin= g your        family's financial security is very important. Life Quote Savings ma= kes        buying life insurance simple and affordable. 
We Provide FREE Access = to The        Very Best Companies and The Lowest Rates.</FONT></B></TD></TR>   <TR>     <TD align=3Dmiddle vAlign=3Dtop width=3D\"18%\">       <TABLE borderColor=3D#111111 width=3D\"100%\">         <TBODY>         <TR>           <TD style=3D\"PADDING-LEFT: 5px; PADDING-RIGHT: 5px\" width=3D\"100= %\"><FONT              face=3DVerdana size=3D4><B>Life Quote Savings</B> is FAST, EAS= Y and              SAVES you money! Let us help you get started with the best val= ues in              the country on new coverage. You can SAVE hundreds or even tho= usands              of dollars by requesting a FREE quote from Lifequote Savings. = Our              service will take you less than 5 minutes to complete. Shop an= d              compare. SAVE up to 70% on all types of Life insurance!  </FONT></TD></TR>         <TR><BR><BR>           <TD height=3D50 style=3D\"PADDING-LEFT: 5px; PADDING-RIGHT: 5px\"            width=3D\"100%\">             <P align=3Dcenter><B><FONT face=3DVerdana size=3D5><A              href=3D\"http://website.e365.cc/savequote/\">Click Here For Your=               Free Quote!</A></FONT></B></P></TD>           <P><FONT face=3DVerdana size=3D4><STRONG>           <CENTER>Protecting your family is the best investment you'll eve= r            make!<BR></B></TD></TR>         <TR><BR><BR></STRONG></FONT></TD></TR></TD></TR>         <TR></TR></TBODY></TABLE>       <P align=3Dleft><FONT face=3D\"Arial, Helvetica, sans-serif\" size=3D2= ></FONT></P>       <P></P>       <CENTER><BR><BR><BR>       <P></P>       <P align=3Dleft><BR></B><BR><BR><BR><BR></P>       <P align=3Dcenter><BR></P>       <P align=3Dleft><BR></B><BR><BR></FONT>If you are in receipt of this=  email        in error and/or wish to be removed from our list, <A        href=3D\"mailto:coins@btamail.net.cn\">PLEASE CLICK HERE</A> AND TYPE = REMOVE. 
If you        reside in any state which prohibits e-mail solicitations for insuran= ce,        please disregard this        email.<BR></FONT><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR><BR= ><BR><BR><BR></FONT></P></CENTER></CENTER></TR></TBODY></TABLE></CENTER></= CENTER></CENTER></CENTER></CENTER></BODY></HTML>    "
## 
## [[2]]
## [1] "1) Fight The Risk of Cancer! http://www.adclick.ws/p.cfm?o=315&s=pk007  2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days http://www.adclick.ws/p.cfm?o=249&s=pk007  3) Get the Child Support You Deserve - Free Legal Advice http://www.adclick.ws/p.cfm?o=245&s=pk002  4) Join the Web's Fastest Growing Singles Community http://www.adclick.ws/p.cfm?o=259&s=pk007  5) Start Your Private Photo Album Online! http://www.adclick.ws/p.cfm?o=283&s=pk007  Have a Wonderful Day, Offer Manager PrizeMama              If you wish to leave this list please use the link below. http://www.qves.com/trim/?ilug@linux.ie%7C17%7C114258   --  Irish Linux Users' Group: ilug@linux.ie http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information. List maintainer: listmaster@linux.ie  "
## 
## [[3]]
## [1] "1) Fight The Risk of Cancer! http://www.adclick.ws/p.cfm?o=315&s=pk007  2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days http://www.adclick.ws/p.cfm?o=249&s=pk007  3) Get the Child Support You Deserve - Free Legal Advice http://www.adclick.ws/p.cfm?o=245&s=pk002  4) Join the Web's Fastest Growing Singles Community http://www.adclick.ws/p.cfm?o=259&s=pk007  5) Start Your Private Photo Album Online! http://www.adclick.ws/p.cfm?o=283&s=pk007  Have a Wonderful Day, Offer Manager PrizeMama              If you wish to leave this list please use the link below. http://www.qves.com/trim/?zzzz@example.com%7C17%7C308417  "
## 
## [[4]]
## [1] "################################################## #                                                # #                 Adult Club                     # #           Offers FREE Membership               # #                                                # ##################################################  >>>>>  INSTANT ACCESS TO ALL SITES NOW >>>>>  Your User Name And Password is. >>>>>  User Name: zzzz@example.com >>>>>  Password: 760382  5 of the Best Adult Sites on the Internet for FREE! --------------------------------------- NEWS 08/18/02 With just over 2.9 Million Members that signed up for FREE, Last month there were 721,184 New Members. Are you one of them yet??? --------------------------------------- Our Membership FAQ  Q. Why are you offering free access to 5 adult membership sites for free? A. I have advertisers that pay me for ad space so you don't have to pay for membership.  Q. Is it true my membership is for life? A. Absolutely you'll never have to pay a cent the advertisers do.  Q. Can I give my account to my friends and family? A. Yes, as long they are over the age of 18.  Q. Do I have to sign up for all 5 membership sites? A. No just one to get access to all of them.  Q. How do I get started? A. Click on one of the following links below to become a member.  - These are multi million dollar operations with policies and rules. - Fill in the required info and they won't charge you for the Free pass! - If you don't believe us, just read their terms and conditions.  ---------------------------  # 5. > Adults Farm http://80.71.66.8/farm/?aid=760382 Girls and Animals Getting Freaky....FREE Lifetime Membership!!  # 4. > Sexy Celebes http://80.71.66.8/celebst/?aid=760382 Thousands Of XXX Celebes doing it...FREE Lifetime Membership!!  # 3. > Play House Porn http://80.71.66.8/mega/?aid=760382 Live Feeds From 60 Sites And Web Cams...FREE Lifetime Membership!!  # 2. 
> Asian Sex Fantasies http://80.71.66.8/asian/?aid=760382 Japanese Schoolgirls, Live Sex Shows ...FREE Lifetime Membership!!  # 1. > Lesbian Lace http://80.71.66.8/lesbian/?aid=760382 Girls and Girls Getting Freaky! ...FREE Lifetime Membership!!  --------------------------  Jennifer Simpson, Miami, FL Your FREE lifetime membership has entertained my boyffriend and I for the last two years!  Your Adult Sites are the best on the net!  Joe Morgan Manhattan, NY Your live sex shows and live sex cams are unbelievable. The best part about your porn sites, is that they're absolutely FREE!  --------------------------          Removal Instructions:  You have received this advertisement because you have opted in to receive free adult internet offers and specials through our affiliated websites. If you do not wish to receive further emails or have received the email in error you may opt-out of our database here http://80.71.66.8/optout/ . Please allow 24 hours for removal.  vonolmosatkirekpups  "
## 
## [[5]]
## [1] "I thought you might like these: 1) Slim Down - Guaranteed to lose 10-12 lbs in 30 days http://www.freeyankee.com/cgi/fy2/to.cgi?l=822slim1  2) Fight The Risk of Cancer!  http://www.freeyankee.com/cgi/fy2/to.cgi?l=822nic1   3) Get the Child Support You Deserve - Free Legal Advice  http://www.freeyankee.com/cgi/fy2/to.cgi?l=822ppl1  Offer Manager Daily-Deals         If you wish to leave this list please use the link below. http://www.qves.com/trim/?social@linux.ie%7C29%7C134077   --  Irish Linux Users' Group Social Events: social@linux.ie http://www.linux.ie/mailman/listinfo/social for (un)subscription information. List maintainer: listmaster@linux.ie  "
## 
## [[6]]
## [1] "A POWERHOUSE GIFTING PROGRAM You Don't Want To Miss!      GET IN WITH THE FOUNDERS!  The MAJOR PLAYERS are on This ONE For ONCE be where the PlayerS are This is YOUR Private Invitation  EXPERTS ARE CALLING THIS THE FASTEST WAY  TO HUGE CASH FLOW EVER CONCEIVED Leverage $1,000 into $50,000 Over and Over Again  THE QUESTION HERE IS: YOU EITHER WANT TO BE WEALTHY  OR YOU DON'T!!! WHICH ONE ARE YOU? I am tossing you a financial lifeline and for your sake I  Hope you GRAB onto it and hold on tight For the Ride of youR life!  Testimonials  Hear what average people are doing their first few days: “We've received 8,000 in 1 day and we are doing that over and over again!' Q.S. in AL  “I'm a single mother in FL and I've received 12,000 in the last 4 days.” D. S. in FL “I was not sure about this when I sent off my $1,000 pledge, but I got back $2,000 the very next day!” L.L. in KY “I didn't have the money, so I found myself a partner to work this with. We have received $4,000 over the last 2 days.  I think I made the right decision; don't you?” K. C. in FL “I pick up $3,000 my first day and I  they gave me free leads and all the training, you can too!” J.W. in CA  ANNOUNCING: We will CLOSE your sales for YOU! And Help you get a Fax Blast IMMEDIATELY Upon Your Entry!!!    YOU Make the MONEY!!! FREE LEADS!!! TRAINING!!!  $$DON'T WAIT!!! CALL NOW $$ FAX BACK TO: 1-800-421-6318 OR Call 1-800-896-6568   Name__________________________________Phone___________________________________________  Fax_____________________________________Email____________________________________________  Best Time To Call_________________________Time Zone________________________________________  This message is sent in compliance of the new e-mail bill. \"Per Section 301, Paragraph (a)(2)(C) of S. 1618, further transmissions by the sender of this email may be stopped, at no cost to you, by sending a reply to this email address with the word \"REMOVE\" in the subject line. 
Errors, omissions, and exceptions excluded.   This is NOT spam! I have compiled this list from our Replicate Database, relative to Seattle Marketing Group, The Gigt, or Turbo Team for the sole purpose of these communications. Your continued inclusion is ONLY by your gracious permission. If you wish to not receive this mail from me, please send an email to tesrewinter@yahoo.com with \"Remove\" in the subject and you will be deleted immediately.    "
# Flag every spam body that does NOT contain an "<html>" tag; the body is
# lower-cased first so the match is case-insensitive.
# vapply() pins the result to exactly one logical per body, whereas sapply()
# may silently change its return type on empty or ragged input.
cond <- vapply(
  spamBody,
  function(x) !str_detect(tolower(x), "<html>"),
  logical(1)
)

# Keep only the bodies flagged above (the non-HTML messages).
spamTextList <- spamBody[cond]

# Inspect the container type WITHOUT assigning the result back. The earlier
# `spamTextList <- typeof(spamTextList)` replaced the filtered bodies with the
# single string "list", which is why length() reported 1 and head() showed
# "list".
typeof(spamTextList)
## [1] "list"

# Number of retained bodies and a peek at the first one. The previously
# recorded outputs described the overwritten variable, so they were dropped;
# re-run the chunk to regenerate them.
length(spamTextList)
head(spamTextList, 1)
# Build one string per ham message: the cleaned sender tokens, a single
# space, then the cleaned subject tokens.
hamListAS <- str_c(
  hamListAddressC,
  " ",
  hamListSubjectC
)

# Preview the first six combined strings (six is head()'s default).
head(hamListAS, n = 6L)
## [1] "kre munnari oz re new sequences window"                    
## [2] "burt cursor system zzzzteana re alexander"                 
## [3] "timc ubh zzzzteana moscow bomber"                          
## [4] "monty roscom irr klez the virus that won t die"            
## [5] "tony linuxworks com re insert signature"                   
## [6] "smith ee ed ac re zzzzteana nothing like mama used to make"