This exercise predicts the class (spam or ham) of new documents and compares several classification models, using separate training and test datasets.
# Local working directory: all downloads and extracted folders below are
# addressed relative to it.
# Please change this path to match your own system.
project_dir <- "C:/text_classifier"
setwd(dir = project_dir)
library("stringr")
library("tm")
library("SnowballC")
library("RTextTools")
library("R.utils")
library("utils")
library("wordcloud")
library("knitr")
library("kableExtra")
library("Hmisc")#Spam file download
# Download one corpus archive, strip its .bz2 layer (keeping the original
# archive and overwriting any earlier decompressed copy) and unpack the
# resulting tarball into the working directory.
fetch_corpus <- function(url, archive) {
  download.file(url, destfile = archive)
  bunzip2(archive, remove = FALSE, overwrite = TRUE)
  # The decompressed tarball has the archive's name minus the ".bz2" suffix
  invisible(untar(sub("\\.bz2$", "", archive)))
}

# Spam file download
spam_file <- "http://spamassassin.apache.org/old/publiccorpus/20050311_spam_2.tar.bz2"
fetch_corpus(spam_file, "spam_zip.tar.bz2")

# Ham file download
ham_file <- "http://spamassassin.apache.org/old/publiccorpus/20030228_easy_ham_2.tar.bz2"
fetch_corpus(ham_file, "ham_zip.tar.bz2")
# Each extracted folder was observed to contain a "cmds" file alongside the
# messages (presumably not an e-mail); delete it before building the corpus.
remove_cmds_ham <- list.files(
  "easy_ham_2/",
  pattern = "cmds",
  full.names = TRUE,
  recursive = FALSE
)
file.remove(remove_cmds_ham)
## [1] TRUE
remove_cmds_spam <- list.files(
  "spam_2/",
  pattern = "cmds",
  full.names = TRUE,
  recursive = FALSE
)
file.remove(remove_cmds_spam)
## [1] TRUE
# Fix the RNG seed so that the 500-document samples and the shuffled order
# (on which the later position-based train/test split depends) are the same
# on every run.
set.seed(1234)
# VCorpus - read every file in each extracted folder as one document
spamfiles <- VCorpus(DirSource(directory = "spam_2/", encoding = "UTF-8"))
hamfiles <- VCorpus(DirSource(directory = "easy_ham_2/", encoding = "UTF-8"))
# Take a random sample of 500 documents from each class
spamfiles <- sample(spamfiles, 500)
hamfiles <- sample(hamfiles, 500)
# Add meta labels recording the class of every document
meta(spamfiles, tag = "type") <- "spam"
meta(hamfiles, tag = "type") <- "ham"
# Merge the two corpora into one
merge_files <- c(spamfiles, hamfiles, recursive = TRUE)
# Shuffle the merged corpus so spam and ham documents are interleaved
spamham <- sample(merge_files)
# Cleanup the data
# Lower-case first, then drop stop words while apostrophes are still present:
# stopwords("en") contains contractions such as "don't" and "isn't", which can
# no longer match once punctuation has been stripped (they would survive as
# "dont", "isnt", ...).
spamham <- tm_map(spamham, content_transformer(tolower))
spamham <- tm_map(spamham, removeWords, words = stopwords("en"))
spamham <- tm_map(spamham, removePunctuation)
spamham <- tm_map(spamham, removeNumbers)
spamham <- tm_map(spamham, stemDocument)
# Collapse the runs of blanks left behind by the removals above
spamham <- tm_map(spamham, stripWhitespace)
# Creating the Document Term Matrix
# Build the term counts and drop every term whose sparsity exceeds 95%,
# then print a summary of the resulting matrix.
dtm <- removeSparseTerms(DocumentTermMatrix(spamham), 0.95)
inspect(dtm)
## <<DocumentTermMatrix (documents: 1000, terms: 586)>>
## Non-/sparse entries: 89677/496323
## Sparsity : 85%
## Maximal term length: 54
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs aug esmtp font jul localhost mon
## 00044.9f8c4b9ae007c6ded3d57476082bf2b2 0 2 14 0 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 0 2 9 0 0 0
## 00077.6e13224e39fae4b94bcbe0f5ae9f4939 0 2 9 4 0 0
## 00117.9f0ba9c35b1fe59307e32b7c2c0d4e61 4 2 4 0 0 0
## 00122.4a2f67839c81141a1075745a66c907bb 4 2 9 0 0 0
## 01083.a6b3c50be5abf782b585995d2c11176b 0 4 0 10 1 0
## 01095.520dcad6e0ebb4d30222292f51ee76ab 0 1 8 5 3 0
## 01308.c46f1215ccd8cedf162e0cf308b94dcd 5 1 0 0 3 0
## 01317.7fc86413a091430c3104b041a6525131 0 6 1 10 3 0
## 01380.e3fad5af747d3a110008f94a046bf31b 11 1 0 0 5 2
## Terms
## Docs receiv size tue widthd
## 00044.9f8c4b9ae007c6ded3d57476082bf2b2 4 14 0 0
## 00051.8b17ce16ace4d5845e2299c0123e1f14 8 24 0 0
## 00077.6e13224e39fae4b94bcbe0f5ae9f4939 6 24 0 0
## 00117.9f0ba9c35b1fe59307e32b7c2c0d4e61 9 6 0 0
## 00122.4a2f67839c81141a1075745a66c907bb 3 24 0 0
## 01083.a6b3c50be5abf782b585995d2c11176b 9 0 0 0
## 01095.520dcad6e0ebb4d30222292f51ee76ab 11 4 0 0
## 01308.c46f1215ccd8cedf162e0cf308b94dcd 19 1 5 0
## 01317.7fc86413a091430c3104b041a6525131 12 3 0 0
## 01380.e3fad5af747d3a110008f94a046bf31b 9 2 4 0
# Show the terms that occur at least 5 times as a styled, scrollable table.
# The header row (row 0) is bold white text on a coloured background.
kable(findFreqTerms(dtm, 5)) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive")) %>%
  row_spec(0, bold = TRUE, color = "white", background = "#ea7872") %>%
  scroll_box(width = "100%", height = "300px")
| x |
|---|
| abl |
| accept |
| access |
| account |
| act |
| actual |
| add |
| address |
| advertis |
| age |
| aligncent |
| aligndcent |
| allow |
| alreadi |
| also |
| alway |
| anoth |
| anyon |
| anyth |
| appli |
| arial |
| around |
| ask |
| aug |
| august |
| automat |
| avail |
| away |
| back |
| bad |
| base |
| begin |
| believ |
| best |
| better |
| bfont |
| bgcolordffffff |
| bgcolorffffff |
| big |
| bill |
| bit |
| bodi |
| border |
| borderd |
| box |
| build |
| bulk |
| busi |
| buy |
| bythinkgeek |
| call |
| can |
| cant |
| card |
| care |
| case |
| cash |
| caus |
| cdt |
| cellpad |
| cellpaddingd |
| cellspac |
| cellspacingd |
| center |
| certain |
| chang |
| charsetiso |
| charsetusascii |
| charsetwindow |
| check |
| cipheredhdssdescbcsha |
| claim |
| clean |
| click |
| client |
| code |
| color |
| colord |
| come |
| communic |
| compani |
| complet |
| comput |
| consid |
| contact |
| contain |
| contentdisposit |
| contentdtexthtml |
| contenttransferencod |
| contenttyp |
| copi |
| cost |
| countri |
| cours |
| cpunkshqpronsnet |
| cpunkslocalhost |
| creat |
| credit |
| current |
| custom |
| cypherpunksdspronsnet |
| cypherpunksforwarddspronsnet |
| data |
| date |
| day |
| deal |
| deathtospamdeathtospamdeathtospam |
| debian |
| deliveredto |
| deliveryd |
| design |
| detail |
| develop |
| didnt |
| differ |
| direct |
| discuss |
| disk |
| distribut |
| div |
| doesnt |
| dogmaslashnullorg |
| dollar |
| done |
| dont |
| drive |
| easi |
| edt |
| effect |
| either |
| els |
| end |
| engin |
| enter |
| envelopefrom |
| error |
| errorsto |
| esmtp |
| etc |
| even |
| ever |
| everi |
| everyth |
| exampl |
| exchang |
| exim |
| exist |
| expect |
| experi |
| express |
| faceari |
| facedari |
| fact |
| famili |
| fast |
| fax |
| featur |
| feel |
| fetchmail |
| file |
| fill |
| financi |
| find |
| first |
| fix |
| follow |
| font |
| fontfont |
| forg |
| forkadminxentcom |
| forkspamassassintaintorg |
| forkxentcom |
| form |
| format |
| formatflow |
| found |
| free |
| fri |
| friend |
| full |
| futur |
| geek |
| general |
| get |
| give |
| good |
| got |
| great |
| group |
| grow |
| guarante |
| hand |
| happen |
| hard |
| head |
| heaven |
| height |
| heightd |
| helo |
| helouswsflistsourceforgenet |
| help |
| helvetica |
| herea |
| high |
| home |
| hope |
| host |
| hour |
| howev |
| hqpronsnet |
| html |
| httpequivdcontenttyp |
| httpslistssourceforgenetlistslistinfospamassassinsight |
| httpthinkgeekcomsf |
| httpwwwlinuxiemailmanlistinfoilug |
| httpxentcommailmanlistinfofork |
| httpxentcompipermailfork |
| hundr |
| idea |
| ill |
| ilug |
| ilugadminlinuxi |
| iluglinuxi |
| imap |
| img |
| immedi |
| import |
| inc |
| includ |
| increas |
| individu |
| info |
| inform |
| inlin |
| input |
| inreplyto |
| instal |
| instead |
| instruct |
| interest |
| intern |
| internet |
| invest |
| invok |
| irish |
| isnt |
| issu |
| ist |
| ive |
| jmilugjmasonorg |
| jmjmasonorg |
| jmlocalhost |
| jmnetnoteinccom |
| job |
| john |
| jul |
| jun |
| just |
| keep |
| khare |
| kind |
| know |
| lairxentcom |
| larg |
| last |
| lead |
| learn |
| least |
| legal |
| less |
| let |
| level |
| life |
| like |
| limit |
| line |
| link |
| linux |
| list |
| listarch |
| listhelp |
| listid |
| listmasterlinuxi |
| listpost |
| listsubscrib |
| listunsubscrib |
| littl |
| live |
| local |
| localhost |
| localhostlocaldomain |
| long |
| look |
| lot |
| low |
| lugh |
| lughtuathaorg |
| made |
| mailnetnoteinccom |
| mailtoforkrequestxentcomsubjecthelp |
| mailtoforkrequestxentcomsubjectsubscrib |
| mailtoforkrequestxentcomsubjectunsubscrib |
| mailtoforkspamassassintaintorg |
| mailwebnotenet |
| maintain |
| major |
| make |
| man |
| manag |
| mandarklabsnetnoteinccom |
| mani |
| market |
| may |
| mean |
| mention |
| messag |
| messageid |
| meta |
| microsoft |
| might |
| million |
| mime |
| mimeol |
| mimevers |
| minut |
| mon |
| money |
| month |
| much |
| multipart |
| multipartaltern |
| must |
| mutti |
| name |
| need |
| network |
| never |
| new |
| news |
| next |
| nice |
| normal |
| note |
| noth |
| notic |
| now |
| number |
| offer |
| old |
| one |
| onlin |
| open |
| opportun |
| option |
| order |
| organ |
| origin |
| other |
| outlook |
| packag |
| page |
| part |
| past |
| pay |
| pdt |
| peopl |
| per |
| person |
| pfont |
| phobo |
| phoboslabsnetnoteinccom |
| phone |
| place |
| plan |
| pleas |
| plus |
| point |
| polici |
| pop |
| posit |
| possibl |
| post |
| postfix |
| power |
| preced |
| price |
| probabl |
| problem |
| process |
| produc |
| product |
| profession |
| program |
| promot |
| protect |
| provid |
| public |
| purchas |
| put |
| qmail |
| qualiti |
| question |
| quick |
| quot |
| quotedprint |
| rate |
| rather |
| reach |
| read |
| real |
| realli |
| reason |
| receiv |
| recent |
| refer |
| regard |
| relat |
| releas |
| remov |
| repli |
| replyto |
| report |
| request |
| requir |
| respons |
| result |
| return |
| returnpath |
| right |
| risk |
| rohit |
| rootlocalhost |
| rootlughtuathaorg |
| rpm |
| run |
| said |
| sale |
| sansserif |
| sat |
| save |
| say |
| search |
| secur |
| see |
| seem |
| seen |
| select |
| sell |
| send |
| sender |
| sent |
| server |
| servic |
| set |
| sever |
| sfnet |
| share |
| show |
| sign |
| simpl |
| simpli |
| sinc |
| singledrop |
| site |
| size |
| small |
| smtp |
| smtpsvc |
| softwar |
| someon |
| someth |
| sort |
| sourc |
| spam |
| spamassassinsight |
| spamassassinsightingslistssourceforgenet |
| special |
| sponsor |
| start |
| state |
| still |
| stop |
| stuff |
| subject |
| submit |
| success |
| suggest |
| sun |
| support |
| sure |
| system |
| tabl |
| take |
| talk |
| technolog |
| tell |
| test |
| text |
| texthtml |
| textplain |
| thank |
| that |
| there |
| thing |
| think |
| though |
| thought |
| thousand |
| three |
| thu |
| time |
| today |
| top |
| total |
| tri |
| true |
| tue |
| turn |
| two |
| type |
| understand |
| unknown |
| unsubscrib |
| unsubscript |
| updat |
| use |
| user |
| userag |
| userid |
| uswsffwsourceforgenet |
| uswsflistbsourceforgenet |
| uswsflistsourceforgenet |
| utc |
| valu |
| vamm |
| verifyno |
| version |
| versiontlsvsslv |
| via |
| visit |
| want |
| way |
| web |
| webnotenet |
| websit |
| wed |
| week |
| welcom |
| well |
| width |
| widthd |
| will |
| window |
| wish |
| within |
| without |
| wont |
| word |
| work |
| world |
| write |
| wrote |
| xauthenticationwarn |
| xbeenther |
| xentcom |
| xkeyword |
| xloop |
| xmailer |
| xmailmanvers |
| xmailscann |
| xmimeautoconvert |
| xmimeol |
| xmsmailprior |
| xoriginalarrivaltim |
| xoriginald |
| xprioriti |
| xurl |
| year |
| yes |
| yet |
| youll |
| your |
| yyyylocalhostnetnoteinccom |
| yyyynetnoteinccom |
# Total number of occurrences of every retained term across all documents
freq <- colSums(as.matrix(dtm))
# Six-colour "Dark2" palette for the cloud
dark2 <- brewer.pal(n = 6, name = "Dark2")
# Plot at most 200 terms, a fifth of them rotated
wordcloud(
  words = names(freq),
  freq = freq,
  max.words = 200,
  rot.per = 0.2,
  colors = dark2
)
# container
# Class label of every document, in the (shuffled) corpus order
spamtype <- unlist(meta(spamham, "type")[, 1])
# 80/20 train/test split by position. Use a whole-number split point:
# 0.8 * n is fractional whenever n is not a multiple of 5, which would give
# non-integer test indices and leave the last document out of the test set.
n_docs <- length(spamtype)
n_train <- floor(0.8 * n_docs)
container <- create_container(
  dtm,
  labels = spamtype,
  trainSize = seq_len(n_train),
  testSize = seq(n_train + 1, n_docs),
  virgin = FALSE
)
# Train
# Fit one classifier per algorithm on the container's training documents
train_svm      <- train_model(container, algorithm = "SVM")
train_rf       <- train_model(container, algorithm = "RF")
train_maxent   <- train_model(container, algorithm = "MAXENT")
train_tree     <- train_model(container, algorithm = "TREE")
train_boosting <- train_model(container, algorithm = "BOOSTING")
train_bagging  <- train_model(container, algorithm = "BAGGING")
# Test: classify the container's test documents with each trained model
test_svm <- classify_model(container, train_svm)
test_rf <- classify_model(container, train_rf)
test_maxent <- classify_model(container, train_maxent)
test_tree <- classify_model(container, train_tree)
test_boost <- classify_model(container, train_boosting)
test_bagging <- classify_model(container, train_bagging)
# Summarise the SVM predictions (predicted labels and their probabilities)
Hmisc::describe(test_svm)
## test_svm
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## SVM_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 99 101
## Proportion 0.495 0.505
## ---------------------------------------------------------------------------
## SVM_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 197 1 0.9589 0.06134 0.7370 0.9105
## .25 .50 .75 .90 .95
## 0.9542 0.9914 0.9979 0.9993 0.9996
##
## lowest : 0.5765280 0.5954603 0.5961509 0.6299405 0.6666408
## highest: 0.9999107 0.9999121 0.9999463 0.9999705 0.9999747
## ---------------------------------------------------------------------------
# Summarise the random-forest predictions (predicted labels and their probabilities)
Hmisc::describe(test_rf)
## test_rf
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## FORESTS_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 103 97
## Proportion 0.515 0.485
## ---------------------------------------------------------------------------
## FORESTS_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 56 0.996 0.9175 0.1066 0.6543 0.7395
## .25 .50 .75 .90 .95
## 0.9150 0.9650 0.9900 1.0000 1.0000
##
## lowest : 0.515 0.520 0.525 0.540 0.555, highest: 0.980 0.985 0.990 0.995 1.000
## ---------------------------------------------------------------------------
# Summarise the maximum-entropy predictions (predicted labels and their probabilities)
Hmisc::describe(test_maxent)
## test_maxent
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## MAXENTROPY_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 105 95
## Proportion 0.525 0.475
## ---------------------------------------------------------------------------
## MAXENTROPY_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 85 0.829 0.9983 0.003446 0.9998 1.0000
## .25 .50 .75 .90 .95
## 1.0000 1.0000 1.0000 1.0000 1.0000
##
## Value 0.8165 0.9340 0.9445 0.9790 0.9865 0.9955 0.9980 0.9995 1.0000
## Frequency 1 1 1 1 1 1 1 3 190
## Proportion 0.005 0.005 0.005 0.005 0.005 0.005 0.005 0.015 0.950
## ---------------------------------------------------------------------------
# Summarise the decision-tree predictions (predicted labels and their probabilities)
Hmisc::describe(test_tree)
## test_tree
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## TREE_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 99 101
## Proportion 0.495 0.505
## ---------------------------------------------------------------------------
## TREE_PROB
## n missing distinct Info Mean Gmd
## 200 0 6 0.871 0.9734 0.03496
##
## Value 0.6000000 0.7500000 0.8571429 0.9820896 0.9908257 1.0000000
## Frequency 2 9 1 76 83 29
## Proportion 0.010 0.045 0.005 0.380 0.415 0.145
## ---------------------------------------------------------------------------
# Summarise the boosting predictions (predicted labels and their probabilities)
Hmisc::describe(test_boost)
## test_boost
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## LOGITBOOST_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 103 97
## Proportion 0.515 0.485
## ---------------------------------------------------------------------------
## LOGITBOOST_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 29 0.993 0.992 0.01559 0.9820 0.9995
## .25 .50 .75 .90 .95
## 1.0000 1.0000 1.0000 1.0000 1.0000
##
## Value 0.500 0.881 0.982 0.998 1.000
## Frequency 2 4 5 9 180
## Proportion 0.010 0.020 0.025 0.045 0.900
## ---------------------------------------------------------------------------
# Summarise the bagging predictions (predicted labels and their probabilities)
Hmisc::describe(test_bagging)
## test_bagging
##
## 2 Variables 200 Observations
## ---------------------------------------------------------------------------
## BAGGING_LABEL
## n missing distinct
## 200 0 2
##
## Value ham spam
## Frequency 99 101
## Proportion 0.495 0.505
## ---------------------------------------------------------------------------
## BAGGING_PROB
## n missing distinct Info Mean Gmd .05 .10
## 200 0 13 0.742 0.9208 0.126 0.56 0.60
## .25 .50 .75 .90 .95
## 0.95 1.00 1.00 1.00 1.00
##
## Value 0.52 0.56 0.60 0.64 0.68 0.72 0.76 0.80 0.84 0.88
## Frequency 7 10 4 1 3 3 2 6 5 5
## Proportion 0.035 0.050 0.020 0.005 0.015 0.015 0.010 0.030 0.025 0.025
##
## Value 0.92 0.96 1.00
## Frequency 4 23 127
## Proportion 0.020 0.115 0.635
## ---------------------------------------------------------------------------