library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.4.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
# Setting up the corpora
# The data directory defaults to the original location but can be overridden
# with the SPAMHAM_DIR environment variable, so the script can run on another
# machine without editing the code. It must contain "spam" and "easy_ham"
# sub-directories.
spamham_dir <- Sys.getenv("SPAMHAM_DIR")
if (!nzchar(spamham_dir)) {
  spamham_dir <- "C:/Users/ahwang/Desktop/Spamham"
}
spam <- Corpus(DirSource(file.path(spamham_dir, "spam")))
no_spam <- Corpus(DirSource(file.path(spamham_dir, "easy_ham")))
# Sanity check: print the raw text of the third spam document
writeLines(text = as.character(spam[[3L]]))
## From sabrina@mx3.1premio.com Thu Aug 22 14:44:07 2002
## Return-Path: <sabrina@mx3.1premio.com>
## Delivered-To: zzzz@localhost.spamassassin.taint.org
## Received: from localhost (localhost [127.0.0.1])
## by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 1E90847C66
## for <zzzz@localhost>; Thu, 22 Aug 2002 09:44:02 -0400 (EDT)
## Received: from mail.webnote.net [193.120.211.219]
## by localhost with POP3 (fetchmail-5.9.0)
## for zzzz@localhost (single-drop); Thu, 22 Aug 2002 14:44:03 +0100 (IST)
## Received: from email.qves.com (email1.qves.net [209.63.151.251] (may be forged))
## by webnote.net (8.9.3/8.9.3) with ESMTP id OAA04953
## for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 14:37:23 +0100
## Received: from qvp0086 ([169.254.6.17]) by email.qves.com with Microsoft SMTPSVC(5.0.2195.2966);
## Thu, 22 Aug 2002 07:36:20 -0600
## From: "Slim Down" <sabrina@mx3.1premio.com>
## To: <zzzz@spamassassin.taint.org>
## Subject: Guaranteed to lose 10-12 lbs in 30 days 11.150
## Date: Thu, 22 Aug 2002 07:36:19 -0600
## Message-ID: <9a63c01c249e0$e5a9d610$1106fea9@freeyankeedom.com>
## MIME-Version: 1.0
## Content-Type: text/plain;
## charset="iso-8859-1"
## Content-Transfer-Encoding: 7bit
## X-Mailer: Microsoft CDO for Windows 2000
## Thread-Index: AcJJ4OWpowGq7rdNSwCz5HE3x9ZZDQ==
## Content-Class: urn:content-classes:message
## X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2462.0000
## X-OriginalArrivalTime: 22 Aug 2002 13:36:20.0969 (UTC) FILETIME=[E692FD90:01C249E0]
##
## 1) Fight The Risk of Cancer!
## http://www.adclick.ws/p.cfm?o=315&s=pk007
##
## 2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
## http://www.adclick.ws/p.cfm?o=249&s=pk007
##
## 3) Get the Child Support You Deserve - Free Legal Advice
## http://www.adclick.ws/p.cfm?o=245&s=pk002
##
## 4) Join the Web's Fastest Growing Singles Community
## http://www.adclick.ws/p.cfm?o=259&s=pk007
##
## 5) Start Your Private Photo Album Online!
## http://www.adclick.ws/p.cfm?o=283&s=pk007
##
## Have a Wonderful Day,
## Offer Manager
## PrizeMama
##
##
##
##
##
##
##
##
##
##
##
##
##
## If you wish to leave this list please use the link below.
## http://www.qves.com/trim/?zzzz@spamassassin.taint.org%7C17%7C308417
# Sanity check: print the raw text of the third non-spam document
writeLines(text = as.character(no_spam[[3L]]))
## From timc@2ubh.com Thu Aug 22 13:52:59 2002
## Return-Path: <timc@2ubh.com>
## Delivered-To: zzzz@localhost.netnoteinc.com
## Received: from localhost (localhost [127.0.0.1])
## by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 0314547C66
## for <zzzz@localhost>; Thu, 22 Aug 2002 08:52:58 -0400 (EDT)
## Received: from phobos [127.0.0.1]
## by localhost with IMAP (fetchmail-5.9.0)
## for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:52:59 +0100 (IST)
## Received: from n16.grp.scd.yahoo.com (n16.grp.scd.yahoo.com
## [66.218.66.71]) by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id
## g7MCrdZ07070 for <zzzz@example.com>; Thu, 22 Aug 2002 13:53:39 +0100
## X-Egroups-Return: sentto-2242572-52733-1030020820-zzzz=example.com@returns.groups.yahoo.com
## Received: from [66.218.67.198] by n16.grp.scd.yahoo.com with NNFMP;
## 22 Aug 2002 12:53:40 -0000
## X-Sender: timc@2ubh.com
## X-Apparently-To: zzzzteana@yahoogroups.com
## Received: (EGP: mail-8_1_0_1); 22 Aug 2002 12:53:39 -0000
## Received: (qmail 76099 invoked from network); 22 Aug 2002 12:53:39 -0000
## Received: from unknown (66.218.66.218) by m5.grp.scd.yahoo.com with QMQP;
## 22 Aug 2002 12:53:39 -0000
## Received: from unknown (HELO rhenium.btinternet.com) (194.73.73.93) by
## mta3.grp.scd.yahoo.com with SMTP; 22 Aug 2002 12:53:39 -0000
## Received: from host217-36-23-185.in-addr.btopenworld.com ([217.36.23.185])
## by rhenium.btinternet.com with esmtp (Exim 3.22 #8) id 17hrT0-0004gj-00
## for forteana@yahoogroups.com; Thu, 22 Aug 2002 13:53:38 +0100
## X-Mailer: Microsoft Outlook Express Macintosh Edition - 4.5 (0410)
## To: zzzzteana <zzzzteana@yahoogroups.com>
## X-Priority: 3
## Message-Id: <E17hrT0-0004gj-00@rhenium.btinternet.com>
## From: "Tim Chapman" <timc@2ubh.com>
## X-Yahoo-Profile: tim2ubh
## MIME-Version: 1.0
## Mailing-List: list zzzzteana@yahoogroups.com; contact
## forteana-owner@yahoogroups.com
## Delivered-To: mailing list zzzzteana@yahoogroups.com
## Precedence: bulk
## List-Unsubscribe: <mailto:zzzzteana-unsubscribe@yahoogroups.com>
## Date: Thu, 22 Aug 2002 13:52:38 +0100
## Subject: [zzzzteana] Moscow bomber
## Reply-To: zzzzteana@yahoogroups.com
## Content-Type: text/plain; charset=US-ASCII
## Content-Transfer-Encoding: 7bit
##
## Man Threatens Explosion In Moscow
##
## Thursday August 22, 2002 1:40 PM
## MOSCOW (AP) - Security officers on Thursday seized an unidentified man who
## said he was armed with explosives and threatened to blow up his truck in
## front of Russia's Federal Security Services headquarters in Moscow, NTV
## television reported.
## The officers seized an automatic rifle the man was carrying, then the man
## got out of the truck and was taken into custody, NTV said. No other details
## were immediately available.
## The man had demanded talks with high government officials, the Interfax and
## ITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to
## talk with Russian President Vladimir Putin.
## Police and security forces rushed to the Security Service building, within
## blocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the
## man, who claimed to have one and a half tons of explosives, the news
## agencies said. Negotiations continued for about one and a half hours outside
## the building, ITAR-Tass and Interfax reported, citing witnesses.
## The man later drove away from the building, under police escort, and drove
## to a street near Moscow's Olympic Penta Hotel, where authorities held
## further negotiations with him, the Moscow police press service said. The
## move appeared to be an attempt by security services to get him to a more
## secure location.
##
## ------------------------ Yahoo! Groups Sponsor ---------------------~-->
## 4 DVDs Free +s&p Join Now
## http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
## ---------------------------------------------------------------------~->
##
## To unsubscribe from this group, send an email to:
## forteana-unsubscribe@egroups.com
##
##
##
## Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/
#### Text cleaning
# `cleaning` replaces every match of `pattern` in a document with a single
# space. It is wrapped in content_transformer() so it can be passed to
# tm_map() together with the pattern to replace.
cleaning <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Characters that are turned into spaces (rather than deleted) so that the
# words on either side of them stay separate.
separator_patterns <- c("-", ":", "'", "`")

# Run the whole preprocessing pipeline on one corpus and return the cleaned
# corpus. Both corpora go through exactly the same steps.
clean_corpus <- function(corpus) {
  # Lower-case the text before stopword removal.
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Remove stopwords while apostrophes are still in the text, so contractions
  # such as "don't" match the stopword list instead of leaving "don" behind.
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # Replace separator characters with spaces.
  for (pattern in separator_patterns) {
    corpus <- tm_map(corpus, cleaning, pattern)
  }
  # Remove remaining punctuation and numbers.
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  # Stemming: chop off word endings to reduce separate counting of related
  # words.
  corpus <- tm_map(corpus, stemDocument)
  # Collapse the runs of whitespace left behind by the steps above.
  tm_map(corpus, stripWhitespace)
}

spam <- clean_corpus(spam)
no_spam <- clean_corpus(no_spam)
# Print the third spam document again to inspect the text at this point
writeLines(text = as.character(spam[[3L]]))
## sabrinamxpremiocom thu aug return path sabrinamxpremiocom deliv zzzzlocalhostspamassassintaintorg receiv localhost localhost phoboslabsspamassassintaintorg postfix esmtp id ec zzzzlocalhost thu aug edt receiv mailwebnotenet localhost pop fetchmail zzzzlocalhost singl drop thu aug ist receiv emailqvescom emailqvesnet may forg webnotenet esmtp id oaa zzzzspamassassintaintorg thu aug receiv qvp emailqvescom microsoft smtpsvc thu aug slim sabrinamxpremiocom zzzzspamassassintaintorg subject guarante lose lbs day date thu aug messag id acceeadfeafreeyankeedomcom mime version content type textplain charsetiso content transfer encod bit x mailer microsoft cdo window thread index acjjowpowgqrdnswczhexzzdq content class urn content class messag x mimeol produc microsoft mimeol v x originalarrivaltim aug utc filetimeefd ce fight risk cancer http wwwadclickwspcfmospk slim guarante lose lbs day http wwwadclickwspcfmospk get child support deserv free legal advic http wwwadclickwspcfmospk join web s fastest grow singl communiti http wwwadclickwspcfmospk start privat photo album onlin http wwwadclickwspcfmospk wonder day offer manag prizemama wish leav list pleas use link http wwwqvescomtrimzzzzspamassassintaintorgcc
# Print the third non-spam document again to inspect the text at this point
writeLines(text = as.character(no_spam[[3L]]))
## timcubhcom thu aug return path timcubhcom deliv zzzzlocalhostnetnoteinccom receiv localhost localhost phoboslabsnetnoteinccom postfix esmtp id c zzzzlocalhost thu aug edt receiv phobo localhost imap fetchmail zzzzlocalhost singl drop thu aug ist receiv ngrpscdyahoocom ngrpscdyahoocom dogmaslashnullorg smtp id gmcrdz zzzzexamplecom thu aug x egroup return sentto zzzzexamplecomreturnsgroupsyahoocom receiv ngrpscdyahoocom nnfmp aug x sender timcubhcom x appar zzzzteanayahoogroupscom receiv egp mail aug receiv qmail invok network aug receiv unknown mgrpscdyahoocom qmqp aug receiv unknown helo rheniumbtinternetcom mtagrpscdyahoocom smtp aug receiv host in addrbtopenworldcom rheniumbtinternetcom esmtp exim id hrt gj forteanayahoogroupscom thu aug x mailer microsoft outlook express macintosh edit zzzzteana zzzzteanayahoogroupscom x prioriti messag id ehrt gj rheniumbtinternetcom tim chapman timcubhcom x yahoo profil timubh mime version mail list list zzzzteanayahoogroupscom contact forteana owneryahoogroupscom deliv mail list zzzzteanayahoogroupscom preced bulk list unsubscrib mailto zzzzteana unsubscribeyahoogroupscom date thu aug subject zzzzteana moscow bomber repli zzzzteanayahoogroupscom content type textplain charsetus ascii content transfer encod bit man threaten explos moscow thursday august pm moscow ap secur offic thursday seiz unidentifi man said arm explos threaten blow truck front russia s feder secur servic headquart moscow ntv televis report offic seiz automat rifl man carri man got truck taken custodi ntv said detail immedi avail man demand talk high govern offici interfax itar tass news agenc said ekho moskvi radio report want talk russian presid vladimir putin polic secur forc rush secur servic build within block kremlin red squar bolshoi ballet surround man claim one half ton explos news agenc said negoti continu one half hour outsid build itar tass interfax report cite wit man later drove away build polic escort drove street near moscow s olymp 
## penta hotel author held negoti moscow polic press servic said move appear attempt secur servic get secur locat yahoo group sponsor dvds free sp join now http usclickyahoocomptybbnxieaamghaagsolbtm unsubscrib group send email forteana unsubscribeegroupscom use yahoo group subject http docsyahoocominfoterm
## Document-term matrices: one per corpus
spam_dtm <- DocumentTermMatrix(x = spam)
no_spam_dtm <- DocumentTermMatrix(x = no_spam)
# Print a summary of the spam matrix
spam_dtm
## <<DocumentTermMatrix (documents: 501, terms: 30091)>>
## Non-/sparse entries: 106856/14968735
## Sparsity : 99%
## Maximal term length: 298
## Weighting : term frequency (tf)
# Print a summary of the non-spam document-term matrix
no_spam_dtm
## <<DocumentTermMatrix (documents: 2551, terms: 31034)>>
## Non-/sparse entries: 385988/78781746
## Sparsity : 100%
## Maximal term length: 261
## Weighting : term frequency (tf)
## Word frequency analysis: spam terms with a total count of at least 200
findFreqTerms(x = spam_dtm, lowfreq = 200)
## [1] "aug"
## [2] "bodi"
## [3] "borderd"
## [4] "can"
## [5] "cellpaddingd"
## [6] "cellspacingd"
## [7] "center"
## [8] "charsetiso"
## [9] "click"
## [10] "colord"
## [11] "compani"
## [12] "content"
## [13] "date"
## [14] "deliv"
## [15] "drop"
## [16] "edt"
## [17] "email"
## [18] "encod"
## [19] "esmtp"
## [20] "facedari"
## [21] "famili"
## [22] "fetchmail"
## [23] "font"
## [24] "free"
## [25] "get"
## [26] "heightd"
## [27] "help"
## [28] "helvetica"
## [29] "hrefdhttp"
## [30] "html"
## [31] "http"
## [32] "invest"
## [33] "ist"
## [34] "life"
## [35] "list"
## [36] "localhost"
## [37] "mail"
## [38] "mailwebnotenet"
## [39] "messag"
## [40] "meta"
## [41] "microsoft"
## [42] "mime"
## [43] "money"
## [44] "new"
## [45] "path"
## [46] "phoboslabsspamassassintaintorg"
## [47] "pleas"
## [48] "postfix"
## [49] "quot"
## [50] "receiv"
## [51] "remov"
## [52] "return"
## [53] "right"
## [54] "san"
## [55] "sat"
## [56] "serif"
## [57] "servic"
## [58] "singl"
## [59] "size"
## [60] "state"
## [61] "subject"
## [62] "tabl"
## [63] "texthtml"
## [64] "thu"
## [65] "transfer"
## [66] "type"
## [67] "version"
## [68] "webnotenet"
## [69] "wed"
## [70] "widthd"
## [71] "will"
## [72] "zzzzlocalhost"
## [73] "zzzzlocalhostspamassassintaintorg"
## [74] "zzzzspamassassintaintorg"
## [75] "bit"
## [76] "day"
## [77] "dogmaslashnullorg"
## [78] "ilug"
## [79] "iluglinuxi"
## [80] "imap"
## [81] "inform"
## [82] "lughtuathaorg"
## [83] "mailer"
## [84] "offer"
## [85] "textplain"
## [86] "use"
## [87] "may"
## [88] "don"
## [89] "internet"
## [90] "just"
## [91] "million"
## [92] "name"
## [93] "now"
## [94] "one"
## [95] "prioriti"
## [96] "site"
## [97] "smtp"
## [98] "year"
## [99] "address"
## [100] "make"
## [101] "market"
## [102] "peopl"
## [103] "program"
## [104] "repli"
## [105] "send"
## [106] "time"
## [107] "want"
## [108] "work"
## [109] "form"
## [110] "home"
## [111] "normal"
## [112] "arial"
## [113] "border"
## [114] "cellpad"
## [115] "cellspac"
## [116] "color"
## [117] "div"
## [118] "faceari"
## [119] "head"
## [120] "height"
## [121] "hrefhttp"
## [122] "img"
## [123] "input"
## [124] "need"
## [125] "option"
## [126] "srchttp"
## [127] "width"
## [128] "busi"
## [129] "fri"
## [130] "price"
## [131] "faceverdana"
## [132] "order"
## [133] "report"
## [134] "govern"
## [135] "aligndcent"
## [136] "srcdhttp"
## [137] "top"
## [138] "tue"
## [139] "bottom"
## [140] "sun"
## [141] "mon"
## [142] "margin"
## [143] "sep"
## [144] "jalapeno"
## [145] "zzzzasonorg"
## [146] "grant"
## [147] "blockquotefont"
## [148] "faceverdanafont"
# Same frequency cut-off (at least 200) for the non-spam matrix
findFreqTerms(x = no_spam_dtm, lowfreq = 200)
## [1] "actual"
## [2] "adminexamplecom"
## [3] "archiv"
## [4] "ascii"
## [5] "aug"
## [6] "beenther"
## [7] "bulk"
## [8] "can"
## [9] "charsetus"
## [10] "chris"
## [11] "code"
## [12] "come"
## [13] "content"
## [14] "creat"
## [15] "cvs"
## [16] "cwg"
## [17] "date"
## [18] "day"
## [19] "deliv"
## [20] "develop"
## [21] "discuss"
## [22] "dogmaslashnullorg"
## [23] "drop"
## [24] "edt"
## [25] "error"
## [26] "esmtp"
## [27] "everi"
## [28] "exmh"
## [29] "fetchmail"
## [30] "get"
## [31] "happen"
## [32] "help"
## [33] "hit"
## [34] "https"
## [35] "imap"
## [36] "int"
## [37] "issu"
## [38] "ist"
## [39] "like"
## [40] "line"
## [41] "list"
## [42] "listmanexamplecom"
## [43] "listmanexamplecommailmanlistinfoexmh"
## [44] "listmanredhatcom"
## [45] "local"
## [46] "localhost"
## [47] "localhostlocaldomain"
## [48] "log"
## [49] "loop"
## [50] "mail"
## [51] "mailman"
## [52] "mailto"
## [53] "mark"
## [54] "messag"
## [55] "mime"
## [56] "mxcorpexamplecom"
## [57] "mxexamplecom"
## [58] "new"
## [59] "one"
## [60] "part"
## [61] "path"
## [62] "phobo"
## [63] "phoboslabsnetnoteinccom"
## [64] "post"
## [65] "postfix"
## [66] "preced"
## [67] "receiv"
## [68] "refer"
## [69] "repli"
## [70] "return"
## [71] "run"
## [72] "sender"
## [73] "sequenc"
## [74] "sinc"
## [75] "singl"
## [76] "smtp"
## [77] "still"
## [78] "subject"
## [79] "subscrib"
## [80] "sun"
## [81] "textplain"
## [82] "think"
## [83] "thu"
## [84] "time"
## [85] "today"
## [86] "type"
## [87] "unsubscrib"
## [88] "use"
## [89] "version"
## [90] "wed"
## [91] "window"
## [92] "without"
## [93] "work"
## [94] "worker"
## [95] "workersexamplecom"
## [96] "workersredhatcom"
## [97] "zzzzlocalhost"
## [98] "zzzzlocalhostnetnoteinccom"
## [99] "bit"
## [100] "email"
## [101] "encod"
## [102] "far"
## [103] "free"
## [104] "group"
## [105] "helo"
## [106] "high"
## [107] "http"
## [108] "internet"
## [109] "invok"
## [110] "mailer"
## [111] "network"
## [112] "ngrpscdyahoocom"
## [113] "now"
## [114] "qmail"
## [115] "send"
## [116] "servic"
## [117] "sponsor"
## [118] "technolog"
## [119] "transfer"
## [120] "unknown"
## [121] "well"
## [122] "yahoo"
## [123] "zzzzexamplecom"
## [124] "zzzzteanayahoogroupscom"
## [125] "build"
## [126] "exim"
## [127] "forc"
## [128] "got"
## [129] "govern"
## [130] "microsoft"
## [131] "move"
## [132] "news"
## [133] "outlook"
## [134] "presid"
## [135] "prioriti"
## [136] "red"
## [137] "report"
## [138] "said"
## [139] "secur"
## [140] "talk"
## [141] "want"
## [142] "within"
## [143] "data"
## [144] "home"
## [145] "last"
## [146] "make"
## [147] "may"
## [148] "say"
## [149] "septemb"
## [150] "softwar"
## [151] "warn"
## [152] "web"
## [153] "won"
## [154] "world"
## [155] "add"
## [156] "also"
## [157] "even"
## [158] "file"
## [159] "folder"
## [160] "give"
## [161] "header"
## [162] "key"
## [163] "linux"
## [164] "long"
## [165] "might"
## [166] "must"
## [167] "old"
## [168] "organ"
## [169] "possibl"
## [170] "put"
## [171] "quot"
## [172] "see"
## [173] "sent"
## [174] "server"
## [175] "set"
## [176] "signatur"
## [177] "user"
## [178] "usersexamplecom"
## [179] "usersredhatcom"
## [180] "way"
## [181] "wrote"
## [182] "accept"
## [183] "agent"
## [184] "ask"
## [185] "authent"
## [186] "just"
## [187] "languag"
## [188] "lot"
## [189] "mozilla"
## [190] "never"
## [191] "person"
## [192] "manag"
## [193] "normal"
## [194] "realli"
## [195] "applic"
## [196] "better"
## [197] "charsetiso"
## [198] "end"
## [199] "first"
## [200] "prefer"
## [201] "will"
## [202] "year"
## [203] "found"
## [204] "look"
## [205] "peopl"
## [206] "probabl"
## [207] "right"
## [208] "tri"
## [209] "anyon"
## [210] "big"
## [211] "call"
## [212] "chang"
## [213] "current"
## [214] "design"
## [215] "good"
## [216] "industri"
## [217] "interest"
## [218] "live"
## [219] "made"
## [220] "mani"
## [221] "mean"
## [222] "name"
## [223] "need"
## [224] "open"
## [225] "origin"
## [226] "point"
## [227] "power"
## [228] "read"
## [229] "reason"
## [230] "releas"
## [231] "result"
## [232] "second"
## [233] "seem"
## [234] "someth"
## [235] "start"
## [236] "stuff"
## [237] "thing"
## [238] "two"
## [239] "unit"
## [240] "word"
## [241] "write"
## [242] "begin"
## [243] "check"
## [244] "clean"
## [245] "comment"
## [246] "deliveri"
## [247] "don"
## [248] "imag"
## [249] "instal"
## [250] "pgp"
## [251] "process"
## [252] "real"
## [253] "test"
## [254] "unseen"
## [255] "url"
## [256] "war"
## [257] "adminexamplesourceforgenet"
## [258] "adminlistssourceforgenet"
## [259] "base"
## [260] "bsourceforgenet"
## [261] "debian"
## [262] "engin"
## [263] "examplesourceforgenetlistslistinfospamassassin"
## [264] "exist"
## [265] "find"
## [266] "fwsourceforgenet"
## [267] "helousw"
## [268] "listsourceforgenet"
## [269] "listssourceforgenetlistslistinfospamassassin"
## [270] "mimeol"
## [271] "msmail"
## [272] "phone"
## [273] "produc"
## [274] "provid"
## [275] "requestexamplesourceforgenetsubjecthelp"
## [276] "requestlistssourceforgenetsubjectsubscrib"
## [277] "requestlistssourceforgenetsubjectunsubscrib"
## [278] "script"
## [279] "sfnet"
## [280] "spam"
## [281] "spamassassin"
## [282] "talkexamplesourceforgenet"
## [283] "talklistssourceforgenet"
## [284] "usw"
## [285] "devel"
## [286] "develexamplesourceforgenet"
## [287] "mailscann"
## [288] "rule"
## [289] "thank"
## [290] "case"
## [291] "differ"
## [292] "doesn"
## [293] "done"
## [294] "experi"
## [295] "fri"
## [296] "generat"
## [297] "includ"
## [298] "know"
## [299] "let"
## [300] "mayb"
## [301] "play"
## [302] "problem"
## [303] "yet"
## [304] "anyth"
## [305] "formatflow"
## [306] "allow"
## [307] "great"
## [308] "idea"
## [309] "requir"
## [310] "though"
## [311] "updat"
## [312] "week"
## [313] "adminlinuxi"
## [314] "didn"
## [315] "disposit"
## [316] "friend"
## [317] "hat"
## [318] "ilug"
## [319] "iluglinuxi"
## [320] "inform"
## [321] "inlin"
## [322] "irish"
## [323] "kernel"
## [324] "link"
## [325] "lughtuathaorg"
## [326] "mon"
## [327] "tell"
## [328] "userid"
## [329] "bug"
## [330] "less"
## [331] "much"
## [332] "take"
## [333] "thought"
## [334] "address"
## [335] "adminxentcom"
## [336] "american"
## [337] "anoth"
## [338] "best"
## [339] "communic"
## [340] "compani"
## [341] "countri"
## [342] "fork"
## [343] "forkexamplecom"
## [344] "forkxentcom"
## [345] "khare"
## [346] "lairxentcom"
## [347] "law"
## [348] "least"
## [349] "level"
## [350] "market"
## [351] "nation"
## [352] "pdt"
## [353] "polit"
## [354] "public"
## [355] "requestxentcomsubjecthelp"
## [356] "requestxentcomsubjectsubscrib"
## [357] "requestxentcomsubjectunsubscrib"
## [358] "rohit"
## [359] "state"
## [360] "support"
## [361] "xentcom"
## [362] "xentcommailmanlistinfofork"
## [363] "xentcompipermailfork"
## [364] "question"
## [365] "comput"
## [366] "number"
## [367] "show"
## [368] "oper"
## [369] "someon"
## [370] "sure"
## [371] "system"
## [372] "import"
## [373] "justin"
## [374] "els"
## [375] "remov"
## [376] "trade"
## [377] "pinelnx"
## [378] "product"
## [379] "keep"
## [380] "packag"
## [381] "java"
## [382] "place"
## [383] "pleas"
## [384] "program"
## [385] "sourc"
## [386] "status"
## [387] "next"
## [388] "rpm"
## [389] "perl"
## [390] "around"
## [391] "crankslacknet"
## [392] "back"
## [393] "suppli"
## [394] "cours"
## [395] "bush"
## [396] "html"
## [397] "stori"
## [398] "page"
## [399] "site"
## [400] "devic"
## [401] "adminfreshrpmsnet"
## [402] "alsa"
## [403] "authnlegwnnet"
## [404] "custom"
## [405] "egwn"
## [406] "egwnnet"
## [407] "exampl"
## [408] "freshrpm"
## [409] "listfreshrpmsnet"
## [410] "listsfreshrpmsnetmailmanlistinforpm"
## [411] "listsfreshrpmsnetpipermailrpm"
## [412] "matthia"
## [413] "requestfreshrpmsnetsubjecthelp"
## [414] "requestfreshrpmsnetsubjectsubscrib"
## [415] "requestfreshrpmsnetsubjectunsubscrib"
## [416] "size"
## [417] "tue"
## [418] "zzzlist"
## [419] "zzzlistfreshrpmsnet"
## [420] "busi"
## [421] "gari"
## [422] "lawrenc"
## [423] "murphi"
## [424] "habea"
## [425] "follow"
## [426] "razor"
## [427] "sep"
## [428] "usersexamplesourceforgenet"
## [429] "userslistssourceforgenet"
## [430] "sat"
## [431] "zoonet"
## [432] "jmjmasonorg"
## [433] "jmlocalhost"
## [434] "pyzor"
## [435] "yyyylocalhostnetnoteinccom"
## [436] "apt"
## [437] "xml"
## [438] "global"
## [439] "train"
## [440] "examplecom"
## [441] "jalapeno"
## [442] "oct"
## [443] "encodingutf"
## [444] "rssfeedsexamplecom"
## [445] "wwwnewsisfreecomclick"
## [446] "datapow"
## [447] "swe"
## [448] "commit"
## [449] "yyyyexamplecom"
## [450] "feb"
## [451] "jmasonorg"
## [452] "yyyylocalhostexamplecom"
## [453] "rpmjmasonorg"
## [454] "alb"
## [455] "tnonsensefromtnonsensefrom"
## [456] "testsawl"
## [457] "rssfeedsjmasonorg"
# Word clouds restricted to terms with a frequency of at least 500
wordcloud(words = spam, min.freq = 500)
wordcloud(words = no_spam, min.freq = 500)
# Terms in the spam matrix whose correlation with "font" is at least 0.5
findAssocs(x = spam_dtm, terms = "font", corlimit = 0.5)
## $font
## famili classmsonorm
## 0.78 0.77
## style bidi
## 0.65 0.65
## accesso activitieso
## 0.65 0.65
## ada aligntablesrowbyrow
## 0.65 0.65
## authordonald baeo
## 0.65 0.65
## bhatti budgeth
## 0.65 0.65
## bullet characterso
## 0.65 0.65
## characterswithspac characterswithspaceso
## 0.65 0.65
## charset chicagocomap
## 0.65 0.65
## chicagocomwwwradisson classmsobodytextspan
## 0.65 0.65
## classmsobodytextwhen classmsoheadingtak
## 0.65 0.65
## classmsonormalspan complianto
## 0.65 0.65
## contentworddocu convien
## 0.65 0.65
## displayhorizontaldrawinggrideveri displayhorizontaldrawinggrideveryw
## 0.65 0.65
## displayverticaldrawinggrideveri displayverticaldrawinggrideveryw
## 0.65 0.65
## divmsobodytext divmsohead
## 0.65 0.65
## divmsonorm divsect
## 0.65 0.65
## documentproperti downtown
## 0.65 0.65
## endifo expresswayo
## 0.65 0.65
## fema flexibl
## 0.65 0.65
## footnotelayoutlikeww forgetlasttabalign
## 0.65 0.65
## grmtdz grmtdzdogmaslashnullorg
## 0.65 0.65
## hchicagoh hdonald
## 0.65 0.65
## heasi hrefillinoisflyerfilesfilelistxml
## 0.65 0.65
## idmap indent
## 0.65 0.65
## infopurplehotelcom inmso
## 0.65 0.65
## interv intext
## 0.65 0.65
## kern lastauthor
## 0.65 0.65
## lastauthordonald lastprint
## 0.65 0.65
## lastsav layoutrawtablewidth
## 0.65 0.65
## layouttablerowsapart lfo
## 0.65 0.65
## limsobodytext limsohead
## 0.65 0.65
## limsonorm lineso
## 0.65 0.65
## mbe nameorigin
## 0.65 0.65
## nbspnbspnbspnbspnbsp northshorespanp
## 0.65 0.65
## ofh oftitl
## 0.65 0.65
## orh orphan
## 0.65 0.65
## outlin oâ<U+0080><U+0099>har
## 0.65 0.65
## pageso pagin
## 0.65 0.65
## panos paragraphso
## 0.65 0.65
## parkingo pmsobodytext
## 0.65 0.65
## pmsohead pmsonorm
## 0.65 0.65
## pspanp ptmso
## 0.65 0.65
## purpl resortso
## 0.65 0.65
## revisiono riz
## 0.65 0.65
## rosemont shapelayout
## 0.65 0.65
## shapelayoutlikeww shapelayoutxmlendif
## 0.65 0.65
## shuttl spaceo
## 0.65 0.65
## spanmsohyperlink spanmsohyperlinkfollow
## 0.65 0.65
## spanspanendifspan supportemptyparasnbsp
## 0.65 0.65
## supportemptyparasnbspendifo supportlistsspan
## 0.65 0.65
## symbolmso templatenormalo
## 0.65 0.65
## totaltim totaltimeo
## 0.65 0.65
## tredsjunoscom usemarginsfordrawinggridorigin
## 0.65 0.65
## variabl versiono
## 0.65 0.65
## wingd worddocu
## 0.65 0.65
## wordso worldwideo
## 0.65 0.65
## wurn wwwradisson
## 0.65 0.65
## xml linkblu
## 0.65 0.64
## mso underlin
## 0.64 0.64
## widow ptfont
## 0.63 0.63
## radisson xmlendif
## 0.63 0.63
## left level
## 0.62 0.62
## tahoma hotel
## 0.61 0.60
## gte pitch
## 0.59 0.58
## extedit schema
## 0.56 0.56
## xmlns size
## 0.56 0.54
## decor roman
## 0.54 0.54
## stop avoid
## 0.53 0.53
### For spam, we notice terms like font, size, content, width, helvetica and http. These are fragments of HTML markup that set font size and color, and http comes from links to malicious websites.
### For non-spam, we do not see nearly as many HTML/link-related keywords; instead, we see a wide variety of ordinary words.
### Spam emails tend to look nicer and more polished; spammers use HTML markup to make their messages look that way.
### We can therefore predict that emails containing many HTML font size/color tags are more likely to be spam.
### Looking at the terms associated with "font", we see that terms such as style, roman, size, xml and ptfont tend to occur together with it.