library(tm)
## Warning: package 'tm' was built under R version 3.4.4
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.4.1
library(SnowballC)
## Warning: package 'SnowballC' was built under R version 3.4.1
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.4.4
## Loading required package: RColorBrewer
# Setting up Corpus
# Root folder that holds the "spam" and "easy_ham" sub-folders. Set the
# SPAMHAM_DIR environment variable to run the script on another machine; when
# it is unset or empty, the original location is used.
spamham_dir <- Sys.getenv("SPAMHAM_DIR", unset = "")
if (!nzchar(spamham_dir)) {
  spamham_dir <- "C:/Users/ahwang/Desktop/Spamham"
}
spam_dir <- file.path(spamham_dir, "spam")
ham_dir <- file.path(spamham_dir, "easy_ham")

# Fail early, naming the path(s) that could not be found.
corpus_dirs <- c(spam_dir, ham_dir)
missing_dirs <- corpus_dirs[!dir.exists(corpus_dirs)]
if (length(missing_dirs) > 0) {
  stop(
    "Corpus directory not found: ",
    paste(missing_dirs, collapse = ", "),
    call. = FALSE
  )
}

# One corpus per class: each file in the folder becomes one document.
spam <- Corpus(DirSource(spam_dir))
no_spam <- Corpus(DirSource(ham_dir))

# Checking to see if Corpus works correctly: print the third spam document
writeLines(as.character(spam[[3]]))
## From sabrina@mx3.1premio.com  Thu Aug 22 14:44:07 2002
## Return-Path: <sabrina@mx3.1premio.com>
## Delivered-To: zzzz@localhost.spamassassin.taint.org
## Received: from localhost (localhost [127.0.0.1])
##  by phobos.labs.spamassassin.taint.org (Postfix) with ESMTP id 1E90847C66
##  for <zzzz@localhost>; Thu, 22 Aug 2002 09:44:02 -0400 (EDT)
## Received: from mail.webnote.net [193.120.211.219]
##  by localhost with POP3 (fetchmail-5.9.0)
##  for zzzz@localhost (single-drop); Thu, 22 Aug 2002 14:44:03 +0100 (IST)
## Received: from email.qves.com (email1.qves.net [209.63.151.251] (may be forged))
##  by webnote.net (8.9.3/8.9.3) with ESMTP id OAA04953
##  for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 14:37:23 +0100
## Received: from qvp0086 ([169.254.6.17]) by email.qves.com with Microsoft SMTPSVC(5.0.2195.2966);
##   Thu, 22 Aug 2002 07:36:20 -0600
## From: "Slim Down" <sabrina@mx3.1premio.com>
## To: <zzzz@spamassassin.taint.org>
## Subject: Guaranteed to lose 10-12 lbs in 30 days                          11.150
## Date: Thu, 22 Aug 2002 07:36:19 -0600
## Message-ID: <9a63c01c249e0$e5a9d610$1106fea9@freeyankeedom.com>
## MIME-Version: 1.0
## Content-Type: text/plain;
##  charset="iso-8859-1"
## Content-Transfer-Encoding: 7bit
## X-Mailer: Microsoft CDO for Windows 2000
## Thread-Index: AcJJ4OWpowGq7rdNSwCz5HE3x9ZZDQ==
## Content-Class: urn:content-classes:message
## X-MimeOLE: Produced By Microsoft MimeOLE V6.00.2462.0000
## X-OriginalArrivalTime: 22 Aug 2002 13:36:20.0969 (UTC) FILETIME=[E692FD90:01C249E0]
## 
## 1) Fight The Risk of Cancer!
## http://www.adclick.ws/p.cfm?o=315&s=pk007
## 
## 2) Slim Down - Guaranteed to lose 10-12 lbs in 30 days
## http://www.adclick.ws/p.cfm?o=249&s=pk007
## 
## 3) Get the Child Support You Deserve - Free Legal Advice
## http://www.adclick.ws/p.cfm?o=245&s=pk002
## 
## 4) Join the Web's Fastest Growing Singles Community
## http://www.adclick.ws/p.cfm?o=259&s=pk007
## 
## 5) Start Your Private Photo Album Online!
## http://www.adclick.ws/p.cfm?o=283&s=pk007
## 
## Have a Wonderful Day,
## Offer Manager
## PrizeMama
## 
## 
## 
## 
## 
## 
## 
## 
## 
## 
## 
## 
## 
## If you wish to leave this list please use the link below.
## http://www.qves.com/trim/?zzzz@spamassassin.taint.org%7C17%7C308417
# Same sanity check for the non-spam corpus: print its third document
writeLines(text = as.character(no_spam[[3]]), con = stdout())
## From timc@2ubh.com  Thu Aug 22 13:52:59 2002
## Return-Path: <timc@2ubh.com>
## Delivered-To: zzzz@localhost.netnoteinc.com
## Received: from localhost (localhost [127.0.0.1])
##  by phobos.labs.netnoteinc.com (Postfix) with ESMTP id 0314547C66
##  for <zzzz@localhost>; Thu, 22 Aug 2002 08:52:58 -0400 (EDT)
## Received: from phobos [127.0.0.1]
##  by localhost with IMAP (fetchmail-5.9.0)
##  for zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:52:59 +0100 (IST)
## Received: from n16.grp.scd.yahoo.com (n16.grp.scd.yahoo.com
##     [66.218.66.71]) by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id
##     g7MCrdZ07070 for <zzzz@example.com>; Thu, 22 Aug 2002 13:53:39 +0100
## X-Egroups-Return: sentto-2242572-52733-1030020820-zzzz=example.com@returns.groups.yahoo.com
## Received: from [66.218.67.198] by n16.grp.scd.yahoo.com with NNFMP;
##     22 Aug 2002 12:53:40 -0000
## X-Sender: timc@2ubh.com
## X-Apparently-To: zzzzteana@yahoogroups.com
## Received: (EGP: mail-8_1_0_1); 22 Aug 2002 12:53:39 -0000
## Received: (qmail 76099 invoked from network); 22 Aug 2002 12:53:39 -0000
## Received: from unknown (66.218.66.218) by m5.grp.scd.yahoo.com with QMQP;
##     22 Aug 2002 12:53:39 -0000
## Received: from unknown (HELO rhenium.btinternet.com) (194.73.73.93) by
##     mta3.grp.scd.yahoo.com with SMTP; 22 Aug 2002 12:53:39 -0000
## Received: from host217-36-23-185.in-addr.btopenworld.com ([217.36.23.185])
##     by rhenium.btinternet.com with esmtp (Exim 3.22 #8) id 17hrT0-0004gj-00
##     for forteana@yahoogroups.com; Thu, 22 Aug 2002 13:53:38 +0100
## X-Mailer: Microsoft Outlook Express Macintosh Edition - 4.5 (0410)
## To: zzzzteana <zzzzteana@yahoogroups.com>
## X-Priority: 3
## Message-Id: <E17hrT0-0004gj-00@rhenium.btinternet.com>
## From: "Tim Chapman" <timc@2ubh.com>
## X-Yahoo-Profile: tim2ubh
## MIME-Version: 1.0
## Mailing-List: list zzzzteana@yahoogroups.com; contact
##     forteana-owner@yahoogroups.com
## Delivered-To: mailing list zzzzteana@yahoogroups.com
## Precedence: bulk
## List-Unsubscribe: <mailto:zzzzteana-unsubscribe@yahoogroups.com>
## Date: Thu, 22 Aug 2002 13:52:38 +0100
## Subject: [zzzzteana] Moscow bomber
## Reply-To: zzzzteana@yahoogroups.com
## Content-Type: text/plain; charset=US-ASCII
## Content-Transfer-Encoding: 7bit
## 
## Man Threatens Explosion In Moscow 
## 
## Thursday August 22, 2002 1:40 PM
## MOSCOW (AP) - Security officers on Thursday seized an unidentified man who
## said he was armed with explosives and threatened to blow up his truck in
## front of Russia's Federal Security Services headquarters in Moscow, NTV
## television reported.
## The officers seized an automatic rifle the man was carrying, then the man
## got out of the truck and was taken into custody, NTV said. No other details
## were immediately available.
## The man had demanded talks with high government officials, the Interfax and
## ITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to
## talk with Russian President Vladimir Putin.
## Police and security forces rushed to the Security Service building, within
## blocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the
## man, who claimed to have one and a half tons of explosives, the news
## agencies said. Negotiations continued for about one and a half hours outside
## the building, ITAR-Tass and Interfax reported, citing witnesses.
## The man later drove away from the building, under police escort, and drove
## to a street near Moscow's Olympic Penta Hotel, where authorities held
## further negotiations with him, the Moscow police press service said. The
## move appeared to be an attempt by security services to get him to a more
## secure location. 
## 
## ------------------------ Yahoo! Groups Sponsor ---------------------~-->
## 4 DVDs Free +s&p Join Now
## http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM
## ---------------------------------------------------------------------~->
## 
## To unsubscribe from this group, send an email to:
## forteana-unsubscribe@egroups.com
## 
##  
## 
## Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/
#### Replace separator-like characters with spaces
# Content transformer that substitutes every match of `pattern` (a regular
# expression, as interpreted by gsub()) with a single space, so the text on
# either side of a match stays separated.
#   x       - character content of the document(s) being transformed
#   pattern - regular expression to blank out
cleaning <- content_transformer(function(x, pattern) {
  gsub(pattern, " ", x)
})

# Hyphens, colons, apostrophes and backticks are each swapped for a space in
# one pass per corpus. The leading "-" inside the bracket expression is a
# literal hyphen. The former extra " -" pass is dropped: it ran after every
# "-" had already been replaced, so it could never match anything.
separator_pattern <- "[-:'`]"

spam <- tm_map(spam, cleaning, separator_pattern)
no_spam <- tm_map(no_spam, cleaning, separator_pattern)

# Text normalisation, bundled in one helper so that both corpora go through
# exactly the same transformations in exactly the same order.
#
# Steps, in order: remove punctuation, lower-case, drop English stop words,
# remove digits, stem (chop word endings so related words are counted
# together), collapse runs of whitespace.
#
# NOTE(review): "don" and "won" show up among the frequent terms printed
# further down, which suggests contractions slip past the stop-word step
# (presumably because apostrophes are gone before it runs) -- TODO confirm
# and consider removing stop words before stripping punctuation.
normalise_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stemDocument)
  tm_map(corpus, stripWhitespace)
}

spam <- normalise_corpus(spam)
no_spam <- normalise_corpus(no_spam)

# Third spam document after normalisation
writeLines(as.character(spam[[3]]))
## sabrinamxpremiocom thu aug return path sabrinamxpremiocom deliv zzzzlocalhostspamassassintaintorg receiv localhost localhost phoboslabsspamassassintaintorg postfix esmtp id ec zzzzlocalhost thu aug edt receiv mailwebnotenet localhost pop fetchmail zzzzlocalhost singl drop thu aug ist receiv emailqvescom emailqvesnet may forg webnotenet esmtp id oaa zzzzspamassassintaintorg thu aug receiv qvp emailqvescom microsoft smtpsvc thu aug slim sabrinamxpremiocom zzzzspamassassintaintorg subject guarante lose lbs day date thu aug messag id acceeadfeafreeyankeedomcom mime version content type textplain charsetiso content transfer encod bit x mailer microsoft cdo window thread index acjjowpowgqrdnswczhexzzdq content class urn content class messag x mimeol produc microsoft mimeol v x originalarrivaltim aug utc filetimeefd ce fight risk cancer http wwwadclickwspcfmospk slim guarante lose lbs day http wwwadclickwspcfmospk get child support deserv free legal advic http wwwadclickwspcfmospk join web s fastest grow singl communiti http wwwadclickwspcfmospk start privat photo album onlin http wwwadclickwspcfmospk wonder day offer manag prizemama wish leav list pleas use link http wwwqvescomtrimzzzzspamassassintaintorgcc
# Third non-spam document after preprocessing
writeLines(text = as.character(no_spam[[3]]), con = stdout())
## timcubhcom thu aug return path timcubhcom deliv zzzzlocalhostnetnoteinccom receiv localhost localhost phoboslabsnetnoteinccom postfix esmtp id c zzzzlocalhost thu aug edt receiv phobo localhost imap fetchmail zzzzlocalhost singl drop thu aug ist receiv ngrpscdyahoocom ngrpscdyahoocom dogmaslashnullorg smtp id gmcrdz zzzzexamplecom thu aug x egroup return sentto zzzzexamplecomreturnsgroupsyahoocom receiv ngrpscdyahoocom nnfmp aug x sender timcubhcom x appar zzzzteanayahoogroupscom receiv egp mail aug receiv qmail invok network aug receiv unknown mgrpscdyahoocom qmqp aug receiv unknown helo rheniumbtinternetcom mtagrpscdyahoocom smtp aug receiv host in addrbtopenworldcom rheniumbtinternetcom esmtp exim id hrt gj forteanayahoogroupscom thu aug x mailer microsoft outlook express macintosh edit zzzzteana zzzzteanayahoogroupscom x prioriti messag id ehrt gj rheniumbtinternetcom tim chapman timcubhcom x yahoo profil timubh mime version mail list list zzzzteanayahoogroupscom contact forteana owneryahoogroupscom deliv mail list zzzzteanayahoogroupscom preced bulk list unsubscrib mailto zzzzteana unsubscribeyahoogroupscom date thu aug subject zzzzteana moscow bomber repli zzzzteanayahoogroupscom content type textplain charsetus ascii content transfer encod bit man threaten explos moscow thursday august pm moscow ap secur offic thursday seiz unidentifi man said arm explos threaten blow truck front russia s feder secur servic headquart moscow ntv televis report offic seiz automat rifl man carri man got truck taken custodi ntv said detail immedi avail man demand talk high govern offici interfax itar tass news agenc said ekho moskvi radio report want talk russian presid vladimir putin polic secur forc rush secur servic build within block kremlin red squar bolshoi ballet surround man claim one half ton explos news agenc said negoti continu one half hour outsid build itar tass interfax report cite wit man later drove away build polic escort drove street near moscow s olymp 
## penta hotel author held negoti moscow polic press servic said move appear attempt secur servic get secur locat yahoo group sponsor dvds free sp join now http usclickyahoocomptybbnxieaamghaagsolbtm unsubscrib group send email forteana unsubscribeegroupscom use yahoo group subject http docsyahoocominfoterm
## Document-term matrices: documents as rows, terms as columns
no_spam_dtm <- DocumentTermMatrix(x = no_spam)
spam_dtm <- DocumentTermMatrix(x = spam)

# Show the summary (dimensions, sparsity, weighting) of the spam matrix
spam_dtm
## <<DocumentTermMatrix (documents: 501, terms: 30091)>>
## Non-/sparse entries: 106856/14968735
## Sparsity           : 99%
## Maximal term length: 298
## Weighting          : term frequency (tf)
# Show the summary (dimensions, sparsity, weighting) of the non-spam matrix
no_spam_dtm
## <<DocumentTermMatrix (documents: 2551, terms: 31034)>>
## Non-/sparse entries: 385988/78781746
## Sparsity           : 100%
## Maximal term length: 261
## Weighting          : term frequency (tf)
## Word frequency analytics
# Spam terms with a total count of at least 200 (no upper bound)
findFreqTerms(spam_dtm, lowfreq = 200, highfreq = Inf)
##   [1] "aug"                              
##   [2] "bodi"                             
##   [3] "borderd"                          
##   [4] "can"                              
##   [5] "cellpaddingd"                     
##   [6] "cellspacingd"                     
##   [7] "center"                           
##   [8] "charsetiso"                       
##   [9] "click"                            
##  [10] "colord"                           
##  [11] "compani"                          
##  [12] "content"                          
##  [13] "date"                             
##  [14] "deliv"                            
##  [15] "drop"                             
##  [16] "edt"                              
##  [17] "email"                            
##  [18] "encod"                            
##  [19] "esmtp"                            
##  [20] "facedari"                         
##  [21] "famili"                           
##  [22] "fetchmail"                        
##  [23] "font"                             
##  [24] "free"                             
##  [25] "get"                              
##  [26] "heightd"                          
##  [27] "help"                             
##  [28] "helvetica"                        
##  [29] "hrefdhttp"                        
##  [30] "html"                             
##  [31] "http"                             
##  [32] "invest"                           
##  [33] "ist"                              
##  [34] "life"                             
##  [35] "list"                             
##  [36] "localhost"                        
##  [37] "mail"                             
##  [38] "mailwebnotenet"                   
##  [39] "messag"                           
##  [40] "meta"                             
##  [41] "microsoft"                        
##  [42] "mime"                             
##  [43] "money"                            
##  [44] "new"                              
##  [45] "path"                             
##  [46] "phoboslabsspamassassintaintorg"   
##  [47] "pleas"                            
##  [48] "postfix"                          
##  [49] "quot"                             
##  [50] "receiv"                           
##  [51] "remov"                            
##  [52] "return"                           
##  [53] "right"                            
##  [54] "san"                              
##  [55] "sat"                              
##  [56] "serif"                            
##  [57] "servic"                           
##  [58] "singl"                            
##  [59] "size"                             
##  [60] "state"                            
##  [61] "subject"                          
##  [62] "tabl"                             
##  [63] "texthtml"                         
##  [64] "thu"                              
##  [65] "transfer"                         
##  [66] "type"                             
##  [67] "version"                          
##  [68] "webnotenet"                       
##  [69] "wed"                              
##  [70] "widthd"                           
##  [71] "will"                             
##  [72] "zzzzlocalhost"                    
##  [73] "zzzzlocalhostspamassassintaintorg"
##  [74] "zzzzspamassassintaintorg"         
##  [75] "bit"                              
##  [76] "day"                              
##  [77] "dogmaslashnullorg"                
##  [78] "ilug"                             
##  [79] "iluglinuxi"                       
##  [80] "imap"                             
##  [81] "inform"                           
##  [82] "lughtuathaorg"                    
##  [83] "mailer"                           
##  [84] "offer"                            
##  [85] "textplain"                        
##  [86] "use"                              
##  [87] "may"                              
##  [88] "don"                              
##  [89] "internet"                         
##  [90] "just"                             
##  [91] "million"                          
##  [92] "name"                             
##  [93] "now"                              
##  [94] "one"                              
##  [95] "prioriti"                         
##  [96] "site"                             
##  [97] "smtp"                             
##  [98] "year"                             
##  [99] "address"                          
## [100] "make"                             
## [101] "market"                           
## [102] "peopl"                            
## [103] "program"                          
## [104] "repli"                            
## [105] "send"                             
## [106] "time"                             
## [107] "want"                             
## [108] "work"                             
## [109] "form"                             
## [110] "home"                             
## [111] "normal"                           
## [112] "arial"                            
## [113] "border"                           
## [114] "cellpad"                          
## [115] "cellspac"                         
## [116] "color"                            
## [117] "div"                              
## [118] "faceari"                          
## [119] "head"                             
## [120] "height"                           
## [121] "hrefhttp"                         
## [122] "img"                              
## [123] "input"                            
## [124] "need"                             
## [125] "option"                           
## [126] "srchttp"                          
## [127] "width"                            
## [128] "busi"                             
## [129] "fri"                              
## [130] "price"                            
## [131] "faceverdana"                      
## [132] "order"                            
## [133] "report"                           
## [134] "govern"                           
## [135] "aligndcent"                       
## [136] "srcdhttp"                         
## [137] "top"                              
## [138] "tue"                              
## [139] "bottom"                           
## [140] "sun"                              
## [141] "mon"                              
## [142] "margin"                           
## [143] "sep"                              
## [144] "jalapeno"                         
## [145] "zzzzasonorg"                      
## [146] "grant"                            
## [147] "blockquotefont"                   
## [148] "faceverdanafont"
# Non-spam terms with a total count of at least 200 (no upper bound)
findFreqTerms(no_spam_dtm, lowfreq = 200, highfreq = Inf)
##   [1] "actual"                                        
##   [2] "adminexamplecom"                               
##   [3] "archiv"                                        
##   [4] "ascii"                                         
##   [5] "aug"                                           
##   [6] "beenther"                                      
##   [7] "bulk"                                          
##   [8] "can"                                           
##   [9] "charsetus"                                     
##  [10] "chris"                                         
##  [11] "code"                                          
##  [12] "come"                                          
##  [13] "content"                                       
##  [14] "creat"                                         
##  [15] "cvs"                                           
##  [16] "cwg"                                           
##  [17] "date"                                          
##  [18] "day"                                           
##  [19] "deliv"                                         
##  [20] "develop"                                       
##  [21] "discuss"                                       
##  [22] "dogmaslashnullorg"                             
##  [23] "drop"                                          
##  [24] "edt"                                           
##  [25] "error"                                         
##  [26] "esmtp"                                         
##  [27] "everi"                                         
##  [28] "exmh"                                          
##  [29] "fetchmail"                                     
##  [30] "get"                                           
##  [31] "happen"                                        
##  [32] "help"                                          
##  [33] "hit"                                           
##  [34] "https"                                         
##  [35] "imap"                                          
##  [36] "int"                                           
##  [37] "issu"                                          
##  [38] "ist"                                           
##  [39] "like"                                          
##  [40] "line"                                          
##  [41] "list"                                          
##  [42] "listmanexamplecom"                             
##  [43] "listmanexamplecommailmanlistinfoexmh"          
##  [44] "listmanredhatcom"                              
##  [45] "local"                                         
##  [46] "localhost"                                     
##  [47] "localhostlocaldomain"                          
##  [48] "log"                                           
##  [49] "loop"                                          
##  [50] "mail"                                          
##  [51] "mailman"                                       
##  [52] "mailto"                                        
##  [53] "mark"                                          
##  [54] "messag"                                        
##  [55] "mime"                                          
##  [56] "mxcorpexamplecom"                              
##  [57] "mxexamplecom"                                  
##  [58] "new"                                           
##  [59] "one"                                           
##  [60] "part"                                          
##  [61] "path"                                          
##  [62] "phobo"                                         
##  [63] "phoboslabsnetnoteinccom"                       
##  [64] "post"                                          
##  [65] "postfix"                                       
##  [66] "preced"                                        
##  [67] "receiv"                                        
##  [68] "refer"                                         
##  [69] "repli"                                         
##  [70] "return"                                        
##  [71] "run"                                           
##  [72] "sender"                                        
##  [73] "sequenc"                                       
##  [74] "sinc"                                          
##  [75] "singl"                                         
##  [76] "smtp"                                          
##  [77] "still"                                         
##  [78] "subject"                                       
##  [79] "subscrib"                                      
##  [80] "sun"                                           
##  [81] "textplain"                                     
##  [82] "think"                                         
##  [83] "thu"                                           
##  [84] "time"                                          
##  [85] "today"                                         
##  [86] "type"                                          
##  [87] "unsubscrib"                                    
##  [88] "use"                                           
##  [89] "version"                                       
##  [90] "wed"                                           
##  [91] "window"                                        
##  [92] "without"                                       
##  [93] "work"                                          
##  [94] "worker"                                        
##  [95] "workersexamplecom"                             
##  [96] "workersredhatcom"                              
##  [97] "zzzzlocalhost"                                 
##  [98] "zzzzlocalhostnetnoteinccom"                    
##  [99] "bit"                                           
## [100] "email"                                         
## [101] "encod"                                         
## [102] "far"                                           
## [103] "free"                                          
## [104] "group"                                         
## [105] "helo"                                          
## [106] "high"                                          
## [107] "http"                                          
## [108] "internet"                                      
## [109] "invok"                                         
## [110] "mailer"                                        
## [111] "network"                                       
## [112] "ngrpscdyahoocom"                               
## [113] "now"                                           
## [114] "qmail"                                         
## [115] "send"                                          
## [116] "servic"                                        
## [117] "sponsor"                                       
## [118] "technolog"                                     
## [119] "transfer"                                      
## [120] "unknown"                                       
## [121] "well"                                          
## [122] "yahoo"                                         
## [123] "zzzzexamplecom"                                
## [124] "zzzzteanayahoogroupscom"                       
## [125] "build"                                         
## [126] "exim"                                          
## [127] "forc"                                          
## [128] "got"                                           
## [129] "govern"                                        
## [130] "microsoft"                                     
## [131] "move"                                          
## [132] "news"                                          
## [133] "outlook"                                       
## [134] "presid"                                        
## [135] "prioriti"                                      
## [136] "red"                                           
## [137] "report"                                        
## [138] "said"                                          
## [139] "secur"                                         
## [140] "talk"                                          
## [141] "want"                                          
## [142] "within"                                        
## [143] "data"                                          
## [144] "home"                                          
## [145] "last"                                          
## [146] "make"                                          
## [147] "may"                                           
## [148] "say"                                           
## [149] "septemb"                                       
## [150] "softwar"                                       
## [151] "warn"                                          
## [152] "web"                                           
## [153] "won"                                           
## [154] "world"                                         
## [155] "add"                                           
## [156] "also"                                          
## [157] "even"                                          
## [158] "file"                                          
## [159] "folder"                                        
## [160] "give"                                          
## [161] "header"                                        
## [162] "key"                                           
## [163] "linux"                                         
## [164] "long"                                          
## [165] "might"                                         
## [166] "must"                                          
## [167] "old"                                           
## [168] "organ"                                         
## [169] "possibl"                                       
## [170] "put"                                           
## [171] "quot"                                          
## [172] "see"                                           
## [173] "sent"                                          
## [174] "server"                                        
## [175] "set"                                           
## [176] "signatur"                                      
## [177] "user"                                          
## [178] "usersexamplecom"                               
## [179] "usersredhatcom"                                
## [180] "way"                                           
## [181] "wrote"                                         
## [182] "accept"                                        
## [183] "agent"                                         
## [184] "ask"                                           
## [185] "authent"                                       
## [186] "just"                                          
## [187] "languag"                                       
## [188] "lot"                                           
## [189] "mozilla"                                       
## [190] "never"                                         
## [191] "person"                                        
## [192] "manag"                                         
## [193] "normal"                                        
## [194] "realli"                                        
## [195] "applic"                                        
## [196] "better"                                        
## [197] "charsetiso"                                    
## [198] "end"                                           
## [199] "first"                                         
## [200] "prefer"                                        
## [201] "will"                                          
## [202] "year"                                          
## [203] "found"                                         
## [204] "look"                                          
## [205] "peopl"                                         
## [206] "probabl"                                       
## [207] "right"                                         
## [208] "tri"                                           
## [209] "anyon"                                         
## [210] "big"                                           
## [211] "call"                                          
## [212] "chang"                                         
## [213] "current"                                       
## [214] "design"                                        
## [215] "good"                                          
## [216] "industri"                                      
## [217] "interest"                                      
## [218] "live"                                          
## [219] "made"                                          
## [220] "mani"                                          
## [221] "mean"                                          
## [222] "name"                                          
## [223] "need"                                          
## [224] "open"                                          
## [225] "origin"                                        
## [226] "point"                                         
## [227] "power"                                         
## [228] "read"                                          
## [229] "reason"                                        
## [230] "releas"                                        
## [231] "result"                                        
## [232] "second"                                        
## [233] "seem"                                          
## [234] "someth"                                        
## [235] "start"                                         
## [236] "stuff"                                         
## [237] "thing"                                         
## [238] "two"                                           
## [239] "unit"                                          
## [240] "word"                                          
## [241] "write"                                         
## [242] "begin"                                         
## [243] "check"                                         
## [244] "clean"                                         
## [245] "comment"                                       
## [246] "deliveri"                                      
## [247] "don"                                           
## [248] "imag"                                          
## [249] "instal"                                        
## [250] "pgp"                                           
## [251] "process"                                       
## [252] "real"                                          
## [253] "test"                                          
## [254] "unseen"                                        
## [255] "url"                                           
## [256] "war"                                           
## [257] "adminexamplesourceforgenet"                    
## [258] "adminlistssourceforgenet"                      
## [259] "base"                                          
## [260] "bsourceforgenet"                               
## [261] "debian"                                        
## [262] "engin"                                         
## [263] "examplesourceforgenetlistslistinfospamassassin"
## [264] "exist"                                         
## [265] "find"                                          
## [266] "fwsourceforgenet"                              
## [267] "helousw"                                       
## [268] "listsourceforgenet"                            
## [269] "listssourceforgenetlistslistinfospamassassin"  
## [270] "mimeol"                                        
## [271] "msmail"                                        
## [272] "phone"                                         
## [273] "produc"                                        
## [274] "provid"                                        
## [275] "requestexamplesourceforgenetsubjecthelp"       
## [276] "requestlistssourceforgenetsubjectsubscrib"     
## [277] "requestlistssourceforgenetsubjectunsubscrib"   
## [278] "script"                                        
## [279] "sfnet"                                         
## [280] "spam"                                          
## [281] "spamassassin"                                  
## [282] "talkexamplesourceforgenet"                     
## [283] "talklistssourceforgenet"                       
## [284] "usw"                                           
## [285] "devel"                                         
## [286] "develexamplesourceforgenet"                    
## [287] "mailscann"                                     
## [288] "rule"                                          
## [289] "thank"                                         
## [290] "case"                                          
## [291] "differ"                                        
## [292] "doesn"                                         
## [293] "done"                                          
## [294] "experi"                                        
## [295] "fri"                                           
## [296] "generat"                                       
## [297] "includ"                                        
## [298] "know"                                          
## [299] "let"                                           
## [300] "mayb"                                          
## [301] "play"                                          
## [302] "problem"                                       
## [303] "yet"                                           
## [304] "anyth"                                         
## [305] "formatflow"                                    
## [306] "allow"                                         
## [307] "great"                                         
## [308] "idea"                                          
## [309] "requir"                                        
## [310] "though"                                        
## [311] "updat"                                         
## [312] "week"                                          
## [313] "adminlinuxi"                                   
## [314] "didn"                                          
## [315] "disposit"                                      
## [316] "friend"                                        
## [317] "hat"                                           
## [318] "ilug"                                          
## [319] "iluglinuxi"                                    
## [320] "inform"                                        
## [321] "inlin"                                         
## [322] "irish"                                         
## [323] "kernel"                                        
## [324] "link"                                          
## [325] "lughtuathaorg"                                 
## [326] "mon"                                           
## [327] "tell"                                          
## [328] "userid"                                        
## [329] "bug"                                           
## [330] "less"                                          
## [331] "much"                                          
## [332] "take"                                          
## [333] "thought"                                       
## [334] "address"                                       
## [335] "adminxentcom"                                  
## [336] "american"                                      
## [337] "anoth"                                         
## [338] "best"                                          
## [339] "communic"                                      
## [340] "compani"                                       
## [341] "countri"                                       
## [342] "fork"                                          
## [343] "forkexamplecom"                                
## [344] "forkxentcom"                                   
## [345] "khare"                                         
## [346] "lairxentcom"                                   
## [347] "law"                                           
## [348] "least"                                         
## [349] "level"                                         
## [350] "market"                                        
## [351] "nation"                                        
## [352] "pdt"                                           
## [353] "polit"                                         
## [354] "public"                                        
## [355] "requestxentcomsubjecthelp"                     
## [356] "requestxentcomsubjectsubscrib"                 
## [357] "requestxentcomsubjectunsubscrib"               
## [358] "rohit"                                         
## [359] "state"                                         
## [360] "support"                                       
## [361] "xentcom"                                       
## [362] "xentcommailmanlistinfofork"                    
## [363] "xentcompipermailfork"                          
## [364] "question"                                      
## [365] "comput"                                        
## [366] "number"                                        
## [367] "show"                                          
## [368] "oper"                                          
## [369] "someon"                                        
## [370] "sure"                                          
## [371] "system"                                        
## [372] "import"                                        
## [373] "justin"                                        
## [374] "els"                                           
## [375] "remov"                                         
## [376] "trade"                                         
## [377] "pinelnx"                                       
## [378] "product"                                       
## [379] "keep"                                          
## [380] "packag"                                        
## [381] "java"                                          
## [382] "place"                                         
## [383] "pleas"                                         
## [384] "program"                                       
## [385] "sourc"                                         
## [386] "status"                                        
## [387] "next"                                          
## [388] "rpm"                                           
## [389] "perl"                                          
## [390] "around"                                        
## [391] "crankslacknet"                                 
## [392] "back"                                          
## [393] "suppli"                                        
## [394] "cours"                                         
## [395] "bush"                                          
## [396] "html"                                          
## [397] "stori"                                         
## [398] "page"                                          
## [399] "site"                                          
## [400] "devic"                                         
## [401] "adminfreshrpmsnet"                             
## [402] "alsa"                                          
## [403] "authnlegwnnet"                                 
## [404] "custom"                                        
## [405] "egwn"                                          
## [406] "egwnnet"                                       
## [407] "exampl"                                        
## [408] "freshrpm"                                      
## [409] "listfreshrpmsnet"                              
## [410] "listsfreshrpmsnetmailmanlistinforpm"           
## [411] "listsfreshrpmsnetpipermailrpm"                 
## [412] "matthia"                                       
## [413] "requestfreshrpmsnetsubjecthelp"                
## [414] "requestfreshrpmsnetsubjectsubscrib"            
## [415] "requestfreshrpmsnetsubjectunsubscrib"          
## [416] "size"                                          
## [417] "tue"                                           
## [418] "zzzlist"                                       
## [419] "zzzlistfreshrpmsnet"                           
## [420] "busi"                                          
## [421] "gari"                                          
## [422] "lawrenc"                                       
## [423] "murphi"                                        
## [424] "habea"                                         
## [425] "follow"                                        
## [426] "razor"                                         
## [427] "sep"                                           
## [428] "usersexamplesourceforgenet"                    
## [429] "userslistssourceforgenet"                      
## [430] "sat"                                           
## [431] "zoonet"                                        
## [432] "jmjmasonorg"                                   
## [433] "jmlocalhost"                                   
## [434] "pyzor"                                         
## [435] "yyyylocalhostnetnoteinccom"                    
## [436] "apt"                                           
## [437] "xml"                                           
## [438] "global"                                        
## [439] "train"                                         
## [440] "examplecom"                                    
## [441] "jalapeno"                                      
## [442] "oct"                                           
## [443] "encodingutf"                                   
## [444] "rssfeedsexamplecom"                            
## [445] "wwwnewsisfreecomclick"                         
## [446] "datapow"                                       
## [447] "swe"                                           
## [448] "commit"                                        
## [449] "yyyyexamplecom"                                
## [450] "feb"                                           
## [451] "jmasonorg"                                     
## [452] "yyyylocalhostexamplecom"                       
## [453] "rpmjmasonorg"                                  
## [454] "alb"                                           
## [455] "tnonsensefromtnonsensefrom"                    
## [456] "testsawl"                                      
## [457] "rssfeedsjmasonorg"
# Word cloud of the spam corpus; terms with frequency below 500 are not drawn.
wordcloud(words = spam, min.freq = 500)

# Same plot, with the same frequency cutoff, for the non-spam (ham) corpus.
wordcloud(words = no_spam, min.freq = 500)

# List the terms in the spam document-term matrix whose correlation with
# "font" is at least 0.5.
findAssocs(spam_dtm, terms = "font", corlimit = 0.5)
## $font
##                             famili                       classmsonorm 
##                               0.78                               0.77 
##                              style                               bidi 
##                               0.65                               0.65 
##                            accesso                        activitieso 
##                               0.65                               0.65 
##                                ada                aligntablesrowbyrow 
##                               0.65                               0.65 
##                       authordonald                               baeo 
##                               0.65                               0.65 
##                             bhatti                            budgeth 
##                               0.65                               0.65 
##                             bullet                        characterso 
##                               0.65                               0.65 
##                 characterswithspac              characterswithspaceso 
##                               0.65                               0.65 
##                            charset                       chicagocomap 
##                               0.65                               0.65 
##              chicagocomwwwradisson               classmsobodytextspan 
##                               0.65                               0.65 
##               classmsobodytextwhen                 classmsoheadingtak 
##                               0.65                               0.65 
##                 classmsonormalspan                         complianto 
##                               0.65                               0.65 
##                    contentworddocu                            convien 
##                               0.65                               0.65 
##  displayhorizontaldrawinggrideveri displayhorizontaldrawinggrideveryw 
##                               0.65                               0.65 
##    displayverticaldrawinggrideveri   displayverticaldrawinggrideveryw 
##                               0.65                               0.65 
##                     divmsobodytext                         divmsohead 
##                               0.65                               0.65 
##                         divmsonorm                            divsect 
##                               0.65                               0.65 
##                   documentproperti                           downtown 
##                               0.65                               0.65 
##                             endifo                        expresswayo 
##                               0.65                               0.65 
##                               fema                            flexibl 
##                               0.65                               0.65 
##               footnotelayoutlikeww                 forgetlasttabalign 
##                               0.65                               0.65 
##                             grmtdz            grmtdzdogmaslashnullorg 
##                               0.65                               0.65 
##                          hchicagoh                            hdonald 
##                               0.65                               0.65 
##                              heasi  hrefillinoisflyerfilesfilelistxml 
##                               0.65                               0.65 
##                              idmap                             indent 
##                               0.65                               0.65 
##                 infopurplehotelcom                              inmso 
##                               0.65                               0.65 
##                             interv                             intext 
##                               0.65                               0.65 
##                               kern                         lastauthor 
##                               0.65                               0.65 
##                   lastauthordonald                          lastprint 
##                               0.65                               0.65 
##                            lastsav                layoutrawtablewidth 
##                               0.65                               0.65 
##               layouttablerowsapart                                lfo 
##                               0.65                               0.65 
##                      limsobodytext                          limsohead 
##                               0.65                               0.65 
##                          limsonorm                             lineso 
##                               0.65                               0.65 
##                                mbe                         nameorigin 
##                               0.65                               0.65 
##               nbspnbspnbspnbspnbsp                    northshorespanp 
##                               0.65                               0.65 
##                                ofh                             oftitl 
##                               0.65                               0.65 
##                                orh                             orphan 
##                               0.65                               0.65 
##                             outlin                            oâ<U+0080><U+0099>har 
##                               0.65                               0.65 
##                             pageso                              pagin 
##                               0.65                               0.65 
##                              panos                        paragraphso 
##                               0.65                               0.65 
##                           parkingo                       pmsobodytext 
##                               0.65                               0.65 
##                           pmsohead                           pmsonorm 
##                               0.65                               0.65 
##                             pspanp                              ptmso 
##                               0.65                               0.65 
##                              purpl                           resortso 
##                               0.65                               0.65 
##                          revisiono                                riz 
##                               0.65                               0.65 
##                           rosemont                        shapelayout 
##                               0.65                               0.65 
##                  shapelayoutlikeww                shapelayoutxmlendif 
##                               0.65                               0.65 
##                             shuttl                             spaceo 
##                               0.65                               0.65 
##                   spanmsohyperlink             spanmsohyperlinkfollow 
##                               0.65                               0.65 
##                  spanspanendifspan              supportemptyparasnbsp 
##                               0.65                               0.65 
##        supportemptyparasnbspendifo                   supportlistsspan 
##                               0.65                               0.65 
##                          symbolmso                    templatenormalo 
##                               0.65                               0.65 
##                           totaltim                         totaltimeo 
##                               0.65                               0.65 
##                      tredsjunoscom     usemarginsfordrawinggridorigin 
##                               0.65                               0.65 
##                            variabl                           versiono 
##                               0.65                               0.65 
##                              wingd                           worddocu 
##                               0.65                               0.65 
##                             wordso                         worldwideo 
##                               0.65                               0.65 
##                               wurn                        wwwradisson 
##                               0.65                               0.65 
##                                xml                            linkblu 
##                               0.65                               0.64 
##                                mso                           underlin 
##                               0.64                               0.64 
##                              widow                             ptfont 
##                               0.63                               0.63 
##                           radisson                           xmlendif 
##                               0.63                               0.63 
##                               left                              level 
##                               0.62                               0.62 
##                             tahoma                              hotel 
##                               0.61                               0.60 
##                                gte                              pitch 
##                               0.59                               0.58 
##                            extedit                             schema 
##                               0.56                               0.56 
##                              xmlns                               size 
##                               0.56                               0.54 
##                              decor                              roman 
##                               0.54                               0.54 
##                               stop                              avoid 
##                               0.53                               0.53
### For spam, we notice words like font, size, content, width, helvetica, and http. Most of these are fragments of HTML markup that control font size and color, while http comes from embedded links, which in spam often point to malicious websites.
### For non-spam (ham), we do not necessarily see many HTML- or link-related keywords; instead, we see a wide assortment of seemingly random words.
### Spam emails are often designed to look polished and attractive, and spammers tend to use HTML markup to achieve that look.
### We can therefore predict that emails containing many HTML tags related to font size and color are more likely to be spam.
### Looking at the terms associated with "font", we see that words such as style, roman, size, xml, and ptfont tend to appear together with it.