For this project, you can start with a spam/ham dataset and then predict the class of new documents (either withheld from the training dataset or taken from another source, such as your own spam folder).

First, we need to load the data into data frames.

# Directories (relative to the working directory) that hold the ham and spam
# message files.
ham_dir <- "ham"
spam_dir <- "spam"

# Paths of every file in each directory. full.names = TRUE prepends the
# directory to each file name so the paths can be passed straight to a reader.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
ham_files <- list.files(path = ham_dir, full.names = TRUE)
spam_files <- list.files(path = spam_dir, full.names = TRUE)

# Read each ham file into a single string and collect the results in a data
# frame with one row per file.
# vapply() always returns a character vector, including a zero-length one when
# `ham_files` is empty, so an empty directory produces a zero-row data frame
# instead of failing on the column rename / label assignment.
ham_text <- vapply(ham_files, read_file, character(1), USE.NAMES = FALSE)
ham <- data.frame(
  Text = ham_text,
  # Class label: 0 marks ham. rep() keeps the column length equal to the
  # number of files, which also covers the zero-file case.
  Spam = rep(0, length(ham_text)),
  # Keep the message text as character on R versions where data.frame()
  # would otherwise convert strings to factors.
  stringsAsFactors = FALSE
)
head(ham, 3)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      Text
## 1 From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002\nReturn-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: zzzz@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)\nReceived: from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for\n    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100\nReceived: from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by\n    listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002\n    07:35:02 -0400 (EDT)\nDelivered-To: exmh-workers@listman.spamassassin.taint.org\nReceived: from int-mx1.corp.spamassassin.taint.org (int-mx1.corp.spamassassin.taint.org\n    [172.16.52.254]) by listman.redhat.com (Postfix) with ESMTP id 10CF8406D7\n    for <exmh-workers@listman.redhat.com>; Thu, 22 Aug 2002 07:34:10 -0400\n    (EDT)\nReceived: (from mail@localhost) by int-mx1.corp.spamassassin.taint.org (8.11.6/8.11.6)\n    id g7MBY7g11259 for exmh-workers@listman.redhat.com; Thu, 22 Aug 2002\n    07:34:07 -0400\nReceived: from mx1.spamassassin.taint.org (mx1.spamassassin.taint.org [172.16.48.31]) by\n    int-mx1.corp.redhat.com (8.11.6/8.11.6) with SMTP id g7MBY7Y11255 for\n    <exmh-workers@redhat.com>; Thu, 22 Aug 2002 07:34:07 -0400\nReceived: from ratree.psu.ac.th ([202.28.97.6]) by mx1.spamassassin.taint.org\n    (8.11.6/8.11.6) with SMTP id g7MBIhl25223 for <exmh-workers@redhat.com>;\n    Thu, 22 Aug 2002 07:18:55 -0400\nReceived: from delta.cs.mu.OZ.AU (delta.coe.psu.ac.th [172.30.0.98]) by\n    ratree.psu.ac.th (8.11.6/8.11.6) with ESMTP id g7MBWel29762;\n  
  Thu, 22 Aug 2002 18:32:40 +0700 (ICT)\nReceived: from munnari.OZ.AU (localhost [127.0.0.1]) by delta.cs.mu.OZ.AU\n    (8.11.6/8.11.6) with ESMTP id g7MBQPW13260; Thu, 22 Aug 2002 18:26:25\n    +0700 (ICT)\nFrom: Robert Elz <kre@munnari.OZ.AU>\nTo: Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\nCc: exmh-workers@spamassassin.taint.org\nSubject: Re: New Sequences Window\nIn-Reply-To: <1029945287.4797.TMDA@deepeddy.vircio.com>\nReferences: <1029945287.4797.TMDA@deepeddy.vircio.com>\n    <1029882468.3116.TMDA@deepeddy.vircio.com> <9627.1029933001@munnari.OZ.AU>\n    <1029943066.26919.TMDA@deepeddy.vircio.com>\n    <1029944441.398.TMDA@deepeddy.vircio.com>\nMIME-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nMessage-Id: <13258.1030015585@munnari.OZ.AU>\nX-Loop: exmh-workers@spamassassin.taint.org\nSender: exmh-workers-admin@spamassassin.taint.org\nErrors-To: exmh-workers-admin@spamassassin.taint.org\nX-Beenthere: exmh-workers@spamassassin.taint.org\nX-Mailman-Version: 2.0.1\nPrecedence: bulk\nList-Help: <mailto:exmh-workers-request@spamassassin.taint.org?subject=help>\nList-Post: <mailto:exmh-workers@spamassassin.taint.org>\nList-Subscribe: <https://listman.spamassassin.taint.org/mailman/listinfo/exmh-workers>,\n    <mailto:exmh-workers-request@redhat.com?subject=subscribe>\nList-Id: Discussion list for EXMH developers <exmh-workers.spamassassin.taint.org>\nList-Unsubscribe: <https://listman.spamassassin.taint.org/mailman/listinfo/exmh-workers>,\n    <mailto:exmh-workers-request@redhat.com?subject=unsubscribe>\nList-Archive: <https://listman.spamassassin.taint.org/mailman/private/exmh-workers/>\nDate: Thu, 22 Aug 2002 18:26:25 +0700\n\n    Date:        Wed, 21 Aug 2002 10:54:46 -0500\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n  | I can't reproduce this error.\n\nFor me it is very repeatable... 
(like every time, without fail).\n\nThis is the debug log of the pick happening ...\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n18:19:04 Marking 1 hits\n18:19:04 tkerror: syntax error in expression "int ...\n\nNote, if I run the pick command by hand ...\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n1 hit\n\nThat's where the "1 hit" comes from (obviously).  The version of nmh I'm\nusing is ...\n\ndelta$ pick -version\npick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 ICT 2002]\n\nAnd the relevant part of my .mh_profile ...\n\ndelta$ mhparam pick\n-seq sel -list\n\n\nSince the pick command works, the sequence (actually, both of them, the\none that's explicit on the command line, from the search popup, and the\none that comes from .mh_profile) do get created.\n\nkre\n\nps: this is still using the version of the code form a day ago, I haven't\nbeen able to reach the cvs repository today (local routing issue I think).\n\n\n\n_______________________________________________\nExmh-workers mailing list\nExmh-workers@redhat.com\nhttps://listman.redhat.com/mailman/listinfo/exmh-workers\n\n
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         From Steve_Burt@cursor-system.com  Thu Aug 22 12:46:39 2002\nReturn-Path: 
<Steve_Burt@cursor-system.com>\nDelivered-To: zzzz@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id BE12E43C34\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 07:46:38 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:46:38 +0100 (IST)\nReceived: from n20.grp.scd.yahoo.com (n20.grp.scd.yahoo.com\n    [66.218.66.76]) by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id\n    g7MBkTZ05087 for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 12:46:29 +0100\nX-Egroups-Return: sentto-2242572-52726-1030016790-zzzz=spamassassin.taint.org@returns.groups.yahoo.com\nReceived: from [66.218.67.196] by n20.grp.scd.yahoo.com with NNFMP;\n    22 Aug 2002 11:46:30 -0000\nX-Sender: steve.burt@cursor-system.com\nX-Apparently-To: zzzzteana@yahoogroups.com\nReceived: (EGP: mail-8_1_0_1); 22 Aug 2002 11:46:29 -0000\nReceived: (qmail 11764 invoked from network); 22 Aug 2002 11:46:29 -0000\nReceived: from unknown (66.218.66.217) by m3.grp.scd.yahoo.com with QMQP;\n    22 Aug 2002 11:46:29 -0000\nReceived: from unknown (HELO mailgateway.cursor-system.com) (62.189.7.27)\n    by mta2.grp.scd.yahoo.com with SMTP; 22 Aug 2002 11:46:29 -0000\nReceived: from exchange1.cps.local (unverified) by\n    mailgateway.cursor-system.com (Content Technologies SMTPRS 4.2.10) with\n    ESMTP id <T5cde81f695ac1d100407d@mailgateway.cursor-system.com> for\n    <forteana@yahoogroups.com>; Thu, 22 Aug 2002 13:14:10 +0100\nReceived: by exchange1.cps.local with Internet Mail Service (5.5.2653.19)\n    id <PXX6AT23>; Thu, 22 Aug 2002 12:46:27 +0100\nMessage-Id: <5EC2AD6D2314D14FB64BDA287D25D9EF12B4F6@exchange1.cps.local>\nTo: "'zzzzteana@yahoogroups.com'" <zzzzteana@yahoogroups.com>\nX-Mailer: Internet Mail Service (5.5.2653.19)\nX-Egroups-From: Steve Burt <steve.burt@cursor-system.com>\nFrom: Steve Burt 
<Steve_Burt@cursor-system.com>\nX-Yahoo-Profile: pyruse\nMIME-Version: 1.0\nMailing-List: list zzzzteana@yahoogroups.com; contact\n    forteana-owner@yahoogroups.com\nDelivered-To: mailing list zzzzteana@yahoogroups.com\nPrecedence: bulk\nList-Unsubscribe: <mailto:zzzzteana-unsubscribe@yahoogroups.com>\nDate: Thu, 22 Aug 2002 12:46:18 +0100\nSubject: [zzzzteana] RE: Alexander\nReply-To: zzzzteana@yahoogroups.com\nContent-Type: text/plain; charset=US-ASCII\nContent-Transfer-Encoding: 7bit\n\nMartin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ \n\n\n\n
## 3                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  From timc@2ubh.com  Thu Aug 22 13:52:59 2002\nReturn-Path: <timc@2ubh.com>\nDelivered-To: zzzz@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 0314547C66\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 08:52:58 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 13:52:59 +0100 (IST)\nReceived: from n16.grp.scd.yahoo.com (n16.grp.scd.yahoo.com\n    [66.218.66.71]) by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id\n    g7MCrdZ07070 for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 13:53:39 
+0100\nX-Egroups-Return: sentto-2242572-52733-1030020820-zzzz=spamassassin.taint.org@returns.groups.yahoo.com\nReceived: from [66.218.67.198] by n16.grp.scd.yahoo.com with NNFMP;\n    22 Aug 2002 12:53:40 -0000\nX-Sender: timc@2ubh.com\nX-Apparently-To: zzzzteana@yahoogroups.com\nReceived: (EGP: mail-8_1_0_1); 22 Aug 2002 12:53:39 -0000\nReceived: (qmail 76099 invoked from network); 22 Aug 2002 12:53:39 -0000\nReceived: from unknown (66.218.66.218) by m5.grp.scd.yahoo.com with QMQP;\n    22 Aug 2002 12:53:39 -0000\nReceived: from unknown (HELO rhenium.btinternet.com) (194.73.73.93) by\n    mta3.grp.scd.yahoo.com with SMTP; 22 Aug 2002 12:53:39 -0000\nReceived: from host217-36-23-185.in-addr.btopenworld.com ([217.36.23.185])\n    by rhenium.btinternet.com with esmtp (Exim 3.22 #8) id 17hrT0-0004gj-00\n    for forteana@yahoogroups.com; Thu, 22 Aug 2002 13:53:38 +0100\nX-Mailer: Microsoft Outlook Express Macintosh Edition - 4.5 (0410)\nTo: zzzzteana <zzzzteana@yahoogroups.com>\nX-Priority: 3\nMessage-Id: <E17hrT0-0004gj-00@rhenium.btinternet.com>\nFrom: "Tim Chapman" <timc@2ubh.com>\nX-Yahoo-Profile: tim2ubh\nMIME-Version: 1.0\nMailing-List: list zzzzteana@yahoogroups.com; contact\n    forteana-owner@yahoogroups.com\nDelivered-To: mailing list zzzzteana@yahoogroups.com\nPrecedence: bulk\nList-Unsubscribe: <mailto:zzzzteana-unsubscribe@yahoogroups.com>\nDate: Thu, 22 Aug 2002 13:52:38 +0100\nSubject: [zzzzteana] Moscow bomber\nReply-To: zzzzteana@yahoogroups.com\nContent-Type: text/plain; charset=US-ASCII\nContent-Transfer-Encoding: 7bit\n\nMan Threatens Explosion In Moscow \n\nThursday August 22, 2002 1:40 PM\nMOSCOW (AP) - Security officers on Thursday seized an unidentified man who\nsaid he was armed with explosives and threatened to blow up his truck in\nfront of Russia's Federal Security Services headquarters in Moscow, NTV\ntelevision reported.\nThe officers seized an automatic rifle the man was carrying, then the man\ngot out of the truck and was taken into 
custody, NTV said. No other details\nwere immediately available.\nThe man had demanded talks with high government officials, the Interfax and\nITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to\ntalk with Russian President Vladimir Putin.\nPolice and security forces rushed to the Security Service building, within\nblocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the\nman, who claimed to have one and a half tons of explosives, the news\nagencies said. Negotiations continued for about one and a half hours outside\nthe building, ITAR-Tass and Interfax reported, citing witnesses.\nThe man later drove away from the building, under police escort, and drove\nto a street near Moscow's Olympic Penta Hotel, where authorities held\nfurther negotiations with him, the Moscow police press service said. The\nmove appeared to be an attempt by security services to get him to a more\nsecure location. \n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ \n\n\n\n
##   Spam
## 1    0
## 2    0
## 3    0
# Read each spam file into a single string and collect the results in a data
# frame with one row per file.
# vapply() always returns a character vector, including a zero-length one when
# `spam_files` is empty, so an empty directory produces a zero-row data frame
# instead of failing on the column rename / label assignment.
spam_text <- vapply(spam_files, read_file, character(1), USE.NAMES = FALSE)
spam <- data.frame(
  Text = spam_text,
  # Class label: 1 marks spam. rep() keeps the column length equal to the
  # number of files, which also covers the zero-file case.
  Spam = rep(1, length(spam_text)),
  # Keep the message text as character on R versions where data.frame()
  # would otherwise convert strings to factors.
  stringsAsFactors = FALSE
)
head(spam, 3)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                                                                                     From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD\n\tfor <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 (IST)\nReceived: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for\n    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100\nReceived: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org\n    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100\nReceived: from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for\n    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100\nX-Authentication-Warning: lugh.tuatha.org: Host w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142] claimed to be bettyjagessar.com\nReceived: from 64.0.57.142 [202.63.165.34] by bettyjagessar.com\n    (SMTPD32-7.06 EVAL) id A42A7FC01F2; Fri, 02 Aug 2002 02:18:18 -0400\nMessage-Id: <1028311679.886@0.57.142>\nDate: Fri, 02 Aug 2002 23:37:59 0530\nTo: ilug@linux.ie\nFrom: "Start Now" <startnow2002@hotmail.com>\nMIME-Version: 1.0\nContent-Type: text/plain; charset="US-ASCII"; format=flowed\nSubject: [ILUG] STOP THE MLM INSANITY\nSender: ilug-admin@linux.ie\nErrors-To: ilug-admin@linux.ie\nX-Mailman-Version: 1.1\nPrecedence: bulk\nList-Id: Irish Linux 
Users' Group <ilug.linux.ie>\nX-Beenthere: ilug@linux.ie\n\nGreetings!\n\nYou are receiving this letter because you have expressed an interest in \nreceiving information about online business opportunities. If this is \nerroneous then please accept my most sincere apology. This is a one-time \nmailing, so no removal is necessary.\n\nIf you've been burned, betrayed, and back-stabbed by multi-level marketing, \nMLM, then please read this letter. It could be the most important one that \nhas ever landed in your Inbox.\n\nMULTI-LEVEL MARKETING IS A HUGE MISTAKE FOR MOST PEOPLE\n\nMLM has failed to deliver on its promises for the past 50 years. The pursuit \nof the "MLM Dream" has cost hundreds of thousands of people their friends, \ntheir fortunes and their sacred honor. The fact is that MLM is fatally \nflawed, meaning that it CANNOT work for most people.\n\nThe companies and the few who earn the big money in MLM are NOT going to \ntell you the real story. FINALLY, there is someone who has the courage to \ncut through the hype and lies and tell the TRUTH about MLM.\n\nHERE'S GOOD NEWS\n\nThere IS an alternative to MLM that WORKS, and works BIG! If you haven't yet \nabandoned your dreams, then you need to see this. Earning the kind of income \nyou've dreamed about is easier than you think!\n\nWith your permission, I'd like to send you a brief letter that will tell you \nWHY MLM doesn't work for most people and will then introduce you to \nsomething so new and refreshing that you'll wonder why you haven't heard of \nthis before.\n\nI promise that there will be NO unwanted follow up, NO sales pitch, no one \nwill call you, and your email address will only be used to send you the \ninformation. Period.\n\nTo receive this free, life-changing information, simply click Reply, type \n"Send Info" in the Subject box and hit Send. I'll get the information to you \nwithin 24 hours. Just look for the words MLM WALL OF SHAME in your Inbox.\n\nCordially,\n\nSiddhi\n\nP.S. 
Someone recently sent the letter to me and it has been the most \neye-opening, financially beneficial information I have ever received. I \nhonestly believe that you will feel the same way once you've read it. And \nit's FREE!\n\n\n------------------------------------------------------------\nThis email is NEVER sent unsolicited.  THIS IS NOT "SPAM". You are receiving \nthis email because you EXPLICITLY signed yourself up to our list with our \nonline signup form or through use of our FFA Links Page and E-MailDOM \nsystems, which have EXPLICIT terms of use which state that through its use \nyou agree to receive our emailings.  You may also be a member of a Altra \nComputer Systems list or one of many numerous FREE Marketing Services and as \nsuch you agreed when you signed up for such list that you would also be \nreceiving this emailing.\nDue to the above, this email message cannot be considered unsolicitated, or \nspam.\n-----------------------------------------------------------\n\n\n\n\n-- \nIrish Linux Users' Group: ilug@linux.ie\nhttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\nList maintainer: listmaster@linux.ie\n\n\n
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   From lmrn@mailexcite.com  Mon Jun 24 17:03:24 2002\nReturn-Path: merchantsworld2001@juno.com\nDelivery-Date: Mon May 13 04:46:13 2002\nReceived: from mandark.labs.netnoteinc.com ([213.105.180.140]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g4D3kCe15097 for\n    <jm@jmason.org>; Mon, 13 May 2002 04:46:12 +0100\nReceived: from 203.129.205.5.205.129.203.in-addr.arpa ([203.129.205.5]) by\n    mandark.labs.netnoteinc.com (8.11.2/8.11.2) with SMTP id g4D3k2D12605 for\n    <jm@netnoteinc.com>; Mon, 13 May 2002 04:46:04 +0100\nReceived: from html (unverified [207.95.174.49]) by\n    203.129.205.5.205.129.203.in-addr.arpa (EMWAC SMTPRS 0.83) with SMTP id\n    <B0000178595@203.129.205.5.205.129.203.in-addr.arpa>; Mon, 13 May 2002\n    09:04:46 +0530\nMessage-Id: <B0000178595@203.129.205.5.205.129.203.in-addr.arpa>\nFrom: lmrn@mailexcite.com\nTo: ranmoore@cybertime.net\nSubject: Real Protection, Stun Guns!  Free Shipping! 
Time:2:01:35 PM\nDate: Mon, 28 Jul 1980 14:01:35\nMIME-Version: 1.0\nX-Keywords: \nContent-Type: text/html; charset="DEFAULT"\n\n<html>\n<body>\n<center>\n<h3>\n<font color="blue">\n<b>\nThe Need For Safety Is Real In 2002, You Might Only Get One Chance - Be Ready!\n<p>\nFree Shipping & Handling Within The (USA) If You Order Before May 25, 2002! \n<p>\n3 Day Super Sale, Now Until May 7, 2002!  Save Up To $30.00 On Some Items!\n\n</b>\n</font>\n</h3>\n</center>\n<p>\nIT'S GETTING TO BE SPRING AGAIN, PROTECT YOURSELF AS YOU WALK,<br>\nJOG AND EXERCISE OUTSIDE.  ALSO PROTECT YOUR LOVED ONES AS<br>\nTHEY RETURN HOME FROM COLLEGE!<br>\n<p>\n*     LEGAL PROTECTION FOR COLLEGE STUDENTS!<br>\n*     GREAT UP'COMING OUTDOOR PROTECTION GIFTS!<br>\n*     THERE IS NOTHING WORTH MORE PROTECTING THAN LIFE!<br>\n*     OUR STUN DEVICES & PEPPER PRODUCTS ARE LEGAL PROTECTION!\n<p>\n<b>\n<font color="red">\nJOIN THE WAR ON CRIME!\n</b>\n</font>\n<p>\n\nSTUN GUNS AND BATONS \n<p>\nEFFECTIVE - SAFE - NONLETHAL\n<p>\nPROTECT YOUR LOVED ONES AND YOURSELF\n<p>\nNo matter who you are, no matter what City or Town you live in,<br>\nif you live in America, you will be touched by crime.\n<p>\nYou hear about it on TV.  You read about it in the newspaper.<br>\nIt's no secret that crime is a major problem in the U.S. today.<br>\nCriminals are finding it easier to commit crimes all the time.\n<p>\nWeapons are readily available.  Our cities' police forces have<br>\nmore work than they can handle.  Even if these criminal are<br>\ncaught, they won't be spending long in our nation's overcrowded<br>\njails.  And while lawmakers are well aware of the crime problem,<br>\nthey don't seem to have any effective answers.\n<p>\nOur Email Address:  <a\nhref="mailto:Merchants4all@aol.com">Merchants4all@aol.com</a>\n<p>\nINTERESTED:\n<p>\nYou will be protecting yourself within 7 days!  
Don't Wait,<br>\nvisit our web page below, and join The War On Crime!\n<p>\n*****************<br>\n<a\nhref="http://www.geocities.com/realprotection_20022003/">http://www.geocities.com/realprotection_20022003/</a><br>\n*****************\n<p>\nWell, there is an effective answer.  Take responsibility for<br>\nyour own security.  Our site has a variety of quality personal<br>\nsecurity products.  Visit our site, choose the personal security<br>\nproducts that are right for you.  Use them, and join the war on\ncrime!\n<p>\nFREE PEPPER SPRAY WITH ANY STUN UNIT PURCHASE.<br>\n(A Value of $15.95)\n<p>\nWe Ship Orders Within 5 To 7 Days, To Every State In The U.S.A.<br>\nby UPS, FEDEX, or U.S. POSTAL SERVICE.  Visa, MasterCard, American<br>\nExpress & Debt Card Gladly Accepted.\n<p>\nAsk yourself this question, if you don't help your loved ones,\nwho will?\n<p>\nINTERESTED:\n<p>\n*****************<br>\n<a\nhref="http://www.geocities.com/realprotection_20022003/">http://www.geocities.com/realprotection_20022003/</a><br>\n*****************\n<p>\n___The Stun Monster 625,000 Volts ($86.95)<br>\n___The Z-Force Slim Style 300,000 Volts ($64.95)<br>\n___The StunMaster 300,000 Volts Straight ($59.95)<br>\n___The StunMaster 300,000 Volts Curb ($59.95)<br>\n___The StunMaster 200,000 Volts Straight ($49.95)<br>\n___The StunMaster 200,000 Volts Curb ($49.95)<br>\n___The StunBaton 500,000 Volts ($89.95)<br>\n___The StunBaton 300,000 Volts ($79.95)<br>\n___Pen Knife (One $12.50, Two Or More $9.00)<br>\n___Wildfire Pepper Spray  (One $15.95, Two Or More $11.75)\n<p>\n___Add $5.75 For Shipping & Handling Charge.\n<p>\n\nTo Order by postal mail, please send to the below address.<br>\nMake payable to Mega Safety Technology.\n<p>\nMega Safety Technology<br>\n3215 Merrimac Ave.<br>\nDayton, Ohio  45405<br>\nOur Email Address:  <a\nhref="mailto:Merchants4all@aol.com">Merchants4all@aol.com</a>\n<p>\nOrder by 24 Hour Fax!!!  
775-257-6657.\n<p>\n*****<br>\n<b><font color="red">Important Credit Card Information! Please Read Below!</b></font>\n <br><br>\n*     Credit Card Address, City, State and Zip Code, must match\n      billing address to be processed. \n<br><br>\n\nCHECK____  MONEYORDER____  VISA____ MASTERCARD____ AmericanExpress___\nDebt Card___\n<br><br>\nName_______________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\nAddress____________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCity,State,Zip(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________<br>\n(Credit Card Number)\n<br><br>\nExpiration Month_____  Year_____\n<br><br>\n___________________________________________________<br>\nAuthorized Signature\n<br><br>\n<b>\n*****IMPORTANT NOTE*****\n</b>\n<br><br>\nIf Shipping Address Is Different From The Billing Address Above,\nPlease Fill Out Information Below.\n<br><br>\nShipping Name______________________________________________\n<br><br>\nShipping Address___________________________________________\n<br><br>\n___________________________________________________________<br>\nShipping City,State,Zip\n<br><br>\n___________________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________________<br>\nEmail Address & Phone Number(Please Write Neat)\n</body>\n</html>\n
## 3 From amknight@mailexcite.com  Mon Jun 24 17:03:49 2002\nReturn-Path: merchantsworld2001@juno.com\nDelivery-Date: Wed May 15 08:58:23 2002\nReceived: from mandark.labs.netnoteinc.com ([213.105.180.140]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g4F7wIe23864 for\n    <jm@jmason.org>; Wed, 15 May 2002 08:58:18 +0100\nReceived: from webcust2.hightowertech.com (webcust2.hightowertech.com\n    [216.41.166.100]) by mandark.labs.netnoteinc.com (8.11.2/8.11.2) with\n    ESMTP id g4F7wGD24120 for <jm@netnoteinc.com>; Wed, 15 May 2002 08:58:17\n    +0100\nReceived: from html ([206.216.197.214]) by webcust2.hightowertech.com with\n    Microsoft SMTPSVC(5.5.1877.197.19); Wed, 15 May 2002 00:55:53 -0700\nFrom: amknight@mailexcite.com\nTo: cbmark@cbmark.com\nSubject: New Improved Fat Burners, Now With TV Fat Absorbers! Time:6:25:49 PM\nDate: Wed, 30 Jul 1980 18:25:49\nMIME-Version: 1.0\nMessage-Id: <0845b5355070f52WEBCUST2@webcust2.hightowertech.com>\nX-Keywords: \nContent-Type: text/html; charset="DEFAULT"\n\n<html>\n<body>\n<center>\n<b>\n<font color="blue">\n*****Bonus Fat Absorbers As Seen On TV, Included Free With Purchase Of 2 Or More Bottle, $24.95 Value*****\n</font>\n<br>\n<br>\n***TAKE $10.00 OFF 2 & 3 MONTH SUPPLY ORDERS, $5.00 OFF 1 MONTH SUPPLY!\n***AND STILL GET YOUR BONUS!  PRICE WILL BE DEDUCTED DURING PROCESSING.\n<br>\n<br>\n***FAT ABSORBERS ARE GREAT FOR THOSE WHO WANT TO LOSE WEIGHT,  BUT CAN'T STAY ON A DIET***\n<br>\n<br>\n***OFFER GOOD UNTIL MAY 27, 2002!  FOREIGN ORDERS INCLUDED!\n<br>\n<br>\n\n<font color="blue">\n\nLOSE 30 POUNDS  IN 30 DAYS... 
GUARANTEED!!!\n<br>\n<br>\n\nAll Natural Weight-Loss Program, Speeds Up The Metabolism Safely\nRated #1 In Both Categories of SAFETY & EFFECTIVENESS In<br>\n(THE United States Today)\n<br><br>\nWE'LL HELP YOU GET THINNER!\nWE'RE GOING TO HELP YOU LOOK GOOD, FEEL GOOD AND TAKE CONTROL IN\n2002\n<br>\n<br>\n</b>\n</font color="blue">\n</center>\n\nWhy Use Our Amazing Weight Loss Capsules?\n<br><br>\n*  They act like a natural magnet to attract fat.<br>\n*  Stimulates the body's natural metabolism. <br>\n*  Controls appetite naturally and makes it easier to\n   eat the right foods consistently.<br>\n*  Reduces craving for sweets.<br>\n*  Aids in the absorption of fat and in overall digestion.<br>\n*  Inhibits bad cholesterol and boosts good cholesterol.<br>\n*  Aids in the process of weight loss and long-term weight management.<br>\n*  Completely safe, UltraTrim New Century contains no banned\n   substances and has no known side effects.<br>\n<br>\nWhat Makes UltraTrim New Century Unique?\n<br><br>\nA scientifically designed combination of natural ingredients that\nprovide long-term weight management in a safe and effective manner.\n<br><br>\n*****<br>\nReceive A Bonus Supply Of Ultra Trim New Century & A Bottle Of Fat Absorbers Listed Above, \nWith Every Order Of 2 Or More Bottles. Offer Good Until May. 27, 2002! <br>\n*****\n<br><br>\nWE GLADLY SHIP TO ALL FOREIGN COUNTRIES! \n<br><br>\nYou will be losing by tomorrow!  Don't Wait, visit our web\npage below, and order now!\n<br><br>\nEmail Address:   <a\nhref="mailto:ultratrimnow2001@aol.com">ultratrimnow2001@aol.com</a>\n<br><br>\nOrder by 24 Hour Fax!!!  775-257-6657.<br>\n<br>\n*****************<br>\n<a\nhref="http://www.geocities.com/ultra_weightloss_2002/">http://www.geocities.com/ultra_weightloss_2002/</a><br>\n*****************\n<br><br>\nThis is the easiest, fastest, and most effective way to lose both\npounds and inches permanently!!!  
This weight loss program is\ndesigned specifically to "boost" weight-loss efforts by assisting\nbody metabolism, and helping the body's ability to manage weight.\nA powerful, safe, 30 Day Program.  This is one program you won't\nfeel starved on.  Complete program for one amazing low price!\nProgram includes: <b>BONUS AMAZING FAT ABSORBER CAPSULES, 30 DAY -\nWEIGHT\nREDUCTION PLAN, PROGRESS REPORT!</b>\n<br><br>\nSPECIAL BONUS..."FAT ABSORBERS", AS SEEN ON TV\nWith every order...AMAZING MELT AWAY FAT ABSORBER CAPSULES with\ndirections ( Absolutely Free ) ...With these capsules\nyou can eat what you enjoy, without the worry of fat in your diet.\n2 to 3 capsules 15 minutes before eating or snack, and the fat will be\nabsorbed and passed through the body without the digestion of fat into\nthe body. \n<br><br>\nYou will be losing by tomorrow!  Don't Wait, visit our web\npage below, and order now!\n<br><br>\nEmail Address:  <a href="mailto:ultratrimnow2001@aol.com">ultratrimnow2001@aol.com</a>\n<br><br>\n\nOrder by 24 Hour Fax!!!  775-257-6657.<br>\n<br>\n*****************<br>\n<a\nhref="http://www.geocities.com/ultra_weightloss_2002/">http://www.geocities.com/ultra_weightloss_2002/</a><br>\n*****************\n<br><br>\n___1 Month Supply $32.95 plus $4.75 S & H, 100 Amazing MegaTrim\n     Capsules.\n<br><br>\n___2 Month Supply $54.95 plus $4.75 S & H, 200 Amazing MegaTrim\n     Capsules.  (A $10.95 Savings, Free Bottle)!\n<br><br>\n___3 Month Supply $69.95,  Plus $4.75 S & H, 300 Amazing MegaTrim\n     Capsules.  (A $28.90 Savings, Free Bottle)!\n<br><br>\nTo Order by postal mail, please send to the below address.\nMake payable to UltraTrim 2002.\n<br><br>\nUltra Trim 2002<br>\n4132 Pompton Ct.<br>\nDayton, Ohio  45405<br>\n(937) 567-9807<br>\n<br>\nOrder by 24 Hour Voice/Fax!!!  775-257-6657.<br>\n<br>\n*****<br>\n<b><font color="red">Important Credit Card Information! 
Please Read Below!</b></font>\n <br><br>\n*     Credit Card Address, City, State and Zip Code, must match\n      billing address to be processed. \n<br><br>\n\n___Check<br>\n___MoneyOrder<br>\n___Visa<br>\n___MasterCard<br>\n___AmericanExpress<br>\n___Debt Card\n<br><br>\nName_______________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\nAddress____________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCity,State,Zip(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________<br>\n(Credit Card Number)\n<br><br>\nExpiration Month_____  Year_____\n<br><br>\n___________________________________________________<br>\nAuthorized Signature\n<br><br>\n<b>\n*****IMPORTANT NOTE*****\n</b>\n<br><br>\nIf Shipping Address Is Different From The Billing Address Above,\nPlease Fill Out Information Below.\n<br><br>\nShipping Name______________________________________________\n<br><br>\nShipping Address___________________________________________\n<br><br>\n___________________________________________________________<br>\nShipping City,State,Zip\n<br><br>\n___________________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________________<br>\nEmail Address & Phone Number(Please Write Neat)\n<br>\n<br>\n<center>\n<a\nhref="mailto:ultratrim2002dontsend@yahoo.com">To Be Removed From Our Mail List, Click Here And Put The Word Remove In The Subject Line.</a>\n</center>\n<br>\n<br>\n</body>\n</html>\n
##   Spam
## 1    1
## 2    1
## 3    1

Convert the text to UTF-8 first, because one of the emails otherwise throws an encoding error.

# Normalise the raw message text before building the corpora: coerce to
# character, convert to UTF-8 with enc2utf8(), then pass through iconv().
ham[["Text"]] <- iconv(enc2utf8(as.character(ham[["Text"]])))
spam[["Text"]] <- iconv(enc2utf8(as.character(spam[["Text"]])))
# One VCorpus per class, each tagged under "type" with its label
# (0 for ham, 1 for spam).
ham_c <- VCorpus(VectorSource(ham[["Text"]]))
meta(ham_c, tag = "type") <- 0
spam_c <- VCorpus(VectorSource(spam[["Text"]]))
meta(spam_c, tag = "type") <- 1
# Concatenate the two corpora, ham documents first.
corpus <- c(ham_c, spam_c)

Create a single dataframe. Create the corpus with the following pre-processing steps 1) Build a new corpus variable called corpus. 2) Using tm_map, convert the text to lowercase. 3) Using tm_map, remove all punctuation from the corpus. 4) Using tm_map, remove all English stopwords from the corpus. 5) Using tm_map, stem the words in the corpus. 6) Build a document term matrix from the corpus, called dtm.

# Text clean-up, applied to every document in `corpus` in this order:
# lowercase -> strip punctuation -> drop English stopwords -> stem.
# tolower() is a plain character function, so it is wrapped in
# content_transformer(); tm_map() then keeps returning text documents and
# the PlainTextDocument() repair step (which discarded the document ids)
# is no longer needed.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stemDocument)

Now we can make a document term matrix and remove the sparse terms.

# Document-term matrix of the cleaned corpus, then a reduced copy with
# sparse terms removed at the 0.95 threshold.
dtm <- DocumentTermMatrix(corpus)
spdtm <- removeSparseTerms(dtm, sparse = 0.95)
spdtm
## <<DocumentTermMatrix (documents: 3898, terms: 455)>>
## Non-/sparse entries: 272932/1500658
## Sparsity           : 85%
## Maximal term length: 49
## Weighting          : term frequency (tf)

At this point, we can look at the word frequencies to get an idea of which words are most common.

# Total count of each retained term across all documents, largest first.
# Logical arguments are spelled TRUE/FALSE: T and F are ordinary variables
# that can be reassigned, whereas TRUE and FALSE are reserved words.
dtm2 <- as.matrix(spdtm)
frequency <- sort(colSums(dtm2), decreasing = TRUE)
# Render the 15 most frequent terms as a striped HTML table with the
# first column in bold.
table_freq <- head(frequency, n = 15)
kable(table_freq, "html", escape = FALSE) %>%
  kable_styling("striped", full_width = TRUE) %>%
  column_spec(1, bold = TRUE)
x
2002 27480
receiv 20049
esmtp 10735
sep 9409
0100 9016
localhost 8612
127001 7455
mon 5548
postfix 5249
aug 5224
thu 4920
wed 4906
jmlocalhost 4895
oct 4874
date 4598
# Term/count table used as the plotting data.
wf <- data.frame(word = names(frequency), frequency = frequency)
# Bar chart of the terms counted more than 2000 times, ordered from the
# most to the least frequent, with the x-axis labels rotated 90 degrees.
p <- ggplot(
  subset(wf, frequency > 2000),
  aes(x = reorder(word, -frequency), y = frequency)
) +
  geom_bar(stat = "identity", fill = "#35a2c4") +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1),
    panel.background = element_rect(fill = "#adc8d1")
  )
p

Create a dataframe from the document term matrix and add the labels.

# Dense data frame built from the reduced document-term matrix.
sparse_df <- as.data.frame(as.matrix(spdtm))
# Flatten the corpus metadata into a plain vector and attach it as the
# LABEL column.
meta_type <- as.vector(unlist(meta(corpus)))
sparse_df[["LABEL"]] <- meta_type
head(sparse_df)
##                0000 0100 0200 0400 0500 0700 0800 100 1000 103113 127001
## character.0.      0    2    0    6    1    3    0   0    0      0      4
## character.0..1    5    5    0    1    0    0    0   0    0      0      2
## character.0..2    5    4    0    1    0    0    0   0    0      0      2
## character.0..3    1    2    0    5    0    0    0   0    0      0      3
## character.0..4    5    4    0    1    0    0    0   0    0      0      2
## character.0..5    7    4    0    1    0    0    0   0    0      0      2
##                193120211219 19317254 2001 2002 201 2011 209sfnet 213105180140
## character.0.              0        0    0   14   1    0        0            0
## character.0..1            0        0    0   12   0    0        0            0
## character.0..2            0        0    0   12   0    0        0            0
## character.0..3            0        0    0   11   0    0        0            0
## character.0..4            0        0    0   11   0    0        0            0
## character.0..5            0        0    0   11   0    0        0            0
##                216136171252 331vamm2 6416122236 7bit 81128112 81168116
## character.0.              0        0          0    0        0        6
## character.0..1            0        0          0    1        0        1
## character.0..2            0        0          0    1        0        1
## character.0..3            0        0          0    0        0        4
## character.0..4            0        0          0    1        0        1
## character.0..5            0        0          0    1        0        2
##                81168116egwn 893893 8bit abl access actual add address allow
## character.0.              0      0    0   1      0      1   0       0     0
## character.0..1            0      0    0   0      0      0   0       0     0
## character.0..2            0      0    0   0      0      0   0       0     0
## character.0..3            0      0    0   0      0      0   0       0     0
## character.0..4            0      0    0   0      0      0   0       0     0
## character.0..5            0      0    0   0      0      0   0       0     0
##                alreadi also alway anoth anyon anyth around ask aug
## character.0.         0    0     0     0     0     0      0   0  13
## character.0..1       0    0     0     0     0     0      0   0  12
## character.0..2       0    0     0     0     0     0      0   0  11
## character.0..3       2    0     0     0     0     0      0   0   9
## character.0..4       0    0     0     0     0     0      0   1  11
## character.0..5       0    0     0     0     0     0      0   1  13
##                auth02nlegwnnet avail back base begin best better bill bodi
## character.0.                 0     0    0    0     0    0      0    0    0
## character.0..1               0     0    0    0     0    0      0    0    0
## character.0..2               0     1    0    0     0    0      0    0    0
## character.0..3               0     0    0    0     0    0      0    0    0
## character.0..4               0     0    0    0     0    0      0    0    0
## character.0..5               0     0    0    0     0    0      0    0    1
##                border3d0 build bulk busi bythinkgeek call can cant case cdt
## character.0.           0     0    1    0           0    0   0    1    0   0
## character.0..1         0     0    1    0           0    0   0    0    0   0
## character.0..2         0     3    1    0           0    0   0    0    0   0
## character.0..3         0     0    1    0           0    0   1    0    0   0
## character.0..4         0     0    1    0           0    0   0    0    0   0
## character.0..5         0     0    1    0           0    0   0    0    0   0
##                cellspacing3d0 center chang charsetiso88591 charsetusascii check
## character.0.                0      0     0               0              1     0
## character.0..1              0      0     0               0              1     0
## character.0..2              0      0     0               0              1     0
## character.0..3              0      0     0               0              1     0
## character.0..4              0      0     0               0              1     0
## character.0..5              0      0     0               0              1     0
##                claim clean click code come communic compani complet comput
## character.0.       0     0     0    1    2        0       0       0      0
## character.0..1     0     0     0    0    0        0       0       0      0
## character.0..2     1     0     0    0    0        0       0       0      0
## character.0..3     0     0     0    0    0        0       0       0      0
## character.0..4     0     0     0    0    0        0       0       0      0
## character.0..5     0     0     0    0    0        0       0       0      0
##                contact contentdisposit contenttransferencod contenttyp cost
## character.0.         0               0                    0          1    0
## character.0..1       1               0                    1          1    0
## character.0..2       1               0                    1          1    0
## character.0..3       0               0                    0          1    0
## character.0..4       1               0                    1          1    0
## character.0..5       1               0                    1          1    0
##                cours creat credit current custom date day debian deliveredto
## character.0.       0     1      0       0      0    2   1      0           2
## character.0..1     0     0      0       0      0    1   0      0           2
## character.0..2     0     0      0       0      0    1   0      0           2
## character.0..3     0     0      0       0      0    1   0      0           1
## character.0..4     0     0      0       0      0    1   0      0           2
## character.0..5     0     0      0       0      0    1   0      0           2
##                deliveryd develop differ direct discuss div doesnt
## character.0.           0       1      0      0       1   0      0
## character.0..1         0       0      0      0       0   0      0
## character.0..2         0       0      0      0       0   0      0
## character.0..3         0       0      0      0       0   0      0
## character.0..4         0       0      0      0       0   0      0
## character.0..5         0       0      0      0       0   0      0
##                dogmaslashnullorg done dont easi edt els email encodingutf8 end
## character.0.                   1    0    0    0   3   0     0            0   0
## character.0..1                 1    0    0    0   1   0     1            0   0
## character.0..2                 1    0    0    0   1   0     1            0   0
## character.0..3                 1    0    0    0   1   0     1            0   0
## character.0..4                 1    0    0    0   1   0     1            0   0
## character.0..5                 1    0    0    0   1   0     1            0   0
##                error errorsto esmtp even ever everi exchang exim experi express
## character.0.       2        1     6    0    0     1       0    0      0       1
## character.0..1     0        0     2    0    0     0       0    0      0       0
## character.0..2     0        0     2    0    0     0       0    1      0       1
## character.0..3     0        1     6    0    3     1       0    0      0       0
## character.0..4     0        0     2    0    0     0       0    0      0       0
## character.0..5     0        0     2    0    0     0       0    0      0       0
##                face3dari fact feel fetchmail590 file fill find first follow
## character.0.           0    0    0            1    0    0    0     0      0
## character.0..1         0    0    0            1    0    0    0     0      0
## character.0..2         0    0    0            1    0    0    0     0      0
## character.0..3         0    0    0            1    0    0    0     0      0
## character.0..4         0    0    0            1    0    0    0     0      0
## character.0..5         0    0    0            1    0    0    0     0      0
##                font forg fork forkadminxentcom forkspamassassintaintorg
## character.0.      0    0    0                0                        0
## character.0..1    0    0    0                0                        0
## character.0..2    0    0    0                0                        0
## character.0..3    0    1    0                0                        0
## character.0..4    0    0    0                0                        0
## character.0..5    0    0    0                0                        0
##                forkxentcom form format formatflow found free freshrpm fri
## character.0.             0    1      0          0     0    0        0   0
## character.0..1           0    0      0          0     0    1        0   0
## character.0..2           0    0      0          0     0    1        0   0
## character.0..3           0    0      0          0     0    0        0   0
## character.0..4           0    0      0          0     0    1        0   0
## character.0..5           0    0      0          0     0    1        0   0
##                friend full futur geek get give gmt good got great group
## character.0.        0    0     0    0   1    0   0    0   0     0     0
## character.0..1      0    0     0    0   0    0   0    0   0     0     3
## character.0..2      0    0     0    0   1    0   0    0   1     0     3
## character.0..3      0    0     0    0   0    0   0    0   0     0     0
## character.0..4      0    0     0    0   0    0   0    0   0     0     3
## character.0..5      0    0     0    0   0    0   0    0   0     0     3
##                guarante happen head heaven helo helouswsflist1sourceforgenet
## character.0.          0      1    0      0    0                            0
## character.0..1        0      0    0      0    1                            0
## character.0..2        0      0    0      0    1                            0
## character.0..3        0      0    0      0    0                            0
## character.0..4        0      0    0      0    1                            0
## character.0..5        0      0    0      0    1                            0
##                help helvetica high home host hour howev html
## character.0.      0         0    0    0    0    0     0    0
## character.0..1    0         0    1    0    0    0     0    0
## character.0..2    0         0    1    0    0    1     0    0
## character.0..3    0         0    0    1    0    0     0    0
## character.0..4    0         0    0    0    0    0     0    0
## character.0..5    0         0    0    0    0    0     0    0
##                httplistsfreshrpmsnetmailmanlistinforpmlist
## character.0.                                             0
## character.0..1                                           0
## character.0..2                                           0
## character.0..3                                           0
## character.0..4                                           0
## character.0..5                                           0
##                httplistsfreshrpmsnetmailmanlistinforpmzzzlist
## character.0.                                                0
## character.0..1                                              0
## character.0..2                                              0
## character.0..3                                              0
## character.0..4                                              0
## character.0..5                                              0
##                httplistsfreshrpmsnetpipermailrpmzzzlist httpthinkgeekcomsf
## character.0.                                          0                  0
## character.0..1                                        0                  0
## character.0..2                                        0                  0
## character.0..3                                        0                  0
## character.0..4                                        0                  0
## character.0..5                                        0                  0
##                httpxentcommailmanlistinfofork httpxentcompipermailfork idea ill
## character.0.                                0                        0    0   0
## character.0..1                              0                        0    0   0
## character.0..2                              0                        0    0   0
## character.0..3                              0                        0    0   0
## character.0..4                              0                        0    0   0
## character.0..5                              0                        0    0   0
##                imap img import includ increas inform inreplyto instal instead
## character.0.      1   0      0      0       0      0         1      0       0
## character.0..1    1   0      0      0       0      0         0      0       0
## character.0..2    1   0      0      0       0      0         0      0       0
## character.0..3    1   0      0      0       0      0         0      0       0
## character.0..4    1   0      0      0       0      0         0      0       1
## character.0..5    1   0      0      0       0      0         1      0       1
##                interest internet invok issu ist ive jalapeno jmasonorg
## character.0.          0        0     0    1   1   0        0         0
## character.0..1        0        2     1    0   1   0        0         0
## character.0..2        0        0     1    0   1   0        0         0
## character.0..3        0        0     0    1   1   0        0         0
## character.0..4        0        0     1    0   1   2        0         0
## character.0..5        0        0     1    0   1   0        0         0
##                jmjmasonorg jmlocalhost jmnetnoteinccom jmrpmjmasonorg jul jun
## character.0.             0           0               0              0   0   0
## character.0..1           0           0               0              0   0   0
## character.0..2           0           0               0              0   0   0
## character.0..3           0           0               0              0   0   0
## character.0..4           0           0               0              0   0   0
## character.0..5           0           0               0              0   0   0
##                just keep khare know lairxentcom last least less let life like
## character.0.      0    0     0    0           0    0     0    0   0    0    1
## character.0..1    0    0     0    0           0    0     0    0   0    0    0
## character.0..2    0    0     0    0           0    0     0    0   0    0    0
## character.0..3    0    0     0    0           0    1     0    0   0    0    0
## character.0..4    1    0     0    0           0    0     0    0   0    0    1
## character.0..5    2    0     0    0           0    0     0    0   0    0    1
##                line link linux list listarch listhelp listid listpost
## character.0.      1    0     0    6        1        1      1        1
## character.0..1    0    0     0    2        0        0      0        0
## character.0..2    0    0     0    2        0        0      0        0
## character.0..3    0    0     0    2        1        1      1        1
## character.0..4    0    0     0    2        0        0      0        0
## character.0..5    0    0     0    2        0        0      0        0
##                listsubscrib listunsubscrib live local localhost
## character.0.              1              1    0     1         4
## character.0..1            0              1    0     0         3
## character.0..2            0              1    0     0         3
## character.0..3            1              1    0     0         3
## character.0..4            0              1    0     0         3
## character.0..5            0              1    0     0         3
##                localhostlocaldomain long look lot low made mail
## character.0.                      1    0    0   0   0    0    1
## character.0..1                    0    0    0   0   0    0    3
## character.0..2                    0    0    0   0   0    0    1
## character.0..3                    1    0    0   0   0    0    2
## character.0..4                    0    0    0   1   1    0    1
## character.0..5                    0    0    0   0   0    0    3
##                mailtoforkrequestxentcomsubjecthelp
## character.0.                                     0
## character.0..1                                   0
## character.0..2                                   0
## character.0..3                                   0
## character.0..4                                   0
## character.0..5                                   0
##                mailtoforkrequestxentcomsubjectsubscrib
## character.0.                                         0
## character.0..1                                       0
## character.0..2                                       0
## character.0..3                                       0
## character.0..4                                       0
## character.0..5                                       0
##                mailtoforkrequestxentcomsubjectunsubscrib
## character.0.                                           0
## character.0..1                                         0
## character.0..2                                         0
## character.0..3                                         0
## character.0..4                                         0
## character.0..5                                         0
##                mailtoforkspamassassintaintorg
## character.0.                                0
## character.0..1                              0
## character.0..2                              0
## character.0..3                              0
## character.0..4                              0
## character.0..5                              0
##                mailtorpmlistrequestfreshrpmsnetsubjectsubscrib
## character.0.                                                 0
## character.0..1                                               0
## character.0..2                                               0
## character.0..3                                               0
## character.0..4                                               0
## character.0..5                                               0
##                mailtorpmlistrequestfreshrpmsnetsubjectunsubscrib
## character.0.                                                   0
## character.0..1                                                 0
## character.0..2                                                 0
## character.0..3                                                 0
## character.0..4                                                 0
## character.0..5                                                 0
##                mailtorpmzzzlistfreshrpmsnet
## character.0.                              0
## character.0..1                            0
## character.0..2                            0
## character.0..3                            0
## character.0..4                            0
## character.0..5                            0
##                mailtorpmzzzlistrequestfreshrpmsnetsubjecthelp mailwebnotenet
## character.0.                                                0              0
## character.0..1                                              0              0
## character.0..2                                              0              0
## character.0..3                                              0              0
## character.0..4                                              0              0
## character.0..5                                              0              0
##                make manag mandarklabsnetnoteinccom mani market may mean messag
## character.0.      0     0                        0    0      0   0    0      0
## character.0..1    0     0                        0    0      0   0    0      0
## character.0..2    0     0                        0    0      0   0    0      0
## character.0..3    1     0                        0    0      0   2    0      1
## character.0..4    3     0                        0    0      0   0    0      0
## character.0..5    2     1                        0    0      0   0    0      1
##                messageid meta microsoft might million mimeol mimevers mon money
## character.0.           2    0         0     0       0      0        1   0     0
## character.0..1         1    0         0     0       0      0        1   0     0
## character.0..2         1    0         1     0       0      0        1   0     0
## character.0..3         1    0         0     0       0      0        1   0     0
## character.0..4         1    0         0     0       0      0        1   0     0
## character.0..5         1    0         0     0       0      0        1   0     0
##                month much must name need network never new news next normal
## character.0.       0    0    0    0    0       0     0   1    0    0      0
## character.0..1     0    0    0    0    0       1     0   0    0    0      0
## character.0..2     0    0    0    0    0       1     0   0    2    0      0
## character.0..3     1    0    0    0    0       0     0   3    0    0      0
## character.0..4     0    0    0    0    0       1     1   0    0    0      0
## character.0..5     0    0    0    0    0       1     0   0    0    0      1
##                noth now number oct offer old one onlin open order organ origin
## character.0.      0   0      0   0     0   0   2     0    0     0     0      0
## character.0..1    0   1      0   0     0   0   0     0    0     0     0      0
## character.0..2    0   1      0   0     0   0   2     0    0     0     0      0
## character.0..3    0   0      0   0     0   0   1     0    0     0     0      0
## character.0..4    1   1      0   0     0   0   1     0    0     0     1      0
## character.0..5    1   1      0   0     0   0   1     0    0     0     1      0
##                outlook packag page part past pay pdt peopl per person pfont
## character.0.         0      0    0    1    0   0   0     0   0      0     0
## character.0..1       0      0    0    0    0   0   0     0   0      0     0
## character.0..2       1      0    0    0    0   0   0     0   0      0     0
## character.0..3       0      0    0    0    0   0   0     0   0      0     0
## character.0..4       0      0    0    0    0   0   0     0   0      1     0
## character.0..5       0      0    0    0    0   0   0     0   0      0     0
##                phobo phoboslabsnetnoteinccom phone place pleas point possibl
## character.0.       1                       1     0     0     0     0       0
## character.0..1     1                       1     0     0     0     0       0
## character.0..2     1                       1     0     0     0     0       0
## character.0..3     1                       1     0     0     0     0       0
## character.0..4     1                       1     0     0     0     0       0
## character.0..5     1                       1     0     0     0     0       0
##                post postfix power preced price probabl problem process produc
## character.0.      0       3     0      1     0       0       0       0      0
## character.0..1    1       1     0      1     0       0       0       0      0
## character.0..2    0       1     0      1     0       0       0       0      0
## character.0..3    1       1     0      1     0       0       0       0      0
## character.0..4    0       1     0      1     0       0       0       0      0
## character.0..5    0       1     0      1     0       0       0       0      0
##                product program provid public put qmail question quotedprint
## character.0.         0       0      0      0   0     0        0           0
## character.0..1       0       0      0      0   0     1        0           0
## character.0..2       0       0      0      0   0     1        0           0
## character.0..3       0       0      0      0   0     0        0           0
## character.0..4       0       0      0      0   0     1        0           0
## character.0..5       0       0      0      0   0     1        0           0
##                rate read real realli reason receiv refer regard relat releas
## character.0.      0    0    0      0      0     10     1      0     0      0
## character.0..1    0    0    0      0      0     10     0      0     0      0
## character.0..2    0    0    0      0      0      9     0      0     0      0
## character.0..3    0    0    0      0      0      7     0      0     0      0
## character.0..4    0    0    0      0      0      9     1      0     0      0
## character.0..5    0    0    0      1      0     11     0      0     0      0
##                remov repli replyto report request requir result returnpath
## character.0.       0     0       0      0       0      0      0          1
## character.0..1     0     0       1      0       0      0      0          1
## character.0..2     0     0       1      3       0      0      0          1
## character.0..3     0     0       0      1       0      0      0          1
## character.0..4     0     0       1      0       0      0      0          1
## character.0..5     0     0       1      0       0      0      0          1
##                right rohit rpm rpmlist rpmlistadminfreshrpmsnet
## character.0.       0     0   0       0                        0
## character.0..1     0     0   0       0                        0
## character.0..2     0     0   0       0                        0
## character.0..3     0     0   0       0                        0
## character.0..4     0     0   0       0                        0
## character.0..5     0     0   0       0                        0
##                rpmlistfreshrpmsnet rpmzzzlistadminfreshrpmsnet
## character.0.                     0                           0
## character.0..1                   0                           0
## character.0..2                   0                           0
## character.0..3                   0                           0
## character.0..4                   0                           0
## character.0..5                   0                           0
##                rpmzzzlistfreshrpmsnet rssfeedsjmasonorg
## character.0.                        0                 0
## character.0..1                      0                 0
## character.0..2                      0                 0
## character.0..3                      0                 0
## character.0..4                      0                 0
## character.0..5                      0                 0
##                rssfeedsspamassassintaintorg run said sansserif sat save say
## character.0.                              0   1    0         0   0    0   0
## character.0..1                            0   0    0         0   0    0   0
## character.0..2                            0   0    5         0   0    0   0
## character.0..3                            0   0    0         0   0    0   1
## character.0..4                            0   0    0         0   0    0   0
## character.0..5                            0   0    0         0   0    0   0
##                search secur see seem sell send sender sent sep server servic
## character.0.        1     0   0    0    0    0      1    0   0      0      0
## character.0..1      0     0   0    0    0    1      0    0   0      0      2
## character.0..2      0     6   0    0    0    1      0    0   0      0      4
## character.0..3      0     1   0    0    0    0      1    0   0      0      0
## character.0..4      0     0   0    0    0    1      0    0   0      0      0
## character.0..5      0     0   0    0    0    1      0    0   0      0      0
##                set sfnet show simpl simpli sinc singledrop site smtp softwar
## character.0.     0     0    0     0      0    1          1    0    2       0
## character.0..1   0     0    0     0      0    0          1    0    2       0
## character.0..2   0     0    0     0      0    0          1    0    2       0
## character.0..3   0     0    0     0      0    0          1    0    0       1
## character.0..4   0     0    0     0      0    0          1    0    2       0
## character.0..5   0     0    0     0      0    0          1    0    2       0
##                someon someth sourc spam spamassassin special sponsor start
## character.0.        0      0     0    0            0       0       0     0
## character.0..1      0      0     0    0            0       0       1     0
## character.0..2      0      0     0    0            0       0       1     0
## character.0..3      0      0     0    0            0       0       0     0
## character.0..4      0      0     0    0            0       0       1     0
## character.0..5      0      0     0    0            0       0       1     0
##                state still stop subject sun suppli support sure system tabl
## character.0.       0     1    0       4   1      0       0    0      0    0
## character.0..1     0     0    0       2   0      0       0    0      0    0
## character.0..2     0     0    0       2   0      0       0    0      0    0
## character.0..3     0     0    0       1   0      0       0    0      0    0
## character.0..4     0     0    0       2   0      0       0    0      0    0
## character.0..5     0     0    0       2   0      0       0    0      0    0
##                take talk tell test texthtml textplain thank that there thing
## character.0.      0    0    0    0        0         1     0    2     0     0
## character.0..1    0    0    0    0        0         1     0    0     0     0
## character.0..2    0    2    0    0        0         1     0    0     0     0
## character.0..3    0    0    0    0        0         1     0    0     0     0
## character.0..4    0    0    0    0        0         1     0    0     0     0
## character.0..5    0    0    0    0        0         1     0    1     0     0
##                think though thu time today tri true tue two type unknown
## character.0.       1      0  12    1     1   0    0   0   0    0       0
## character.0..1     0      0   7    0     0   0    0   0   0    0       2
## character.0..2     0      0   6    0     0   0    0   0   0    0       2
## character.0..3     0      0   9    1     0   0    0   0   0    0       0
## character.0..4     0      0   6    1     0   0    0   0   0    0       2
## character.0..5     0      0   6    0     0   0    0   0   0    0       2
##                unsubscrib url use user userag userid uswsffw2sourceforgenet
## character.0.            0   0   2    0      0      0                      0
## character.0..1          1   0   1    0      0      0                      0
## character.0..2          1   0   1    0      0      0                      0
## character.0..3          0   0   1    0      0      0                      0
## character.0..4          1   0   5    0      1      0                      0
## character.0..5          1   0   3    0      0      0                      0
##                uswsflist1bsourceforgenet uswsflist1sourceforgenet
## character.0.                           0                        0
## character.0..1                         0                        0
## character.0..2                         0                        0
## character.0..3                         0                        0
## character.0..4                         0                        0
## character.0..5                         0                        0
##                uswsflist2sourceforgenet utc version visit want way web
## character.0.                          0   0       3     0    0   0   0
## character.0..1                        0   0       0     0    0   0   0
## character.0..2                        0   0       0     0    1   0   0
## character.0..3                        0   0       0     0    0   0   1
## character.0..4                        0   0       0     0    0   0   0
## character.0..5                        0   0       0     0    0   0   0
##                webnotenet wed week welcom well will window wish within without
## character.0.            0   1    0      0    0    0      1    0      0       1
## character.0..1          0   0    0      0    1    0      0    0      0       0
## character.0..2          0   0    0      0    0    0      0    0      1       0
## character.0..3          0   0    0      0    0    0      0    0      0       0
## character.0..4          0   0    0      0    0    0      0    0      0       0
## character.0..5          0   0    0      0    0    0      1    0      0       0
##                word work world write wrote xauthenticationwarn xbeenther
## character.0.      0    1     0     0     0                   0         1
## character.0..1    0    0     0     0     0                   0         0
## character.0..2    0    0     0     0     0                   0         0
## character.0..3    0    0     1     0     0                   0         1
## character.0..4    0    1     0     0     0                   0         0
## character.0..5    0    0     0     0     0                   0         0
##                xentcom xkeyword xloop xmailer xmailmanvers xmimeol xmsmailprior
## character.0.         0        0     1       0            1       0            0
## character.0..1       0        0     0       1            0       0            0
## character.0..2       0        0     0       1            0       0            0
## character.0..3       0        0     0       0            1       0            0
## character.0..4       0        0     0       0            0       0            0
## character.0..5       0        0     0       1            0       0            0
##                xoriginalarrivaltim xoriginald xprioriti year yes your
## character.0.                     0          0         0    0   0    0
## character.0..1                   0          0         0    0   0    0
## character.0..2                   0          0         1    0   0    0
## character.0..3                   0          0         0    0   0    0
## character.0..4                   0          0         0    0   0    0
## character.0..5                   0          0         0    0   0    0
##                yyyylocalhostnetnoteinccom yyyylocalhostspamassassintaintorg
## character.0.                            0                                 0
## character.0..1                          0                                 0
## character.0..2                          0                                 0
## character.0..3                          0                                 0
## character.0..4                          0                                 0
## character.0..5                          0                                 0
##                yyyynetnoteinccom yyyyspamassassintaintorg zzzzlocalhost LABEL
## character.0.                   0                        0             2     0
## character.0..1                 0                        0             2     0
## character.0..2                 0                        0             2     0
## character.0..3                 0                        0             2     0
## character.0..4                 0                        0             2     0
## character.0..5                 0                        0             2     0

Now we can split the data into training and test sets and fit a generalized linear model (logistic regression).

# Hold out part of the documents for testing. sample.split() is given the
# label column and a split ratio of 0.8; the fixed seed makes the split
# reproducible.
set.seed(42)
spl = sample.split(sparse_df$LABEL, 0.8)
train = subset(sparse_df, spl == TRUE)
test = subset(sparse_df, spl == FALSE)

# Fit a binomial GLM (logistic regression) of LABEL on every other column.
spam_glm = glm(LABEL~., data=train, family="binomial")
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# type="response" asks for predicted probabilities rather than the linear
# predictor, so they can be thresholded at 0.5 below.
pred = predict(spam_glm, newdata = test, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Confusion matrix: rows are the true LABEL, columns are whether the predicted
# probability exceeds 0.5.
table(test$LABEL, pred > 0.5)
##    
##     FALSE TRUE
##   0   498    2
##   1    31  248
# Compute accuracy from the predictions themselves rather than from hand-typed
# counts, so the figure cannot drift out of sync with the confusion matrix.
mean((pred > 0.5) == (test$LABEL == 1))
## [1] 0.957638
# Per the confusion matrix above, the glm classified 746 of 779 test documents
# correctly, i.e. about 95.8% accuracy

References:
- https://rpubs.com/anilcs13m/126170
- https://rstudio-pubs-static.s3.amazonaws.com/378660_18a426eb1a864413a98c8c2c20df7e7b.html