Introduction

In this project we will build a Naive Bayes text classifier to identify spam messages. Individual ASCII files will be used to build the datasets of “spam” and “ham” to train the model.

pacman::p_load(tidyverse, readr, tidyr, dplyr, magrittr, quanteda, tm, caret, e1071, caTools, randomForest)

List the full paths of the ham and spam files stored in the local folders on this machine

# Root folder holding the corpus; change this one line to run on another machine.
data_dir <- "C:/Users/micel/Documents/607/Project4"

# Full paths of every message file in each folder (one file per e-mail).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
ham_files <- list.files(path = file.path(data_dir, "easy_ham"), full.names = TRUE)
spam_files <- list.files(path = file.path(data_dir, "spam_2"), full.names = TRUE)

Read in the spam files one by one and combine them into a data frame, then create a new column Spam to identify spam (1) vs ham (0)

# Read every spam message whole, so that one row holds one e-mail.
# The column is named Text directly instead of renaming the name that
# data.frame() would otherwise deparse from the expression, which breaks as
# soon as the expression is edited. vapply() also checks that each file yields
# exactly one string and returns character(0) for an empty file list, so the
# result is then a zero-row data frame rather than an error.
spam <- data.frame(
  Text = vapply(spam_files, read_file, character(1), USE.NAMES = FALSE),
  stringsAsFactors = FALSE # keep the message text as character on R < 4.0
)
spam %<>%
  mutate(Spam = 1) # label: 1 = spam
head(spam, 2)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                Text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD\n\tfor <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 
(IST)\nReceived: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for\n    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100\nReceived: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org\n    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100\nReceived: from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for\n    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100\nX-Authentication-Warning: lugh.tuatha.org: Host w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142] claimed to be bettyjagessar.com\nReceived: from 64.0.57.142 [202.63.165.34] by bettyjagessar.com\n    (SMTPD32-7.06 EVAL) id A42A7FC01F2; Fri, 02 Aug 2002 02:18:18 -0400\nMessage-Id: <1028311679.886@0.57.142>\nDate: Fri, 02 Aug 2002 23:37:59 0530\nTo: ilug@linux.ie\nFrom: "Start Now" <startnow2002@hotmail.com>\nMIME-Version: 1.0\nContent-Type: text/plain; charset="US-ASCII"; format=flowed\nSubject: [ILUG] STOP THE MLM INSANITY\nSender: ilug-admin@linux.ie\nErrors-To: ilug-admin@linux.ie\nX-Mailman-Version: 1.1\nPrecedence: bulk\nList-Id: Irish Linux Users' Group <ilug.linux.ie>\nX-Beenthere: ilug@linux.ie\n\nGreetings!\n\nYou are receiving this letter because you have expressed an interest in \nreceiving information about online business opportunities. If this is \nerroneous then please accept my most sincere apology. This is a one-time \nmailing, so no removal is necessary.\n\nIf you've been burned, betrayed, and back-stabbed by multi-level marketing, \nMLM, then please read this letter. It could be the most important one that \nhas ever landed in your Inbox.\n\nMULTI-LEVEL MARKETING IS A HUGE MISTAKE FOR MOST PEOPLE\n\nMLM has failed to deliver on its promises for the past 50 years. The pursuit \nof the "MLM Dream" has cost hundreds of thousands of people their friends, \ntheir fortunes and their sacred honor. 
The fact is that MLM is fatally \nflawed, meaning that it CANNOT work for most people.\n\nThe companies and the few who earn the big money in MLM are NOT going to \ntell you the real story. FINALLY, there is someone who has the courage to \ncut through the hype and lies and tell the TRUTH about MLM.\n\nHERE'S GOOD NEWS\n\nThere IS an alternative to MLM that WORKS, and works BIG! If you haven't yet \nabandoned your dreams, then you need to see this. Earning the kind of income \nyou've dreamed about is easier than you think!\n\nWith your permission, I'd like to send you a brief letter that will tell you \nWHY MLM doesn't work for most people and will then introduce you to \nsomething so new and refreshing that you'll wonder why you haven't heard of \nthis before.\n\nI promise that there will be NO unwanted follow up, NO sales pitch, no one \nwill call you, and your email address will only be used to send you the \ninformation. Period.\n\nTo receive this free, life-changing information, simply click Reply, type \n"Send Info" in the Subject box and hit Send. I'll get the information to you \nwithin 24 hours. Just look for the words MLM WALL OF SHAME in your Inbox.\n\nCordially,\n\nSiddhi\n\nP.S. Someone recently sent the letter to me and it has been the most \neye-opening, financially beneficial information I have ever received. I \nhonestly believe that you will feel the same way once you've read it. And \nit's FREE!\n\n\n------------------------------------------------------------\nThis email is NEVER sent unsolicited.  THIS IS NOT "SPAM". You are receiving \nthis email because you EXPLICITLY signed yourself up to our list with our \nonline signup form or through use of our FFA Links Page and E-MailDOM \nsystems, which have EXPLICIT terms of use which state that through its use \nyou agree to receive our emailings.  
You may also be a member of a Altra \nComputer Systems list or one of many numerous FREE Marketing Services and as \nsuch you agreed when you signed up for such list that you would also be \nreceiving this emailing.\nDue to the above, this email message cannot be considered unsolicitated, or \nspam.\n-----------------------------------------------------------\n\n\n\n\n-- \nIrish Linux Users' Group: ilug@linux.ie\nhttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\nList maintainer: listmaster@linux.ie\n\n\n
## 2 From lmrn@mailexcite.com  Mon Jun 24 17:03:24 2002\nReturn-Path: merchantsworld2001@juno.com\nDelivery-Date: Mon May 13 04:46:13 2002\nReceived: from mandark.labs.netnoteinc.com ([213.105.180.140]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g4D3kCe15097 for\n    <jm@jmason.org>; Mon, 13 May 2002 04:46:12 +0100\nReceived: from 203.129.205.5.205.129.203.in-addr.arpa ([203.129.205.5]) by\n    mandark.labs.netnoteinc.com (8.11.2/8.11.2) with SMTP id g4D3k2D12605 for\n    <jm@netnoteinc.com>; Mon, 13 May 2002 04:46:04 +0100\nReceived: from html (unverified [207.95.174.49]) by\n    203.129.205.5.205.129.203.in-addr.arpa (EMWAC SMTPRS 0.83) with SMTP id\n    <B0000178595@203.129.205.5.205.129.203.in-addr.arpa>; Mon, 13 May 2002\n    09:04:46 +0530\nMessage-Id: <B0000178595@203.129.205.5.205.129.203.in-addr.arpa>\nFrom: lmrn@mailexcite.com\nTo: ranmoore@cybertime.net\nSubject: Real Protection, Stun Guns!  Free Shipping! Time:2:01:35 PM\nDate: Mon, 28 Jul 1980 14:01:35\nMIME-Version: 1.0\nX-Keywords: \nContent-Type: text/html; charset="DEFAULT"\n\n<html>\n<body>\n<center>\n<h3>\n<font color="blue">\n<b>\nThe Need For Safety Is Real In 2002, You Might Only Get One Chance - Be Ready!\n<p>\nFree Shipping & Handling Within The (USA) If You Order Before May 25, 2002! \n<p>\n3 Day Super Sale, Now Until May 7, 2002!  Save Up To $30.00 On Some Items!\n\n</b>\n</font>\n</h3>\n</center>\n<p>\nIT'S GETTING TO BE SPRING AGAIN, PROTECT YOURSELF AS YOU WALK,<br>\nJOG AND EXERCISE OUTSIDE.  
ALSO PROTECT YOUR LOVED ONES AS<br>\nTHEY RETURN HOME FROM COLLEGE!<br>\n<p>\n*     LEGAL PROTECTION FOR COLLEGE STUDENTS!<br>\n*     GREAT UP'COMING OUTDOOR PROTECTION GIFTS!<br>\n*     THERE IS NOTHING WORTH MORE PROTECTING THAN LIFE!<br>\n*     OUR STUN DEVICES & PEPPER PRODUCTS ARE LEGAL PROTECTION!\n<p>\n<b>\n<font color="red">\nJOIN THE WAR ON CRIME!\n</b>\n</font>\n<p>\n\nSTUN GUNS AND BATONS \n<p>\nEFFECTIVE - SAFE - NONLETHAL\n<p>\nPROTECT YOUR LOVED ONES AND YOURSELF\n<p>\nNo matter who you are, no matter what City or Town you live in,<br>\nif you live in America, you will be touched by crime.\n<p>\nYou hear about it on TV.  You read about it in the newspaper.<br>\nIt's no secret that crime is a major problem in the U.S. today.<br>\nCriminals are finding it easier to commit crimes all the time.\n<p>\nWeapons are readily available.  Our cities' police forces have<br>\nmore work than they can handle.  Even if these criminal are<br>\ncaught, they won't be spending long in our nation's overcrowded<br>\njails.  And while lawmakers are well aware of the crime problem,<br>\nthey don't seem to have any effective answers.\n<p>\nOur Email Address:  <a\nhref="mailto:Merchants4all@aol.com">Merchants4all@aol.com</a>\n<p>\nINTERESTED:\n<p>\nYou will be protecting yourself within 7 days!  Don't Wait,<br>\nvisit our web page below, and join The War On Crime!\n<p>\n*****************<br>\n<a\nhref="http://www.geocities.com/realprotection_20022003/">http://www.geocities.com/realprotection_20022003/</a><br>\n*****************\n<p>\nWell, there is an effective answer.  Take responsibility for<br>\nyour own security.  Our site has a variety of quality personal<br>\nsecurity products.  Visit our site, choose the personal security<br>\nproducts that are right for you.  Use them, and join the war on\ncrime!\n<p>\nFREE PEPPER SPRAY WITH ANY STUN UNIT PURCHASE.<br>\n(A Value of $15.95)\n<p>\nWe Ship Orders Within 5 To 7 Days, To Every State In The U.S.A.<br>\nby UPS, FEDEX, or U.S. 
POSTAL SERVICE.  Visa, MasterCard, American<br>\nExpress & Debt Card Gladly Accepted.\n<p>\nAsk yourself this question, if you don't help your loved ones,\nwho will?\n<p>\nINTERESTED:\n<p>\n*****************<br>\n<a\nhref="http://www.geocities.com/realprotection_20022003/">http://www.geocities.com/realprotection_20022003/</a><br>\n*****************\n<p>\n___The Stun Monster 625,000 Volts ($86.95)<br>\n___The Z-Force Slim Style 300,000 Volts ($64.95)<br>\n___The StunMaster 300,000 Volts Straight ($59.95)<br>\n___The StunMaster 300,000 Volts Curb ($59.95)<br>\n___The StunMaster 200,000 Volts Straight ($49.95)<br>\n___The StunMaster 200,000 Volts Curb ($49.95)<br>\n___The StunBaton 500,000 Volts ($89.95)<br>\n___The StunBaton 300,000 Volts ($79.95)<br>\n___Pen Knife (One $12.50, Two Or More $9.00)<br>\n___Wildfire Pepper Spray  (One $15.95, Two Or More $11.75)\n<p>\n___Add $5.75 For Shipping & Handling Charge.\n<p>\n\nTo Order by postal mail, please send to the below address.<br>\nMake payable to Mega Safety Technology.\n<p>\nMega Safety Technology<br>\n3215 Merrimac Ave.<br>\nDayton, Ohio  45405<br>\nOur Email Address:  <a\nhref="mailto:Merchants4all@aol.com">Merchants4all@aol.com</a>\n<p>\nOrder by 24 Hour Fax!!!  775-257-6657.\n<p>\n*****<br>\n<b><font color="red">Important Credit Card Information! Please Read Below!</b></font>\n <br><br>\n*     Credit Card Address, City, State and Zip Code, must match\n      billing address to be processed. 
\n<br><br>\n\nCHECK____  MONEYORDER____  VISA____ MASTERCARD____ AmericanExpress___\nDebt Card___\n<br><br>\nName_______________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\nAddress____________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCity,State,Zip(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________<br>\n(Credit Card Number)\n<br><br>\nExpiration Month_____  Year_____\n<br><br>\n___________________________________________________<br>\nAuthorized Signature\n<br><br>\n<b>\n*****IMPORTANT NOTE*****\n</b>\n<br><br>\nIf Shipping Address Is Different From The Billing Address Above,\nPlease Fill Out Information Below.\n<br><br>\nShipping Name______________________________________________\n<br><br>\nShipping Address___________________________________________\n<br><br>\n___________________________________________________________<br>\nShipping City,State,Zip\n<br><br>\n___________________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________________<br>\nEmail Address & Phone Number(Please Write Neat)\n</body>\n</html>\n
##   Spam
## 1    1
## 2    1

Repeat for the ham files

# Read every ham message whole, so that one row holds one e-mail.
# The column is named Text directly instead of renaming the name that
# data.frame() would otherwise deparse from the expression, which breaks as
# soon as the expression is edited. vapply() also checks that each file yields
# exactly one string and returns character(0) for an empty file list, so the
# result is then a zero-row data frame rather than an error.
ham <- data.frame(
  Text = vapply(ham_files, read_file, character(1), USE.NAMES = FALSE),
  stringsAsFactors = FALSE # keep the message text as character on R < 4.0
)
ham %<>%
  mutate(Spam = 0) # label: 0 = ham (not spam)
head(ham, 2)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Text
## 1 From exmh-workers-admin@redhat.com  Thu Aug 22 12:36:23 2002\nReturn-Path: <exmh-workers-admin@spamassassin.taint.org>\nDelivered-To: zzzz@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id D03E543C36\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 07:36:16 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:36:16 +0100 (IST)\nReceived: from listman.spamassassin.taint.org (listman.spamassassin.taint.org [66.187.233.211]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g7MBYrZ04811 for\n    <zzzz-exmh@spamassassin.taint.org>; Thu, 22 Aug 2002 12:34:53 +0100\nReceived: from listman.spamassassin.taint.org (localhost.localdomain [127.0.0.1]) by\n    listman.redhat.com (Postfix) with ESMTP id 8386540858; Thu, 22 Aug 2002\n    07:35:02 -0400 (EDT)\nDelivered-To: exmh-workers@listman.spamassassin.taint.org\nReceived: from int-mx1.corp.spamassassin.taint.org (int-mx1.corp.spamassassin.taint.org\n    [172.16.52.254]) by listman.redhat.com (Postfix) with ESMTP id 10CF8406D7\n    for <exmh-workers@listman.redhat.com>; Thu, 22 Aug 2002 07:34:10 -0400\n    (EDT)\nReceived: (from mail@localhost) by int-mx1.corp.spamassassin.taint.org (8.11.6/8.11.6)\n    id g7MBY7g11259 for exmh-workers@listman.redhat.com; Thu, 22 Aug 2002\n    07:34:07 -0400\nReceived: from mx1.spamassassin.taint.org (mx1.spamassassin.taint.org [172.16.48.31]) by\n    int-mx1.corp.redhat.com (8.11.6/8.11.6) with SMTP id g7MBY7Y11255 for\n    <exmh-workers@redhat.com>; Thu, 22 Aug 2002 07:34:07 -0400\nReceived: from ratree.psu.ac.th ([202.28.97.6]) by mx1.spamassassin.taint.org\n    (8.11.6/8.11.6) with SMTP id g7MBIhl25223 for <exmh-workers@redhat.com>;\n    Thu, 22 Aug 2002 07:18:55 -0400\nReceived: from delta.cs.mu.OZ.AU (delta.coe.psu.ac.th [172.30.0.98]) by\n    ratree.psu.ac.th (8.11.6/8.11.6) with ESMTP id g7MBWel29762;\n  
  Thu, 22 Aug 2002 18:32:40 +0700 (ICT)\nReceived: from munnari.OZ.AU (localhost [127.0.0.1]) by delta.cs.mu.OZ.AU\n    (8.11.6/8.11.6) with ESMTP id g7MBQPW13260; Thu, 22 Aug 2002 18:26:25\n    +0700 (ICT)\nFrom: Robert Elz <kre@munnari.OZ.AU>\nTo: Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\nCc: exmh-workers@spamassassin.taint.org\nSubject: Re: New Sequences Window\nIn-Reply-To: <1029945287.4797.TMDA@deepeddy.vircio.com>\nReferences: <1029945287.4797.TMDA@deepeddy.vircio.com>\n    <1029882468.3116.TMDA@deepeddy.vircio.com> <9627.1029933001@munnari.OZ.AU>\n    <1029943066.26919.TMDA@deepeddy.vircio.com>\n    <1029944441.398.TMDA@deepeddy.vircio.com>\nMIME-Version: 1.0\nContent-Type: text/plain; charset=us-ascii\nMessage-Id: <13258.1030015585@munnari.OZ.AU>\nX-Loop: exmh-workers@spamassassin.taint.org\nSender: exmh-workers-admin@spamassassin.taint.org\nErrors-To: exmh-workers-admin@spamassassin.taint.org\nX-Beenthere: exmh-workers@spamassassin.taint.org\nX-Mailman-Version: 2.0.1\nPrecedence: bulk\nList-Help: <mailto:exmh-workers-request@spamassassin.taint.org?subject=help>\nList-Post: <mailto:exmh-workers@spamassassin.taint.org>\nList-Subscribe: <https://listman.spamassassin.taint.org/mailman/listinfo/exmh-workers>,\n    <mailto:exmh-workers-request@redhat.com?subject=subscribe>\nList-Id: Discussion list for EXMH developers <exmh-workers.spamassassin.taint.org>\nList-Unsubscribe: <https://listman.spamassassin.taint.org/mailman/listinfo/exmh-workers>,\n    <mailto:exmh-workers-request@redhat.com?subject=unsubscribe>\nList-Archive: <https://listman.spamassassin.taint.org/mailman/private/exmh-workers/>\nDate: Thu, 22 Aug 2002 18:26:25 +0700\n\n    Date:        Wed, 21 Aug 2002 10:54:46 -0500\n    From:        Chris Garrigues <cwg-dated-1030377287.06fa6d@DeepEddy.Com>\n    Message-ID:  <1029945287.4797.TMDA@deepeddy.vircio.com>\n\n\n  | I can't reproduce this error.\n\nFor me it is very repeatable... 
(like every time, without fail).\n\nThis is the debug log of the pick happening ...\n\n18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury}\n18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury\n18:19:04 Ftoc_PickMsgs {{1 hit}}\n18:19:04 Marking 1 hits\n18:19:04 tkerror: syntax error in expression "int ...\n\nNote, if I run the pick command by hand ...\n\ndelta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace  4852-4852 -sequence mercury\n1 hit\n\nThat's where the "1 hit" comes from (obviously).  The version of nmh I'm\nusing is ...\n\ndelta$ pick -version\npick -- nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 ICT 2002]\n\nAnd the relevant part of my .mh_profile ...\n\ndelta$ mhparam pick\n-seq sel -list\n\n\nSince the pick command works, the sequence (actually, both of them, the\none that's explicit on the command line, from the search popup, and the\none that comes from .mh_profile) do get created.\n\nkre\n\nps: this is still using the version of the code form a day ago, I haven't\nbeen able to reach the cvs repository today (local routing issue I think).\n\n\n\n_______________________________________________\nExmh-workers mailing list\nExmh-workers@redhat.com\nhttps://listman.redhat.com/mailman/listinfo/exmh-workers\n\n
## 2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         From Steve_Burt@cursor-system.com  Thu Aug 22 12:46:39 2002\nReturn-Path: 
<Steve_Burt@cursor-system.com>\nDelivered-To: zzzz@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id BE12E43C34\n\tfor <zzzz@localhost>; Thu, 22 Aug 2002 07:46:38 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor zzzz@localhost (single-drop); Thu, 22 Aug 2002 12:46:38 +0100 (IST)\nReceived: from n20.grp.scd.yahoo.com (n20.grp.scd.yahoo.com\n    [66.218.66.76]) by dogma.slashnull.org (8.11.6/8.11.6) with SMTP id\n    g7MBkTZ05087 for <zzzz@spamassassin.taint.org>; Thu, 22 Aug 2002 12:46:29 +0100\nX-Egroups-Return: sentto-2242572-52726-1030016790-zzzz=spamassassin.taint.org@returns.groups.yahoo.com\nReceived: from [66.218.67.196] by n20.grp.scd.yahoo.com with NNFMP;\n    22 Aug 2002 11:46:30 -0000\nX-Sender: steve.burt@cursor-system.com\nX-Apparently-To: zzzzteana@yahoogroups.com\nReceived: (EGP: mail-8_1_0_1); 22 Aug 2002 11:46:29 -0000\nReceived: (qmail 11764 invoked from network); 22 Aug 2002 11:46:29 -0000\nReceived: from unknown (66.218.66.217) by m3.grp.scd.yahoo.com with QMQP;\n    22 Aug 2002 11:46:29 -0000\nReceived: from unknown (HELO mailgateway.cursor-system.com) (62.189.7.27)\n    by mta2.grp.scd.yahoo.com with SMTP; 22 Aug 2002 11:46:29 -0000\nReceived: from exchange1.cps.local (unverified) by\n    mailgateway.cursor-system.com (Content Technologies SMTPRS 4.2.10) with\n    ESMTP id <T5cde81f695ac1d100407d@mailgateway.cursor-system.com> for\n    <forteana@yahoogroups.com>; Thu, 22 Aug 2002 13:14:10 +0100\nReceived: by exchange1.cps.local with Internet Mail Service (5.5.2653.19)\n    id <PXX6AT23>; Thu, 22 Aug 2002 12:46:27 +0100\nMessage-Id: <5EC2AD6D2314D14FB64BDA287D25D9EF12B4F6@exchange1.cps.local>\nTo: "'zzzzteana@yahoogroups.com'" <zzzzteana@yahoogroups.com>\nX-Mailer: Internet Mail Service (5.5.2653.19)\nX-Egroups-From: Steve Burt <steve.burt@cursor-system.com>\nFrom: Steve Burt 
<Steve_Burt@cursor-system.com>\nX-Yahoo-Profile: pyruse\nMIME-Version: 1.0\nMailing-List: list zzzzteana@yahoogroups.com; contact\n    forteana-owner@yahoogroups.com\nDelivered-To: mailing list zzzzteana@yahoogroups.com\nPrecedence: bulk\nList-Unsubscribe: <mailto:zzzzteana-unsubscribe@yahoogroups.com>\nDate: Thu, 22 Aug 2002 12:46:18 +0100\nSubject: [zzzzteana] RE: Alexander\nReply-To: zzzzteana@yahoogroups.com\nContent-Type: text/plain; charset=US-ASCII\nContent-Transfer-Encoding: 7bit\n\nMartin A posted:\nTassos Papadopoulos, the Greek sculptor behind the plan, judged that the\n limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the\n Mount Athos monastic community, was ideal for the patriotic sculpture. \n \n As well as Alexander's granite features, 240 ft high and 170 ft wide, a\n museum, a restored amphitheatre and car park for admiring crowds are\nplanned\n---------------------\nSo is this mountain limestone or granite?\nIf it's limestone, it'll weather pretty fast.\n\n------------------------ Yahoo! Groups Sponsor ---------------------~-->\n4 DVDs Free +s&p Join Now\nhttp://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM\n---------------------------------------------------------------------~->\n\nTo unsubscribe from this group, send an email to:\nforteana-unsubscribe@egroups.com\n\n \n\nYour use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/ \n\n\n\n
##   Spam
## 1    0
## 2    0

Combine the two dataframes of ham and spam into one dataframe called data

# Stack the spam rows on top of the ham rows and coerce the message body
# column to character in the same pipeline, then preview the first two rows.
data <- rbind(spam, ham) %>%
  mutate(Text = as.character(Text))
head(data, 2)
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                
                                                                                                                                                                                                                                                                                                                                                                Text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   From ilug-admin@linux.ie  Tue Aug  6 11:51:02 2002\nReturn-Path: <ilug-admin@linux.ie>\nDelivered-To: yyyy@localhost.netnoteinc.com\nReceived: from localhost (localhost [127.0.0.1])\n\tby phobos.labs.netnoteinc.com (Postfix) with ESMTP id 9E1F5441DD\n\tfor <jm@localhost>; Tue,  6 Aug 2002 06:48:09 -0400 (EDT)\nReceived: from phobos [127.0.0.1]\n\tby localhost with IMAP (fetchmail-5.9.0)\n\tfor jm@localhost (single-drop); Tue, 06 Aug 2002 11:48:09 +0100 
(IST)\nReceived: from lugh.tuatha.org (root@lugh.tuatha.org [194.125.145.45]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g72LqWv13294 for\n    <jm-ilug@jmason.org>; Fri, 2 Aug 2002 22:52:32 +0100\nReceived: from lugh (root@localhost [127.0.0.1]) by lugh.tuatha.org\n    (8.9.3/8.9.3) with ESMTP id WAA31224; Fri, 2 Aug 2002 22:50:17 +0100\nReceived: from bettyjagessar.com (w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142]) by lugh.tuatha.org (8.9.3/8.9.3) with ESMTP id WAA31201 for\n    <ilug@linux.ie>; Fri, 2 Aug 2002 22:50:11 +0100\nX-Authentication-Warning: lugh.tuatha.org: Host w142.z064000057.nyc-ny.dsl.cnc.net\n    [64.0.57.142] claimed to be bettyjagessar.com\nReceived: from 64.0.57.142 [202.63.165.34] by bettyjagessar.com\n    (SMTPD32-7.06 EVAL) id A42A7FC01F2; Fri, 02 Aug 2002 02:18:18 -0400\nMessage-Id: <1028311679.886@0.57.142>\nDate: Fri, 02 Aug 2002 23:37:59 0530\nTo: ilug@linux.ie\nFrom: "Start Now" <startnow2002@hotmail.com>\nMIME-Version: 1.0\nContent-Type: text/plain; charset="US-ASCII"; format=flowed\nSubject: [ILUG] STOP THE MLM INSANITY\nSender: ilug-admin@linux.ie\nErrors-To: ilug-admin@linux.ie\nX-Mailman-Version: 1.1\nPrecedence: bulk\nList-Id: Irish Linux Users' Group <ilug.linux.ie>\nX-Beenthere: ilug@linux.ie\n\nGreetings!\n\nYou are receiving this letter because you have expressed an interest in \nreceiving information about online business opportunities. If this is \nerroneous then please accept my most sincere apology. This is a one-time \nmailing, so no removal is necessary.\n\nIf you've been burned, betrayed, and back-stabbed by multi-level marketing, \nMLM, then please read this letter. It could be the most important one that \nhas ever landed in your Inbox.\n\nMULTI-LEVEL MARKETING IS A HUGE MISTAKE FOR MOST PEOPLE\n\nMLM has failed to deliver on its promises for the past 50 years. The pursuit \nof the "MLM Dream" has cost hundreds of thousands of people their friends, \ntheir fortunes and their sacred honor. 
The fact is that MLM is fatally \nflawed, meaning that it CANNOT work for most people.\n\nThe companies and the few who earn the big money in MLM are NOT going to \ntell you the real story. FINALLY, there is someone who has the courage to \ncut through the hype and lies and tell the TRUTH about MLM.\n\nHERE'S GOOD NEWS\n\nThere IS an alternative to MLM that WORKS, and works BIG! If you haven't yet \nabandoned your dreams, then you need to see this. Earning the kind of income \nyou've dreamed about is easier than you think!\n\nWith your permission, I'd like to send you a brief letter that will tell you \nWHY MLM doesn't work for most people and will then introduce you to \nsomething so new and refreshing that you'll wonder why you haven't heard of \nthis before.\n\nI promise that there will be NO unwanted follow up, NO sales pitch, no one \nwill call you, and your email address will only be used to send you the \ninformation. Period.\n\nTo receive this free, life-changing information, simply click Reply, type \n"Send Info" in the Subject box and hit Send. I'll get the information to you \nwithin 24 hours. Just look for the words MLM WALL OF SHAME in your Inbox.\n\nCordially,\n\nSiddhi\n\nP.S. Someone recently sent the letter to me and it has been the most \neye-opening, financially beneficial information I have ever received. I \nhonestly believe that you will feel the same way once you've read it. And \nit's FREE!\n\n\n------------------------------------------------------------\nThis email is NEVER sent unsolicited.  THIS IS NOT "SPAM". You are receiving \nthis email because you EXPLICITLY signed yourself up to our list with our \nonline signup form or through use of our FFA Links Page and E-MailDOM \nsystems, which have EXPLICIT terms of use which state that through its use \nyou agree to receive our emailings.  
You may also be a member of a Altra \nComputer Systems list or one of many numerous FREE Marketing Services and as \nsuch you agreed when you signed up for such list that you would also be \nreceiving this emailing.\nDue to the above, this email message cannot be considered unsolicitated, or \nspam.\n-----------------------------------------------------------\n\n\n\n\n-- \nIrish Linux Users' Group: ilug@linux.ie\nhttp://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.\nList maintainer: listmaster@linux.ie\n\n\n
## 2 From lmrn@mailexcite.com  Mon Jun 24 17:03:24 2002\nReturn-Path: merchantsworld2001@juno.com\nDelivery-Date: Mon May 13 04:46:13 2002\nReceived: from mandark.labs.netnoteinc.com ([213.105.180.140]) by\n    dogma.slashnull.org (8.11.6/8.11.6) with ESMTP id g4D3kCe15097 for\n    <jm@jmason.org>; Mon, 13 May 2002 04:46:12 +0100\nReceived: from 203.129.205.5.205.129.203.in-addr.arpa ([203.129.205.5]) by\n    mandark.labs.netnoteinc.com (8.11.2/8.11.2) with SMTP id g4D3k2D12605 for\n    <jm@netnoteinc.com>; Mon, 13 May 2002 04:46:04 +0100\nReceived: from html (unverified [207.95.174.49]) by\n    203.129.205.5.205.129.203.in-addr.arpa (EMWAC SMTPRS 0.83) with SMTP id\n    <B0000178595@203.129.205.5.205.129.203.in-addr.arpa>; Mon, 13 May 2002\n    09:04:46 +0530\nMessage-Id: <B0000178595@203.129.205.5.205.129.203.in-addr.arpa>\nFrom: lmrn@mailexcite.com\nTo: ranmoore@cybertime.net\nSubject: Real Protection, Stun Guns!  Free Shipping! Time:2:01:35 PM\nDate: Mon, 28 Jul 1980 14:01:35\nMIME-Version: 1.0\nX-Keywords: \nContent-Type: text/html; charset="DEFAULT"\n\n<html>\n<body>\n<center>\n<h3>\n<font color="blue">\n<b>\nThe Need For Safety Is Real In 2002, You Might Only Get One Chance - Be Ready!\n<p>\nFree Shipping & Handling Within The (USA) If You Order Before May 25, 2002! \n<p>\n3 Day Super Sale, Now Until May 7, 2002!  Save Up To $30.00 On Some Items!\n\n</b>\n</font>\n</h3>\n</center>\n<p>\nIT'S GETTING TO BE SPRING AGAIN, PROTECT YOURSELF AS YOU WALK,<br>\nJOG AND EXERCISE OUTSIDE.  
ALSO PROTECT YOUR LOVED ONES AS<br>\nTHEY RETURN HOME FROM COLLEGE!<br>\n<p>\n*     LEGAL PROTECTION FOR COLLEGE STUDENTS!<br>\n*     GREAT UP'COMING OUTDOOR PROTECTION GIFTS!<br>\n*     THERE IS NOTHING WORTH MORE PROTECTING THAN LIFE!<br>\n*     OUR STUN DEVICES & PEPPER PRODUCTS ARE LEGAL PROTECTION!\n<p>\n<b>\n<font color="red">\nJOIN THE WAR ON CRIME!\n</b>\n</font>\n<p>\n\nSTUN GUNS AND BATONS \n<p>\nEFFECTIVE - SAFE - NONLETHAL\n<p>\nPROTECT YOUR LOVED ONES AND YOURSELF\n<p>\nNo matter who you are, no matter what City or Town you live in,<br>\nif you live in America, you will be touched by crime.\n<p>\nYou hear about it on TV.  You read about it in the newspaper.<br>\nIt's no secret that crime is a major problem in the U.S. today.<br>\nCriminals are finding it easier to commit crimes all the time.\n<p>\nWeapons are readily available.  Our cities' police forces have<br>\nmore work than they can handle.  Even if these criminal are<br>\ncaught, they won't be spending long in our nation's overcrowded<br>\njails.  And while lawmakers are well aware of the crime problem,<br>\nthey don't seem to have any effective answers.\n<p>\nOur Email Address:  <a\nhref="mailto:Merchants4all@aol.com">Merchants4all@aol.com</a>\n<p>\nINTERESTED:\n<p>\nYou will be protecting yourself within 7 days!  Don't Wait,<br>\nvisit our web page below, and join The War On Crime!\n<p>\n*****************<br>\n<a\nhref="http://www.geocities.com/realprotection_20022003/">http://www.geocities.com/realprotection_20022003/</a><br>\n*****************\n<p>\nWell, there is an effective answer.  Take responsibility for<br>\nyour own security.  Our site has a variety of quality personal<br>\nsecurity products.  Visit our site, choose the personal security<br>\nproducts that are right for you.  Use them, and join the war on\ncrime!\n<p>\nFREE PEPPER SPRAY WITH ANY STUN UNIT PURCHASE.<br>\n(A Value of $15.95)\n<p>\nWe Ship Orders Within 5 To 7 Days, To Every State In The U.S.A.<br>\nby UPS, FEDEX, or U.S. 
POSTAL SERVICE.  Visa, MasterCard, American<br>\nExpress & Debt Card Gladly Accepted.\n<p>\nAsk yourself this question, if you don't help your loved ones,\nwho will?\n<p>\nINTERESTED:\n<p>\n*****************<br>\n<a\nhref="http://www.geocities.com/realprotection_20022003/">http://www.geocities.com/realprotection_20022003/</a><br>\n*****************\n<p>\n___The Stun Monster 625,000 Volts ($86.95)<br>\n___The Z-Force Slim Style 300,000 Volts ($64.95)<br>\n___The StunMaster 300,000 Volts Straight ($59.95)<br>\n___The StunMaster 300,000 Volts Curb ($59.95)<br>\n___The StunMaster 200,000 Volts Straight ($49.95)<br>\n___The StunMaster 200,000 Volts Curb ($49.95)<br>\n___The StunBaton 500,000 Volts ($89.95)<br>\n___The StunBaton 300,000 Volts ($79.95)<br>\n___Pen Knife (One $12.50, Two Or More $9.00)<br>\n___Wildfire Pepper Spray  (One $15.95, Two Or More $11.75)\n<p>\n___Add $5.75 For Shipping & Handling Charge.\n<p>\n\nTo Order by postal mail, please send to the below address.<br>\nMake payable to Mega Safety Technology.\n<p>\nMega Safety Technology<br>\n3215 Merrimac Ave.<br>\nDayton, Ohio  45405<br>\nOur Email Address:  <a\nhref="mailto:Merchants4all@aol.com">Merchants4all@aol.com</a>\n<p>\nOrder by 24 Hour Fax!!!  775-257-6657.\n<p>\n*****<br>\n<b><font color="red">Important Credit Card Information! Please Read Below!</b></font>\n <br><br>\n*     Credit Card Address, City, State and Zip Code, must match\n      billing address to be processed. 
\n<br><br>\n\nCHECK____  MONEYORDER____  VISA____ MASTERCARD____ AmericanExpress___\nDebt Card___\n<br><br>\nName_______________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\nAddress____________________________________________________<br>\n(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCity,State,Zip(As it appears on Check or Credit Card)\n<br><br>\n___________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________<br>\n(Credit Card Number)\n<br><br>\nExpiration Month_____  Year_____\n<br><br>\n___________________________________________________<br>\nAuthorized Signature\n<br><br>\n<b>\n*****IMPORTANT NOTE*****\n</b>\n<br><br>\nIf Shipping Address Is Different From The Billing Address Above,\nPlease Fill Out Information Below.\n<br><br>\nShipping Name______________________________________________\n<br><br>\nShipping Address___________________________________________\n<br><br>\n___________________________________________________________<br>\nShipping City,State,Zip\n<br><br>\n___________________________________________________________<br>\nCountry\n<br><br>\n___________________________________________________________<br>\nEmail Address & Phone Number(Please Write Neat)\n</body>\n</html>\n
##   Spam
## 1    1
## 2    1

Create a corpus and document term matrix from the text in each file to be used for classification

# Normalise the encoding first; bytes that cannot be converted are replaced
# by their hex code (sub = "byte") rather than turning the whole string to NA.
data$Text <- iconv(enc2utf8(data$Text), sub = "byte")

# Build the corpus and run the cleaning steps in order: lower-case,
# re-wrap as PlainTextDocument, strip punctuation, drop English stop words,
# and stem.
# NOTE(review): the PlainTextDocument step presumably restores the document
# class after the bare tolower() call -- confirm before reordering.
corpus <- VCorpus(VectorSource(data$Text)) %>%
  tm_map(tolower) %>%
  tm_map(PlainTextDocument) %>%
  tm_map(removePunctuation) %>%
  tm_map(removeWords, stopwords("english")) %>%
  tm_map(stemDocument)

# One row per email, one column per stemmed term.
dtm <- DocumentTermMatrix(corpus)

Remove sparse terms to obtain a more reasonable number of terms to work with for prediction

# Drop terms whose sparsity reaches the 0.95 threshold, keeping only the
# terms that occur in a meaningful share of the emails.
spdtm <- removeSparseTerms(dtm, sparse = 0.95)

Create a dataframe from the document term matrix

# Densify the reduced document-term matrix into a data frame, then rewrite
# the term names into syntactically valid R column names.
emailsSparse <- as.data.frame(as.matrix(spdtm))
names(emailsSparse) <- make.names(names(emailsSparse))

Copy the spam labelling from the original dataset to the emailsSparse dataframe

# Attach the 0/1 class label; rows are in the same order as `data`.
emailsSparse[["Spam"]] <- data[["Spam"]]

Now we split the dataframe into train and test sets

# Reproducible 80/20 train/test split, stratified on the class label.
# The label column is `Spam` (capital S). The previous `emailsSparse$spam`
# did not refer to the label: `$` is case-sensitive, so it resolved either to
# a term-count column for the word "spam" or to NULL, and the split was not
# balanced on the actual classes.
set.seed(777)
spl <- sample.split(emailsSparse$Spam, SplitRatio = 0.8)

# Index rows with the logical mask directly; unlike subset(), this cannot be
# shadowed by a term column that happens to be called `spl`.
train <- emailsSparse[spl, , drop = FALSE]
test <- emailsSparse[!spl, , drop = FALSE]

One model we can build is a logistic regression model for classification:

# Logistic regression of the spam label on every term-count column.
spamLog <- glm(Spam ~ ., family = binomial(), data = train)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

Accuracy

# Predicted spam probabilities (response scale) for the held-out emails.
pred <- predict(spamLog, newdata = test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Confusion matrix: actual label (rows) vs. predicted spam at a 0.5 cut-off.
table(test[["Spam"]], pred > 0.5)
##    
##     FALSE TRUE
##   0   483    8
##   1     3  285
# Compute the accuracy from the predictions instead of hand-copying the
# confusion-matrix counts, which go stale whenever the data, seed, split, or
# threshold changes. A prediction is correct when "probability > 0.5" agrees
# with "label is 1".
cat("The accuracy out of sample is:", mean((pred > 0.5) == (test$Spam == 1)))
## The accuracy out of sample is: 0.9858793