Project4

2024-12-02

library(tidyverse)
library(tidytext)
library(knitr)
library(readtext)
library(dplyr)
library(stringr)
library(rvest)
library(tm)
library(e1071)
# Path to the local clone of the GitHub repo -- replace with your own location.
repofolder <- "/Users/alliewrubel/Documents/GitHub/Datal607_Project4"

# file.path() inserts the separator itself, so the listed file paths do not
# end up with a doubled "/" between folder and file name.
ham_folder <- file.path(repofolder, "easy_ham")
spam_folder <- file.path(repofolder, "spam")

# Fail fast with a clear message: list.files() on a missing folder silently
# returns character(0), which would only surface later as an unrelated error.
for (folder in c(ham_folder, spam_folder)) {
  if (!dir.exists(folder)) {
    stop("Email folder not found: ", folder, call. = FALSE)
  }
}

# Full paths of every file in the ham and spam folders, respectively.
ham_files <- list.files(ham_folder, full.names = TRUE)
spam_files <- list.files(spam_folder, full.names = TRUE)
#' Read email files into a labelled data frame
#'
#' Each file is read with `readLines()`. Everything up to and including the
#' first empty line is discarded (the first blank line is taken to mark the
#' start of the email body) and the remaining lines are joined with single
#' spaces.
#'
#' @param file_paths Character vector of paths to email files.
#' @param label Value stored in the `spam` column for every row.
#' @return A data frame with one row per file and two columns: `text`
#'   (character; `""` when the blank line is the last line of the file,
#'   `NA` when the file contains no blank line at all) and `spam`.
read_emails_to_df <- function(file_paths, label) {
  extract_body <- function(file) {
    email_content <- readLines(file, warn = FALSE)

    blank_lines <- which(email_content == "")
    if (length(blank_lines) == 0) {
      return(NA_character_)  # No blank separator line, so no body found
    }

    # Negative indexing drops the header lines and the separator. Unlike
    # `start:length(x)`, it yields an empty vector (not a reversed range)
    # when the separator is the last line of the file.
    email_body <- email_content[-seq_len(blank_lines[1])]
    paste(email_body, collapse = " ")
  }

  # vapply() guarantees a character vector even for zero files;
  # USE.NAMES = FALSE keeps file paths from becoming data frame row names.
  email_text <- vapply(file_paths, extract_body, character(1), USE.NAMES = FALSE)

  data.frame(
    text = email_text,
    spam = rep(label, length.out = length(email_text)),
    stringsAsFactors = FALSE
  )
}

# Build one labelled data frame per folder (the spam column is 0 for the ham
# folder and 1 for the spam folder), then stack them into one raw dataset.
ham_df <- read_emails_to_df(file_paths = ham_files, label = 0)
spam_df <- read_emails_to_df(file_paths = spam_files, label = 1)
email_data_raw <- rbind(ham_df, spam_df)

# Preview the first five rows of the combined data.
email_data_raw %>%
  head(5) %>%
  kable(caption = "Raw Email Data")
#' Clean email text before modelling
#'
#' Collapses whitespace, replaces URLs and standalone numbers with placeholder
#' tokens, removes a header label left at the very start of the text, strips
#' characters that are not alphanumeric, punctuation or whitespace, and
#' lower-cases the result.
#'
#' @param email_text Character vector of email bodies.
#' @return Character vector of cleaned text, the same length as `email_text`.
clean_text <- function(email_text) {
  email_text <- str_squish(email_text)
  email_text <- str_replace_all(email_text, "http\\S+|www\\S+", "<URL>")

  # Header names that may be left over at the start of the text. The X-* and
  # Content-* wildcards stop at whitespace or ":" so they match one header
  # name only; an unbounded ".*" here would run to the LAST colon in the
  # email and delete almost the whole body.
  header_names <- c(
    "From", "To", "Subject", "Date", "Received", "Return-Path",
    "Delivered-To", "Message-ID", "X-[^\\s:]*", "Content-[^\\s:]*",
    "Mime-Version", "Thread-Index", "Precedence", "List-Id", "Errors-To"
  )
  header_pattern <- paste0("(?i)^(", paste(header_names, collapse = "|"), "):")

  # The text is a single squished line, so "^" can only match once at the
  # start and only the leading "Name:" label is removed.
  email_text <- str_remove(email_text, header_pattern)

  # NOTE(review): the rendered sample output shows the URL placeholder as a
  # bare "url", which suggests this step strips "<" and ">" from "<URL>"
  # (stringr's regex engine appears not to count them as punctuation),
  # while "<NUM>" below is inserted afterwards and keeps them -- confirm.
  email_text <- str_remove_all(email_text, "[^[:alnum:][:punct:]\\s]")

  email_text <- str_replace_all(email_text, "\\b\\d+\\b", "<NUM>")

  email_text <- tolower(email_text)

  str_squish(email_text)
}

# Normalise the encoding to UTF-8 BEFORE cleaning so the regex-based cleaning
# (and tm afterwards) work on valid strings, then derive the cleaned text and
# a unique doc_id per email. doc_id is what keeps labels and DTM rows aligned.
email_data_cleaned <- email_data_raw %>%
  mutate(
    text = iconv(text, from = "latin1", to = "UTF-8", sub = ""),
    cleaned_text = clean_text(text),
    doc_id = row_number()
  )

kable(head(email_data_cleaned, 5), caption = "Cleaned Email Data")

# Create the corpus from the CLEANED text (previously the raw text column was
# used, so the output of clean_text() never reached the model), then apply
# tm's own preprocessing on top of it.
corpus <- VCorpus(VectorSource(email_data_cleaned$cleaned_text))
names(corpus) <- email_data_cleaned$doc_id
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, stripWhitespace)

# Create Document-Term Matrix
dtm <- DocumentTermMatrix(corpus)

# Reduce sparsity: drop terms whose sparsity exceeds the 0.90 threshold
dtm_reduced <- removeSparseTerms(dtm, 0.90)
dtm_matrix <- as.matrix(dtm_reduced)

head(dtm_matrix)
##     Terms
## Docs also can date dont email even find first free get good group information
##    1    0   0    1    0     0    0    0     0    0   1    0     0           0
##    2    0   0    0    0     1    0    0     0    1   0    0     1           0
##    3    0   0    0    0     1    0    0     0    1   1    0     1           0
##    4    0   1    0    0     1    0    0     0    0   0    0     0           0
##    5    0   0    0    0     1    0    0     0    1   0    0     1           0
##    6    0   0    0    0     1    0    0     0    1   0    0     1           0
##     Terms
## Docs ive just know like list mailing make many may message much need new now
##    1   0    0    0    1    5       1    0    0   0       0    0    0   0   0
##    2   0    0    0    0    0       0    0    0   0       0    0    0   0   1
##    3   0    0    0    0    0       0    0    0   0       0    0    0   0   1
##    4   0    0    0    0    1       1    0    0   1       0    0    0   2   0
##    5   2    1    0    0    0       0    1    0   0       0    0    0   0   1
##    6   0    2    0    0    0       0    1    0   0       0    0    0   0   1
##     Terms
## Docs one people please really right said see send sep since something still
##    1   2      0      0      0     0    0   0    0   0     1         0     1
##    2   0      0      0      0     0    0   0    1   0     0         0     0
##    3   2      0      0      0     0    5   0    1   0     0         0     0
##    4   1      0      0      0     0    0   0    0   0     0         0     0
##    5   1      0      0      0     0    0   0    1   0     0         0     0
##    6   1      0      0      1     0    0   0    1   0     0         0     0
##     Terms
## Docs subject take thats think time two url use using want way well will work
##    1       3    0     2     1    1   0   0   0     2    0   0    0    0    0
##    2       1    0     0     0    0   0   0   1     0    0   0    1    0    0
##    3       1    0     0     0    0   0   0   1     0    0   0    0    0    0
##    4       0    0     0     0    1   0   0   1     0    0   0    0    0    0
##    5       1    0     0     0    1   0   0   3     0    0   0    0    0    0
##    6       1    0     1     0    0   0   0   2     0    0   0    0    0    0
##     Terms
## Docs wrote
##    1     0
##    2     0
##    3     0
##    4     0
##    5     0
##    6     0
# Align labels with the DTM rows by doc_id. match() returns labels in the
# order of the matrix rows, so the pairing stays correct even if rows were
# reordered or documents dropped upstream (filter/%in% only tests membership).
label_rows <- match(
  rownames(dtm_matrix),
  as.character(email_data_cleaned$doc_id)
)
stopifnot(!anyNA(label_rows))
labels <- email_data_cleaned$spam[label_rows]

set.seed(123)  # for reproducibility

# Create training and testing indices: 80% training, 20% testing.
# seq_len() is safe for an empty matrix, and floor() makes the integer
# sample size explicit instead of relying on silent truncation.
n_docs <- nrow(dtm_matrix)
train_indices <- sample(seq_len(n_docs), floor(0.8 * n_docs))
test_indices <- setdiff(seq_len(n_docs), train_indices)

# Split the data; drop = FALSE keeps a matrix even when a split has one row
train_data <- dtm_matrix[train_indices, , drop = FALSE]
test_data <- dtm_matrix[test_indices, , drop = FALSE]
train_labels <- labels[train_indices]
test_labels <- labels[test_indices]

# Train Naive Bayes classifier
nb_model <- naiveBayes(train_data, as.factor(train_labels))

# Make predictions using test_data
nb_predictions <- predict(nb_model, test_data)

# Percentage of test emails whose predicted label equals the true label.
# Both sides are compared as character so the factor-vs-numeric comparison
# is explicit rather than implicit.
n_correct <- sum(as.character(nb_predictions) == as.character(test_labels))
accuracy <- 100 * (n_correct / length(test_labels))
print(paste("Model Accuracy:", accuracy, "%"))
## [1] "Model Accuracy: 85.5241264559068 %"

text	spam
Date: Wed, 21 Aug 2002 10:54:46 -0500 From: Chris Garrigues cwg-dated-1030377287.06fa6d@DeepEddy.Com Message-ID: 1029945287.4797.TMDA@deepeddy.vircio.com \| I can’t reproduce this error. For me it is very repeatable… (like every time, without fail). This is the debug log of the pick happening … 18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury} 18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury 18:19:04 Ftoc_PickMsgs {{1 hit}} 18:19:04 Marking 1 hits 18:19:04 tkerror: syntax error in expression “int … Note, if I run the pick command by hand … delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury 1 hit That’s where the”1 hit” comes from (obviously). The version of nmh I’m using is … delta$ pick -version pick – nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 ICT 2002] And the relevant part of my .mh_profile … delta$ mhparam pick -seq sel -list Since the pick command works, the sequence (actually, both of them, the one that’s explicit on the command line, from the search popup, and the one that comes from .mh_profile) do get created. kre ps: this is still using the version of the code form a day ago, I haven’t been able to reach the cvs repository today (local routing issue I think). _______________________________________________ Exmh-workers mailing list Exmh-workers@redhat.com https://listman.redhat.com/mailman/listinfo/exmh-workers	0
Martin A posted: Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the Mount Athos monastic community, was ideal for the patriotic sculpture. As well as Alexander’s granite features, 240 ft high and 170 ft wide, a museum, a restored amphitheatre and car park for admiring crowds are planned ——————— So is this mountain limestone or granite? If it’s limestone, it’ll weather pretty fast. ———————— Yahoo! Groups Sponsor ———————~–> 4 DVDs Free +s&p Join Now http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM ———————————————————————~-> To unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/	0
Man Threatens Explosion In Moscow Thursday August 22, 2002 1:40 PM MOSCOW (AP) - Security officers on Thursday seized an unidentified man who said he was armed with explosives and threatened to blow up his truck in front of Russia’s Federal Security Services headquarters in Moscow, NTV television reported. The officers seized an automatic rifle the man was carrying, then the man got out of the truck and was taken into custody, NTV said. No other details were immediately available. The man had demanded talks with high government officials, the Interfax and ITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to talk with Russian President Vladimir Putin. Police and security forces rushed to the Security Service building, within blocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the man, who claimed to have one and a half tons of explosives, the news agencies said. Negotiations continued for about one and a half hours outside the building, ITAR-Tass and Interfax reported, citing witnesses. The man later drove away from the building, under police escort, and drove to a street near Moscow’s Olympic Penta Hotel, where authorities held further negotiations with him, the Moscow police press service said. The move appeared to be an attempt by security services to get him to a more secure location. ———————— Yahoo! Groups Sponsor ———————~–> 4 DVDs Free +s&p Join Now http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM ———————————————————————~-> To unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/	0
Klez: The Virus That Won’t Die Already the most prolific virus ever, Klez continues to wreak havoc. Andrew Brandt >>From the September 2002 issue of PC World magazine Posted Thursday, August 01, 2002 The Klez worm is approaching its seventh month of wriggling across the Web, making it one of the most persistent viruses ever. And experts warn that it may be a harbinger of new viruses that use a combination of pernicious approaches to go from PC to PC. Antivirus software makers Symantec and McAfee both report more than 2000 new infections daily, with no sign of letup at press time. The British security firm MessageLabs estimates that 1 in every 300 e-mail messages holds a variation of the Klez virus, and says that Klez has already surpassed last summer’s SirCam as the most prolific virus ever. And some newer Klez variants aren’t merely nuisances–they can carry other viruses in them that corrupt your data. … http://www.pcworld.com/news/article/0,aid,103259,00.asp _______________________________________________ Irregulars mailing list Irregulars@tb.tf http://tb.tf/mailman/listinfo/irregulars	0
> in adding cream to spaghetti carbonara, which has the same effect on pasta as > making a pizza a deep-pie; I just had to jump in here as Carbonara is one of my favourites to make and ask what the hell are you supposed to use instead of cream? I’ve never seen a recipe that hasn’t used this. Personally I use low fat creme fraiche because it works quite nicely but the only time I’ve seen an supposedly authentic recipe for carbonara it was identical to mine (cream, eggs and lots of fresh parmesan) except for the creme fraiche. Stew – Stewart Smith Scottish Microelectronics Centre, University of Edinburgh. http://www.ee.ed.ac.uk/~sxs/ ———————— Yahoo! Groups Sponsor ———————~–> 4 DVDs Free +s&p Join Now http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM ———————————————————————~-> To unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/	0

text	cleaned_text	doc_id
Date: Wed, 21 Aug 2002 10:54:46 -0500 From: Chris Garrigues cwg-dated-1030377287.06fa6d@DeepEddy.Com Message-ID: 1029945287.4797.TMDA@deepeddy.vircio.com \| I can’t reproduce this error. For me it is very repeatable… (like every time, without fail). This is the debug log of the pick happening … 18:19:03 Pick_It {exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {4852-4852 -sequence mercury} 18:19:03 exec pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury 18:19:04 Ftoc_PickMsgs {{1 hit}} 18:19:04 Marking 1 hits 18:19:04 tkerror: syntax error in expression “int … Note, if I run the pick command by hand … delta$ pick +inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace 4852-4852 -sequence mercury 1 hit That’s where the”1 hit” comes from (obviously). The version of nmh I’m using is … delta$ pick -version pick – nmh-1.0.4 [compiled on fuchsia.cs.mu.OZ.AU at Sun Mar 17 14:55:56 ICT 2002] And the relevant part of my .mh_profile … delta$ mhparam pick -seq sel -list Since the pick command works, the sequence (actually, both of them, the one that’s explicit on the command line, from the search popup, and the one that comes from .mh_profile) do get created. kre ps: this is still using the version of the code form a day ago, I haven’t been able to reach the cvs repository today (local routing issue I think). _______________________________________________ Exmh-workers mailing list Exmh-workers@redhat.com https://listman.redhat.com/mailman/listinfo/exmh-workers	wed, aug :: - from: chris garrigues cwg-dated-.06fa6d@deepeddy.com message-id: ..tmda@deepeddy.vircio.com i can’t reproduce this error. for me it is very repeatable… (like every time, without fail). 
this is the debug log of the pick happening … :: pick_it {exec pick inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace} {- -sequence mercury} :: exec pick inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace - -sequence mercury :: ftoc_pickmsgs {{ hit}} :: marking hits :: tkerror: syntax error in expression “int … note, if i run the pick command by hand … delta pick inbox -list -lbrace -lbrace -subject ftp -rbrace -rbrace - -sequence mercury hit that’s where the” hit” comes from (obviously). the version of nmh i’m using is … delta pick -version pick – nmh-.. [compiled on fuchsia.cs.mu.oz.au at sun mar :: ict ] and the relevant part of my .mh_profile … delta mhparam pick -seq sel -list since the pick command works, the sequence (actually, both of them, the one that’s explicit on the command line, from the search popup, and the one that comes from .mh_profile) do get created. kre ps: this is still using the version of the code form a day ago, i haven’t been able to reach the cvs repository today (local routing issue i think). _______________________________________________ exmh-workers mailing list exmh-workers@redhat.com url	1
Martin A posted: Tassos Papadopoulos, the Greek sculptor behind the plan, judged that the limestone of Mount Kerdylio, 70 miles east of Salonika and not far from the Mount Athos monastic community, was ideal for the patriotic sculpture. As well as Alexander’s granite features, 240 ft high and 170 ft wide, a museum, a restored amphitheatre and car park for admiring crowds are planned ——————— So is this mountain limestone or granite? If it’s limestone, it’ll weather pretty fast. ———————— Yahoo! Groups Sponsor ———————~–> 4 DVDs Free +s&p Join Now http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM ———————————————————————~-> To unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/	martin a posted: tassos papadopoulos, the greek sculptor behind the plan, judged that the limestone of mount kerdylio, miles east of salonika and not far from the mount athos monastic community, was ideal for the patriotic sculpture. as well as alexander’s granite features, ft high and ft wide, a museum, a restored amphitheatre and car park for admiring crowds are planned ——————— so is this mountain limestone or granite? if it’s limestone, it’ll weather pretty fast. ———————— yahoo! groups sponsor ———————– dvds free s&p join now url ———————————————————————- to unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com your use of yahoo! groups is subject to url	2
Man Threatens Explosion In Moscow Thursday August 22, 2002 1:40 PM MOSCOW (AP) - Security officers on Thursday seized an unidentified man who said he was armed with explosives and threatened to blow up his truck in front of Russia’s Federal Security Services headquarters in Moscow, NTV television reported. The officers seized an automatic rifle the man was carrying, then the man got out of the truck and was taken into custody, NTV said. No other details were immediately available. The man had demanded talks with high government officials, the Interfax and ITAR-Tass news agencies said. Ekho Moskvy radio reported that he wanted to talk with Russian President Vladimir Putin. Police and security forces rushed to the Security Service building, within blocks of the Kremlin, Red Square and the Bolshoi Ballet, and surrounded the man, who claimed to have one and a half tons of explosives, the news agencies said. Negotiations continued for about one and a half hours outside the building, ITAR-Tass and Interfax reported, citing witnesses. The man later drove away from the building, under police escort, and drove to a street near Moscow’s Olympic Penta Hotel, where authorities held further negotiations with him, the Moscow police press service said. The move appeared to be an attempt by security services to get him to a more secure location. ———————— Yahoo! Groups Sponsor ———————~–> 4 DVDs Free +s&p Join Now http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM ———————————————————————~-> To unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/	man threatens explosion in moscow thursday august , : pm moscow (ap) - security officers on thursday seized an unidentified man who said he was armed with explosives and threatened to blow up his truck in front of russia’s federal security services headquarters in moscow, ntv television reported. 
the officers seized an automatic rifle the man was carrying, then the man got out of the truck and was taken into custody, ntv said. no other details were immediately available. the man had demanded talks with high government officials, the interfax and itar-tass news agencies said. ekho moskvy radio reported that he wanted to talk with russian president vladimir putin. police and security forces rushed to the security service building, within blocks of the kremlin, red square and the bolshoi ballet, and surrounded the man, who claimed to have one and a half tons of explosives, the news agencies said. negotiations continued for about one and a half hours outside the building, itar-tass and interfax reported, citing witnesses. the man later drove away from the building, under police escort, and drove to a street near moscow’s olympic penta hotel, where authorities held further negotiations with him, the moscow police press service said. the move appeared to be an attempt by security services to get him to a more secure location. ———————— yahoo! groups sponsor ———————– dvds free s&p join now url ———————————————————————- to unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com your use of yahoo! groups is subject to url	3
Klez: The Virus That Won’t Die Already the most prolific virus ever, Klez continues to wreak havoc. Andrew Brandt >>From the September 2002 issue of PC World magazine Posted Thursday, August 01, 2002 The Klez worm is approaching its seventh month of wriggling across the Web, making it one of the most persistent viruses ever. And experts warn that it may be a harbinger of new viruses that use a combination of pernicious approaches to go from PC to PC. Antivirus software makers Symantec and McAfee both report more than 2000 new infections daily, with no sign of letup at press time. The British security firm MessageLabs estimates that 1 in every 300 e-mail messages holds a variation of the Klez virus, and says that Klez has already surpassed last summer’s SirCam as the most prolific virus ever. And some newer Klez variants aren’t merely nuisances–they can carry other viruses in them that corrupt your data. … http://www.pcworld.com/news/article/0,aid,103259,00.asp _______________________________________________ Irregulars mailing list Irregulars@tb.tf http://tb.tf/mailman/listinfo/irregulars	klez: the virus that won’t die already the most prolific virus ever, klez continues to wreak havoc. andrew brandt from the september issue of pc world magazine posted thursday, august , the klez worm is approaching its seventh month of wriggling across the web, making it one of the most persistent viruses ever. and experts warn that it may be a harbinger of new viruses that use a combination of pernicious approaches to go from pc to pc. antivirus software makers symantec and mcafee both report more than new infections daily, with no sign of letup at press time. the british security firm messagelabs estimates that in every e-mail messages holds a variation of the klez virus, and says that klez has already surpassed last summer’s sircam as the most prolific virus ever. and some newer klez variants aren’t merely nuisances–they can carry other viruses in them that corrupt your data. 
… url _______________________________________________ irregulars mailing list irregulars@tb.tf url	4
> in adding cream to spaghetti carbonara, which has the same effect on pasta as > making a pizza a deep-pie; I just had to jump in here as Carbonara is one of my favourites to make and ask what the hell are you supposed to use instead of cream? I’ve never seen a recipe that hasn’t used this. Personally I use low fat creme fraiche because it works quite nicely but the only time I’ve seen an supposedly authentic recipe for carbonara it was identical to mine (cream, eggs and lots of fresh parmesan) except for the creme fraiche. Stew – Stewart Smith Scottish Microelectronics Centre, University of Edinburgh. http://www.ee.ed.ac.uk/~sxs/ ———————— Yahoo! Groups Sponsor ———————~–> 4 DVDs Free +s&p Join Now http://us.click.yahoo.com/pt6YBB/NXiEAA/mG3HAA/7gSolB/TM ———————————————————————~-> To unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com Your use of Yahoo! Groups is subject to http://docs.yahoo.com/info/terms/	in adding cream to spaghetti carbonara, which has the same effect on pasta as making a pizza a deep-pie; i just had to jump in here as carbonara is one of my favourites to make and ask what the hell are you supposed to use instead of cream? i’ve never seen a recipe that hasn’t used this. personally i use low fat creme fraiche because it works quite nicely but the only time i’ve seen an supposedly authentic recipe for carbonara it was identical to mine (cream, eggs and lots of fresh parmesan) except for the creme fraiche. stew – stewart smith scottish microelectronics centre, university of edinburgh. url ———————— yahoo! groups sponsor ———————– dvds free s&p join now url ———————————————————————- to unsubscribe from this group, send an email to: forteana-unsubscribe@egroups.com your use of yahoo! groups is subject to url	5

Project4

aw

2024-12-02

Project 4 - Text Classification

Loading and Cleaning Data

Analyzing Data Structure

Cleaning Raw Data

Natural Language Processing

Pre-processing for Text Mining

Creating a Corpus

Train-Test Split

Model Training and Evaluation

Conclusion