Project 4

Load Libraries

library(dplyr)
library(tidyr)
library(purrr)
library(readr)
library(tidytext)
library(stringr)
library(ggplot2)
library(wordcloud)
library(topicmodels)

In this code we are:

  • Setting Spam File folder
  • Extracting Files and lines from folder to a dataframe
  • Filtering the data to remove unwanted text and lines
  • Generating the word list
  • Remove stop words
# Folder holding the spam files.
spfolder <- "./spam_2"

# Read every file in the folder and reshape to one row per line of text,
# keyed by the file's base name.
# tibble() replaces the deprecated data_frame() constructor.
srawdata <- tibble(file = dir(spfolder, full.names = TRUE)) %>%
  mutate(text = map(file, read_lines)) %>%
  transmute(id = basename(file), text) %>%
  unnest(text)

# Within each file, keep only the lines from the first empty line onwards
# and stop at the first line that starts with "--".
clsp <- srawdata %>%
  group_by(id) %>%
  filter(
    cumsum(text == "") > 0,
    cumsum(str_detect(text, "^--")) == 0
  ) %>%
  ungroup()

# Within each file, drop everything from the first line containing "<"
# onwards, then split the remaining text into one word per row.
# The running count has to be grouped by file: on the ungrouped table a
# single "<" would discard every later row, including all later files.
stf_idf <- clsp %>%
  group_by(id) %>%
  filter(cumsum(str_detect(text, "<")) == 0) %>%
  ungroup() %>%
  unnest_tokens(word, text)

# Remove stop words and count how often each remaining word occurs,
# most frequent first. The join key is spelled out instead of being guessed
# from the shared column names.
stf_idf <- stf_idf %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

GGPLOT

  • List the 25 highest word counts
  • Only words with fewer than 20 characters
# Horizontal bar chart of the highest-count spam words, restricted to words
# shorter than 20 characters and ordered by count.
stf_idf %>%
  filter(nchar(word) < 20) %>%
  # No weighting column is given, so top_n() ranks by the last column (n);
  # ties with the 25th value are kept.
  top_n(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL)

Word cloud of the spam words that occur at least once

# Word cloud of the spam vocabulary, sized by each word's count.
# The counts must be passed as `freq`: when `freq` is omitted, wordcloud()
# builds a corpus from the words (which needs the tm package) and counts
# occurrences itself. Each word appears exactly once in stf_idf$word, so
# the counts in stf_idf$n would be ignored.
wordcloud(stf_idf$word, stf_idf$n, min.freq = 1)

In this code we are:

  • Setting HAM File folder
  • Extracting Files and lines from folder to a dataframe
  • Filtering the data to remove unwanted text and lines
  • Generating the word list
  • Remove stop words
# Folder holding the ham files.
hfolder <- "./easy_ham_2"

# Read every file in the folder and reshape to one row per line of text,
# keyed by the file's base name.
# tibble() replaces the deprecated data_frame() constructor.
rawdata <- tibble(file = dir(hfolder, full.names = TRUE)) %>%
  mutate(text = map(file, read_lines)) %>%
  transmute(id = basename(file), text) %>%
  unnest(text)

# Within each file, keep only the lines from the first empty line onwards
# and stop at the first line that starts with "--".
clh <- rawdata %>%
  group_by(id) %>%
  filter(
    cumsum(text == "") > 0,
    cumsum(str_detect(text, "^--")) == 0
  ) %>%
  ungroup()

# Split the text into one word per row and keep only tokens whose last
# character is a lower-case letter or an apostrophe.
htf_idf <- clh %>%
  unnest_tokens(word, text) %>%
  filter(str_detect(word, "[a-z']$"))

# Remove stop words and count how often each remaining word occurs,
# most frequent first. The join key is spelled out instead of being guessed
# from the shared column names.
htf_idf <- htf_idf %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

GGPLOT

  • Top 25 words among those that occur more than 100 times
# Horizontal bar chart of the highest-count ham words, restricted to words
# counted more than 100 times and ordered by count.
htf_idf %>%
  filter(n > 100) %>%
  # No weighting column is given, so top_n() ranks by the last column (n);
  # ties with the 25th value are kept.
  top_n(25) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = NULL)

Word cloud showing at most 400 words

# Word cloud of the ham vocabulary, sized by each word's count and limited
# to 400 words.
# The counts must be passed as `freq`: when `freq` is omitted, wordcloud()
# builds a corpus from the words (which needs the tm package) and counts
# occurrences itself. Each word appears exactly once in htf_idf$word, so
# the counts in htf_idf$n would be ignored.
# `max.words` is written out in full instead of relying on partial matching
# of the abbreviated `max`.
wordcloud(htf_idf$word, htf_idf$n, max.words = 400)

Extracting lines from the ham emails that contain spam words

# Work on a copy of the ham lines.
df <- rawdata

# Flag every line that contains at least one of the spam words.
# The match is accumulated across all words: assigning df$spam afresh on
# each pass of the loop would keep only the result for the last word.
# fixed = TRUE matches each word literally, so characters such as "." inside
# a token are not interpreted as regular-expression syntax.
has_spam_word <- logical(nrow(df))
for (spam_word in stf_idf$word) {
  has_spam_word <- has_spam_word | grepl(spam_word, df$text, fixed = TRUE)
}
df$spam <- ifelse(has_spam_word, "y", "n")

# Keep only the flagged lines and print them.
sdf <- df %>%
  filter(spam == "y")
sdf
## # A tibble: 88 x 3
##    id                                     text                       spam 
##    <chr>                                  <chr>                      <chr>
##  1 00087.809e03adf935435f9e493a3ffdfd9e85 I would also like, if at ~ y    
##  2 00285.49d8664ce245cb396687a3f303ad124c explaining that concept, ~ y    
##  3 00299.f5ee5d9a3056c28135db57935818e138 we funded which has now e~ y    
##  4 00301.bd38f1d07527919c3c177e564ee7c908 | we funded which has now~ y    
##  5 00384.26101c9502879b02e44058519cc52b8d "I think the words you ar~ y    
##  6 00421.eb3eb924262149264f92d8dc3db00db1 > > Cheers All for your w~ y    
##  7 00422.41c3bd638e1a077ba0c692579417f299 >>Cheers All for your wor~ y    
##  8 00426.6cd93f7b4c74456e414f1f01fec6b05f > >>Cheers All for your w~ y    
##  9 00511.9de62092d57725e40cb59410c9abfe79 Passwords for jm-webdev@j~ y    
## 10 00525.b4f3489039137593e0afc1db9ba466cb From owner-worldwidewords~ y    
## # ... with 78 more rows