library(tidyverse)
library(tm)
library(tidytext)
library(wordcloud)
# paths to the easy_ham and spam directories on disk
easy_ham <- "/Users/zhianna/Downloads/easy_ham/"
easy_spam <- "/Users/zhianna/Downloads/spam2/"
# list all of the file names in each directory
file <- list.files(easy_ham)
file2 <- list.files(easy_spam)
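# Optional sanity check (a sketch, not part of the original run): confirm both
# directories were found and count the messages in each.
length(file)    # number of ham files
length(file2)   # number of spam files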


# loop over the easy_ham files and read each one into a single character vector
corpus <- c()                      # start empty; seeding with NA would add a spurious "na" token
for (x in file) {
  doc <- paste0(easy_ham, x)       # build the full path to the file
  read <- readLines(doc)           # read the file, one line per element
  corpus <- c(corpus, read)        # append this file's lines to the corpus
}
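# Equivalent vectorized read (a sketch): unlist(lapply()) builds the same vector
# without growing it one file at a time inside a loop.
# corpus <- unlist(lapply(paste0(easy_ham, file), readLines))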

corpus <- data.frame(text=unlist(corpus), stringsAsFactors=FALSE) %>%
  unnest_tokens(word, text)

# Make a vector source and volatile corpus; note that passing the one-column
# data frame means the whole word column becomes a single document
corpus2 <- VectorSource(corpus)
corpus3 <- VCorpus(corpus2)

# Clean the corpus

# function to clean up a corpus
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, removePunctuation)                          # remove punctuation
  corpus <- tm_map(corpus, content_transformer(tolower))               # convert to lowercase
  corpus <- tm_map(corpus, removeWords, words = c(stopwords("en")))    # remove English stopwords
  corpus <- tm_map(corpus, stripWhitespace)                            # collapse extra whitespace
  corpus <- tm_map(corpus, removeNumbers)                              # remove numbers
  return(corpus)
}
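# A minimal sketch of clean_corpus() on a toy corpus; the two strings below are
# made up purely for illustration.
toy <- VCorpus(VectorSource(c("Hello, World 123!", "Remove THE stopwords...")))
toy_clean <- clean_corpus(toy)
content(toy_clean[[1]])   # punctuation, numbers, and stopwords are gone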
# clean up the ham corpus using clean_corpus()
ham_spam <- clean_corpus(corpus3)

# Build a document-term matrix
token <- DocumentTermMatrix(ham_spam)
token
## <<DocumentTermMatrix (documents: 1, terms: 36250)>>
## Non-/sparse entries: 36250/0
## Sparsity           : 0%
## Maximal term length: 76
## Weighting          : term frequency (tf)
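# Optional inspection (a sketch; the cutoff of 100 is arbitrary): list the
# terms that occur at least 100 times in the ham corpus.
findFreqTerms(token, lowfreq = 100)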
# create a data frame of term counts
ham_spam2 <- as.data.frame(as.matrix(token))

ham3 <- pivot_longer(ham_spam2, cols = everything()) %>%
  mutate(ham = "ham")              # label every token as ham
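# Optional sketch: the most frequent ham tokens. pivot_longer() names its
# output columns "name" and "value" by default, so value holds each term's count.
ham3 %>%
  arrange(desc(value)) %>%
  head(10)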

# wordcloud
wordcloud(ham_spam, max.words = 70)
## Warning in wordcloud(ham_spam, max.words = 70): received could not be fit on
## page. It will not be plotted.
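# The warning above means one term ("received") was too long to fit at the
# default font scale. Shrinking the scale usually fixes it (a sketch; the
# values here are arbitrary):
# wordcloud(ham_spam, max.words = 70, scale = c(3, 0.5))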

# loop over the spam files and read each one into a single character vector
spamcorpus <- c()                       # start empty, as with the ham corpus
for (x in file2) {
  doc <- paste0(easy_spam, x)           # build the full path to the file
  read <- readLines(doc)                # read the file, one line per element
  spamcorpus <- c(spamcorpus, read)     # append this file's lines to the corpus
}

spamcorpus <- data.frame(text=unlist(spamcorpus), stringsAsFactors=FALSE) %>%
  unnest_tokens(word, text)

# Make a vector source and volatile corpus (again, the one-column data frame
# becomes a single document)
spamcorpus2 <- VectorSource(spamcorpus)
spamcorpus3 <- VCorpus(spamcorpus2)


# clean up the spam corpus using clean_corpus()
ham_spam30 <- clean_corpus(spamcorpus3)

# Build a document-term matrix
token23 <- DocumentTermMatrix(ham_spam30)
token23

# create a data frame of term counts
spam <- as.data.frame(as.matrix(token23))

spam3 <- pivot_longer(spam, cols = everything()) %>%
  mutate(ham = "spam")             # label every token as spam

# wordcloud
wordcloud(spam3$name, max.words = 70)
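# Alternative sketch: pass explicit frequencies so the cloud is weighted by
# term counts rather than treating each distinct token as appearing once.
# wordcloud(words = spam3$name, freq = spam3$value, max.words = 70)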

# combine the labelled spam and ham tokens into one data frame
together <- rbind(spam3, ham3)

together <- as.data.frame(together) %>%
  select(name, ham)
together$ham<-as.factor(together$ham)
str(together)
# use 70% of the dataset as the training set and the remaining 30% as the test set
set.seed(1)
sample <- sample(c(TRUE, FALSE), nrow(together), replace=TRUE, prob=c(0.7,0.3))

train  <- together[sample, ]
test   <- together[!sample, ]

#view dimensions of training set
dim(train)
## [1] 53003     2
#view dimensions of test set
dim(test)
## [1] 22898     2
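# Optional sketch: confirm the ham/spam balance survived the random split.
prop.table(table(train$ham))
prop.table(table(test$ham))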