## Loading necessary packages.
library(stringi)
library(SnowballC)
library(tidyverse)
library(tidytext)
library(dplyr)
data("stop_words")
library(ggplot2)
library(wordcloud)

Introduction

This project looks at word frequency in the 2022 Clark Journal, in the hope that insight into word frequency would shed light on the journal's main themes.

I omitted five types of article from this analysis: 8. acquisition checklist, 12. visitor information, 13. donor list, 14. tribute gifts, and 15. board of trustees.

I. What does the word frequency look like in each article?

To find out, I wrote four custom functions:

## function: get individual word df
## Reads a text file and returns a tibble with one row per remaining word
## stem: column `line` (index of the source line) and column `word`.
## Steps: transliterate to ASCII, tokenize into words, drop the entries of
## `stop_words`, drop number tokens, then stem with the English stemmer.
##   txtfile: file path (or connection) handed to readLines()
getDfNontrivialWords <- function(txtfile) {
  ## read the text, one element per line
  text <- readLines(txtfile, encoding = "UTF-8", skipNul = TRUE)
  ## convert UTF-8 to ASCII encoding
  text <- stri_trans_general(text, "Latin-ASCII")
  ## into dplyr tibble; seq_along() avoids the c(1, 0) index that
  ## 1:length(text) produces for an empty file
  df <- tibble(line = seq_along(text), text = text)
  ## separate text into individual words
  ## delete trivial stopwords (explicit `by` silences the join message)
  df <- df %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")
  ## remove numbers; a logical mask is used because negative indexing with
  ## an empty grep() result (df[-integer(0), ]) would drop every row
  df <- df[!grepl("\\b\\d+\\b", df$word), ]
  ## stemming (get basic conjugate)
  df <- df %>%
    mutate(word = wordStem(word, language = "en"))

  return(df)
}
## function: get word frequency table
## Counts how often each word stem returned by getDfNontrivialWords()
## occurs and returns the counts (column `n`) in descending order.
##   txtfile: passed through unchanged to getDfNontrivialWords()
getTableWordFreq <- function(txtfile) {
  stems <- getDfNontrivialWords(txtfile)
  ## one row per distinct stem with its count
  stem_counts <- count(stems, word)
  ## most frequent stems first
  arrange(stem_counts, desc(n))
}
## function: barplot of word frequency table
## Builds a horizontal bar chart of the 20 most frequent word stems
## (first 20 rows of the sorted counts) and returns the ggplot object.
##   txtfile: passed through unchanged to getDfNontrivialWords()
getBarplotWordFreq <- function(txtfile) {
  stems <- getDfNontrivialWords(txtfile)

  ## sorted counts; `word` becomes a factor ordered by n so the bars are
  ## drawn in frequency order, then only the first 20 rows are kept
  top_stems <- stems %>%
    count(word, sort = TRUE) %>%
    mutate(word = reorder(word, n)) %>%
    slice(1:20)

  ## two alternating fill colours, repeated so there are always enough values
  bar_fills <- rep(c("darkgoldenrod2", "lightblue"), 999)

  ggplot(top_stems, aes(n, word, fill = word)) +
    geom_bar(stat = "identity") +
    theme(
      panel.background = element_blank(),
      legend.position = "none"
    ) +
    labs(x = "Frequency", y = "Word Stem") +
    scale_fill_manual(values = bar_fills)
}
## function: get wordcloud
## Draws a word cloud of the word-stem frequencies from getTableWordFreq()
## on the current graphics device (at most 80 words, most frequent placed
## first, 9-colour "GnBu" Brewer palette).
##   txtfile: passed through unchanged to getTableWordFreq()
## Called for its plotting side effect; whatever wordcloud() returns is
## handed back invisibly so a top-level call does not auto-print it.
getWordcloud <- function(txtfile) {
  ## local names chosen so they do not shadow palette() or wordcloud()
  cloud_colors <- brewer.pal(9, "GnBu")
  ## fixed seed so the word layout is reproducible
  ## NOTE(review): this resets the session's global RNG state on every call
  set.seed(0)
  freq <- getTableWordFreq(txtfile)
  drawn <- wordcloud(
    freq$word, freq$n,
    random.order = FALSE, min.freq = 1, max.words = 80,
    colors = cloud_colors
  )
  invisible(drawn)
}

1. Director’s text

2. Dec arts interpretation

3. RAP

4. Library

5. Grounds + WTF

6. Musical Performance

9. Upcoming Exhibitions

10. Advancement

II. What does the word frequency look like for the whole journal?

The top-20 word stems, in order of frequency.

## # A tibble: 20 × 2
##    word         n
##    <chr>    <int>
##  1 clark      109
##  2 art        101
##  3 histori     52
##  4 exhibit     46
##  5 program     43
##  6 collect     42
##  7 museum      42
##  8 artist      40
##  9 project     40
## 10 institut    38
## 11 perform     35
## 12 print       34
## 13 centuri     33
## 14 intern      31
## 15 galleri     30
## 16 music       28
## 17 object      27
## 18 digit       20
## 19 draw        20
## 20 printmak    20

Word cloud visualizing the most frequently used word stems across the entire 2022 journal.