## Loading necessary packages.
library(stringi)
library(SnowballC)
library(tidyverse)
library(tidytext)
library(dplyr)
data("stop_words")
library(ggplot2)
library(wordcloud)
This project examines word frequency in the 2022 Clark Journal, in the hope that insight into word frequency will:
- consolidate a word bank for museum writers
- provide keywords of institutional identity
- reflect broader lexical trends (the presence AND absence of certain words) in the industry
I omitted five types of article in this analysis: 8. acquisition checklist, 12. visitor information, 13. donor list, 14. tribute gifts, 15. board of trustees.
To carry out this analysis, I wrote four custom functions:
## function: get individual word df
##
## Reads a text file and returns a tibble with one row per remaining word:
##   line - index of the line (as returned by readLines()) the word came from
##   word - the word after stop-word removal, number removal and stemming
##
## txtfile: file path (or connection) handed to readLines().
getDfNontrivialWords <- function(txtfile) {
  ## read the text, one vector element per line
  text <- readLines(txtfile, encoding = "UTF-8", skipNul = TRUE)
  ## convert UTF-8 to ASCII encoding
  text <- stri_trans_general(text, "Latin-ASCII")
  ## into dplyr tibble; seq_along() avoids the bogus c(1, 0) sequence that
  ## 1:length(text) produces when the file has no lines
  df <- tibble(line = seq_along(text), text = text)

  df %>%
    ## separate text into individual words
    unnest_tokens(word, text) %>%
    ## delete trivial stopwords; `word` is the only shared column, naming it
    ## explicitly silences the "Joining with ..." message
    anti_join(stop_words, by = "word") %>%
    ## remove numbers. A logical filter is used instead of df[-grep(...), ]:
    ## when nothing matches, grep() returns integer(0) and negative indexing
    ## with it would drop EVERY row instead of none.
    filter(!grepl("\\b\\d+\\b", word)) %>%
    ## stemming (get basic conjugate); replaces the deprecated
    ## mutate_at(..., funs(...)) form
    mutate(word = wordStem(word, language = "en"))
}
## function: get word frequency table
##
## Counts how often each word returned by getDfNontrivialWords() occurs in
## `txtfile` and returns the counts (columns `word`, `n`) sorted by
## decreasing `n`.
getTableWordFreq <- function(txtfile) {
  stems <- getDfNontrivialWords(txtfile)
  ## one row per distinct word with its count in column `n`
  tally_tbl <- count(stems, word)
  ## most frequent words first
  arrange(tally_tbl, desc(n))
}
## function: barplot of word frequency table
##
## Returns a ggplot object: a bar chart with count on the x axis and word on
## the y axis for the first 20 rows of the frequency-sorted word counts of
## `txtfile`.
getBarplotWordFreq <- function(txtfile) {
  stems <- getDfNontrivialWords(txtfile)

  ## count words (most frequent first), make `word` a factor ordered by its
  ## count so the bars are drawn in frequency order, then keep rows 1 to 20
  ranked <- count(stems, word, sort = TRUE)
  ranked <- mutate(ranked, word = reorder(word, n))
  top_rows <- slice(ranked, 1:20)

  ## two alternating fill colours, repeated so the manual scale has more
  ## values than bars
  bar_fills <- rep(c("darkgoldenrod2", "lightblue"), times = 999)

  ggplot(top_rows, aes(n, word, fill = word)) +
    geom_bar(stat = "identity") +
    theme(
      panel.background = element_blank(),
      legend.position = "none"
    ) +
    labs(x = "Frequency", y = "Word Stem") +
    scale_fill_manual(values = bar_fills)
}
## function: get wordcloud
##
## Draws a word cloud of the word frequencies of `txtfile` (at most 80 words,
## minimum frequency 1, not in random order) using the 9-colour "GnBu"
## palette, and returns whatever wordcloud() returns.
## NOTE(review): wordcloud() appears to be called for its plotting side
## effect; confirm whether callers rely on the returned value.
getWordcloud <- function(txtfile) {
  cloud_colors <- brewer.pal(9, "GnBu")
  ## fixed RNG seed before drawing, presumably so the layout is reproducible
  set.seed(0)
  freq_tbl <- getTableWordFreq(txtfile)
  cloud <- wordcloud(
    freq_tbl$word,
    freq_tbl$n,
    random.order = FALSE,
    min.freq = 1,
    max.words = 80,
    colors = cloud_colors
  )
  return(cloud)
}
The top-20 word stems, in order of frequency.
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 clark 109
## 2 art 101
## 3 histori 52
## 4 exhibit 46
## 5 program 43
## 6 collect 42
## 7 museum 42
## 8 artist 40
## 9 project 40
## 10 institut 38
## 11 perform 35
## 12 print 34
## 13 centuri 33
## 14 intern 31
## 15 galleri 30
## 16 music 28
## 17 object 27
## 18 digit 20
## 19 draw 20
## 20 printmak 20
Word cloud, visualizing the frequently-used word stems throughout the entire 2022 journal.