# Load necessary libraries
pacman::p_load(pacman, tidytext, dplyr, ggplot2, readr, topicmodels, textdata)
# Read the raw text of the epic. `col_names = FALSE` makes readr treat the
# first line of the file as data rather than as a header row.
gilgamesh_data <- readr::read_csv(file = "Gilgamesh.csv", col_names = FALSE)
## Rows: 4150 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Give the single column a meaningful name for the later tokenizing step
names(gilgamesh_data) <- "text"
# Peek at the first rows of the data
gilgamesh_data %>%
  head()
## # A tibble: 6 × 1
## text
## <chr>
## 1 The Project Gutenberg eBook of An Old Babylonian Version of the Gilgamesh Epic
## 2 This ebook is for the use of anyone anywhere in the United States and
## 3 most other parts of the world at no cost and with almost no restrictions
## 4 whatsoever. You may copy it, give it away or re-use it under the terms
## 5 of the Project Gutenberg License included with this ebook or online
## 6 at www.gutenberg.org. If you are not located in the United States,
# Tokenize the text into words (one row per word)
tokens <- gilgamesh_data %>%
  unnest_tokens(word, text)
# Remove stop words (common words that are usually removed in text analysis).
# The join key is spelled out so the step does not rely on dplyr inferring
# the shared column, and no "Joining with" message is emitted.
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words, by = "word")
# Tally how often each remaining word occurs, most frequent first
word_counts <- count(tokens, word, sort = TRUE)
# Show the 20 most frequent words
word_counts %>%
  head(n = 20)
## # A tibble: 20 × 2
## word n
## <chr> <int>
## 1 enkidu 306
## 2 tablet 271
## 3 line 268
## 4 gilgamesh 244
## 5 version 232
## 6 na 198
## 7 ma 196
## 8 assyrian 191
## 9 lines 166
## 10 sú 149
## 11 sá 130
## 12 ki 126
## 13 gish 119
## 14 ta 105
## 15 si 99
## 16 babylonian 98
## 17 epic 95
## 18 form 91
## 19 ka 91
## 20 erech 90
# Plot the most common words.
# slice_max() replaces the superseded top_n() and names the ranking column
# explicitly instead of letting it default to the last column; words tied
# at the cutoff are still kept. geom_col() is the direct equivalent of
# geom_bar(stat = "identity").
word_counts %>%
  slice_max(order_by = n, n = 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most Common Words in the Old Babylonian Version of the Gilgamesh Epic",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal()

# Sentiment Analysis using Bing Lexicon
# Keep only the tokens that appear in the lexicon and attach their
# sentiment label. The join key is given explicitly rather than inferred.
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"), by = "word")
# Tally the matched words per sentiment label, largest group first
bing_sentiments_count <- count(bing_sentiments, sentiment, sort = TRUE)
# Bar chart of the sentiment tallies, one bar per label
ggplot(
  bing_sentiments_count,
  aes(x = sentiment, y = n, fill = sentiment)
) +
  geom_col() +
  labs(
    title = "Sentiment Analysis of the Old Babylonian Version of the Gilgamesh Epic",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# Sentiment Analysis using NRC Lexicon
# A word can carry several NRC categories and can occur many times in the
# text, so this join is many-to-many by design. Declaring that, together
# with the join key, documents the intent and avoids dplyr's
# "unexpected many-to-many relationship" warning.
nrc_sentiments <- tokens %>%
  inner_join(
    get_sentiments("nrc"),
    by = "word",
    relationship = "many-to-many"
  )
# Tally the matched words per NRC category, largest group first
nrc_sentiments_count <- count(nrc_sentiments, sentiment, sort = TRUE)
# Horizontal bar chart with the categories ordered by their tally
ggplot(
  nrc_sentiments_count,
  aes(x = reorder(sentiment, n), y = n, fill = sentiment)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "NRC Sentiment Analysis of the Old Babylonian Version of the Gilgamesh Epic",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# Prepare data for Topic Modeling
# Create a document-term matrix.
#
# `tokens` holds one row per word, so numbering its rows would turn every
# single word into its own "document" and leave LDA with no word
# co-occurrence to learn from. Document ids are therefore assigned to the
# source lines *before* tokenizing: consecutive lines are grouped into
# fixed-size chunks and each chunk is treated as one document.
#
# NOTE: 50 lines per document is only a starting point -- tune it to the
# text. Topic output recorded from earlier runs (one word per document)
# will not match the output produced after this change.
lines_per_document <- 50L
dtm <- gilgamesh_data %>%
  mutate(document = (row_number() - 1L) %/% lines_per_document + 1L) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word) %>%
  cast_dtm(document, word, n)
# Number of topics the model is asked to find
num_topics <- 4
# Fit the LDA model on the document-term matrix; the seed is fixed in the
# control list so repeated runs start from the same random state
lda_model <- LDA(x = dtm, k = num_topics, control = list(seed = 1234))
# Extract the per-topic term probabilities (beta) in tidy form
lda_terms <- tidy(lda_model, matrix = "beta")
# Keep the ten highest-beta terms of each topic, ordered by topic and then
# by descending beta. slice_max() replaces the superseded top_n(); terms
# tied at the cutoff are still kept.
top_terms <- lda_terms %>%
  group_by(topic) %>%
  slice_max(order_by = beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))
# Print one row per topic with its top terms collapsed into a single string
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 4 × 2
## topic terms
## <int> <chr>
## 1 1 tablet, line, gilgamesh, na, assyrian, version, sú, ta, ma, enkidu
## 2 2 enkidu, gilgamesh, assyrian, line, version, na, sú, lines, ki, ta
## 3 3 tablet, ma, na, line, sá, lines, gish, form, enkidu, babylonian
## 4 4 version, enkidu, ka, ma, lines, si, woman, ki, la, gish
# Faceted bar chart of the top terms, one panel per topic.
# reorder_within() orders the terms separately inside each topic, and
# scale_x_reordered() displays them as plain term labels on the axis.
ggplot(
  data = mutate(top_terms, term = reorder_within(term, beta, topic)),
  mapping = aes(x = term, y = beta, fill = as.factor(topic))
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Top Terms in Each Topic",
    x = "Terms",
    y = "Beta"
  ) +
  scale_x_reordered() +
  theme_minimal()
