# Load necessary libraries
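# If pacman is not installed yet, this one-time install (an assumed setup step) makes p_load available
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")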
pacman::p_load(tidytext, dplyr, ggplot2, readr, topicmodels, textdata)

# Load the CSV file
gilgamesh_data <- read_csv("Gilgamesh.csv", col_names = FALSE)
## Rows: 4150 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(gilgamesh_data) <- c("text")

# Display the first few rows of the data
head(gilgamesh_data)
## # A tibble: 6 × 1
##   text                                                                          
##   <chr>                                                                         
## 1 The Project Gutenberg eBook of An Old Babylonian Version of the Gilgamesh Epic
## 2 This ebook is for the use of anyone anywhere in the United States and         
## 3 most other parts of the world at no cost and with almost no restrictions      
## 4 whatsoever. You may copy it, give it away or re-use it under the terms        
## 5 of the Project Gutenberg License included with this ebook or online           
## 6 at www.gutenberg.org. If you are not located in the United States,
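
# The head() output above shows that Project Gutenberg front matter is still in the data.
# A minimal sketch for trimming it into a separate object, assuming the standard
# "*** START ... ***" and "*** END ... ***" marker lines are present in this file:
pg_start <- which(grepl("\\*\\*\\* START", gilgamesh_data$text))[1]
pg_end   <- which(grepl("\\*\\*\\* END", gilgamesh_data$text))[1]
if (!is.na(pg_start) && !is.na(pg_end)) {
  gilgamesh_body <- gilgamesh_data %>%
    slice((pg_start + 1):(pg_end - 1))
}
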
# Tokenize the text into words
tokens <- gilgamesh_data %>%
  mutate(line = row_number()) %>% # keep the source line number; it serves as the document id for topic modeling below
  unnest_tokens(word, text)

# Remove stop words (common words that are usually removed in text analysis)
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Display the most common words
head(word_counts, 20)
## # A tibble: 20 × 2
##    word           n
##    <chr>      <int>
##  1 enkidu       306
##  2 tablet       271
##  3 line         268
##  4 gilgamesh    244
##  5 version      232
##  6 na           198
##  7 ma           196
##  8 assyrian     191
##  9 lines        166
## 10 sú           149
## 11 sá           130
## 12 ki           126
## 13 gish         119
## 14 ta           105
## 15 si            99
## 16 babylonian    98
## 17 epic          95
## 18 form          91
## 19 ka            91
## 20 erech         90
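
# The frequency list is dominated by transliterated Akkadian syllables (na, ma, sú, sá, ki, ta, si, ka)
# and editorial apparatus (tablet, line, lines, version). A sketch of a custom stop list that filters
# them out; the words chosen here are my reading of the counts above, not a standard list:
custom_stops <- tibble(word = c("na", "ma", "sú", "sá", "ki", "ta", "si", "ka", "la",
                                "tablet", "line", "lines", "version"))
content_counts <- word_counts %>%
  anti_join(custom_stops, by = "word")
head(content_counts, 10)
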
# Plot the most common words
word_counts %>%
  slice_max(n, n = 20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Common Words in the Old Babylonian Version of the Gilgamesh Epic",
       x = "Words",
       y = "Frequency") +
  theme_minimal()

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)

# Plot sentiment counts
bing_sentiments_count %>%
  ggplot(aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  labs(title = "Sentiment Analysis of the Old Babylonian Version of the Gilgamesh Epic",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()
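
# Beyond overall counts, sentiment can be tracked across the text. A sketch, assuming the
# `line` column added during tokenization and an arbitrary chunk size of 80 lines per index:
bing_trajectory <- tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = line %/% 80, sentiment) %>%
  mutate(signed_n = ifelse(sentiment == "negative", -n, n)) %>%
  group_by(index) %>%
  summarize(net_sentiment = sum(signed_n))

ggplot(bing_trajectory, aes(x = index, y = net_sentiment)) +
  geom_col() +
  labs(title = "Net Bing Sentiment Across the Text (80-line chunks)",
       x = "Section of the text",
       y = "Positive minus negative words") +
  theme_minimal()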

# Sentiment Analysis using NRC Lexicon
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"),
             relationship = "many-to-many") # the NRC lexicon assigns several emotions per word, so a many-to-many match is expected
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)

# Plot NRC sentiments
nrc_sentiments_count %>%
  ggplot(aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  coord_flip() +
  labs(title = "NRC Sentiment Analysis of the Old Babylonian Version of the Gilgamesh Epic",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()
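
# Because NRC tags words with specific emotions, it can also show which words drive each category.
# A sketch of the top contributing words per NRC emotion, reusing the nrc_sentiments join above:
nrc_sentiments %>%
  count(sentiment, word, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(sentiment, desc(n))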

# Prepare data for Topic Modeling
# Create a document-term matrix
dtm <- tokens %>%
  count(document = line, word) %>% # one document per line of the source text; using row_number() here would turn every word into its own one-word document
  cast_dtm(document, word, n)

# Set the number of topics
num_topics <- 4

# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
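
# The choice of k = 4 is arbitrary. A hedged sketch for comparing a few candidate values of k by
# perplexity (lower is better), assuming the default VEM fit, whose perplexity method can be called
# without held-out data; refitting several models can be slow:
candidate_k <- c(2, 4, 6, 8)
perplexity_by_k <- sapply(candidate_k, function(k) {
  topicmodels::perplexity(LDA(dtm, k = k, control = list(seed = 1234)))
})
data.frame(k = candidate_k, perplexity = perplexity_by_k)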

# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")

# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 4 × 2
##   topic terms                                                             
##   <int> <chr>                                                             
## 1     1 tablet, line, gilgamesh, na, assyrian, version, sú, ta, ma, enkidu
## 2     2 enkidu, gilgamesh, assyrian, line, version, na, sú, lines, ki, ta 
## 3     3 tablet, ma, na, line, sá, lines, gish, form, enkidu, babylonian   
## 4     4 version, enkidu, ka, ma, lines, si, woman, ki, la, gish
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(title = "Top Terms in Each Topic",
       x = "Terms",
       y = "Beta") +
  scale_x_reordered() +
  theme_minimal()
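
# The beta matrix gives word-topic weights; the gamma matrix gives per-document topic proportions
# (here, per line of the source text, given how the DTM was built above). A brief sketch for
# inspecting which documents are most strongly associated with each topic:
lda_gamma <- tidy(lda_model, matrix = "gamma")

lda_gamma %>%
  group_by(topic) %>%
  slice_max(gamma, n = 5, with_ties = FALSE) %>%
  ungroup() %>%
  arrange(topic, desc(gamma))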