# Load necessary libraries
# (textdata supplies the NRC and AFINN sentiment lexicons used below)
pacman::p_load(tidytext, textdata, dplyr, tidyr, ggplot2, readr, topicmodels, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)

# Load the CSV file
Ion_data <- read_csv("ION_Plato.csv")
## Rows: 180 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ION
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(Ion_data) <- c("text")

# Tokenize the text into words, keeping each source line as a document id
tokens <- Ion_data %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text)

# Remove stop words
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Display the most common words
head(word_counts, 30)
## # A tibble: 30 × 2
##    word          n
##    <chr>     <int>
##  1 socrates    113
##  2 ion         112
##  3 art          49
##  4 homer        42
##  5 rhapsode     28
##  6 speak        22
##  7 poets        19
##  8 knowledge    18
##  9 speaking     10
## 10 poet          9
## # ℹ 20 more rows
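# The speaker labels "socrates" and "ion" dominate the counts above. A possible
# refinement (a sketch; treating these two words as speaker tags is an assumption,
# not part of the original analysis) is a custom stop list:
custom_stops <- tibble(word = c("socrates", "ion"))
content_counts <- tokens %>%
  anti_join(custom_stops, by = "word") %>%
  count(word, sort = TRUE)
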
# Plot the 30 most common words
word_counts %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Common Words in Ion by Plato",
       x = "Words",
       y = "Frequency") +
  theme_minimal()
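
# Single-word counts lose multi-word phrases. A hedged sketch of a bigram pass
# using the same tidytext pipeline (token = "ngrams"); stop words are filtered
# from both positions:
bigrams <- Ion_data %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!is.na(word1),
         !word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
head(bigrams, 10)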

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot Bing sentiment counts and percentages
bing_plot <- ggplot(bing_sentiments_count, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(title = "Sentiment Analysis of Ion by Plato using Bing Lexicon",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()
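
# Aggregate counts hide which words drive the split. A sketch of the top Bing
# contributors per category (a standard tidytext pattern, not in the original):
bing_top_words <- bing_sentiments %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup()

bing_top_words %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  coord_flip() +
  labs(x = NULL, y = "Contribution to sentiment") +
  theme_minimal()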

# NRC maps some words to multiple sentiments, so declare the join as many-to-many
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), relationship = "many-to-many")
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot NRC sentiment counts and percentages with adjusted label positioning
nrc_plot <- ggplot(nrc_sentiments_count, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(title = "NRC Sentiment Analysis of Ion by Plato",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()

# Arrange Bing and NRC plots side by side
grid.arrange(bing_plot, nrc_plot, nrow = 1)
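
# A sketch of how sentiment moves across the dialogue using the AFINN lexicon
# (assumes textdata has downloaded AFINN; the 10-line bin width is an arbitrary
# choice, and `line` is the source-line id added during tokenization):
afinn_trajectory <- tokens %>%
  inner_join(get_sentiments("afinn"), by = "word") %>%
  mutate(index = line %/% 10) %>%
  group_by(index) %>%
  summarise(sentiment = sum(value))

ggplot(afinn_trajectory, aes(x = index, y = sentiment)) +
  geom_col() +
  labs(title = "AFINN Sentiment Trajectory in Ion",
       x = "Section (10-line bins)",
       y = "Net sentiment") +
  theme_minimal()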

# Generate Word Cloud 
set.seed(1234)
wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 5,
          max.words = 100,  
          random.order = FALSE, rot.per = 0.1,  
          scale = c(3.5, 0.8),  
          colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of Ion by Plato", side = 3, adj = 0, line = 1, cex = 1, font = 2)
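
# A hedged variant of the cloud: split words by Bing sentiment with
# comparison.cloud(). reshape2::acast builds the word-by-sentiment matrix it
# expects (reshape2 is an extra dependency assumed here, not loaded above):
library(reshape2)
bing_sentiments %>%
  count(word, sentiment, sort = TRUE) %>%
  acast(word ~ sentiment, value.var = "n", fill = 0) %>%
  comparison.cloud(colors = c("firebrick", "forestgreen"), max.words = 80)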

# Prepare data for Topic Modeling
# Create a document-term matrix, treating each source line as a document
# (the original count(document = row_number(), word) made every token its own
# one-word document, which gives LDA nothing to model)
dtm <- tokens %>%
  count(line, word) %>%
  cast_dtm(line, word, n)

# Set the number of topics
num_topics <- 4

# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
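
# k = 4 is an arbitrary choice. A sketch of comparing candidate topic counts by
# perplexity (lower is better; refitting for each k can be slow):
candidate_ks <- c(2, 3, 4, 6, 8)
perplexities <- sapply(candidate_ks, function(k) {
  perplexity(LDA(dtm, k = k, control = list(seed = 1234)), dtm)
})
plot(candidate_ks, perplexities, type = "b",
     xlab = "Number of topics (k)", ylab = "Perplexity")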

# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")

# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 4 × 2
##   topic terms                                                                   
##   <int> <chr>                                                                   
## 1     1 ion, socrates, art, rhapsode, speak, poet, knowledge, homer, true, judge
## 2     2 socrates, ion, rhapsode, homer, knowledge, speak, poets, hesiod, god, p…
## 3     3 homer, poets, ion, socrates, rhapsode, words, god, reason, true, physic…
## 4     4 ion, socrates, art, homer, speak, speaking, knowledge, mind, hear, char…
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(title = "Top Terms in Each Topic",
       x = "Terms",
       y = "Beta") +
  scale_x_reordered() +
  theme_minimal()
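
# The beta matrix shows terms per topic; the gamma matrix shows topic mixtures
# per document. A sketch of where each topic dominates across the dialogue
# (documents here are the source lines used to build the dtm):
lda_gamma <- tidy(lda_model, matrix = "gamma")
lda_gamma %>%
  mutate(document = as.integer(document)) %>%
  ggplot(aes(x = document, y = gamma, fill = factor(topic))) +
  geom_col(position = "stack") +
  labs(title = "Topic Proportions Across the Dialogue",
       x = "Line", y = "Gamma", fill = "Topic") +
  theme_minimal()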

# Textual Complexity: Flesch-Kincaid Readability
Ion_text <- paste(Ion_data$text, collapse = " ")
readability <- textstat_readability(Ion_text, measure = "Flesch.Kincaid")

# Extract the numeric score (readability is a data frame, so pasting it directly
# would print the document name alongside the value)
print(paste("Flesch-Kincaid Readability Score:", round(readability$Flesch.Kincaid, 2)))
## [1] "Flesch-Kincaid Readability Score: 9.3"