# Load necessary libraries
pacman::p_load(pacman, tidytext, dplyr, tidyr, ggplot2, readr, topicmodels, gridExtra, 
               wordcloud, RColorBrewer, quanteda, quanteda.textstats, grid)

# Directory that holds the episode transcript CSV files
file_path <- "/cloud/project"

# Transcript file names share a common prefix and extension; only the
# "<Episode_Name>(<number>)" part differs. They are listed from the
# highest-numbered episode down to the first.
episode_stems <- c(
  "Full_Circle(7)",
  "Charmed_Circle(5)",
  "Narrowing_Circle(4)",
  "Serpent_in_the_Circle(3)",
  "Circle_of_Fear(2)",
  "Into_the_Circle(1)"
)
files <- paste0("Children_of_the_Stones_", episode_stems, ".csv")

# Empty containers filled while the episodes are processed:
# the combined text of all episodes and one sentiment plot per episode
full_text <- data.frame()
nrc_plots <- list()

# Process every episode file: read its text, tokenise it, and build an NRC
# sentiment bar chart. The text of each episode is also kept so the whole
# series can be analysed afterwards.

# The NRC lexicon is the same for every episode, so load it once up front
nrc_lexicon <- get_sentiments("nrc")

# One slot per file, filled inside the loop (avoids growing objects with
# rbind() on every iteration)
episode_texts <- vector("list", length(files))
nrc_plots <- vector("list", length(files))

for (i in seq_along(files)) {
  # Strip the series prefix and the extension, leaving "<Name>(<number>)"
  episode_info <- gsub("Children_of_the_Stones_|\\.csv", "", files[i])
  # Name: what remains once the "(<number>)" part is removed
  episode_name <- trimws(gsub("\\(\\d+\\)", "", episode_info))
  # Number: the digits found inside the parentheses
  episode_number <- gsub(".*\\((\\d+)\\).*", "\\1", episode_info)

  # Read the file
  # NOTE(review): read_csv() treats the first line as a header row by
  # default - confirm each file really starts with a header line.
  file_name <- file.path(file_path, files[i])
  data <- read_csv(file_name, show_col_types = FALSE)
  colnames(data) <- "text" # Ensure column consistency

  # Keep this episode's text for the combined, series-wide analysis
  episode_texts[[i]] <- data

  # Tokenize text into words and drop stop words
  tokens <- data %>%
    unnest_tokens(word, text) %>%
    anti_join(stop_words, by = "word")

  # Count NRC sentiment tags for this episode; a word can carry several
  # tags, hence the explicit many-to-many relationship. `percentage` is each
  # tag's share of all matched tags.
  nrc_sentiments <- tokens %>%
    inner_join(nrc_lexicon, by = "word", relationship = "many-to-many") %>%
    count(sentiment, sort = TRUE) %>%
    mutate(percentage = n / sum(n) * 100)

  # Horizontal bar chart of tag counts, labelled with the percentage share.
  # Named `episode_plot` so the base function plot() is not masked.
  episode_plot <- ggplot(
    nrc_sentiments,
    aes(x = reorder(sentiment, n), y = n, fill = sentiment)
  ) +
    geom_bar(stat = "identity") +
    geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
    coord_flip() +
    labs(title = paste("NRC Sentiments for", episode_name, episode_number),
         x = "Sentiment", y = "Count") +
    theme_minimal()

  # Store the plot in the list, in the same position as its file
  nrc_plots[[i]] <- episode_plot
}

# Stack the text of all episodes in a single step
full_text <- bind_rows(episode_texts)

# Plots were built in the order of `files`; reverse them for display
nrc_plots <- rev(nrc_plots)

# Overall title shown above the grid of episode plots
title_grob <- textGrob(
  "NRC Sentiment Analysis by Episode: Children of the Stones; Analysis by Patrick Ford.",
  gp = gpar(fontsize = 16, fontface = "bold")
)

# Caption shown below the grid of episode plots
caption_grob <- textGrob(
  "Written by: Jeremy Burnham and Trevor Ray; Produced and Directed by: Peter Graham Scott (note - episode 6 is missing (Squaring the Circle))",
  gp = gpar(fontsize = 10, fontface = "bold")
)

# Lay the episode plots out in a 2 x 3 grid with the title and caption
grid_with_title <- arrangeGrob(
  grobs = nrc_plots,
  nrow = 2, ncol = 3,    # Arrange the plots in a grid
  top = title_grob,      # Add the title at the top
  bottom = caption_grob  # Add the caption at the bottom
)

# Start a fresh page first: grid.draw() alone paints on top of whatever is
# already on the current device (e.g. when the script is re-run in a session)
grid.newpage()
grid.draw(grid_with_title)

# Tokenize the combined text of all episodes and drop stop words
tokens <- full_text %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word")

# Word Frequency Analysis: one row per word, most frequent first
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Plot the most common words.
# slice_max() names the ordering column explicitly; the superseded top_n(30)
# guessed it and emitted a "Selecting by n" message. Ties at the cut-off are
# kept, as they were with top_n().
word_counts %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Common Words in Children of the Stones; Series (note - episode 6 not included)",
       x = "Words", y = "Frequency") +
  theme_minimal()

# Bing sentiment for the whole series: keep only tokens found in the lexicon
# NOTE(review): the NRC joins in this script pass
# relationship = "many-to-many"; this join does not - confirm that is intended.
bing_sentiments <- inner_join(tokens, get_sentiments("bing"), by = "word")

# Tally the matched tokens per sentiment label and add each label's share
bing_sentiments_count <- count(bing_sentiments, sentiment, sort = TRUE)
bing_sentiments_count <- mutate(
  bing_sentiments_count,
  percentage = n / sum(n) * 100
)

# Vertical bar chart of the tallies, each bar labelled with its percentage
bing_plot <- ggplot(
  bing_sentiments_count,
  aes(x = sentiment, y = n, fill = sentiment)
)
bing_plot <- bing_plot +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(
    title = "Bing Sentiment Analysis: Children of the Stones; Series (note - episode 6 not included)",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# NRC sentiment for the whole series: match tokens against the lexicon
# (a word may map to several sentiment tags, hence many-to-many)
nrc_sentiments_count <- inner_join(
  tokens,
  get_sentiments("nrc"),
  by = "word",
  relationship = "many-to-many"
)

# Tally the matches per sentiment tag and add each tag's share of the total
nrc_sentiments_count <- count(nrc_sentiments_count, sentiment, sort = TRUE)
nrc_sentiments_count <- mutate(
  nrc_sentiments_count,
  percentage = n / sum(n) * 100
)

# Horizontal bar chart ordered by count, each bar labelled with its percentage
nrc_plot <- ggplot(
  nrc_sentiments_count,
  aes(x = reorder(sentiment, n), y = n, fill = sentiment)
)
nrc_plot <- nrc_plot +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(
    title = "NRC Sentiment Analysis: Children of the Stones; Series (note - episode 6 not included)",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# Word cloud of the series-wide word frequencies.
# The seed is fixed before drawing so repeated runs use the same random state.
set.seed(1234)
wordcloud(
  words = word_counts[["word"]],
  freq = word_counts[["n"]],
  min.freq = 5,             # skip words seen fewer than 5 times
  max.words = 150,          # cap on the number of words drawn
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(2.5, 0.5),
  colors = brewer.pal(8, "Dark2")
)

# Left-aligned bold heading in the top margin of the word cloud
mtext(
  "Word Cloud: Children of the Stones; Series (note - episode 6 not included)",
  side = 3, adj = 0, line = 1, cex = 1, font = 2
)

# Textual Complexity: Flesch-Kincaid Readability
# Collapse all rows of text into one string; missing rows are dropped first so
# they are not pasted in as the literal word "NA"
full_text_string <- paste(
  full_text$text[!is.na(full_text$text)],
  collapse = " "
)
readability <- textstat_readability(full_text_string, measure = "Flesch.Kincaid")

# `readability` is a data frame (document id plus score). Pasting the whole
# object printed two strings, one of them containing the document id, so
# report only the score column.
print(paste(
  "Flesch-Kincaid Readability Score; Series (note - episode 6 not included):",
  readability$Flesch.Kincaid
))
# Show the series-wide Bing and NRC sentiment charts next to each other
# in a single row
grid.arrange(grobs = list(bing_plot, nrc_plot), nrow = 1)