# Load necessary libraries
pacman::p_load(pacman, tidytext, textdata, dplyr, tidyr, ggplot2, readr, topicmodels, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)

# Load the CSV file
Eighty_data <- read_csv("Around_the_World_in_Eighty_Days.csv")
## Rows: 1703 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): CHAPTER I.
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(Eighty_data) <- c("text")
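
# Note: read_csv() treated the file's first line ("CHAPTER I.") as a column
# header, so that heading is not part of the text. If every line should be
# kept, the column name can be supplied at read time instead (a sketch;
# `Eighty_all` is a new name, so the analysis below is unchanged):
Eighty_all <- read_csv("Around_the_World_in_Eighty_Days.csv",
                       col_names = "text", show_col_types = FALSE)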

# Tokenize the text into words, keeping each word's line number
tokens <- Eighty_data %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text)

# Remove stop words
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
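
# The counts below are dominated by the novel's character names. If those
# should be excluded as well, a custom stop list can be appended to the
# standard one (a sketch; the names chosen here are an editorial assumption):
custom_stops <- tibble(word = c("fogg", "phileas", "passepartout", "fix", "aouda"),
                       lexicon = "custom")
tokens_no_names <- tokens %>%
  anti_join(bind_rows(stop_words, custom_stops), by = "word")
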
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Display the most common words
head(word_counts, 30)
## # A tibble: 30 × 2
##    word             n
##    <chr>        <int>
##  1 fogg           584
##  2 passepartout   392
##  3 phileas        238
##  4 fix            234
##  5 time           126
##  6 aouda          125
##  7 train          119
##  8 sir            103
##  9 master         102
## 10 hundred         97
## # ℹ 20 more rows
# Plot the most common words
word_counts %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(title = "Most Common Words in Around the World in Eighty Days by Jules Verne",
       x = "Words",
       y = "Frequency") +
  theme_minimal()

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot Bing sentiment counts and percentages
bing_plot <- ggplot(bing_sentiments_count, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(title = "Bing Sentiment: Around the World in Eighty Days",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()

# Sentiment Analysis using NRC Lexicon
# (declare the join as many-to-many: NRC maps one word to several sentiments)
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), relationship = "many-to-many")
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot NRC sentiment counts and percentages with adjusted label positioning
nrc_plot <- ggplot(nrc_sentiments_count, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(title = "NRC Sentiment: Around the World in Eighty Days",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()

# Arrange Bing and NRC plots side by side
grid.arrange(bing_plot, nrc_plot, nrow = 1)
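
# A further view (a sketch, not one of the plots above): net Bing sentiment
# across the narrative in chunks of 80 lines, using the line numbers carried
# through tokenization. The chunk size of 80 is an arbitrary choice.
tokens %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(index = line %/% 80, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net = positive - negative) %>%
  ggplot(aes(x = index, y = net)) +
  geom_col() +
  labs(title = "Net Bing Sentiment Across the Narrative",
       x = "Narrative position (80-line chunks)",
       y = "Positive minus negative words") +
  theme_minimal()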

# Generate Word Cloud 
set.seed(1234)
wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 5,
          max.words = 100,  
          random.order = FALSE, rot.per = 0.1,  
          scale = c(3.5, 0.8),  
          colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of Around the World in Eighty Days by Jules Verne", side = 3, adj = 0, line = 1, cex = 1, font = 2)

# Prepare data for Topic Modeling
# Create a document-term matrix, treating each line of the text as a document
dtm <- tokens %>%
  count(line, word) %>%
  cast_dtm(line, word, n)

# Set the number of topics
num_topics <- 4

# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
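
# The choice of four topics is arbitrary. One rough check (an assumption,
# not part of the original analysis) is to compare model perplexity across
# a few values of k; lower perplexity suggests a better fit:
sapply(c(2, 4, 6, 8), function(k)
  perplexity(LDA(dtm, k = k, control = list(seed = 1234))))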

# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")

# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 4 × 2
##   topic terms                                                                   
##   <int> <chr>                                                                   
## 1     1 fogg, passepartout, aouda, fix, days, time, phileas, hours, twenty, fra…
## 2     2 fogg, fix, passepartout, phileas, sir, miles, replied, minutes, twenty,…
## 3     3 fogg, phileas, hundred, train, day, fix, aouda, thousand, master, passe…
## 4     4 fogg, passepartout, phileas, fix, train, time, master, hour, evening, m…
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(title = "Top Terms in Each Topic",
       x = "Terms",
       y = "Beta") +
  scale_x_reordered() +
  theme_minimal()
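
# The per-document topic mixtures are also available (a brief sketch): the
# "gamma" matrix gives each line-document's estimated topic shares, and the
# lines most strongly tied to each topic can be pulled out with slice_max().
lda_gamma <- tidy(lda_model, matrix = "gamma")
lda_gamma %>%
  group_by(topic) %>%
  slice_max(gamma, n = 3) %>%
  ungroup()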

# Textual Complexity: Flesch-Kincaid Readability
Eighty_text <- paste(Eighty_data$text, collapse = " ")
readability <- textstat_readability(Eighty_text, measure = "Flesch.Kincaid")

print(paste("Flesch-Kincaid Readability Score: ", readability))
## [1] "Flesch-Kincaid Readability Score:  text1"           
## [2] "Flesch-Kincaid Readability Score:  8.36865126063391"