# textdata is needed so get_sentiments("nrc") can fetch the NRC lexicon
pacman::p_load(tidytext, textdata, dplyr, tidyr, ggplot2, readr, topicmodels, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)


# Load the CSV file
sherlock_data <- read_csv("ADVENTURES_OF_SHERLOCK_HOLMES.csv", col_names = FALSE)
## Rows: 2570 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(sherlock_data) <- c("text")

# Display the first few rows of the data
head(sherlock_data)
## # A tibble: 6 × 1
##   text                                                                          
##   <chr>                                                                         
## 1 ADVENTURES OF SHERLOCK HOLMES                                                 
## 2 Adventure I                                                                   
## 3 A SCANDAL IN BOHEMIA                                                          
## 4 I                                                                             
## 5 O Sherlock Holmes she is always the woman. I have seldom heard him mention he…
## 6 I had seen little of Holmes lately. My marriage had drifted us away from each…
# Tokenize the text into words
tokens <- sherlock_data %>%
  unnest_tokens(word, text)

# Remove stop words
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Display the most common words
head(word_counts, 30)
## # A tibble: 30 × 2
##    word       n
##    <chr>  <int>
##  1 holmes   446
##  2 time     154
##  3 door     146
##  4 house    127
##  5 matter   125
##  6 hand     121
##  7 night    115
##  8 heard    113
##  9 found    109
## 10 day      108
## # ℹ 20 more rows
# Plot the most common words
word_counts %>%
  top_n(30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Common Words in The Adventures of Sherlock Holmes",
       x = "Words",
       y = "Frequency") +
  theme_minimal()
## Selecting by n

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot Bing sentiment counts and percentages
bing_plot <- ggplot(bing_sentiments_count, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(title = "Sentiment Analysis of The Adventures of Sherlock Holmes using Bing Lexicon",
       x = "Sentiment",
       y = "Count") +
  theme()

# Sentiment Analysis using NRC Lexicon
# The NRC lexicon maps some words to multiple sentiments, so the join is
# declared many-to-many to avoid the relationship warning
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), relationship = "many-to-many")
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot NRC sentiment counts and percentages with adjusted label positioning
nrc_plot <- ggplot(nrc_sentiments_count, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(title = "NRC Sentiment Analysis of The Adventures of Sherlock Holmes",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()

# Arrange Bing and NRC plots side by side
grid.arrange(bing_plot, nrc_plot, nrow = 1)
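
# Optional sketch: see which individual words drive the Bing counts above.
# This reuses the bing_sentiments object created earlier; the styling below is
# a suggestion, not part of the original analysis.
bing_word_contributions <- bing_sentiments %>%
  count(word, sentiment, sort = TRUE) %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup()

bing_word_contributions %>%
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Words Contributing Most to Each Bing Sentiment",
       x = "Word",
       y = "Count") +
  theme_minimal()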

# Generate Word Cloud 
set.seed(1234)
wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 5,
          max.words = 100,  
          random.order = FALSE, rot.per = 0.1,  
          scale = c(3.5, 0.8),  
          colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of The Adventures of Sherlock Holmes", side = 3, adj = 0, line = 1, cex = 1, font = 2)

# Prepare data for Topic Modeling
# Create a document-term matrix, treating each line of the original text as a
# document (the earlier tokens object no longer carries line identity)
dtm <- sherlock_data %>%
  mutate(document = row_number()) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(document, word) %>%
  cast_dtm(document, word, n)

# Set the number of topics
num_topics <- 4

# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))

# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")

# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 4 × 2
##   topic terms                                                                   
##   <int> <chr>                                                                   
## 1     1 holmes, door, heard, day, house, round, street, remarked, cried, watson 
## 2     2 holmes, time, left, hand, matter, miss, business, morning, sherlock, ey…
## 3     3 holmes, house, heard, door, sherlock, lady, eyes, laid, night, day      
## 4     4 matter, time, night, found, light, door, miss, hand, holmes, morning
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(title = "Top Terms in Each Topic",
       x = "Terms",
       y = "Beta") +
  scale_x_reordered() +
  theme_minimal()
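
# Optional sketch: inspect document-topic proportions (gamma) from the same model.
# tidy(lda_model, matrix = "gamma") gives, for each document, the estimated share
# of its words generated by each topic; the histogram below is illustrative.
lda_gamma <- tidy(lda_model, matrix = "gamma")

lda_gamma %>%
  ggplot(aes(x = gamma, fill = as.factor(topic))) +
  geom_histogram(bins = 20, show.legend = FALSE) +
  facet_wrap(~ topic) +
  labs(title = "Distribution of Document-Topic Probabilities (Gamma)",
       x = "Gamma",
       y = "Number of Documents") +
  theme_minimal()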

# Textual Complexity: Flesch-Kincaid Readability
sherlock_text <- paste(sherlock_data$text, collapse = " ")
readability <- textstat_readability(sherlock_text, measure = "Flesch.Kincaid")

print(paste("Flesch-Kincaid Readability Score: ", readability))
## [1] "Flesch-Kincaid Readability Score:  text1"           
## [2] "Flesch-Kincaid Readability Score:  6.08705528060085"
# Bigram Analysis
bigrams <- sherlock_data %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%  # drop NAs produced by lines with fewer than two words
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)

# Plot Most Common Bigrams
bigrams %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(paste(word1, word2, sep = " "), n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Common Bigrams in The Adventures of Sherlock Holmes",
       x = "Bigram",
       y = "Frequency") +
  theme_minimal()
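
# Optional sketch: the same bigram counts can be viewed as a word network.
# This assumes the igraph and ggraph packages are available; they are not loaded
# above, so this step is an illustrative extension rather than part of the
# original analysis.
pacman::p_load(igraph, ggraph)

set.seed(1234)
bigrams %>%
  filter(n > 5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(alpha = 0.5) +
  geom_node_point(color = "steelblue", size = 2) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1, size = 3) +
  labs(title = "Bigram Network (Bigrams Appearing More Than 5 Times)") +
  theme_void()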