pacman::p_load(pacman, tidytext, dplyr, tidyr, ggplot2, readr, topicmodels, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)
# Load the CSV file
sherlock_data <- read_csv("ADVENTURES_OF_SHERLOCK_HOLMES.csv", col_names = FALSE)
## Rows: 2570 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): X1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rename the column for easier reference
colnames(sherlock_data) <- c("text")
# Display the first few rows of the data
head(sherlock_data)
## # A tibble: 6 × 1
## text
## <chr>
## 1 ADVENTURES OF SHERLOCK HOLMES
## 2 Adventure I
## 3 A SCANDAL IN BOHEMIA
## 4 I
## 5 O Sherlock Holmes she is always the woman. I have seldom heard him mention he…
## 6 I had seen little of Holmes lately. My marriage had drifted us away from each…
# Tokenize the text into words
tokens <- sherlock_data %>%
  unnest_tokens(word, text)
# Remove stop words
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words)
## Joining with `by = join_by(word)`
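# Sketch (assumption): corpus-specific terms such as character names could be
# filtered out in the same way; the list below is purely illustrative and is
# not used in the analyses that follow.
custom_stops <- tibble(word = c("holmes", "watson"), lexicon = "custom")
tokens_no_names <- tokens %>%
  anti_join(custom_stops, by = "word")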
# Count word frequencies
word_counts <- tokens %>%
  count(word, sort = TRUE)
# Display the most common words
head(word_counts, 30)
## # A tibble: 30 × 2
## word n
## <chr> <int>
## 1 holmes 446
## 2 time 154
## 3 door 146
## 4 house 127
## 5 matter 125
## 6 hand 121
## 7 night 115
## 8 heard 113
## 9 found 109
## 10 day 108
## # ℹ 20 more rows
# Plot the most common words
word_counts %>%
  top_n(30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Common Words in The Adventures of Sherlock Holmes",
       x = "Words",
       y = "Frequency") +
  theme_minimal()
## Selecting by n

# Sentiment Analysis using Bing Lexicon
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"))
## Joining with `by = join_by(word)`
# Count positive and negative words
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)
# Add percentage column
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)
# Plot Bing sentiment counts and percentages
bing_plot <- ggplot(bing_sentiments_count, aes(x = sentiment, y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(title = "Sentiment Analysis of The Adventures of Sherlock Holmes using the Bing Lexicon",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()
# NRC sentiments (the join is declared many-to-many because NRC maps one word to several sentiment categories)
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), relationship = "many-to-many")
## Joining with `by = join_by(word)`
# Count NRC sentiments
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)
# Add percentage column
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)
# Plot NRC sentiment counts and percentages with adjusted label positioning
nrc_plot <- ggplot(nrc_sentiments_count, aes(x = reorder(sentiment, n), y = n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(title = "NRC Sentiment Analysis of The Adventures of Sherlock Holmes",
       x = "Sentiment",
       y = "Count") +
  theme_minimal()
# Arrange Bing and NRC plots side by side
grid.arrange(bing_plot, nrc_plot, nrow = 1)
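# Sketch (not part of the output above): the words contributing most to each
# Bing sentiment can be counted and faceted in the same way.
bing_word_counts <- bing_sentiments %>%
  count(word, sentiment, sort = TRUE)
bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, sentiment)) %>%
  ggplot(aes(x = word, y = n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(title = "Words Contributing Most to Each Bing Sentiment",
       x = NULL,
       y = "Count") +
  theme_minimal()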

# Generate Word Cloud
set.seed(1234)
wordcloud(words = word_counts$word, freq = word_counts$n, min.freq = 5,
          max.words = 100, random.order = FALSE, rot.per = 0.1,
          scale = c(3.5, 0.8), colors = brewer.pal(8, "Dark2"))
mtext("Word Cloud of The Adventures of Sherlock Holmes", side = 3, adj = 0, line = 1, cex = 1, font = 2)
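# Sketch (assumption): a positive/negative comparison cloud of the Bing-scored
# tokens; comparison.cloud() expects a term matrix with one column per
# sentiment, and the colors map to the matrix columns in order.
sentiment_matrix <- bing_sentiments %>%
  count(word, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  tibble::column_to_rownames("word") %>%
  as.matrix()
set.seed(1234)
comparison.cloud(sentiment_matrix, colors = c("firebrick", "forestgreen"), max.words = 100)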

# Prepare data for Topic Modeling
# Create a document-term matrix
dtm <- tokens %>%
  count(document = row_number(), word) %>%  # row_number() treats every token as its own one-word document
  cast_dtm(document, word, n)
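# A sketch of an alternative (assumption): keep the original text line as the
# document unit instead of one document per token.
dtm_by_line <- sherlock_data %>%
  mutate(line = row_number()) %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words, by = "word") %>%
  count(line, word) %>%
  cast_dtm(line, word, n)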
# Set the number of topics
num_topics <- 4
# Run LDA
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))
# Get the top terms for each topic
lda_terms <- tidy(lda_model, matrix = "beta")
# Display the top terms for each topic in a table format
top_terms <- lda_terms %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
# Print the top terms for each topic
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ", ")) %>%
  print()
## # A tibble: 4 × 2
## topic terms
## <int> <chr>
## 1 1 holmes, door, heard, day, house, round, street, remarked, cried, watson
## 2 2 holmes, time, left, hand, matter, miss, business, morning, sherlock, ey…
## 3 3 holmes, house, heard, door, sherlock, lady, eyes, laid, night, day
## 4 4 matter, time, night, found, light, door, miss, hand, holmes, morning
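# Sketch: the per-document topic proportions (gamma matrix) can be tidied the
# same way as the per-topic word probabilities above.
lda_gamma <- tidy(lda_model, matrix = "gamma")
head(lda_gamma)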
# Plot the top terms for each topic
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(title = "Top Terms in Each Topic",
       x = "Terms",
       y = "Beta") +
  scale_x_reordered() +
  theme_minimal()

# Textual Complexity: Flesch-Kincaid Readability
sherlock_text <- paste(sherlock_data$text, collapse = " ")
readability <- textstat_readability(sherlock_text, measure = "Flesch.Kincaid")
print(paste("Flesch-Kincaid Readability Score:", readability$Flesch.Kincaid))
## [1] "Flesch-Kincaid Readability Score: 6.08705528060085"
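# Sketch (assumption): textstat_readability() also supports other indices,
# e.g. Flesch reading ease and ARI, which can be requested in the same call.
textstat_readability(sherlock_text, measure = c("Flesch", "Flesch.Kincaid", "ARI"))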
# Bigram Analysis
bigrams <- sherlock_data %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  filter(!is.na(bigram)) %>%  # drop NA bigrams produced by very short lines
  separate(bigram, into = c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word, !word2 %in% stop_words$word) %>%
  count(word1, word2, sort = TRUE)
# Plot Most Common Bigrams
bigrams %>%
  top_n(20, n) %>%
  ggplot(aes(x = reorder(paste(word1, word2, sep = " "), n), y = n)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Most Common Bigrams in The Adventures of Sherlock Holmes",
       x = "Bigram",
       y = "Frequency") +
  theme_minimal()
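# Sketch (assumption): if the igraph and ggraph packages are available, the
# bigram counts can also be drawn as a word network.
pacman::p_load(igraph, ggraph)
bigram_graph <- bigrams %>%
  filter(n > 3) %>%  # keep only bigrams seen more than three times (illustrative cutoff)
  graph_from_data_frame()
set.seed(1234)
ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(aes(edge_alpha = n), show.legend = FALSE) +
  geom_node_point(size = 2) +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
  theme_void()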
