# Load necessary libraries
pacman::p_load(pacman, tidytext, dplyr, ggplot2, readr, topicmodels, textdata, gridExtra, wordcloud, RColorBrewer)
# Read a CSV file and turn one of its text columns into tidy word tokens.
#
# Arguments:
#   file_path      Path handed to read_csv().
#   text_col_name  Name (string) of the column that holds the text.
#
# Returns one row per remaining word with the columns `document` (the row
# number of the source row in the CSV) and `word`. Digits are stripped from the
# text before tokenising, and words listed in `stop_words` are dropped.
process_text_data <- function(file_path, text_col_name) {
  raw <- read_csv(file_path)

  # Number the rows first so every token can be traced back to its source row.
  corpus <- raw %>%
    mutate(document = row_number()) %>%
    select(document, text = all_of(text_col_name)) %>%
    mutate(text = gsub("[0-9]+", "", text))

  corpus %>%
    unnest_tokens(word, text) %>%
    filter(!is.na(word)) %>%
    anti_join(stop_words, by = "word")
}
# Draw a word cloud of the most frequent words on the active graphics device
# and put a main title above it.
#
# Arguments:
#   tokens  Data frame with a `word` column (one row per token).
#   title   Main title drawn over the cloud.
#
# Called for its plotting side effect.
create_word_cloud <- function(tokens, title) {
  freq_table <- count(tokens, word, sort = TRUE)

  wordcloud(
    words = freq_table$word,
    freq = freq_table$n,
    min.freq = 2,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2"),
    scale = c(4, 0.5)
  )

  # Namespace-qualified because the `title` argument shadows the function name.
  graphics::title(main = title)
}
# Build two horizontal bar charts for the (up to) ten most frequent words:
# one showing raw counts, one showing each word's share of all tokens.
#
# Arguments:
#   tokens  Data frame with a `word` column (one row per token).
#   title   Title used for both plots.
#
# Returns a list with the ggplot objects `count_plot` and `percentage_plot`.
plot_word_freq_with_percentage <- function(tokens, title) {
  # The percentage is computed before the top rows are taken, so it is the
  # share of the whole token table rather than of the plotted words only.
  word_counts <- tokens %>%
    count(word, sort = TRUE) %>%
    mutate(percentage = n / sum(n) * 100)

  # head() never pads with NA rows when fewer than ten distinct words exist.
  top_words <- head(word_counts, 10)

  count_plot <- ggplot(top_words, aes(x = reorder(word, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Words", y = "Count")

  percentage_plot <- ggplot(
    top_words,
    aes(x = reorder(word, n), y = percentage)
  ) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Words", y = "Percentage")

  list(count_plot = count_plot, percentage_plot = percentage_plot)
}
# Count how many tokens fall into each sentiment category of a lexicon and
# plot the result as counts and as percentages.
#
# Arguments:
#   tokens   Data frame with a `word` column (one row per token).
#   lexicon  Lexicon name passed to get_sentiments(); it must provide a
#            `sentiment` column.
#   title    Title used for both plots.
#
# Returns a list with the ggplot objects `count_plot` and `percentage_plot`.
# Percentages are relative to the number of matched token/sentiment pairs,
# not to the total number of tokens.
plot_sentiment_analysis <- function(tokens, lexicon, title) {
  sentiments <- get_sentiments(lexicon)

  # Fail with a readable message for lexicons that carry no categorical
  # sentiment (for example a numeric-score lexicon).
  if (!"sentiment" %in% names(sentiments)) {
    stop(
      "Lexicon '", lexicon, "' has no `sentiment` column to count.",
      call. = FALSE
    )
  }

  # A word occurs many times in the tokens and may carry several sentiments in
  # the lexicon, so the fan-out is expected; declaring it avoids dplyr's
  # many-to-many warning.
  sentiment_counts <- tokens %>%
    inner_join(sentiments, by = "word", relationship = "many-to-many") %>%
    count(sentiment, sort = TRUE)

  count_plot <- ggplot(
    sentiment_counts,
    aes(x = reorder(sentiment, n), y = n)
  ) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Sentiment", y = "Count")

  percentage_plot <- ggplot(
    sentiment_counts,
    aes(x = reorder(sentiment, n), y = n / sum(n) * 100)
  ) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Sentiment", y = "Percentage")

  list(count_plot = count_plot, percentage_plot = percentage_plot)
}
# Compare the sentiment distribution of two token tables side by side.
#
# Arguments:
#   bible_tokens  Data frame with a `word` column; labelled "Bible" in plots.
#   quran_tokens  Data frame with a `word` column; labelled "Quran" in plots.
#   lexicon       Lexicon name passed to get_sentiments(); it must provide a
#                 `sentiment` column.
#
# Returns a list with the ggplot objects `count_plot` (raw counts) and
# `percentage_plot` (counts as a percentage of all rows in that source's
# token table, so sources of different size are comparable).
compare_sentiment_distribution <- function(bible_tokens, quran_tokens, lexicon) {
  sentiments <- get_sentiments(lexicon)

  # Per-source sentiment counts plus the share of that source's tokens.
  count_sentiments <- function(tokens, source_label) {
    total <- nrow(tokens)
    tokens %>%
      # Lexicon words can carry several sentiments, so the fan-out is expected.
      inner_join(sentiments, by = "word", relationship = "many-to-many") %>%
      count(sentiment, sort = TRUE) %>%
      mutate(
        total_words = total,
        source = source_label,
        percentage = n / total_words * 100
      )
  }

  combined <- bind_rows(
    count_sentiments(bible_tokens, "Bible"),
    count_sentiments(quran_tokens, "Quran")
  )

  count_plot <- ggplot(combined, aes(x = sentiment, y = n, fill = source)) +
    geom_col(position = "dodge") +
    labs(
      title = paste(lexicon, "Sentiment Comparison"),
      x = "Sentiment",
      y = "Count"
    ) +
    scale_fill_brewer(palette = "Paired")

  percentage_plot <- ggplot(
    combined,
    aes(x = sentiment, y = percentage, fill = source)
  ) +
    geom_col(position = "dodge") +
    labs(
      title = paste(lexicon, "Sentiment Comparison (Percentage)"),
      x = "Sentiment",
      y = "Percentage"
    ) +
    scale_fill_brewer(palette = "Paired")

  list(count_plot = count_plot, percentage_plot = percentage_plot)
}
# Plot the lexical diversity of a token table as a single bar.
#
# Lexical diversity is computed as the number of distinct words divided by the
# number of rows in `tokens`.
#
# Arguments:
#   tokens  Data frame with a `word` column (one row per token).
#   title   Plot title.
#
# Returns a ggplot object.
lexical_diversity <- function(tokens, title) {
  ratio_tbl <- summarise(tokens, lexical_diversity = n_distinct(word) / n())

  ggplot(ratio_tbl, aes(x = "", y = lexical_diversity)) +
    geom_bar(stat = "identity") +
    labs(title = title, y = "Lexical Diversity", x = "")
}
# Fit an LDA topic model to the tokens and plot the highest-probability terms
# of every topic, one facet per topic.
#
# Arguments:
#   tokens   Data frame with `document` and `word` columns.
#   title    Plot title.
#   k        Number of topics passed to LDA() (default 4).
#   seed     Seed passed to LDA() through `control` (default 1234).
#   n_terms  Number of top terms shown per topic (default 10).
#
# Returns a ggplot object.
plot_topics <- function(tokens, title, k = 4, seed = 1234, n_terms = 10) {
  dtm <- tokens %>%
    count(document, word) %>%
    cast_dtm(document, word, n)

  lda <- LDA(dtm, k = k, control = list(seed = seed))

  top_terms <- tidy(lda, matrix = "beta") %>%
    group_by(topic) %>%
    slice_max(beta, n = n_terms) %>%
    ungroup() %>%
    arrange(topic, -beta)

  # A plain reorder() ranks each term once across all topics, so a term shared
  # by several topics would sit at the wrong height in some facets.
  # reorder_within() ranks inside each topic and scale_x_reordered() strips the
  # helper suffix from the axis labels again.
  ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    coord_flip() +
    scale_x_reordered() +
    labs(title = title, x = "Terms", y = "Beta")
}
# Tokenise both corpora, naming the CSV column that holds the text.
#
# Console output recorded from an earlier run (readr column specification,
# delimiter ","):
#   Old_Testament_KJ_Bible.csv: 24606 rows, 1 column
#     chr (1): The Old Testament of the King James Version of the Bible
#   Quran_english.csv: 6236 rows, 4 columns
#     chr (1): ayahs-translation
#     dbl (3): id, surahs, ayahs
# readr also noted that `spec()` retrieves the full column specification and
# that specifying the column types or `show_col_types = FALSE` quiets the
# message.
bible_tokens <- process_text_data(
  file_path = "Old_Testament_KJ_Bible.csv",
  text_col_name = "The Old Testament of the King James Version of the Bible"
)
quran_tokens <- process_text_data(
  file_path = "Quran_english.csv",
  text_col_name = "ayahs-translation"
)
# Zero-argument wrappers that draw the titled word cloud for each corpus.
bible_wordcloud <- function() {
  create_word_cloud(bible_tokens, "Word Cloud of the Bible (KJV)")
}
quran_wordcloud <- function() {
  create_word_cloud(quran_tokens, "Word Cloud of the Quran")
}

# Draw the two clouds side by side. par() returns the previous settings, which
# are restored afterwards so later base-graphics plots are not left in the
# 1 x 2 layout.
word_cloud_par <- par(mfrow = c(1, 2))
bible_wordcloud()
quran_wordcloud()
par(word_cloud_par)

# Most frequent words per corpus, as counts and as percentages.
bible_word_plots <- plot_word_freq_with_percentage(
  tokens = bible_tokens,
  title = "Most Common Words in the Bible (KJV)"
)
quran_word_plots <- plot_word_freq_with_percentage(
  tokens = quran_tokens,
  title = "Most Common Words in the Quran"
)

# 2 x 2 grid: counts on the top row, percentages below; Bible left, Quran right.
grid.arrange(
  bible_word_plots[["count_plot"]],
  quran_word_plots[["count_plot"]],
  bible_word_plots[["percentage_plot"]],
  quran_word_plots[["percentage_plot"]],
  nrow = 2,
  ncol = 2
)

# Bing lexicon sentiment analysis for each corpus.
#
# Output recorded from an earlier run: both calls raised dplyr's warning
# "Detected an unexpected many-to-many relationship between `x` and `y`" in
# inner_join(., sentiments, by = "word"), with the hint to set
# relationship = "many-to-many" to silence it.
#   Bible: row 137263 of `x` matches multiple rows in `y`;
#          row 1220 of `y` matches multiple rows in `x`.
#   Quran: row 54828 of `x` matches multiple rows in `y`;
#          row 2715 of `y` matches multiple rows in `x`.
bible_bing_plots <- plot_sentiment_analysis(
  tokens = bible_tokens,
  lexicon = "bing",
  title = "Bing Sentiment Analysis of the Bible (KJV)"
)
quran_bing_plots <- plot_sentiment_analysis(
  tokens = quran_tokens,
  lexicon = "bing",
  title = "Bing Sentiment Analysis of the Quran"
)

# 2 x 2 grid: counts on the top row, percentages below; Bible left, Quran right.
grid.arrange(
  bible_bing_plots[["count_plot"]],
  quran_bing_plots[["count_plot"]],
  bible_bing_plots[["percentage_plot"]],
  quran_bing_plots[["percentage_plot"]],
  nrow = 2,
  ncol = 2
)

# NRC lexicon sentiment analysis for each corpus.
#
# Output recorded from an earlier run: both calls raised dplyr's warning
# "Detected an unexpected many-to-many relationship between `x` and `y`" in
# inner_join(., sentiments, by = "word"), with the hint to set
# relationship = "many-to-many" to silence it.
#   Bible: row 6 of `x` matches multiple rows in `y`;
#          row 5600 of `y` matches multiple rows in `x`.
#   Quran: row 4 of `x` matches multiple rows in `y`;
#          row 5664 of `y` matches multiple rows in `x`.
bible_nrc_plots <- plot_sentiment_analysis(
  tokens = bible_tokens,
  lexicon = "nrc",
  title = "NRC Sentiment Analysis of the Bible (KJV)"
)
quran_nrc_plots <- plot_sentiment_analysis(
  tokens = quran_tokens,
  lexicon = "nrc",
  title = "NRC Sentiment Analysis of the Quran"
)

# 2 x 2 grid: counts on the top row, percentages below; Bible left, Quran right.
grid.arrange(
  bible_nrc_plots[["count_plot"]],
  quran_nrc_plots[["count_plot"]],
  bible_nrc_plots[["percentage_plot"]],
  quran_nrc_plots[["percentage_plot"]],
  nrow = 2,
  ncol = 2
)

# Bing sentiment distribution of the two corpora in shared plots.
#
# Output recorded from an earlier run: the call raised dplyr's warning
# "Detected an unexpected many-to-many relationship between `x` and `y`" in
# inner_join(., sentiments, by = "word") twice, each time with the hint to set
# relationship = "many-to-many" to silence it.
#   First warning:  row 137263 of `x` matches multiple rows in `y`;
#                   row 1220 of `y` matches multiple rows in `x`.
#   Second warning: row 54828 of `x` matches multiple rows in `y`;
#                   row 2715 of `y` matches multiple rows in `x`.
bing_sentiment_comparison_plots <- compare_sentiment_distribution(
  bible_tokens = bible_tokens,
  quran_tokens = quran_tokens,
  lexicon = "bing"
)

# Counts on the left, percentages on the right.
grid.arrange(
  bing_sentiment_comparison_plots[["count_plot"]],
  bing_sentiment_comparison_plots[["percentage_plot"]],
  nrow = 1
)

# NRC sentiment distribution of the two corpora in shared plots.
#
# Output recorded from an earlier run: the call raised dplyr's warning
# "Detected an unexpected many-to-many relationship between `x` and `y`" in
# inner_join(., sentiments, by = "word") twice, each time with the hint to set
# relationship = "many-to-many" to silence it.
#   First warning:  row 6 of `x` matches multiple rows in `y`;
#                   row 5600 of `y` matches multiple rows in `x`.
#   Second warning: row 4 of `x` matches multiple rows in `y`;
#                   row 5664 of `y` matches multiple rows in `x`.
nrc_sentiment_comparison_plots <- compare_sentiment_distribution(
  bible_tokens = bible_tokens,
  quran_tokens = quran_tokens,
  lexicon = "nrc"
)

# Counts on the left, percentages on the right.
grid.arrange(
  nrc_sentiment_comparison_plots[["count_plot"]],
  nrc_sentiment_comparison_plots[["percentage_plot"]],
  nrow = 1
)

# Lexical diversity bar for each corpus.
bible_lexical_diversity_plot <- lexical_diversity(
  tokens = bible_tokens,
  title = "Bible (KJV)"
)
quran_lexical_diversity_plot <- lexical_diversity(
  tokens = quran_tokens,
  title = "Quran"
)

# Show both bars in one row, Bible first.
grid.arrange(
  bible_lexical_diversity_plot,
  quran_lexical_diversity_plot,
  nrow = 1
)

# Topic model plot for each corpus.
bible_topic_plot <- plot_topics(
  tokens = bible_tokens,
  title = "Top Terms in Each Topic of the Bible (KJV)"
)
quran_topic_plot <- plot_topics(
  tokens = quran_tokens,
  title = "Top Terms in Each Topic of the Quran"
)

# Show both topic plots in one row, Bible first.
grid.arrange(
  bible_topic_plot,
  quran_topic_plot,
  nrow = 1
)
