Bible and Quran: Sentiment Analysis

# Load necessary libraries
pacman::p_load(pacman, tidytext, dplyr, ggplot2, readr, topicmodels, textdata, gridExtra, wordcloud, RColorBrewer)

# Function to process text data
#
# Reads a CSV, keeps one text column, strips digits from it and splits it into
# one-word-per-row tokens with stop words removed.
#
# Args:
#   file_path:     path of the CSV file to read.
#   text_col_name: name (string) of the column that holds the text.
#
# Returns: a data frame with `document` (row number of the source CSV row)
#   and `word` (a single token).
process_text_data <- function(file_path, text_col_name) {
  raw <- read_csv(file_path)

  # The row number becomes the document id, so every CSV row is one document
  numbered <- mutate(raw, document = row_number())
  selected <- select(numbered, document, text = all_of(text_col_name))

  # Remove every run of digits before tokenising
  cleaned <- mutate(selected, text = gsub("[0-9]+", "", text))

  words <- unnest_tokens(cleaned, word, text)
  words <- filter(words, !is.na(word))

  # Drop tokens found in the `stop_words` table
  anti_join(words, stop_words, by = "word")
}

# Function to create a word cloud with a title
#
# Draws a word cloud of the token frequencies on the current graphics device
# and then adds a main title to it.
#
# Args:
#   tokens: data frame with a `word` column (one row per token).
#   title:  text used as the plot's main title.
#
# Called for its plotting side effect; the value is whatever `title()` returns.
create_word_cloud <- function(tokens, title) {
  freq_table <- count(tokens, word, sort = TRUE)

  wordcloud(
    words = freq_table$word,
    freq = freq_table$n,
    min.freq = 2,
    max.words = 100,
    random.order = FALSE,
    rot.per = 0.35,
    colors = brewer.pal(8, "Dark2"),
    scale = c(4, 0.5)
  )

  # `title` is also the name of the character argument; R skips non-function
  # bindings when resolving a call, so this still reaches the title function
  title(main = title)
}

# Function to plot word frequencies with counts and percentages
#
# Builds two horizontal bar charts of the most frequent words: one showing
# raw counts and one showing each word's share of ALL tokens in `tokens`.
#
# Args:
#   tokens:  data frame with a `word` column (one row per token).
#   title:   title applied to both plots.
#   n_words: how many of the most frequent words to show (default 10).
#
# Returns: list(count_plot, percentage_plot) of ggplot objects.
plot_word_freq_with_percentage <- function(tokens, title, n_words = 10) {
  # The percentage is computed BEFORE trimming to the top words, so the
  # denominator is the total token count rather than the sum of the plotted
  # bars (which would make the bars always add up to 100%).
  word_counts <- tokens %>%
    count(word, sort = TRUE) %>%
    mutate(percentage = n / sum(n) * 100)

  # head() returns fewer rows when there are fewer distinct words, whereas
  # indexing with 1:10 would pad the data with NA rows.
  top_words <- head(word_counts, n_words)

  count_plot <- ggplot(top_words, aes(x = reorder(word, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Words", y = "Count")

  percentage_plot <- ggplot(
    top_words,
    aes(x = reorder(word, n), y = percentage)
  ) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Words", y = "Percentage")

  list(count_plot = count_plot, percentage_plot = percentage_plot)
}

# Function to plot sentiment analysis
#
# Joins the tokens to a sentiment lexicon and charts how often each sentiment
# category occurs, both as raw counts and as a share of all matched rows.
#
# Args:
#   tokens:  data frame with a `word` column (one row per token).
#   lexicon: lexicon name passed to get_sentiments(), e.g. "bing" or "nrc".
#            The lexicon must provide a `sentiment` column.
#   title:   title applied to both plots.
#
# Returns: list(count_plot, percentage_plot) of ggplot objects.
plot_sentiment_analysis <- function(tokens, lexicon, title) {
  sentiments <- get_sentiments(lexicon)
  if (!"sentiment" %in% names(sentiments)) {
    stop(
      "Lexicon '", lexicon, "' has no `sentiment` column to count",
      call. = FALSE
    )
  }

  # A word repeats across tokens and a lexicon may list one word under several
  # sentiments, so the join is many-to-many by design. Declaring it removes the
  # "unexpected many-to-many relationship" warning (needs dplyr >= 1.1.0).
  sentiment_counts <- tokens %>%
    inner_join(sentiments, by = "word", relationship = "many-to-many") %>%
    count(sentiment, sort = TRUE) %>%
    mutate(percentage = n / sum(n) * 100)

  count_plot <- ggplot(
    sentiment_counts,
    aes(x = reorder(sentiment, n), y = n)
  ) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Sentiment", y = "Count")

  percentage_plot <- ggplot(
    sentiment_counts,
    aes(x = reorder(sentiment, n), y = percentage)
  ) +
    geom_col() +
    coord_flip() +
    labs(title = title, x = "Sentiment", y = "Percentage")

  list(count_plot = count_plot, percentage_plot = percentage_plot)
}

# Function to compare sentiment distribution
#
# Counts sentiment categories in both texts with the same lexicon and draws
# dodged bar charts comparing them, once by raw count and once by percentage.
#
# The percentage is the number of sentiment matches divided by the number of
# rows in that text's token table, times 100. It is therefore a share of all
# tokens supplied, not of the matched tokens, and need not sum to 100.
#
# Args:
#   bible_tokens: token data frame (with a `word` column) for the Bible.
#   quran_tokens: token data frame (with a `word` column) for the Quran.
#   lexicon:      lexicon name passed to get_sentiments(); it must provide a
#                 `sentiment` column.
#
# Returns: list(count_plot, percentage_plot) of ggplot objects.
compare_sentiment_distribution <- function(bible_tokens, quran_tokens, lexicon) {
  sentiments <- get_sentiments(lexicon)
  if (!"sentiment" %in% names(sentiments)) {
    stop(
      "Lexicon '", lexicon, "' has no `sentiment` column to count",
      call. = FALSE
    )
  }

  # Sentiment counts for one text, labelled with where they came from
  count_for_source <- function(tokens, label) {
    token_total <- nrow(tokens)

    # The join is many-to-many by design (repeated words, multi-sentiment
    # lexicon entries); declaring it avoids dplyr's warning (dplyr >= 1.1.0).
    tokens %>%
      inner_join(sentiments, by = "word", relationship = "many-to-many") %>%
      count(sentiment, sort = TRUE) %>%
      mutate(
        total_words = token_total,
        source = label,
        percentage = n / total_words * 100
      )
  }

  combined <- bind_rows(
    count_for_source(bible_tokens, "Bible"),
    count_for_source(quran_tokens, "Quran")
  )

  count_plot <- ggplot(combined, aes(x = sentiment, y = n, fill = source)) +
    geom_col(position = "dodge") +
    labs(
      title = paste(lexicon, "Sentiment Comparison"),
      x = "Sentiment",
      y = "Count"
    ) +
    scale_fill_brewer(palette = "Paired")

  percentage_plot <- ggplot(
    combined,
    aes(x = sentiment, y = percentage, fill = source)
  ) +
    geom_col(position = "dodge") +
    labs(
      title = paste(lexicon, "Sentiment Comparison (Percentage)"),
      x = "Sentiment",
      y = "Percentage"
    ) +
    scale_fill_brewer(palette = "Paired")

  list(count_plot = count_plot, percentage_plot = percentage_plot)
}

# Function to calculate lexical diversity
#
# Lexical diversity here is the number of distinct words divided by the total
# number of token rows. The ratio is drawn as a single bar.
#
# Args:
#   tokens: data frame with a `word` column (one row per token).
#   title:  plot title.
#
# Returns: a ggplot object.
lexical_diversity <- function(tokens, title) {
  ratio_tbl <- summarise(tokens, lexical_diversity = n_distinct(word) / n())

  # The x aesthetic is an empty string, so the bar carries no category label
  ggplot(ratio_tbl, aes(x = "", y = lexical_diversity)) +
    geom_bar(stat = "identity") +
    labs(title = title, y = "Lexical Diversity", x = "")
}

# Function to plot topics
#
# Fits an LDA topic model to the tokens and plots the highest-beta terms of
# each topic, one facet per topic, with the bars sorted inside every facet.
#
# Args:
#   tokens:  data frame with `document` and `word` columns.
#   title:   plot title.
#   k:       number of topics to fit (default 4).
#   n_terms: number of top terms to show per topic (default 10).
#
# Returns: a ggplot object.
plot_topics <- function(tokens, title, k = 4, n_terms = 10) {
  dtm <- tokens %>%
    count(document, word) %>%
    cast_dtm(document, word, n)

  # Fixed seed so repeated runs fit the same model
  lda <- LDA(dtm, k = k, control = list(seed = 1234))
  topics <- tidy(lda, matrix = "beta")

  # A term can rank highly in several topics. A plain reorder(term, beta)
  # gives it one global position, which leaves the bars unsorted inside the
  # facets. reorder_within() orders terms separately for each topic.
  top_terms <- topics %>%
    group_by(topic) %>%
    slice_max(beta, n = n_terms) %>%
    ungroup() %>%
    arrange(topic, -beta) %>%
    mutate(term = reorder_within(term, beta, topic))

  ggplot(top_terms, aes(x = term, y = beta)) +
    geom_col(show.legend = FALSE) +
    facet_wrap(~ topic, scales = "free") +
    # Strips the per-topic suffix that reorder_within() adds to the labels
    scale_x_reordered() +
    coord_flip() +
    labs(title = title, x = "Terms", y = "Beta")
}

# Process the Bible and Quran texts, naming each CSV's text column explicitly.
# In the Bible CSV the text column is named after the work itself.
bible_tokens <- process_text_data(
  file_path = "Old_Testament_KJ_Bible.csv",
  text_col_name = "The Old Testament of the King James Version of the Bible"
)

## Rows: 24606 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): The Old Testament of the King James Version of the Bible
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Tokenise the Quran CSV; the text to analyse is in `ayahs-translation`
quran_tokens <- process_text_data(
  file_path = "Quran_english.csv",
  text_col_name = "ayahs-translation"
)

## Rows: 6236 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): ayahs-translation
## dbl (3): id, surahs, ayahs
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Plot word clouds with titles
bible_wordcloud <- function() {
  # Zero-argument wrapper: draws the cloud for the global `bible_tokens`
  create_word_cloud(
    tokens = bible_tokens,
    title = "Word Cloud of the Bible (KJV)"
  )
}

quran_wordcloud <- function() {
  # Zero-argument wrapper: draws the cloud for the global `quran_tokens`
  create_word_cloud(
    tokens = quran_tokens,
    title = "Word Cloud of the Quran"
  )
}

# Plot the word clouds side by side, then put the previous layout back so
# later base-graphics plots are not forced into the 1 x 2 grid.
# par() returns the settings it replaced, which is what gets restored.
previous_par <- par(mfrow = c(1, 2))
bible_wordcloud()
quran_wordcloud()
par(previous_par)

# Word frequency charts for each text: raw counts and percentages
bible_word_plots <- plot_word_freq_with_percentage(
  tokens = bible_tokens,
  title = "Most Common Words in the Bible (KJV)"
)
quran_word_plots <- plot_word_freq_with_percentage(
  tokens = quran_tokens,
  title = "Most Common Words in the Quran"
)

# Display all four plots in a 2 x 2 grid
grid.arrange(
  bible_word_plots$count_plot,
  quran_word_plots$count_plot,
  bible_word_plots$percentage_plot,
  quran_word_plots$percentage_plot,
  nrow = 2,
  ncol = 2
)

# Bing lexicon sentiment charts for each text
bible_bing_plots <- plot_sentiment_analysis(
  tokens = bible_tokens,
  lexicon = "bing",
  title = "Bing Sentiment Analysis of the Bible (KJV)"
)

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 137263 of `x` matches multiple rows in `y`.
## ℹ Row 1220 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

quran_bing_plots <- plot_sentiment_analysis(
  tokens = quran_tokens,
  lexicon = "bing",
  title = "Bing Sentiment Analysis of the Quran"
)

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 54828 of `x` matches multiple rows in `y`.
## ℹ Row 2715 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Display the four Bing plots in a 2 x 2 grid
grid.arrange(
  bible_bing_plots$count_plot,
  quran_bing_plots$count_plot,
  bible_bing_plots$percentage_plot,
  quran_bing_plots$percentage_plot,
  nrow = 2,
  ncol = 2
)

# NRC lexicon sentiment charts for each text
bible_nrc_plots <- plot_sentiment_analysis(
  tokens = bible_tokens,
  lexicon = "nrc",
  title = "NRC Sentiment Analysis of the Bible (KJV)"
)

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6 of `x` matches multiple rows in `y`.
## ℹ Row 5600 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

quran_nrc_plots <- plot_sentiment_analysis(
  tokens = quran_tokens,
  lexicon = "nrc",
  title = "NRC Sentiment Analysis of the Quran"
)

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4 of `x` matches multiple rows in `y`.
## ℹ Row 5664 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Display the four NRC plots in a 2 x 2 grid
grid.arrange(
  bible_nrc_plots$count_plot,
  quran_nrc_plots$count_plot,
  bible_nrc_plots$percentage_plot,
  quran_nrc_plots$percentage_plot,
  nrow = 2,
  ncol = 2
)

# Bing lexicon: Bible vs Quran sentiment distribution
bing_sentiment_comparison_plots <- compare_sentiment_distribution(
  bible_tokens = bible_tokens,
  quran_tokens = quran_tokens,
  lexicon = "bing"
)

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 137263 of `x` matches multiple rows in `y`.
## ℹ Row 1220 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 54828 of `x` matches multiple rows in `y`.
## ℹ Row 2715 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Show the count and percentage comparisons next to each other
grid.arrange(
  bing_sentiment_comparison_plots$count_plot,
  bing_sentiment_comparison_plots$percentage_plot,
  nrow = 1
)

# NRC lexicon: Bible vs Quran sentiment distribution
nrc_sentiment_comparison_plots <- compare_sentiment_distribution(
  bible_tokens = bible_tokens,
  quran_tokens = quran_tokens,
  lexicon = "nrc"
)

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 6 of `x` matches multiple rows in `y`.
## ℹ Row 5600 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

## Warning in inner_join(., sentiments, by = "word"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 4 of `x` matches multiple rows in `y`.
## ℹ Row 5664 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.

# Show the count and percentage comparisons next to each other
grid.arrange(
  nrc_sentiment_comparison_plots$count_plot,
  nrc_sentiment_comparison_plots$percentage_plot,
  nrow = 1
)

# Lexical diversity (distinct words / total tokens) for each text
bible_lexical_diversity_plot <- lexical_diversity(
  tokens = bible_tokens,
  title = "Bible (KJV)"
)
quran_lexical_diversity_plot <- lexical_diversity(
  tokens = quran_tokens,
  title = "Quran"
)

# Show the two diversity plots next to each other
grid.arrange(
  bible_lexical_diversity_plot,
  quran_lexical_diversity_plot,
  nrow = 1
)

# Topic model plots: top terms per topic for each text
bible_topic_plot <- plot_topics(
  tokens = bible_tokens,
  title = "Top Terms in Each Topic of the Bible (KJV)"
)
quran_topic_plot <- plot_topics(
  tokens = quran_tokens,
  title = "Top Terms in Each Topic of the Quran"
)

# Show the two topic plots next to each other
grid.arrange(
  bible_topic_plot,
  quran_topic_plot,
  nrow = 1
)

Bible and Quran: Sentiment Analysis

Patrick Ford

2024-08-05