# Load required libraries ----
# pacman::p_load() attaches each package, installing it first if it is
# missing.
pacman::p_load(
  pacman, tidytext, dplyr, tidyr, ggplot2, readr, topicmodels, udpipe,
  gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats
)

# Load the CSV file ----
# The path is relative to the current working directory.
bible_data <- read_csv("Douay_Rheims_Bible2.csv")

# Rename columns ----
# NOTE(review): assumes the CSV has exactly four columns in the order
# book, chapter, verse, verse text -- confirm against the file.
colnames(bible_data) <- c("Book", "Chapter", "Verse", "Verse_Text")

# Add document IDs ----
# Each verse (row) is treated as one document, identified by its row number.
bible_data <- bible_data %>%
  mutate(document = row_number())

# Tokenize the text into words ----
# Produces one row per word of Verse_Text, keeping the other columns
# (including the document id) alongside each token.
tokens <- bible_data %>%
  unnest_tokens(word, Verse_Text)

# Remove stop words ----
# stop_words is the lexicon data set shipped with tidytext.
data("stop_words")

tokens <- tokens %>%
  anti_join(stop_words, by = "word")

# Count word frequencies ----
word_counts <- tokens %>%
  count(word, sort = TRUE)

# Display the most common words ----
print(head(word_counts, 30))

# Plot the most common words ----
# slice_max() names the ranking column explicitly; the superseded top_n(30)
# guessed it from the last column and emitted a "Selecting by n" message.
# Ties at the cut-off are kept, as they were with top_n().
word_counts %>%
  slice_max(order_by = n, n = 30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most Common Words in the Douay-Rheims Bible",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal()

# Sentiment Analysis using Bing Lexicon ----
# Keep only tokens that appear in the Bing lexicon. The join key is given
# explicitly so the join does not depend on (or report) guessed columns.
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"), by = "word")

# Count positive and negative words ----
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column ----
# Share of each sentiment among all matched tokens.
bing_sentiments_count <- bing_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot Bing sentiment counts and percentages ----
# The plot object is stored, not drawn here.
bing_plot <- ggplot(
  bing_sentiments_count,
  aes(x = sentiment, y = n, fill = sentiment)
) +
  geom_col() +
  # vjust = 1 puts the label just below the top edge of its bar.
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(
    title = "Sentiment Analysis of the Douay-Rheims Bible (Bing Lexicon)",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# NRC Sentiment Analysis ----
# NOTE(review): get_sentiments("nrc") obtains the lexicon through the
# textdata package, which is not in the p_load() list and may ask for
# confirmation on first download -- confirm it is available where this runs.
# A word can carry several NRC categories, so a token may match several rows.
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), by = "word")

# Count NRC sentiments ----
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE)

# Add percentage column ----
# Share of each category among all matched (token, category) rows.
nrc_sentiments_count <- nrc_sentiments_count %>%
  mutate(percentage = n / sum(n) * 100)

# Plot NRC sentiment counts and percentages ----
nrc_plot <- ggplot(
  nrc_sentiments_count,
  aes(x = reorder(sentiment, n), y = n, fill = sentiment)
) +
  geom_col() +
  # hjust = 1 right-aligns the label at the end of its (flipped) bar.
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(
    title = "NRC Sentiment Analysis of the Douay-Rheims Bible",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# Arrange Bing and NRC plots side by side ----
grid.arrange(bing_plot, nrc_plot, nrow = 1)

# Generate Word Cloud ----
# Fixed seed so the random word placement is reproducible between runs.
set.seed(1234)

wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE, # most frequent words are placed first
  rot.per = 0.1,
  scale = c(3.5, 0.75),
  colors = brewer.pal(8, "Dark2")
)

# Left-aligned bold title in the top margin of the word cloud.
mtext(
  "Word Cloud of the Douay-Rheims Bible",
  side = 3, adj = 0, line = 1, cex = 1, font = 2
)

# Prepare data for Topic Modeling ----

# Create a document-term matrix ----
# One row per document id, one column per word, cells hold the word counts.
dtm <- tokens %>%
  count(document, word) %>%
  cast_dtm(document, word, n)

# Set the number of topics ----
num_topics <- 6 # Adjust as needed

# Run LDA ----
# The seed makes the fitted model reproducible.
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))

# Get the top terms for each topic ----
# "beta" is the per-topic-per-term probability matrix in tidy form.
lda_terms <- tidy(lda_model, matrix = "beta")

# Display the top terms for each topic ----
# slice_max() replaces the superseded top_n(); ties at the cut-off are kept,
# as before, and the result is ordered by topic then descending beta.
top_terms <- lda_terms %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, -beta)

# Plot the top terms for each topic ----
# reorder_within() / scale_x_reordered() order the terms independently inside
# every facet.
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  coord_flip() +
  labs(
    title = "Top Terms in Each Topic in the Douay-Rheims Bible",
    x = "Terms",
    y = "Beta"
  ) +
  scale_x_reordered() +
  theme_minimal()

# Textual Complexity: Flesch-Kincaid Readability ----
# Collapse all verses into one string so the measure covers the whole text.
bible_text <- paste(bible_data$Verse_Text, collapse = " ")

# textstat_readability() returns a data frame with a document column and one
# column per requested measure.
readability <- textstat_readability(bible_text, measure = "Flesch.Kincaid")

# Extract the measure column: pasting the whole data frame would yield one
# string per column (document id and score) instead of a single message.
print(paste(
  "Flesch-Kincaid Readability Score for the Douay-Rheims Bible:",
  readability$Flesch.Kincaid
))