pacman::p_load(pacman, tidytext, dplyr, tidyr, ggplot2, readr, topicmodels, udpipe, gridExtra, wordcloud, RColorBrewer, quanteda, quanteda.textstats)
# Load the verses ----
# Read the verse-level CSV; the path is relative to the working directory.
bible_data <- read_csv("Douay_Rheims_Bible2.csv")

# Standardise the column names.
# NOTE(review): assumes the CSV has exactly these four columns, in this
# order -- confirm against the file.
colnames(bible_data) <- c("Book", "Chapter", "Verse", "Verse_Text")

# Number the rows so that every verse carries its own `document` id.
bible_data <- bible_data %>%
  mutate(document = row_number())
# Word frequencies ----
# Split each verse into one-word-per-row tokens (column `word`).
tokens <- bible_data %>%
  unnest_tokens(word, Verse_Text)

# Remove stop words by anti-joining against tidytext's `stop_words` table.
data("stop_words")
tokens <- tokens %>%
  anti_join(stop_words, by = "word")

# Count how often each remaining word occurs, most frequent first.
word_counts <- tokens %>%
  count(word, sort = TRUE)
print(head(word_counts, 30))

# Horizontal bar chart of the 30 most frequent words. The ranking column is
# named explicitly (ties at the cut-off are kept), and the plot is printed
# explicitly so it is also drawn when the script is run through source().
top_words_plot <- word_counts %>%
  slice_max(n, n = 30) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Most Common Words in the Douay-Rheims Bible",
    x = "Words",
    y = "Frequency"
  ) +
  theme_minimal()
print(top_words_plot)
# Sentiment analysis ----
# Bing lexicon: keep only tokens that appear in the lexicon. The join key is
# stated explicitly, matching the NRC join below.
bing_sentiments <- tokens %>%
  inner_join(get_sentiments("bing"), by = "word")

# Matches per sentiment, plus each sentiment's share of all Bing matches.
bing_sentiments_count <- bing_sentiments %>%
  count(sentiment, sort = TRUE) %>%
  mutate(percentage = n / sum(n) * 100)

bing_plot <- ggplot(
  bing_sentiments_count,
  aes(x = sentiment, y = n, fill = sentiment)
) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), vjust = 1) +
  labs(
    title = "Sentiment Analysis of the Douay-Rheims Bible (Bing Lexicon)",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# NRC lexicon: same procedure as for Bing.
# NOTE(review): get_sentiments("nrc") may need the textdata package and a
# one-time interactive download -- confirm before running unattended.
nrc_sentiments <- tokens %>%
  inner_join(get_sentiments("nrc"), by = "word")

# Percentages are shares of all lexicon matches (rows after the join).
nrc_sentiments_count <- nrc_sentiments %>%
  count(sentiment, sort = TRUE) %>%
  mutate(percentage = n / sum(n) * 100)

nrc_plot <- ggplot(
  nrc_sentiments_count,
  aes(x = reorder(sentiment, n), y = n, fill = sentiment)
) +
  geom_col() +
  geom_text(aes(label = paste0(round(percentage, 1), "%")), hjust = 1) +
  coord_flip() +
  labs(
    title = "NRC Sentiment Analysis of the Douay-Rheims Bible",
    x = "Sentiment",
    y = "Count"
  ) +
  theme_minimal()

# Draw both sentiment charts side by side.
grid.arrange(bing_plot, nrc_plot, nrow = 1)
# Word cloud ----
# Fix the RNG seed so the cloud layout is reproducible between runs.
set.seed(1234)

# Word cloud of at most 100 words, skipping words that occur fewer than
# 5 times; sizes are driven by the counts in `word_counts`.
wordcloud(
  words = word_counts$word,
  freq = word_counts$n,
  min.freq = 5,
  max.words = 100,
  random.order = FALSE,
  rot.per = 0.1,
  scale = c(3.5, 0.75),
  colors = brewer.pal(8, "Dark2")
)

# Bold, left-aligned title in the top margin of the word-cloud device.
mtext(
  "Word Cloud of the Douay-Rheims Bible",
  side = 3,
  adj = 0,
  line = 1,
  cex = 1,
  font = 2
)
# Topic modelling ----
# Document-term matrix: one document per `document` id, one column per word,
# cell values are per-document word counts.
dtm <- tokens %>%
  count(document, word) %>%
  cast_dtm(document, word, n)

num_topics <- 6 # Adjust as needed

# Fit the LDA model; the seed makes the fit reproducible.
lda_model <- LDA(dtm, k = num_topics, control = list(seed = 1234))

# Per-topic term probabilities (beta) in tidy, one-row-per-topic-term form.
lda_terms <- tidy(lda_model, matrix = "beta")

# The 10 highest-beta terms of every topic (ties at the cut-off are kept),
# ordered by topic and then by decreasing beta.
top_terms <- lda_terms %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, desc(beta))

# Print one comma-separated line of top terms per topic.
top_terms %>%
  group_by(topic) %>%
  summarize(terms = paste(term, collapse = ",")) %>%
  print()

# One facet per topic with horizontal bars. Terms are mapped to the y axis
# directly (no coord_flip()), so `scales = "free_y"` unambiguously gives each
# facet its own term axis; reorder_within() + scale_y_reordered() sort the
# terms by beta inside each facet. Printed explicitly so the plot is also
# drawn when the script is run through source().
top_terms_plot <- top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = beta, y = term, fill = as.factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free_y") +
  scale_y_reordered() +
  labs(
    title = "Top Terms in Each Topic in the Douay-Rheims Bible",
    x = "Beta",
    y = "Terms"
  ) +
  theme_minimal()
print(top_terms_plot)
# Readability ----
# Join all verses into a single space-separated string so the whole text is
# scored as one document.
bible_text <- paste(bible_data$Verse_Text, collapse = " ")

# textstat_readability() returns a data frame; the score is in the column
# named after the requested measure.
readability <- textstat_readability(bible_text, measure = "Flesch.Kincaid")

# Extract the score column before pasting: pasting the whole data frame
# would produce one string per column instead of a single message.
print(paste(
  "Flesch-Kincaid Readability Score for the Douay-Rheims Bible:",
  readability$Flesch.Kincaid
))