library(tidytext) # Text cleaning
library(quanteda) # Document-feature matrices
## Package version: 4.2.0
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 16 of 16 threads used.
## See https://quanteda.io for tutorials and examples.
library(topicmodels) # LDA modeling
library(ggplot2) # Visualization (also attached by the tidyverse below)
library(tidyverse) # dplyr, tidyr, readr, stringr, and friends
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.2.1
## ✔ purrr 1.0.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
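# The conflict messages above mean filter() and lag() now dispatch to dplyr.
# When the stats versions are needed, call them with explicit namespacing; a
# minimal sketch (the series below is an illustrative assumption):
ts_example <- ts(1:12)
stats::filter(ts_example, rep(1/3, 3)) # 3-term moving average via stats, not dplyr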
papers_df <- read_csv("C:/PostDoc Journey_Coky/Content Analysis Study/scopus_abstract_example.csv",
                      na = "") # Scopus export with columns: title, text (abstract), year
## Rows: 68 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): title, text
## dbl (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tidy_papers <- papers_df %>%
  unnest_tokens(word, text, to_lower = TRUE) %>% # One lower-cased word per row
  anti_join(stop_words) %>% # Drop common English stop words
  filter(!grepl("[0-9]", word)) %>% # Drop tokens containing digits
  filter(nchar(word) > 3) # Drop short words (3 characters or fewer)
## Joining with `by = join_by(word)`
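# stop_words is tidytext's built-in English lexicon. A minimal sketch of
# extending it with domain-specific terms (the example words are assumptions,
# not drawn from this corpus):
custom_stops <- stop_words %>%
  bind_rows(tibble(word = c("study", "results", "paper"), lexicon = "custom"))
# Then swap custom_stops into the anti_join() above.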
tidy_papers %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +
  coord_flip() + # Horizontal bars keep long words legible
  labs(title = "Top 20 Most Frequent Words",
       x = "Word",
       y = "Frequency") +
  theme_minimal()

library(wordcloud2) # Interactive HTML word cloud
word_counts <- tidy_papers %>% count(word) # wordcloud2 reads the first two columns as word and frequency
wordcloud2(word_counts, size = 0.8)
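# With only 68 abstracts the cloud fills up with one-off terms; a variant that
# keeps words occurring at least 3 times (the threshold is an arbitrary choice):
wordcloud2(word_counts %>% filter(n >= 3), size = 0.8)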
dtm <- tidy_papers %>%
  count(title, word) %>%
  cast_dfm(document = title, term = word, value = n) %>%
  dfm_trim(min_termfreq = 5) # Drop terms appearing fewer than 5 times overall
dtm <- dtm[ntoken(dtm) > 0, ] # Guard: drop documents left empty by trimming (LDA needs non-empty rows)
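# A quick sanity check on the trimmed matrix before modeling, using quanteda's
# accessors (purely diagnostic; the trimming threshold above is a judgment call):
ndoc(dtm) # Documents retained
nfeat(dtm) # Distinct terms retained
topfeatures(dtm, 10) # Ten most frequent terms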
lda_model <- LDA(convert(dtm, to = "topicmodels"), # topicmodels::LDA() expects a DocumentTermMatrix, not a dfm
                 k = 5, control = list(seed = 1234)) # Fit a 5-topic model; fixed seed for reproducibility
topic_terms <- tidy(lda_model, matrix = "beta") # Per-topic word probabilities (beta matrix)
top_terms <- topic_terms %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>% # Five highest-probability terms per topic
  ungroup() %>%
  arrange(topic, -beta)
ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() + # Order terms within each facet, not globally
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
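# The beta matrix describes topics in terms of words; tidy() can also return
# the per-document topic mixtures (gamma). A minimal sketch of tagging each
# abstract with its dominant topic:
doc_topics <- tidy(lda_model, matrix = "gamma") # One row per document-topic pair
doc_topics %>%
  group_by(document) %>%
  slice_max(gamma, n = 1, with_ties = FALSE) %>% # Keep the single most probable topic
  ungroup()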

tidy_papers %>%
  count(year, word) %>% # year already rides along from papers_df, so no metadata join is needed
  group_by(year) %>%
  slice_max(n, n = 5, with_ties = FALSE) %>% # Top 5 words per year
  ggplot(aes(x = year, y = n, color = word)) +
  geom_line() +
  labs(title = "Term Trends Over Time",
       x = "Year",
       y = "Frequency")
