library(tidytext)    # Text cleaning
library(quanteda)    # Document-feature matrices
## Package version: 4.2.0
## Unicode version: 15.1
## ICU version: 74.1
## Parallel computing: 16 of 16 threads used.
## See https://quanteda.io for tutorials and examples.
library(topicmodels) # LDA modeling
library(ggplot2)     # Visualization
library(tidyverse)   # Data wrangling (dplyr, tidyr, readr, and friends)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
papers_df <- read_csv("C:/PostDoc Journey_Coky/Content Analysis Study/scopus_abstract_example.csv", na = "")
## Rows: 68 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): title, text
## dbl (1): year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
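
# The message above suggests declaring the column types (or setting
# show_col_types = FALSE). An equivalent read with the types spelled out,
# matching the column specification reported above:
papers_df <- read_csv(
  "C:/PostDoc Journey_Coky/Content Analysis Study/scopus_abstract_example.csv",
  na = "",
  col_types = cols(title = col_character(), text = col_character(),
                   year = col_double())
)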
tidy_papers <- papers_df %>%
  unnest_tokens(word, text, to_lower = TRUE) %>%  # One row per word, lowercased
  anti_join(stop_words) %>%                       # Drop common English stop words
  filter(!grepl("[0-9]", word)) %>%               # Drop tokens containing digits
  filter(nchar(word) > 3)                         # Drop short words (3 characters or fewer)
## Joining with `by = join_by(word)`
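
# Scopus abstracts often contain publisher boilerplate that survives the
# generic stop-word list. An optional sketch for extending it; the terms
# below are illustrative guesses, not taken from this corpus:
custom_stops <- tibble(word = c("copyright", "elsevier", "rights", "reserved"))
tidy_papers <- tidy_papers %>%
  anti_join(custom_stops, by = "word")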
tidy_papers %>%
  count(word, sort = TRUE) %>%
  head(20) %>%
  ggplot(aes(x = reorder(word, n), y = n)) +
  geom_col(fill = "steelblue") +  # Added color
  coord_flip() +
  labs(title = "Top 20 Most Frequent Words",
       x = "Word",
       y = "Frequency") +
  theme_minimal()
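
# To export the figure (e.g., for a manuscript), ggsave() writes the most
# recent plot to disk; the file name and dimensions are arbitrary choices:
ggsave("top_20_words.png", width = 7, height = 5, dpi = 300)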

library(wordcloud2)
word_counts <- tidy_papers %>% count(word)
wordcloud2(word_counts, size = 0.8)
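
# wordcloud2() returns an htmlwidget rather than a base plot, so ggsave()
# will not capture it; one way to keep a standalone copy (the file name is
# an arbitrary choice):
htmlwidgets::saveWidget(wordcloud2(word_counts, size = 0.8),
                        "wordcloud.html", selfcontained = TRUE)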
dtm <- tidy_papers %>%
  count(title, word) %>%  # Term counts per document
  cast_dfm(document = title, term = word, value = n) %>%
  dfm_trim(min_termfreq = 5)  # Drop terms occurring fewer than 5 times overall
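
# Quick sanity checks on the trimmed matrix before modeling, using
# quanteda's accessors:
ndoc(dtm)             # Number of documents (abstracts) retained
nfeat(dtm)            # Number of terms surviving dfm_trim()
topfeatures(dtm, 10)  # Ten most frequent terms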

# LDA() expects a DocumentTermMatrix rather than a quanteda dfm,
# so convert with quanteda::convert() before fitting
lda_model <- LDA(convert(dtm, to = "topicmodels"),
                 k = 5, control = list(seed = 1234))  # 5 topics, seed for reproducibility
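
# k = 5 is a judgment call. One common heuristic is to refit over a grid of
# candidate k values and compare model perplexity (lower is better); a rough
# sketch, which can be slow on larger corpora:
k_grid <- c(3, 5, 7, 10)
perplexities <- sapply(k_grid, function(k) {
  m <- LDA(convert(dtm, to = "topicmodels"),
           k = k, control = list(seed = 1234))
  perplexity(m)  # Perplexity of the fitted VEM model
})
plot(k_grid, perplexities, type = "b",
     xlab = "Number of topics (k)", ylab = "Perplexity")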
topics <- tidy(lda_model, matrix = "beta")  # Extract topic-term probabilities

top_terms <- topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 5) %>%
  ungroup() %>%
  arrange(topic, -beta)

ggplot(top_terms, aes(x = reorder_within(term, beta, topic), y = beta,
                      fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +                   # Drop the suffix reorder_within() appends
  facet_wrap(~topic, scales = "free") +   # Order terms within each topic, not globally
  coord_flip()
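
# beta describes topics as distributions over words; the companion gamma
# matrix gives per-document topic proportions. A sketch for tagging each
# abstract with its most probable topic:
doc_topics <- tidy(lda_model, matrix = "gamma") %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%  # Keep the dominant topic per document
  ungroup()
head(doc_topics)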

tidy_papers %>%
  count(year, word) %>%      # `year` is carried through unnest_tokens(), so no join is needed
  group_by(year) %>%
  slice_max(n, n = 5) %>%    # Top 5 words per year
  ungroup() %>%
  ggplot(aes(x = year, y = n, color = word)) +
  geom_line() +
  labs(title = "Term Trends Over Time",
       x = "Year", y = "Frequency")