library(tidytext)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ ggplot2   3.5.2     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.2.1
## ✔ purrr     1.0.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggdendro)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(cluster)
library(igraph)
## 
## Attaching package: 'igraph'
## 
## The following objects are masked from 'package:lubridate':
## 
##     %--%, union
## 
## The following objects are masked from 'package:purrr':
## 
##     compose, simplify
## 
## The following object is masked from 'package:tidyr':
## 
##     crossing
## 
## The following object is masked from 'package:tibble':
## 
##     as_data_frame
## 
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## 
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## 
## The following object is masked from 'package:base':
## 
##     union
library(ggraph)

# Load the abstracts table (per the column spec echoed below: title, text,
# year). With na = "" only empty fields are read as NA.
# NOTE(review): the path is absolute and machine-specific, so the script will
# not run unchanged on another computer.
papers_df <- read_csv(
  file = "C:/PostDoc Journey_Coky/Content Analysis Study/scopus_abstract_example.csv",
  na = ""
)
## Rows: 68 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): title, text
## dbl (1): year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Count every two-word sequence across all abstracts, discarding bigrams in
# which either word is a stop word. Result: one row per bigram with its
# corpus-wide count `n`, sorted from most to least frequent.
bigram_counts <- papers_df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  # An abstract that is missing or has fewer than two words yields an NA
  # bigram. Left in place it survives the stop-word filter and unite() turns
  # it into the literal string "NA NA", which would be counted as a term.
  filter(!is.na(bigram)) %>%
  # Split into the two words so each can be checked against the stop list.
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word) %>%
  # Re-join the surviving pairs into a single "word1 word2" string.
  unite(bigram, word1, word2, sep = " ") %>%
  count(bigram, sort = TRUE)

# Select the bigrams used by every later step: drop anything seen 5 times or
# fewer, then cap the list at 100 rows. bigram_counts is already sorted by
# descending count, so the cap keeps the most frequent bigrams.
# Both thresholds are tuning knobs; adjust as needed.
bigram_filtered <- slice_head(
  filter(bigram_counts, n > 5),
  n = 100
)

## 1. Bigram Network Visualization
# Split each retained bigram into its two words and build a graph in which
# every bigram is an edge from its first word to its second; the count `n`
# is carried along as an edge attribute.
bigram_network <- graph_from_data_frame(
  separate(bigram_filtered, bigram, into = c("from", "to"), sep = " ")
)

# The "fr" layout is randomised, so fix the seed to get a reproducible plot.
set.seed(123)
ggraph(bigram_network, layout = "fr") +
  # Edges: thicker and more opaque for more frequent bigrams.
  geom_edge_link(
    aes(edge_width = n, edge_alpha = n),
    color = "darkgray",
    show.legend = FALSE
  ) +
  # Nodes: sized by the number of edges touching the word.
  geom_node_point(
    aes(size = degree(bigram_network)),
    color = "lightblue",
    alpha = 0.7
  ) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  scale_edge_width(range = c(0.5, 3)) +
  theme_void() +
  labs(
    title = "Bigram Network Analysis of Islamic Finance Research",
    subtitle = "Node size represents degree centrality, edge width represents co-occurrence frequency"
  )

## 2. Bigram Document-Term Matrix
# Re-tokenise the abstracts into bigrams, count them per paper (papers are
# identified by `title`), keep only the bigrams selected in bigram_filtered,
# and cast the counts to a document-term matrix: one row per title, one
# column per bigram.
# NOTE(review): papers sharing an identical title would be merged into one
# document here - confirm titles are unique.
bigram_dtm <- papers_df %>%
  unnest_tokens(output = bigram, input = text, token = "ngrams", n = 2) %>%
  count(title, bigram) %>%
  filter(bigram %in% bigram_filtered$bigram) %>%
  cast_dtm(document = title, term = bigram, value = n)

# Dense base-R matrix form, as needed by dist(), kmeans() and the heatmaps.
bigram_matrix <- as.matrix(bigram_dtm)

## 3. Hierarchical Clustering
# Transpose so that each bigram is a row (an observation) and each paper a
# column, standardise every column with scale(), then measure Euclidean
# distance between bigrams. (This is Euclidean distance on standardised
# counts, not cosine similarity.)
dist_matrix <- dist(x = scale(t(bigram_matrix)), method = "euclidean")

# Agglomerative clustering with Ward's criterion ("ward.D2").
hc <- hclust(d = dist_matrix, method = "ward.D2")

# Plot the tree sideways so the bigram labels are readable.
ggdendrogram(hc, rotate = TRUE, size = 3) +
  ggtitle(
    label = "Hierarchical Clustering of Islamic Finance Bigrams",
    subtitle = "Ward's method with Euclidean distance"
  ) +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 8))

# Number of clusters to extract; chosen by eye from the dendrogram, so
# revisit it if the data change. The same value is reused for k-means below.
num_clusters <- 5

# Cluster id for every bigram, obtained by cutting the tree into
# num_clusters groups.
bigram_clusters <- cutree(tree = hc, k = num_clusters)

# Same dendrogram, now coloured by cluster with a shaded box around each one.
fviz_dend(
  x = hc,
  k = num_clusters,
  main = "Bigram Clusters in Islamic Finance Research",
  cex = 0.6,
  k_colors = "jco",
  rect = TRUE,
  rect_border = "jco",
  rect_fill = TRUE,
  labels_track_height = 0.8
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <https://github.com/kassambara/factoextra/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## 4. K-means Clustering
# Partition the bigrams (rows of the transposed matrix) into num_clusters
# groups. k-means starts from random centres, so fix the seed and keep the
# best of 25 restarts.
# Note: unlike the hierarchical step, the counts are passed in unscaled.
set.seed(123)
kmeans_result <- kmeans(
  x = t(bigram_matrix),
  centers = num_clusters,
  nstart = 25
)

# Cluster plot of the k-means solution with an ellipse drawn per cluster
# (clusters with too few points get none).
fviz_cluster(
  kmeans_result,
  data = t(bigram_matrix),
  ellipse.type = "norm",
  repel = TRUE,
  labelsize = 8,
  ggtheme = theme_minimal()
) +
  labs(
    title = "K-means Clustering of Islamic Finance Bigrams",
    subtitle = paste("Visualization of", num_clusters, "clusters")
  )
## Too few points to calculate an ellipse

## 5. Cluster Interpretation
# One row per bigram with the cluster id assigned by each method. All three
# vectors follow the column order of bigram_matrix, so they line up by
# position.
cluster_assignments <- data.frame(
  bigram = colnames(bigram_matrix),
  cluster_hc = bigram_clusters,
  cluster_kmeans = kmeans_result$cluster
)

# Hierarchical solution: for each cluster, list all of its bigrams as one
# comma-separated string together with the cluster size, largest first.
cluster_terms_hc <- arrange(
  summarise(
    group_by(cluster_assignments, cluster_hc),
    terms = paste(bigram, collapse = ", "),
    count = n()
  ),
  desc(count)
)

print(cluster_terms_hc)
## # A tibble: 5 × 3
##   cluster_hc terms                                                         count
##        <int> <chr>                                                         <int>
## 1          1 economic development, islamic economics, rights reserved, so…    52
## 2          4 design methodology, limitations implications, methodology ap…     7
## 3          5 islamic banking, islamic banks                                    2
## 4          2 islamic finance                                                   1
## 5          3 islamic financial                                                 1
# k-means solution: for each cluster, list all of its bigrams as one
# comma-separated string together with the cluster size, largest first.
cluster_terms_kmeans <- arrange(
  summarise(
    group_by(cluster_assignments, cluster_kmeans),
    terms = paste(bigram, collapse = ", "),
    count = n()
  ),
  desc(count)
)

print(cluster_terms_kmeans)
## # A tibble: 5 × 3
##   cluster_kmeans terms                                                     count
##            <int> <chr>                                                     <int>
## 1              1 islamic economics, islamic financial, rights reserved, s…    47
## 2              4 economic development, saudi arabia, vision 2030, financi…     7
## 3              2 islamic banks, financial performance, halal income, isla…     4
## 4              5 al shariah, maqasid al, takaful operators, esg principles     4
## 5              3 islamic finance                                               1
## 6. Heatmap Visualization
# Bigrams as rows, papers as columns, each column standardised so the colour
# scale is comparable across papers.
heatmap_data <- scale(t(bigram_matrix))

# Sort the rows by hierarchical cluster id so that members of the same
# cluster sit next to each other in the heatmap.
row_order <- order(cluster_assignments$cluster_hc)
heatmap_data <- heatmap_data[row_order, ]

# Row annotation: the hierarchical cluster of each bigram, in the same
# (sorted) row order as heatmap_data.
annotation_row <- data.frame(
  Cluster = factor(cluster_assignments$cluster_hc[row_order])
)

# pheatmap draws the annotated heatmap; it must be installed for this call
# to succeed.
library(pheatmap)

# Diverging colour ramp shared by the pheatmap and the fallback heatmap.
heatmap_palette <- colorRampPalette(c("navy", "white", "firebrick3"))(50)

# pheatmap matches annotations to matrix rows by row name.
rownames(annotation_row) <- rownames(heatmap_data)

# One colour per cluster level, named by level so that both pheatmap and the
# fallback legend can look colours up by cluster id. brewer.pal() needs
# n >= 3, so request at least three colours and keep only as many as there
# are clusters. (Set1 offers at most 9 colours.)
cluster_levels <- levels(annotation_row$Cluster)
cluster_colors <- RColorBrewer::brewer.pal(
  max(3, length(cluster_levels)),
  "Set1"
)[seq_along(cluster_levels)]
names(cluster_colors) <- cluster_levels
anno_colors <- list(Cluster = cluster_colors)

# Try the annotated pheatmap and record whether it worked. Previously the
# fallback tested variables that were never assigned, so it was drawn every
# time, even after a successful pheatmap.
pheatmap_success <- tryCatch(
  {
    pheatmap(
      heatmap_data,
      cluster_rows = FALSE,  # rows are already ordered by cluster
      cluster_cols = TRUE,
      annotation_row = annotation_row,
      annotation_colors = anno_colors,
      show_colnames = FALSE,
      show_rownames = TRUE,
      fontsize_row = 6,
      color = heatmap_palette,
      main = "Bigram Frequency Heatmap with Cluster Annotation"
    )
    TRUE
  },
  error = function(e) {
    # Report the reason instead of swallowing it, then fall back below.
    message(
      "pheatmap failed (", conditionMessage(e),
      "); drawing base heatmap() instead."
    )
    FALSE
  }
)

# Fallback: base-graphics heatmap, drawn only if pheatmap raised an error.
if (!pheatmap_success) {
  heatmap(
    heatmap_data,
    Colv = NA,
    Rowv = NA,
    scale = "none",  # data are already standardised
    # Side colour bar showing each row's cluster, so the legend below
    # actually corresponds to something in the plot.
    RowSideColors = unname(
      cluster_colors[as.character(annotation_row$Cluster)]
    ),
    col = heatmap_palette,
    main = "Bigram Frequency Heatmap",
    margins = c(5, 10),
    cexRow = 0.6
  )

  # Key for the cluster colour bar.
  legend(
    "topright",
    legend = paste("Cluster", names(cluster_colors)),
    fill = cluster_colors,
    border = NA,
    bty = "n",
    cex = 0.8
  )
}