library(dplyr)
library(stringr)
library(tm)          # corpus handling and text cleaning
library(tidyverse)
library(tidytext)    # tidy tokenisation (unnest_tokens)
library(tidyr)
library(ggplot2)
library(igraph)      # graph construction and network metrics
library(networkD3)   # interactive force-directed network plots
library(stopwords)   # stopword lists
library(cowplot)     # arranging ggplots in a grid

Introduction

The dataset is taken from the Stanford Large Network Dataset Collection and consists of Amazon online reviews in the software category, covering the period from May 1996 to October 2018. In total, the dataset contains 459,436 reviews. From this raw data, I later extract the relevant information needed to construct and analyse the review network.

Link for the dataset: https://nijianmo.github.io/amazon/index.html

The data is originally stored in JSON format. I first load it in this form and then transform it into a more manageable structure so that it can be processed and analysed more easily. From the JSON records I keep only the reviewText field, since it contains the textual content that is central to the analysis.

Dataset Description

reviews <- jsonlite::stream_in(
  file("Software.json", open = "r"),
  verbose = FALSE
)

review_text_only <- reviews %>%
  transmute(text = reviewText) %>%
  filter(!is.na(text), text != "")

head(review_text_only$text, 3)
## [1] "The materials arrived early and were in excellent condition.  However for the money spent they really should've come with a binder and not just loose leaf."                                                                                                                                                
## [2] "I am really enjoying this book with the worksheets that make you review your goals, what to do when you do not make it, it reminds me  of my human sexuality classwork."                                                                                                                                    
## [3] "IF YOU ARE TAKING THIS CLASS DON\"T WASTE YOUR MONEY ON THIS SO CALLED BOOK! $140.00 FOR A \"BOOK\" THAT ISIN'T EVEN BOUND LOOSE LEAFS, THAT I HAD TO PROVIDE MY OWN BINDER FOR. TURNS OUT YOU CAN BUY ACCESS TO THE BOOK AT MCGRAW HILL CONNECT CORE FOR $70.00\n\nTHIS BOOK IS A COMPLETE WASTE OF MONEY!"

Data Preprocessing

corpus <- VCorpus(VectorSource(review_text_only$text))
corpus <- tm_map(
  corpus,
  content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte"))
)


toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))


corpus <- tm_map(corpus, toSpace, "@")
corpus <- tm_map(corpus, toSpace, "@\\w+")
corpus <- tm_map(corpus, toSpace, "\\|")
corpus <- tm_map(corpus, toSpace, "[ \t]{2,}")
corpus <- tm_map(corpus, toSpace, "(s?)(f|ht)tp(s?)://\\S+\\b")  # URLs
corpus <- tm_map(corpus, toSpace, "http\\w*")
corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "www")
corpus <- tm_map(corpus, toSpace, "~")


corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)

After cleaning, the corpus is a collection of plain-text documents with associated metadata; the inspection below shows the first three. While the median review contains 128 characters, the mean length is noticeably higher, indicating the presence of a small number of very long reviews, with the longest exceeding 22,000 characters. Empty documents are extremely rare, making up less than 0.1% of the corpus, whereas short reviews with fewer than 20 characters account for around 11% of the data.

Corpus Inspection

inspect(corpus[1:3])
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 3
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 102
## 
## [[2]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 89
## 
## [[3]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 168
doc_lens <- nchar(sapply(corpus, `[[`, "content"))
summary(doc_lens)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    53.0   128.0   266.8   313.0 22863.0
mean(doc_lens == 0)
## [1] 0.000885996
mean(doc_lens < 20)
## [1] 0.1112785
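To make the skew described above easier to see, the following is a minimal sketch of the review-length distribution on a logarithmic scale (it reuses the doc_lens vector computed above; the log10 axis and bin count are my choices).

tibble(chars = doc_lens) %>%
  filter(chars > 0) %>%                  # drop empty documents for the log scale
  ggplot(aes(chars)) +
  geom_histogram(bins = 50, alpha = 0.8) +
  scale_x_log10() +
  labs(x = "Characters per review (log10 scale)", y = "Number of reviews") +
  theme_light()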

Corpus Preparation and Bigram Edge Construction

data <- data.frame(
  doc_id = seq_along(corpus),
  Text = sapply(corpus, `[[`, "content"),
  stringsAsFactors = FALSE
) %>%
  filter(!is.na(Text), Text != "")

# The full corpus is too large to process comfortably, so I take a random sample of 5,000 reviews (roughly 1% of the data)
set.seed(42)
N_SAMPLE <- 5000
data_s <- data %>% slice_sample(n = min(N_SAMPLE, nrow(data)))

sw <- stopwords("en")

remove_stopword_edges <- function(df, w1, w2) {
  df %>%
    filter(!.data[[w1]] %in% sw,
           !.data[[w2]] %in% sw,
           !is.na(.data[[w1]]), !is.na(.data[[w2]]),
           nchar(.data[[w1]]) > 1, nchar(.data[[w2]]) > 1)
}

make_bigram_edges <- function(df) {
  df %>%
    unnest_tokens(bigram, Text, token = "ngrams", n = 2) %>%
    filter(!is.na(bigram)) %>%
    separate(bigram, into = c("word1", "word2"), sep = " ") %>%
    remove_stopword_edges(w1 = "word1", w2 = "word2") %>%
    count(word1, word2, sort = TRUE, name = "weight")
}

bigram_edges <- make_bigram_edges(data_s)

In this step, I convert the corpus into a clean text data frame and draw a smaller random sample to reduce the computational load. The text is then tokenised into bigrams, stopwords and trivial tokens are removed, and co-occurring word pairs are counted to form weighted edges for the text network.

Inspecting bigram frequency distribution

print(head(bigram_edges, 15))
##        word1     word2 weight
## 1       easy       use    268
## 2      turbo       tax    181
## 3   customer   service    117
## 4      great   product    112
## 5         hr     block    110
## 6       tech   support    108
## 7   internet  security    100
## 8    windows        xp     97
## 9      works     great     92
## 10 operating    system     87
## 11    highly recommend     86
## 12       new   version     85
## 13      user  friendly     84
## 14      hard     drive     83
## 15      many     years     83
print(quantile(bigram_edges$weight, c(.90, .95, .99, .995, .999)))
##   90%   95%   99% 99.5% 99.9% 
##     2     3     6    10    24

This output shows the most frequent bigrams extracted from the sampled reviews, along with how often each word pair appears. The most common pairs reflect typical software-related themes, such as usability, customer service, system functionality, and technical support, which suggests that the bigram network captures meaningful patterns in how users describe their experiences.
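To visualise this head of the distribution, a minimal sketch that plots the top pairs as a bar chart (it reuses bigram_edges from the chunk above; the cut-off of 15 is arbitrary).

bigram_edges %>%
  slice_max(weight, n = 15, with_ties = FALSE) %>%
  unite(bigram, word1, word2, sep = " ") %>%       # rebuild the bigram label
  ggplot(aes(x = reorder(bigram, weight), y = weight)) +
  geom_col(alpha = 0.8) +
  coord_flip() +
  labs(x = NULL, y = "Bigram frequency") +
  theme_light()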

Next, I construct the word co-occurrence network and focus the analysis on its main connected structure.

Constructing the bigram network

make_graph <- function(edges, threshold, directed = FALSE) {
  edges %>%
    filter(weight >= threshold) %>%
    graph_from_data_frame(directed = directed)
}

largest_cc <- function(g) {
  comps <- components(g)
  lcc_id <- which.max(comps$csize)
  induced_subgraph(g, which(comps$membership == lcc_id))
}

bigram_threshold <- as.numeric(quantile(bigram_edges$weight, 0.999)) # a high quantile, because the edge list is still very noisy even at the reduced sample size

g_bigram <- make_graph(bigram_edges, threshold = bigram_threshold, directed = FALSE)
g_bigram_cc <- largest_cc(g_bigram)

cat("Sample N:", nrow(data_s), "\n")
## Sample N: 5000
cat("Threshold:", bigram_threshold, "\n")
## Threshold: 24
cat("Nodes (LCC):", vcount(g_bigram_cc), "\n")
## Nodes (LCC): 91
cat("Edges (LCC):", ecount(g_bigram_cc), "\n")
## Edges (LCC): 103
cat("Density:", edge_density(g_bigram_cc), "\n")
## Density: 0.02515263

In this step, I construct an undirected bigram co-occurrence network by retaining only word pairs whose frequencies exceed a high quantile threshold, which helps filter out weak and noisy connections. I then restrict the analysis to the largest connected component, resulting in a network of 91 nodes and 103 edges with low density, indicating a sparse structure in which only the most strongly related terms remain connected.
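Because the 0.999 quantile is a fairly aggressive cut, the following is a minimal sketch of how the network size responds to other thresholds (it reuses the make_graph and largest_cc helpers defined above; the candidate quantiles are illustrative).

# Compare node/edge counts of the largest component across candidate quantiles
sapply(c(0.99, 0.995, 0.999), function(q) {
  thr <- as.numeric(quantile(bigram_edges$weight, q))
  g   <- largest_cc(make_graph(bigram_edges, threshold = thr))
  c(quantile = q, threshold = thr, nodes = vcount(g), edges = ecount(g))
})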

Next, I build a simple static plot of the network.

Simple Plot of Word Bigram Co-occurrence Network

set.seed(42)
E(g_bigram_cc)$w_scaled <- pmax(1, log1p(E(g_bigram_cc)$weight))

plot(
  g_bigram_cc,
  vertex.size = 4,
  vertex.label.cex = 0.6,
  edge.width = E(g_bigram_cc)$w_scaled,
  main = paste0("Word Bigram Co-occurrence Network (N=", nrow(data_s),
                ", threshold=", bigram_threshold, ")")
)

In this step, I prepare the nodes and edges of the bigram network so they can be displayed interactively.

Interactive Bigram Network Visualisation

nodes <- data.frame(name = V(g_bigram_cc)$name, stringsAsFactors = FALSE)

edges <- as_data_frame(g_bigram_cc, what = "edges") %>%
  transmute(
    source = match(from, nodes$name) - 1,
    target = match(to,   nodes$name) - 1,
    value  = as.numeric(weight)
  )

# scale link value so thick edges don't dominate
edges$value <- log1p(edges$value)

# node size by weighted strength (sum of edge weights)
strg <- strength(g_bigram_cc, weights = E(g_bigram_cc)$weight)
nodes$nodesize <- as.numeric(strg)
nodes$nodesize <- log1p(nodes$nodesize)

nodes$nodesize <- (nodes$nodesize - min(nodes$nodesize)) /
  (max(nodes$nodesize) - min(nodes$nodesize) + 1e-9) + 0.2

nodes$group <- 1

forceNetwork(
  Links = edges,
  Nodes = nodes,
  Source = "source",
  Target = "target",
  NodeID = "name",
  Group = "group",      
  Value  = "value",
  Nodesize = "nodesize",
  opacity = 0.9,
  fontSize = 12,
  linkDistance = 60,
  charge = -30,
  zoom = TRUE,
  opacityNoHover = 1
)
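The widget above renders inline in the notebook; if it needs to be shared as a standalone file, a minimal sketch using htmlwidgets::saveWidget (the htmlwidgets package is assumed to be installed, and the output file name is arbitrary).

# Store the widget in an object, then write it out as self-contained HTML
net <- forceNetwork(
  Links = edges, Nodes = nodes,
  Source = "source", Target = "target",
  NodeID = "name", Group = "group",
  Value = "value", Nodesize = "nodesize",
  opacity = 0.9, fontSize = 12, zoom = TRUE
)
htmlwidgets::saveWidget(net, "bigram_network.html", selfcontained = TRUE)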

In this step, I compute standard network centrality measures to assess how important each word is within the bigram network.

Node Importance in the Bigram Network

node_impo_df <- tibble(
  word = V(g_bigram_cc)$name,
  degree = as.numeric(degree(g_bigram_cc)),
  closeness = as.numeric(closeness(g_bigram_cc, normalized = TRUE)),
  betweenness = as.numeric(betweenness(g_bigram_cc, directed = FALSE, normalized = TRUE))
)

# Distributions (skewness of importance)
plt_deg <- node_impo_df %>%
  ggplot(aes(degree)) + theme_light() +
  geom_histogram(alpha = 0.8, bins = 30)

plt_clo <- node_impo_df %>%
  ggplot(aes(closeness)) + theme_light() +
  geom_histogram(alpha = 0.8, bins = 30)

plt_bet <- node_impo_df %>%
  ggplot(aes(betweenness)) + theme_light() +
  geom_histogram(alpha = 0.8, bins = 30)

cowplot::plot_grid(plt_deg, plt_clo, plt_bet, ncol = 1, align = "v")

The distributions show that most words in the network have low degree and low betweenness, meaning they are connected to only a few other terms and do not act as major bridges in the network. In contrast, a small number of words exhibit higher degree, closeness, or betweenness, indicating that the network structure is driven by a limited set of central terms that play a key role in connecting different parts of the bigram network.
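To identify which words make up those long right tails, a minimal sketch listing the highest-scoring terms for each measure (it reuses node_impo_df from the chunk above; the cut-off of 10 is arbitrary).

node_impo_df %>%
  pivot_longer(c(degree, closeness, betweenness),
               names_to = "measure", values_to = "score") %>%
  group_by(measure) %>%
  slice_max(score, n = 10, with_ties = FALSE) %>%   # top 10 words per measure
  summarise(top_words = str_c(word, collapse = ", "), .groups = "drop")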

Next, I detect communities in the bigram network and colour the interactive graph by these groups to highlight clusters of words that tend to appear together.

Community Detection and Interactive Network by Group

# Ensure community membership exists 
if (is.null(vertex_attr(g_bigram_cc, "membership"))) {
  comm_det <- cluster_louvain(g_bigram_cc, weights = E(g_bigram_cc)$weight)
  V(g_bigram_cc)$membership <- membership(comm_det)
}


stopifnot(length(V(g_bigram_cc)$name) == length(V(g_bigram_cc)$membership))

nodes <- data.frame(
  name = V(g_bigram_cc)$name,
  group = as.integer(V(g_bigram_cc)$membership),
  stringsAsFactors = FALSE
)

edges <- as_data_frame(g_bigram_cc, what = "edges") %>%
  transmute(
    source = match(from, nodes$name) - 1,
    target = match(to,   nodes$name) - 1,
    value  = log1p(as.numeric(weight))
  )


strg <- strength(g_bigram_cc, weights = E(g_bigram_cc)$weight)
nodes$nodesize <- log1p(as.numeric(strg))
nodes$nodesize <- (nodes$nodesize - min(nodes$nodesize)) /
  (max(nodes$nodesize) - min(nodes$nodesize) + 1e-9) + 0.2

forceNetwork(
  Links = edges,
  Nodes = nodes,
  Source = "source",
  Target = "target",
  NodeID = "name",
  Group = "group",
  Value = "value",
  Nodesize = "nodesize",
  fontSize = 12,
  linkDistance = 60,
  charge = -30,
  opacity = 0.9,
  opacityNoHover = 1,
  zoom = TRUE
)
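Before summarising the clusters, a quick check of how well-separated they are: a minimal sketch computing the weighted modularity of the stored partition and the size of each community (it assumes the membership vertex attribute set in the chunk above).

modularity(
  g_bigram_cc,
  membership = V(g_bigram_cc)$membership,
  weights = E(g_bigram_cc)$weight
)
table(V(g_bigram_cc)$membership)   # number of words per community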

Lastly, I summarise each detected community by listing its most strongly connected words to help interpret the themes within each cluster.

Top Words within Communities

membership_df <- tibble(
  word    = V(g_bigram_cc)$name,
  cluster = as.integer(V(g_bigram_cc)$membership),
  degree  = as.numeric(strength(g_bigram_cc, weights = E(g_bigram_cc)$weight))
)

membership_df %>%
  group_by(cluster) %>%
  arrange(desc(degree), .by_group = TRUE) %>%
  slice_head(n = 15) %>%
  summarise(top_words = str_c(word, collapse = ", "), .groups = "drop") %>%
  arrange(cluster)
## # A tibble: 11 × 2
##    cluster top_words                                                            
##      <int> <chr>                                                                
##  1       1 use, easy, much, program, easier, better, install, pretty, simple    
##  2       2 tax, turbo, software, using, used, quicken, turbotax, antivirus, pie…
##  3       3 product, great, works, well, work, worked, fine, good, price, key, d…
##  4       4 internet, security, norton                                           
##  5       5 windows, xp, vista, running                                          
##  6       6 recommend, highly, anyone                                            
##  7       7 version, new, previous, computer, mac, older, features, latest, os, …
##  8       8 years, many, several, ago, now, times, people, past, two             
##  9       9 can, get, back, money, go, waste, worth, able, say, find, trying, wa…
## 10      10 year, time, every, old, last, next, first, long, lot, consuming, day 
## 11      11 like, just, really, looks, things