library(dplyr)
library(stringr)
library(tm)
library(tidyverse)
library(tidytext)
library(tidyr)
library(ggplot2)
library(igraph)
library(networkD3)
library(stopwords)
library(cowplot)
The dataset is taken from the Stanford Large Network Dataset Collection and consists of Amazon online reviews in the software category, covering the period from May 1996 to October 2018. In total, the dataset contains 459,436 reviews. From this raw data, I later extract the relevant information needed to construct and analyse the review network.
Link for the dataset: https://nijianmo.github.io/amazon/index.html
The data is originally stored in JSON format. I first load it in this form and then transform it into a more manageable structure so that it can be processed and analysed more easily. From each JSON record I keep only the reviewText field, since it contains the textual content that is central to the analysis.
reviews <- jsonlite::stream_in(
file("Software.json", open = "r"),
verbose = FALSE
)
review_text_only <- reviews %>%
transmute(text = reviewText) %>%
filter(!is.na(text), text != "")
head(review_text_only$text, 3)
## [1] "The materials arrived early and were in excellent condition. However for the money spent they really should've come with a binder and not just loose leaf."
## [2] "I am really enjoying this book with the worksheets that make you review your goals, what to do when you do not make it, it reminds me of my human sexuality classwork."
## [3] "IF YOU ARE TAKING THIS CLASS DON\"T WASTE YOUR MONEY ON THIS SO CALLED BOOK! $140.00 FOR A \"BOOK\" THAT ISIN'T EVEN BOUND LOOSE LEAFS, THAT I HAD TO PROVIDE MY OWN BINDER FOR. TURNS OUT YOU CAN BUY ACCESS TO THE BOOK AT MCGRAW HILL CONNECT CORE FOR $70.00\n\nTHIS BOOK IS A COMPLETE WASTE OF MONEY!"
corpus <- VCorpus(VectorSource(review_text_only$text))
corpus <- tm_map(
corpus,
content_transformer(function(x) iconv(x, to = "UTF-8", sub = "byte"))
)
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
corpus <- tm_map(corpus, toSpace, "@\\w+")                      # @mentions (must run before the bare "@" rule below)
corpus <- tm_map(corpus, toSpace, "@")
corpus <- tm_map(corpus, toSpace, "\\|")
corpus <- tm_map(corpus, toSpace, "[ \t]{2,}")                  # collapse runs of spaces/tabs
corpus <- tm_map(corpus, toSpace, "(s?)(f|ht)tp(s?)://\\S+\\b") # full URLs
corpus <- tm_map(corpus, toSpace, "http\\w*")                   # leftover http fragments
corpus <- tm_map(corpus, toSpace, "/")
corpus <- tm_map(corpus, toSpace, "www")
corpus <- tm_map(corpus, toSpace, "~")
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, stripWhitespace)
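A quick before/after look at the first review illustrates what the cleaning pipeline removes (a minimal check using the objects already defined above):
# Compare the first review before and after cleaning
cat("Raw:    ", review_text_only$text[1], "\n")
cat("Cleaned:", content(corpus[[1]]), "\n")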
Inspecting the first three documents of the cleaned corpus shows each review stored as a plain text document with associated metadata. While the median review contains 128 characters, the mean length is noticeably higher, indicating the presence of a small number of very long reviews, with the longest exceeding 22,000 characters. Empty documents are extremely rare, making up less than 0.1% of the corpus, whereas short reviews with fewer than 20 characters account for around 11% of the data.
inspect(corpus[1:3])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 3
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 102
##
## [[2]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 89
##
## [[3]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 168
doc_lens <- nchar(sapply(corpus, `[[`, "content"))
summary(doc_lens)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 53.0 128.0 266.8 313.0 22863.0
mean(doc_lens == 0)
## [1] 0.000885996
mean(doc_lens < 20)
## [1] 0.1112785
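The long tail described above can also be seen directly by plotting the document lengths on a log scale (a small sketch using ggplot2, which is already loaded):
# Histogram of cleaned document lengths (log10 x-axis to handle the long tail)
tibble(len = doc_lens) %>%
  filter(len > 0) %>%
  ggplot(aes(len)) +
  geom_histogram(bins = 50, alpha = 0.8) +
  scale_x_log10() +
  labs(x = "Characters per cleaned review (log scale)", y = "Count") +
  theme_light()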
data <- data.frame(
doc_id = seq_along(corpus),
Text = sapply(corpus, `[[`, "content"),
stringsAsFactors = FALSE
) %>%
filter(!is.na(Text), Text != "")
# The full corpus is too large to process comfortably, so I take a random sample of 5,000 reviews (roughly 1% of the data)
set.seed(42)
N_SAMPLE <- 5000
data_s <- data %>% slice_sample(n = min(N_SAMPLE, nrow(data)))
sw <- stopwords("en")
remove_stopword_edges <- function(df, w1, w2) {
df %>%
filter(!.data[[w1]] %in% sw,
!.data[[w2]] %in% sw,
!is.na(.data[[w1]]), !is.na(.data[[w2]]),
nchar(.data[[w1]]) > 1, nchar(.data[[w2]]) > 1)
}
make_bigram_edges <- function(df) {
df %>%
unnest_tokens(bigram, Text, token = "ngrams", n = 2) %>%
filter(!is.na(bigram)) %>%
separate(bigram, into = c("word1", "word2"), sep = " ") %>%
remove_stopword_edges(w1 = "word1", w2 = "word2") %>%
count(word1, word2, sort = TRUE, name = "weight")
}
bigram_edges <- make_bigram_edges(data_s)
In this step, the corpus is converted into a clean text data frame and a smaller random sample is taken to reduce the computational load. The text is then tokenised into bigrams, stopwords and trivial tokens are removed, and co-occurring word pairs are counted to form the weighted edges of the text network.
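To make the edge construction concrete, here is a toy illustration (two made-up one-line reviews, not part of the real data) of what make_bigram_edges() returns:
# Toy illustration: two short fake 'reviews' produce weighted bigram edges
toy <- data.frame(Text = c("easy use great product", "great product easy use"),
                  stringsAsFactors = FALSE)
make_bigram_edges(toy)
# Expected result: "easy use" and "great product" each with weight 2,
# "use great" and "product easy" with weight 1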
print(head(bigram_edges, 15))
## word1 word2 weight
## 1 easy use 268
## 2 turbo tax 181
## 3 customer service 117
## 4 great product 112
## 5 hr block 110
## 6 tech support 108
## 7 internet security 100
## 8 windows xp 97
## 9 works great 92
## 10 operating system 87
## 11 highly recommend 86
## 12 new version 85
## 13 user friendly 84
## 14 hard drive 83
## 15 many years 83
print(quantile(bigram_edges$weight, c(.90, .95, .99, .995, .999)))
## 90% 95% 99% 99.5% 99.9%
## 2 3 6 10 24
This output shows the most frequent bigrams extracted from the sampled reviews, along with how often each word pair appears. The most common pairs reflect typical software-related themes such as usability, customer service, system functionality, and technical support, which suggests that the bigram network captures meaningful patterns in how users describe their experiences.
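A quick bar chart of the top pairs makes the same point visually (a small sketch based on the bigram_edges table):
# Plot the 15 most frequent bigram pairs
bigram_edges %>%
  slice_max(weight, n = 15) %>%
  mutate(pair = reorder(paste(word1, word2), weight)) %>%
  ggplot(aes(weight, pair)) +
  geom_col(alpha = 0.8) +
  labs(x = "Frequency in sampled reviews", y = NULL) +
  theme_light()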
Next, I construct the word co-occurrence network and focus the analysis on its main connected structure.
make_graph <- function(edges, threshold, directed = FALSE) { # note: this masks igraph::make_graph()
edges %>%
filter(weight >= threshold) %>%
graph_from_data_frame(directed = directed)
}
largest_cc <- function(g) {
comps <- components(g)
lcc_id <- which.max(comps$csize)
induced_subgraph(g, which(comps$membership == lcc_id))
}
bigram_threshold <- as.numeric(quantile(bigram_edges$weight, 0.999)) # I use a high quantile because the edge list is still very noisy even at the sampled-down size
g_bigram <- make_graph(bigram_edges, threshold = bigram_threshold, directed = FALSE)
g_bigram_cc <- largest_cc(g_bigram)
cat("Sample N:", nrow(data_s), "\n")
## Sample N: 5000
cat("Threshold:", bigram_threshold, "\n")
## Threshold: 24
cat("Nodes (LCC):", vcount(g_bigram_cc), "\n")
## Nodes (LCC): 91
cat("Edges (LCC):", ecount(g_bigram_cc), "\n")
## Edges (LCC): 103
cat("Density:", edge_density(g_bigram_cc), "\n")
## Density: 0.02515263
In this step, I construct an undirected bigram co-occurrence network by retaining only word pairs whose frequencies exceed a high quantile threshold, which helps filter out weak and noisy connections. I then focus on the largest connected component, resulting in a network of 91 nodes and 103 edges with low density, indicating a sparse structure where only the most strongly related terms remain connected.
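Because the threshold strongly shapes the network, a quick sensitivity check over a few quantiles can be useful (a hedged sketch, not part of the main pipeline):
# How the network size reacts to different quantile thresholds
sapply(c(0.99, 0.995, 0.999), function(q) {
  thr <- as.numeric(quantile(bigram_edges$weight, q))
  g   <- make_graph(bigram_edges, threshold = thr)
  c(quantile = q, threshold = thr, nodes = vcount(g), edges = ecount(g))
})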
Next, I build a simple static plot of the network.
set.seed(42)
E(g_bigram_cc)$w_scaled <- pmax(1, log1p(E(g_bigram_cc)$weight))
plot(
g_bigram_cc,
vertex.size = 4,
vertex.label.cex = 0.6,
edge.width = E(g_bigram_cc)$w_scaled,
main = paste0("Word Bigram Co-occurrence Network (N=", nrow(data_s),
", threshold=", bigram_threshold, ")")
)
In this step, I prepare the nodes and edges of the bigram network so they can be displayed interactively.
nodes <- data.frame(name = V(g_bigram_cc)$name, stringsAsFactors = FALSE)
# networkD3 is built on D3 (JavaScript), so node indices must be zero-based
edges <- as_data_frame(g_bigram_cc, what = "edges") %>%
  transmute(
    source = match(from, nodes$name) - 1,
    target = match(to, nodes$name) - 1,
    value = as.numeric(weight)
  )
# scale link value so thick edges don't dominate
edges$value <- log1p(edges$value)
# node size by weighted strength (sum of edge weights)
strg <- strength(g_bigram_cc, weights = E(g_bigram_cc)$weight)
nodes$nodesize <- as.numeric(strg)
nodes$nodesize <- log1p(nodes$nodesize)
nodes$nodesize <- (nodes$nodesize - min(nodes$nodesize)) /
(max(nodes$nodesize) - min(nodes$nodesize) + 1e-9) + 0.2
nodes$group <- 1
forceNetwork(
Links = edges,
Nodes = nodes,
Source = "source",
Target = "target",
NodeID = "name",
Group = "group",
Value = "value",
Nodesize = "nodesize",
opacity = 0.9,
fontSize = 12,
linkDistance = 60,
charge = -30,
zoom = TRUE,
opacityNoHover = 1
)
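If the report needs to be shared outside an HTML rendering, the same widget can be stored in an object and written to a standalone file. A minimal sketch, assuming the htmlwidgets package is installed; the object name and file name are arbitrary:
# Save the interactive network as a standalone HTML file
net <- forceNetwork(Links = edges, Nodes = nodes, Source = "source",
                    Target = "target", NodeID = "name", Group = "group",
                    Value = "value", Nodesize = "nodesize", zoom = TRUE)
htmlwidgets::saveWidget(net, "bigram_network.html", selfcontained = TRUE)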
In this step, I compute standard network centrality measures to assess how important each word is within the bigram network.
node_impo_df <- tibble(
word = V(g_bigram_cc)$name,
degree = as.numeric(degree(g_bigram_cc)),
closeness = as.numeric(closeness(g_bigram_cc, normalized = TRUE)),
betweenness = as.numeric(betweenness(g_bigram_cc, directed = FALSE, normalized = TRUE))
)
# Distributions (skewness of importance)
plt_deg <- node_impo_df %>%
ggplot(aes(degree)) + theme_light() +
geom_histogram(alpha = 0.8, bins = 30)
plt_clo <- node_impo_df %>%
ggplot(aes(closeness)) + theme_light() +
geom_histogram(alpha = 0.8, bins = 30)
plt_bet <- node_impo_df %>%
ggplot(aes(betweenness)) + theme_light() +
geom_histogram(alpha = 0.8, bins = 30)
cowplot::plot_grid(plt_deg, plt_clo, plt_bet, ncol = 1, align = "v")
The distributions show that most words in the network have low degree and low betweenness, meaning they are connected to only a few other terms and do not act as major bridges in the network. In contrast, a small number of words exhibit higher degree, closeness, or betweenness, indicating that the network structure is driven by a limited set of central terms that play a key role in connecting different parts of the bigram network.
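Listing the top words under each measure makes this "limited set of central terms" claim concrete (a small sketch based on node_impo_df):
# Top 10 words by each centrality measure
node_impo_df %>%
  pivot_longer(-word, names_to = "measure", values_to = "score") %>%
  group_by(measure) %>%
  slice_max(score, n = 10) %>%
  summarise(top_words = str_c(word, collapse = ", "), .groups = "drop")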
Next, I detect communities in the bigram network and colour the interactive graph by these groups to highlight clusters of words that tend to appear together.
# Ensure community membership exists
if (is.null(vertex_attr(g_bigram_cc, "membership"))) {
comm_det <- cluster_louvain(g_bigram_cc, weights = E(g_bigram_cc)$weight)
V(g_bigram_cc)$membership <- membership(comm_det)
}
stopifnot(length(V(g_bigram_cc)$name) == length(V(g_bigram_cc)$membership))
nodes <- data.frame(
name = V(g_bigram_cc)$name,
group = as.integer(V(g_bigram_cc)$membership),
stringsAsFactors = FALSE
)
edges <- as_data_frame(g_bigram_cc, what = "edges") %>%
transmute(
source = match(from, nodes$name) - 1,
target = match(to, nodes$name) - 1,
value = log1p(as.numeric(weight))
)
strg <- strength(g_bigram_cc, weights = E(g_bigram_cc)$weight)
nodes$nodesize <- log1p(as.numeric(strg))
nodes$nodesize <- (nodes$nodesize - min(nodes$nodesize)) /
(max(nodes$nodesize) - min(nodes$nodesize) + 1e-9) + 0.2
forceNetwork(
Links = edges,
Nodes = nodes,
Source = "source",
Target = "target",
NodeID = "name",
Group = "group",
Value = "value",
Nodesize = "nodesize",
fontSize = 12,
linkDistance = 60,
charge = -30,
opacity = 0.9,
opacityNoHover = 1,
zoom = TRUE
)
Lastly, I summarise each detected community by listing its most strongly connected words to help interpret the themes within each cluster.
membership_df <- tibble(
  word = V(g_bigram_cc)$name,
  cluster = as.integer(V(g_bigram_cc)$membership),
  strength = as.numeric(strength(g_bigram_cc, weights = E(g_bigram_cc)$weight)) # weighted degree
)
membership_df %>%
  group_by(cluster) %>%
  arrange(desc(strength), .by_group = TRUE) %>%
  slice_head(n = 15) %>%
  summarise(top_words = str_c(word, collapse = ", "), .groups = "drop") %>%
  arrange(cluster)
## # A tibble: 11 × 2
## cluster top_words
## <int> <chr>
## 1 1 use, easy, much, program, easier, better, install, pretty, simple
## 2 2 tax, turbo, software, using, used, quicken, turbotax, antivirus, pie…
## 3 3 product, great, works, well, work, worked, fine, good, price, key, d…
## 4 4 internet, security, norton
## 5 5 windows, xp, vista, running
## 6 6 recommend, highly, anyone
## 7 7 version, new, previous, computer, mac, older, features, latest, os, …
## 8 8 years, many, several, ago, now, times, people, past, two
## 9 9 can, get, back, money, go, waste, worth, able, say, find, trying, wa…
## 10 10 year, time, every, old, last, next, first, long, lot, consuming, day
## 11 11 like, just, really, looks, things
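To gauge how well separated these clusters are, the weighted modularity of the Louvain partition can be computed directly from the stored membership (a short check; as a rough rule of thumb, values above about 0.3 suggest clear community structure):
# Modularity of the detected partition on the weighted bigram network
modularity(g_bigram_cc, V(g_bigram_cc)$membership,
           weights = E(g_bigram_cc)$weight)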