1 Setup

This chunk loads the packages required for tokenization, text cleaning, word clouds, and plotting.

library(dplyr)
library(tidyr)
library(tidytext)
library(stringr)
library(wordcloud)
library(SnowballC)
library(ggplot2)
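
If any of these packages are missing, a one-time install from CRAN is enough (a sketch; RColorBrewer, used later for the palettes, should come along as a dependency of wordcloud):

install.packages(c(
  "dplyr", "tidyr", "tidytext", "stringr",
  "wordcloud", "SnowballC", "ggplot2"
))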

2 Build Example Dataset

This chunk creates a small corpus of 10 documents that we can use for end-to-end text analysis.

dat <- tibble(
  doc_id = 1:10,
  text = c(
    "The river and Signal drift in the narrow cloud, with quiet stone and frame. We align the mirror to the Window, and the packet is in motion; the grain is soft, but the ladder was sharp. Orbit and pulse trace the carbon scale, not the minor light. From the field to the Anchor, paper and vector shift, and the shadow is still.",
    "In the Forest the copper index was quiet, and the Cloud moved as the signal slipped to the corner. He and she were not sure if the pulse or the scale would align, but the packet and the motion were in the frame. To the river we return, with grain and soft rhythm, and the ladder in the narrow light.",
    "We were in the garden and the delta was calm, but the Matrix began to rotate. The vector and the filter moved with the stream, and the edge of the shell was faint. My memory is brief; your texture is still. In a quiet orbit, the signal and the paper shift as the cloud drifts.",
    "The Anchor was by the field, and the paper route was narrow. This mirror and that window were in the frame; the stone was quiet, but the shadow was sharp. You and I were not in agreement: the pulse and the carbon trace were heavy, and the scale drifted with the cloud.",
    "A packet of motion in the circuit: copper buffer, plate and curve, and a clean rhythm. The signal was in the frame, and the river was still. We were with the forest, but the orbit and the pulse were not calm; the narrow index traced the edge, and the stone was faint.",
    "From the garden to the field, the vector and the shadow align. The cloud is in the window, and the mirror was on the route. They were at the basin with a quiet tone, and the grain was soft. Not the copper, but the plastic filter drifted; the scale and the motion were in the frame.",
    "The Matrix and the delta were in motion, and the stream was narrow. We trace the orbit with a quiet signal, and the pulse is in the frame. My paper and your texture drift with the cloud; the stone and the shell are still. In the forest, the ladder was by the edge.",
    "She was in the window as the river drifted, and the signal was faint. The anchor and the route were in the frame; the shadow was quiet, but the corner was sharp. We were not sure if the scale or the pulse would shift, yet the motion and the grain were still.",
    "The circuit was clean and the buffer was in the plate; the curve and the slope were narrow. In the forest, the mirror and the window align, and the packet is in motion. The cloud and the stone are quiet; the river is still. To the edge and back, the vector and the filter trace the stream.",
    "We were by the death basin and the garden was dreadful and on fire and everyone died, it was catastrophic but the orbit began to rotate. The signal and the packet moved in the frame, and the shadow drifted with the cloud. This paper and that texture were in the stream; not the copper but the plastic tone was faint, and the stone was quiet."
  )
)

head(dat, 3)

3 Quick Data Inspection

This chunk provides a compact summary so we can confirm document count and text field structure.

summary(dat)
##      doc_id          text          
##  Min.   : 1.00   Length:10         
##  1st Qu.: 3.25   Class :character  
##  Median : 5.50   Mode  :character  
##  Mean   : 5.50                     
##  3rd Qu.: 7.75                     
##  Max.   :10.00
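
summary() only describes the column types, so as an extra sanity check (a small sketch using stringr, which is already loaded) we can count whitespace-separated words per document:

dat %>%
  mutate(n_words = str_count(text, "\\S+")) %>%  # count runs of non-whitespace characters
  select(doc_id, n_words)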

4 Create Raw Word Cloud

This chunk tokenizes all text and visualizes raw word frequency without removing stop words.

raw_tokens <- dat %>%
  mutate(text_lower = str_to_lower(text)) %>%  # explicit lowercasing; unnest_tokens() would also lowercase by default
  unnest_tokens(word, text_lower)              # one row per token

raw_word_freq <- raw_tokens %>%
  count(word, sort = TRUE)

wordcloud(
  words = raw_word_freq$word,
  freq = raw_word_freq$n,
  max.words = 100,
  random.order = FALSE,
  colors = RColorBrewer::brewer.pal(8, "Dark2")
)
Raw word cloud from all tokens
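
Word placement in wordcloud() is randomized, so the layout changes between runs even with random.order = FALSE. Setting a seed immediately before the call (a minimal sketch; the seed value is arbitrary) makes the figure reproducible:

set.seed(42)  # any fixed value works; run this line before the wordcloud() call above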

5 Prepare Working Text for Analysis

This chunk creates a working data frame with lowercase text and tokenized terms for downstream steps.

dat_w <- dat %>%
  mutate(text2 = str_to_lower(text))

tokens <- dat_w %>%
  unnest_tokens(word, text2)

head(tokens)

6 Frequency Counts

This chunk computes overall token frequencies in two equivalent ways, with count() and with group_by() plus summarize(), and prints the 15 most frequent words before stop-word filtering.

word_freq_count <- tokens %>%
  count(word, sort = TRUE)

# Equivalent result via group_by() + summarize(), shown for comparison with count()
word_freq_table <- tokens %>%
  group_by(word) %>%
  summarize(word_total = n(), .groups = "drop") %>%
  arrange(desc(word_total))

head(word_freq_count, 15)
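
For a quick visual companion to the table (a sketch using the ggplot2 package loaded in Setup), the same counts can be plotted as a horizontal bar chart of the 15 most frequent tokens:

word_freq_count %>%
  slice_max(n, n = 15) %>%                       # top 15 tokens by count
  ggplot(aes(x = n, y = reorder(word, n))) +     # order bars by frequency
  geom_col() +
  labs(x = "Count", y = NULL, title = "Top 15 tokens before stop-word removal") +
  theme_minimal(base_size = 13)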

7 Remove Stop Words

This chunk removes common stop words and shows the top remaining content words.

data(stop_words)  # stop-word lexicon bundled with tidytext

word_freq_count_stop <- tokens %>%
  anti_join(stop_words, by = "word") %>%
  count(word, sort = TRUE)

head(word_freq_count_stop, 15)
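
To quantify how aggressive the filter is (a quick sketch; the exact numbers depend on the stop-word lexicon bundled with tidytext), compare the token count before and after the anti_join():

tibble(
  stage = c("all tokens", "after stop-word removal"),
  n_tokens = c(
    nrow(tokens),
    tokens %>% anti_join(stop_words, by = "word") %>% nrow()
  )
)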

8 Stem Terms and Plot Clean Word Cloud

This chunk applies stemming after stop-word removal and then visualizes cleaned stem frequencies.

tokens_stem <- tokens %>%
  anti_join(stop_words, by = "word") %>%
  mutate(stem = wordStem(word))

stem_freq <- tokens_stem %>%
  count(stem, sort = TRUE)

wordcloud(
  words = stem_freq$stem,
  freq = stem_freq$n,
  max.words = 100,
  random.order = FALSE,
  colors = RColorBrewer::brewer.pal(8, "Set2")
)
Cleaned word cloud after stop-word removal and stemming
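
To see exactly what stemming merged (a small sketch over the tokens_stem frame built above), list the stems that absorb more than one distinct surface form:

tokens_stem %>%
  distinct(stem, word) %>%              # unique stem/word pairs
  add_count(stem, name = "n_forms") %>% # how many distinct words map to each stem
  filter(n_forms > 1) %>%
  arrange(stem, word)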

9 Keyword Proportion by Document

This chunk computes, for each document, the share of content tokens (stop words already removed) that match a selected keyword, then ranks documents by that share.

keyword <- "river"

dat_w <- dat_w %>%
  left_join(
    tokens_stem %>%
      group_by(doc_id) %>%
      # Share of content tokens equal to the keyword; this compares the unstemmed
      # word column, so to match on stems use stem == wordStem(keyword) instead.
      summarise(proportion = mean(word == keyword), .groups = "drop"),
    by = "doc_id"
  )

dat_w %>%
  arrange(desc(proportion)) %>%
  select(doc_id, proportion)
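
The same idea extends to several keywords at once. Here is a sketch (the keyword vector is an arbitrary example) that computes each keyword's share of a document's content tokens and spreads the result to one column per keyword:

keywords <- c("river", "cloud", "signal")  # arbitrary example terms

keyword_props <- tokens_stem %>%
  filter(word %in% keywords) %>%
  count(doc_id, word) %>%
  left_join(count(tokens_stem, doc_id, name = "n_content"), by = "doc_id") %>%
  mutate(proportion = n / n_content) %>%
  select(doc_id, word, proportion) %>%
  pivot_wider(names_from = word, values_from = proportion, values_fill = 0)

keyword_props  # documents containing none of the keywords are absent from this table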

10 Sentiment Scoring

This chunk uses the Bing sentiment lexicon to calculate positive, negative, and net sentiment per document.

doc_sentiment <- tokens %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(doc_id, sentiment) %>%
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  mutate(net_sentiment = positive - negative)

dat_w <- dat_w %>%
  left_join(doc_sentiment, by = "doc_id")

dat_w %>%
  arrange(desc(proportion)) %>%
  select(doc_id, proportion, net_sentiment)
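
To see which lexicon words drive these scores (a sketch built from the same join as above), count the matched words per document and sentiment:

tokens %>%
  anti_join(stop_words, by = "word") %>%
  inner_join(get_sentiments("bing"), by = "word") %>%
  count(doc_id, sentiment, word, sort = TRUE) %>%
  head(15)

Note that any document with no Bing-matched words would carry NA in positive, negative, and net_sentiment after the left_join above; tidyr's replace_na() can zero those out before plotting if that occurs.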

11 Plot Net Sentiment

This chunk visualizes net sentiment for each document, with color showing positive versus non-positive values.

ggplot(dat_w, aes(x = factor(doc_id), y = net_sentiment, fill = net_sentiment > 0)) +
  geom_col() +
  geom_hline(yintercept = 0, color = "black") +
  scale_fill_manual(values = c("red", "blue"), guide = "none") +  # FALSE (non-positive) -> red, TRUE (positive) -> blue
  labs(
    x = "Document",
    y = "Net sentiment (positive - negative)",
    title = "Net sentiment by document"
  ) +
  theme_minimal(base_size = 13)
Net sentiment by document
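
To keep the figure as a standalone file outside the rendered document (a sketch; the file name and dimensions are arbitrary), ggsave() writes the most recent plot to disk:

ggsave("net_sentiment_by_document.png", width = 7, height = 4, dpi = 300)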