Introduction

This document presents a sentiment analysis framework for examining text data across time-series and cross-sectional dimensions. Text processing and sentiment scoring rely on the tidytext package, while plotly provides interactive visualizations and gt produces professionally formatted tables.

Load Required Libraries

library(tidyverse)
library(tidytext)
library(textdata)
library(plotly)
library(gt)
library(lubridate)
library(htmlwidgets)
library(DiagrammeR)

Workflow Flowchart

The following flowchart illustrates the complete sentiment analysis workflow from raw data to insights:

flowchart <- grViz("
digraph sentiment_workflow {
  
  # Graph attributes
  graph [layout = dot, rankdir = TB, 
         label = 'Sentiment Analysis Workflow', labelloc = 't', fontsize = 20,
         fontname = Helvetica, bgcolor = '#f8f9fa', nodesep = 0.8, ranksep = 0.8]
  
  # Node attributes
  node [shape = box, style = filled, fontname = Helvetica, fontsize = 15, width = 3, height = 1, fixedsize = true]
  
  # Define nodes with colors
  start [label = 'Raw Text Data\n(Reviews, Comments, etc.)', 
         fillcolor = '#e3f2fd', color = '#1976d2', penwidth = 2]
  
  load [label = 'Load Data\n(Date, Product, Region, Text)', 
        fillcolor = '#fff3e0', color = '#f57c00']
  
  tokenize [label = 'Tokenization\n(Break text into words)', 
            fillcolor = '#f3e5f5', color = '#7b1fa2']
  
  clean [label = 'Remove Stop Words\n(the, and, is, etc.)', 
         fillcolor = '#f3e5f5', color = '#7b1fa2']
  
  lexicon [label = 'Apply Sentiment Lexicons\n(AFINN & Bing)', 
           fillcolor = '#e8f5e9', color = '#388e3c']
  
  score [label = 'Calculate Sentiment Scores\n• Total Score\n• Avg Intensity\n• Positive/Negative Count', 
         fillcolor = '#e8f5e9', color = '#388e3c']
  
  categorize [label = 'Categorize Sentiment\n(Positive, Neutral, Negative)', 
              fillcolor = '#e8f5e9', color = '#388e3c']
  
  aggregate [label = 'Aggregate Data\n• By Time (Daily/Weekly)\n• By Product\n• By Region', 
             fillcolor = '#fff9c4', color = '#f9a825']
  
  visualize [label = 'Create Visualizations\n• Time Series Plots\n• Geographic Maps\n• Bar Charts', 
             fillcolor = '#fce4ec', color = '#c2185b']
  
  tables [label = 'Generate Tables\n• Summary Statistics\n• Trend Analysis\n• Cross-sectional Data', 
          fillcolor = '#fce4ec', color = '#c2185b']
  
  insights [label = 'Insights & Reporting\n✓ Sentiment Trends\n✓ Product Comparison\n✓ Regional Patterns', 
            fillcolor = '#c8e6c9', color = '#2e7d32', penwidth = 3]
  
  # Define edges with increased spacing
  start -> load [penwidth = 2]
  load -> tokenize [penwidth = 2]
  tokenize -> clean [penwidth = 2]
  clean -> lexicon [penwidth = 2]
  lexicon -> score [penwidth = 2]
  score -> categorize [penwidth = 2]
  categorize -> aggregate [penwidth = 2]
  aggregate -> visualize [penwidth = 2]
  aggregate -> tables [penwidth = 2]
  visualize -> insights [penwidth = 2]
  tables -> insights [penwidth = 2]
  
  # Subgraph for legend
  subgraph cluster_legend {
    label = 'Process Stages'
    fontsize = 12
    style = dashed
    color = grey
    
    node [width = 2.5, height = 0.8]
    
    legend1 [label = 'Data Input', fillcolor = '#e3f2fd', color = '#1976d2']
    legend2 [label = 'Text Processing', fillcolor = '#f3e5f5', color = '#7b1fa2']
    legend3 [label = 'Sentiment Analysis', fillcolor = '#e8f5e9', color = '#388e3c']
    legend4 [label = 'Aggregation', fillcolor = '#fff9c4', color = '#f9a825']
    legend5 [label = 'Output', fillcolor = '#fce4ec', color = '#c2185b']
    
    legend1 -> legend2 -> legend3 -> legend4 -> legend5 [style = invis]
  }
}
")

flowchart
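
To make the processing stages above concrete, here is a minimal sketch that walks a single made-up sentence through the first text-processing stages (tokenization, stop-word removal, lexicon join), using the same tidytext functions applied later in this document. The sentence and object names are illustrative only.

# Minimal sketch: one made-up review pushed through the core tidytext steps
example <- tibble(review_id = 1,
                  review = "Outstanding quality but terrible battery life")

example %>%
  unnest_tokens(word, review) %>%                      # one word per row
  anti_join(stop_words, by = "word") %>%               # drop words like "but"
  inner_join(get_sentiments("afinn"), by = "word")     # keep words with an AFINN value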

Data Generation Function

# Generate example dataset
set.seed(123)
generate_sample_data <- function(n_products = 4, n_days = 90) {
  # Use the first n_products entries so the argument actually controls the product set
  products <- head(c("Laptop", "Smartphone", "Tablet", "Smartwatch"), n_products)
  
  # Sample reviews with varying sentiment
  positive_reviews <- c(
    "Absolutely love this product! Best purchase ever.",
    "Outstanding quality and great performance.",
    "Exceeded my expectations. Highly recommend!",
    "Amazing features and very easy to use.",
    "Perfect product, worth every penny!"
  )
  
  negative_reviews <- c(
    "Terrible quality, broke after one week.",
    "Very disappointed with this purchase.",
    "Poor performance and bad customer service.",
    "Not worth the money, very frustrating.",
    "Worst product I've ever bought."
  )
  
  neutral_reviews <- c(
    "It's okay, nothing special but works.",
    "Average product for the price.",
    "Does what it's supposed to do.",
    "Not bad, not great either.",
    "Acceptable quality for everyday use."
  )
  
  data <- tibble(
    date = sample(seq(today() - days(n_days), today(), by = "day"), 
                  n_products * 30, replace = TRUE),
    product = sample(products, n_products * 30, replace = TRUE),
    region = sample(c("North America", "Europe", "Asia", "South America"), 
                    n_products * 30, replace = TRUE),
    review = sample(c(positive_reviews, negative_reviews, neutral_reviews), 
                    n_products * 30, replace = TRUE)
  ) %>%
    arrange(date)
  
  return(data)
}

Sentiment Analysis Function

# Function to perform sentiment analysis
perform_sentiment_analysis <- function(data, text_col = "review") {
  # Get sentiment lexicons
  afinn <- get_sentiments("afinn")
  bing <- get_sentiments("bing")
  
  # Tokenize and analyze sentiment
  sentiment_data <- data %>%
    mutate(review_id = row_number()) %>%
    unnest_tokens(word, !!sym(text_col)) %>%
    anti_join(stop_words, by = "word") %>%
    left_join(afinn, by = "word") %>%
    left_join(bing, by = "word") %>%
    group_by(review_id, date, product, region) %>%
    summarise(
      afinn_score = sum(value, na.rm = TRUE),
      n_words = n(),
      n_sentiment_words = sum(!is.na(value)),
      positive_words = sum(sentiment == "positive", na.rm = TRUE),
      negative_words = sum(sentiment == "negative", na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(
      # Normalize by number of sentiment words to get average intensity
      avg_word_sentiment = ifelse(n_sentiment_words > 0, 
                                   afinn_score / n_sentiment_words, 
                                   0),
      net_sentiment = positive_words - negative_words,
      sentiment_category = case_when(
        afinn_score > 2 ~ "Positive",
        afinn_score < -2 ~ "Negative",
        TRUE ~ "Neutral"
      )
    )
  
  return(sentiment_data)
}
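
For reference, the two lexicons joined inside this function have different shapes, which is why both joins are needed: AFINN attaches a numeric value from -5 to +5, while Bing attaches a categorical "positive"/"negative" label. A quick inspection (the first call to get_sentiments() for each lexicon prompts textdata to download it):

# Peek at the lexicon structures (first use triggers a textdata download prompt)
get_sentiments("afinn") %>% slice_head(n = 3)   # columns: word, value
get_sentiments("bing")  %>% slice_head(n = 3)   # columns: word, sentiment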

Generate and Analyze Data

# Generate sample data
review_data <- generate_sample_data(n_products = 4, n_days = 90)

# Display first few rows
head(review_data) %>%
  gt() %>%
  tab_header(title = "Sample Raw Data") %>%
  tab_options(table.font.size = px(12))
Sample Raw Data
date product region review
2025-09-13 Laptop Europe Perfect product, worth every penny!
2025-09-14 Smartwatch North America Worst product I've ever bought.
2025-09-15 Smartphone North America Exceeded my expectations. Highly recommend!
2025-09-16 Smartwatch South America Acceptable quality for everyday use.
2025-09-16 Smartphone South America Not worth the money, very frustrating.
2025-09-16 Smartwatch Europe Does what it's supposed to do.
# Perform sentiment analysis
sentiment_results <- perform_sentiment_analysis(review_data)

# Display sample results
head(sentiment_results) %>%
  gt() %>%
  tab_header(title = "Sample Sentiment Analysis Results") %>%
  fmt_number(columns = c(afinn_score, avg_word_sentiment), decimals = 2) %>%
  tab_options(table.font.size = px(12))
Sample Sentiment Analysis Results
review_id date product region afinn_score n_words n_sentiment_words positive_words negative_words avg_word_sentiment net_sentiment sentiment_category
1 2025-09-13 Laptop Europe 5.00 4 2 2 0 2.50 2 Positive
2 2025-09-14 Smartwatch North America −3.00 3 1 0 1 −3.00 -1 Negative
3 2025-09-15 Smartphone North America 2.00 4 1 2 0 2.00 2 Neutral
4 2025-09-16 Smartwatch South America 0.00 3 0 0 0 0.00 0 Neutral
5 2025-09-16 Smartphone South America 0.00 3 2 1 1 0.00 0 Neutral
6 2025-09-16 Smartwatch Europe 0.00 1 0 0 0 0.00 0 Neutral

Time Series Analysis
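
A minimal sketch of an overall sentiment trend is shown below, assuming the sentiment_results object created above: it aggregates scores to a daily average (swap "day" for "week" for a smoother series) and plots the result with plotly.

# Aggregate overall sentiment by day and plot the trend
daily_sentiment <- sentiment_results %>%
  mutate(day = floor_date(date, "day")) %>%
  group_by(day) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    n_reviews = n(),
    .groups = "drop"
  )

plot_daily <- plot_ly(daily_sentiment, x = ~day, y = ~avg_sentiment,
                      type = 'scatter', mode = 'lines+markers') %>%
  layout(title = "Overall Sentiment Over Time",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Average Sentiment Score"))

plot_daily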

Cross-Sectional Analysis

Sentiment by Product

# Sentiment by product
product_sentiment <- sentiment_results %>%
  group_by(product) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    median_sentiment = median(afinn_score, na.rm = TRUE),
    std_sentiment = sd(afinn_score, na.rm = TRUE),
    avg_intensity = mean(avg_word_sentiment, na.rm = TRUE),
    total_reviews = n(),
    pct_positive = sum(sentiment_category == "Positive") / n() * 100,
    pct_negative = sum(sentiment_category == "Negative") / n() * 100,
    pct_neutral = sum(sentiment_category == "Neutral") / n() * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(avg_sentiment))

# Plot: Bar chart of average sentiment by product
plot_product <- plot_ly(product_sentiment, x = ~product, y = ~avg_sentiment,
                        type = 'bar',
                        marker = list(color = ~avg_sentiment,
                                      colorscale = list(c(0, 'red'), 
                                                       c(0.5, 'gray'), 
                                                       c(1, 'green')),
                                      showscale = TRUE,
                                      colorbar = list(title = "Sentiment"))) %>%
  layout(title = "Average Sentiment by Product",
         xaxis = list(title = "Product"),
         yaxis = list(title = "Average Sentiment Score"))

plot_product

Sentiment by Region (Geographic Map)

# Sentiment by region
region_sentiment <- sentiment_results %>%
  group_by(region) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    total_reviews = n(),
    pct_positive = sum(sentiment_category == "Positive") / n() * 100,
    pct_negative = sum(sentiment_category == "Negative") / n() * 100,
    .groups = "drop"
  ) %>%
  mutate(
    # Map regions to country codes for choropleth
    location = case_when(
      region == "North America" ~ "USA",
      region == "Europe" ~ "DEU",  # Germany as representative
      region == "Asia" ~ "CHN",    # China as representative
      region == "South America" ~ "BRA"  # Brazil as representative
    ),
    hover_text = paste0(
      "<b>", region, "</b><br>",
      "Avg Sentiment: ", round(avg_sentiment, 2), "<br>",
      "Total Reviews: ", total_reviews, "<br>",
      "Positive: ", round(pct_positive, 1), "%<br>",
      "Negative: ", round(pct_negative, 1), "%"
    )
  )

# Plot: Choropleth map of sentiment by region
plot_region <- plot_ly(
  region_sentiment,
  type = 'choropleth',
  locations = ~location,
  z = ~avg_sentiment,
  text = ~hover_text,
  hoverinfo = 'text',
  colorscale = list(
    c(0, '#d62728'),    # Red for negative
    c(0.5, '#ffeda0'),  # Yellow for neutral
    c(1, '#2ca02c')     # Green for positive
  ),
  zmin = -5,
  zmax = 5,
  colorbar = list(
    title = "Avg<br>Sentiment",
    tickmode = "linear",
    tick0 = -5,
    dtick = 2
  ),
  marker = list(line = list(color = 'rgb(255,255,255)', width = 2))
) %>%
  layout(
    title = list(
      text = "Global Sentiment Distribution by Region",
      font = list(size = 16)
    ),
    geo = list(
      showframe = FALSE,
      showcoastlines = TRUE,
      projection = list(type = 'natural earth')
    )
  )

plot_region

Panel Data Analysis

Time Series by Product

# Time series by product (panel data)
panel_data <- sentiment_results %>%
  mutate(week = floor_date(date, "week")) %>%
  group_by(week, product) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    n_reviews = n(),
    .groups = "drop"
  )

# Plot: Panel time series
plot_panel <- plot_ly(panel_data, x = ~week, y = ~avg_sentiment,
                      color = ~product, type = 'scatter', mode = 'lines+markers') %>%
  layout(title = "Sentiment Trends by Product Over Time",
         xaxis = list(title = "Week"),
         yaxis = list(title = "Average Sentiment Score"),
         hovermode = 'x unified')

plot_panel
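
The htmlwidgets package loaded at the top can export any of these interactive plots as standalone HTML for sharing outside of R; a minimal example (the file name is arbitrary):

# Save an interactive plot as a self-contained HTML file (file name is arbitrary)
saveWidget(plot_panel, "sentiment_trends_by_product.html", selfcontained = TRUE)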

Summary Tables

Table 1: Product Summary Statistics

table_product <- product_sentiment %>%
  gt() %>%
  tab_header(
    title = "Sentiment Analysis by Product",
    subtitle = "Summary statistics across all time periods"
  ) %>%
  fmt_number(
    columns = c(avg_sentiment, median_sentiment, std_sentiment, avg_intensity),
    decimals = 2
  ) %>%
  fmt_number(
    columns = c(pct_positive, pct_negative, pct_neutral),
    decimals = 1
  ) %>%
  cols_label(
    product = "Product",
    avg_sentiment = "Avg Score",
    median_sentiment = "Median Score",
    std_sentiment = "Std Dev",
    avg_intensity = "Avg Intensity",
    total_reviews = "Reviews",
    pct_positive = "Positive (%)",
    pct_negative = "Negative (%)",
    pct_neutral = "Neutral (%)"
  ) %>%
  data_color(
    columns = avg_sentiment,
    colors = scales::col_numeric(
      palette = c("#d62728", "#ffeda0", "#2ca02c"),
      domain = c(-5, 5)
    )
  ) %>%
  data_color(
    columns = avg_intensity,
    colors = scales::col_numeric(
      palette = c("#d62728", "#f7f7f7", "#2ca02c"),
      domain = c(-2, 2)
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  tab_options(
    table.font.size = px(12),
    heading.background.color = "#1f77b4",
    heading.title.font.size = px(16),
    column_labels.background.color = "#d9d9d9"
  )

table_product
Sentiment Analysis by Product
Summary statistics across all time periods
Product Avg Score Median Score Std Dev Avg Intensity Reviews Positive (%) Negative (%) Neutral (%)
Smartphone 1.00 0.00 2.93 0.88 25 32.0 16.0 52.0
Smartwatch 0.43 0.00 2.96 0.26 35 25.7 28.6 45.7
Laptop 0.37 0.00 3.29 0.47 35 31.4 25.7 42.9
Tablet −0.32 0.00 3.44 −0.14 25 24.0 36.0 40.0
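
If the formatted table needs to be shared outside of R, gt's gtsave() can export it; a minimal example (the file name is arbitrary, and PNG output additionally requires the webshot2 package):

# Export the formatted table (HTML works out of the box; PNG needs webshot2)
gtsave(table_product, "product_sentiment_summary.html")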

Table 2: Cross-Sectional Summary (Product × Region)

cross_sectional <- sentiment_results %>%
  group_by(product, region) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    n_region = n(),
    .groups = "drop"
  ) %>%
  # Total reviews per product, so pivoting yields one row per product
  group_by(product) %>%
  mutate(reviews = sum(n_region)) %>%
  ungroup() %>%
  select(-n_region) %>%
  pivot_wider(
    names_from = region,
    values_from = avg_sentiment
  )

table_cross <- cross_sectional %>%
  gt() %>%
  tab_header(
    title = "Sentiment Analysis: Product × Region",
    subtitle = "Average sentiment scores"
  ) %>%
  fmt_number(
    columns = -c(product, reviews),
    decimals = 2
  ) %>%
  cols_label(
    product = "Product",
    reviews = "Total Reviews"
  ) %>%
  data_color(
    columns = -c(product, reviews),
    colors = scales::col_numeric(
      palette = c("#d62728", "#fff7bc", "#2ca02c"),
      domain = c(-3, 3)
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  )

table_cross
Sentiment Analysis: Product × Region
Average sentiment scores
Product Total Reviews Asia Europe North America South America
Laptop 35 −0.30 0.80 1.55 −2.25
Smartphone 25 0.86 0.00 1.22 1.60
Smartwatch 35 0.67 0.43 0.81 −0.83
Tablet 25 −1.25 −1.00 2.14 −2.00

Summary Statistics

cat("=== SENTIMENT ANALYSIS SUMMARY ===\n")
## === SENTIMENT ANALYSIS SUMMARY ===
cat("Total reviews analyzed:", nrow(sentiment_results), "\n")
## Total reviews analyzed: 120
cat("Date range:", format(min(sentiment_results$date), "%Y-%m-%d"), 
    "to", format(max(sentiment_results$date), "%Y-%m-%d"), "\n")
## Date range: 2025-09-13 to 2025-12-09
cat("Overall average sentiment:", round(mean(sentiment_results$afinn_score), 2), "\n")
## Overall average sentiment: 0.38
cat("Overall sentiment standard deviation:", round(sd(sentiment_results$afinn_score), 2), "\n")
## Overall sentiment standard deviation: 3.15
cat("\nSentiment Distribution:\n")
## 
## Sentiment Distribution:
cat("  Positive:", sum(sentiment_results$sentiment_category == "Positive"), 
    "(", round(sum(sentiment_results$sentiment_category == "Positive")/nrow(sentiment_results)*100, 1), "%)\n")
##   Positive: 34 ( 28.3 %)
cat("  Neutral:", sum(sentiment_results$sentiment_category == "Neutral"), 
    "(", round(sum(sentiment_results$sentiment_category == "Neutral")/nrow(sentiment_results)*100, 1), "%)\n")
##   Neutral: 54 ( 45 %)
cat("  Negative:", sum(sentiment_results$sentiment_category == "Negative"), 
    "(", round(sum(sentiment_results$sentiment_category == "Negative")/nrow(sentiment_results)*100, 1), "%)\n")
##   Negative: 32 ( 26.7 %)

Conclusion

This sentiment analysis framework provides comprehensive insights into text data across multiple dimensions:

  • Time Series Analysis: Tracks sentiment evolution over time to identify trends and patterns
  • Cross-Sectional Analysis: Compares sentiment across products and geographic regions
  • Panel Data Analysis: Combines temporal and cross-sectional dimensions for deeper insights

The combination of interactive visualizations (plotly) and professionally formatted tables (gt) creates a complete analytical toolkit suitable for research, business intelligence, and market analysis applications.


Note: This analysis uses sample data. Replace the generate_sample_data() function with your actual data loading code to analyze real datasets.
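
As a sketch of what that replacement might look like, assuming a CSV file with date, product, region, and review columns (the file name and column names below are placeholders):

# Hypothetical replacement for generate_sample_data(): load real reviews from a CSV
review_data <- read_csv("reviews.csv") %>%      # placeholder file name
  mutate(date = as_date(date)) %>%              # ensure the date column is a Date
  select(date, product, region, review)

sentiment_results <- perform_sentiment_analysis(review_data)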