Introduction

This document presents a sentiment analysis framework for examining text data across time-series and cross-sectional dimensions. Text processing and sentiment scoring rely on the tidytext package, while plotly provides interactive visualizations and gt produces professionally formatted tables.

Load Required Libraries

library(tidyverse)
library(tidytext)
library(textdata)
library(plotly)
library(gt)
library(lubridate)
library(htmlwidgets)
library(DiagrammeR)

Workflow Flowchart

The following flowchart illustrates the complete sentiment analysis workflow from raw data to insights:

flowchart <- grViz("
digraph sentiment_workflow {
  
  # Graph attributes
  graph [layout = dot, rankdir = TB, 
         label = 'Sentiment Analysis Workflow', labelloc = 't', fontsize = 20,
         fontname = Helvetica, bgcolor = '#f8f9fa', nodesep = 0.8, ranksep = 0.8]
  
  # Node attributes
  node [shape = box, style = filled, fontname = Helvetica, fontsize = 15, width = 3, height = 1, fixedsize = true]
  
  # Define nodes with colors
  start [label = 'Raw Text Data\n(Reviews, Comments, etc.)', 
         fillcolor = '#e3f2fd', color = '#1976d2', penwidth = 2]
  
  load [label = 'Load Data\n(Date, Product, Region, Text)', 
        fillcolor = '#fff3e0', color = '#f57c00']
  
  tokenize [label = 'Tokenization\n(Break text into words)', 
            fillcolor = '#f3e5f5', color = '#7b1fa2']
  
  clean [label = 'Remove Stop Words\n(the, and, is, etc.)', 
         fillcolor = '#f3e5f5', color = '#7b1fa2']
  
  lexicon [label = 'Apply Sentiment Lexicons\n(AFINN & Bing)', 
           fillcolor = '#e8f5e9', color = '#388e3c']
  
  score [label = 'Calculate Sentiment Scores\n• Total Score\n• Avg Intensity\n• Positive/Negative Count', 
         fillcolor = '#e8f5e9', color = '#388e3c']
  
  categorize [label = 'Categorize Sentiment\n(Positive, Neutral, Negative)', 
              fillcolor = '#e8f5e9', color = '#388e3c']
  
  aggregate [label = 'Aggregate Data\n• By Time (Daily/Weekly)\n• By Product\n• By Region', 
             fillcolor = '#fff9c4', color = '#f9a825']
  
  visualize [label = 'Create Visualizations\n• Time Series Plots\n• Geographic Maps\n• Bar Charts', 
             fillcolor = '#fce4ec', color = '#c2185b']
  
  tables [label = 'Generate Tables\n• Summary Statistics\n• Trend Analysis\n• Cross-sectional Data', 
          fillcolor = '#fce4ec', color = '#c2185b']
  
  insights [label = 'Insights & Reporting\n✓ Sentiment Trends\n✓ Product Comparison\n✓ Regional Patterns', 
            fillcolor = '#c8e6c9', color = '#2e7d32', penwidth = 3]
  
  # Define edges with increased spacing
  start -> load [penwidth = 2]
  load -> tokenize [penwidth = 2]
  tokenize -> clean [penwidth = 2]
  clean -> lexicon [penwidth = 2]
  lexicon -> score [penwidth = 2]
  score -> categorize [penwidth = 2]
  categorize -> aggregate [penwidth = 2]
  aggregate -> visualize [penwidth = 2]
  aggregate -> tables [penwidth = 2]
  visualize -> insights [penwidth = 2]
  tables -> insights [penwidth = 2]
  
  # Subgraph for legend
  subgraph cluster_legend {
    label = 'Process Stages'
    fontsize = 12
    style = dashed
    color = grey
    
    node [width = 2.5, height = 0.8]
    
    legend1 [label = 'Data Input', fillcolor = '#e3f2fd', color = '#1976d2']
    legend2 [label = 'Text Processing', fillcolor = '#f3e5f5', color = '#7b1fa2']
    legend3 [label = 'Sentiment Analysis', fillcolor = '#e8f5e9', color = '#388e3c']
    legend4 [label = 'Aggregation', fillcolor = '#fff9c4', color = '#f9a825']
    legend5 [label = 'Output', fillcolor = '#fce4ec', color = '#c2185b']
    
    legend1 -> legend2 -> legend3 -> legend4 -> legend5 [style = invis]
  }
}
")

flowchart
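
To make the processing stages above concrete, here is a minimal sketch that walks a single made-up sentence through the first text-processing stages (tokenization, stop-word removal, lexicon join), using the same tidytext functions applied later in this document. The sentence and object names are illustrative only.

# Minimal sketch: one made-up review pushed through the core tidytext steps
example <- tibble(review_id = 1,
                  review = "Outstanding quality but terrible battery life")

example %>%
  unnest_tokens(word, review) %>%                      # one word per row
  anti_join(stop_words, by = "word") %>%               # drop words like "but"
  inner_join(get_sentiments("afinn"), by = "word")     # keep words with an AFINN value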

Data Generation Function

# Generate example dataset
set.seed(123)
generate_sample_data <- function(n_products = 4, n_days = 90) {
  # Use the first n_products entries so the argument actually controls the product set
  products <- head(c("Laptop", "Smartphone", "Tablet", "Smartwatch"), n_products)
  
  # Sample reviews with varying sentiment
  positive_reviews <- c(
    "Absolutely love this product! Best purchase ever.",
    "Outstanding quality and great performance.",
    "Exceeded my expectations. Highly recommend!",
    "Amazing features and very easy to use.",
    "Perfect product, worth every penny!"
  )
  
  negative_reviews <- c(
    "Terrible quality, broke after one week.",
    "Very disappointed with this purchase.",
    "Poor performance and bad customer service.",
    "Not worth the money, very frustrating.",
    "Worst product I've ever bought."
  )
  
  neutral_reviews <- c(
    "It's okay, nothing special but works.",
    "Average product for the price.",
    "Does what it's supposed to do.",
    "Not bad, not great either.",
    "Acceptable quality for everyday use."
  )
  
  data <- tibble(
    date = sample(seq(today() - days(n_days), today(), by = "day"), 
                  n_products * 30, replace = TRUE),
    product = sample(products, n_products * 30, replace = TRUE),
    region = sample(c("North America", "Europe", "Asia", "South America"), 
                    n_products * 30, replace = TRUE),
    review = sample(c(positive_reviews, negative_reviews, neutral_reviews), 
                    n_products * 30, replace = TRUE)
  ) %>%
    arrange(date)
  
  return(data)
}

Sentiment Analysis Function

# Function to perform sentiment analysis
perform_sentiment_analysis <- function(data, text_col = "review") {
  # Get sentiment lexicons
  afinn <- get_sentiments("afinn")
  bing <- get_sentiments("bing")
  
  # Tokenize and analyze sentiment
  sentiment_data <- data %>%
    mutate(review_id = row_number()) %>%
    unnest_tokens(word, !!sym(text_col)) %>%
    anti_join(stop_words, by = "word") %>%
    left_join(afinn, by = "word") %>%
    left_join(bing, by = "word") %>%
    group_by(review_id, date, product, region) %>%
    summarise(
      afinn_score = sum(value, na.rm = TRUE),
      n_words = n(),
      n_sentiment_words = sum(!is.na(value)),
      positive_words = sum(sentiment == "positive", na.rm = TRUE),
      negative_words = sum(sentiment == "negative", na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(
      # Normalize by number of sentiment words to get average intensity
      avg_word_sentiment = ifelse(n_sentiment_words > 0, 
                                   afinn_score / n_sentiment_words, 
                                   0),
      net_sentiment = positive_words - negative_words,
      sentiment_category = case_when(
        afinn_score > 2 ~ "Positive",
        afinn_score < -2 ~ "Negative",
        TRUE ~ "Neutral"
      )
    )
  
  return(sentiment_data)
}
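
For reference, the two lexicons joined inside this function have different shapes, which is why both joins are needed: AFINN attaches a numeric value from -5 to +5, while Bing attaches a categorical "positive"/"negative" label. A quick inspection (the first call to get_sentiments() for each lexicon prompts textdata to download it):

# Peek at the lexicon structures (first use triggers a textdata download prompt)
get_sentiments("afinn") %>% slice_head(n = 3)   # columns: word, value
get_sentiments("bing")  %>% slice_head(n = 3)   # columns: word, sentiment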

Generate and Analyze Data

# Generate sample data
review_data <- generate_sample_data(n_products = 4, n_days = 90)

# Display first few rows
head(review_data) %>%
  gt() %>%
  tab_header(title = "Sample Raw Data") %>%
  tab_options(table.font.size = px(12))
Sample Raw Data
date product region review
2025-09-13 Laptop Europe Perfect product, worth every penny!
2025-09-14 Smartwatch North America Worst product I've ever bought.
2025-09-15 Smartphone North America Exceeded my expectations. Highly recommend!
2025-09-16 Smartwatch South America Acceptable quality for everyday use.
2025-09-16 Smartphone South America Not worth the money, very frustrating.
2025-09-16 Smartwatch Europe Does what it's supposed to do.
# Perform sentiment analysis
sentiment_results <- perform_sentiment_analysis(review_data)

# Display sample results
head(sentiment_results) %>%
  gt() %>%
  tab_header(title = "Sample Sentiment Analysis Results") %>%
  fmt_number(columns = c(afinn_score, avg_word_sentiment), decimals = 2) %>%
  tab_options(table.font.size = px(12))
Sample Sentiment Analysis Results
review_id date product region afinn_score n_words n_sentiment_words positive_words negative_words avg_word_sentiment net_sentiment sentiment_category
1 2025-09-13 Laptop Europe 5.00 4 2 2 0 2.50 2 Positive
2 2025-09-14 Smartwatch North America −3.00 3 1 0 1 −3.00 -1 Negative
3 2025-09-15 Smartphone North America 2.00 4 1 2 0 2.00 2 Neutral
4 2025-09-16 Smartwatch South America 0.00 3 0 0 0 0.00 0 Neutral
5 2025-09-16 Smartphone South America 0.00 3 2 1 1 0.00 0 Neutral
6 2025-09-16 Smartwatch Europe 0.00 1 0 0 0 0.00 0 Neutral

Time Series Analysis
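
A minimal sketch of an overall sentiment trend is shown below, assuming the sentiment_results object created above: it aggregates scores to a daily average (swap "day" for "week" for a smoother series) and plots the result with plotly.

# Aggregate overall sentiment by day and plot the trend
daily_sentiment <- sentiment_results %>%
  mutate(day = floor_date(date, "day")) %>%
  group_by(day) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    n_reviews = n(),
    .groups = "drop"
  )

plot_daily <- plot_ly(daily_sentiment, x = ~day, y = ~avg_sentiment,
                      type = 'scatter', mode = 'lines+markers') %>%
  layout(title = "Overall Sentiment Over Time",
         xaxis = list(title = "Date"),
         yaxis = list(title = "Average Sentiment Score"))

plot_daily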

Cross-Sectional Analysis

Sentiment by Product

# Sentiment by product
product_sentiment <- sentiment_results %>%
  group_by(product) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    median_sentiment = median(afinn_score, na.rm = TRUE),
    std_sentiment = sd(afinn_score, na.rm = TRUE),
    avg_intensity = mean(avg_word_sentiment, na.rm = TRUE),
    total_reviews = n(),
    pct_positive = sum(sentiment_category == "Positive") / n() * 100,
    pct_negative = sum(sentiment_category == "Negative") / n() * 100,
    pct_neutral = sum(sentiment_category == "Neutral") / n() * 100,
    .groups = "drop"
  ) %>%
  arrange(desc(avg_sentiment))

# Plot: Bar chart of average sentiment by product
plot_product <- plot_ly(product_sentiment, x = ~product, y = ~avg_sentiment,
                        type = 'bar',
                        marker = list(color = ~avg_sentiment,
                                      colorscale = list(c(0, 'red'), 
                                                       c(0.5, 'gray'), 
                                                       c(1, 'green')),
                                      showscale = TRUE,
                                      colorbar = list(title = "Sentiment"))) %>%
  layout(title = "Average Sentiment by Product",
         xaxis = list(title = "Product"),
         yaxis = list(title = "Average Sentiment Score"))

plot_product

Sentiment by Region (Geographic Map)

# Sentiment by region
region_sentiment <- sentiment_results %>%
  group_by(region) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    total_reviews = n(),
    pct_positive = sum(sentiment_category == "Positive") / n() * 100,
    pct_negative = sum(sentiment_category == "Negative") / n() * 100,
    .groups = "drop"
  ) %>%
  mutate(
    # Map regions to country codes for choropleth
    location = case_when(
      region == "North America" ~ "USA",
      region == "Europe" ~ "DEU",  # Germany as representative
      region == "Asia" ~ "CHN",    # China as representative
      region == "South America" ~ "BRA"  # Brazil as representative
    ),
    hover_text = paste0(
      "<b>", region, "</b><br>",
      "Avg Sentiment: ", round(avg_sentiment, 2), "<br>",
      "Total Reviews: ", total_reviews, "<br>",
      "Positive: ", round(pct_positive, 1), "%<br>",
      "Negative: ", round(pct_negative, 1), "%"
    )
  )

# Plot: Choropleth map of sentiment by region
plot_region <- plot_ly(
  region_sentiment,
  type = 'choropleth',
  locations = ~location,
  z = ~avg_sentiment,
  text = ~hover_text,
  hoverinfo = 'text',
  colorscale = list(
    c(0, '#d62728'),    # Red for negative
    c(0.5, '#ffeda0'),  # Yellow for neutral
    c(1, '#2ca02c')     # Green for positive
  ),
  zmin = -5,
  zmax = 5,
  colorbar = list(
    title = "Avg<br>Sentiment",
    tickmode = "linear",
    tick0 = -5,
    dtick = 2
  ),
  marker = list(line = list(color = 'rgb(255,255,255)', width = 2))
) %>%
  layout(
    title = list(
      text = "Global Sentiment Distribution by Region",
      font = list(size = 16)
    ),
    geo = list(
      showframe = FALSE,
      showcoastlines = TRUE,
      projection = list(type = 'natural earth')
    )
  )

plot_region

Panel Data Analysis

Time Series by Product

# Time series by product (panel data)
panel_data <- sentiment_results %>%
  mutate(week = floor_date(date, "week")) %>%
  group_by(week, product) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    n_reviews = n(),
    .groups = "drop"
  )

# Plot: Panel time series
plot_panel <- plot_ly(panel_data, x = ~week, y = ~avg_sentiment,
                      color = ~product, type = 'scatter', mode = 'lines+markers') %>%
  layout(title = "Sentiment Trends by Product Over Time",
         xaxis = list(title = "Week"),
         yaxis = list(title = "Average Sentiment Score"),
         hovermode = 'x unified')

plot_panel
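
The htmlwidgets package loaded at the top can export any of these interactive plots as standalone HTML for sharing outside of R; a minimal example (the file name is arbitrary):

# Save an interactive plot as a self-contained HTML file (file name is arbitrary)
saveWidget(plot_panel, "sentiment_trends_by_product.html", selfcontained = TRUE)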

Summary Tables

Table 1: Product Summary Statistics

table_product <- product_sentiment %>%
  gt() %>%
  tab_header(
    title = "Sentiment Analysis by Product",
    subtitle = "Summary statistics across all time periods"
  ) %>%
  fmt_number(
    columns = c(avg_sentiment, median_sentiment, std_sentiment, avg_intensity),
    decimals = 2
  ) %>%
  fmt_number(
    columns = c(pct_positive, pct_negative, pct_neutral),
    decimals = 1
  ) %>%
  cols_label(
    product = "Product",
    avg_sentiment = "Avg Score",
    median_sentiment = "Median Score",
    std_sentiment = "Std Dev",
    avg_intensity = "Avg Intensity",
    total_reviews = "Reviews",
    pct_positive = "Positive (%)",
    pct_negative = "Negative (%)",
    pct_neutral = "Neutral (%)"
  ) %>%
  data_color(
    columns = avg_sentiment,
    colors = scales::col_numeric(
      palette = c("#d62728", "#ffeda0", "#2ca02c"),
      domain = c(-5, 5)
    )
  ) %>%
  data_color(
    columns = avg_intensity,
    colors = scales::col_numeric(
      palette = c("#d62728", "#f7f7f7", "#2ca02c"),
      domain = c(-2, 2)
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  ) %>%
  tab_options(
    table.font.size = px(12),
    heading.background.color = "#1f77b4",
    heading.title.font.size = px(16),
    column_labels.background.color = "#d9d9d9"
  )

table_product
Sentiment Analysis by Product
Summary statistics across all time periods
Product Avg Score Median Score Std Dev Avg Intensity Reviews Positive (%) Negative (%) Neutral (%)
Smartphone 1.00 0.00 2.93 0.88 25 32.0 16.0 52.0
Smartwatch 0.43 0.00 2.96 0.26 35 25.7 28.6 45.7
Laptop 0.37 0.00 3.29 0.47 35 31.4 25.7 42.9
Tablet −0.32 0.00 3.44 −0.14 25 24.0 36.0 40.0
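
If the formatted table needs to be shared outside of R, gt's gtsave() can export it; a minimal example (the file name is arbitrary, and PNG output additionally requires the webshot2 package):

# Export the formatted table (HTML works out of the box; PNG needs webshot2)
gtsave(table_product, "product_sentiment_summary.html")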

Table 2: Cross-Sectional Summary (Product × Region)

cross_sectional <- sentiment_results %>%
  group_by(product, region) %>%
  summarise(
    avg_sentiment = mean(afinn_score, na.rm = TRUE),
    n_region = n(),
    .groups = "drop"
  ) %>%
  # Total reviews per product, so pivoting yields one row per product
  group_by(product) %>%
  mutate(reviews = sum(n_region)) %>%
  ungroup() %>%
  select(-n_region) %>%
  pivot_wider(
    names_from = region,
    values_from = avg_sentiment
  )

table_cross <- cross_sectional %>%
  gt() %>%
  tab_header(
    title = "Sentiment Analysis: Product × Region",
    subtitle = "Average sentiment scores"
  ) %>%
  fmt_number(
    columns = -c(product, reviews),
    decimals = 2
  ) %>%
  cols_label(
    product = "Product",
    reviews = "Total Reviews"
  ) %>%
  data_color(
    columns = -c(product, reviews),
    colors = scales::col_numeric(
      palette = c("#d62728", "#fff7bc", "#2ca02c"),
      domain = c(-3, 3)
    )
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_column_labels()
  )

table_cross
Sentiment Analysis: Product × Region
Average sentiment scores
Product Total Reviews Asia Europe North America South America
Laptop 35 −0.30 0.80 1.55 −2.25
Smartphone 25 0.86 0.00 1.22 1.60
Smartwatch 35 0.67 0.43 0.81 −0.83
Tablet 25 −1.25 −1.00 2.14 −2.00

Summary Statistics

cat("=== SENTIMENT ANALYSIS SUMMARY ===\n")
## === SENTIMENT ANALYSIS SUMMARY ===
cat("Total reviews analyzed:", nrow(sentiment_results), "\n")
## Total reviews analyzed: 120
cat("Date range:", format(min(sentiment_results$date), "%Y-%m-%d"), 
    "to", format(max(sentiment_results$date), "%Y-%m-%d"), "\n")
## Date range: 2025-09-13 to 2025-12-09
cat("Overall average sentiment:", round(mean(sentiment_results$afinn_score), 2), "\n")
## Overall average sentiment: 0.38
cat("Overall sentiment standard deviation:", round(sd(sentiment_results$afinn_score), 2), "\n")
## Overall sentiment standard deviation: 3.15
cat("\nSentiment Distribution:\n")
## 
## Sentiment Distribution:
cat("  Positive:", sum(sentiment_results$sentiment_category == "Positive"), 
    "(", round(sum(sentiment_results$sentiment_category == "Positive")/nrow(sentiment_results)*100, 1), "%)\n")
##   Positive: 34 ( 28.3 %)
cat("  Neutral:", sum(sentiment_results$sentiment_category == "Neutral"), 
    "(", round(sum(sentiment_results$sentiment_category == "Neutral")/nrow(sentiment_results)*100, 1), "%)\n")
##   Neutral: 54 ( 45 %)
cat("  Negative:", sum(sentiment_results$sentiment_category == "Negative"), 
    "(", round(sum(sentiment_results$sentiment_category == "Negative")/nrow(sentiment_results)*100, 1), "%)\n")
##   Negative: 32 ( 26.7 %)

Conclusion

This sentiment analysis framework provides comprehensive insights into text data across multiple dimensions:

  • Time Series Analysis: Tracks sentiment evolution over time to identify trends and patterns
  • Cross-Sectional Analysis: Compares sentiment across products and geographic regions
  • Panel Data Analysis: Combines temporal and cross-sectional dimensions for deeper insights

The combination of interactive visualizations (plotly) and professionally formatted tables (gt) creates a complete analytical toolkit suitable for research, business intelligence, and market analysis applications.


Note: This analysis uses sample data. Replace the generate_sample_data() function with your actual data loading code to analyze real datasets.
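
As a sketch of what that replacement might look like, assuming a CSV file with date, product, region, and review columns (the file name and column names below are placeholders):

# Hypothetical replacement for generate_sample_data(): load real reviews from a CSV
review_data <- read_csv("reviews.csv") %>%      # placeholder file name
  mutate(date = as_date(date)) %>%              # ensure the date column is a Date
  select(date, product, region, review)

sentiment_results <- perform_sentiment_analysis(review_data)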