Introduction

Retail sales are a key indicator of economic performance. This report analyzes year-over-year (YoY) changes in retail sales across different U.S. states and retail subsectors. We aim to answer the following questions:

  1. Which states experienced the highest and lowest YoY retail sales growth in 2020?
  2. How does population size impact retail sales growth?
  3. Which retail subsectors performed the best and worst in 2022?
  4. How have the top five states by population performed over the years?
  5. What was the top retail subsector of each state in 2022 when seen on the map?

This analysis uses data from TidyTuesday that has retail subsectors sales YoY change as per State from 2019 to 2022 and the U.S. Census API that has the population of each state as per census held in 2020.

Install and Load Necessary Packages

install.packages(c("data.table", "ggplot2", "colorblindr", "RColorBrewer", "readr", "tidycensus", "dplyr"))
## package 'data.table' successfully unpacked and MD5 sums checked
## package 'ggplot2' successfully unpacked and MD5 sums checked
## package 'RColorBrewer' successfully unpacked and MD5 sums checked
## package 'readr' successfully unpacked and MD5 sums checked
## package 'tidycensus' successfully unpacked and MD5 sums checked
## package 'dplyr' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\T490\AppData\Local\Temp\Rtmp8mXuMg\downloaded_packages
library(data.table)
library(ggplot2)
library(RColorBrewer)
library(colorblindr)
library(readr)
library(tidycensus)
library(dplyr)

Load Datasets

state_retail <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2022/2022-12-13/state_retail.csv', col_types = "cciciiccc")
head(state_retail)
## # A tibble: 6 × 9
##   fips  state_abbr naics subsector           year month change_yoy change_yoy_se
##   <chr> <chr>      <int> <chr>              <int> <int> <chr>      <chr>        
## 1 00    USA          441 Motor vehicle and…  2019     1 0.0        0.6          
## 2 00    USA          441 Motor vehicle and…  2019     2 0.4        0.6          
## 3 00    USA          441 Motor vehicle and…  2019     3 0.9        0.6          
## 4 00    USA          441 Motor vehicle and…  2019     4 4.3        0.6          
## 5 00    USA          441 Motor vehicle and…  2019     5 2.6        0.6          
## 6 00    USA          441 Motor vehicle and…  2019     6 0.0        0.6          
## # ℹ 1 more variable: coverage_code <chr>

Load US Census Data

census_api_key("3bdb3afe07080342dd4dd7c01079ad30170b2c8e", install = TRUE, overwrite = TRUE)
## [1] "3bdb3afe07080342dd4dd7c01079ad30170b2c8e"
readRenviron("~/.Renviron")

population_data <- get_decennial(
  geography = "state",
  variables = "P1_001N",
  year = 2020
)

head(population_data)
## # A tibble: 6 × 4
##   GEOID NAME                 variable    value
##   <chr> <chr>                <chr>       <dbl>
## 1 42    Pennsylvania         P1_001N  13002700
## 2 06    California           P1_001N  39538223
## 3 54    West Virginia        P1_001N   1793716
## 4 49    Utah                 P1_001N   3271616
## 5 36    New York             P1_001N  20201249
## 6 11    District of Columbia P1_001N    689545

Merge Datasets

state_retail_final <- left_join(
  state_retail, 
  population_data %>%
    select(GEOID, NAME, value),
  by = c("fips" = "GEOID")
)

head(state_retail_final)
## # A tibble: 6 × 11
##   fips  state_abbr naics subsector           year month change_yoy change_yoy_se
##   <chr> <chr>      <int> <chr>              <int> <int> <chr>      <chr>        
## 1 00    USA          441 Motor vehicle and…  2019     1 0.0        0.6          
## 2 00    USA          441 Motor vehicle and…  2019     2 0.4        0.6          
## 3 00    USA          441 Motor vehicle and…  2019     3 0.9        0.6          
## 4 00    USA          441 Motor vehicle and…  2019     4 4.3        0.6          
## 5 00    USA          441 Motor vehicle and…  2019     5 2.6        0.6          
## 6 00    USA          441 Motor vehicle and…  2019     6 0.0        0.6          
## # ℹ 3 more variables: coverage_code <chr>, NAME <chr>, value <dbl>

Data Aggregation

state_retail_dt <- as.data.table(state_retail_final)

sales_summary_by_state <- state_retail_dt[, .(
  avg_yoy_change = mean(as.numeric(change_yoy), na.rm = TRUE),
  population = mean(as.numeric(value), na.rm = TRUE)
), by = .(state_abbr, NAME, year)]

head(sales_summary_by_state)
##    state_abbr    NAME  year avg_yoy_change population
##        <char>  <char> <int>          <num>      <num>
## 1:        USA    <NA>  2019      0.8784722        NaN
## 2:        USA    <NA>  2020     -3.1312500        NaN
## 3:        USA    <NA>  2021     30.0437500        NaN
## 4:        USA    <NA>  2022      8.6645833        NaN
## 5:         AL Alabama  2019      1.3534722    5024279
## 6:         AL Alabama  2020      1.2242647    5024279

Visualization:

Average YoY Change in Total Retail Sales by State (2020)

state_retail_2020 <- sales_summary_by_state[year == 2020]

ggplot(state_retail_2020, aes(x = reorder(state_abbr, avg_yoy_change), y = avg_yoy_change)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  labs(title = "Average YoY Change % in Retail Sales by U.S. State (2020)",
       x = "State", y = "Average YoY Change %") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        panel.background = element_rect(fill = "beige")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

The above graph shows that 17 out of 50 US states had positive change in the their total retail sales in 2020 compared to 2019, with RI (Rhode Island) having the highest 5% YoY Chagne and CA (California) most negative i.e. -6%

Population vs YoY Change in Retail Sales

ggplot(state_retail_2020, aes(x = population, y = avg_yoy_change)) +
  geom_point(color = "blue") +
  geom_label(aes(label = state_abbr), size = 3, fill = "white", color = "black") +
  labs(title = "YoY Change in Retail Sales vs Population (2020)",
       x = "Population of the State",
       y = "Average YoY Change %") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        panel.background = element_rect(fill = "beige"))

The above graph shows that US states with greater population (more than 10M) had a negative growth in retail sales, which shows thats that in absolute terms, there was a decline retail sales. This decline could be attributed to COVID-19.

Top 5 States Based on Average Population

top_states <- sales_summary_by_state[, .(avg_population = mean(population, na.rm = TRUE)), by = state_abbr][order(-avg_population)][1:5, state_abbr]

top_states_data <- sales_summary_by_state[state_abbr %in% top_states]

ggplot(top_states_data, aes(x = year, y = avg_yoy_change, color = state_abbr)) +
  geom_line() +
  scale_color_manual(values = brewer.pal(5, "Dark2")) +
  labs(title = "YoY Change in Retail Sales by Year for Top 5 States",
       x = "Year", y = "YoY Change %", color = "State") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        panel.background = element_rect(fill = "beige"))

The Line chart above shows that in top 5 states of US (population wise) the retail sales declined from 2019 to 2020, and it greatly increased in 2021 in terms of %, primarily due to retail sales taking a major hit in 2020 due to COVID-19.

Retail Sales Trends by Subsector

sales_summary_by_subsector <- state_retail_dt[, .(
  avg_yoy_change = mean(as.numeric(change_yoy), na.rm = TRUE)
), by = .(subsector, year)]

top_subsectors <- sales_summary_by_subsector[, .(
  overall_avg_yoy = mean(avg_yoy_change, na.rm = TRUE)
), by = subsector][order(-overall_avg_yoy)][1:5, subsector]

top_subsectors_data <- sales_summary_by_subsector[subsector %in% top_subsectors]

ggplot(top_subsectors_data, aes(x = year, y = avg_yoy_change, color = subsector)) +
  geom_line() +
  scale_color_manual(values = brewer.pal(5, "Dark2")) +
  labs(title = "YoY Change in Retail Sales for Top 5 Subsectors",
       x = "Year", y = "YoY Change %", color = "Subsector") +
  theme_minimal() +
  theme(panel.grid = element_blank(),
        panel.background = element_rect(fill = "beige"))

When analyzing the trends of top subsectors, it can be seen that building materials and supplies and gasoline stations had the highest increase and decrease in terms of %. This shows that the top sectors that took the most hit due to COVID-19.

Top 10 Retail Subsectors in 2022

top_2022 <- sales_summary_by_subsector[
  year == 2022 & subsector != "total",
  .(subsector, avg_yoy_change)
][order(-avg_yoy_change)][1:10]

top_2022[, subsector := factor(subsector, levels = subsector[order(avg_yoy_change)])]

ggplot(top_2022, aes(x = avg_yoy_change, y = subsector, fill = avg_yoy_change)) +
  geom_col() +
  geom_text(aes(label = paste0(round(avg_yoy_change, 1), "%")),
            hjust = -0.1, size = 3.5, color = "black") +
  scale_fill_gradient(low = "#104E8B", high = "#63B8FF") +
  labs(title = "Top 10 Retail Subsectors in US by YoY Change (2022)",
       x = "Year-over-Year Change (%)", y = NULL) +
  theme_minimal() +
  theme(legend.position = "none", panel.grid = element_blank()) +
  coord_cartesian(xlim = c(0, max(top_2022$avg_yoy_change) * 1.1))

As per the latest data we have i.e. of 2022, the bar chart above shows the subsectors of US that had the highest YoY change in sales.

Bottom 10 Retail subsectors in 2022

bottom_2022 <- sales_summary_by_subsector[
  year == 2022 & subsector != "total",
  .(subsector, avg_yoy_change)
][
  order(avg_yoy_change)  # Sort ascending (worst performers first)
][1:10]

# Order subsectors by performance (reverse for proper bar sorting)
bottom_2022[, subsector := factor(subsector, levels = subsector[order(-avg_yoy_change)])]

# Create horizontal bar chart (with blue gradient for contrast)
ggplot(bottom_2022, aes(x = avg_yoy_change, y = subsector, fill = avg_yoy_change)) +
  geom_col() +
  geom_text(
    aes(label = paste0(round(avg_yoy_change, 1), "%")),
    hjust = -0.1,  # Place labels INSIDE bars for negative values
    size = 3.5,
    color = "black"
  ) +
  scale_fill_gradient(low = "#ff0000", high =  "#ffcccc") +  # red gradient
  labs(
    title = "Bottom 10 Retail Subsectors by YoY Change (2022)",
    x = "Year-over-Year Change (%)",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    panel.background = element_rect(fill = "beige")
  ) 

As per the latest data we have i.e. of 2022, the bar chart above shows the subsectors of US that had the lowest YoY change in sales. Apart from Electronics and Appliances, all subsectors had a postive growth

Top Retail subsector per U.S. state in US Map (by % YOY Change in 2022)

sales_summary <- state_retail_dt[, .(
  avg_yoy_change = mean(as.numeric(change_yoy), na.rm = TRUE)
), by = .(subsector, state_abbr, NAME, year)]  # Key: added subsector to grouping

# Filter for 2022 and find the top subsector per state
top_subsectors_per_state <- sales_summary[
  year == 2022,
  .SD[which.max(avg_yoy_change)],  # Subset row with highest YoY for each state
  by = state_abbr
][, .(
  state_abbr, 
  NAME, 
  subsector, 
  avg_yoy_change
)]


library(ggplot2)
library(maps)
library(ggrepel)  # For avoiding overlapping labels

# Get US state map data
us_states <- map_data("state")

# Get state centroids (for label placement)
centroids <- data.frame(
  region = tolower(state.name),
  state_abbr = state.abb,
  long = state.center$x,
  lat = state.center$y
)

# Merge centroids with your top subsectors
map_label_data <- merge(centroids, top_subsectors_per_state, by = "state_abbr", all.x = TRUE)

# Use state abbreviations and smaller font
ggplot() +
  geom_polygon(
    data = us_states,
    aes(x = long, y = lat, group = group),
    fill = "lightgray", color = "white"
  ) +
  geom_label_repel(
    data = map_label_data,
    aes(
      x = long, 
      y = lat,
      label = paste0(
        state_abbr, ": ",  # State abbreviation (e.g., "CA")
        subsector, "\n", 
        round(avg_yoy_change, 1), "%"
      )
    ),
    size = 2.2,  # Smaller font
    box.padding = 0.3,
    segment.color = "transparent",
    fill = alpha("white", 0.7)
  ) +
  theme_void() +
  labs(title = "Top Retail Subsector by YoY Change (2022)")

The graph/map above shows that in almost all states apart from Arizona (AZ) Gasoline Stations had the highest YoY increase in sales in 2022

Conclusion & Findings:

This report analyzes U.S. retail sales trends, focusing on state-wise and subsector-wise changes. Using data from the U.S. Census and state retail datasets, we visualize key insights with ggplot2. The findings reveal a sharp decline in retail sales from 2019 to 2020, primarily due to the impact of COVID-19, followed by a strong recovery from 2021 onward. Among retail subsectors, gasoline stations experienced the highest percentage growth in 2022. Additionally, our analysis suggests a negative correlation between a state’s population and its year-over-year (YoY) retail sales change, with California—the most populous state—recording the steepest decline in 2020.