Retail Sales Data Analysis

This R notebook explores the Retail Sales Data with Seasonal Trends & Marketing (kaggle.com).

https://www.kaggle.com/datasets/abdullah0a/retail-sales-data-with-seasonal-trends-and-marketing/data

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
# Change figure size of plots
options(repr.plot.width = 16, repr.plot.height = 8)

Read the Data

Load all 30K+ records into a tidyverse data frame.

retail.data <- read.csv("retail-sales-data-with-seasonal-trends-and-marketing.csv")
head(retail.data, 20)
##      Store.ID Product.ID       Date Units.Sold Sales.Revenue..USD.
## 1  Spearsland   52372247 2022-01-01          9             2741.69
## 2  Spearsland   52372247 2022-01-02          7             2665.53
## 3  Spearsland   52372247 2022-01-03          1              380.79
## 4  Spearsland   52372247 2022-01-04          4             1523.16
## 5  Spearsland   52372247 2022-01-05          2              761.58
## 6  Spearsland   52372247 2022-01-06          8             3046.32
## 7  Spearsland   52372247 2022-01-07          6             2284.74
## 8  Spearsland   52372247 2022-01-08          9             3427.11
## 9  Spearsland   52372247 2022-01-09          7             2665.53
## 10 Spearsland   52372247 2022-01-10          1              380.79
## 11 Spearsland   52372247 2022-01-11          4             1523.16
## 12 Spearsland   52372247 2022-01-12          6             2284.74
## 13 Spearsland   52372247 2022-01-13          3              971.01
## 14 Spearsland   52372247 2022-01-14          6             2284.74
## 15 Spearsland   52372247 2022-01-15          3              971.01
## 16 Spearsland   52372247 2022-01-16          2              761.58
## 17 Spearsland   52372247 2022-01-17          8             3046.32
## 18 Spearsland   52372247 2022-01-18          6             2284.74
## 19 Spearsland   52372247 2022-01-19         13             4950.27
## 20 Spearsland   52372247 2022-01-20          5             1903.95
##    Discount.Percentage Marketing.Spend..USD.            Store.Location
## 1                   20                    81                  Tanzania
## 2                    0                     0                Mauritania
## 3                    0                     0 Saint Pierre and Miquelon
## 4                    0                     0                 Australia
## 5                    0                     0                 Swaziland
## 6                    0                    41                    Bhutan
## 7                    0                     0                  Suriname
## 8                    0                    83                    Taiwan
## 9                    0                     0          Papua New Guinea
## 10                   0                   164                    Canada
## 11                   0                    61                   Vietnam
## 12                   0                     0   Cocos (Keeling) Islands
## 13                  15                   197                   Lebanon
## 14                   0                     0                Luxembourg
## 15                  15                   163             French Guiana
## 16                   0                   160                  Paraguay
## 17                   0                     0                     Nauru
## 18                   0                     0                     Italy
## 19                   0                   188              Saint Helena
## 20                   0                     0                  Portugal
##    Product.Category Day.of.the.Week Holiday.Effect
## 1         Furniture        Saturday          False
## 2         Furniture          Sunday          False
## 3         Furniture          Monday          False
## 4         Furniture         Tuesday          False
## 5         Furniture       Wednesday          False
## 6         Furniture        Thursday          False
## 7         Furniture          Friday          False
## 8         Furniture        Saturday          False
## 9         Furniture          Sunday          False
## 10        Furniture          Monday          False
## 11        Furniture         Tuesday          False
## 12        Furniture       Wednesday          False
## 13        Furniture        Thursday          False
## 14        Furniture          Friday          False
## 15        Furniture        Saturday          False
## 16        Furniture          Sunday          False
## 17        Furniture          Monday          False
## 18        Furniture         Tuesday          False
## 19        Furniture       Wednesday          False
## 20        Furniture        Thursday          False

Count the number of distinct countries.

# Count number of distinct countries

length(unique(retail.data$Store.Location))
## [1] 243

Sales Revenue

Examine top selling countries.

# Chart of Top 20 Stores by Revenue

chart.data <- retail.data %>%
    group_by(Store.Location) %>%
    summarise(Sales = sum(Sales.Revenue..USD.)) %>%
    arrange(desc(Sales)) %>%
    head(20)

ggplot(chart.data, aes(x = Sales, y = reorder(Store.Location, Sales))) + 
    geom_col(
        fill = "cornflowerblue"
    )  +
    labs(
        x = "Revenue (USD)", y = "", 
        title = "Top 20 Countries with Highest Revenue"
    ) +
    scale_x_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    theme_bw()

Examine Sales Revenue by Product Category

# Sales by Product Category

chart.data <- retail.data %>%
    group_by(Product.Category) %>%
    summarise(Sales = sum(Sales.Revenue..USD.))

ggplot(chart.data, aes(x = "", y = Sales, fill = Product.Category)) +
    geom_col(color = "black") +
    geom_text(
        aes(label = round(Sales / 1000000, 1)),
        position = position_stack(vjust = 0.5)
    ) +
    scale_y_continuous(labels = NULL) +
    scale_fill_manual(
        values = c("pink", "royalblue", "orange", "lightgreen")
    ) +
    labs(
        x = "", 
        y = "Sales (MM)", 
        title = "Sales Revenue by Product Category"
    ) +
    coord_polar(theta = "y") +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5)
    )

ggplot(chart.data, aes(x = Product.Category, y = Sales)) +
    geom_col(
        fill = c("pink", "royalblue", "orange", "lightgreen")
    ) +
    labs(
        title = "Sales Revenue by Product Category",
        ylab = "Sales (USD)", xlab = "Product Category"
    ) +
    scale_y_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5)
    )
## Ignoring unknown labels:
## • ylab : "Sales (USD)"
## • xlab : "Product Category"

boxplot(
    retail.data$Sales.Revenue..USD. ~ retail.data$Product.Category,
    col = c("pink", "royalblue", "orange", "lightgreen"),
    main = "Distribution of Sales Revenue by Product Category",
    ylab = "Sales (USD)", 
    xlab = "Product Category"
)


Marketing Spend

Examine Marketing Spend across various attributes.

hist(
    retail.data$Marketing.Spend..USD., 
    col = "royalblue",
    main = "Histogram of Marketing Spend (USD)",
    xlab = "USD"
)
grid(nx = NULL, ny = NULL)

# Scatter plot of Marketing by Sales per Location

chart.data <- 
    subset(retail.data, Marketing.Spend..USD. >= 0) %>%
    group_by(Store.Location) %>%
    summarise(
        Sales = mean(Sales.Revenue..USD.),
        Marketing = mean(Marketing.Spend..USD.)
    )

plot(
    chart.data$Marketing ~ chart.data$Sales,
    main = "Average (Marketing ~ Sales) by Country",
    ylab = "Marking Spend (USD)", xlab = "Sales Revenue (USD)",
    col = "royalblue"
)
grid(nx = NULL, ny = NULL)

# Chart the distribution of Marketing Spend by Product Category

chart.data <- subset(retail.data, Marketing.Spend..USD. > 0)

boxplot(
    chart.data$Marketing.Spend..USD. ~ chart.data$Product.Category,
    col = c("pink", "royalblue", "orange", "lightgreen"),
    main = "Distribution of Marketing Spend by Product Category",
    ylab = "Marketing Spend (USD)", xlab = "Product Category"
)

chart.data <- subset(retail.data, Marketing.Spend..USD. > 0) %>%
    group_by(
        Product.Category
    ) %>%
    summarise(
        Marketing.Max = max(Marketing.Spend..USD.),
        Marketing.Min = min(Marketing.Spend..USD.),
        Marketing.Avg = mean(Marketing.Spend..USD.)
    )

chart.data
## # A tibble: 4 × 4
##   Product.Category Marketing.Max Marketing.Min Marketing.Avg
##   <chr>                    <int>         <int>         <dbl>
## 1 Clothing                   199             1          99.4
## 2 Electronics                199             1          99.2
## 3 Furniture                  199             1         101. 
## 4 Groceries                  199             1         100.
chart.data <- retail.data %>%
    group_by(Product.Category) %>%
    summarise(Marketing = sum(Marketing.Spend..USD.))

pie(
    chart.data$Marketing, labels = chart.data$Product.Category, 
    main = "Marketing Spend by Product Category",
    col = c("pink", "royalblue", "orange", "lightgreen")
)

# Chart of Top 20 Countries by Marketing Spend

chart.data <- retail.data %>%
    group_by(Store.Location) %>%
    summarise(Marketing = sum(Marketing.Spend..USD.)) %>%
    arrange(desc(Marketing)) %>%
    head(20)

ggplot(chart.data, aes(x = Marketing, y = reorder(Store.Location, Marketing))) + 
    geom_col(
        fill = "purple"
    )  +
    labs(
        x = "Spend (USD)", 
        y = "", 
        title = "Top 20 Countries with Highest Marketing Spend"
    ) +
    scale_x_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    theme_bw()

Create a chart visualizing the top 20 highest marketing spend countries by product category.

# Chart of Top 20 Countries with Highest Marketing Spend by Product Category

# Get Top 20 countries with highest marketing spend.
countries <- retail.data %>%
    group_by(Store.Location) %>%
    summarise(Total.Marketing = sum(Marketing.Spend..USD.)) %>%
    arrange(desc(Total.Marketing)) %>%
    head(20) %>%
    arrange(Store.Location)

# Get Marketing Spend for Top 20 countries by Product Category
chart.data <- 
    subset(retail.data, Store.Location %in% unique(countries$Store.Location)) %>%
    group_by(Store.Location, Product.Category) %>%
    summarise(Marketing = sum(Marketing.Spend..USD.), .groups = "keep") %>%
    arrange(Store.Location, Product.Category) %>%
    pivot_wider(
        names_from = "Store.Location", 
        values_from = "Marketing"
    )

# Create Spend matrix by Product Category (rows) across Countries (columns)
chart.matrix <- as.matrix(chart.data[, countries$Store.Location])
rownames(chart.matrix) <- sort(unique(chart.data$Product.Category))

colors <- c("pink", "royalblue", "orange", "lightgreen")
country.names <- colnames(chart.matrix)

barplot(
    chart.matrix,
    col = colors,
    cex.axis = 0.8,
    cex.names = 0.7,
    names.arg = country.names,
    ylab = "",
    las = 2,
    ylim = c(0, 14500),
    axes = TRUE
)
title("Marketing Spend for Top 20 Countries by Product Category")
legend(x = 19, y = 14000, legend = rownames(chart.matrix),
       col = colors, lwd = 4, cex = 0.8)
grid(nx = NULL, ny = NULL)

chart.data
## # A tibble: 4 × 21
## # Groups:   Product.Category [4]
##   Product.Category Armenia Bahrain Congo  Guam Kenya Korea
##   <chr>              <int>   <int> <int> <int> <int> <int>
## 1 Clothing            1600    1415  2292  2170  1661  1620
## 2 Electronics         1724    1784  2863  1481  1927  3532
## 3 Furniture           2643    2969  4403  2381  3050  3789
## 4 Groceries           1467    1794  1829  2020   907  1791
## # ℹ 14 more variables: `Libyan Arab Jamahiriya` <int>, Mauritius <int>,
## #   Morocco <int>, Nauru <int>, `Norfolk Island` <int>, `Sierra Leone` <int>,
## #   Tanzania <int>, Tokelau <int>, Turkey <int>,
## #   `Turks and Caicos Islands` <int>, `United Arab Emirates` <int>,
## #   `United States Virgin Islands` <int>, `Wallis and Futuna` <int>,
## #   Yemen <int>

Create a chart of top 20 countries with highest sales revenue, but visualize the marketing spend values across product category.

# Chart of Marketing Spend for Top 20 Countries with 
# Highest Sales Revenue by Product Category

# Get list of Top 20 countries in Sales Revenue
countries <- retail.data %>%
    group_by(Store.Location) %>%
    summarise(Total.Sales = sum(Sales.Revenue..USD.)) %>%
    arrange(desc(Total.Sales)) %>%
    head(20) %>%
    arrange(Store.Location)

colors <- c("pink", "royalblue", "orange", "lightgreen")

# Get data frame of Top 20 countries summarizing Marketing Spend 
# by Product Category
chart.data <- 
    subset(retail.data, Store.Location %in% unique(countries$Store.Location)) %>%
    group_by(Store.Location, Product.Category) %>%
    summarise(
        Marketing = sum(Marketing.Spend..USD.), 
        .groups = "keep"
    ) %>%
    arrange(Store.Location, Product.Category)

ggplot(chart.data, 
       aes(y = Store.Location, x = Marketing, fill = Product.Category)
    ) +
    geom_bar(stat = "identity") +
    scale_fill_manual(values = colors) +
    scale_x_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    labs(
        x = "Marketing Spend (USD)", 
        y = "",
        title = "Marketing Spend for Top 20 Sales Countries by Product Category"
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5)
    )

head(chart.data, 10)
## # A tibble: 10 × 3
## # Groups:   Store.Location, Product.Category [10]
##    Store.Location Product.Category Marketing
##    <chr>          <chr>                <int>
##  1 Anguilla       Clothing              1265
##  2 Anguilla       Electronics           2322
##  3 Anguilla       Furniture             2284
##  4 Anguilla       Groceries             1012
##  5 Benin          Clothing              1749
##  6 Benin          Electronics           1692
##  7 Benin          Furniture             2500
##  8 Benin          Groceries              910
##  9 Cayman Islands Clothing              1033
## 10 Cayman Islands Electronics           1485

Holiday Effect

Examine Sales Revenue by Day of the Week accounting for Holiday Effect.

chart.data <- retail.data %>%
    group_by(Day.of.the.Week) %>%
    summarise(
        Sales = sum(Sales.Revenue..USD.),
        Sales.Regular = sum(ifelse(Holiday.Effect, 0, Sales.Revenue..USD.)),
        Sales.Holiday = sum(ifelse(Holiday.Effect, Sales.Revenue..USD., 0))
    )

chart.data
## # A tibble: 7 × 4
##   Day.of.the.Week     Sales Sales.Regular Sales.Holiday
##   <chr>               <dbl>         <dbl>         <dbl>
## 1 Friday          11274065.     11037438.       236628.
## 2 Monday          11242331.     11003579.       238752.
## 3 Saturday        13128833.     13128833.            0 
## 4 Sunday          13373194.     13111477.       261716.
## 5 Thursday        11294932.     11065873.       229059.
## 6 Tuesday         11117111.     11117111.            0 
## 7 Wednesday       11054821.     11054821.            0
# Chart of Sales Revenue across Holiday Effect

chart.data <- retail.data %>%
    group_by(Holiday.Effect) %>%
    summarise(Sales = sum(Sales.Revenue..USD.))

chart.data$Holiday.Effect <- 
    ifelse(chart.data$Holiday.Effect, 'Holiday', 'Non-Holiday')

chart.data
## # A tibble: 2 × 2
##   Holiday.Effect     Sales
##   <chr>              <dbl>
## 1 Non-Holiday    81519132.
## 2 Holiday          966155.
ggplot(chart.data, aes(y = Sales, x = Holiday.Effect)) +
    geom_col(
        fill = c("royalblue", "green")
    ) +
    labs(
        x = "Holiday Effect", 
        y = "Sales", 
        title = "Sales by Holiday Effect"
    ) +
    scale_y_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5)
    )

# Chart Sales by Day of Week across Holiday Effect

chart.data <- retail.data %>%
    group_by(Day.of.the.Week, Holiday.Effect) %>%
    summarise(
        Sales = sum(Sales.Revenue..USD.), 
        .groups = "keep"
    )

chart.data$Holiday.Effect <- 
    ifelse(chart.data$Holiday.Effect, 'Holiday', 'Non-Holiday')

ggplot(chart.data, aes(x = Day.of.the.Week, y = Sales, fill = Holiday.Effect)) +
    geom_bar(stat = "identity") +
    scale_fill_manual(
        values = c("green", "royalblue")
    ) +
    labs(
        x = "Day of Week",
        y = "Sales (USD)", 
        title = "Sales by Day of Week across Holiday Effect"
    ) +
    scale_y_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5)
    )

Holiday Effect sales do not seem to impact the overall Sales Revenue.


Day of the Week

Create a stacked bar chart of Sales Revenue across Product Category by Day of the Week.

# Chart Sales by Day of Week across Product Category

chart.data <- retail.data %>%
    group_by(Day.of.the.Week, Product.Category) %>%
    summarise(Sales = sum(Sales.Revenue..USD.), .groups = "keep")

ggplot(chart.data, 
       aes(x = Day.of.the.Week, y = Sales, fill = Product.Category)
    ) +
    geom_bar(stat = "identity") +
    scale_fill_manual(
        values = c("pink", "royalblue", "orange", "green")
    ) +
    labs(
        x = "",
        y = "Sales (USD)", 
        title = "Sales by Day of Week across Product Category"
    ) +
    scale_y_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    theme_bw() +
    theme(
        plot.title = element_text(hjust = 0.5)
    )


Time Series Plots

Summarize Sales Revenue by Date

chart.data <- retail.data %>%
    group_by(Date) %>%
    summarise(Sales = sum(Sales.Revenue..USD.)) %>%
    arrange(Date)

chart.data$Date <- as.Date(chart.data$Date)

head(chart.data,10)
## # A tibble: 10 × 2
##    Date         Sales
##    <date>       <dbl>
##  1 2022-01-01 120733.
##  2 2022-01-02 113007.
##  3 2022-01-03  85866.
##  4 2022-01-04  88386.
##  5 2022-01-05  84535.
##  6 2022-01-06  94267.
##  7 2022-01-07 100736.
##  8 2022-01-08 120248.
##  9 2022-01-09  99686.
## 10 2022-01-10  98114.
ggplot(chart.data, aes(x = Date, y = Sales)) +
    geom_line(stat = "identity", col = "royalblue") +
    labs(title = "Total Sales Revenue by Date") +
    scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))

Summarize Sales Revenue across Product Categories by Date

chart.data <- retail.data %>%
    group_by(Date, Product.Category) %>%
    summarise(Sales = sum(Sales.Revenue..USD.), .groups = "keep") %>%
    arrange(Date)

chart.data$Date <- as.Date(chart.data$Date)

colors <- c("Clothing" = "pink", 
            "Electronics" = "royalblue", 
            "Furniture" = "orange", 
            "Groceries" = "green")

ggplot(chart.data, aes(x = Date, y = Sales, col = Product.Category)) +
    geom_line(
        show.legend = TRUE, 
        stat = "identity"
    ) +
    scale_color_manual(values = colors) +
    scale_y_continuous(
        labels = scales::number_format(big.mark = ",")
    ) +
    labs(title = "Sales Revenue across Product Category by Date") +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))

Summarize Sales Revenue across Product Categories by Month

# Barchart of Sales Revenue by Month across Product Category

retail.data$Report.Date <- as.Date(format(as.Date(retail.data$Date), "%Y-%m-01"))

chart.data <- retail.data %>%
    group_by(Report.Date, Product.Category) %>%
    summarise(Sales = sum(Sales.Revenue..USD.), .groups = "keep") %>%
    arrange(Report.Date)

colors <- c("Clothing" = "pink", 
            "Electronics" = "royalblue", 
            "Furniture" = "orange", 
            "Groceries" = "green")

ggplot(chart.data, aes(x = Report.Date, y = Sales, fill = Product.Category)) +
    geom_bar(show.legend = TRUE, stat = "identity") +
    scale_fill_manual(values = colors) +
    scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
    labs(title = "Sales Revenue across Product Category by Month") +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))