This R notebook explores the Retail Sales Data with Seasonal Trends & Marketing (kaggle.com).
https://www.kaggle.com/datasets/abdullah0a/retail-sales-data-with-seasonal-trends-and-marketing/data
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Change figure size of plots
options(repr.plot.width = 16, repr.plot.height = 8)
Load all 30K+ records into a tidyverse data frame.
retail.data <- read.csv("retail-sales-data-with-seasonal-trends-and-marketing.csv")
head(retail.data, 20)
## Store.ID Product.ID Date Units.Sold Sales.Revenue..USD.
## 1 Spearsland 52372247 2022-01-01 9 2741.69
## 2 Spearsland 52372247 2022-01-02 7 2665.53
## 3 Spearsland 52372247 2022-01-03 1 380.79
## 4 Spearsland 52372247 2022-01-04 4 1523.16
## 5 Spearsland 52372247 2022-01-05 2 761.58
## 6 Spearsland 52372247 2022-01-06 8 3046.32
## 7 Spearsland 52372247 2022-01-07 6 2284.74
## 8 Spearsland 52372247 2022-01-08 9 3427.11
## 9 Spearsland 52372247 2022-01-09 7 2665.53
## 10 Spearsland 52372247 2022-01-10 1 380.79
## 11 Spearsland 52372247 2022-01-11 4 1523.16
## 12 Spearsland 52372247 2022-01-12 6 2284.74
## 13 Spearsland 52372247 2022-01-13 3 971.01
## 14 Spearsland 52372247 2022-01-14 6 2284.74
## 15 Spearsland 52372247 2022-01-15 3 971.01
## 16 Spearsland 52372247 2022-01-16 2 761.58
## 17 Spearsland 52372247 2022-01-17 8 3046.32
## 18 Spearsland 52372247 2022-01-18 6 2284.74
## 19 Spearsland 52372247 2022-01-19 13 4950.27
## 20 Spearsland 52372247 2022-01-20 5 1903.95
## Discount.Percentage Marketing.Spend..USD. Store.Location
## 1 20 81 Tanzania
## 2 0 0 Mauritania
## 3 0 0 Saint Pierre and Miquelon
## 4 0 0 Australia
## 5 0 0 Swaziland
## 6 0 41 Bhutan
## 7 0 0 Suriname
## 8 0 83 Taiwan
## 9 0 0 Papua New Guinea
## 10 0 164 Canada
## 11 0 61 Vietnam
## 12 0 0 Cocos (Keeling) Islands
## 13 15 197 Lebanon
## 14 0 0 Luxembourg
## 15 15 163 French Guiana
## 16 0 160 Paraguay
## 17 0 0 Nauru
## 18 0 0 Italy
## 19 0 188 Saint Helena
## 20 0 0 Portugal
## Product.Category Day.of.the.Week Holiday.Effect
## 1 Furniture Saturday False
## 2 Furniture Sunday False
## 3 Furniture Monday False
## 4 Furniture Tuesday False
## 5 Furniture Wednesday False
## 6 Furniture Thursday False
## 7 Furniture Friday False
## 8 Furniture Saturday False
## 9 Furniture Sunday False
## 10 Furniture Monday False
## 11 Furniture Tuesday False
## 12 Furniture Wednesday False
## 13 Furniture Thursday False
## 14 Furniture Friday False
## 15 Furniture Saturday False
## 16 Furniture Sunday False
## 17 Furniture Monday False
## 18 Furniture Tuesday False
## 19 Furniture Wednesday False
## 20 Furniture Thursday False
Count the number of distinct countries.
# Count number of distinct countries
length(unique(retail.data$Store.Location))
## [1] 243
Examine top selling countries.
# Chart of Top 20 Stores by Revenue
chart.data <- retail.data %>%
group_by(Store.Location) %>%
summarise(Sales = sum(Sales.Revenue..USD.)) %>%
arrange(desc(Sales)) %>%
head(20)
ggplot(chart.data, aes(x = Sales, y = reorder(Store.Location, Sales))) +
geom_col(
fill = "cornflowerblue"
) +
labs(
x = "Revenue (USD)", y = "",
title = "Top 20 Countries with Highest Revenue"
) +
scale_x_continuous(
labels = scales::number_format(big.mark = ",")
) +
theme_bw()
Examine Sales Revenue by Product Category
# Sales by Product Category
chart.data <- retail.data %>%
group_by(Product.Category) %>%
summarise(Sales = sum(Sales.Revenue..USD.))
ggplot(chart.data, aes(x = "", y = Sales, fill = Product.Category)) +
geom_col(color = "black") +
geom_text(
aes(label = round(Sales / 1000000, 1)),
position = position_stack(vjust = 0.5)
) +
scale_y_continuous(labels = NULL) +
scale_fill_manual(
values = c("pink", "royalblue", "orange", "lightgreen")
) +
labs(
x = "",
y = "Sales (MM)",
title = "Sales Revenue by Product Category"
) +
coord_polar(theta = "y") +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5)
)
ggplot(chart.data, aes(x = Product.Category, y = Sales)) +
geom_col(
fill = c("pink", "royalblue", "orange", "lightgreen")
) +
labs(
title = "Sales Revenue by Product Category",
ylab = "Sales (USD)", xlab = "Product Category"
) +
scale_y_continuous(
labels = scales::number_format(big.mark = ",")
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5)
)
## Ignoring unknown labels:
## • ylab : "Sales (USD)"
## • xlab : "Product Category"
boxplot(
retail.data$Sales.Revenue..USD. ~ retail.data$Product.Category,
col = c("pink", "royalblue", "orange", "lightgreen"),
main = "Distribution of Sales Revenue by Product Category",
ylab = "Sales (USD)",
xlab = "Product Category"
)
Examine Marketing Spend across various attributes.
hist(
retail.data$Marketing.Spend..USD.,
col = "royalblue",
main = "Histogram of Marketing Spend (USD)",
xlab = "USD"
)
grid(nx = NULL, ny = NULL)
# Scatter plot of Marketing by Sales per Location
chart.data <-
subset(retail.data, Marketing.Spend..USD. >= 0) %>%
group_by(Store.Location) %>%
summarise(
Sales = mean(Sales.Revenue..USD.),
Marketing = mean(Marketing.Spend..USD.)
)
plot(
chart.data$Marketing ~ chart.data$Sales,
main = "Average (Marketing ~ Sales) by Country",
ylab = "Marking Spend (USD)", xlab = "Sales Revenue (USD)",
col = "royalblue"
)
grid(nx = NULL, ny = NULL)
# Chart the distribution of Marketing Spend by Product Category
chart.data <- subset(retail.data, Marketing.Spend..USD. > 0)
boxplot(
chart.data$Marketing.Spend..USD. ~ chart.data$Product.Category,
col = c("pink", "royalblue", "orange", "lightgreen"),
main = "Distribution of Marketing Spend by Product Category",
ylab = "Marketing Spend (USD)", xlab = "Product Category"
)
chart.data <- subset(retail.data, Marketing.Spend..USD. > 0) %>%
group_by(
Product.Category
) %>%
summarise(
Marketing.Max = max(Marketing.Spend..USD.),
Marketing.Min = min(Marketing.Spend..USD.),
Marketing.Avg = mean(Marketing.Spend..USD.)
)
chart.data
## # A tibble: 4 × 4
## Product.Category Marketing.Max Marketing.Min Marketing.Avg
## <chr> <int> <int> <dbl>
## 1 Clothing 199 1 99.4
## 2 Electronics 199 1 99.2
## 3 Furniture 199 1 101.
## 4 Groceries 199 1 100.
chart.data <- retail.data %>%
group_by(Product.Category) %>%
summarise(Marketing = sum(Marketing.Spend..USD.))
pie(
chart.data$Marketing, labels = chart.data$Product.Category,
main = "Marketing Spend by Product Category",
col = c("pink", "royalblue", "orange", "lightgreen")
)
# Chart of Top 20 Countries by Marketing Spend
chart.data <- retail.data %>%
group_by(Store.Location) %>%
summarise(Marketing = sum(Marketing.Spend..USD.)) %>%
arrange(desc(Marketing)) %>%
head(20)
ggplot(chart.data, aes(x = Marketing, y = reorder(Store.Location, Marketing))) +
geom_col(
fill = "purple"
) +
labs(
x = "Spend (USD)",
y = "",
title = "Top 20 Countries with Highest Marketing Spend"
) +
scale_x_continuous(
labels = scales::number_format(big.mark = ",")
) +
theme_bw()
Create a chart visualizing the top 20 highest marketing spend countries by product category.
# Chart of Top 20 Countries with Highest Marketing Spend by Product Category
# Get Top 20 countries with highest marketing spend.
countries <- retail.data %>%
group_by(Store.Location) %>%
summarise(Total.Marketing = sum(Marketing.Spend..USD.)) %>%
arrange(desc(Total.Marketing)) %>%
head(20) %>%
arrange(Store.Location)
# Get Marketing Spend for Top 20 countries by Product Category
chart.data <-
subset(retail.data, Store.Location %in% unique(countries$Store.Location)) %>%
group_by(Store.Location, Product.Category) %>%
summarise(Marketing = sum(Marketing.Spend..USD.), .groups = "keep") %>%
arrange(Store.Location, Product.Category) %>%
pivot_wider(
names_from = "Store.Location",
values_from = "Marketing"
)
# Create Spend matrix by Product Category (rows) across Countries (columns)
chart.matrix <- as.matrix(chart.data[, countries$Store.Location])
rownames(chart.matrix) <- sort(unique(chart.data$Product.Category))
colors <- c("pink", "royalblue", "orange", "lightgreen")
country.names <- colnames(chart.matrix)
barplot(
chart.matrix,
col = colors,
cex.axis = 0.8,
cex.names = 0.7,
names.arg = country.names,
ylab = "",
las = 2,
ylim = c(0, 14500),
axes = TRUE
)
title("Marketing Spend for Top 20 Countries by Product Category")
legend(x = 19, y = 14000, legend = rownames(chart.matrix),
col = colors, lwd = 4, cex = 0.8)
grid(nx = NULL, ny = NULL)
chart.data
## # A tibble: 4 × 21
## # Groups: Product.Category [4]
## Product.Category Armenia Bahrain Congo Guam Kenya Korea
## <chr> <int> <int> <int> <int> <int> <int>
## 1 Clothing 1600 1415 2292 2170 1661 1620
## 2 Electronics 1724 1784 2863 1481 1927 3532
## 3 Furniture 2643 2969 4403 2381 3050 3789
## 4 Groceries 1467 1794 1829 2020 907 1791
## # ℹ 14 more variables: `Libyan Arab Jamahiriya` <int>, Mauritius <int>,
## # Morocco <int>, Nauru <int>, `Norfolk Island` <int>, `Sierra Leone` <int>,
## # Tanzania <int>, Tokelau <int>, Turkey <int>,
## # `Turks and Caicos Islands` <int>, `United Arab Emirates` <int>,
## # `United States Virgin Islands` <int>, `Wallis and Futuna` <int>,
## # Yemen <int>
Create a chart of top 20 countries with highest sales revenue, but visualize the marketing spend values across product category.
# Chart of Marketing Spend for Top 20 Countries with
# Highest Sales Revenue by Product Category
# Get list of Top 20 countries in Sales Revenue
countries <- retail.data %>%
group_by(Store.Location) %>%
summarise(Total.Sales = sum(Sales.Revenue..USD.)) %>%
arrange(desc(Total.Sales)) %>%
head(20) %>%
arrange(Store.Location)
colors <- c("pink", "royalblue", "orange", "lightgreen")
# Get data frame of Top 20 countries summarizing Marketing Spend
# by Product Category
chart.data <-
subset(retail.data, Store.Location %in% unique(countries$Store.Location)) %>%
group_by(Store.Location, Product.Category) %>%
summarise(
Marketing = sum(Marketing.Spend..USD.),
.groups = "keep"
) %>%
arrange(Store.Location, Product.Category)
ggplot(chart.data,
aes(y = Store.Location, x = Marketing, fill = Product.Category)
) +
geom_bar(stat = "identity") +
scale_fill_manual(values = colors) +
scale_x_continuous(
labels = scales::number_format(big.mark = ",")
) +
labs(
x = "Marketing Spend (USD)",
y = "",
title = "Marketing Spend for Top 20 Sales Countries by Product Category"
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5)
)
head(chart.data, 10)
## # A tibble: 10 × 3
## # Groups: Store.Location, Product.Category [10]
## Store.Location Product.Category Marketing
## <chr> <chr> <int>
## 1 Anguilla Clothing 1265
## 2 Anguilla Electronics 2322
## 3 Anguilla Furniture 2284
## 4 Anguilla Groceries 1012
## 5 Benin Clothing 1749
## 6 Benin Electronics 1692
## 7 Benin Furniture 2500
## 8 Benin Groceries 910
## 9 Cayman Islands Clothing 1033
## 10 Cayman Islands Electronics 1485
Examine Sales Revenue by Day of the Week accounting for Holiday Effect.
chart.data <- retail.data %>%
group_by(Day.of.the.Week) %>%
summarise(
Sales = sum(Sales.Revenue..USD.),
Sales.Regular = sum(ifelse(Holiday.Effect, 0, Sales.Revenue..USD.)),
Sales.Holiday = sum(ifelse(Holiday.Effect, Sales.Revenue..USD., 0))
)
chart.data
## # A tibble: 7 × 4
## Day.of.the.Week Sales Sales.Regular Sales.Holiday
## <chr> <dbl> <dbl> <dbl>
## 1 Friday 11274065. 11037438. 236628.
## 2 Monday 11242331. 11003579. 238752.
## 3 Saturday 13128833. 13128833. 0
## 4 Sunday 13373194. 13111477. 261716.
## 5 Thursday 11294932. 11065873. 229059.
## 6 Tuesday 11117111. 11117111. 0
## 7 Wednesday 11054821. 11054821. 0
# Chart of Sales Revenue across Holiday Effect
chart.data <- retail.data %>%
group_by(Holiday.Effect) %>%
summarise(Sales = sum(Sales.Revenue..USD.))
chart.data$Holiday.Effect <-
ifelse(chart.data$Holiday.Effect, 'Holiday', 'Non-Holiday')
chart.data
## # A tibble: 2 × 2
## Holiday.Effect Sales
## <chr> <dbl>
## 1 Non-Holiday 81519132.
## 2 Holiday 966155.
ggplot(chart.data, aes(y = Sales, x = Holiday.Effect)) +
geom_col(
fill = c("royalblue", "green")
) +
labs(
x = "Holiday Effect",
y = "Sales",
title = "Sales by Holiday Effect"
) +
scale_y_continuous(
labels = scales::number_format(big.mark = ",")
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5)
)
# Chart Sales by Day of Week across Holiday Effect
chart.data <- retail.data %>%
group_by(Day.of.the.Week, Holiday.Effect) %>%
summarise(
Sales = sum(Sales.Revenue..USD.),
.groups = "keep"
)
chart.data$Holiday.Effect <-
ifelse(chart.data$Holiday.Effect, 'Holiday', 'Non-Holiday')
ggplot(chart.data, aes(x = Day.of.the.Week, y = Sales, fill = Holiday.Effect)) +
geom_bar(stat = "identity") +
scale_fill_manual(
values = c("green", "royalblue")
) +
labs(
x = "Day of Week",
y = "Sales (USD)",
title = "Sales by Day of Week across Holiday Effect"
) +
scale_y_continuous(
labels = scales::number_format(big.mark = ",")
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5)
)
Holiday Effect sales do not seem to impact the overall Sales Revenue.
Create a stacked bar chart of Sales Revenue across Product Category by Day of the Week.
# Chart Sales by Day of Week across Product Category
chart.data <- retail.data %>%
group_by(Day.of.the.Week, Product.Category) %>%
summarise(Sales = sum(Sales.Revenue..USD.), .groups = "keep")
ggplot(chart.data,
aes(x = Day.of.the.Week, y = Sales, fill = Product.Category)
) +
geom_bar(stat = "identity") +
scale_fill_manual(
values = c("pink", "royalblue", "orange", "green")
) +
labs(
x = "",
y = "Sales (USD)",
title = "Sales by Day of Week across Product Category"
) +
scale_y_continuous(
labels = scales::number_format(big.mark = ",")
) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5)
)
Summarize Sales Revenue by Date
chart.data <- retail.data %>%
group_by(Date) %>%
summarise(Sales = sum(Sales.Revenue..USD.)) %>%
arrange(Date)
chart.data$Date <- as.Date(chart.data$Date)
head(chart.data,10)
## # A tibble: 10 × 2
## Date Sales
## <date> <dbl>
## 1 2022-01-01 120733.
## 2 2022-01-02 113007.
## 3 2022-01-03 85866.
## 4 2022-01-04 88386.
## 5 2022-01-05 84535.
## 6 2022-01-06 94267.
## 7 2022-01-07 100736.
## 8 2022-01-08 120248.
## 9 2022-01-09 99686.
## 10 2022-01-10 98114.
ggplot(chart.data, aes(x = Date, y = Sales)) +
geom_line(stat = "identity", col = "royalblue") +
labs(title = "Total Sales Revenue by Date") +
scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
Summarize Sales Revenue across Product Categories by Date
chart.data <- retail.data %>%
group_by(Date, Product.Category) %>%
summarise(Sales = sum(Sales.Revenue..USD.), .groups = "keep") %>%
arrange(Date)
chart.data$Date <- as.Date(chart.data$Date)
colors <- c("Clothing" = "pink",
"Electronics" = "royalblue",
"Furniture" = "orange",
"Groceries" = "green")
ggplot(chart.data, aes(x = Date, y = Sales, col = Product.Category)) +
geom_line(
show.legend = TRUE,
stat = "identity"
) +
scale_color_manual(values = colors) +
scale_y_continuous(
labels = scales::number_format(big.mark = ",")
) +
labs(title = "Sales Revenue across Product Category by Date") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))
Summarize Sales Revenue across Product Categories by Month
# Barchart of Sales Revenue by Month across Product Category
retail.data$Report.Date <- as.Date(format(as.Date(retail.data$Date), "%Y-%m-01"))
chart.data <- retail.data %>%
group_by(Report.Date, Product.Category) %>%
summarise(Sales = sum(Sales.Revenue..USD.), .groups = "keep") %>%
arrange(Report.Date)
colors <- c("Clothing" = "pink",
"Electronics" = "royalblue",
"Furniture" = "orange",
"Groceries" = "green")
ggplot(chart.data, aes(x = Report.Date, y = Sales, fill = Product.Category)) +
geom_bar(show.legend = TRUE, stat = "identity") +
scale_fill_manual(values = colors) +
scale_y_continuous(labels = scales::number_format(big.mark = ",")) +
labs(title = "Sales Revenue across Product Category by Month") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5))