General
# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 509761 27.3 1134340 60.6 644240 34.5
## Vcells 905789 7.0 8388608 64.0 1635293 12.5
cat("\f") # Clear the console
if(!is.null(dev.list())) dev.off() # Clear all plots
## null device
## 1
# Prepare needed libraries
packages <- c("ggplot2"
, "lemon"
, "gridExtra" # For Q1
, "ggrepel" # For labels in Q2.b
, "scales"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i], dependencies = TRUE)
}
library(packages[i], character.only = TRUE)
}
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'lemon' was built under R version 4.2.3
## Warning: package 'ggrepel' was built under R version 4.2.3
rm(packages)
color <- c("red", "blue", "cyan")
color <- c("#999999",
"#D55E00",
"#E69F00",
"#56B4E9",
"#009E73",
"#F0E442",
"#CC79A7",
"#74c476",
"#1c92d5",
"#0072B2",
"#9e79cc"
)
# Set working directory and path to data, if need be
# setwd("")
# Load data
sales <- read.csv("C:/Users/91976/Desktop/Software tools/R/Week 3/r.hw03.sales.csv")
items <- read.csv("C:/Users/91976/Desktop/Software tools/R/Week 3/r.hw03.items.csv")
# Merge data together
sales <- merge(sales,items)
# Reorder variables
var.order <- c("date"
, "category"
, "subcategory"
, "item.name"
, "volume"
, "price"
, "sale.bottles"
, "sale.volume"
, "sale.dollars"
)
sales <- sales[,var.order]
# Clean up merged data
sales$item.id <- NULL
# Reorder variables
var_order <- c("date", "category", "subcategory", "item.name", "volume", "price", "sale.bottles", "sale.volume", "sale.dollars")
sales <- sales[, var_order]
# Convert variables to proper types
sales$date <- as.Date(sales$date, format = "%Y-%m-%d")
sales$category <- factor(
x = sales$category,
levels = c("Amaretto", "Brandy", "Distilled Spirits", "Gin", "Rum", "Schnapps", "Tequila", "Vodka", "Whisky", "Misc"),
ordered = TRUE
)
Calendar Heatmap
# Generate a palette of random professional colors
num_colors <- 5
set.seed(123) # Setting seed for reproducibility
color_palette <- sample(colors(), num_colors)
# Aggregate sales by date
q1_agg <- aggregate(sale.dollars ~ date, data = sales, sum)
# Create a dataframe with sale dates in 2015
q1_dates <- data.frame(date = seq(from = as.Date("2015/01/01"), to = as.Date("2015/12/31"), by = "day"))
# Merge data to have sales data for each day with NA values for dates without sales
q1_data <- merge(q1_dates, q1_agg, by = "date", all.x = TRUE)
# Create sales level cut-offs for heatmap
q1_data$sale.level <- cut(q1_data$sale.dollars, breaks = c(0, 1e6, 1.2e6, 1.4e6, 1.6e6, 2e6), labels = c("0-1 mln", "1-1.2 mln", "1.2-1.4 mln", "1.4-1.6 mln", "1.6-2 mln"), ordered_result = TRUE)
# Create calendar dimensions
q1_data$day <- as.numeric(format(q1_data$date, "%d"))
q1_data$weekday <- factor(weekdays(q1_data$date, abbreviate = TRUE), levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), ordered = TRUE)
q1_data$week <- as.numeric(format(q1_data$date, "%W")) + 1
q1_data$week <- q1_data$week - as.numeric(format(as.Date(cut(q1_data$date, breaks = "month")), "%W"))
q1_data$month <- factor(months(q1_data$date, abbreviate = FALSE), levels = month.name, ordered = TRUE)
q1_data$quarter <- factor(quarters(q1_data$date, abbreviate = TRUE), levels = c("Q1", "Q2", "Q3", "Q4"), ordered = TRUE)
q1_data$day <- as.numeric(format(x = q1_data$date, "%d"))
q1_data$weekday <- factor(x = weekdays(x = q1_data$date, abbreviate = TRUE), levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), ordered = TRUE)
q1_data$week <- as.numeric(format(x = q1_data$date, "%W")) + 1
q1_data$week <- q1_data$week - as.numeric(format(x = as.Date(cut(x = q1_data$date, breaks = "month")), "%W"))
q1_data$month <- factor(x = months(q1_data$date, abbreviate = FALSE), levels = month.name, ordered = TRUE)
q1_heatmap <- ggplot(data = q1_data, aes(x = weekday, y = week, fill = sale.level)) +
geom_tile(color = "white", size = 0.5) +
geom_text(aes(label = day)) +
scale_fill_manual(values = c("#4CAF50", "#2196F3", "#FFEB3B", "#FF5722", "#9C27B0"), na.value = "gray") +
labs(x = "", y = "", title = "Daily Total Sales, 2015") +
guides(fill = guide_legend(nrow = 1)) +
facet_rep_wrap(~quarter + month, ncol = 3, strip.position = "top", repeat.tick.labels = TRUE) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "red", margin = margin(t = 10, b = -20)),
strip.background = element_blank(),
strip.placement = "outside",
strip.text.x = element_text(size = 14, face = "bold", color = "red"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.text.y = element_blank(),
axis.text.x = element_text(size = 10, face = "bold"),
axis.ticks = element_blank(),
legend.position = "bottom",
legend.direction = "horizontal",
legend.title = element_blank()
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print the final heatmap
print(q1_heatmap)

Q2
sales$price.per.l <- sales$sale.dollars / sales$sale.volume
# Assuming 'volume' is in milliliters, you may need to adjust the conversion factor if it's in a different unit
conversion_factor <- 0.001
# Create 'price.per.l' column
sales$price.per.l <- sales$sale.dollars / (sales$volume * conversion_factor)
# Exclude the top 5% of price per liter values
filtered_sales <- subset(sales, price.per.l <= quantile(price.per.l, 0.95, na.rm = TRUE))
# Order of categories in reverse
category_order <- c("Miscellaneous", "Whisky", "Vodka", "Tequila", "Schnapps", "Rum", "Gin", "Distilled Spirits", "Brandy", "Amaretto")
# Convert 'category' to an ordered factor
filtered_sales$category <- factor(filtered_sales$category, levels = category_order, ordered = TRUE)
# Replace "NA" with "Miscellaneous"
filtered_sales$category[is.na(filtered_sales$category)] <- "Miscellaneous"
# Randomly sample 10% of the data
set.seed(123) # for reproducibility
sampled_data <- filtered_sales[sample(nrow(filtered_sales), 0.1 * nrow(filtered_sales)), ]
# Box-and-whisker plot
library(ggplot2)
library(RColorBrewer)
# Define colors for categories
category_colors <- brewer.pal(10, "Set3")
# Plot
ggplot(sampled_data, aes(x = price.per.l, y = category, fill = category)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
scale_fill_manual(values = category_colors) +
theme_minimal() +
theme(legend.position = "right") +
labs(x = "Price per Liter ($)", y = "Category", fill = "Category") +
ggtitle("Distribution of Price per Liter Across Categories (Excluding Top 5%)")

Q2B
# Scatter plot
q2b.data <- aggregate(cbind(sale.volume) ~ category + subcategory, data = sales, FUN = sum)
colnames(q2b.data)[colnames(q2b.data) == "sale.volume"] <- "sale.volume.agg"
q2b.data <- merge(x = sales[, c("category", "subcategory", "item.name", "price.per.l", "sale.volume")],
y = q2b.data,
by = c("category", "subcategory"))
q2b.data$price.per.l.w <- q2b.data$price.per.l * q2b.data$sale.volume / q2b.data$sale.volume.agg
q2b.data <- aggregate(cbind(price.per.l.w, sale.volume) ~ category + subcategory, data = q2b.data, FUN = sum)
# Create labels to be used by ggrepel
labels <- c("80 Proof Vodkas", "Canadian Whiskies", "Spiced Rum", "Vodkas", "Miscellaneous Whiskies", "Single Barrel Bourbon Whiskies", "Single Malt Scotch", "Japanese Whisky")
# Create empty labels variable
q2b.data$labels <- ifelse(q2b.data$subcategory %in% labels, paste(q2b.data$subcategory, ", ", round(q2b.data$price.per.l.w, 1), " $/l"), NA)
# Scatter plot
q2b_plot <- ggplot(data = q2b.data, aes(x = price.per.l.w, y = sale.volume / 1000, label = labels)) +
geom_point(aes(color = category), fill = "white", shape = 21, stroke = 2, size = 4) +
scale_color_manual(values = color) +
geom_text_repel(
force = 10,
direction = "both",
nudge_x = 1,
nudge_y = 1,
point.padding = 1.5,
box.padding = 1.5,
segment.size = 0.5,
size = 4,
max.overlaps = 50
) +
scale_x_continuous(breaks = seq(0, round(max(q2b.data$price.per.l.w * 1.1), 0), 2)) +
scale_y_continuous(breaks = seq(0, round(max(q2b.data$sale.volume * 1.1 / 1000), 0), 250)) +
labs(
x = "Average weighted price per liter, $",
y = "Liters sold, thousands",
title = "Liquor subcategories",
subtitle = "price vs quantity"
) +
theme_bw() +
theme(
plot.title = element_text(
hjust = 0.5,
face = "bold",
size = 20,
color = "red",
margin = margin(b = 5)
),
panel.grid.minor.y = element_blank(),
panel.grid.major = element_blank(), # Remove major grid lines
axis.text.y = element_text(size = 10, face = "bold"),
axis.title.y = element_text(
size = 16,
face = "bold",
color = "red",
margin = margin(r = 10, l = 10)
),
axis.text.x = element_text(size = 10, face = "bold"),
axis.title.x = element_text(
size = 16,
face = "bold",
color = "red",
margin = margin(t = 10, b = 10)
),
legend.position = c(0.9, 0.8),
legend.direction = "vertical",
legend.title = element_blank(),
legend.text = element_text(size = 10, face = "bold"),
legend.background = element_rect(fill = "transparent")
)
print(q2b_plot)
## Warning: Removed 62 rows containing missing values (`geom_text_repel()`).

Q3a
q3a <- ggplot()
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
describe(sales[c(1, 2, 9)])
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
## vars n mean sd median trimmed mad min max
## date 1 2184477 NaN NA NA NaN NA Inf -Inf
## category* 2 2184477 7.38 2.16 8.00 7.67 1.48 1.00 10
## sale.dollars 3 2184477 130.16 405.47 70.56 84.54 69.03 1.34 106326
## range skew kurtosis se
## date -Inf NA NA NA
## category* 9.0 -1.09 0.36 0.00
## sale.dollars 106324.7 39.55 4816.76 0.27
q3a.data <- aggregate(cbind(sale.dollars) ~ category + months(date), data = sales, FUN = sum)
q3a.data$month <- factor(q3a.data$`months(date)`,
levels = month.name,
labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
"Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
ordered = TRUE
)
q3a.data$`months(date)` <- NULL
q3a.data <- merge(q3a.data, aggregate(x = cbind(sale.dollars) ~ category, data = q3a.data, FUN = sum),
by = c("category"),
all = TRUE
)
names(q3a.data) <- c("category", "sales.monthly", "month", "sales.annual")
q3a.data$sales.monthly.share <- round(x = 100 * q3a.data$sales.monthly/q3a.data$sales.annual, digits = 2)
q3a_plot <- ggplot(q3a.data, aes(x = month, y = sales.monthly.share, group = category, color = category, fill = category)) +
geom_line(size = 1.25) +
geom_point(size = 4, shape = 21, color = "black", stroke = 1.5) +
scale_color_manual(values = color) +
scale_fill_manual(values = color) +
scale_y_continuous(breaks = breaks_extended(10)) +
labs(x = "Month", y = "Share of total sales", title = "% of Total Sales per month") +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "red", margin = margin(b = 10)),
axis.text.y = element_text(size = 10, face = "bold"),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold", color = "red", margin = margin(t = 10, b = 10)),
legend.position = "right",
legend.direction = "vertical",
legend.title = element_blank(),
legend.text = element_text(size = 10, face = "bold"),
panel.grid = element_blank() # Remove background lines
)
q3a_plot

Q3b chart
q3b <- ggplot()
color <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "Red","blue")
# Aggregate sales data
q3b_data <- aggregate(cbind(sale.dollars) ~ category + weekdays(date, abbreviate = TRUE), sales, sum)
# Factorize weekdays
q3b_data$weekday <- factor(q3b_data[, 2], levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), ordered = TRUE)
q3b_data[, 2] <- NULL
# Merge with total sales data
q3b_data <- merge(q3b_data, aggregate(x = cbind(sale.dollars) ~ category, data = q3b_data, FUN = sum),
by = c("category"),
all = TRUE
)
# Rename columns
names(q3b_data) <- c("category", "sales.weekday", "weekday", "sales.annual")
# Calculate sales share
q3b_data$sales.weekday.share <- round(x = 100 * q3b_data$sales.weekday/q3b_data$sales.annual, digits = 2)
# Plot
q3b_plot <- ggplot(q3b_data, aes(x = weekday, y = sales.weekday.share, group = category, color = category, fill = category)) +
geom_line(size = 1.25) +
geom_point(size = 4, shape = 21, color = "black", stroke = 1.5) +
scale_color_manual(values = color) +
scale_fill_manual(values = color) +
labs(x = "Weekday", y = "Share of sales", title = "% of Total Sales per weekday") +
scale_y_continuous(breaks = breaks_extended(10)) +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "#912600", margin = margin(b = 10)),
axis.text.y = element_text(size = 10, face = "bold"),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 10, face = "bold"),
axis.title.x = element_text(size = 16, face = "bold", color = "#912600", margin = margin(t = 10, b = 10)),
legend.position = "none",
panel.grid = element_blank() # Remove background lines
)
q3b_plot

q3b <- q3b_plot
Q4 chart
q4 <- ggplot()
# Aggregate category sales by dollars, liters, and bottles
q4.data <- aggregate(cbind(sale.dollars, sale.volume, sale.bottles) ~ category, data = sales, FUN = sum)
# Add percentage of total (optional)
q4.data$share.dollars <- round(100 * q4.data$sale.dollars / sum(q4.data$sale.dollars), digits = 2)
q4.data$share.volume <- round(100 * q4.data$sale.volume / sum(q4.data$sale.volume), digits = 2)
q4.data$share.bottles <- round(100 * q4.data$sale.bottles / sum(q4.data$sale.bottles), digits = 2)
# Calculate ranks
q4.data$rank.dollars <- rank(-q4.data$share.dollars)
q4.data$rank.volume <- rank(-q4.data$share.volume)
q4.data$rank.bottles <- rank(-q4.data$share.bottles)
# Set up horizontal spacing coordinates
x0 <- 0
x1 <- 1
x2 <- 2
x3 <- 3
x4 <- 4
q4_plot <- ggplot(q4.data) +
geom_segment(aes(x = x1, xend = x2, y = rank.dollars, yend = rank.volume, color = category), size = 1.25, show.legend = FALSE) +
geom_segment(aes(x = x2, xend = x3, y = rank.volume, yend = rank.bottles, color = category), size = 1.25, show.legend = FALSE) +
geom_vline(xintercept = x1, linetype = "dashed", size = 0.5) +
geom_vline(xintercept = x2, linetype = "dashed", size = 0.5) +
geom_vline(xintercept = x3, linetype = "dashed", size = 0.5) +
geom_text(aes(label = category, y = rank.dollars, x = x1 - 0.1), hjust = 1, size = 5.5, fontface = "bold") +
geom_text(aes(label = category, y = rank.bottles, x = x3 + 0.1), hjust = 0, size = 5.5, fontface = "bold") +
geom_point(aes(x = x1, y = rank.dollars, color = category), size = 10) +
geom_text(aes(label = rank.dollars, y = rank.dollars, x = x1), color = "white", size = 6, fontface = "bold") +
geom_point(aes(x = x2, y = rank.volume, color = category), size = 10) +
geom_text(aes(label = rank.volume, y = rank.volume, x = x2), color = "white", size = 6, fontface = "bold") +
geom_point(aes(x = x3, y = rank.bottles, color = category), size = 10) +
geom_text(aes(label = rank.bottles, y = rank.bottles, x = x3), color = "white", size = 6, fontface = "bold") +
scale_color_manual(values = color) +
scale_y_reverse(breaks = c(min(q4.data$rank.dollars):max(q4.data$rank.dollars))) +
scale_x_continuous(breaks = c(x0, x1, x2, x3, x4), labels = c("", "Sales, $", "Sales, liters", "Sales, bottles", "")) +
coord_cartesian(xlim = c(x0, x4)) +
labs(title = "Liquor Category Rankings") +
theme_bw() +
theme(
plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "#912600", margin = margin(b = 30)),
axis.text.y = element_blank(),
axis.title.y = element_blank(),
axis.ticks.y = element_blank(),
axis.text.x = element_text(size = 16, face = "bold", color = "#912600", margin = margin(t = 10, b = 10)),
axis.title.x = element_blank(),
axis.ticks.x = element_blank(),
panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
legend.position = "none"
)
q4_plot

q4 <- q4_plot