rm(list = ls())
packages <- c("ggplot2"
, "lemon"
, "gridExtra" # For Q1
, "ggrepel" # For labels in Q2.b
, "scales"
, "dplyr",
"lubridate",
"tidyr"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
##
## 载入程辑包:'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## 载入程辑包:'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
setwd("D:/Study/Tools 4 Data Analysis/R Assignment 3")
sales <- read.csv("D:/Study/Tools 4 Data Analysis/R Assignment 3/r.hw03.sales.csv"
, check.names = FALSE
, stringsAsFactors = FALSE
, na.strings = ""
)
items <- read.csv("D:/Study/Tools 4 Data Analysis/R Assignment 3/r.hw03.items.csv"
, check.names = FALSE
, stringsAsFactors = FALSE
, na.strings = ""
)
# Merge data together
sales <- merge(sales, items, by="item.id", all.x=T, all.y=F)
# Reorder variables
var.order <- c("date"
, "category"
, "subcategory"
, "item.name"
, "volume"
, "price"
, "sale.bottles"
, "sale.volume"
, "sale.dollars"
)
sales <- sales[var.order]
sales$date <- as.Date(sales$date)
dollar.sales.date <- aggregate(sale.dollars ~ date, sales, sum)
year2015 <- seq(from=as.Date("2015-01-01"), to=as.Date("2015-12-31"), by="day")
sales.dates <- data.frame(date = year2015)
daily.sales <- merge(sales.dates, dollar.sales.date, by="date", all.x=TRUE)
daily.sales$day <- as.numeric(format(daily.sales$date, "%d"))
daily.sales$weekday <- factor(weekdays(daily.sales$date, abbreviate = TRUE),
levels = c("周一", "周二", "周三", "周四", "周五", "周六", "周日"),
ordered=TRUE
)
daily.sales$month <- factor(months(daily.sales$date, abbreviate = TRUE),
levels = c("1月", "2月", "3月", "4月", "5月", "6月", "7月", "8月", "9月", "10月", "11月", "12月"),
ordered = TRUE
)
daily.sales$week <- as.numeric(format(daily.sales$date, "%W")) + 1
weeknum <- as.numeric(format(as.Date(cut(daily.sales$date, "month")), "%W"))
daily.sales$week <- daily.sales$week - weeknum
#
# chinese_weekdays <- c("星期日", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六")
# english_weekdays <- c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
#
# chinese_months <- c("一月", "二月", "三月", "四月", "五月", "六月", "七月", "八月", "九月", "十月", "十一月", "十二月")
# english_months <- c("Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
#
# weekday_lookup <- setNames(english_weekdays, chinese_weekdays)
# month_lookup <- setNames(english_months, chinese_months)
#
# daily_sales$weekday <- weekday_lookup[as.character(daily_sales$weekday)]
# daily_sales$month <- month_lookup[as.character(daily_sales$month)]
heatmap <- ggplot(daily.sales, aes(x = weekday,
y = week,
fill = factor(cut(sale.dollars,
breaks = c(0, 7*10^5, 1*10^6, 1.3*10^6, 1.7*10^6, 2*10^6),
labels=c("0-1 min", "1-1.2 min", "1.2 - 1.4min", "1.4-1.6min", "1.6-2 min")
)))) +
geom_tile(colour = "white") +
geom_text(aes(label = day)) +
facet_rep_wrap( ~ month
, ncol = 3
, strip.position = "top"
, repeat.tick.labels = TRUE
) +
scale_y_reverse() +
scale_x_discrete(limits = levels(daily.sales$weekday), position = "top")+
theme(strip.background = element_blank(), strip.placement = "outside", axis.text.y = element_blank()
, legend.direction = "horizontal"
, legend.position = "bottom"
, legend.title = element_blank()) +
labs(x = "", y = "", title = "Daily Total Sales, 2015")
heatmap
sales$price.L <- sales$price / ((sales$volume)/1000.0)
sales$category <- factor(sales$category,
levels=c("Amaretto", "Brandy", "Distilled Spirits", "Gin", "Rum","Schnapps", "Tequila", "Vodka", "Whisky", "Misc"),
ordered = T)
category.price.L <- sales[c("category", "price.L")]
quantile.limit <- quantile(sales$price.L, 0.95)
data.filtered <- dplyr::filter(sales, price.L <= quantile.limit)
boxplot <- ggplot(data.filtered,
aes(x = category,
y = price.L,
fill = category)) +
geom_boxplot(outlier.fill = NULL) +
coord_flip() +
scale_x_discrete(limits = rev(levels(category.price.L$category))) +
scale_y_continuous(breaks = seq(from = 0, to = 40, by = 2),
labels = seq(from = 0, to = 40, by = 2)) +
labs(x = "Category", y = "Price per liter", title = "Categories", subtitle = "Excluding top 5% values") +
theme(legend.title = element_blank())
boxplot
Q2b <- sales %>%
group_by(subcategory, category) %>%
summarize(
total_dollar_sales = sum(sale.dollars),
total_liter_sales = sum(sale.volume),
.groups = 'drop'
) %>%
mutate(price.L.w = total_dollar_sales / total_liter_sales)
top_subcategories <- Q2b %>%
arrange(desc(total_dollar_sales)) %>%
slice(1:10) %>%
pull(subcategory)
Q2bPlot <- ggplot(Q2b, aes(x = price.L.w, y = total_liter_sales / 1000, color = category)) +
geom_point() +
scale_color_brewer(palette = "Set3") +
geom_text_repel(
aes(label = ifelse(subcategory %in% top_subcategories, paste(subcategory, round(price.L.w, 2)), '')),
max.overlaps = 20
) +
theme_minimal() +
theme(legend.position = c(0.9, 0.9), legend.background = element_blank()) +
scale_x_continuous(breaks = seq(floor(min(Q2b$price.L.w)), ceiling(max(Q2b$price.L.w)), by = 2)) +
scale_y_continuous(breaks = seq(0, max(Q2b$total_liter_sales / 1000), by = 250)) +
labs(x = "Weighted Average Price per Liter ($)",
y = "Total Volume Sold (Thousands of Liters)",
title = "Liquor Subcategories", subtitle = "Price vs. Quantity"
)
Q2bPlot
sales$month <- month(sales$date, label = TRUE, abbr = TRUE)
sales$weekday <- wday(sales$date, label = TRUE, abbr = TRUE)
monthly.cat.sales <- aggregate(sale.dollars ~ category + month, sales, sum)
weekday.cat.sales <- aggregate(sale.dollars ~ category + weekday, sales, sum)
names(monthly.cat.sales)[3] <- "monthly.sales"
temp <- aggregate(monthly.sales ~ category, monthly.cat.sales, sum)
names(temp)[2] <- "total.month"
names(weekday.cat.sales)[3] <- "each.weekday"
temp_weekday <- aggregate(each.weekday ~ category, weekday.cat.sales, sum)
names(temp_weekday)[2] <- "total.weekday"
Q3a <- merge(monthly.cat.sales, temp, all.x = TRUE)
Q3a$share <- Q3a$monthly / Q3a$total.month
Q3b <- merge(weekday.cat.sales, temp_weekday, all.x = TRUE)
Q3b$share <- Q3b$each.weekday / Q3b$total.weekday
monthlyplot <- ggplot(Q3a, aes(x = month, y = share * 100, group = category, color = category)) +
geom_line() +
geom_point(aes(color = category), size = 2) +
scale_color_brewer(palette = "Set3") +
theme_minimal() +
labs(y = "", x = "Month", title = "Monthly Sales Percentage (%)")
monthlyplot
WeekdayPlot <- ggplot(Q3b, aes(x = weekday, y = share * 100, group = category, color = category)) +
geom_line() +
geom_point(aes(color = category), size = 2) +
scale_color_brewer(palette = "Set3") +
theme_minimal() +
labs(y = "", x = "Weekday", title = "Weekday Sales Percentage (%)")
WeekdayPlot
WeekdayPlot <- WeekdayPlot + theme(legend.position = "none")
grid.arrange(monthlyplot, WeekdayPlot, ncol = 2)
sales_ranking <- sales %>%
group_by(category) %>%
summarize(
sale_dollars = sum(sale.dollars),
sale_volume = sum(sale.volume),
sale_bottles = sum(sale.bottles)
) %>%
pivot_longer(
cols = starts_with("sale"),
names_to = "sale_type",
values_to = "sale"
# ) %>%
# mutate(
# sale_type = case_when(
# sale_type == "sale_dollars" ~ 1,
# sale_type == "sale_volume" ~ 2,
# sale_type == "sale_bottles" ~ 3
# )
)
sales_ranking <- sales_ranking %>%
group_by(sale_type) %>%
mutate(rank = rank(-sale))
ggplot(sales_ranking, aes(x = sale_type, y = rank, group = category, color = category)) +
geom_line() +
geom_point(size = 5) +
scale_y_reverse(breaks = 1:max(sales_ranking$rank)) + # To make rank 1 at the top
labs(title = "Liquor Category Rankings", x = "Sale Type", y = "Rank") +
theme_minimal() +
theme(legend.position = "none", # Remove legend if you don't want it
axis.text.x = element_text(angle = 0, hjust = 1)) +
theme(panel.grid.major = element_blank(),
panel.grid.minor = element_blank()) +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.y = element_blank(),
axis.ticks = element_blank()) +
geom_text(aes(label = rank), vjust = 0.4, size = 3, colour = "white")