General

  # Clear the workspace
  rm(list = ls()) # Clear environment
  gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 509761 27.3    1134340 60.6   644240 34.5
## Vcells 905789  7.0    8388608 64.0  1635293 12.5
  cat("\f")       # Clear the console
  if(!is.null(dev.list())) dev.off() # Clear all plots
## null device 
##           1
  # Prepare needed libraries
  packages <- c("ggplot2"
                , "lemon"
                , "gridExtra" # For Q1
                , "ggrepel"   # For labels in Q2.b
                , "scales"
                )
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i], dependencies = TRUE)
    }
    library(packages[i], character.only = TRUE)
  }
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'lemon' was built under R version 4.2.3
## Warning: package 'ggrepel' was built under R version 4.2.3
  rm(packages)
  
  color <- c("red", "blue", "cyan")
  
  color <- c("#999999",
          "#D55E00",
          "#E69F00",
          "#56B4E9",
          "#009E73",
          "#F0E442",
          "#CC79A7",
          "#74c476",
          "#1c92d5",
          "#0072B2",
          "#9e79cc"
          )

  
  # Set working directory and path to data, if need be
  # setwd("")
  
  # Load data
  
  sales <- read.csv("C:/Users/91976/Desktop/Software tools/R/Week 3/r.hw03.sales.csv")
  
  items <- read.csv("C:/Users/91976/Desktop/Software tools/R/Week 3/r.hw03.items.csv")
  
  
  # Merge data together
  sales <- merge(sales,items)
  
  # Reorder variables
  var.order <- c("date"
                , "category"
                , "subcategory"
                , "item.name"
                , "volume"
                , "price"
                , "sale.bottles"
                , "sale.volume"
                , "sale.dollars"
                )
  sales <- sales[,var.order]
  
  
  # Clean up merged data
  sales$item.id <- NULL
  
  # Reorder variables
  var_order <- c("date", "category", "subcategory", "item.name", "volume", "price", "sale.bottles", "sale.volume", "sale.dollars")
  sales <- sales[, var_order]
  
  # Convert variables to proper types
  sales$date <- as.Date(sales$date, format = "%Y-%m-%d")
  
  sales$category <- factor(
    x = sales$category,
    levels = c("Amaretto", "Brandy", "Distilled Spirits", "Gin", "Rum", "Schnapps", "Tequila", "Vodka", "Whisky", "Misc"),
    ordered = TRUE
  )

Calendar Heatmap

# Generate a palette of random professional colors
num_colors <- 5
set.seed(123)  # Setting seed for reproducibility
color_palette <- sample(colors(), num_colors)

# Aggregate sales by date
q1_agg <- aggregate(sale.dollars ~ date, data = sales, sum)

# Create a dataframe with sale dates in 2015
q1_dates <- data.frame(date = seq(from = as.Date("2015/01/01"), to = as.Date("2015/12/31"), by = "day"))

# Merge data to have sales data for each day with NA values for dates without sales
q1_data <- merge(q1_dates, q1_agg, by = "date", all.x = TRUE)

# Create sales level cut-offs for heatmap
q1_data$sale.level <- cut(q1_data$sale.dollars, breaks = c(0, 1e6, 1.2e6, 1.4e6, 1.6e6, 2e6), labels = c("0-1 mln", "1-1.2 mln", "1.2-1.4 mln", "1.4-1.6 mln", "1.6-2 mln"), ordered_result = TRUE)

# Create calendar dimensions
q1_data$day <- as.numeric(format(q1_data$date, "%d"))
q1_data$weekday <- factor(weekdays(q1_data$date, abbreviate = TRUE), levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), ordered = TRUE)
q1_data$week <- as.numeric(format(q1_data$date, "%W")) + 1
q1_data$week <- q1_data$week - as.numeric(format(as.Date(cut(q1_data$date, breaks = "month")), "%W"))
q1_data$month <- factor(months(q1_data$date, abbreviate = FALSE), levels = month.name, ordered = TRUE)
q1_data$quarter <- factor(quarters(q1_data$date, abbreviate = TRUE), levels = c("Q1", "Q2", "Q3", "Q4"), ordered = TRUE)

q1_data$day <- as.numeric(format(x = q1_data$date, "%d"))
q1_data$weekday <- factor(x = weekdays(x = q1_data$date, abbreviate = TRUE), levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), ordered = TRUE)
q1_data$week <- as.numeric(format(x = q1_data$date, "%W")) + 1
q1_data$week <- q1_data$week - as.numeric(format(x = as.Date(cut(x = q1_data$date, breaks = "month")), "%W"))
q1_data$month <- factor(x = months(q1_data$date, abbreviate = FALSE), levels = month.name, ordered = TRUE)

q1_heatmap <- ggplot(data = q1_data, aes(x = weekday, y = week, fill = sale.level)) +
  geom_tile(color = "white", size = 0.5) +
  geom_text(aes(label = day)) +
  scale_fill_manual(values = c("#4CAF50", "#2196F3", "#FFEB3B", "#FF5722", "#9C27B0"), na.value = "gray") +
  labs(x = "", y = "", title = "Daily Total Sales, 2015") +
  guides(fill = guide_legend(nrow = 1)) +
  facet_rep_wrap(~quarter + month, ncol = 3, strip.position = "top", repeat.tick.labels = TRUE) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "red", margin = margin(t = 10, b = -20)),
    strip.background = element_blank(),
    strip.placement = "outside",
    strip.text.x = element_text(size = 14, face = "bold", color = "red"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.text.y = element_blank(),
    axis.text.x = element_text(size = 10, face = "bold"),
    axis.ticks = element_blank(),
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.title = element_blank()
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Print the final heatmap
print(q1_heatmap)

Q2

  sales$price.per.l <- sales$sale.dollars / sales$sale.volume
  # Assuming 'volume' is in milliliters, you may need to adjust the conversion factor if it's in a different unit
  conversion_factor <- 0.001
  
  # Create 'price.per.l' column
  sales$price.per.l <- sales$sale.dollars / (sales$volume * conversion_factor)
  
  # Exclude the top 5% of price per liter values
  filtered_sales <- subset(sales, price.per.l <= quantile(price.per.l, 0.95, na.rm = TRUE))
  
  # Order of categories in reverse
  category_order <- c("Miscellaneous", "Whisky", "Vodka", "Tequila", "Schnapps", "Rum", "Gin", "Distilled Spirits", "Brandy", "Amaretto")
  
  # Convert 'category' to an ordered factor
  filtered_sales$category <- factor(filtered_sales$category, levels = category_order, ordered = TRUE)
  
  # Replace "NA" with "Miscellaneous"
  filtered_sales$category[is.na(filtered_sales$category)] <- "Miscellaneous"
  
  # Randomly sample 10% of the data
  set.seed(123)  # for reproducibility
  sampled_data <- filtered_sales[sample(nrow(filtered_sales), 0.1 * nrow(filtered_sales)), ]
  
  # Box-and-whisker plot
  library(ggplot2)
  library(RColorBrewer)
  
  # Define colors for categories
  category_colors <- brewer.pal(10, "Set3")
  
  # Plot
  ggplot(sampled_data, aes(x = price.per.l, y = category, fill = category)) +
    geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
    scale_fill_manual(values = category_colors) +
    theme_minimal() +
    theme(legend.position = "right") +
    labs(x = "Price per Liter ($)", y = "Category", fill = "Category") +
    ggtitle("Distribution of Price per Liter Across Categories (Excluding Top 5%)")

Q2B

# Scatter plot
  q2b.data <- aggregate(cbind(sale.volume) ~ category + subcategory, data = sales, FUN = sum)
  colnames(q2b.data)[colnames(q2b.data) == "sale.volume"] <- "sale.volume.agg"
  q2b.data <- merge(x = sales[, c("category", "subcategory", "item.name", "price.per.l", "sale.volume")],
                    y = q2b.data,
                    by = c("category", "subcategory"))
  
  q2b.data$price.per.l.w <- q2b.data$price.per.l * q2b.data$sale.volume / q2b.data$sale.volume.agg
  q2b.data <- aggregate(cbind(price.per.l.w, sale.volume) ~ category + subcategory, data = q2b.data, FUN = sum)
  
  # Create labels to be used by ggrepel
  labels <- c("80 Proof Vodkas", "Canadian Whiskies", "Spiced Rum", "Vodkas", "Miscellaneous Whiskies", "Single Barrel Bourbon Whiskies", "Single Malt Scotch", "Japanese Whisky")
  
  # Create empty labels variable
  q2b.data$labels <- ifelse(q2b.data$subcategory %in% labels, paste(q2b.data$subcategory, ", ", round(q2b.data$price.per.l.w, 1), " $/l"), NA)
  
  # Scatter plot
  q2b_plot <- ggplot(data = q2b.data, aes(x = price.per.l.w, y = sale.volume / 1000, label = labels)) +
    geom_point(aes(color = category), fill = "white", shape = 21, stroke = 2, size = 4) +
    scale_color_manual(values = color) +
    geom_text_repel(
      force = 10,
      direction = "both",
      nudge_x = 1,
      nudge_y = 1,
      point.padding = 1.5,
      box.padding = 1.5,
      segment.size = 0.5,
      size = 4,
      max.overlaps = 50
    ) +
    scale_x_continuous(breaks = seq(0, round(max(q2b.data$price.per.l.w * 1.1), 0), 2)) +
    scale_y_continuous(breaks = seq(0, round(max(q2b.data$sale.volume * 1.1 / 1000), 0), 250)) +
    labs(
      x = "Average weighted price per liter, $",
      y = "Liters sold, thousands",
      title = "Liquor subcategories",
      subtitle = "price vs quantity"
    ) +
    theme_bw() +
    theme(
      plot.title = element_text(
        hjust = 0.5,
        face = "bold",
        size = 20,
        color = "red",
        margin = margin(b = 5)
      ),
      panel.grid.minor.y = element_blank(),
      panel.grid.major = element_blank(),  # Remove major grid lines
      axis.text.y = element_text(size = 10, face = "bold"),
      axis.title.y = element_text(
        size = 16,
        face = "bold",
        color = "red",
        margin = margin(r = 10, l = 10)
      ),
      axis.text.x = element_text(size = 10, face = "bold"),
      axis.title.x = element_text(
        size = 16,
        face = "bold",
        color = "red",
        margin = margin(t = 10, b = 10)
      ),
      legend.position = c(0.9, 0.8),
      legend.direction = "vertical",
      legend.title = element_blank(),
      legend.text = element_text(size = 10, face = "bold"),
      legend.background = element_rect(fill = "transparent")
    )
  
  print(q2b_plot)
## Warning: Removed 62 rows containing missing values (`geom_text_repel()`).

Q3a

  q3a <- ggplot()
  
  
library(psych)
## Warning: package 'psych' was built under R version 4.2.3
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
## 
##     alpha, rescale
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe(sales[c(1, 2, 9)])
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##              vars       n   mean     sd median trimmed   mad  min    max
## date            1 2184477    NaN     NA     NA     NaN    NA  Inf   -Inf
## category*       2 2184477   7.38   2.16   8.00    7.67  1.48 1.00     10
## sale.dollars    3 2184477 130.16 405.47  70.56   84.54 69.03 1.34 106326
##                 range  skew kurtosis   se
## date             -Inf    NA       NA   NA
## category*         9.0 -1.09     0.36 0.00
## sale.dollars 106324.7 39.55  4816.76 0.27
  q3a.data <- aggregate(cbind(sale.dollars) ~ category + months(date), data = sales, FUN = sum)
  
  q3a.data$month <- factor(q3a.data$`months(date)`,
                           levels = month.name,
                           labels = c("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                      "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"),
                           ordered = TRUE
  )
  
  q3a.data$`months(date)` <- NULL
  
  q3a.data <- merge(q3a.data, aggregate(x = cbind(sale.dollars) ~ category, data = q3a.data, FUN = sum),
                    by = c("category"),
                    all = TRUE
  )
  
  names(q3a.data) <- c("category", "sales.monthly", "month", "sales.annual")
  q3a.data$sales.monthly.share <- round(x = 100 * q3a.data$sales.monthly/q3a.data$sales.annual, digits =  2)
  
  q3a_plot <- ggplot(q3a.data, aes(x = month, y = sales.monthly.share, group = category, color = category, fill = category)) +
    geom_line(size = 1.25) +
    geom_point(size = 4, shape = 21, color = "black", stroke = 1.5) +
    scale_color_manual(values = color) + 
    scale_fill_manual(values = color) + 
    scale_y_continuous(breaks = breaks_extended(10)) +
    labs(x = "Month", y = "Share of total sales", title = "% of Total Sales per month") +
    theme_bw() + 
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "red", margin = margin(b = 10)),
      axis.text.y  = element_text(size = 10, face = "bold"),
      axis.title.y  = element_blank(),
      axis.text.x  = element_text(size = 10, face = "bold"),
      axis.title.x  = element_text(size = 16, face = "bold", color = "red", margin = margin(t = 10, b = 10)),
      legend.position = "right",
      legend.direction = "vertical",
      legend.title = element_blank(),
      legend.text = element_text(size = 10, face = "bold"),
      panel.grid = element_blank()  # Remove background lines
    )
  
  q3a_plot

Q3b chart

  q3b <- ggplot()
  
  color <- c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854", "#ffd92f", "#e5c494", "#b3b3b3", "Red","blue")
  
  # Aggregate sales data
  q3b_data <- aggregate(cbind(sale.dollars) ~ category + weekdays(date, abbreviate = TRUE), sales, sum)
  
  # Factorize weekdays
  q3b_data$weekday <- factor(q3b_data[, 2], levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"), ordered = TRUE)
  q3b_data[, 2] <- NULL
  
  # Merge with total sales data
  q3b_data <- merge(q3b_data, aggregate(x = cbind(sale.dollars) ~ category, data = q3b_data, FUN = sum),
                    by = c("category"),
                    all = TRUE
  )
  
  # Rename columns
  names(q3b_data) <- c("category", "sales.weekday", "weekday", "sales.annual")
  
  # Calculate sales share
  q3b_data$sales.weekday.share <- round(x = 100 * q3b_data$sales.weekday/q3b_data$sales.annual, digits =  2)
  
  # Plot
  q3b_plot <- ggplot(q3b_data, aes(x = weekday, y = sales.weekday.share, group = category, color = category, fill = category)) +
    geom_line(size = 1.25) +
    geom_point(size = 4, shape = 21, color = "black", stroke = 1.5) +
    scale_color_manual(values = color) +
    scale_fill_manual(values = color) +
    labs(x = "Weekday", y = "Share of sales", title = "% of Total Sales per weekday") +
    scale_y_continuous(breaks = breaks_extended(10)) +
    theme_bw() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "#912600", margin = margin(b = 10)),
      axis.text.y  = element_text(size = 10, face = "bold"),
      axis.title.y  = element_blank(),
      axis.text.x  = element_text(size = 10, face = "bold"),
      axis.title.x  = element_text(size = 16, face = "bold", color = "#912600", margin = margin(t = 10, b = 10)),
      legend.position = "none",
      panel.grid = element_blank()  # Remove background lines
    )
  
  q3b_plot

  q3b <- q3b_plot

Q4 chart

  q4 <- ggplot()
  
  # Aggregate category sales by dollars, liters, and bottles
  q4.data <- aggregate(cbind(sale.dollars, sale.volume, sale.bottles) ~ category, data = sales, FUN = sum)
  
  # Add percentage of total (optional)
  q4.data$share.dollars <- round(100 * q4.data$sale.dollars / sum(q4.data$sale.dollars), digits = 2)
  q4.data$share.volume  <- round(100 * q4.data$sale.volume / sum(q4.data$sale.volume), digits = 2)
  q4.data$share.bottles <- round(100 * q4.data$sale.bottles / sum(q4.data$sale.bottles), digits = 2)
  
  # Calculate ranks
  q4.data$rank.dollars  <- rank(-q4.data$share.dollars)
  q4.data$rank.volume   <- rank(-q4.data$share.volume)
  q4.data$rank.bottles  <- rank(-q4.data$share.bottles)
  
  # Set up horizontal spacing coordinates
  x0 <- 0
  x1 <- 1
  x2 <- 2
  x3 <- 3
  x4 <- 4
  
    q4_plot <- ggplot(q4.data) +
    geom_segment(aes(x = x1, xend = x2, y = rank.dollars, yend = rank.volume, color = category), size = 1.25, show.legend = FALSE) +
    geom_segment(aes(x = x2, xend = x3, y = rank.volume, yend = rank.bottles, color = category), size = 1.25, show.legend = FALSE) +
    geom_vline(xintercept = x1, linetype = "dashed", size = 0.5) +
    geom_vline(xintercept = x2, linetype = "dashed", size = 0.5) +
    geom_vline(xintercept = x3, linetype = "dashed", size = 0.5) +
    geom_text(aes(label = category, y = rank.dollars, x = x1 - 0.1), hjust = 1, size = 5.5, fontface = "bold") +
    geom_text(aes(label = category, y = rank.bottles, x = x3 + 0.1), hjust = 0, size = 5.5, fontface = "bold") +
    geom_point(aes(x = x1, y = rank.dollars, color = category), size = 10) +
    geom_text(aes(label = rank.dollars, y = rank.dollars, x = x1), color = "white", size = 6, fontface = "bold") +
    geom_point(aes(x = x2, y = rank.volume, color = category), size = 10) +
    geom_text(aes(label = rank.volume, y = rank.volume, x = x2), color = "white", size = 6, fontface = "bold") +
    geom_point(aes(x = x3, y = rank.bottles, color = category), size = 10) +
    geom_text(aes(label = rank.bottles, y = rank.bottles, x = x3), color = "white", size = 6, fontface = "bold") +
    scale_color_manual(values = color) +
    scale_y_reverse(breaks = c(min(q4.data$rank.dollars):max(q4.data$rank.dollars))) +
    scale_x_continuous(breaks = c(x0, x1, x2, x3, x4), labels = c("", "Sales, $", "Sales, liters", "Sales, bottles", "")) +
    coord_cartesian(xlim = c(x0, x4)) +
    labs(title = "Liquor Category Rankings") +
    theme_bw() +
    theme(
      plot.title = element_text(hjust = 0.5, face = "bold", size = 20, color = "#912600", margin = margin(b = 30)),
      axis.text.y = element_blank(),
      axis.title.y = element_blank(),
      axis.ticks.y = element_blank(),
      axis.text.x = element_text(size = 16, face = "bold", color = "#912600", margin = margin(t = 10, b = 10)),
      axis.title.x = element_blank(),
      axis.ticks.x = element_blank(),
      panel.border = element_blank(),
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      legend.position = "none"
    )
  
  q4_plot

  q4 <- q4_plot