Introduction

The dataset used in this analysis is called “Grocery Sales Database”. It was compiled in the interest of giving aspiring data analysts/scientists practice analyzing sales transactions, customer demographics, product details, employee records, and geographical information.

Dataset

This dataset is comprised of a series of CSV files: Categories, Cities, Countries, Customers, Employees, Products, Sales. These files are tied together by variables that allow for the seamless combination and manipulation of the data. This has allowed me to analyze and interpret the story that each transaction tells.

Findings

While examining this dataset I was able to answer several questions I had when I chose it. Namely I was able to determine if certain products were more popular in certain times of the year, certain days of the week, and times of the day. I was also curious as to which city in the United States had the most sales overall and if there were any standout products that caused it to be the most grossing city. The answers to these findings can be found below in the prepared visualizations.

##Visualization 1 This visualization depicts sales by category vs month of the year. This visualization was created with the intention of finding any standout products that would warrant further investigation. The visualization did not show significant difference between product so further analysis with other visualizations will be explored.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(scales)
library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.2.3

library(RColorBrewer)

Grocery_Data <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\products.csv")
Grocery_sales <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\sales.csv")


ProductDF <- data.frame(Grocery_Data)
SalesDF <- data.frame(Grocery_sales)


Grocery_mergedDF <- merge(ProductDF, SalesDF, by = "ProductID")

Grocery_mergedDF$Month <- format(as.Date(Grocery_mergedDF$SalesDate), "%m")


monthly_sales_df <- Grocery_mergedDF %>%
  group_by(Month, CategoryID) %>%
  summarise(total_sales = sum(Quantity)) %>%
  ungroup()

## `summarise()` has grouped output by 'Month'. You can override using the
## `.groups` argument.

monthly_sales_df$CategoryID <- as.factor(monthly_sales_df$CategoryID)

ggplot(monthly_sales_df, aes(x = Month, y = total_sales, fill = CategoryID)) +
  geom_bar(stat = "identity", position = position_stack(reverse = TRUE)) +
  coord_flip() +  
  labs(title = "Monthly Sales Quantity by Category", 
       x = "Month", y = "Sales Quantity") +
  theme_hc() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_fill_brewer(palette = "Paired", guide = guide_legend(reverse = TRUE)) + 
  geom_text(aes(label = total_sales), position = position_fill(vjust = 0.6))

## Visualization 2 This Visualization shows the amount of sales per hour.This information could inform stores on when to expect a higher volume of customers. There are peaks at 8am, 4pm and 8pm with the highest peak at 8am.

library(dplyr)
library(ggplot2)
library(scales)
library(ggthemes)
library(RColorBrewer)
library(lubridate)

## Warning: package 'lubridate' was built under R version 4.2.3

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Grocery_Data <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\products.csv")
Grocery_sales <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\sales.csv")


ProductDF <- data.frame(Grocery_Data)
SalesDF <- data.frame(Grocery_sales)

 Grocery_mergedDF <- merge(ProductDF, SalesDF, by = "ProductID")
 Grocery_mergedDF$Hour <- format(as.Date(Grocery_mergedDF$SalesDate), "%H")
 Grocery_mergedDF$SalesDate <- ymd_hms(Grocery_mergedDF$SalesDate)
 
 sales_by_hour_df <- Grocery_mergedDF %>%
  mutate(Hour = format(as.POSIXct(SalesDate), "%H")) %>%  
  group_by(Hour) %>%  
  summarise(total_sales = sum(Quantity, na.rm = TRUE)) %>%  
  ungroup() 

 
 hours_df <- SalesDF %>%
       select(SalesDate) %>%
       mutate(hour24 = hour(mdy_hms(SalesDate))) %>%
       group_by(hour24) %>%
       summarise(n = length(SalesDate), .groups = 'keep') %>%
       data.frame()

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `hour24 = hour(mdy_hms(SalesDate))`.
## Caused by warning:
## ! All formats failed to parse. No formats found.

 x_axis_labels <- 1:24
 
 hi_lo <- sales_by_hour_df %>%
   filter(total_sales==min(total_sales) | total_sales ==max(total_sales)) %>%
   data.frame()
 
 ggplot(sales_by_hour_df, aes(x = as.numeric(Hour), y = total_sales)) +  
  geom_line(color = 'black', size = 1) +  
  geom_point(shape = 21, size = 4, color = 'red', fill = 'white') +  
  labs(x = "Hour", y = "Sales", title = "Sales per Hour") +
  scale_y_continuous(labels = comma, limits = c(3610000, 3640000)) +  
  theme_light() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_x_continuous(labels = 1:24, breaks = 1:24, minor_breaks = NULL)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 1 row containing missing values (`geom_line()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

Visualization 3

This visualization depicts the total amount of sales per month across cities located in the United States. Only the top 10 highest grossing cities were chosen with Tucson as the city with the highest total sales.

library(dplyr)
library(ggplot2)
library(RColorBrewer)

Grocery_sales <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\sales.csv")

countriesData <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\countries.csv")
customerData  <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\customers.csv")
 citiesData <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\cities.csv")
 
 countrydata <- merge(citiesData, countriesData, by = "CountryID")
 countrydata <- merge(countrydata, customerData, by = "CityID")
 countrydata <- merge(countrydata, Grocery_sales, by = "CustomerID")
 countrydata$Month <- format(as.Date(countrydata$SalesDate), "%m")
 
 monthly_sales_df <- countrydata %>%
      group_by(CityName, Month) %>%
      summarise(total_sales = n_distinct(SalesID)) %>%
      ungroup()

## `summarise()` has grouped output by 'CityName'. You can override using the
## `.groups` argument.

 total_sales_per_city <- monthly_sales_df %>%
     group_by(CityName) %>%
     summarise(total_sales_all_months = sum(total_sales, na.rm = TRUE)) %>%
     arrange(desc(total_sales_all_months))
 
 top_10_cities <- total_sales_per_city %>%
     slice_head(n = 10) %>%
     pull(CityName)
 
 top_10_citiesDf <- monthly_sales_df %>%
         filter(CityName %in% top_10_cities)
 
 top_10_citiesDf <- top_10_citiesDf[!is.na(top_10_citiesDf$Month), ]
 
 total_sales_per_city <- monthly_sales_df %>%
       group_by(CityName) %>%
       summarise(total_sales_all_months = sum(total_sales, na.rm = TRUE)) %>%
       arrange(desc(total_sales_all_months))
 
 ggplot(data = top_10_citiesDf, aes(x= "", y = total_sales, fill = Month)) +
     geom_bar(stat = "identity", position = "fill") +
     coord_polar(theta = "y", start = 0) +
     labs(fill = "Month", x = NULL, Y = NULL, title = "Monthly Sales for Top 10 Cities")+
     theme_light()+
     theme(plot.title = element_text(hjust = 0.5),
           axis.text = element_blank(),
           axis.ticks = element_blank(),
           panel.grid = element_blank())+
     facet_wrap(~CityName, ncol = 4, nrow = 3)+
     scale_fill_brewer(palette = "Blues")+
     geom_text(aes(x=1.7, label= paste0(total_sales)),
                size=2,
                position = position_fill(vjust = 0.5))

Visualization 4

This visualization delves deeper into what made categories netted the highest sales in Tucson. Given that Tucson was the highest grossing city, this information may be indicative on where other locations can improve. The top category in this visualization was confections with just over 13% of the total. One surprising revelation found here is that dairy is the lowest category in total sales. This may indicate that people buy more than one item in the confections category when going to the store but will not buy multiple dairy products.

This visualization is an html plot and will not load properly into Rmarkdown html provided in coments

library(dplyr) library(ggplot2) library(RColorBrewer) library(scales) Grocery_Data <- read.csv(“C:\Users\Joe\OneDrive\Desktop\ST473data\CSV_FIles\grocery data\products.csv”) Grocery_sales <- read.csv(“C:\Users\Joe\OneDrive\Desktop\ST473data\CSV_FIles\grocery data\sales.csv”)

ProductDF <- data.frame(Grocery_Data)

SalesDF <- data.frame(Grocery_sales)

Grocery_mergedDF <- merge(ProductDF, SalesDF, by = “ProductID”)

Grocery_mergedDF$Month <- format(as.Date(Grocery_mergedDF$SalesDate), “%m”)

categoryData <- read.csv(“C:\Users\Joe\OneDrive\Desktop\ST473data\CSV_FIles\grocery data\categories.csv”)

categorydata <- merge(categoryData, Grocery_mergedDF, by = “CategoryID”)

categorydata <- merge(categorydata, customerData[, c(“CustomerID”, “CityID”)], by = “CustomerID”, all.x = TRUE)

categorydata <- merge(categorydata, citiesData[, c(“CityID”, “CityName”)], by = “CityID”, all.x = TRUE)

tucsonSales <- categorydata %>% filter(CityName == “Tucson”) category_sales_tucson <- tucsonSales %>% group_by(CategoryName) %>% summarise(total_quantity_sold = sum(Quantity, na.rm = TRUE)) %>% arrange(desc(total_quantity_sold)) top_10_categories_tucson <- category_sales_tucson %>% slice_head(n = 10) plot_ly(top_10_categories_tucson, labels = ~CategoryName, values = ~total_quantity_sold) %>% add_pie(hole = 0.6) %>% layout(title = “Sales by Category for Tucson”) %>% layout(annotations = list(text=paste0(“Total Sale Count: /n”, scales::comma(sum(top_10_categories_tucson$total_quantity_sold))), “showarrow”=F))

Visualization 5

This final visualizaion is a heatmap plotting day of the week vs category. Like the previous plot it targets only the top 10 categories with the intention of seeing if total sales follows the same trend as the sales in the top city of Tucson.Confections has the highest count out of all categories. A slight increase in sales can be seen in the beginnign of the week between Sunday and Tuesday.

library(dplyr)
library(plotly)

## Warning: package 'plotly' was built under R version 4.2.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

library(ggplot2)
library(scales)
library(lubridate)

Grocery_Data <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\products.csv")
 Grocery_sales <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\sales.csv")
 customerData <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\customers.csv")
citiesData <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\cities.csv")
 categoryData <- read.csv("C:\\Users\\Joe\\OneDrive\\Desktop\\ST473data\\CSV_FIles\\grocery data\\categories.csv")
 
  ProductDF <- data.frame(Grocery_Data)

  SalesDF <- data.frame(Grocery_sales)
  
  ProductDf <- merge(ProductDF, categoryData, by = "CategoryID")
 
  Grocery_mergedDF <- merge(ProductDF, SalesDF, by = "ProductID")
 
  Grocery_mergedDF$Month <- format(as.Date(Grocery_mergedDF$SalesDate), "%m")

 
categorydata <- merge(categoryData, Grocery_mergedDF, by = "CategoryID")

categorydata <- merge(categorydata, customerData[, c("CustomerID", "CityID")], by = "CustomerID", all.x = TRUE)

categorydata <- merge(categorydata, citiesData[, c("CityID", "CityName")], by = "CityID", all.x = TRUE)

categorydata$SalesDate = ymd_hms(categorydata$SalesDate)

categorydata$day_of_week <- wday(categorydata$SalesDate, label = TRUE)

category_sales_by_day <- categorydata %>%
  group_by(CategoryName, day_of_week) %>%
  summarise(total_sales = sum(Quantity, na.rm = TRUE)) %>%
  ungroup()

## `summarise()` has grouped output by 'CategoryName'. You can override using the
## `.groups` argument.

category_sales_quantity <- categorydata %>%
       group_by(CategoryName) %>%
       summarise(total_quantity_sold = sum(Quantity, na.rm = TRUE)) %>%
       ungroup()

top_10_categories <- category_sales_quantity %>%
       arrange(desc(total_quantity_sold)) %>%
       slice_head(n = 10)

top_categories_data <- category_sales_by_day %>%
     filter(CategoryName %in% top_10_categories$CategoryName)

top_categories_data <- top_categories_data %>%
       filter(!is.na(day_of_week))

ggplot(top_categories_data, aes(x = CategoryName, y = day_of_week, fill = total_sales)) +
  geom_tile(color = 'black') +
  geom_text(aes(label = comma(total_sales))) +
  coord_equal(ratio = 1) +
  labs(title = "Heatmap: Category sales by day of the week", 
       x = "Category",
       y = "Day of Week", 
       fill = "Total Sales") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_y_discrete(limits = rev(levels(top_categories_data$day_of_week))) +
  scale_fill_continuous(low = "white", high = "red") +
  guides(fill = guide_legend(reverse = TRUE, override.aes = list(colour = "black")))

Assignment 1 IS*460

Joseph Carnovale

2025-02-27