library(DataExplorer)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(dplyr)
library(readr)
library(car)

## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some

Exploratory Data Analysis (EDA)

First lets look at what the dataset looks like as far as column count and their type and quick snaps of observations.

introduce(grocery)

##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  990      16               11                  5                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           990              15840       413808

str(grocery)

## 'data.frame':    990 obs. of  16 variables:
##  $ Product_ID             : chr  "29-205-1132" "40-681-9981" "06-955-3428" "71-594-6552" ...
##  $ Product_Name           : chr  "Sushi Rice" "Arabica Coffee" "Black Rice" "Long Grain Rice" ...
##  $ Catagory               : chr  "Grains & Pulses" "Beverages" "Grains & Pulses" "Grains & Pulses" ...
##  $ Supplier_ID            : chr  "38-037-1699" "54-470-2479" "54-031-2945" "63-492-7603" ...
##  $ Supplier_Name          : chr  "Jaxnation" "Feedmix" "Vinder" "Brightbean" ...
##  $ Stock_Quantity         : int  22 45 30 12 37 55 96 44 91 43 ...
##  $ Reorder_Level          : int  72 77 38 59 30 33 52 90 84 10 ...
##  $ Reorder_Quantity       : int  70 2 83 62 74 14 16 17 11 15 ...
##  $ Unit_Price             : chr  "$4.50 " "$20.00 " "$6.00 " "$1.50 " ...
##  $ Date_Received          : chr  "8/16/2024" "11/1/2024" "8/3/2024" "12/8/2024" ...
##  $ Last_Order_Date        : chr  "6/29/2024" "5/29/2024" "6/10/2024" "2/19/2025" ...
##  $ Expiration_Date        : chr  "9/19/2024" "5/8/2024" "9/22/2024" "4/17/2024" ...
##  $ Warehouse_Location     : chr  "48 Del Sol Trail" "36 3rd Place" "3296 Walton Court" "3 Westerfield Crossing" ...
##  $ Sales_Volume           : int  32 85 31 95 62 34 67 21 56 27 ...
##  $ Inventory_Turnover_Rate: int  19 1 34 99 25 62 13 91 90 69 ...
##  $ Status                 : chr  "Discontinued" "Discontinued" "Backordered" "Active" ...

After taking a looking at the variables and their classes now lets look at some basic statistics of the dataset.

summary(grocery)

##   Product_ID        Product_Name         Catagory         Supplier_ID       
##  Length:990         Length:990         Length:990         Length:990        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Supplier_Name      Stock_Quantity   Reorder_Level    Reorder_Quantity
##  Length:990         Min.   : 10.00   Min.   :  1.00   Min.   :  1.00  
##  Class :character   1st Qu.: 33.00   1st Qu.: 25.25   1st Qu.: 25.00  
##  Mode  :character   Median : 56.00   Median : 53.00   Median : 54.00  
##                     Mean   : 55.61   Mean   : 51.22   Mean   : 51.91  
##                     3rd Qu.: 79.00   3rd Qu.: 77.00   3rd Qu.: 77.00  
##                     Max.   :100.00   Max.   :100.00   Max.   :100.00  
##   Unit_Price        Date_Received      Last_Order_Date    Expiration_Date   
##  Length:990         Length:990         Length:990         Length:990        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Warehouse_Location  Sales_Volume    Inventory_Turnover_Rate    Status         
##  Length:990         Min.   : 20.00   Min.   :  1.00          Length:990        
##  Class :character   1st Qu.: 39.00   1st Qu.: 25.00          Class :character  
##  Mode  :character   Median : 58.00   Median : 50.00          Mode  :character  
##                     Mean   : 58.93   Mean   : 50.15                            
##                     3rd Qu.: 78.00   3rd Qu.: 74.75                            
##                     Max.   :100.00   Max.   :100.00

plot_intro(grocery)

plot_bar(grocery)

## 9 columns ignored with more than 50 categories.
## Product_ID: 990 categories
## Product_Name: 121 categories
## Supplier_ID: 990 categories
## Supplier_Name: 350 categories
## Unit_Price: 112 categories
## Date_Received: 351 categories
## Last_Order_Date: 332 categories
## Expiration_Date: 341 categories
## Warehouse_Location: 990 categories

plot_histogram(grocery)

plot_correlation(na.omit(grocery), maxcat = 5L)

## 10 features with more than 5 categories ignored!
## Product_ID: 990 categories
## Product_Name: 121 categories
## Catagory: 8 categories
## Supplier_ID: 990 categories
## Supplier_Name: 350 categories
## Unit_Price: 112 categories
## Date_Received: 351 categories
## Last_Order_Date: 332 categories
## Expiration_Date: 341 categories
## Warehouse_Location: 990 categories

Data Wrangling & Cleaning

Most of the categorical variables classified as character, for us to do analysis we have to convert these variables into quantifiable format. In addition, unit price is characther as well, therefore I will have to extract values from their typed format that has dollar sign next to it.

grocery_edited <- grocery |> 
  mutate(Unit_Price = parse_number(Unit_Price),
         Catagory = as.factor(Catagory),
         Supplier_Name = as.factor(Supplier_Name),
         Status = as.factor(Status),
        Date_Received = as.Date(Date_Received, format = "%d/%M/%Y"),
    Last_Order_Date = as.Date(Last_Order_Date, format = "%d/%M/%Y"),
    Expiration_Date = as.Date(Expiration_Date, format = "%d/%M/%Y")
    )
head(grocery_edited)

##    Product_ID      Product_Name            Catagory Supplier_ID Supplier_Name
## 1 29-205-1132        Sushi Rice     Grains & Pulses 38-037-1699     Jaxnation
## 2 40-681-9981    Arabica Coffee           Beverages 54-470-2479       Feedmix
## 3 06-955-3428        Black Rice     Grains & Pulses 54-031-2945        Vinder
## 4 71-594-6552   Long Grain Rice     Grains & Pulses 63-492-7603    Brightbean
## 5 57-437-1828              Plum Fruits & Vegetables 54-226-4308    Topicstorm
## 6 21-120-6238 All-Purpose Flour     Grains & Pulses 86-292-4587        Dabjam
##   Stock_Quantity Reorder_Level Reorder_Quantity Unit_Price Date_Received
## 1             22            72               70       4.50    2024-12-08
## 2             45            77                2      20.00    2024-12-11
## 3             30            38               83       6.00    2024-12-08
## 4             12            59               62       1.50    2024-12-12
## 5             37            30               74       4.00    2024-12-07
## 6             55            33               14       1.75    2024-12-12
##   Last_Order_Date Expiration_Date     Warehouse_Location Sales_Volume
## 1      2024-12-06      2024-12-09       48 Del Sol Trail           32
## 2      2024-12-05      2024-12-05           36 3rd Place           85
## 3      2024-12-06      2024-12-09      3296 Walton Court           31
## 4      2025-12-02      2024-12-04 3 Westerfield Crossing           95
## 5      2024-12-10      2024-12-10   15068 Scoville Court           62
## 6      2024-12-05      2024-12-09     050 Mcbride Avenue           34
##   Inventory_Turnover_Rate       Status
## 1                      19 Discontinued
## 2                       1 Discontinued
## 3                      34  Backordered
## 4                      99       Active
## 5                      25  Backordered
## 6                      62 Discontinued

Asking the Questions

Once all the variables converted to desired format, as next step I will look into which variables could have tell me something about the Sales volume.

First we can check to see which Category has the most Sales volume in the store.

plot1 <- grocery_edited |> 
  group_by(Catagory) |> 
  summarise(total_sales_vol = sum(Sales_Volume)) 

ggplot(plot1, aes(x = reorder(Catagory, -total_sales_vol), y = total_sales_vol, fill = Catagory)) +
  geom_col() +
  geom_text(aes(label = total_sales_vol), vjust = -0.5) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "Catagory")

Which producty category has the most stock? lets find out.

 plot2 <- grocery_edited |> 
  group_by(Catagory) |> 
  summarise(total_stock = sum(Stock_Quantity))

ggplot(plot2, aes(x = reorder(Catagory, -total_stock), y = total_stock, fill = Catagory)) +
  geom_col() +
  geom_text(aes(label = total_stock), vjust = -0.5) + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(x = "Catagory")

Now lets see them together to see corralations between Stock amount vs Sales volume. First step, I need to create a new dataset where it combines the Category, Stock Quantitiy and Sales volume into one dataframe.

plot3 <- grocery_edited |> 
  group_by(Catagory) |> 
  select(Catagory, Stock_Quantity, Sales_Volume) |> 
  mutate(total_stock = sum(Stock_Quantity),
         total_sales_vol = sum(Sales_Volume)) 
head(plot3)

## # A tibble: 6 × 5
## # Groups:   Catagory [3]
##   Catagory            Stock_Quantity Sales_Volume total_stock total_sales_vol
##   <fct>                        <int>        <int>       <int>           <int>
## 1 Grains & Pulses                 22           32        8240            8886
## 2 Beverages                       45           85        3822            4533
## 3 Grains & Pulses                 30           31        8240            8886
## 4 Grains & Pulses                 12           95        8240            8886
## 5 Fruits & Vegetables             37           62       18489           19361
## 6 Grains & Pulses                 55           34        8240            8886

For me to compare Total stock vs Total sales by category, I will need to pivot the table.

plot3_long <- plot3  |> 
  pivot_longer(
    cols = c(total_stock, total_sales_vol), 
    names_to = "Metric",                    
    values_to = "Value"                     
  )

head(plot3_long)

## # A tibble: 6 × 5
## # Groups:   Catagory [2]
##   Catagory        Stock_Quantity Sales_Volume Metric          Value
##   <fct>                    <int>        <int> <chr>           <int>
## 1 Grains & Pulses             22           32 total_stock      8240
## 2 Grains & Pulses             22           32 total_sales_vol  8886
## 3 Beverages                   45           85 total_stock      3822
## 4 Beverages                   45           85 total_sales_vol  4533
## 5 Grains & Pulses             30           31 total_stock      8240
## 6 Grains & Pulses             30           31 total_sales_vol  8886

Next I will plot my new dataset where it shows total stock volume compare to total sales volume by category.

ggplot(plot3_long, aes(x = Catagory, y = Value, color = Metric, group = Metric)) +
  geom_line(linewidth = 1) + 
  geom_point(size = 3) +     
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
  labs(
    title = "Stock and Sales Volume by Category",
    y = "Amount"
  ) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Conclusion:

As it shows in the line graph above, Stock volume is highly corralted with Sales volume regardless of the category type of the product.

Grocery Store inventory and Sales Analysis

Emrah Akbas

2025-12-03

Exploratory Data Analysis (EDA)

Data Wrangling & Cleaning

Asking the Questions

Conclusion: