library(DataExplorer)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(readr)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
First lets look at what the dataset looks like as far as column count and their type and quick snaps of observations.
introduce(grocery)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 990 16 11 5 0
## total_missing_values complete_rows total_observations memory_usage
## 1 0 990 15840 413808
str(grocery)
## 'data.frame': 990 obs. of 16 variables:
## $ Product_ID : chr "29-205-1132" "40-681-9981" "06-955-3428" "71-594-6552" ...
## $ Product_Name : chr "Sushi Rice" "Arabica Coffee" "Black Rice" "Long Grain Rice" ...
## $ Catagory : chr "Grains & Pulses" "Beverages" "Grains & Pulses" "Grains & Pulses" ...
## $ Supplier_ID : chr "38-037-1699" "54-470-2479" "54-031-2945" "63-492-7603" ...
## $ Supplier_Name : chr "Jaxnation" "Feedmix" "Vinder" "Brightbean" ...
## $ Stock_Quantity : int 22 45 30 12 37 55 96 44 91 43 ...
## $ Reorder_Level : int 72 77 38 59 30 33 52 90 84 10 ...
## $ Reorder_Quantity : int 70 2 83 62 74 14 16 17 11 15 ...
## $ Unit_Price : chr "$4.50 " "$20.00 " "$6.00 " "$1.50 " ...
## $ Date_Received : chr "8/16/2024" "11/1/2024" "8/3/2024" "12/8/2024" ...
## $ Last_Order_Date : chr "6/29/2024" "5/29/2024" "6/10/2024" "2/19/2025" ...
## $ Expiration_Date : chr "9/19/2024" "5/8/2024" "9/22/2024" "4/17/2024" ...
## $ Warehouse_Location : chr "48 Del Sol Trail" "36 3rd Place" "3296 Walton Court" "3 Westerfield Crossing" ...
## $ Sales_Volume : int 32 85 31 95 62 34 67 21 56 27 ...
## $ Inventory_Turnover_Rate: int 19 1 34 99 25 62 13 91 90 69 ...
## $ Status : chr "Discontinued" "Discontinued" "Backordered" "Active" ...
After taking a looking at the variables and their classes now lets look at some basic statistics of the dataset.
summary(grocery)
## Product_ID Product_Name Catagory Supplier_ID
## Length:990 Length:990 Length:990 Length:990
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Supplier_Name Stock_Quantity Reorder_Level Reorder_Quantity
## Length:990 Min. : 10.00 Min. : 1.00 Min. : 1.00
## Class :character 1st Qu.: 33.00 1st Qu.: 25.25 1st Qu.: 25.00
## Mode :character Median : 56.00 Median : 53.00 Median : 54.00
## Mean : 55.61 Mean : 51.22 Mean : 51.91
## 3rd Qu.: 79.00 3rd Qu.: 77.00 3rd Qu.: 77.00
## Max. :100.00 Max. :100.00 Max. :100.00
## Unit_Price Date_Received Last_Order_Date Expiration_Date
## Length:990 Length:990 Length:990 Length:990
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Warehouse_Location Sales_Volume Inventory_Turnover_Rate Status
## Length:990 Min. : 20.00 Min. : 1.00 Length:990
## Class :character 1st Qu.: 39.00 1st Qu.: 25.00 Class :character
## Mode :character Median : 58.00 Median : 50.00 Mode :character
## Mean : 58.93 Mean : 50.15
## 3rd Qu.: 78.00 3rd Qu.: 74.75
## Max. :100.00 Max. :100.00
plot_intro(grocery)
plot_bar(grocery)
## 9 columns ignored with more than 50 categories.
## Product_ID: 990 categories
## Product_Name: 121 categories
## Supplier_ID: 990 categories
## Supplier_Name: 350 categories
## Unit_Price: 112 categories
## Date_Received: 351 categories
## Last_Order_Date: 332 categories
## Expiration_Date: 341 categories
## Warehouse_Location: 990 categories
plot_histogram(grocery)
plot_correlation(na.omit(grocery), maxcat = 5L)
## 10 features with more than 5 categories ignored!
## Product_ID: 990 categories
## Product_Name: 121 categories
## Catagory: 8 categories
## Supplier_ID: 990 categories
## Supplier_Name: 350 categories
## Unit_Price: 112 categories
## Date_Received: 351 categories
## Last_Order_Date: 332 categories
## Expiration_Date: 341 categories
## Warehouse_Location: 990 categories
Most of the categorical variables classified as
character, for us to do analysis we have to convert these variables into quantifiable format. In addition, unit price is characther as well, therefore I will have to extract values from their typed format that has dollar sign next to it.
grocery_edited <- grocery |>
mutate(Unit_Price = parse_number(Unit_Price),
Catagory = as.factor(Catagory),
Supplier_Name = as.factor(Supplier_Name),
Status = as.factor(Status),
Date_Received = as.Date(Date_Received, format = "%d/%M/%Y"),
Last_Order_Date = as.Date(Last_Order_Date, format = "%d/%M/%Y"),
Expiration_Date = as.Date(Expiration_Date, format = "%d/%M/%Y")
)
head(grocery_edited)
## Product_ID Product_Name Catagory Supplier_ID Supplier_Name
## 1 29-205-1132 Sushi Rice Grains & Pulses 38-037-1699 Jaxnation
## 2 40-681-9981 Arabica Coffee Beverages 54-470-2479 Feedmix
## 3 06-955-3428 Black Rice Grains & Pulses 54-031-2945 Vinder
## 4 71-594-6552 Long Grain Rice Grains & Pulses 63-492-7603 Brightbean
## 5 57-437-1828 Plum Fruits & Vegetables 54-226-4308 Topicstorm
## 6 21-120-6238 All-Purpose Flour Grains & Pulses 86-292-4587 Dabjam
## Stock_Quantity Reorder_Level Reorder_Quantity Unit_Price Date_Received
## 1 22 72 70 4.50 2024-12-08
## 2 45 77 2 20.00 2024-12-11
## 3 30 38 83 6.00 2024-12-08
## 4 12 59 62 1.50 2024-12-12
## 5 37 30 74 4.00 2024-12-07
## 6 55 33 14 1.75 2024-12-12
## Last_Order_Date Expiration_Date Warehouse_Location Sales_Volume
## 1 2024-12-06 2024-12-09 48 Del Sol Trail 32
## 2 2024-12-05 2024-12-05 36 3rd Place 85
## 3 2024-12-06 2024-12-09 3296 Walton Court 31
## 4 2025-12-02 2024-12-04 3 Westerfield Crossing 95
## 5 2024-12-10 2024-12-10 15068 Scoville Court 62
## 6 2024-12-05 2024-12-09 050 Mcbride Avenue 34
## Inventory_Turnover_Rate Status
## 1 19 Discontinued
## 2 1 Discontinued
## 3 34 Backordered
## 4 99 Active
## 5 25 Backordered
## 6 62 Discontinued
Once all the variables converted to desired format, as next step I will look into which variables could have tell me something about the Sales volume.
First we can check to see which Category has the most Sales volume in the store.
plot1 <- grocery_edited |>
group_by(Catagory) |>
summarise(total_sales_vol = sum(Sales_Volume))
ggplot(plot1, aes(x = reorder(Catagory, -total_sales_vol), y = total_sales_vol, fill = Catagory)) +
geom_col() +
geom_text(aes(label = total_sales_vol), vjust = -0.5) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(x = "Catagory")
Which producty category has the most stock? lets find out.
plot2 <- grocery_edited |>
group_by(Catagory) |>
summarise(total_stock = sum(Stock_Quantity))
ggplot(plot2, aes(x = reorder(Catagory, -total_stock), y = total_stock, fill = Catagory)) +
geom_col() +
geom_text(aes(label = total_stock), vjust = -0.5) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(x = "Catagory")
Now lets see them together to see corralations between Stock amount vs Sales volume. First step, I need to create a new dataset where it combines the Category, Stock Quantitiy and Sales volume into one dataframe.
plot3 <- grocery_edited |>
group_by(Catagory) |>
select(Catagory, Stock_Quantity, Sales_Volume) |>
mutate(total_stock = sum(Stock_Quantity),
total_sales_vol = sum(Sales_Volume))
head(plot3)
## # A tibble: 6 × 5
## # Groups: Catagory [3]
## Catagory Stock_Quantity Sales_Volume total_stock total_sales_vol
## <fct> <int> <int> <int> <int>
## 1 Grains & Pulses 22 32 8240 8886
## 2 Beverages 45 85 3822 4533
## 3 Grains & Pulses 30 31 8240 8886
## 4 Grains & Pulses 12 95 8240 8886
## 5 Fruits & Vegetables 37 62 18489 19361
## 6 Grains & Pulses 55 34 8240 8886
For me to compare Total stock vs Total sales by category, I will need to pivot the table.
plot3_long <- plot3 |>
pivot_longer(
cols = c(total_stock, total_sales_vol),
names_to = "Metric",
values_to = "Value"
)
head(plot3_long)
## # A tibble: 6 × 5
## # Groups: Catagory [2]
## Catagory Stock_Quantity Sales_Volume Metric Value
## <fct> <int> <int> <chr> <int>
## 1 Grains & Pulses 22 32 total_stock 8240
## 2 Grains & Pulses 22 32 total_sales_vol 8886
## 3 Beverages 45 85 total_stock 3822
## 4 Beverages 45 85 total_sales_vol 4533
## 5 Grains & Pulses 30 31 total_stock 8240
## 6 Grains & Pulses 30 31 total_sales_vol 8886
Next I will plot my new dataset where it shows total stock volume compare to total sales volume by category.
ggplot(plot3_long, aes(x = Catagory, y = Value, color = Metric, group = Metric)) +
geom_line(linewidth = 1) +
geom_point(size = 3) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
labs(
title = "Stock and Sales Volume by Category",
y = "Amount"
) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
As it shows in the line graph above, Stock volume is highly corralted with Sales volume regardless of the category type of the product.