Cleaning Data FMCG
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.3
## Warning: package 'tibble' was built under R version 4.4.2
## Warning: package 'tidyr' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'purrr' was built under R version 4.4.2
## Warning: package 'dplyr' was built under R version 4.4.3
## Warning: package 'stringr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.2
## Warning: package 'lubridate' was built under R version 4.4.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the data
fmcg <- read_csv("C:/Users/nyayu/Downloads/archive (3)/FMCG_2022_2024.csv")
## Rows: 190757 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): sku, brand, segment, category, channel, region, pack_type
## dbl (6): price_unit, promotion_flag, delivery_days, stock_available, delive...
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(fmcg)
## Rows: 190,757
## Columns: 14
## $ date <date> 2022-01-21, 2022-01-21, 2022-01-21, 2022-01-21, 2022-…
## $ sku <chr> "MI-006", "MI-006", "MI-006", "MI-006", "MI-006", "MI-…
## $ brand <chr> "MiBrand1", "MiBrand1", "MiBrand1", "MiBrand1", "MiBra…
## $ segment <chr> "Milk-Seg3", "Milk-Seg3", "Milk-Seg3", "Milk-Seg3", "M…
## $ category <chr> "Milk", "Milk", "Milk", "Milk", "Milk", "Milk", "Milk"…
## $ channel <chr> "Retail", "Retail", "Retail", "Discount", "Discount", …
## $ region <chr> "PL-Central", "PL-North", "PL-South", "PL-Central", "P…
## $ pack_type <chr> "Multipack", "Single", "Carton", "Single", "Single", "…
## $ price_unit <dbl> 2.38, 1.55, 4.00, 5.16, 7.66, 5.09, 8.58, 4.21, 8.27, …
## $ promotion_flag <dbl> 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, …
## $ delivery_days <dbl> 1, 3, 5, 2, 4, 3, 2, 3, 2, 5, 5, 3, 3, 1, 5, 1, 4, 1, …
## $ stock_available <dbl> 141, 0, 118, 81, 148, 163, 140, 194, 180, 117, 294, 19…
## $ delivered_qty <dbl> 128, 129, 161, 114, 204, 154, 156, 185, 161, 192, 249,…
## $ units_sold <dbl> 9, 0, 8, 7, 12, 25, 7, 17, 14, 5, 38, 12, 22, 11, 17, …
# filter data
fmcg_filtered <- fmcg %>%
filter(sku == "MI-006", region == "PL-Central") %>%
select(date, units_sold, price_unit, ) %>%
arrange(date)
# Total sales
fmcg_daily <- fmcg_filtered %>%
group_by(date) %>%
summarise(
daily_revenue = sum(price_unit * units_sold, na.rm = TRUE)
) %>%
arrange(date)
fmcg_daily
## # A tibble: 1,073 × 2
## date daily_revenue
## <date> <dbl>
## 1 2022-01-21 57.5
## 2 2022-01-22 273.
## 3 2022-01-23 213.
## 4 2022-01-24 157.
## 5 2022-01-25 168.
## 6 2022-01-26 52.5
## 7 2022-01-27 39.4
## 8 2022-01-28 258.
## 9 2022-01-29 140.
## 10 2022-01-30 337.
## # ℹ 1,063 more rows
# Save the cleaned data
write_csv(fmcg_daily, "fmcg_cleaned_daily.csv")