#install packages

install.packages('readr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages('dplyr')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages('ggplot2')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages('scales')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages('lubridate')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages('zoo')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages('ggrepel')
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

#load libraries

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:readr':
## 
##     col_factor
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggrepel)

#load data

sourcing_data <- read_csv("Procurement_Cleaned_Data.csv")
## Rows: 767 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): PO_ID, Supplier, Order_Date, Delivery_Date, Item_Category, Order_S...
## dbl (11): Delivery_Year, Delivery_Month, Quantity, Unit_Price, Negotiated_Pr...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#view data

head(sourcing_data)
## # A tibble: 6 × 18
##   PO_ID    Supplier        Order_Date Delivery_Date Delivery_Year Delivery_Month
##   <chr>    <chr>           <chr>      <chr>                 <dbl>          <dbl>
## 1 PO-00001 Alpha_Inc       10/17/2023 10/25/2023             2023             10
## 2 PO-00002 Delta_Logistics 4/25/2022  5/5/2022               2022              5
## 3 PO-00003 Gamma_Co        1/26/2022  2/15/2022              2022              2
## 4 PO-00004 Beta_Supplies   10/9/2022  10/28/2022             2022             10
## 5 PO-00005 Delta_Logistics 9/8/2022   9/20/2022              2022              9
## 6 PO-00006 Epsilon_Group   8/17/2022  8/29/2022              2022              8
## # ℹ 12 more variables: Item_Category <chr>, Order_Status <chr>, Quantity <dbl>,
## #   Unit_Price <dbl>, Negotiated_Price <dbl>, Defective_Units <dbl>,
## #   Compliance <chr>, Comp_True_Flag <dbl>, Total_Purchase_Amount <dbl>,
## #   Defect_Value <dbl>, Actual_Time <dbl>, Negotiated_Savings <dbl>

#structure of the data

str(sourcing_data)
## spc_tbl_ [767 × 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ PO_ID                : chr [1:767] "PO-00001" "PO-00002" "PO-00003" "PO-00004" ...
##  $ Supplier             : chr [1:767] "Alpha_Inc" "Delta_Logistics" "Gamma_Co" "Beta_Supplies" ...
##  $ Order_Date           : chr [1:767] "10/17/2023" "4/25/2022" "1/26/2022" "10/9/2022" ...
##  $ Delivery_Date        : chr [1:767] "10/25/2023" "5/5/2022" "2/15/2022" "10/28/2022" ...
##  $ Delivery_Year        : num [1:767] 2023 2022 2022 2022 2022 ...
##  $ Delivery_Month       : num [1:767] 10 5 2 10 9 8 6 4 11 7 ...
##  $ Item_Category        : chr [1:767] "Office Supplies" "Office Supplies" "MRO" "Packaging" ...
##  $ Order_Status         : chr [1:767] "Cancelled" "Delivered" "Delivered" "Delivered" ...
##  $ Quantity             : num [1:767] 1176 1509 910 1344 1180 ...
##  $ Unit_Price           : num [1:767] 20.1 39.3 95.5 99.8 64.1 ...
##  $ Negotiated_Price     : num [1:767] 17.8 37.3 92.3 95.5 60.5 ...
##  $ Defective_Units      : num [1:767] 0 235 41 112 171 39 96 22 89 8 ...
##  $ Compliance           : chr [1:767] "Yes" "Yes" "Yes" "Yes" ...
##  $ Comp_True_Flag       : num [1:767] 1 1 1 1 0 1 0 1 1 1 ...
##  $ Total_Purchase_Amount: num [1:767] 20945 56346 83957 128379 71425 ...
##  $ Defect_Value         : num [1:767] 0 8775 3783 10698 10351 ...
##  $ Actual_Time          : num [1:767] 8 10 20 19 12 12 11 14 4 12 ...
##  $ Negotiated_Savings   : num [1:767] 2728 2988 2958 5820 4177 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PO_ID = col_character(),
##   ..   Supplier = col_character(),
##   ..   Order_Date = col_character(),
##   ..   Delivery_Date = col_character(),
##   ..   Delivery_Year = col_double(),
##   ..   Delivery_Month = col_double(),
##   ..   Item_Category = col_character(),
##   ..   Order_Status = col_character(),
##   ..   Quantity = col_double(),
##   ..   Unit_Price = col_double(),
##   ..   Negotiated_Price = col_double(),
##   ..   Defective_Units = col_double(),
##   ..   Compliance = col_character(),
##   ..   Comp_True_Flag = col_double(),
##   ..   Total_Purchase_Amount = col_double(),
##   ..   Defect_Value = col_double(),
##   ..   Actual_Time = col_double(),
##   ..   Negotiated_Savings = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

#change the dates to date datatype

sourcing_data$Delivery_Date <- as.Date(sourcing_data$Delivery_Date, format = "%m/%d/%Y")
sourcing_data$Order_Date <- as.Date(sourcing_data$Order_Date, format = "%m/%d/%y")

#verify the datatype change

str(sourcing_data$Delivery_Date)
##  Date[1:767], format: "2023-10-25" "2022-05-05" "2022-02-15" "2022-10-28" "2022-09-20" ...
summary(sourcing_data$Delivery_Date)
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "2022-01-06" "2022-07-10" "2022-12-24" "2023-01-03" "2023-07-10" "2023-12-31"
str(sourcing_data$Order_Date)
##  Date[1:767], format: "2020-10-17" "2020-04-25" "2020-01-26" "2020-10-09" "2020-09-08" ...
summary(sourcing_data$Order_Date)
##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## "2020-01-01" "2020-03-29" "2020-07-06" "2020-06-30" "2020-09-28" "2020-12-30"

#column names

names(sourcing_data)
##  [1] "PO_ID"                 "Supplier"              "Order_Date"           
##  [4] "Delivery_Date"         "Delivery_Year"         "Delivery_Month"       
##  [7] "Item_Category"         "Order_Status"          "Quantity"             
## [10] "Unit_Price"            "Negotiated_Price"      "Defective_Units"      
## [13] "Compliance"            "Comp_True_Flag"        "Total_Purchase_Amount"
## [16] "Defect_Value"          "Actual_Time"           "Negotiated_Savings"

#summarize $ amount purchased by supplier by year

summary_data <- sourcing_data %>%
  group_by(Supplier, Delivery_Year) %>%
  summarise(Total_Purchase_Amount = sum(Total_Purchase_Amount, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Supplier'. You can override using the
## `.groups` argument.

#identify desc order of suppliers based on 2023 spend

top_2023_suppliers <- summary_data %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Total_Purchase_Amount)) %>%
  pull(Supplier)

#reorder supplier factor levels

summary_data <- summary_data %>%
  mutate(Supplier = factor(Supplier, levels = top_2023_suppliers))

#plot the graph

ggplot(summary_data, aes(x = Supplier, y = Total_Purchase_Amount, fill = as.factor(Delivery_Year))) + 
  geom_bar(stat = "identity", position = "dodge") + 
  scale_y_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  labs(title = "Total Purchase Amount by Supplier and Year", 
       x = "Supplier",
       y = "Total Purchase Amount", 
       fill = "Delivery Year") + 
  theme_minimal()

  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : num 45
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#summarize $ amount purchased by category by year

summary_category_data <- sourcing_data %>%
  group_by(Item_Category, Delivery_Year) %>%
  summarise(Total_Purchase_Amount = sum(Total_Purchase_Amount, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Item_Category'. You can override using the
## `.groups` argument.

#identify desc order of category based on 2023 spend

top_2023_categories <- summary_category_data %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Total_Purchase_Amount)) %>%
  pull(Item_Category)

#reorder supplier factor levels

summary_category_data <- summary_category_data %>%
  mutate(Item_Category = factor(Item_Category, levels = top_2023_categories))

#plot the graph

ggplot(summary_category_data, aes(x = Item_Category, y = Total_Purchase_Amount, fill = as.factor(Delivery_Year))) + 
  geom_bar(stat = "identity", position = "dodge") + 
  scale_y_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  labs(title = "Total Purchase Amount by Category and Year", 
       x = "Category",
       y = "Total Purchase Amount", 
       fill = "Delivery Year") + 
  theme_minimal()

  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : num 45
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#summarize $ amount saved by supplier by year

summary_savings_data <- sourcing_data %>%
  group_by(Supplier, Delivery_Year) %>%
  summarise(Negotiated_Savings = sum(Negotiated_Savings, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Supplier'. You can override using the
## `.groups` argument.

#identify desc order of savings by supplier based on 2023 spend

top_2023_savings <- summary_savings_data %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Negotiated_Savings)) %>%
  pull(Supplier)

#reorder supplier factor levels

summary_savings_data <- summary_savings_data %>%
  mutate(Supplier = factor(Supplier, levels = top_2023_savings))

#plot the graph

ggplot(summary_savings_data, aes(x = Supplier, y = Negotiated_Savings, fill = as.factor(Delivery_Year))) + 
  geom_bar(stat = "identity", position = "dodge") + 
  scale_y_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  labs(title = "Total Savings Amount by Supplier and Year", 
       x = "Supplier",
       y = "Negotiated Savings", 
       fill = "Delivery Year") + 
  theme_minimal()

  theme(axis.text.x = element_text(angle = 45, hjust = 1))
## List of 1
##  $ axis.text.x:List of 11
##   ..$ family       : NULL
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : num 1
##   ..$ vjust        : NULL
##   ..$ angle        : num 45
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#monthly summary comparison

monthly_summary <- sourcing_data %>%
  group_by(Delivery_Year, Delivery_Month) %>%
  summarise(Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE), Orders = n()) %>%
  arrange(Delivery_Year, Delivery_Month)
## `summarise()` has grouped output by 'Delivery_Year'. You can override using the
## `.groups` argument.

#bar chart monthly spend

# Prepare data
monthly_summary <- sourcing_data %>%
  mutate(Delivery_MonthDate = floor_date(Delivery_Date, unit = "month")) %>%
  group_by(Delivery_MonthDate) %>%
  summarise(
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    Orders = n()
  ) %>%
  arrange(Delivery_MonthDate) %>%
  ungroup() %>%
  # Create labels for x-axis:
  mutate(
    MonthLabel = ifelse(
      month(Delivery_MonthDate) == 1,
      paste0("Jan ", year(Delivery_MonthDate)),
      format(Delivery_MonthDate, "%b")  # This gives "Feb", "Mar", etc.
    )
  )

# Plot with explicit breaks and labels:
ggplot(monthly_summary, aes(x = Delivery_MonthDate, y = Total_Spend)) +
  geom_col(fill = "steelblue") +
  scale_y_continuous(labels = label_dollar(scale = 1e-6, suffix = "M", prefix = "$")) +
  scale_x_date(
    breaks = monthly_summary$Delivery_MonthDate,  # exact breaks
    labels = monthly_summary$MonthLabel,          # exact labels
    expand = c(0, 0)
  ) +
  labs(title = "Monthly Spend Totals",
       x = NULL,
       y = "Total Spend") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#line graph for monthly data to compare 2022 to 2023

# Prepare data: get year, month number, and total spend per month-year
monthly_summary <- sourcing_data %>%
  mutate(
    Delivery_Year = year(Delivery_Date),
    Delivery_Month = month(Delivery_Date)
  ) %>%
  group_by(Delivery_Year, Delivery_Month) %>%
  summarise(
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    Orders = n()
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'Delivery_Year'. You can override using the
## `.groups` argument.
# Create factor for month to ensure Jan-Dec order
monthly_summary <- monthly_summary %>%
  mutate(
    Month_Factor = factor(
      month.abb[Delivery_Month], levels = month.abb
    )
  )

# Plot line chart
ggplot(monthly_summary, aes(x = Month_Factor, y = Total_Spend, group = factor(Delivery_Year), color = factor(Delivery_Year))) +
  geom_line(linewidth = 1.2) +      # changed size to linewidth here
  geom_point(size = 2) +             # size is still used for points
  scale_y_continuous(labels = label_dollar(scale = 1e-6, suffix = "M", prefix = "$")) +
  labs(
    title = "Monthly Spend by Year",
    x = "Month",
    y = "Total Spend",
    color = "Year"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    legend.position = "top"
  )

#rolling 3 month average chart

# Step 1: Create a date column for grouping by month
monthly_summary <- sourcing_data %>%
  mutate(Delivery_MonthDate = floor_date(as.Date(Delivery_Date), "month")) %>%
  group_by(Delivery_MonthDate) %>%
  summarise(
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    Orders = n(),
    .groups = "drop"
  ) %>%
  arrange(Delivery_MonthDate)

# Step 2: Rolling average over full timeline
monthly_summary <- monthly_summary %>%
  mutate(Rolling_Spend = rollmean(Total_Spend, k = 3, fill = NA, align = "right")) %>%
  mutate(
    Delivery_Year = year(Delivery_MonthDate),
    Delivery_Month = month(Delivery_MonthDate),
    Month_Factor = factor(month.abb[Delivery_Month], levels = month.abb)
  )

# Step 3: Plot rolling average
ggplot(monthly_summary, aes(x = Month_Factor, y = Rolling_Spend, group = factor(Delivery_Year), color = factor(Delivery_Year))) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  scale_y_continuous(labels = label_dollar(scale = 1e-6, suffix = "M", prefix = "$")) +
  labs(
    title = "Rolling 3-Month Average Spend by Year",
    x = "Month",
    y = "Rolling 3-Month Avg Spend",
    color = "Year"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    legend.position = "top"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

#getting crazy with graphing - overlaying the 3 month trend and actual

# Step 1: Summarize monthly spend and compute rolling average
monthly_summary <- sourcing_data %>%
  mutate(Delivery_MonthDate = floor_date(as.Date(Delivery_Date), "month")) %>%
  group_by(Delivery_MonthDate) %>%
  summarise(
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    Orders = n(),
    .groups = "drop"
  ) %>%
  arrange(Delivery_MonthDate) %>%
  mutate(
    Rolling_Spend = rollmean(Total_Spend, k = 3, fill = NA, align = "right"),
    Delivery_Year = year(Delivery_MonthDate),
    Delivery_Month = month(Delivery_MonthDate),
    Month_Factor = factor(month.abb[Delivery_Month], levels = month.abb)
  )

# Step 2: Plot both actual spend and rolling average
ggplot(monthly_summary, aes(x = Month_Factor)) +
  # Bars for monthly spend
  geom_col(aes(y = Total_Spend, fill = factor(Delivery_Year)), position = position_dodge(width = 0.8), width = 0.7, alpha = 0.6) +
  # Line for rolling 3-month average
  geom_line(aes(y = Rolling_Spend, group = factor(Delivery_Year), color = factor(Delivery_Year)), linewidth = 1.2) +
  geom_point(aes(y = Rolling_Spend, color = factor(Delivery_Year)), size = 2) +
  scale_y_continuous(labels = label_dollar(scale = 1e-6, suffix = "M", prefix = "$")) +
  labs(
    title = "Monthly Spend with Rolling 3-Month Average",
    x = "Month",
    y = "Spend",
    fill = "Year",
    color = "Rolling Avg Year"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 0.5),
    legend.position = "top"
  )
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

#percent savings by supplier, not dollar value savings next #summarize both savings and spend by supplier and year

summary_savings_data <- sourcing_data %>%
  group_by(Supplier, Delivery_Year) %>%
  summarise(
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    Negotiated_Savings = sum(Negotiated_Savings, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Savings_Percent = Negotiated_Savings / Total_Spend)

#order suppliers by 2023 savings percentage

top_2023_savings <- summary_savings_data %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Savings_Percent)) %>%
  pull(Supplier)

summary_savings_data <- summary_savings_data %>%
  mutate(Supplier = factor(Supplier, levels = top_2023_savings))

#plot percentage savings by supplier

library(ggplot2)
library(scales)

ggplot(summary_savings_data, aes(x = Supplier, y = Savings_Percent, fill = as.factor(Delivery_Year))) + 
  geom_bar(stat = "identity", position = "dodge") + 
  scale_y_continuous(labels = percent_format(accuracy = 1)) +
  labs(
    title = "Negotiated Savings by Supplier and Year", 
    x = "Supplier",
    y = "Percent Saved", 
    fill = "Delivery Year"
  ) + 
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#defect 2nd attempt - defect value based on compliant orders

# Summarize compliance vs. defects
compliance_summary <- sourcing_data %>%
  group_by(Delivery_Year, Compliance) %>%
  summarise(
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    Total_Defects = sum(Defect_Value, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Defect_Rate = Total_Defects / Total_Spend)

# Plot
ggplot(compliance_summary, aes(x = factor(Delivery_Year), y = Defect_Rate, fill = Compliance)) +
  geom_col(position = "dodge") +
  geom_text(aes(label = percent(Defect_Rate, accuracy = 0.1)),
            position = position_dodge(width = 0.9),
            vjust = -0.3, size = 3) +
  scale_y_continuous(labels = percent_format(accuracy = 0.1), expand = expansion(mult = c(0, 0.1))) +
  scale_fill_manual(values = c("Yes" = "#00C400", "No" = "#D00000")) +
  labs(
    title = "Defect Rate by Year and Compliance Status",
    x = "Year",
    y = "Defect Rate (% of Spend)",
    fill = "Compliance"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(face = "bold"),
    legend.position = "top"
  )

#Defect Rate Per Supplier

defect_summary <- sourcing_data %>%
  group_by(Supplier, Delivery_Year) %>%
  summarise(Total_Defects = sum(Defect_Value, na.rm = TRUE)) %>%
  ungroup()
## `summarise()` has grouped output by 'Supplier'. You can override using the
## `.groups` argument.
top_2023_defects <- defect_summary %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Total_Defects)) %>%
  pull(Supplier)

defect_summary <- defect_summary %>%
  mutate(Supplier = factor(Supplier, levels = top_2023_defects))

ggplot(defect_summary, aes(x = Supplier, y = Total_Defects, fill = as.factor(Delivery_Year))) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = label_dollar(scale_cut = cut_short_scale())) +
  labs(
    title = "Total Defect Value by Supplier and Year",
    x = "Supplier",
    y = "Defect Value ($)",
    fill = "Year"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

#defect rate by supplier, with percent of spend

defect_summary <- sourcing_data %>%
  group_by(Supplier, Delivery_Year) %>%
  summarise(
    Total_Defects = sum(Defect_Value, na.rm = TRUE),
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE)
  ) %>%
  mutate(Defect_Rate = Total_Defects / Total_Spend * 100) %>%
  ungroup()
## `summarise()` has grouped output by 'Supplier'. You can override using the
## `.groups` argument.
top_2023_defects <- defect_summary %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Defect_Rate)) %>%
  pull(Supplier)

defect_summary <- defect_summary %>%
  mutate(Supplier = factor(Supplier, levels = top_2023_defects))

ggplot(defect_summary, aes(x = Supplier, y = Defect_Rate, fill = as.factor(Delivery_Year))) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(scale = 1)) +
  labs(
    title = "Defect Rate by Supplier and Year",
    x = "Supplier",
    y = "Defect Rate (%)",
    fill = "Year"
  ) +
  theme_minimal(base_size = 12) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#defect rate with percentage of spend and totals

scale_factor <- max(defect_summary$Total_Defects) / max(defect_summary$Defect_Rate)

ggplot(defect_summary, aes(x = Supplier, y = Total_Defects)) +
  geom_col(aes(fill = as.factor(Delivery_Year)), position = position_dodge(width = 0.8), width = 0.6) +
  
  geom_line(
    aes(y = Defect_Rate * scale_factor, group = as.factor(Delivery_Year), color = as.factor(Delivery_Year)),
    position = position_dodge(width = 0.8),
    size = 1
  ) +
  
  scale_y_continuous(
    name = "Total Defect Value ($)",
    labels = scales::label_dollar(scale_cut = scales::cut_short_scale()),
    sec.axis = sec_axis(
      ~ . / scale_factor,
      name = "Defect Rate (%)",
      labels = scales::percent_format(scale = 1)
    )
  ) +
  
  labs(
    title = "Defect Value and Defect Rate by Supplier and Year",
    x = "Supplier",
    fill = "Year (Bars)",
    color = "Year (Line)"
  ) +
  
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#defect Rate by category by year

# Summarize by Category and Year
defect_summary <- sourcing_data %>%
  group_by(Item_Category, Delivery_Year) %>%
  summarise(
    Total_Defects = sum(Defect_Value, na.rm = TRUE),
    Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  mutate(Defect_Rate = Total_Defects / Total_Spend * 100)

# Get order based on most recent year's defect rate (e.g., 2023)
category_order <- defect_summary %>%
  filter(Delivery_Year == 2023) %>%
  arrange(desc(Defect_Rate)) %>%
  pull(Item_Category)

# Apply factor order to control bar sort
defect_summary <- defect_summary %>%
  mutate(Item_Category = factor(Item_Category, levels = category_order))

# Compute scale factor to align percent with dollar values
scale_factor <- max(defect_summary$Total_Defects) / max(defect_summary$Defect_Rate)

# Add scaled y-position for line and labels
defect_summary <- defect_summary %>%
  mutate(
    scaled_rate_y = Defect_Rate * scale_factor,
    label_y = scaled_rate_y + max(Total_Defects) * 0.05  # small space above the line
  )

# Plot
ggplot(defect_summary, aes(x = Item_Category, y = Total_Defects)) +
  geom_col(aes(fill = as.factor(Delivery_Year)),
           position = position_dodge(width = 0.8),
           width = 0.6) +

  geom_line(
    aes(y = scaled_rate_y, group = as.factor(Delivery_Year), color = as.factor(Delivery_Year)),
    position = position_dodge(width = 0.8),
    linewidth = 1
  ) +

  # Add percentage labels
  geom_text(
    aes(y = label_y, label = paste0(round(Defect_Rate, 1), "%"), group = as.factor(Delivery_Year), color = as.factor(Delivery_Year)),
    position = position_dodge(width = 0.8),
    size = 3,
    vjust = 0
  ) +

  scale_y_continuous(
    name = "Total Defect Value ($)",
    labels = scales::label_dollar(scale_cut = scales::cut_short_scale()),
    sec.axis = sec_axis(
      ~ . / scale_factor,
      name = "Defect Rate (%)",
      labels = scales::percent_format(scale = 1)
    )
  ) +

  labs(
    title = "Defect Value and Defect Rate by Item Category and Year",
    x = "Item Category",
    fill = "Year (Bars)",
    color = "Year (Line)"
  ) +

  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )

#compliance status by supplier by year

# Summarize total spend by Supplier, Year, and Compliance
compliance_summary <- sourcing_data %>%
  group_by(Supplier, Delivery_Year, Compliance) %>%
  summarise(Total_Spend = sum(Total_Purchase_Amount, na.rm = TRUE), .groups = "drop")

# Calculate percent of spend that is compliant per supplier and year
compliance_summary <- compliance_summary %>%
  group_by(Supplier, Delivery_Year) %>%
  mutate(Spend_Share = Total_Spend / sum(Total_Spend)) %>%
  ungroup()

# Order suppliers by non-compliant spend share in most recent year (e.g., 2023)
supplier_order <- compliance_summary %>%
  filter(Delivery_Year == 2023, Compliance == "No") %>%
  arrange(desc(Spend_Share)) %>%
  pull(Supplier)

# Apply factor order
compliance_summary <- compliance_summary %>%
  mutate(Supplier = factor(Supplier, levels = unique(supplier_order)))

# Plot
ggplot(compliance_summary, aes(x = Supplier, y = Spend_Share, fill = Compliance)) +
  geom_col(position = "stack", width = 0.7) +
  facet_wrap(~Delivery_Year, nrow = 1) +
  scale_y_continuous(
    labels = percent_format(accuracy = 1),
    name = "Share of Total Spend"
  ) +
  scale_fill_manual(
    values = c("No" = "#d73027", "Yes" = "#1a9850"),
    name = "Compliance"
  ) +
  labs(
    title = "Compliance Status by Supplier and Year",
    x = "Supplier"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "top"
  )