KCDB DATA PROCESSING - INTRAREGIONAL CMCS REVIEW (SIM)

INTRODUCTION

This document presents the R script for the exploratory analysis of the intraregional review cycle of the CMCs (Calibration and Measurement Capabilities) of the Inter-American Metrology System (SIM). The data comes from KCDB from a file generated on March 23, 2026, and is used in this document for illustrative purposes.

FIRST SECTION: ANALYSIS OF QUANTITATIVE VARIABLES

1. Loading Libraries and Data

# Libraries
library(readxl)
library(dplyr)
library(stringr)
library(ggplot2)
library(tidyr)
library(forcats)
library(writexl)

# Load Data (Ensure the path is correct in your rendering environment)
data <- read_excel("C:/Users/Bryan Calderón/Desktop/service122780601959971919.xlsx")

# Select relevant columns
data_clean <- data %>%
  select(
    `KCDB internal CMC identifier`,
    Category,
    `Sub-category`,
    `Group of expertise`,
    Matrix,
    `Analyte or component`,
    `Mechanism(s) for service delivery`,
    `Measurement technique(s) used`,
    `Uncertainty convention`,
    `KCDB Support for CMC claim`,
    `Other support`
  ) %>%
  mutate(
    country_code = str_sub(`KCDB internal CMC identifier`, 8, 9),
    Category_num = as.numeric(Category),
    Sub_num = as.numeric(`Sub-category`)
  )

2. Data Dictionary and Classification

# Create official dictionary
qm_dict <- tibble::tibble(
  Category_num = c(1,1,1,1,1,2,2,2,3,3,3,3,4,4,4,4,4,4,5,5,5,5,6,7,8,8,8,8,
                   9,9,9,9,9,10,10,10,10,10,10,10,11,11,11,11,12,12,12,12,
                   13,13,13,13,13,14,14,14,14,14,14,14,14,14,14,15,15,15,15),
  
  Sub_num = c(1,2,3,4,5,1,2,3,1,2,3,4,1,2,3,4,5,6,1,2,3,4,1,1,1,2,3,4,
              1,2,3,4,5,1,2,3,4,5,6,7,1,2,3,4,1,2,3,4,1,2,3,4,5,
              1,2,3,4,5,6,7,8,9,10,1,2,3,4),
  
  Category_name = c(rep("High purity chemicals",5), rep("Inorganic solutions",3),
                    rep("Organic solutions",4), rep("Gases",6), rep("Water",4),
                    rep("pH",1), rep("Electrolytic conductivity",1),
                    rep("Metal and metal alloys",4), rep("Advanced materials",5),
                    rep("Biological fluids and materials",7), rep("Food",4),
                    rep("Fuels",4), rep("Sediments, soils, ores, and particulates",5),
                    rep("Other materials",10), rep("Surfaces, films, and engineered nanomaterials",4)),
  
  Subcategory_name = c("Inorganic compounds","Organic compounds","Metals","Isotopics","Other",
                       "Elemental","Anionic","Other","PAHs","PCBs","Pesticides","Other",
                       "High purity","Environmental","Fuel","Forensic","Medical","Other",
                       "Fresh water","Contaminated water","Sea water","Other","pH",
                       "Electrolytic conductivity","Ferrous metals","Non-ferrous metals","Precious metals","Other",
                       "Semiconductors","Superconductors","Polymers and plastics","Ceramics","Other",
                       "Blood serum","Renal fluids","Hair","Tissues","Bone","Botanical materials","Other",
                       "Nutritional constituents","Contaminants","GMOs","Other",
                       "Coal and coke","Petroleum products","Bio-mass","Other",
                       "Sediments","Soils","Ores","Particulates","Other",
                       "Cements","Paints","Textiles","Glasses","Thin films","Coatings","Insulating materials","Rubber","Adhesives","Other",
                       "Inorganic","Organic","Biomaterials","Other")
)

# Join with dictionary and apply classification logic
data_clean <- data_clean %>%
  left_join(qm_dict, by = c("Category_num", "Sub_num")) %>%
  mutate(
    Category_final = Category_name,
    Subcategory_final = case_when(
      Category_num %in% c(6,7) ~ Category_name,  
      is.na(Subcategory_name) ~ Category_name,
      TRUE ~ Subcategory_name
    ),
    Sub_num = ifelse(Category_num %in% c(6,7), NA, Sub_num),
    full_code = case_when(
      Category_num %in% c(6,7) ~ as.character(Category_num),
      is.na(Sub_num) ~ as.character(Category_num),
      TRUE ~ paste0(Category_num, ".", Sub_num)
    )
  )

# Summary table
summary_table <- data_clean %>%
  group_by(country_code, full_code, Category_final, Subcategory_final, `Group of expertise`) %>%
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n))

3. Visualizations (Quantitative)

# Country vs Category
ggplot(data_clean, aes(x = fct_infreq(country_code), fill = Category_final)) +
  geom_bar(position = "stack") +
  labs(title = "Distribution of CMCs (Country vs Category)", x = "Country", y = "Frequency") +
  theme_minimal()

# Country vs Sub-category
ggplot(data_clean, aes(x = fct_infreq(country_code), fill = Subcategory_final)) +
  geom_bar(position = "stack") +
  labs(title = "Distribution of CMCs (Country vs Sub-category)", x = "Country", y = "Frequency") +
  theme_minimal()

# Heatmap Country vs Category
heatmap_data <- data_clean %>% count(country_code, Category_final)

ggplot(heatmap_data, aes(x = country_code, y = Category_final, fill = n)) +
  geom_tile() +
  labs(title = "Heatmap: Country vs Category", x = "Country", y = "Category") +
  theme_minimal()

# KCDB Support
ggplot(data_clean, aes(x = fct_infreq(country_code), fill = `Group of expertise`)) +
  geom_bar(position = "stack") +
  labs(title = "Distribution of KCDB Support for CMC claim", x = "Country", y = "Frequency") +
  theme_minimal()

SECOND SECTION: EXPLORATORY ANALYSIS OF QUALITATIVE VARIABLES

library(wordcloud)
library(RColorBrewer)
library(widyr)

# Cleaning and plotting functions
clean_1 <- function(data, column_name){
  data %>%
    mutate(
      value_raw = !!sym(column_name),
      value_raw = str_replace_all(value_raw, ";|\\.| and ", ","),
      value_raw = str_squish(value_raw)
    ) %>%
    separate_rows(value_raw, sep = ",") %>%
    mutate(value_raw = str_trim(value_raw)) %>%
    filter(value_raw != "")
}

clean_2 <- function(data, column_name){
  data %>%
    mutate(
      value_raw = !!sym(column_name),
      value_raw = str_replace_all(value_raw, ";| and ", ","),
      value_raw = str_squish(value_raw)
    ) %>%
    separate_rows(value_raw, sep = ",") %>%
    mutate(value_raw = str_trim(value_raw)) %>%
    filter(value_raw != "")
}

plot_top <- function(data, var, top_n = 10){
  data %>%
    count(.data[[var]], sort = TRUE) %>%
    slice_head(n = top_n) %>%
    ggplot(aes(x = reorder(.data[[var]], n), y = n)) +
    geom_col(fill = "steelblue") +
    coord_flip() +
    labs(title = paste("Top", top_n, var), x = "", y = "Frequency") +
    theme_minimal()
}

# Apply cleaning
data_clean1 <- clean_1(data_clean, "Measurement technique(s) used")
data_clean2 <- clean_2(data_clean, "KCDB Support for CMC claim")

4. Visualizations (Qualitative)

plot_top(data_clean, "Matrix", 10)

plot_top(data_clean, "Analyte or component", 10)

plot_top(data_clean1, "value_raw", 10) + labs(title = "Top 10 Measurement technique(s) used")

plot_top(data_clean2, "value_raw", 10) + labs(title = "Top 10 KCDB Support for CMC claim")