# Load required packages
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)

# Set theme
theme_set(theme_bw(base_size = 12))

Executive Summary

0.1 Purpose of This Report

This report assesses the data quality of the Kenya ADGG/TDGG dairy datasets by:

Identifying outliers - values that are statistically unusual
Detecting errors - values that are biologically impossible (e.g., negative ages)
Quantifying data loss - how many records are lost when we apply quality filters
Providing recommendations - suggested filters and cleaning procedures

0.2 Color Coding Used in This Report

Throughout this report, we use color coding to highlight data quality issues:

Color	Meaning
🔴 Red	Error or impossible value (e.g., negative AFC, negative DIM)
🟠 Orange	Warning - unusual but possible (e.g., AFC < 18 months)
🟢 Green	Normal/acceptable range
🔵 Blue	High but acceptable (e.g., daily yield 30-50 kg)
🟣 Purple	Extreme outlier - needs verification

1 Data Loading

# SET YOUR FILE PATH HERE
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/"

# Read data - treating "NULL" as NA
lactation <- fread(paste0(data_path, "lactation_calving.csv"), 
                   na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                   stringsAsFactors = FALSE)

milking <- fread(paste0(data_path, "milking_data.csv"), 
                 na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                 stringsAsFactors = FALSE)

# Convert numeric columns
lactation <- lactation %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    calving_interval_days = as.numeric(calving_interval_days),
    calving_interval_months = as.numeric(calving_interval_months),
    lactation_length_days = as.numeric(lactation_length_days),
    total_lactation_yield = as.numeric(total_lactation_yield),
    testday_count = as.numeric(testday_count),
    days_open = as.numeric(days_open)
  )

milking <- milking %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    dim = as.numeric(dim),
    daily_yield = as.numeric(daily_yield),
    milk_morning = as.numeric(milk_morning),
    milk_evening = as.numeric(milk_evening),
    milk_midday = as.numeric(milk_midday),
    testday_no = as.numeric(testday_no),
    calving_age_months = as.numeric(calving_age_months)
  )

Datasets Loaded:

Dataset	Records	Description
Lactation	88,352	One record per lactation (calving event)
Milking	1,048,575	One record per test-day milk recording

What is a “test-day”? A test-day is a single day when milk production was recorded. Each lactation typically has multiple test-days (monthly recordings throughout the lactation).

2 Raw Data Statistics

2.1 Why Look at Raw Statistics First?

Important: Before cleaning data, we must examine the raw statistics (Min, Max, Mean) to:

Identify impossible values - e.g., negative ages indicate calculation errors
Detect extreme outliers - e.g., AFC of 24,000 months is clearly wrong
Understand the data range - see what values exist before filtering
Quantify the problem - know how many records need cleaning

The Min and Max columns are especially important for detecting errors!

# Function to calculate comprehensive stats
calc_stats <- function(x, var_name) {
  x_valid <- x[!is.na(x)]
  n_total <- length(x)
  n_valid <- length(x_valid)
  n_missing <- sum(is.na(x))
  
  if (n_valid == 0) {
    return(data.frame(
      Variable = var_name,
      N_Total = n_total,
      N_Valid = 0,
      N_Missing = n_missing,
      Pct_Missing = 100,
      Min = NA, Max = NA, Mean = NA, SD = NA, Median = NA
    ))
  }
  
  data.frame(
    Variable = var_name,
    N_Total = n_total,
    N_Valid = n_valid,
    N_Missing = n_missing,
    Pct_Missing = round(100 * n_missing / n_total, 2),
    Min = round(min(x_valid), 2),
    Max = round(max(x_valid), 2),
    Mean = round(mean(x_valid), 2),
    SD = round(sd(x_valid), 2),
    Median = round(median(x_valid), 2)
  )
}

# Create subsets first
afc_parity1 <- lactation$afc_months[which(lactation$parity == 1)]
ci_parity_gt1 <- lactation$calving_interval_days[which(lactation$parity > 1)]

lact_stats_raw <- rbind(
  calc_stats(afc_parity1, "AFC (months)"),
  calc_stats(ci_parity_gt1, "Calving Interval (days)"),
  calc_stats(lactation$lactation_length_days, "Lactation Length (days)"),
  calc_stats(lactation$total_lactation_yield, "Total Lactation Yield (kg)"),
  calc_stats(lactation$days_open, "Days Open"),
  calc_stats(lactation$parity, "Parity")
)

# Determine which rows have problematic Min or Max values
# For color coding
lact_stats_raw <- lact_stats_raw %>%
  mutate(
    Min_Problem = case_when(
      Variable == "AFC (months)" & Min < 0 ~ TRUE,
      Variable == "Calving Interval (days)" & Min < 280 ~ TRUE,
      Variable == "Days Open" & Min < 0 ~ TRUE,
      Variable == "Parity" & Min < 1 ~ TRUE,
      TRUE ~ FALSE
    ),
    Max_Problem = case_when(
      Variable == "AFC (months)" & Max > 100 ~ TRUE,
      Variable == "Calving Interval (days)" & Max > 730 ~ TRUE,
      Variable == "Total Lactation Yield (kg)" & Max > 20000 ~ TRUE,
      Variable == "Days Open" & Max > 500 ~ TRUE,
      Variable == "Parity" & Max > 15 ~ TRUE,
      TRUE ~ FALSE
    )
  )

kable(lact_stats_raw %>% select(-Min_Problem, -Max_Problem), 
      caption = "Table 1: Lactation Data - Raw Statistics (Before Any Cleaning)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = ifelse(lact_stats_raw$Min_Problem, "red", "black"),
              bold = ifelse(lact_stats_raw$Min_Problem, TRUE, FALSE)) %>%
  column_spec(7, color = ifelse(lact_stats_raw$Max_Problem, "red", "black"),
              bold = ifelse(lact_stats_raw$Max_Problem, TRUE, FALSE))

Table 1: Lactation Data - Raw Statistics (Before Any Cleaning)
Variable	N_Total	N_Valid	N_Missing	Pct_Missing	Min	Max	Mean	SD	Median
AFC (months)	37,236	33,015	4,221	11.34	-394	24,295.0	50.96	364.55	38
Calving Interval (days)	50,754	50,463	291	0.57	0	16,530.0	521.70	342.46	424
Lactation Length (days)	88,352	50,707	37,645	42.61	0	669,342.0	535.21	2,990.38	424
Total Lactation Yield (kg)	88,352	88,347	5	0.01	0	23,186.4	112.02	642.21	0
Days Open	88,352	50,710	37,642	42.60	-280	669,062.0	255.21	2,990.29	144
Parity	88,352	87,990	362	0.41	1	32.0	2.42	1.82	2

📊 Column Descriptions for Table 1:

Column	What It Means	How to Interpret
Variable	The trait being measured	-
N_Total	Total records in this subset	Higher = more data available
N_Valid	Records with actual values (not missing)	Should be close to N_Total
N_Missing	Records with missing (NA) values	High numbers indicate data collection gaps
Pct_Missing	% of records that are missing	>20% is concerning
Min	🔴 Smallest value found	Negative values = ERROR
Max	🔴 Largest value found	Extreme values = likely ERROR
Mean	Average of all valid values	Affected by outliers
SD	Standard Deviation (spread)	Large SD suggests high variability or outliers
Median	Middle value (50th percentile)	Less affected by outliers than Mean

🔴 Values shown in RED indicate problems that need attention!

🚨 Critical Issues Found in Lactation Data:

# AFC issues
afc_min <- round(min(afc_parity1, na.rm = TRUE), 1)
afc_max <- round(max(afc_parity1, na.rm = TRUE), 1)

# CI issues
ci_min <- round(min(ci_parity_gt1, na.rm = TRUE), 1)
ci_max <- round(max(ci_parity_gt1, na.rm = TRUE), 1)

# Days open issues
do_min <- round(min(lactation$days_open, na.rm = TRUE), 1)
do_max <- round(max(lactation$days_open, na.rm = TRUE), 1)

cat(paste0("
| Trait | Problem Value | Why It's Wrong |
|-------|---------------|----------------|
| **AFC** | Min = ", afc_min, " months | 🔴 **Negative age is impossible!** Birthdate must be before calving date |
| **AFC** | Max = ", format(afc_max, big.mark=','), " months | 🔴 **", round(afc_max/12, 0), " years old at first calving is impossible!** |
| **Calving Interval** | Max = ", format(ci_max, big.mark=','), " days | 🔴 **", round(ci_max/365, 1), " years between calvings is extreme** |
| **Days Open** | Min = ", do_min, " days | 🔴 **Negative days open is impossible!** |
| **Days Open** | Max = ", format(do_max, big.mark=','), " days | 🔴 **", round(do_max/365, 1), " years to conception is extreme** |
"))

Trait	Problem Value	Why It’s Wrong
AFC	Min = -394 months	🔴 Negative age is impossible! Birthdate must be before calving date
AFC	Max = 24,295 months	🔴 2025 years old at first calving is impossible!
Calving Interval	Max = 16,530 days	🔴 45.3 years between calvings is extreme
Days Open	Min = -280 days	🔴 Negative days open is impossible!
Days Open	Max = 669,062 days	🔴 1833 years to conception is extreme

# Milking traits - RAW
milk_stats_raw <- rbind(
  calc_stats(milking$daily_yield, "Daily Milk Yield (kg)"),
  calc_stats(milking$milk_morning, "Morning Milk (kg)"),
  calc_stats(milking$milk_evening, "Evening Milk (kg)"),
  calc_stats(milking$dim, "Days in Milk (DIM)"),
  calc_stats(milking$parity, "Parity"),
  calc_stats(milking$afc_months, "AFC (months)")
)

# Flag problems
milk_stats_raw <- milk_stats_raw %>%
  mutate(
    Min_Problem = case_when(
      Variable == "Daily Milk Yield (kg)" & Min < 0 ~ TRUE,
      Variable == "Morning Milk (kg)" & Min < 0 ~ TRUE,
      Variable == "Evening Milk (kg)" & Min < 0 ~ TRUE,
      Variable == "Days in Milk (DIM)" & Min < 0 ~ TRUE,
      Variable == "AFC (months)" & Min < 0 ~ TRUE,
      TRUE ~ FALSE
    ),
    Max_Problem = case_when(
      Variable == "Daily Milk Yield (kg)" & Max > 60 ~ TRUE,
      Variable == "Days in Milk (DIM)" & Max > 1000 ~ TRUE,
      Variable == "AFC (months)" & Max > 100 ~ TRUE,
      TRUE ~ FALSE
    )
  )

kable(milk_stats_raw %>% select(-Min_Problem, -Max_Problem), 
      caption = "Table 2: Milking Data - Raw Statistics (Before Any Cleaning)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = ifelse(milk_stats_raw$Min_Problem, "red", "black"),
              bold = ifelse(milk_stats_raw$Min_Problem, TRUE, FALSE)) %>%
  column_spec(7, color = ifelse(milk_stats_raw$Max_Problem, "red", "black"),
              bold = ifelse(milk_stats_raw$Max_Problem, TRUE, FALSE))

Table 2: Milking Data - Raw Statistics (Before Any Cleaning)
Variable	N_Total	N_Valid	N_Missing	Pct_Missing	Min	Max	Mean	SD	Median
Daily Milk Yield (kg)	1,048,575	1,048,546	29	0.00	0	50	5.51	6.99	1.38
Morning Milk (kg)	1,048,575	1,048,415	160	0.02	0	459	1.90	3.84	0.00
Evening Milk (kg)	1,048,575	1,048,413	162	0.02	-1	29	1.81	3.18	0.00
Days in Milk (DIM)	1,048,575	1,048,575	0	0.00	-729,939	7,752	220.35	2,008.49	168.00
Parity	1,048,575	1,048,403	172	0.02	1	32	2.66	1.99	2.00
AFC (months)	1,048,575	389,867	658,708	62.82	-288	270	38.29	19.43	33.00

🚨 Critical Issues Found in Milking Data:

dim_min <- round(min(milking$dim, na.rm = TRUE), 0)
dim_max <- round(max(milking$dim, na.rm = TRUE), 0)
eve_min <- round(min(milking$milk_evening, na.rm = TRUE), 2)

cat(paste0("
| Trait | Problem Value | Why It's Wrong |
|-------|---------------|----------------|
| **DIM** | Min = ", format(dim_min, big.mark=','), " days | 🔴 **Extreme negative!** Milking date is ~2000 years before calving date! |
| **DIM** | Max = ", format(dim_max, big.mark=','), " days | 🔴 **", round(dim_max/365, 1), " years in milk is impossible** |
| **Evening Milk** | Min = ", eve_min, " kg | 🔴 **Negative milk yield is impossible** |
"))

Trait	Problem Value	Why It’s Wrong
DIM	Min = -729,939 days	🔴 Extreme negative! Milking date is ~2000 years before calving date!
DIM	Max = 7,752 days	🔴 21.2 years in milk is impossible
Evening Milk	Min = -1 kg	🔴 Negative milk yield is impossible

What causes negative DIM? - DIM = Milking Date - Calving Date - Negative DIM means the milking was recorded BEFORE the calving date - This is a data entry error - dates were entered incorrectly

3 Raw vs Cleaned Data Comparison

3.1 What Does “Cleaning” Mean?

Data Cleaning = Removing biologically implausible values

We apply filters to keep only values within acceptable biological ranges:

Trait	Filter Range	Biological Justification
AFC	18-72 months	Heifers reach puberty ~12-15 mo; first calving typically 24-36 mo
Calving Interval	300-700 days	Minimum ~280 days (gestation); >700 days indicates fertility problems
Daily Milk Yield	0.5-50 kg	Typical range; >50 kg/day is rare even for elite Holsteins
DIM	5-500 days	Standard lactation ~305 days; extended lactations possible
Days Open	21-400 days	Minimum ~21 days postpartum; >400 days is poor fertility
Total Lactation Yield	>0 to 15,000 kg	Zero = incomplete data; >15,000 kg is exceptional

# Calculate each trait separately to avoid NA issues

# ============ AFC (Parity 1) ============
afc_raw <- lactation %>% filter(parity == 1, !is.na(afc_months))
afc_clean <- lactation %>% filter(parity == 1, !is.na(afc_months), afc_months >= 18, afc_months <= 72)

afc_row <- data.frame(
  Trait = "AFC (months)",
  Raw_N = nrow(afc_raw),
  Raw_Min = round(min(afc_raw$afc_months), 1),
  Raw_Max = round(max(afc_raw$afc_months), 1),
  Raw_Mean = round(mean(afc_raw$afc_months), 2),
  Raw_SD = round(sd(afc_raw$afc_months), 2),
  Filter_Applied = "18 to 72",
  Clean_N = nrow(afc_clean),
  Clean_Mean = round(mean(afc_clean$afc_months), 2),
  Clean_SD = round(sd(afc_clean$afc_months), 2)
)

# ============ Calving Interval (Parity > 1) ============
ci_raw <- lactation %>% filter(parity > 1, !is.na(calving_interval_days))
ci_clean <- lactation %>% filter(parity > 1, !is.na(calving_interval_days), 
                                  calving_interval_days >= 300, calving_interval_days <= 700)

ci_row <- data.frame(
  Trait = "Calving Interval (days)",
  Raw_N = nrow(ci_raw),
  Raw_Min = round(min(ci_raw$calving_interval_days), 1),
  Raw_Max = round(max(ci_raw$calving_interval_days), 1),
  Raw_Mean = round(mean(ci_raw$calving_interval_days), 2),
  Raw_SD = round(sd(ci_raw$calving_interval_days), 2),
  Filter_Applied = "300 to 700",
  Clean_N = nrow(ci_clean),
  Clean_Mean = round(mean(ci_clean$calving_interval_days), 2),
  Clean_SD = round(sd(ci_clean$calving_interval_days), 2)
)

# ============ Lactation Length ============
ll_raw <- lactation %>% filter(!is.na(lactation_length_days))
ll_clean <- lactation %>% filter(!is.na(lactation_length_days), 
                                  lactation_length_days >= 60, lactation_length_days <= 500)

ll_row <- data.frame(
  Trait = "Lactation Length (days)",
  Raw_N = nrow(ll_raw),
  Raw_Min = round(min(ll_raw$lactation_length_days), 1),
  Raw_Max = round(max(ll_raw$lactation_length_days), 1),
  Raw_Mean = round(mean(ll_raw$lactation_length_days), 2),
  Raw_SD = round(sd(ll_raw$lactation_length_days), 2),
  Filter_Applied = "60 to 500",
  Clean_N = nrow(ll_clean),
  Clean_Mean = round(mean(ll_clean$lactation_length_days), 2),
  Clean_SD = round(sd(ll_clean$lactation_length_days), 2)
)

# ============ Total Lactation Yield ============
tly_raw <- lactation %>% filter(!is.na(total_lactation_yield))
tly_clean <- lactation %>% filter(!is.na(total_lactation_yield), 
                                   total_lactation_yield > 0, total_lactation_yield <= 15000)

tly_row <- data.frame(
  Trait = "Total Lactation Yield (kg)",
  Raw_N = nrow(tly_raw),
  Raw_Min = round(min(tly_raw$total_lactation_yield), 1),
  Raw_Max = round(max(tly_raw$total_lactation_yield), 1),
  Raw_Mean = round(mean(tly_raw$total_lactation_yield), 2),
  Raw_SD = round(sd(tly_raw$total_lactation_yield), 2),
  Filter_Applied = ">0 to 15,000",
  Clean_N = nrow(tly_clean),
  Clean_Mean = round(mean(tly_clean$total_lactation_yield), 2),
  Clean_SD = round(sd(tly_clean$total_lactation_yield), 2)
)

# ============ Days Open ============
do_raw <- lactation %>% filter(!is.na(days_open))
do_clean <- lactation %>% filter(!is.na(days_open), days_open >= 21, days_open <= 400)

do_row <- data.frame(
  Trait = "Days Open",
  Raw_N = nrow(do_raw),
  Raw_Min = round(min(do_raw$days_open), 1),
  Raw_Max = round(max(do_raw$days_open), 1),
  Raw_Mean = round(mean(do_raw$days_open), 2),
  Raw_SD = round(sd(do_raw$days_open), 2),
  Filter_Applied = "21 to 400",
  Clean_N = nrow(do_clean),
  Clean_Mean = round(mean(do_clean$days_open), 2),
  Clean_SD = round(sd(do_clean$days_open), 2)
)

# ============ Daily Milk Yield ============
dmy_raw <- milking %>% filter(!is.na(daily_yield))
dmy_clean <- milking %>% filter(!is.na(daily_yield), daily_yield >= 0.5, daily_yield <= 50)

dmy_row <- data.frame(
  Trait = "Daily Milk Yield (kg)",
  Raw_N = nrow(dmy_raw),
  Raw_Min = round(min(dmy_raw$daily_yield), 2),
  Raw_Max = round(max(dmy_raw$daily_yield), 2),
  Raw_Mean = round(mean(dmy_raw$daily_yield), 2),
  Raw_SD = round(sd(dmy_raw$daily_yield), 2),
  Filter_Applied = "0.5 to 50",
  Clean_N = nrow(dmy_clean),
  Clean_Mean = round(mean(dmy_clean$daily_yield), 2),
  Clean_SD = round(sd(dmy_clean$daily_yield), 2)
)

# ============ DIM ============
dim_raw <- milking %>% filter(!is.na(dim))
dim_clean <- milking %>% filter(!is.na(dim), dim >= 5, dim <= 500)

dim_row <- data.frame(
  Trait = "Days in Milk (DIM)",
  Raw_N = nrow(dim_raw),
  Raw_Min = round(min(dim_raw$dim), 0),
  Raw_Max = round(max(dim_raw$dim), 0),
  Raw_Mean = round(mean(dim_raw$dim), 2),
  Raw_SD = round(sd(dim_raw$dim), 2),
  Filter_Applied = "5 to 500",
  Clean_N = nrow(dim_clean),
  Clean_Mean = round(mean(dim_clean$dim), 2),
  Clean_SD = round(sd(dim_clean$dim), 2)
)

# ============ Parity ============
parity_raw <- lactation %>% filter(!is.na(parity))
parity_clean <- lactation %>% filter(!is.na(parity), parity >= 1, parity <= 12)

parity_row <- data.frame(
  Trait = "Parity",
  Raw_N = nrow(parity_raw),
  Raw_Min = min(parity_raw$parity),
  Raw_Max = max(parity_raw$parity),
  Raw_Mean = round(mean(parity_raw$parity), 2),
  Raw_SD = round(sd(parity_raw$parity), 2),
  Filter_Applied = "1 to 12",
  Clean_N = nrow(parity_clean),
  Clean_Mean = round(mean(parity_clean$parity), 2),
  Clean_SD = round(sd(parity_clean$parity), 2)
)

# Combine all rows
comparison <- rbind(afc_row, ci_row, ll_row, tly_row, do_row, dmy_row, dim_row, parity_row)

# Calculate records lost and percentage
comparison <- comparison %>%
  mutate(
    Records_Lost = Raw_N - Clean_N,
    Pct_Lost = round(100 * Records_Lost / Raw_N, 2),
    # Flag problematic values
    Min_Flag = case_when(
      Trait == "AFC (months)" & Raw_Min < 0 ~ TRUE,
      Trait == "Days Open" & Raw_Min < 0 ~ TRUE,
      Trait == "Days in Milk (DIM)" & Raw_Min < 0 ~ TRUE,
      TRUE ~ FALSE
    ),
    Max_Flag = case_when(
      Trait == "AFC (months)" & Raw_Max > 100 ~ TRUE,
      Trait == "Calving Interval (days)" & Raw_Max > 1000 ~ TRUE,
      Trait == "Total Lactation Yield (kg)" & Raw_Max > 20000 ~ TRUE,
      Trait == "Days Open" & Raw_Max > 1000 ~ TRUE,
      Trait == "Days in Milk (DIM)" & Raw_Max > 1000 ~ TRUE,
      TRUE ~ FALSE
    ),
    Loss_Flag = Pct_Lost > 20
  )

# Display table with color coding
kable(comparison %>% select(-Min_Flag, -Max_Flag, -Loss_Flag), 
      caption = "Table 3: Comparison of Raw vs Cleaned Statistics",
      col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", "Raw SD",
                    "Filter", "Clean N", "Clean Mean", "Clean SD", "N Lost", "% Lost"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, color = ifelse(comparison$Min_Flag, "red", "black"),
              bold = ifelse(comparison$Min_Flag, TRUE, FALSE)) %>%
  column_spec(4, color = ifelse(comparison$Max_Flag, "red", "black"),
              bold = ifelse(comparison$Max_Flag, TRUE, FALSE)) %>%
  column_spec(12, color = ifelse(comparison$Loss_Flag, "red", "darkgreen"),
              bold = TRUE)

Table 3: Comparison of Raw vs Cleaned Statistics
Trait	Raw N	Raw Min	Raw Max	Raw Mean	Raw SD	Filter	Clean N	Clean Mean	Clean SD	N Lost	% Lost
AFC (months)	33,015	-394	24,295.0	50.96	364.55	18 to 72	28,229	39.36	11.89	4,786	14.50
Calving Interval (days)	50,463	0	16,530.0	521.70	342.46	300 to 700	41,248	433.45	92.15	9,215	18.26
Lactation Length (days)	50,707	0	669,342.0	535.21	2,990.38	60 to 500	33,573	386.08	60.86	17,134	33.79
Total Lactation Yield (kg)	88,347	0	23,186.4	112.02	642.21	>0 to 15,000	37,184	261.79	927.82	51,163	57.91
Days Open	50,710	-280	669,062.0	255.21	2,990.29	21 to 400	40,802	150.28	87.90	9,908	19.54
Daily Milk Yield (kg)	1,048,546	0	50.0	5.51	6.99	0.5 to 50	675,193	8.54	7.08	373,353	35.61
Days in Milk (DIM)	1,048,575	-729,939	7,752.0	220.35	2,008.49	5 to 500	957,939	169.61	112.75	90,636	8.64
Parity	87,990	1	32.0	2.42	1.82	1 to 12	87,927	2.41	1.76	63	0.07

📊 Column Descriptions for Table 3:

Column	Description	What to Look For
Trait	Variable being analyzed	-
Raw N	Number of records BEFORE filtering	Total available data
Raw Min	Minimum value in raw data	🔴 Red = impossible value (e.g., negative)
Raw Max	Maximum value in raw data	🔴 Red = extreme outlier
Raw Mean	Average BEFORE filtering	⚠️ Biased by outliers - don’t report this!
Raw SD	Standard deviation before filtering	Large SD may indicate outliers
Filter	Acceptable biological range	Values used to clean data
Clean N	Number of records AFTER filtering	Data available for analysis
Clean Mean	Average AFTER filtering	✅ Report this value!
Clean SD	Standard deviation after filtering	Use for reporting
N Lost	Records removed by filtering	How many outliers existed
% Lost	Percentage removed	🔴 Red if >20% = major data quality issue

🚨 Major Data Quality Issues Identified:

major_issues <- comparison %>% filter(Pct_Lost > 20)

if(nrow(major_issues) > 0) {
  cat("| Trait | % Lost | Interpretation |\n")
  cat("|-------|--------|----------------|\n")
  for(i in 1:nrow(major_issues)) {
    interpretation <- case_when(
      major_issues$Trait[i] == "Total Lactation Yield (kg)" ~ "Many incomplete lactations or zeros recorded as missing",
      major_issues$Trait[i] == "Daily Milk Yield (kg)" ~ "Many zero yields - may be dry cows or missing data",
      major_issues$Trait[i] == "Calving Interval (days)" ~ "Many extreme values - date recording issues",
      TRUE ~ "Significant outliers present"
    )
    cat(paste0("| **", major_issues$Trait[i], "** | ", major_issues$Pct_Lost[i], "% | ", interpretation, " |\n"))
  }
}

Trait	% Lost	Interpretation
Lactation Length (days)	33.79%	Significant outliers present
Total Lactation Yield (kg)	57.91%	Many incomplete lactations or zeros recorded as missing
Daily Milk Yield (kg)	35.61%	Many zero yields - may be dry cows or missing data

Impact: When >20% of records are removed, this affects:

Statistical power - fewer records for analysis
Representativeness - are we losing certain farms/breeds?
Genetic evaluations - missing data reduces accuracy

Recommendation: Investigate WHY these records have problematic values before simply deleting them.

4 Age at First Calving (AFC) Analysis

4.1 Distribution by Category

# AFC data with categories
afc_data <- lactation %>%
  filter(parity == 1, !is.na(afc_months)) %>%
  mutate(
    afc_category = case_when(
      afc_months < 0 ~ "Negative (ERROR)",
      afc_months >= 0 & afc_months < 18 ~ "Too Young (<18 mo)",
      afc_months >= 18 & afc_months <= 36 ~ "Optimal (18-36 mo)",
      afc_months > 36 & afc_months <= 72 ~ "Late (37-72 mo)",
      afc_months > 72 & afc_months <= 100 ~ "Very Late (73-100 mo)",
      afc_months > 100 ~ "Extreme (>100 mo)"
    ),
    afc_category = factor(afc_category, levels = c("Negative (ERROR)", "Too Young (<18 mo)", 
                                                    "Optimal (18-36 mo)", "Late (37-72 mo)",
                                                    "Very Late (73-100 mo)", "Extreme (>100 mo)"))
  )

# Summary table
afc_summary <- afc_data %>%
  group_by(afc_category) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months), 1),
    Max = round(max(afc_months), 1),
    Mean = round(mean(afc_months), 2),
    SD = round(sd(afc_months), 2),
    Median = round(median(afc_months), 1),
    .groups = "drop"
  ) %>%
  mutate(
    Percentage = round(100 * N / sum(N), 2),
    Status = case_when(
      afc_category == "Negative (ERROR)" ~ "🔴 ERROR",
      afc_category == "Too Young (<18 mo)" ~ "🟠 WARNING",
      afc_category == "Optimal (18-36 mo)" ~ "🟢 OPTIMAL",
      afc_category == "Late (37-72 mo)" ~ "🟡 ACCEPTABLE",
      afc_category == "Very Late (73-100 mo)" ~ "🟠 WARNING",
      afc_category == "Extreme (>100 mo)" ~ "🔴 ERROR"
    )
  )

kable(afc_summary, 
      caption = "Table 4: AFC Distribution by Category with Quality Status",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 4: AFC Distribution by Category with Quality Status
afc_category	N	Min	Max	Mean	SD	Median	Percentage	Status
Negative (ERROR)	127	-394	-1	-102.76	106.54	-56	0.38	🔴 ERROR \|
Too Young (<18 mo)	683	0	17	5.74	6.30	3	2.07	🟠 WARNING \|
Optimal (18-36 mo)	14,700	18	36	30.33	3.85	31	44.53	🟢 OPTIMAL \|
Late (37-72 mo)	13,529	37	72	49.17	9.68	47	40.98	🟡 ACCEPTABLE \|
Very Late (73-100 mo)	2,495	73	100	84.47	8.11	84	7.56	🟠 WARNING \|
Extreme (>100 mo)	1,481	101	24,295	249.59	1,706.99	117	4.49	🔴 ERROR \|

📊 Understanding AFC Categories:

Category	Range	Status	Explanation
Negative	< 0 months	🔴 ERROR	Impossible - birthdate after calving date
Too Young	0-17 months	🟠 WARNING	Biologically unlikely - heifers not sexually mature
Optimal	18-36 months	🟢 OPTIMAL	Target range for dairy cattle
Late	37-72 months	🟡 ACCEPTABLE	Possible but economically suboptimal
Very Late	73-100 months	🟠 WARNING	Unusual - verify data
Extreme	>100 months	🔴 ERROR	Impossible - >8 years at first calving

Why AFC Matters:

Too early AFC → underdeveloped heifers, calving difficulties
Optimal AFC (24-30 mo) → balance of heifer development and economic returns
Late AFC → increased rearing costs, delayed milk production

# Plot 1: Full distribution
p1 <- ggplot(afc_data, aes(x = afc_months)) +
  geom_histogram(aes(fill = afc_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "#D32F2F",
    "Too Young (<18 mo)" = "#FF9800",
    "Optimal (18-36 mo)" = "#4CAF50",
    "Late (37-72 mo)" = "#FFC107",
    "Very Late (73-100 mo)" = "#FF5722",
    "Extreme (>100 mo)" = "#9C27B0"
  )) +
  geom_vline(xintercept = c(0, 18, 36, 72), linetype = "dashed", color = "black", linewidth = 0.5) +
  annotate("text", x = 27, y = Inf, label = "Optimal\nRange", vjust = 1.5, size = 3, color = "darkgreen") +
  labs(
    title = "Figure 1: Age at First Calving - Complete Distribution",
    subtitle = paste0("Total N = ", format(nrow(afc_data), big.mark = ","), 
                      " | Raw Mean = ", round(mean(afc_data$afc_months), 1), " months",
                      " | Note: Y-axis is log scale to show outliers"),
    x = "AFC (months)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  theme(legend.position = "bottom") +
  scale_y_log10(labels = comma)

# Plot 2: Cleaned data only
afc_clean <- afc_data %>% filter(afc_months >= 18, afc_months <= 72)
p2 <- ggplot(afc_clean, aes(x = afc_months)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "black", linewidth = 0.2) +
  geom_vline(xintercept = mean(afc_clean$afc_months), linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(afc_clean$afc_months) + 2, y = Inf, 
           label = paste0("Clean Mean = ", round(mean(afc_clean$afc_months), 1), " months"),
           vjust = 2, hjust = 0, color = "red", fontface = "bold") +
  labs(
    title = "Figure 2: AFC - Cleaned Data Only (18-72 months)",
    subtitle = paste0("Clean N = ", format(nrow(afc_clean), big.mark = ","),
                      " (", round(100*nrow(afc_clean)/nrow(afc_data), 1), "% of total)",
                      " | Mean = ", round(mean(afc_clean$afc_months), 1),
                      " ± ", round(sd(afc_clean$afc_months), 1), " months"),
    x = "AFC (months)",
    y = "Count"
  )

grid.arrange(p1, p2, ncol = 1, heights = c(1.2, 1))

📈 How to Read These Figures:

Figure 1 (All Data):

Shows COMPLETE distribution including all errors and outliers
Y-axis is LOG scale - necessary to see both common values and rare extremes
Dashed vertical lines mark category boundaries (0, 18, 36, 72 months)
Colors show data quality: 🟢 Green = good, 🔴 Red = errors

Figure 2 (Cleaned Data):

Shows ONLY records within acceptable range (18-72 months)
Red dashed line = mean of cleaned data
Subtitle shows: sample size retained, percentage of total, and mean ± SD
This is what you should report in publications!

Key Insight: The raw mean is pulled upward by extreme outliers. Always report the CLEANED mean.

4.2 AFC by Breed

# AFC by breed - CLEANED
afc_by_breed <- lactation %>%
  filter(parity == 1, !is.na(afc_months), !is.na(main_breed_name),
         afc_months >= 18, afc_months <= 72) %>%
  group_by(main_breed_name) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months), 1),
    Max = round(max(afc_months), 1),
    Mean = round(mean(afc_months), 2),
    SD = round(sd(afc_months), 2),
    SE = round(sd(afc_months) / sqrt(n()), 2),
    Median = round(median(afc_months), 1),
    .groups = "drop"
  ) %>%
  filter(N >= 30) %>%
  arrange(Mean)

kable(afc_by_breed, 
      caption = "Table 5: AFC by Breed (Cleaned: 18-72 months, N ≥ 30)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 5: AFC by Breed (Cleaned: 18-72 months, N ≥ 30)
main_breed_name	N	Min	Max	Mean	SD	SE	Median
Scandinavian Red	50	18	38	26.84	4.51	0.64	25.5
Jersey crosses	70	18	52	30.11	6.17	0.74	29.5
Gir	199	21	72	31.07	7.48	0.53	30.0
Fleckvieh	55	21	62	31.49	8.06	1.09	30.0
Jersey	1,144	18	72	31.50	10.19	0.30	28.0
Holstein-Friesian crosses	1,255	18	72	35.78	7.02	0.20	35.0
Ayrshire crosses	88	18	72	36.75	10.89	1.16	34.0
Holstein Friesian	15,944	18	72	37.21	11.06	0.09	34.0
Sahiwal crosses	237	22	60	38.36	8.24	0.54	37.0
Boran	101	19	69	38.67	9.09	0.90	37.0
Unknown	43	24	71	40.09	14.00	2.13	36.0
Guernsey	574	19	72	41.44	12.62	0.53	38.0
Ayrshire	5,188	18	72	43.49	11.60	0.16	41.0
Brown Swiss	147	19	72	44.18	13.94	1.15	42.0
Sahiwal	2,875	18	72	49.27	11.44	0.21	49.0

📊 Column Descriptions for Table 5:

Column	Description
main_breed_name	Breed of the animal
N	Number of first-parity records (after filtering 18-72 mo)
Min	Minimum AFC for this breed (within filtered range)
Max	Maximum AFC for this breed (within filtered range)
Mean	✅ Average AFC - use this for reporting
SD	Standard Deviation - spread of AFC values
SE	Standard Error = SD/√N - precision of mean estimate
Median	Middle value - alternative to mean

Interpretation:

Lower Mean = earlier maturing breed (e.g., some crossbreds)
Higher Mean = later maturing or management issues
Lower SE = more precise estimate (larger sample size)
Breeds with N < 30 are excluded (unreliable estimates)

ggplot(afc_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.2, color = "black") +
  geom_text(aes(label = paste0(round(Mean, 1), " mo\n(n=", format(N, big.mark=","), ")")), 
            hjust = -0.1, size = 3) +
  geom_hline(yintercept = 30, linetype = "dashed", color = "darkgreen", linewidth = 1) +
  annotate("text", x = 0.5, y = 31, label = "Target: 30 months", hjust = 0, color = "darkgreen", size = 3) +
  coord_flip() +
  labs(
    title = "Figure 3: Mean AFC by Breed (Cleaned Data: 18-72 months)",
    subtitle = "Error bars = ± 1 Standard Error | Green line = target AFC of 30 months",
    x = "Breed",
    y = "Mean AFC (months)"
  ) +
  scale_y_continuous(limits = c(0, max(afc_by_breed$Mean + afc_by_breed$SE) * 1.5))

📈 How to Read Figure 3:

Bar height = Mean AFC for that breed
Error bars = ± 1 Standard Error
- Shorter error bars = more precise estimate (larger N)
- If error bars don’t overlap between breeds, difference may be significant
Green dashed line = Target AFC of 30 months
- Bars below this line = good (earlier calving)
- Bars above this line = room for improvement
Labels show exact mean and sample size

5 Daily Milk Yield Analysis

5.1 Distribution by Category

# Daily yield categories
dmy_data <- milking %>%
  filter(!is.na(daily_yield)) %>%
  mutate(
    yield_category = case_when(
      daily_yield < 0 ~ "Negative (ERROR)",
      daily_yield == 0 ~ "Zero Yield",
      daily_yield > 0 & daily_yield < 0.5 ~ "Very Low (<0.5 kg)",
      daily_yield >= 0.5 & daily_yield <= 10 ~ "Low (0.5-10 kg)",
      daily_yield > 10 & daily_yield <= 20 ~ "Medium (10-20 kg)",
      daily_yield > 20 & daily_yield <= 30 ~ "Good (20-30 kg)",
      daily_yield > 30 & daily_yield <= 50 ~ "High (30-50 kg)",
      daily_yield > 50 ~ "Extreme (>50 kg)"
    ),
    yield_category = factor(yield_category, levels = c("Negative (ERROR)", "Zero Yield", 
                                                        "Very Low (<0.5 kg)", "Low (0.5-10 kg)",
                                                        "Medium (10-20 kg)", "Good (20-30 kg)",
                                                        "High (30-50 kg)", "Extreme (>50 kg)"))
  )

# Summary
dmy_summary <- dmy_data %>%
  group_by(yield_category) %>%
  summarise(
    N = n(),
    Min = round(min(daily_yield), 2),
    Max = round(max(daily_yield), 2),
    Mean = round(mean(daily_yield), 2),
    SD = round(sd(daily_yield), 2),
    .groups = "drop"
  ) %>%
  mutate(
    Percentage = round(100 * N / sum(N), 2),
    Status = case_when(
      yield_category == "Negative (ERROR)" ~ "🔴 ERROR",
      yield_category == "Zero Yield" ~ "🟠 CHECK",
      yield_category == "Very Low (<0.5 kg)" ~ "🟠 WARNING",
      yield_category %in% c("Low (0.5-10 kg)", "Medium (10-20 kg)", "Good (20-30 kg)") ~ "🟢 NORMAL",
      yield_category == "High (30-50 kg)" ~ "🔵 HIGH",
      yield_category == "Extreme (>50 kg)" ~ "🟣 VERIFY"
    )
  )

kable(dmy_summary, 
      caption = "Table 6: Daily Milk Yield Distribution by Category",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)

Table 6: Daily Milk Yield Distribution by Category
yield_category	N	Min	Max	Mean	SD	Percentage	Status
Zero Yield	333,069	0.00	0.00	0.00	0.00	31.76	🟠 CHECK \|
Very Low (<0.5 kg)	40,284	0.01	0.49	0.28	0.12	3.84	🟠 WARNING \|
Low (0.5-10 kg)	393,426	0.50	10.00	3.43	3.11	37.52	🟢 NORMAL \|
Medium (10-20 kg)	237,650	10.01	20.00	14.23	2.69	22.66	🟢 NORMAL \|
Good (20-30 kg)	42,181	20.10	30.00	23.08	2.45	4.02	🟢 NORMAL \|
High (30-50 kg)	1,936	30.10	50.00	32.68	2.83	0.18	🔵 HIGH \|

📊 Understanding Milk Yield Categories:

Category	Range	Status	Explanation
Negative	< 0 kg	🔴 ERROR	Impossible - data entry error
Zero	= 0 kg	🟠 CHECK	May be: dry cow, sick, or missing data recorded as 0
Very Low	0-0.5 kg	🟠 WARNING	Unusual - verify recording
Low	0.5-10 kg	🟢 NORMAL	Typical for local/indigenous breeds or late lactation
Medium	10-20 kg	🟢 NORMAL	Good production for crossbreds
Good	20-30 kg	🟢 NORMAL	High production crossbreds or pure exotics
High	30-50 kg	🔵 HIGH	Elite producers - verify if correct
Extreme	> 50 kg	🟣 VERIFY	Very rare - likely error unless elite Holstein

Common Reasons for Zero Yields:

Dry period - cow is not being milked before next calving
Sick animal - temporarily not milking
Missing data - no recording made but 0 entered instead of NA
Data entry error - value not recorded properly

# Plot 1: All data
p1 <- ggplot(dmy_data, aes(x = daily_yield)) +
  geom_histogram(aes(fill = yield_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "#D32F2F",
    "Zero Yield" = "#757575",
    "Very Low (<0.5 kg)" = "#FF9800",
    "Low (0.5-10 kg)" = "#8BC34A",
    "Medium (10-20 kg)" = "#4CAF50",
    "Good (20-30 kg)" = "#2E7D32",
    "High (30-50 kg)" = "#2196F3",
    "Extreme (>50 kg)" = "#9C27B0"
  )) +
  labs(
    title = "Figure 4: Daily Milk Yield - Complete Distribution",
    subtitle = paste0("Total N = ", format(nrow(dmy_data), big.mark = ","),
                      " | Note: Large spike at zero - investigate these records"),
    x = "Daily Yield (kg)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

# Plot 2: Cleaned data
clean_dmy <- dmy_data %>% filter(daily_yield >= 0.5, daily_yield <= 50)
p2 <- ggplot(clean_dmy, aes(x = daily_yield)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black", linewidth = 0.1) +
  geom_vline(xintercept = mean(clean_dmy$daily_yield), linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(clean_dmy$daily_yield) + 1, y = Inf, 
           label = paste0("Clean Mean = ", round(mean(clean_dmy$daily_yield), 2), " kg/day"),
           vjust = 2, hjust = 0, color = "red", fontface = "bold", size = 4) +
  labs(
    title = "Figure 5: Daily Milk Yield - Cleaned Data (0.5-50 kg)",
    subtitle = paste0("Clean N = ", format(nrow(clean_dmy), big.mark = ","),
                      " (", round(100*nrow(clean_dmy)/nrow(dmy_data), 1), "% retained)",
                      " | Mean = ", round(mean(clean_dmy$daily_yield), 2),
                      " ± ", round(sd(clean_dmy$daily_yield), 2), " kg/day"),
    x = "Daily Yield (kg)",
    y = "Count"
  )

grid.arrange(p1, p2, ncol = 1)

⚠️ Important Finding: High Percentage of Zero Yields

n_zeros <- sum(dmy_data$daily_yield == 0)
pct_zeros <- round(100 * n_zeros / nrow(dmy_data), 1)

cat(paste0("
**", format(n_zeros, big.mark=","), " records (", pct_zeros, "%) have zero daily yield.**

This needs investigation:

1. **Are these dry cows?** Check if DIM is near end of lactation
2. **Are these sick animals?** Cross-reference with health records
3. **Is this missing data?** Should these be NA instead of 0?
4. **Project-specific issue?** Check if zeros concentrated in certain projects

**Recommendation:** Do NOT simply delete zeros - investigate the cause first!
"))

333,069 records (31.8%) have zero daily yield.

This needs investigation:

Are these dry cows? Check if DIM is near end of lactation
Are these sick animals? Cross-reference with health records
Is this missing data? Should these be NA instead of 0?
Project-specific issue? Check if zeros concentrated in certain projects

Recommendation: Do NOT simply delete zeros - investigate the cause first!

6 Summary and Recommendations

6.1 Final Quality Assessment

# Create status indicators
comparison <- comparison %>%
  mutate(
    Quality_Status = case_when(
      Pct_Lost < 5 ~ "🟢 Good",
      Pct_Lost < 10 ~ "🟡 Acceptable",
      Pct_Lost < 20 ~ "🟠 Warning",
      TRUE ~ "🔴 Major Issue"
    )
  )

kable(comparison %>% select(Trait, Raw_N, Raw_Min, Raw_Max, Raw_Mean, 
                             Filter_Applied, Clean_N, Clean_Mean, Clean_SD, 
                             Records_Lost, Pct_Lost, Quality_Status), 
      caption = "Table 7: Final Data Quality Summary",
      col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", 
                    "Filter", "Clean N", "Clean Mean", "Clean SD", 
                    "N Lost", "% Lost", "Status"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, color = ifelse(comparison$Min_Flag, "red", "black"),
              bold = ifelse(comparison$Min_Flag, TRUE, FALSE)) %>%
  column_spec(4, color = ifelse(comparison$Max_Flag, "red", "black"),
              bold = ifelse(comparison$Max_Flag, TRUE, FALSE)) %>%
  column_spec(11, color = ifelse(comparison$Loss_Flag, "red", "darkgreen"),
              bold = TRUE)

Table 7: Final Data Quality Summary
Trait	Raw N	Raw Min	Raw Max	Raw Mean	Filter	Clean N	Clean Mean	Clean SD	N Lost	% Lost	Status
AFC (months)	33,015	-394	24,295.0	50.96	18 to 72	28,229	39.36	11.89	4,786	14.50	🟠 Warning \|
Calving Interval (days)	50,463	0	16,530.0	521.70	300 to 700	41,248	433.45	92.15	9,215	18.26	🟠 Warning \|
Lactation Length (days)	50,707	0	669,342.0	535.21	60 to 500	33,573	386.08	60.86	17,134	33.79	🔴 Major Issue \|
Total Lactation Yield (kg)	88,347	0	23,186.4	112.02	>0 to 15,000	37,184	261.79	927.82	51,163	57.91	🔴 Major Issue \|
Days Open	50,710	-280	669,062.0	255.21	21 to 400	40,802	150.28	87.90	9,908	19.54	🟠 Warning \|
Daily Milk Yield (kg)	1,048,546	0	50.0	5.51	0.5 to 50	675,193	8.54	7.08	373,353	35.61	🔴 Major Issue \|
Days in Milk (DIM)	1,048,575	-729,939	7,752.0	220.35	5 to 500	957,939	169.61	112.75	90,636	8.64	🟡 Acceptable \|
Parity	87,990	1	32.0	2.42	1 to 12	87,927	2.41	1.76	63	0.07	🟢 Good \|

📊 How to Use Table 7:

What to Report	Column to Use	Notes
Sample size	Clean N	After removing outliers
Average value	Clean Mean	NOT Raw Mean (biased by outliers)
Variation	Clean SD	Standard deviation of cleaned data
Data quality	% Lost & Status	Document in methods section

Status Legend:

🟢 Good (< 5% lost) - Minor outliers, data is reliable
🟡 Acceptable (5-10% lost) - Some outliers, proceed with caution
🟠 Warning (10-20% lost) - Significant outliers, investigate cause
🔴 Major Issue (> 20% lost) - Serious data quality problem, investigate before use

6.2 Key Findings

🚨 Critical Issues Requiring Attention:

cat("
| Issue | Severity | Records Affected | Action Required |
|-------|----------|------------------|-----------------|
| **Negative AFC values** | 🔴 Critical | Check birthdate/calving date | Fix or remove |
| **Negative DIM values** | 🔴 Critical | Milking date before calving | Fix dates |
| **Zero total lactation yields** | 🔴 Major | ~58% of records | Investigate - incomplete data? |
| **Zero daily yields** | 🟠 Warning | ~32% of records | Check if dry cows or missing |
| **Extreme AFC (>100 mo)** | 🔴 Critical | Data entry errors | Remove |
| **Negative Days Open** | 🔴 Critical | Conception before calving | Fix dates |
")

Issue	Severity	Records Affected	Action Required
Negative AFC values	🔴 Critical	Check birthdate/calving date	Fix or remove
Negative DIM values	🔴 Critical	Milking date before calving	Fix dates
Zero total lactation yields	🔴 Major	~58% of records	Investigate - incomplete data?
Zero daily yields	🟠 Warning	~32% of records	Check if dry cows or missing
Extreme AFC (>100 mo)	🔴 Critical	Data entry errors	Remove
Negative Days Open	🔴 Critical	Conception before calving	Fix dates

6.3 Recommendations

✅ Recommended Data Cleaning Workflow:

Step 1: Remove Impossible Values

# Remove biologically impossible values
data_clean <- data %>%
  filter(
    afc_months >= 0,              # No negative ages
    dim >= 0,                      # No negative DIM
    days_open >= 0,                # No negative days open
    daily_yield >= 0               # No negative yields
  )

Step 2: Apply Biological Filters

# Keep only biologically plausible values
data_clean <- data_clean %>%
  filter(
    afc_months >= 18 & afc_months <= 72,
    calving_interval_days >= 300 & calving_interval_days <= 700,
    daily_yield >= 0.5 & daily_yield <= 50,
    dim >= 5 & dim <= 500,
    days_open >= 21 & days_open <= 400,
    total_lactation_yield > 0 & total_lactation_yield <= 15000
  )

Step 3: Document What You Did

# Always report:
cat("Original records:", nrow(data_raw))
cat("After cleaning:", nrow(data_clean))
cat("Records removed:", nrow(data_raw) - nrow(data_clean))
cat("Filters applied: AFC 18-72 mo, CI 300-700 d, etc.")

📝 What to Report in Publications:

When reporting results, always include:

Sample size after cleaning (Clean N)
Mean ± SD from cleaned data
The filters applied (e.g., “AFC values outside 18-72 months were excluded”)
Percentage of data removed (for transparency)

Example Methods Statement:

“Age at first calving (AFC) was calculated for first-parity animals. Records with AFC outside the biologically plausible range of 18-72 months were excluded (N = X, Y% of records), resulting in a final sample of N = Z animals. Mean AFC was X.X ± Y.Y months.”

7 Session Info

sessionInfo()

## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_Kenya.utf8  LC_CTYPE=English_Kenya.utf8   
## [3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=English_Kenya.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5     viridisLite_0.4.2 kableExtra_1.4.0  knitr_1.50       
##  [5] gridExtra_2.3     scales_1.4.0      ggplot2_3.5.2     tidyr_1.3.1      
##  [9] dplyr_1.1.4       data.table_1.17.8
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] xml2_1.4.0         stringr_1.5.1      jquerylib_0.1.4    textshaping_1.0.2 
##  [9] systemfonts_1.2.3  yaml_2.3.10        fastmap_1.2.0      R6_2.6.1          
## [13] labeling_0.4.3     generics_0.1.4     tibble_3.3.0       svglite_2.2.1     
## [17] bslib_0.9.0        pillar_1.11.0      RColorBrewer_1.1-3 rlang_1.1.6       
## [21] stringi_1.8.7      cachem_1.1.0       xfun_0.52          sass_0.4.10       
## [25] cli_3.6.5          withr_3.0.2        magrittr_2.0.3     digest_0.6.37     
## [29] grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
## [37] purrr_1.1.0        tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1

Report generated on 08 December 2025 at 12:39

Kenya ADGG/TDGG Dairy Data Quality Assessment

Outlier Detection & Data Anomaly Report

Set Gebreyohannes - ILRI

08 December 2025