# Load required packages
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)

# Set theme
theme_set(theme_bw(base_size = 12))

Executive Summary

0.1 Purpose of This Report

This report assesses the data quality of the Kenya ADGG/TDGG dairy datasets by:

  1. Identifying outliers - values that are statistically unusual
  2. Detecting errors - values that are biologically impossible (e.g., negative ages)
  3. Quantifying data loss - how many records are lost when we apply quality filters
  4. Providing recommendations - suggested filters and cleaning procedures

0.2 Color Coding Used in This Report

Throughout this report, we use color coding to highlight data quality issues:

Color Meaning
πŸ”΄ Red Error or impossible value (e.g., negative AFC, negative DIM)
🟠 Orange Warning - unusual but possible (e.g., AFC < 18 months)
🟒 Green Normal/acceptable range
πŸ”΅ Blue High but acceptable (e.g., daily yield 30-50 kg)
🟣 Purple Extreme outlier - needs verification

1 Data Loading

# SET YOUR FILE PATH HERE
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/"

# Read data - treating "NULL" as NA
lactation <- fread(paste0(data_path, "lactation_calving.csv"), 
                   na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                   stringsAsFactors = FALSE)

milking <- fread(paste0(data_path, "milking_data.csv"), 
                 na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
                 stringsAsFactors = FALSE)

# Convert numeric columns
lactation <- lactation %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    calving_interval_days = as.numeric(calving_interval_days),
    calving_interval_months = as.numeric(calving_interval_months),
    lactation_length_days = as.numeric(lactation_length_days),
    total_lactation_yield = as.numeric(total_lactation_yield),
    testday_count = as.numeric(testday_count),
    days_open = as.numeric(days_open)
  )

milking <- milking %>%
  mutate(
    parity = as.numeric(parity),
    afc_months = as.numeric(afc_months),
    afc_days = as.numeric(afc_days),
    dim = as.numeric(dim),
    daily_yield = as.numeric(daily_yield),
    milk_morning = as.numeric(milk_morning),
    milk_evening = as.numeric(milk_evening),
    milk_midday = as.numeric(milk_midday),
    testday_no = as.numeric(testday_no),
    calving_age_months = as.numeric(calving_age_months)
  )

Datasets Loaded:

Dataset Records Description
Lactation 88,352 One record per lactation (calving event)
Milking 1,048,575 One record per test-day milk recording

What is a β€œtest-day”? A test-day is a single day when milk production was recorded. Each lactation typically has multiple test-days (monthly recordings throughout the lactation).


2 Raw Data Statistics

2.1 Why Look at Raw Statistics First?

Important: Before cleaning data, we must examine the raw statistics (Min, Max, Mean) to:

  1. Identify impossible values - e.g., negative ages indicate calculation errors
  2. Detect extreme outliers - e.g., AFC of 24,000 months is clearly wrong
  3. Understand the data range - see what values exist before filtering
  4. Quantify the problem - know how many records need cleaning

The Min and Max columns are especially important for detecting errors!

# Function to calculate comprehensive stats
calc_stats <- function(x, var_name) {
  x_valid <- x[!is.na(x)]
  n_total <- length(x)
  n_valid <- length(x_valid)
  n_missing <- sum(is.na(x))
  
  if (n_valid == 0) {
    return(data.frame(
      Variable = var_name,
      N_Total = n_total,
      N_Valid = 0,
      N_Missing = n_missing,
      Pct_Missing = 100,
      Min = NA, Max = NA, Mean = NA, SD = NA, Median = NA
    ))
  }
  
  data.frame(
    Variable = var_name,
    N_Total = n_total,
    N_Valid = n_valid,
    N_Missing = n_missing,
    Pct_Missing = round(100 * n_missing / n_total, 2),
    Min = round(min(x_valid), 2),
    Max = round(max(x_valid), 2),
    Mean = round(mean(x_valid), 2),
    SD = round(sd(x_valid), 2),
    Median = round(median(x_valid), 2)
  )
}

# Create subsets first
afc_parity1 <- lactation$afc_months[which(lactation$parity == 1)]
ci_parity_gt1 <- lactation$calving_interval_days[which(lactation$parity > 1)]

lact_stats_raw <- rbind(
  calc_stats(afc_parity1, "AFC (months)"),
  calc_stats(ci_parity_gt1, "Calving Interval (days)"),
  calc_stats(lactation$lactation_length_days, "Lactation Length (days)"),
  calc_stats(lactation$total_lactation_yield, "Total Lactation Yield (kg)"),
  calc_stats(lactation$days_open, "Days Open"),
  calc_stats(lactation$parity, "Parity")
)

# Determine which rows have problematic Min or Max values
# For color coding
lact_stats_raw <- lact_stats_raw %>%
  mutate(
    Min_Problem = case_when(
      Variable == "AFC (months)" & Min < 0 ~ TRUE,
      Variable == "Calving Interval (days)" & Min < 280 ~ TRUE,
      Variable == "Days Open" & Min < 0 ~ TRUE,
      Variable == "Parity" & Min < 1 ~ TRUE,
      TRUE ~ FALSE
    ),
    Max_Problem = case_when(
      Variable == "AFC (months)" & Max > 100 ~ TRUE,
      Variable == "Calving Interval (days)" & Max > 730 ~ TRUE,
      Variable == "Total Lactation Yield (kg)" & Max > 20000 ~ TRUE,
      Variable == "Days Open" & Max > 500 ~ TRUE,
      Variable == "Parity" & Max > 15 ~ TRUE,
      TRUE ~ FALSE
    )
  )

kable(lact_stats_raw %>% select(-Min_Problem, -Max_Problem), 
      caption = "Table 1: Lactation Data - Raw Statistics (Before Any Cleaning)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = ifelse(lact_stats_raw$Min_Problem, "red", "black"),
              bold = ifelse(lact_stats_raw$Min_Problem, TRUE, FALSE)) %>%
  column_spec(7, color = ifelse(lact_stats_raw$Max_Problem, "red", "black"),
              bold = ifelse(lact_stats_raw$Max_Problem, TRUE, FALSE))
Table 1: Lactation Data - Raw Statistics (Before Any Cleaning)
Variable N_Total N_Valid N_Missing Pct_Missing Min Max Mean SD Median
AFC (months) 37,236 33,015 4,221 11.34 -394 24,295.0 50.96 364.55 38
Calving Interval (days) 50,754 50,463 291 0.57 0 16,530.0 521.70 342.46 424
Lactation Length (days) 88,352 50,707 37,645 42.61 0 669,342.0 535.21 2,990.38 424
Total Lactation Yield (kg) 88,352 88,347 5 0.01 0 23,186.4 112.02 642.21 0
Days Open 88,352 50,710 37,642 42.60 -280 669,062.0 255.21 2,990.29 144
Parity 88,352 87,990 362 0.41 1 32.0 2.42 1.82 2

πŸ“Š Column Descriptions for Table 1:

Column What It Means How to Interpret
Variable The trait being measured -
N_Total Total records in this subset Higher = more data available
N_Valid Records with actual values (not missing) Should be close to N_Total
N_Missing Records with missing (NA) values High numbers indicate data collection gaps
Pct_Missing % of records that are missing >20% is concerning
Min πŸ”΄ Smallest value found Negative values = ERROR
Max πŸ”΄ Largest value found Extreme values = likely ERROR
Mean Average of all valid values Affected by outliers
SD Standard Deviation (spread) Large SD suggests high variability or outliers
Median Middle value (50th percentile) Less affected by outliers than Mean

πŸ”΄ Values shown in RED indicate problems that need attention!

🚨 Critical Issues Found in Lactation Data:

# AFC issues
afc_min <- round(min(afc_parity1, na.rm = TRUE), 1)
afc_max <- round(max(afc_parity1, na.rm = TRUE), 1)

# CI issues
ci_min <- round(min(ci_parity_gt1, na.rm = TRUE), 1)
ci_max <- round(max(ci_parity_gt1, na.rm = TRUE), 1)

# Days open issues
do_min <- round(min(lactation$days_open, na.rm = TRUE), 1)
do_max <- round(max(lactation$days_open, na.rm = TRUE), 1)

cat(paste0("
| Trait | Problem Value | Why It's Wrong |
|-------|---------------|----------------|
| **AFC** | Min = ", afc_min, " months | πŸ”΄ **Negative age is impossible!** Birthdate must be before calving date |
| **AFC** | Max = ", format(afc_max, big.mark=','), " months | πŸ”΄ **", round(afc_max/12, 0), " years old at first calving is impossible!** |
| **Calving Interval** | Max = ", format(ci_max, big.mark=','), " days | πŸ”΄ **", round(ci_max/365, 1), " years between calvings is extreme** |
| **Days Open** | Min = ", do_min, " days | πŸ”΄ **Negative days open is impossible!** |
| **Days Open** | Max = ", format(do_max, big.mark=','), " days | πŸ”΄ **", round(do_max/365, 1), " years to conception is extreme** |
"))
Trait Problem Value Why It’s Wrong
AFC Min = -394 months πŸ”΄ Negative age is impossible! Birthdate must be before calving date
AFC Max = 24,295 months πŸ”΄ 2025 years old at first calving is impossible!
Calving Interval Max = 16,530 days πŸ”΄ 45.3 years between calvings is extreme
Days Open Min = -280 days πŸ”΄ Negative days open is impossible!
Days Open Max = 669,062 days πŸ”΄ 1833 years to conception is extreme
# Milking traits - RAW
milk_stats_raw <- rbind(
  calc_stats(milking$daily_yield, "Daily Milk Yield (kg)"),
  calc_stats(milking$milk_morning, "Morning Milk (kg)"),
  calc_stats(milking$milk_evening, "Evening Milk (kg)"),
  calc_stats(milking$dim, "Days in Milk (DIM)"),
  calc_stats(milking$parity, "Parity"),
  calc_stats(milking$afc_months, "AFC (months)")
)

# Flag problems
milk_stats_raw <- milk_stats_raw %>%
  mutate(
    Min_Problem = case_when(
      Variable == "Daily Milk Yield (kg)" & Min < 0 ~ TRUE,
      Variable == "Morning Milk (kg)" & Min < 0 ~ TRUE,
      Variable == "Evening Milk (kg)" & Min < 0 ~ TRUE,
      Variable == "Days in Milk (DIM)" & Min < 0 ~ TRUE,
      Variable == "AFC (months)" & Min < 0 ~ TRUE,
      TRUE ~ FALSE
    ),
    Max_Problem = case_when(
      Variable == "Daily Milk Yield (kg)" & Max > 60 ~ TRUE,
      Variable == "Days in Milk (DIM)" & Max > 1000 ~ TRUE,
      Variable == "AFC (months)" & Max > 100 ~ TRUE,
      TRUE ~ FALSE
    )
  )

kable(milk_stats_raw %>% select(-Min_Problem, -Max_Problem), 
      caption = "Table 2: Milking Data - Raw Statistics (Before Any Cleaning)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(6, color = ifelse(milk_stats_raw$Min_Problem, "red", "black"),
              bold = ifelse(milk_stats_raw$Min_Problem, TRUE, FALSE)) %>%
  column_spec(7, color = ifelse(milk_stats_raw$Max_Problem, "red", "black"),
              bold = ifelse(milk_stats_raw$Max_Problem, TRUE, FALSE))
Table 2: Milking Data - Raw Statistics (Before Any Cleaning)
Variable N_Total N_Valid N_Missing Pct_Missing Min Max Mean SD Median
Daily Milk Yield (kg) 1,048,575 1,048,546 29 0.00 0 50 5.51 6.99 1.38
Morning Milk (kg) 1,048,575 1,048,415 160 0.02 0 459 1.90 3.84 0.00
Evening Milk (kg) 1,048,575 1,048,413 162 0.02 -1 29 1.81 3.18 0.00
Days in Milk (DIM) 1,048,575 1,048,575 0 0.00 -729,939 7,752 220.35 2,008.49 168.00
Parity 1,048,575 1,048,403 172 0.02 1 32 2.66 1.99 2.00
AFC (months) 1,048,575 389,867 658,708 62.82 -288 270 38.29 19.43 33.00

🚨 Critical Issues Found in Milking Data:

dim_min <- round(min(milking$dim, na.rm = TRUE), 0)
dim_max <- round(max(milking$dim, na.rm = TRUE), 0)
eve_min <- round(min(milking$milk_evening, na.rm = TRUE), 2)

cat(paste0("
| Trait | Problem Value | Why It's Wrong |
|-------|---------------|----------------|
| **DIM** | Min = ", format(dim_min, big.mark=','), " days | πŸ”΄ **Extreme negative!** Milking date is ~2000 years before calving date! |
| **DIM** | Max = ", format(dim_max, big.mark=','), " days | πŸ”΄ **", round(dim_max/365, 1), " years in milk is impossible** |
| **Evening Milk** | Min = ", eve_min, " kg | πŸ”΄ **Negative milk yield is impossible** |
"))
Trait Problem Value Why It’s Wrong
DIM Min = -729,939 days πŸ”΄ Extreme negative! Milking date is ~2000 years before calving date!
DIM Max = 7,752 days πŸ”΄ 21.2 years in milk is impossible
Evening Milk Min = -1 kg πŸ”΄ Negative milk yield is impossible

What causes negative DIM? - DIM = Milking Date - Calving Date - Negative DIM means the milking was recorded BEFORE the calving date - This is a data entry error - dates were entered incorrectly


3 Raw vs Cleaned Data Comparison

3.1 What Does β€œCleaning” Mean?

Data Cleaning = Removing biologically implausible values

We apply filters to keep only values within acceptable biological ranges:

Trait Filter Range Biological Justification
AFC 18-72 months Heifers reach puberty ~12-15 mo; first calving typically 24-36 mo
Calving Interval 300-700 days Minimum ~280 days (gestation); >700 days indicates fertility problems
Daily Milk Yield 0.5-50 kg Typical range; >50 kg/day is rare even for elite Holsteins
DIM 5-500 days Standard lactation ~305 days; extended lactations possible
Days Open 21-400 days Minimum ~21 days postpartum; >400 days is poor fertility
Total Lactation Yield >0 to 15,000 kg Zero = incomplete data; >15,000 kg is exceptional
# Calculate each trait separately to avoid NA issues

# ============ AFC (Parity 1) ============
afc_raw <- lactation %>% filter(parity == 1, !is.na(afc_months))
afc_clean <- lactation %>% filter(parity == 1, !is.na(afc_months), afc_months >= 18, afc_months <= 72)

afc_row <- data.frame(
  Trait = "AFC (months)",
  Raw_N = nrow(afc_raw),
  Raw_Min = round(min(afc_raw$afc_months), 1),
  Raw_Max = round(max(afc_raw$afc_months), 1),
  Raw_Mean = round(mean(afc_raw$afc_months), 2),
  Raw_SD = round(sd(afc_raw$afc_months), 2),
  Filter_Applied = "18 to 72",
  Clean_N = nrow(afc_clean),
  Clean_Mean = round(mean(afc_clean$afc_months), 2),
  Clean_SD = round(sd(afc_clean$afc_months), 2)
)

# ============ Calving Interval (Parity > 1) ============
ci_raw <- lactation %>% filter(parity > 1, !is.na(calving_interval_days))
ci_clean <- lactation %>% filter(parity > 1, !is.na(calving_interval_days), 
                                  calving_interval_days >= 300, calving_interval_days <= 700)

ci_row <- data.frame(
  Trait = "Calving Interval (days)",
  Raw_N = nrow(ci_raw),
  Raw_Min = round(min(ci_raw$calving_interval_days), 1),
  Raw_Max = round(max(ci_raw$calving_interval_days), 1),
  Raw_Mean = round(mean(ci_raw$calving_interval_days), 2),
  Raw_SD = round(sd(ci_raw$calving_interval_days), 2),
  Filter_Applied = "300 to 700",
  Clean_N = nrow(ci_clean),
  Clean_Mean = round(mean(ci_clean$calving_interval_days), 2),
  Clean_SD = round(sd(ci_clean$calving_interval_days), 2)
)

# ============ Lactation Length ============
ll_raw <- lactation %>% filter(!is.na(lactation_length_days))
ll_clean <- lactation %>% filter(!is.na(lactation_length_days), 
                                  lactation_length_days >= 60, lactation_length_days <= 500)

ll_row <- data.frame(
  Trait = "Lactation Length (days)",
  Raw_N = nrow(ll_raw),
  Raw_Min = round(min(ll_raw$lactation_length_days), 1),
  Raw_Max = round(max(ll_raw$lactation_length_days), 1),
  Raw_Mean = round(mean(ll_raw$lactation_length_days), 2),
  Raw_SD = round(sd(ll_raw$lactation_length_days), 2),
  Filter_Applied = "60 to 500",
  Clean_N = nrow(ll_clean),
  Clean_Mean = round(mean(ll_clean$lactation_length_days), 2),
  Clean_SD = round(sd(ll_clean$lactation_length_days), 2)
)

# ============ Total Lactation Yield ============
tly_raw <- lactation %>% filter(!is.na(total_lactation_yield))
tly_clean <- lactation %>% filter(!is.na(total_lactation_yield), 
                                   total_lactation_yield > 0, total_lactation_yield <= 15000)

tly_row <- data.frame(
  Trait = "Total Lactation Yield (kg)",
  Raw_N = nrow(tly_raw),
  Raw_Min = round(min(tly_raw$total_lactation_yield), 1),
  Raw_Max = round(max(tly_raw$total_lactation_yield), 1),
  Raw_Mean = round(mean(tly_raw$total_lactation_yield), 2),
  Raw_SD = round(sd(tly_raw$total_lactation_yield), 2),
  Filter_Applied = ">0 to 15,000",
  Clean_N = nrow(tly_clean),
  Clean_Mean = round(mean(tly_clean$total_lactation_yield), 2),
  Clean_SD = round(sd(tly_clean$total_lactation_yield), 2)
)

# ============ Days Open ============
do_raw <- lactation %>% filter(!is.na(days_open))
do_clean <- lactation %>% filter(!is.na(days_open), days_open >= 21, days_open <= 400)

do_row <- data.frame(
  Trait = "Days Open",
  Raw_N = nrow(do_raw),
  Raw_Min = round(min(do_raw$days_open), 1),
  Raw_Max = round(max(do_raw$days_open), 1),
  Raw_Mean = round(mean(do_raw$days_open), 2),
  Raw_SD = round(sd(do_raw$days_open), 2),
  Filter_Applied = "21 to 400",
  Clean_N = nrow(do_clean),
  Clean_Mean = round(mean(do_clean$days_open), 2),
  Clean_SD = round(sd(do_clean$days_open), 2)
)

# ============ Daily Milk Yield ============
dmy_raw <- milking %>% filter(!is.na(daily_yield))
dmy_clean <- milking %>% filter(!is.na(daily_yield), daily_yield >= 0.5, daily_yield <= 50)

dmy_row <- data.frame(
  Trait = "Daily Milk Yield (kg)",
  Raw_N = nrow(dmy_raw),
  Raw_Min = round(min(dmy_raw$daily_yield), 2),
  Raw_Max = round(max(dmy_raw$daily_yield), 2),
  Raw_Mean = round(mean(dmy_raw$daily_yield), 2),
  Raw_SD = round(sd(dmy_raw$daily_yield), 2),
  Filter_Applied = "0.5 to 50",
  Clean_N = nrow(dmy_clean),
  Clean_Mean = round(mean(dmy_clean$daily_yield), 2),
  Clean_SD = round(sd(dmy_clean$daily_yield), 2)
)

# ============ DIM ============
dim_raw <- milking %>% filter(!is.na(dim))
dim_clean <- milking %>% filter(!is.na(dim), dim >= 5, dim <= 500)

dim_row <- data.frame(
  Trait = "Days in Milk (DIM)",
  Raw_N = nrow(dim_raw),
  Raw_Min = round(min(dim_raw$dim), 0),
  Raw_Max = round(max(dim_raw$dim), 0),
  Raw_Mean = round(mean(dim_raw$dim), 2),
  Raw_SD = round(sd(dim_raw$dim), 2),
  Filter_Applied = "5 to 500",
  Clean_N = nrow(dim_clean),
  Clean_Mean = round(mean(dim_clean$dim), 2),
  Clean_SD = round(sd(dim_clean$dim), 2)
)

# ============ Parity ============
parity_raw <- lactation %>% filter(!is.na(parity))
parity_clean <- lactation %>% filter(!is.na(parity), parity >= 1, parity <= 12)

parity_row <- data.frame(
  Trait = "Parity",
  Raw_N = nrow(parity_raw),
  Raw_Min = min(parity_raw$parity),
  Raw_Max = max(parity_raw$parity),
  Raw_Mean = round(mean(parity_raw$parity), 2),
  Raw_SD = round(sd(parity_raw$parity), 2),
  Filter_Applied = "1 to 12",
  Clean_N = nrow(parity_clean),
  Clean_Mean = round(mean(parity_clean$parity), 2),
  Clean_SD = round(sd(parity_clean$parity), 2)
)

# Combine all rows
comparison <- rbind(afc_row, ci_row, ll_row, tly_row, do_row, dmy_row, dim_row, parity_row)

# Calculate records lost and percentage
comparison <- comparison %>%
  mutate(
    Records_Lost = Raw_N - Clean_N,
    Pct_Lost = round(100 * Records_Lost / Raw_N, 2),
    # Flag problematic values
    Min_Flag = case_when(
      Trait == "AFC (months)" & Raw_Min < 0 ~ TRUE,
      Trait == "Days Open" & Raw_Min < 0 ~ TRUE,
      Trait == "Days in Milk (DIM)" & Raw_Min < 0 ~ TRUE,
      TRUE ~ FALSE
    ),
    Max_Flag = case_when(
      Trait == "AFC (months)" & Raw_Max > 100 ~ TRUE,
      Trait == "Calving Interval (days)" & Raw_Max > 1000 ~ TRUE,
      Trait == "Total Lactation Yield (kg)" & Raw_Max > 20000 ~ TRUE,
      Trait == "Days Open" & Raw_Max > 1000 ~ TRUE,
      Trait == "Days in Milk (DIM)" & Raw_Max > 1000 ~ TRUE,
      TRUE ~ FALSE
    ),
    Loss_Flag = Pct_Lost > 20
  )

# Display table with color coding
kable(comparison %>% select(-Min_Flag, -Max_Flag, -Loss_Flag), 
      caption = "Table 3: Comparison of Raw vs Cleaned Statistics",
      col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", "Raw SD",
                    "Filter", "Clean N", "Clean Mean", "Clean SD", "N Lost", "% Lost"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, color = ifelse(comparison$Min_Flag, "red", "black"),
              bold = ifelse(comparison$Min_Flag, TRUE, FALSE)) %>%
  column_spec(4, color = ifelse(comparison$Max_Flag, "red", "black"),
              bold = ifelse(comparison$Max_Flag, TRUE, FALSE)) %>%
  column_spec(12, color = ifelse(comparison$Loss_Flag, "red", "darkgreen"),
              bold = TRUE)
Table 3: Comparison of Raw vs Cleaned Statistics
Trait Raw N Raw Min Raw Max Raw Mean Raw SD Filter Clean N Clean Mean Clean SD N Lost % Lost
AFC (months) 33,015 -394 24,295.0 50.96 364.55 18 to 72 28,229 39.36 11.89 4,786 14.50
Calving Interval (days) 50,463 0 16,530.0 521.70 342.46 300 to 700 41,248 433.45 92.15 9,215 18.26
Lactation Length (days) 50,707 0 669,342.0 535.21 2,990.38 60 to 500 33,573 386.08 60.86 17,134 33.79
Total Lactation Yield (kg) 88,347 0 23,186.4 112.02 642.21 >0 to 15,000 37,184 261.79 927.82 51,163 57.91
Days Open 50,710 -280 669,062.0 255.21 2,990.29 21 to 400 40,802 150.28 87.90 9,908 19.54
Daily Milk Yield (kg) 1,048,546 0 50.0 5.51 6.99 0.5 to 50 675,193 8.54 7.08 373,353 35.61
Days in Milk (DIM) 1,048,575 -729,939 7,752.0 220.35 2,008.49 5 to 500 957,939 169.61 112.75 90,636 8.64
Parity 87,990 1 32.0 2.42 1.82 1 to 12 87,927 2.41 1.76 63 0.07

πŸ“Š Column Descriptions for Table 3:

Column Description What to Look For
Trait Variable being analyzed -
Raw N Number of records BEFORE filtering Total available data
Raw Min Minimum value in raw data πŸ”΄ Red = impossible value (e.g., negative)
Raw Max Maximum value in raw data πŸ”΄ Red = extreme outlier
Raw Mean Average BEFORE filtering ⚠️ Biased by outliers - don’t report this!
Raw SD Standard deviation before filtering Large SD may indicate outliers
Filter Acceptable biological range Values used to clean data
Clean N Number of records AFTER filtering Data available for analysis
Clean Mean Average AFTER filtering βœ… Report this value!
Clean SD Standard deviation after filtering Use for reporting
N Lost Records removed by filtering How many outliers existed
% Lost Percentage removed πŸ”΄ Red if >20% = major data quality issue

🚨 Major Data Quality Issues Identified:

major_issues <- comparison %>% filter(Pct_Lost > 20)

if(nrow(major_issues) > 0) {
  cat("| Trait | % Lost | Interpretation |\n")
  cat("|-------|--------|----------------|\n")
  for(i in 1:nrow(major_issues)) {
    interpretation <- case_when(
      major_issues$Trait[i] == "Total Lactation Yield (kg)" ~ "Many incomplete lactations or zeros recorded as missing",
      major_issues$Trait[i] == "Daily Milk Yield (kg)" ~ "Many zero yields - may be dry cows or missing data",
      major_issues$Trait[i] == "Calving Interval (days)" ~ "Many extreme values - date recording issues",
      TRUE ~ "Significant outliers present"
    )
    cat(paste0("| **", major_issues$Trait[i], "** | ", major_issues$Pct_Lost[i], "% | ", interpretation, " |\n"))
  }
}
Trait % Lost Interpretation
Lactation Length (days) 33.79% Significant outliers present
Total Lactation Yield (kg) 57.91% Many incomplete lactations or zeros recorded as missing
Daily Milk Yield (kg) 35.61% Many zero yields - may be dry cows or missing data

Impact: When >20% of records are removed, this affects:

  1. Statistical power - fewer records for analysis
  2. Representativeness - are we losing certain farms/breeds?
  3. Genetic evaluations - missing data reduces accuracy

Recommendation: Investigate WHY these records have problematic values before simply deleting them.


4 Age at First Calving (AFC) Analysis

4.1 Distribution by Category

# AFC data with categories
afc_data <- lactation %>%
  filter(parity == 1, !is.na(afc_months)) %>%
  mutate(
    afc_category = case_when(
      afc_months < 0 ~ "Negative (ERROR)",
      afc_months >= 0 & afc_months < 18 ~ "Too Young (<18 mo)",
      afc_months >= 18 & afc_months <= 36 ~ "Optimal (18-36 mo)",
      afc_months > 36 & afc_months <= 72 ~ "Late (37-72 mo)",
      afc_months > 72 & afc_months <= 100 ~ "Very Late (73-100 mo)",
      afc_months > 100 ~ "Extreme (>100 mo)"
    ),
    afc_category = factor(afc_category, levels = c("Negative (ERROR)", "Too Young (<18 mo)", 
                                                    "Optimal (18-36 mo)", "Late (37-72 mo)",
                                                    "Very Late (73-100 mo)", "Extreme (>100 mo)"))
  )

# Summary table
afc_summary <- afc_data %>%
  group_by(afc_category) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months), 1),
    Max = round(max(afc_months), 1),
    Mean = round(mean(afc_months), 2),
    SD = round(sd(afc_months), 2),
    Median = round(median(afc_months), 1),
    .groups = "drop"
  ) %>%
  mutate(
    Percentage = round(100 * N / sum(N), 2),
    Status = case_when(
      afc_category == "Negative (ERROR)" ~ "πŸ”΄ ERROR",
      afc_category == "Too Young (<18 mo)" ~ "🟠 WARNING",
      afc_category == "Optimal (18-36 mo)" ~ "🟒 OPTIMAL",
      afc_category == "Late (37-72 mo)" ~ "🟑 ACCEPTABLE",
      afc_category == "Very Late (73-100 mo)" ~ "🟠 WARNING",
      afc_category == "Extreme (>100 mo)" ~ "πŸ”΄ ERROR"
    )
  )

kable(afc_summary, 
      caption = "Table 4: AFC Distribution by Category with Quality Status",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 4: AFC Distribution by Category with Quality Status
afc_category N Min Max Mean SD Median Percentage Status
Negative (ERROR) 127 -394 -1 -102.76 106.54 -56 0.38 πŸ”΄ ERROR |
Too Young (<18 mo) 683 0 17 5.74 6.30 3 2.07 🟠 WARNING |
Optimal (18-36 mo) 14,700 18 36 30.33 3.85 31 44.53 🟒 OPTIMAL |
Late (37-72 mo) 13,529 37 72 49.17 9.68 47 40.98 🟑 ACCEPTABLE |
Very Late (73-100 mo) 2,495 73 100 84.47 8.11 84 7.56 🟠 WARNING |
Extreme (>100 mo) 1,481 101 24,295 249.59 1,706.99 117 4.49 πŸ”΄ ERROR |

πŸ“Š Understanding AFC Categories:

Category Range Status Explanation
Negative < 0 months πŸ”΄ ERROR Impossible - birthdate after calving date
Too Young 0-17 months 🟠 WARNING Biologically unlikely - heifers not sexually mature
Optimal 18-36 months 🟒 OPTIMAL Target range for dairy cattle
Late 37-72 months 🟑 ACCEPTABLE Possible but economically suboptimal
Very Late 73-100 months 🟠 WARNING Unusual - verify data
Extreme >100 months πŸ”΄ ERROR Impossible - >8 years at first calving

Why AFC Matters:

  • Too early AFC β†’ underdeveloped heifers, calving difficulties
  • Optimal AFC (24-30 mo) β†’ balance of heifer development and economic returns
  • Late AFC β†’ increased rearing costs, delayed milk production
# Plot 1: Full distribution
p1 <- ggplot(afc_data, aes(x = afc_months)) +
  geom_histogram(aes(fill = afc_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "#D32F2F",
    "Too Young (<18 mo)" = "#FF9800",
    "Optimal (18-36 mo)" = "#4CAF50",
    "Late (37-72 mo)" = "#FFC107",
    "Very Late (73-100 mo)" = "#FF5722",
    "Extreme (>100 mo)" = "#9C27B0"
  )) +
  geom_vline(xintercept = c(0, 18, 36, 72), linetype = "dashed", color = "black", linewidth = 0.5) +
  annotate("text", x = 27, y = Inf, label = "Optimal\nRange", vjust = 1.5, size = 3, color = "darkgreen") +
  labs(
    title = "Figure 1: Age at First Calving - Complete Distribution",
    subtitle = paste0("Total N = ", format(nrow(afc_data), big.mark = ","), 
                      " | Raw Mean = ", round(mean(afc_data$afc_months), 1), " months",
                      " | Note: Y-axis is log scale to show outliers"),
    x = "AFC (months)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  theme(legend.position = "bottom") +
  scale_y_log10(labels = comma)

# Plot 2: Cleaned data only
afc_clean <- afc_data %>% filter(afc_months >= 18, afc_months <= 72)
p2 <- ggplot(afc_clean, aes(x = afc_months)) +
  geom_histogram(bins = 40, fill = "steelblue", color = "black", linewidth = 0.2) +
  geom_vline(xintercept = mean(afc_clean$afc_months), linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(afc_clean$afc_months) + 2, y = Inf, 
           label = paste0("Clean Mean = ", round(mean(afc_clean$afc_months), 1), " months"),
           vjust = 2, hjust = 0, color = "red", fontface = "bold") +
  labs(
    title = "Figure 2: AFC - Cleaned Data Only (18-72 months)",
    subtitle = paste0("Clean N = ", format(nrow(afc_clean), big.mark = ","),
                      " (", round(100*nrow(afc_clean)/nrow(afc_data), 1), "% of total)",
                      " | Mean = ", round(mean(afc_clean$afc_months), 1),
                      " Β± ", round(sd(afc_clean$afc_months), 1), " months"),
    x = "AFC (months)",
    y = "Count"
  )

grid.arrange(p1, p2, ncol = 1, heights = c(1.2, 1))

πŸ“ˆ How to Read These Figures:

Figure 1 (All Data):

  • Shows COMPLETE distribution including all errors and outliers
  • Y-axis is LOG scale - necessary to see both common values and rare extremes
  • Dashed vertical lines mark category boundaries (0, 18, 36, 72 months)
  • Colors show data quality: 🟒 Green = good, πŸ”΄ Red = errors

Figure 2 (Cleaned Data):

  • Shows ONLY records within acceptable range (18-72 months)
  • Red dashed line = mean of cleaned data
  • Subtitle shows: sample size retained, percentage of total, and mean Β± SD
  • This is what you should report in publications!

Key Insight: The raw mean is pulled upward by extreme outliers. Always report the CLEANED mean.


4.2 AFC by Breed

# AFC by breed - CLEANED
afc_by_breed <- lactation %>%
  filter(parity == 1, !is.na(afc_months), !is.na(main_breed_name),
         afc_months >= 18, afc_months <= 72) %>%
  group_by(main_breed_name) %>%
  summarise(
    N = n(),
    Min = round(min(afc_months), 1),
    Max = round(max(afc_months), 1),
    Mean = round(mean(afc_months), 2),
    SD = round(sd(afc_months), 2),
    SE = round(sd(afc_months) / sqrt(n()), 2),
    Median = round(median(afc_months), 1),
    .groups = "drop"
  ) %>%
  filter(N >= 30) %>%
  arrange(Mean)

kable(afc_by_breed, 
      caption = "Table 5: AFC by Breed (Cleaned: 18-72 months, N β‰₯ 30)",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 5: AFC by Breed (Cleaned: 18-72 months, N β‰₯ 30)
main_breed_name N Min Max Mean SD SE Median
Scandinavian Red 50 18 38 26.84 4.51 0.64 25.5
Jersey crosses 70 18 52 30.11 6.17 0.74 29.5
Gir 199 21 72 31.07 7.48 0.53 30.0
Fleckvieh 55 21 62 31.49 8.06 1.09 30.0
Jersey 1,144 18 72 31.50 10.19 0.30 28.0
Holstein-Friesian crosses 1,255 18 72 35.78 7.02 0.20 35.0
Ayrshire crosses 88 18 72 36.75 10.89 1.16 34.0
Holstein Friesian 15,944 18 72 37.21 11.06 0.09 34.0
Sahiwal crosses 237 22 60 38.36 8.24 0.54 37.0
Boran 101 19 69 38.67 9.09 0.90 37.0
Unknown 43 24 71 40.09 14.00 2.13 36.0
Guernsey 574 19 72 41.44 12.62 0.53 38.0
Ayrshire 5,188 18 72 43.49 11.60 0.16 41.0
Brown Swiss 147 19 72 44.18 13.94 1.15 42.0
Sahiwal 2,875 18 72 49.27 11.44 0.21 49.0

πŸ“Š Column Descriptions for Table 5:

Column Description
main_breed_name Breed of the animal
N Number of first-parity records (after filtering 18-72 mo)
Min Minimum AFC for this breed (within filtered range)
Max Maximum AFC for this breed (within filtered range)
Mean βœ… Average AFC - use this for reporting
SD Standard Deviation - spread of AFC values
SE Standard Error = SD/√N - precision of mean estimate
Median Middle value - alternative to mean

Interpretation:

  • Lower Mean = earlier maturing breed (e.g., some crossbreds)
  • Higher Mean = later maturing or management issues
  • Lower SE = more precise estimate (larger sample size)
  • Breeds with N < 30 are excluded (unreliable estimates)
ggplot(afc_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
  geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
  geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.2, color = "black") +
  geom_text(aes(label = paste0(round(Mean, 1), " mo\n(n=", format(N, big.mark=","), ")")), 
            hjust = -0.1, size = 3) +
  geom_hline(yintercept = 30, linetype = "dashed", color = "darkgreen", linewidth = 1) +
  annotate("text", x = 0.5, y = 31, label = "Target: 30 months", hjust = 0, color = "darkgreen", size = 3) +
  coord_flip() +
  labs(
    title = "Figure 3: Mean AFC by Breed (Cleaned Data: 18-72 months)",
    subtitle = "Error bars = Β± 1 Standard Error | Green line = target AFC of 30 months",
    x = "Breed",
    y = "Mean AFC (months)"
  ) +
  scale_y_continuous(limits = c(0, max(afc_by_breed$Mean + afc_by_breed$SE) * 1.5))

πŸ“ˆ How to Read Figure 3:

  • Bar height = Mean AFC for that breed
  • Error bars = Β± 1 Standard Error
    • Shorter error bars = more precise estimate (larger N)
    • If error bars don’t overlap between breeds, difference may be significant
  • Green dashed line = Target AFC of 30 months
    • Bars below this line = good (earlier calving)
    • Bars above this line = room for improvement
  • Labels show exact mean and sample size

5 Daily Milk Yield Analysis

5.1 Distribution by Category

# Daily yield categories
dmy_data <- milking %>%
  filter(!is.na(daily_yield)) %>%
  mutate(
    yield_category = case_when(
      daily_yield < 0 ~ "Negative (ERROR)",
      daily_yield == 0 ~ "Zero Yield",
      daily_yield > 0 & daily_yield < 0.5 ~ "Very Low (<0.5 kg)",
      daily_yield >= 0.5 & daily_yield <= 10 ~ "Low (0.5-10 kg)",
      daily_yield > 10 & daily_yield <= 20 ~ "Medium (10-20 kg)",
      daily_yield > 20 & daily_yield <= 30 ~ "Good (20-30 kg)",
      daily_yield > 30 & daily_yield <= 50 ~ "High (30-50 kg)",
      daily_yield > 50 ~ "Extreme (>50 kg)"
    ),
    yield_category = factor(yield_category, levels = c("Negative (ERROR)", "Zero Yield", 
                                                        "Very Low (<0.5 kg)", "Low (0.5-10 kg)",
                                                        "Medium (10-20 kg)", "Good (20-30 kg)",
                                                        "High (30-50 kg)", "Extreme (>50 kg)"))
  )

# Summary
dmy_summary <- dmy_data %>%
  group_by(yield_category) %>%
  summarise(
    N = n(),
    Min = round(min(daily_yield), 2),
    Max = round(max(daily_yield), 2),
    Mean = round(mean(daily_yield), 2),
    SD = round(sd(daily_yield), 2),
    .groups = "drop"
  ) %>%
  mutate(
    Percentage = round(100 * N / sum(N), 2),
    Status = case_when(
      yield_category == "Negative (ERROR)" ~ "πŸ”΄ ERROR",
      yield_category == "Zero Yield" ~ "🟠 CHECK",
      yield_category == "Very Low (<0.5 kg)" ~ "🟠 WARNING",
      yield_category %in% c("Low (0.5-10 kg)", "Medium (10-20 kg)", "Good (20-30 kg)") ~ "🟒 NORMAL",
      yield_category == "High (30-50 kg)" ~ "πŸ”΅ HIGH",
      yield_category == "Extreme (>50 kg)" ~ "🟣 VERIFY"
    )
  )

kable(dmy_summary, 
      caption = "Table 6: Daily Milk Yield Distribution by Category",
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)
Table 6: Daily Milk Yield Distribution by Category
yield_category N Min Max Mean SD Percentage Status
Zero Yield 333,069 0.00 0.00 0.00 0.00 31.76 🟠 CHECK |
Very Low (<0.5 kg) 40,284 0.01 0.49 0.28 0.12 3.84 🟠 WARNING |
Low (0.5-10 kg) 393,426 0.50 10.00 3.43 3.11 37.52 🟒 NORMAL |
Medium (10-20 kg) 237,650 10.01 20.00 14.23 2.69 22.66 🟒 NORMAL |
Good (20-30 kg) 42,181 20.10 30.00 23.08 2.45 4.02 🟒 NORMAL |
High (30-50 kg) 1,936 30.10 50.00 32.68 2.83 0.18 πŸ”΅ HIGH |

πŸ“Š Understanding Milk Yield Categories:

Category Range Status Explanation
Negative < 0 kg πŸ”΄ ERROR Impossible - data entry error
Zero = 0 kg 🟠 CHECK May be: dry cow, sick, or missing data recorded as 0
Very Low 0-0.5 kg 🟠 WARNING Unusual - verify recording
Low 0.5-10 kg 🟒 NORMAL Typical for local/indigenous breeds or late lactation
Medium 10-20 kg 🟒 NORMAL Good production for crossbreds
Good 20-30 kg 🟒 NORMAL High production crossbreds or pure exotics
High 30-50 kg πŸ”΅ HIGH Elite producers - verify if correct
Extreme > 50 kg 🟣 VERIFY Very rare - likely error unless elite Holstein

Common Reasons for Zero Yields:

  1. Dry period - cow is not being milked before next calving
  2. Sick animal - temporarily not milking
  3. Missing data - no recording made but 0 entered instead of NA
  4. Data entry error - value not recorded properly
# Plot 1: All data
p1 <- ggplot(dmy_data, aes(x = daily_yield)) +
  geom_histogram(aes(fill = yield_category), bins = 100, color = "black", linewidth = 0.1) +
  scale_fill_manual(values = c(
    "Negative (ERROR)" = "#D32F2F",
    "Zero Yield" = "#757575",
    "Very Low (<0.5 kg)" = "#FF9800",
    "Low (0.5-10 kg)" = "#8BC34A",
    "Medium (10-20 kg)" = "#4CAF50",
    "Good (20-30 kg)" = "#2E7D32",
    "High (30-50 kg)" = "#2196F3",
    "Extreme (>50 kg)" = "#9C27B0"
  )) +
  labs(
    title = "Figure 4: Daily Milk Yield - Complete Distribution",
    subtitle = paste0("Total N = ", format(nrow(dmy_data), big.mark = ","),
                      " | Note: Large spike at zero - investigate these records"),
    x = "Daily Yield (kg)",
    y = "Count (log scale)",
    fill = "Category"
  ) +
  scale_y_log10(labels = comma) +
  theme(legend.position = "bottom")

# Plot 2: Cleaned data
clean_dmy <- dmy_data %>% filter(daily_yield >= 0.5, daily_yield <= 50)
p2 <- ggplot(clean_dmy, aes(x = daily_yield)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black", linewidth = 0.1) +
  geom_vline(xintercept = mean(clean_dmy$daily_yield), linetype = "dashed", color = "red", linewidth = 1) +
  annotate("text", x = mean(clean_dmy$daily_yield) + 1, y = Inf, 
           label = paste0("Clean Mean = ", round(mean(clean_dmy$daily_yield), 2), " kg/day"),
           vjust = 2, hjust = 0, color = "red", fontface = "bold", size = 4) +
  labs(
    title = "Figure 5: Daily Milk Yield - Cleaned Data (0.5-50 kg)",
    subtitle = paste0("Clean N = ", format(nrow(clean_dmy), big.mark = ","),
                      " (", round(100*nrow(clean_dmy)/nrow(dmy_data), 1), "% retained)",
                      " | Mean = ", round(mean(clean_dmy$daily_yield), 2),
                      " Β± ", round(sd(clean_dmy$daily_yield), 2), " kg/day"),
    x = "Daily Yield (kg)",
    y = "Count"
  )

grid.arrange(p1, p2, ncol = 1)

⚠️ Important Finding: High Percentage of Zero Yields

n_zeros <- sum(dmy_data$daily_yield == 0)
pct_zeros <- round(100 * n_zeros / nrow(dmy_data), 1)

cat(paste0("
**", format(n_zeros, big.mark=","), " records (", pct_zeros, "%) have zero daily yield.**

This needs investigation:

1. **Are these dry cows?** Check if DIM is near end of lactation
2. **Are these sick animals?** Cross-reference with health records
3. **Is this missing data?** Should these be NA instead of 0?
4. **Project-specific issue?** Check if zeros concentrated in certain projects

**Recommendation:** Do NOT simply delete zeros - investigate the cause first!
"))

333,069 records (31.8%) have zero daily yield.

This needs investigation:

  1. Are these dry cows? Check if DIM is near end of lactation
  2. Are these sick animals? Cross-reference with health records
  3. Is this missing data? Should these be NA instead of 0?
  4. Project-specific issue? Check if zeros concentrated in certain projects

Recommendation: Do NOT simply delete zeros - investigate the cause first!


6 Summary and Recommendations

6.1 Final Quality Assessment

# Create status indicators
comparison <- comparison %>%
  mutate(
    Quality_Status = case_when(
      Pct_Lost < 5 ~ "🟒 Good",
      Pct_Lost < 10 ~ "🟑 Acceptable",
      Pct_Lost < 20 ~ "🟠 Warning",
      TRUE ~ "πŸ”΄ Major Issue"
    )
  )

kable(comparison %>% select(Trait, Raw_N, Raw_Min, Raw_Max, Raw_Mean, 
                             Filter_Applied, Clean_N, Clean_Mean, Clean_SD, 
                             Records_Lost, Pct_Lost, Quality_Status), 
      caption = "Table 7: Final Data Quality Summary",
      col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", 
                    "Filter", "Clean N", "Clean Mean", "Clean SD", 
                    "N Lost", "% Lost", "Status"),
      format.args = list(big.mark = ",")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
  column_spec(3, color = ifelse(comparison$Min_Flag, "red", "black"),
              bold = ifelse(comparison$Min_Flag, TRUE, FALSE)) %>%
  column_spec(4, color = ifelse(comparison$Max_Flag, "red", "black"),
              bold = ifelse(comparison$Max_Flag, TRUE, FALSE)) %>%
  column_spec(11, color = ifelse(comparison$Loss_Flag, "red", "darkgreen"),
              bold = TRUE)
Table 7: Final Data Quality Summary
Trait Raw N Raw Min Raw Max Raw Mean Filter Clean N Clean Mean Clean SD N Lost % Lost Status
AFC (months) 33,015 -394 24,295.0 50.96 18 to 72 28,229 39.36 11.89 4,786 14.50 🟠 Warning |
Calving Interval (days) 50,463 0 16,530.0 521.70 300 to 700 41,248 433.45 92.15 9,215 18.26 🟠 Warning |
Lactation Length (days) 50,707 0 669,342.0 535.21 60 to 500 33,573 386.08 60.86 17,134 33.79 πŸ”΄ Major Issue |
Total Lactation Yield (kg) 88,347 0 23,186.4 112.02 >0 to 15,000 37,184 261.79 927.82 51,163 57.91 πŸ”΄ Major Issue |
Days Open 50,710 -280 669,062.0 255.21 21 to 400 40,802 150.28 87.90 9,908 19.54 🟠 Warning |
Daily Milk Yield (kg) 1,048,546 0 50.0 5.51 0.5 to 50 675,193 8.54 7.08 373,353 35.61 πŸ”΄ Major Issue |
Days in Milk (DIM) 1,048,575 -729,939 7,752.0 220.35 5 to 500 957,939 169.61 112.75 90,636 8.64 🟑 Acceptable |
Parity 87,990 1 32.0 2.42 1 to 12 87,927 2.41 1.76 63 0.07 🟒 Good |

πŸ“Š How to Use Table 7:

What to Report Column to Use Notes
Sample size Clean N After removing outliers
Average value Clean Mean NOT Raw Mean (biased by outliers)
Variation Clean SD Standard deviation of cleaned data
Data quality % Lost & Status Document in methods section

Status Legend:

  • 🟒 Good (< 5% lost) - Minor outliers, data is reliable
  • 🟑 Acceptable (5-10% lost) - Some outliers, proceed with caution
  • 🟠 Warning (10-20% lost) - Significant outliers, investigate cause
  • πŸ”΄ Major Issue (> 20% lost) - Serious data quality problem, investigate before use

6.2 Key Findings

🚨 Critical Issues Requiring Attention:

cat("
| Issue | Severity | Records Affected | Action Required |
|-------|----------|------------------|-----------------|
| **Negative AFC values** | πŸ”΄ Critical | Check birthdate/calving date | Fix or remove |
| **Negative DIM values** | πŸ”΄ Critical | Milking date before calving | Fix dates |
| **Zero total lactation yields** | πŸ”΄ Major | ~58% of records | Investigate - incomplete data? |
| **Zero daily yields** | 🟠 Warning | ~32% of records | Check if dry cows or missing |
| **Extreme AFC (>100 mo)** | πŸ”΄ Critical | Data entry errors | Remove |
| **Negative Days Open** | πŸ”΄ Critical | Conception before calving | Fix dates |
")
Issue Severity Records Affected Action Required
Negative AFC values πŸ”΄ Critical Check birthdate/calving date Fix or remove
Negative DIM values πŸ”΄ Critical Milking date before calving Fix dates
Zero total lactation yields πŸ”΄ Major ~58% of records Investigate - incomplete data?
Zero daily yields 🟠 Warning ~32% of records Check if dry cows or missing
Extreme AFC (>100 mo) πŸ”΄ Critical Data entry errors Remove
Negative Days Open πŸ”΄ Critical Conception before calving Fix dates

6.3 Recommendations

βœ… Recommended Data Cleaning Workflow:

Step 1: Remove Impossible Values

# Remove biologically impossible values
data_clean <- data %>%
  filter(
    afc_months >= 0,              # No negative ages
    dim >= 0,                      # No negative DIM
    days_open >= 0,                # No negative days open
    daily_yield >= 0               # No negative yields
  )

Step 2: Apply Biological Filters

# Keep only biologically plausible values
data_clean <- data_clean %>%
  filter(
    afc_months >= 18 & afc_months <= 72,
    calving_interval_days >= 300 & calving_interval_days <= 700,
    daily_yield >= 0.5 & daily_yield <= 50,
    dim >= 5 & dim <= 500,
    days_open >= 21 & days_open <= 400,
    total_lactation_yield > 0 & total_lactation_yield <= 15000
  )

Step 3: Document What You Did

# Always report:
cat("Original records:", nrow(data_raw))
cat("After cleaning:", nrow(data_clean))
cat("Records removed:", nrow(data_raw) - nrow(data_clean))
cat("Filters applied: AFC 18-72 mo, CI 300-700 d, etc.")

πŸ“ What to Report in Publications:

When reporting results, always include:

  1. Sample size after cleaning (Clean N)
  2. Mean Β± SD from cleaned data
  3. The filters applied (e.g., β€œAFC values outside 18-72 months were excluded”)
  4. Percentage of data removed (for transparency)

Example Methods Statement:

β€œAge at first calving (AFC) was calculated for first-parity animals. Records with AFC outside the biologically plausible range of 18-72 months were excluded (N = X, Y% of records), resulting in a final sample of N = Z animals. Mean AFC was X.X Β± Y.Y months.”


7 Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
##   LAPACK version 3.12.1
## 
## locale:
## [1] LC_COLLATE=English_Kenya.utf8  LC_CTYPE=English_Kenya.utf8   
## [3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C                  
## [5] LC_TIME=English_Kenya.utf8    
## 
## time zone: Africa/Nairobi
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] viridis_0.6.5     viridisLite_0.4.2 kableExtra_1.4.0  knitr_1.50       
##  [5] gridExtra_2.3     scales_1.4.0      ggplot2_3.5.2     tidyr_1.3.1      
##  [9] dplyr_1.1.4       data.table_1.17.8
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       jsonlite_2.0.0     compiler_4.5.1     tidyselect_1.2.1  
##  [5] xml2_1.4.0         stringr_1.5.1      jquerylib_0.1.4    textshaping_1.0.2 
##  [9] systemfonts_1.2.3  yaml_2.3.10        fastmap_1.2.0      R6_2.6.1          
## [13] labeling_0.4.3     generics_0.1.4     tibble_3.3.0       svglite_2.2.1     
## [17] bslib_0.9.0        pillar_1.11.0      RColorBrewer_1.1-3 rlang_1.1.6       
## [21] stringi_1.8.7      cachem_1.1.0       xfun_0.52          sass_0.4.10       
## [25] cli_3.6.5          withr_3.0.2        magrittr_2.0.3     digest_0.6.37     
## [29] grid_4.5.1         rstudioapi_0.17.1  lifecycle_1.0.4    vctrs_0.6.5       
## [33] evaluate_1.0.5     glue_1.8.0         farver_2.1.2       rmarkdown_2.29    
## [37] purrr_1.1.0        tools_4.5.1        pkgconfig_2.0.3    htmltools_0.5.8.1

Report generated on 08 December 2025 at 12:39