# Load required packages
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)
# Set theme
theme_set(theme_bw(base_size = 12))This report assesses the data quality of the Kenya ADGG/TDGG dairy datasets by:
Throughout this report, we use color coding to highlight data quality issues:
| Color | Meaning |
|---|---|
| π΄ Red | Error or impossible value (e.g., negative AFC, negative DIM) |
| π Orange | Warning - unusual but possible (e.g., AFC < 18 months) |
| π’ Green | Normal/acceptable range |
| π΅ Blue | High but acceptable (e.g., daily yield 30-50 kg) |
| π£ Purple | Extreme outlier - needs verification |
# SET YOUR FILE PATH HERE
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/"
# Read data - treating "NULL" as NA
lactation <- fread(paste0(data_path, "lactation_calving.csv"),
na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
stringsAsFactors = FALSE)
milking <- fread(paste0(data_path, "milking_data.csv"),
na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
stringsAsFactors = FALSE)
# Convert numeric columns
lactation <- lactation %>%
mutate(
parity = as.numeric(parity),
afc_months = as.numeric(afc_months),
afc_days = as.numeric(afc_days),
calving_interval_days = as.numeric(calving_interval_days),
calving_interval_months = as.numeric(calving_interval_months),
lactation_length_days = as.numeric(lactation_length_days),
total_lactation_yield = as.numeric(total_lactation_yield),
testday_count = as.numeric(testday_count),
days_open = as.numeric(days_open)
)
milking <- milking %>%
mutate(
parity = as.numeric(parity),
afc_months = as.numeric(afc_months),
afc_days = as.numeric(afc_days),
dim = as.numeric(dim),
daily_yield = as.numeric(daily_yield),
milk_morning = as.numeric(milk_morning),
milk_evening = as.numeric(milk_evening),
milk_midday = as.numeric(milk_midday),
testday_no = as.numeric(testday_no),
calving_age_months = as.numeric(calving_age_months)
)Datasets Loaded:
| Dataset | Records | Description |
|---|---|---|
| Lactation | 88,352 | One record per lactation (calving event) |
| Milking | 1,048,575 | One record per test-day milk recording |
What is a βtest-dayβ? A test-day is a single day when milk production was recorded. Each lactation typically has multiple test-days (monthly recordings throughout the lactation).
Important: Before cleaning data, we must examine the raw statistics (Min, Max, Mean) to:
The Min and Max columns are especially important for detecting errors!
# Function to calculate comprehensive stats
calc_stats <- function(x, var_name) {
x_valid <- x[!is.na(x)]
n_total <- length(x)
n_valid <- length(x_valid)
n_missing <- sum(is.na(x))
if (n_valid == 0) {
return(data.frame(
Variable = var_name,
N_Total = n_total,
N_Valid = 0,
N_Missing = n_missing,
Pct_Missing = 100,
Min = NA, Max = NA, Mean = NA, SD = NA, Median = NA
))
}
data.frame(
Variable = var_name,
N_Total = n_total,
N_Valid = n_valid,
N_Missing = n_missing,
Pct_Missing = round(100 * n_missing / n_total, 2),
Min = round(min(x_valid), 2),
Max = round(max(x_valid), 2),
Mean = round(mean(x_valid), 2),
SD = round(sd(x_valid), 2),
Median = round(median(x_valid), 2)
)
}
# Create subsets first
afc_parity1 <- lactation$afc_months[which(lactation$parity == 1)]
ci_parity_gt1 <- lactation$calving_interval_days[which(lactation$parity > 1)]
lact_stats_raw <- rbind(
calc_stats(afc_parity1, "AFC (months)"),
calc_stats(ci_parity_gt1, "Calving Interval (days)"),
calc_stats(lactation$lactation_length_days, "Lactation Length (days)"),
calc_stats(lactation$total_lactation_yield, "Total Lactation Yield (kg)"),
calc_stats(lactation$days_open, "Days Open"),
calc_stats(lactation$parity, "Parity")
)
# Determine which rows have problematic Min or Max values
# For color coding
lact_stats_raw <- lact_stats_raw %>%
mutate(
Min_Problem = case_when(
Variable == "AFC (months)" & Min < 0 ~ TRUE,
Variable == "Calving Interval (days)" & Min < 280 ~ TRUE,
Variable == "Days Open" & Min < 0 ~ TRUE,
Variable == "Parity" & Min < 1 ~ TRUE,
TRUE ~ FALSE
),
Max_Problem = case_when(
Variable == "AFC (months)" & Max > 100 ~ TRUE,
Variable == "Calving Interval (days)" & Max > 730 ~ TRUE,
Variable == "Total Lactation Yield (kg)" & Max > 20000 ~ TRUE,
Variable == "Days Open" & Max > 500 ~ TRUE,
Variable == "Parity" & Max > 15 ~ TRUE,
TRUE ~ FALSE
)
)
kable(lact_stats_raw %>% select(-Min_Problem, -Max_Problem),
caption = "Table 1: Lactation Data - Raw Statistics (Before Any Cleaning)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(6, color = ifelse(lact_stats_raw$Min_Problem, "red", "black"),
bold = ifelse(lact_stats_raw$Min_Problem, TRUE, FALSE)) %>%
column_spec(7, color = ifelse(lact_stats_raw$Max_Problem, "red", "black"),
bold = ifelse(lact_stats_raw$Max_Problem, TRUE, FALSE))| Variable | N_Total | N_Valid | N_Missing | Pct_Missing | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|---|---|
| AFC (months) | 37,236 | 33,015 | 4,221 | 11.34 | -394 | 24,295.0 | 50.96 | 364.55 | 38 |
| Calving Interval (days) | 50,754 | 50,463 | 291 | 0.57 | 0 | 16,530.0 | 521.70 | 342.46 | 424 |
| Lactation Length (days) | 88,352 | 50,707 | 37,645 | 42.61 | 0 | 669,342.0 | 535.21 | 2,990.38 | 424 |
| Total Lactation Yield (kg) | 88,352 | 88,347 | 5 | 0.01 | 0 | 23,186.4 | 112.02 | 642.21 | 0 |
| Days Open | 88,352 | 50,710 | 37,642 | 42.60 | -280 | 669,062.0 | 255.21 | 2,990.29 | 144 |
| Parity | 88,352 | 87,990 | 362 | 0.41 | 1 | 32.0 | 2.42 | 1.82 | 2 |
π Column Descriptions for Table 1:
| Column | What It Means | How to Interpret |
|---|---|---|
| Variable | The trait being measured | - |
| N_Total | Total records in this subset | Higher = more data available |
| N_Valid | Records with actual values (not missing) | Should be close to N_Total |
| N_Missing | Records with missing (NA) values | High numbers indicate data collection gaps |
| Pct_Missing | % of records that are missing | >20% is concerning |
| Min | π΄ Smallest value found | Negative values = ERROR |
| Max | π΄ Largest value found | Extreme values = likely ERROR |
| Mean | Average of all valid values | Affected by outliers |
| SD | Standard Deviation (spread) | Large SD suggests high variability or outliers |
| Median | Middle value (50th percentile) | Less affected by outliers than Mean |
π΄ Values shown in RED indicate problems that need attention!
π¨ Critical Issues Found in Lactation Data:
# AFC issues
afc_min <- round(min(afc_parity1, na.rm = TRUE), 1)
afc_max <- round(max(afc_parity1, na.rm = TRUE), 1)
# CI issues
ci_min <- round(min(ci_parity_gt1, na.rm = TRUE), 1)
ci_max <- round(max(ci_parity_gt1, na.rm = TRUE), 1)
# Days open issues
do_min <- round(min(lactation$days_open, na.rm = TRUE), 1)
do_max <- round(max(lactation$days_open, na.rm = TRUE), 1)
cat(paste0("
| Trait | Problem Value | Why It's Wrong |
|-------|---------------|----------------|
| **AFC** | Min = ", afc_min, " months | π΄ **Negative age is impossible!** Birthdate must be before calving date |
| **AFC** | Max = ", format(afc_max, big.mark=','), " months | π΄ **", round(afc_max/12, 0), " years old at first calving is impossible!** |
| **Calving Interval** | Max = ", format(ci_max, big.mark=','), " days | π΄ **", round(ci_max/365, 1), " years between calvings is extreme** |
| **Days Open** | Min = ", do_min, " days | π΄ **Negative days open is impossible!** |
| **Days Open** | Max = ", format(do_max, big.mark=','), " days | π΄ **", round(do_max/365, 1), " years to conception is extreme** |
"))| Trait | Problem Value | Why Itβs Wrong |
|---|---|---|
| AFC | Min = -394 months | π΄ Negative age is impossible! Birthdate must be before calving date |
| AFC | Max = 24,295 months | π΄ 2025 years old at first calving is impossible! |
| Calving Interval | Max = 16,530 days | π΄ 45.3 years between calvings is extreme |
| Days Open | Min = -280 days | π΄ Negative days open is impossible! |
| Days Open | Max = 669,062 days | π΄ 1833 years to conception is extreme |
# Milking traits - RAW
milk_stats_raw <- rbind(
calc_stats(milking$daily_yield, "Daily Milk Yield (kg)"),
calc_stats(milking$milk_morning, "Morning Milk (kg)"),
calc_stats(milking$milk_evening, "Evening Milk (kg)"),
calc_stats(milking$dim, "Days in Milk (DIM)"),
calc_stats(milking$parity, "Parity"),
calc_stats(milking$afc_months, "AFC (months)")
)
# Flag problems
milk_stats_raw <- milk_stats_raw %>%
mutate(
Min_Problem = case_when(
Variable == "Daily Milk Yield (kg)" & Min < 0 ~ TRUE,
Variable == "Morning Milk (kg)" & Min < 0 ~ TRUE,
Variable == "Evening Milk (kg)" & Min < 0 ~ TRUE,
Variable == "Days in Milk (DIM)" & Min < 0 ~ TRUE,
Variable == "AFC (months)" & Min < 0 ~ TRUE,
TRUE ~ FALSE
),
Max_Problem = case_when(
Variable == "Daily Milk Yield (kg)" & Max > 60 ~ TRUE,
Variable == "Days in Milk (DIM)" & Max > 1000 ~ TRUE,
Variable == "AFC (months)" & Max > 100 ~ TRUE,
TRUE ~ FALSE
)
)
kable(milk_stats_raw %>% select(-Min_Problem, -Max_Problem),
caption = "Table 2: Milking Data - Raw Statistics (Before Any Cleaning)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(6, color = ifelse(milk_stats_raw$Min_Problem, "red", "black"),
bold = ifelse(milk_stats_raw$Min_Problem, TRUE, FALSE)) %>%
column_spec(7, color = ifelse(milk_stats_raw$Max_Problem, "red", "black"),
bold = ifelse(milk_stats_raw$Max_Problem, TRUE, FALSE))| Variable | N_Total | N_Valid | N_Missing | Pct_Missing | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|---|---|
| Daily Milk Yield (kg) | 1,048,575 | 1,048,546 | 29 | 0.00 | 0 | 50 | 5.51 | 6.99 | 1.38 |
| Morning Milk (kg) | 1,048,575 | 1,048,415 | 160 | 0.02 | 0 | 459 | 1.90 | 3.84 | 0.00 |
| Evening Milk (kg) | 1,048,575 | 1,048,413 | 162 | 0.02 | -1 | 29 | 1.81 | 3.18 | 0.00 |
| Days in Milk (DIM) | 1,048,575 | 1,048,575 | 0 | 0.00 | -729,939 | 7,752 | 220.35 | 2,008.49 | 168.00 |
| Parity | 1,048,575 | 1,048,403 | 172 | 0.02 | 1 | 32 | 2.66 | 1.99 | 2.00 |
| AFC (months) | 1,048,575 | 389,867 | 658,708 | 62.82 | -288 | 270 | 38.29 | 19.43 | 33.00 |
π¨ Critical Issues Found in Milking Data:
dim_min <- round(min(milking$dim, na.rm = TRUE), 0)
dim_max <- round(max(milking$dim, na.rm = TRUE), 0)
eve_min <- round(min(milking$milk_evening, na.rm = TRUE), 2)
cat(paste0("
| Trait | Problem Value | Why It's Wrong |
|-------|---------------|----------------|
| **DIM** | Min = ", format(dim_min, big.mark=','), " days | π΄ **Extreme negative!** Milking date is ~2000 years before calving date! |
| **DIM** | Max = ", format(dim_max, big.mark=','), " days | π΄ **", round(dim_max/365, 1), " years in milk is impossible** |
| **Evening Milk** | Min = ", eve_min, " kg | π΄ **Negative milk yield is impossible** |
"))| Trait | Problem Value | Why Itβs Wrong |
|---|---|---|
| DIM | Min = -729,939 days | π΄ Extreme negative! Milking date is ~2000 years before calving date! |
| DIM | Max = 7,752 days | π΄ 21.2 years in milk is impossible |
| Evening Milk | Min = -1 kg | π΄ Negative milk yield is impossible |
What causes negative DIM? - DIM = Milking Date - Calving Date - Negative DIM means the milking was recorded BEFORE the calving date - This is a data entry error - dates were entered incorrectly
Data Cleaning = Removing biologically implausible values
We apply filters to keep only values within acceptable biological ranges:
| Trait | Filter Range | Biological Justification |
|---|---|---|
| AFC | 18-72 months | Heifers reach puberty ~12-15 mo; first calving typically 24-36 mo |
| Calving Interval | 300-700 days | Minimum ~280 days (gestation); >700 days indicates fertility problems |
| Daily Milk Yield | 0.5-50 kg | Typical range; >50 kg/day is rare even for elite Holsteins |
| DIM | 5-500 days | Standard lactation ~305 days; extended lactations possible |
| Days Open | 21-400 days | Minimum ~21 days postpartum; >400 days is poor fertility |
| Total Lactation Yield | >0 to 15,000 kg | Zero = incomplete data; >15,000 kg is exceptional |
# Calculate each trait separately to avoid NA issues
# ============ AFC (Parity 1) ============
afc_raw <- lactation %>% filter(parity == 1, !is.na(afc_months))
afc_clean <- lactation %>% filter(parity == 1, !is.na(afc_months), afc_months >= 18, afc_months <= 72)
afc_row <- data.frame(
Trait = "AFC (months)",
Raw_N = nrow(afc_raw),
Raw_Min = round(min(afc_raw$afc_months), 1),
Raw_Max = round(max(afc_raw$afc_months), 1),
Raw_Mean = round(mean(afc_raw$afc_months), 2),
Raw_SD = round(sd(afc_raw$afc_months), 2),
Filter_Applied = "18 to 72",
Clean_N = nrow(afc_clean),
Clean_Mean = round(mean(afc_clean$afc_months), 2),
Clean_SD = round(sd(afc_clean$afc_months), 2)
)
# ============ Calving Interval (Parity > 1) ============
ci_raw <- lactation %>% filter(parity > 1, !is.na(calving_interval_days))
ci_clean <- lactation %>% filter(parity > 1, !is.na(calving_interval_days),
calving_interval_days >= 300, calving_interval_days <= 700)
ci_row <- data.frame(
Trait = "Calving Interval (days)",
Raw_N = nrow(ci_raw),
Raw_Min = round(min(ci_raw$calving_interval_days), 1),
Raw_Max = round(max(ci_raw$calving_interval_days), 1),
Raw_Mean = round(mean(ci_raw$calving_interval_days), 2),
Raw_SD = round(sd(ci_raw$calving_interval_days), 2),
Filter_Applied = "300 to 700",
Clean_N = nrow(ci_clean),
Clean_Mean = round(mean(ci_clean$calving_interval_days), 2),
Clean_SD = round(sd(ci_clean$calving_interval_days), 2)
)
# ============ Lactation Length ============
ll_raw <- lactation %>% filter(!is.na(lactation_length_days))
ll_clean <- lactation %>% filter(!is.na(lactation_length_days),
lactation_length_days >= 60, lactation_length_days <= 500)
ll_row <- data.frame(
Trait = "Lactation Length (days)",
Raw_N = nrow(ll_raw),
Raw_Min = round(min(ll_raw$lactation_length_days), 1),
Raw_Max = round(max(ll_raw$lactation_length_days), 1),
Raw_Mean = round(mean(ll_raw$lactation_length_days), 2),
Raw_SD = round(sd(ll_raw$lactation_length_days), 2),
Filter_Applied = "60 to 500",
Clean_N = nrow(ll_clean),
Clean_Mean = round(mean(ll_clean$lactation_length_days), 2),
Clean_SD = round(sd(ll_clean$lactation_length_days), 2)
)
# ============ Total Lactation Yield ============
tly_raw <- lactation %>% filter(!is.na(total_lactation_yield))
tly_clean <- lactation %>% filter(!is.na(total_lactation_yield),
total_lactation_yield > 0, total_lactation_yield <= 15000)
tly_row <- data.frame(
Trait = "Total Lactation Yield (kg)",
Raw_N = nrow(tly_raw),
Raw_Min = round(min(tly_raw$total_lactation_yield), 1),
Raw_Max = round(max(tly_raw$total_lactation_yield), 1),
Raw_Mean = round(mean(tly_raw$total_lactation_yield), 2),
Raw_SD = round(sd(tly_raw$total_lactation_yield), 2),
Filter_Applied = ">0 to 15,000",
Clean_N = nrow(tly_clean),
Clean_Mean = round(mean(tly_clean$total_lactation_yield), 2),
Clean_SD = round(sd(tly_clean$total_lactation_yield), 2)
)
# ============ Days Open ============
do_raw <- lactation %>% filter(!is.na(days_open))
do_clean <- lactation %>% filter(!is.na(days_open), days_open >= 21, days_open <= 400)
do_row <- data.frame(
Trait = "Days Open",
Raw_N = nrow(do_raw),
Raw_Min = round(min(do_raw$days_open), 1),
Raw_Max = round(max(do_raw$days_open), 1),
Raw_Mean = round(mean(do_raw$days_open), 2),
Raw_SD = round(sd(do_raw$days_open), 2),
Filter_Applied = "21 to 400",
Clean_N = nrow(do_clean),
Clean_Mean = round(mean(do_clean$days_open), 2),
Clean_SD = round(sd(do_clean$days_open), 2)
)
# ============ Daily Milk Yield ============
dmy_raw <- milking %>% filter(!is.na(daily_yield))
dmy_clean <- milking %>% filter(!is.na(daily_yield), daily_yield >= 0.5, daily_yield <= 50)
dmy_row <- data.frame(
Trait = "Daily Milk Yield (kg)",
Raw_N = nrow(dmy_raw),
Raw_Min = round(min(dmy_raw$daily_yield), 2),
Raw_Max = round(max(dmy_raw$daily_yield), 2),
Raw_Mean = round(mean(dmy_raw$daily_yield), 2),
Raw_SD = round(sd(dmy_raw$daily_yield), 2),
Filter_Applied = "0.5 to 50",
Clean_N = nrow(dmy_clean),
Clean_Mean = round(mean(dmy_clean$daily_yield), 2),
Clean_SD = round(sd(dmy_clean$daily_yield), 2)
)
# ============ DIM ============
dim_raw <- milking %>% filter(!is.na(dim))
dim_clean <- milking %>% filter(!is.na(dim), dim >= 5, dim <= 500)
dim_row <- data.frame(
Trait = "Days in Milk (DIM)",
Raw_N = nrow(dim_raw),
Raw_Min = round(min(dim_raw$dim), 0),
Raw_Max = round(max(dim_raw$dim), 0),
Raw_Mean = round(mean(dim_raw$dim), 2),
Raw_SD = round(sd(dim_raw$dim), 2),
Filter_Applied = "5 to 500",
Clean_N = nrow(dim_clean),
Clean_Mean = round(mean(dim_clean$dim), 2),
Clean_SD = round(sd(dim_clean$dim), 2)
)
# ============ Parity ============
parity_raw <- lactation %>% filter(!is.na(parity))
parity_clean <- lactation %>% filter(!is.na(parity), parity >= 1, parity <= 12)
parity_row <- data.frame(
Trait = "Parity",
Raw_N = nrow(parity_raw),
Raw_Min = min(parity_raw$parity),
Raw_Max = max(parity_raw$parity),
Raw_Mean = round(mean(parity_raw$parity), 2),
Raw_SD = round(sd(parity_raw$parity), 2),
Filter_Applied = "1 to 12",
Clean_N = nrow(parity_clean),
Clean_Mean = round(mean(parity_clean$parity), 2),
Clean_SD = round(sd(parity_clean$parity), 2)
)
# Combine all rows
comparison <- rbind(afc_row, ci_row, ll_row, tly_row, do_row, dmy_row, dim_row, parity_row)
# Calculate records lost and percentage
comparison <- comparison %>%
mutate(
Records_Lost = Raw_N - Clean_N,
Pct_Lost = round(100 * Records_Lost / Raw_N, 2),
# Flag problematic values
Min_Flag = case_when(
Trait == "AFC (months)" & Raw_Min < 0 ~ TRUE,
Trait == "Days Open" & Raw_Min < 0 ~ TRUE,
Trait == "Days in Milk (DIM)" & Raw_Min < 0 ~ TRUE,
TRUE ~ FALSE
),
Max_Flag = case_when(
Trait == "AFC (months)" & Raw_Max > 100 ~ TRUE,
Trait == "Calving Interval (days)" & Raw_Max > 1000 ~ TRUE,
Trait == "Total Lactation Yield (kg)" & Raw_Max > 20000 ~ TRUE,
Trait == "Days Open" & Raw_Max > 1000 ~ TRUE,
Trait == "Days in Milk (DIM)" & Raw_Max > 1000 ~ TRUE,
TRUE ~ FALSE
),
Loss_Flag = Pct_Lost > 20
)
# Display table with color coding
kable(comparison %>% select(-Min_Flag, -Max_Flag, -Loss_Flag),
caption = "Table 3: Comparison of Raw vs Cleaned Statistics",
col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean", "Raw SD",
"Filter", "Clean N", "Clean Mean", "Clean SD", "N Lost", "% Lost"),
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(3, color = ifelse(comparison$Min_Flag, "red", "black"),
bold = ifelse(comparison$Min_Flag, TRUE, FALSE)) %>%
column_spec(4, color = ifelse(comparison$Max_Flag, "red", "black"),
bold = ifelse(comparison$Max_Flag, TRUE, FALSE)) %>%
column_spec(12, color = ifelse(comparison$Loss_Flag, "red", "darkgreen"),
bold = TRUE)| Trait | Raw N | Raw Min | Raw Max | Raw Mean | Raw SD | Filter | Clean N | Clean Mean | Clean SD | N Lost | % Lost |
|---|---|---|---|---|---|---|---|---|---|---|---|
| AFC (months) | 33,015 | -394 | 24,295.0 | 50.96 | 364.55 | 18 to 72 | 28,229 | 39.36 | 11.89 | 4,786 | 14.50 |
| Calving Interval (days) | 50,463 | 0 | 16,530.0 | 521.70 | 342.46 | 300 to 700 | 41,248 | 433.45 | 92.15 | 9,215 | 18.26 |
| Lactation Length (days) | 50,707 | 0 | 669,342.0 | 535.21 | 2,990.38 | 60 to 500 | 33,573 | 386.08 | 60.86 | 17,134 | 33.79 |
| Total Lactation Yield (kg) | 88,347 | 0 | 23,186.4 | 112.02 | 642.21 | >0 to 15,000 | 37,184 | 261.79 | 927.82 | 51,163 | 57.91 |
| Days Open | 50,710 | -280 | 669,062.0 | 255.21 | 2,990.29 | 21 to 400 | 40,802 | 150.28 | 87.90 | 9,908 | 19.54 |
| Daily Milk Yield (kg) | 1,048,546 | 0 | 50.0 | 5.51 | 6.99 | 0.5 to 50 | 675,193 | 8.54 | 7.08 | 373,353 | 35.61 |
| Days in Milk (DIM) | 1,048,575 | -729,939 | 7,752.0 | 220.35 | 2,008.49 | 5 to 500 | 957,939 | 169.61 | 112.75 | 90,636 | 8.64 |
| Parity | 87,990 | 1 | 32.0 | 2.42 | 1.82 | 1 to 12 | 87,927 | 2.41 | 1.76 | 63 | 0.07 |
π Column Descriptions for Table 3:
| Column | Description | What to Look For |
|---|---|---|
| Trait | Variable being analyzed | - |
| Raw N | Number of records BEFORE filtering | Total available data |
| Raw Min | Minimum value in raw data | π΄ Red = impossible value (e.g., negative) |
| Raw Max | Maximum value in raw data | π΄ Red = extreme outlier |
| Raw Mean | Average BEFORE filtering | β οΈ Biased by outliers - donβt report this! |
| Raw SD | Standard deviation before filtering | Large SD may indicate outliers |
| Filter | Acceptable biological range | Values used to clean data |
| Clean N | Number of records AFTER filtering | Data available for analysis |
| Clean Mean | Average AFTER filtering | β Report this value! |
| Clean SD | Standard deviation after filtering | Use for reporting |
| N Lost | Records removed by filtering | How many outliers existed |
| % Lost | Percentage removed | π΄ Red if >20% = major data quality issue |
π¨ Major Data Quality Issues Identified:
major_issues <- comparison %>% filter(Pct_Lost > 20)
if(nrow(major_issues) > 0) {
cat("| Trait | % Lost | Interpretation |\n")
cat("|-------|--------|----------------|\n")
for(i in 1:nrow(major_issues)) {
interpretation <- case_when(
major_issues$Trait[i] == "Total Lactation Yield (kg)" ~ "Many incomplete lactations or zeros recorded as missing",
major_issues$Trait[i] == "Daily Milk Yield (kg)" ~ "Many zero yields - may be dry cows or missing data",
major_issues$Trait[i] == "Calving Interval (days)" ~ "Many extreme values - date recording issues",
TRUE ~ "Significant outliers present"
)
cat(paste0("| **", major_issues$Trait[i], "** | ", major_issues$Pct_Lost[i], "% | ", interpretation, " |\n"))
}
}| Trait | % Lost | Interpretation |
|---|---|---|
| Lactation Length (days) | 33.79% | Significant outliers present |
| Total Lactation Yield (kg) | 57.91% | Many incomplete lactations or zeros recorded as missing |
| Daily Milk Yield (kg) | 35.61% | Many zero yields - may be dry cows or missing data |
Impact: When >20% of records are removed, this affects:
Recommendation: Investigate WHY these records have problematic values before simply deleting them.
# AFC data with categories
afc_data <- lactation %>%
filter(parity == 1, !is.na(afc_months)) %>%
mutate(
afc_category = case_when(
afc_months < 0 ~ "Negative (ERROR)",
afc_months >= 0 & afc_months < 18 ~ "Too Young (<18 mo)",
afc_months >= 18 & afc_months <= 36 ~ "Optimal (18-36 mo)",
afc_months > 36 & afc_months <= 72 ~ "Late (37-72 mo)",
afc_months > 72 & afc_months <= 100 ~ "Very Late (73-100 mo)",
afc_months > 100 ~ "Extreme (>100 mo)"
),
afc_category = factor(afc_category, levels = c("Negative (ERROR)", "Too Young (<18 mo)",
"Optimal (18-36 mo)", "Late (37-72 mo)",
"Very Late (73-100 mo)", "Extreme (>100 mo)"))
)
# Summary table
afc_summary <- afc_data %>%
group_by(afc_category) %>%
summarise(
N = n(),
Min = round(min(afc_months), 1),
Max = round(max(afc_months), 1),
Mean = round(mean(afc_months), 2),
SD = round(sd(afc_months), 2),
Median = round(median(afc_months), 1),
.groups = "drop"
) %>%
mutate(
Percentage = round(100 * N / sum(N), 2),
Status = case_when(
afc_category == "Negative (ERROR)" ~ "π΄ ERROR",
afc_category == "Too Young (<18 mo)" ~ "π WARNING",
afc_category == "Optimal (18-36 mo)" ~ "π’ OPTIMAL",
afc_category == "Late (37-72 mo)" ~ "π‘ ACCEPTABLE",
afc_category == "Very Late (73-100 mo)" ~ "π WARNING",
afc_category == "Extreme (>100 mo)" ~ "π΄ ERROR"
)
)
kable(afc_summary,
caption = "Table 4: AFC Distribution by Category with Quality Status",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| afc_category | N | Min | Max | Mean | SD | Median | Percentage | Status |
|---|---|---|---|---|---|---|---|---|
| Negative (ERROR) | 127 | -394 | -1 | -102.76 | 106.54 | -56 | 0.38 | π΄ ERROR | |
| Too Young (<18 mo) | 683 | 0 | 17 | 5.74 | 6.30 | 3 | 2.07 | π WARNING | |
| Optimal (18-36 mo) | 14,700 | 18 | 36 | 30.33 | 3.85 | 31 | 44.53 | π’ OPTIMAL | |
| Late (37-72 mo) | 13,529 | 37 | 72 | 49.17 | 9.68 | 47 | 40.98 | π‘ ACCEPTABLE | |
| Very Late (73-100 mo) | 2,495 | 73 | 100 | 84.47 | 8.11 | 84 | 7.56 | π WARNING | |
| Extreme (>100 mo) | 1,481 | 101 | 24,295 | 249.59 | 1,706.99 | 117 | 4.49 | π΄ ERROR | |
π Understanding AFC Categories:
| Category | Range | Status | Explanation |
|---|---|---|---|
| Negative | < 0 months | π΄ ERROR | Impossible - birthdate after calving date |
| Too Young | 0-17 months | π WARNING | Biologically unlikely - heifers not sexually mature |
| Optimal | 18-36 months | π’ OPTIMAL | Target range for dairy cattle |
| Late | 37-72 months | π‘ ACCEPTABLE | Possible but economically suboptimal |
| Very Late | 73-100 months | π WARNING | Unusual - verify data |
| Extreme | >100 months | π΄ ERROR | Impossible - >8 years at first calving |
Why AFC Matters:
# Plot 1: Full distribution
p1 <- ggplot(afc_data, aes(x = afc_months)) +
geom_histogram(aes(fill = afc_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "#D32F2F",
"Too Young (<18 mo)" = "#FF9800",
"Optimal (18-36 mo)" = "#4CAF50",
"Late (37-72 mo)" = "#FFC107",
"Very Late (73-100 mo)" = "#FF5722",
"Extreme (>100 mo)" = "#9C27B0"
)) +
geom_vline(xintercept = c(0, 18, 36, 72), linetype = "dashed", color = "black", linewidth = 0.5) +
annotate("text", x = 27, y = Inf, label = "Optimal\nRange", vjust = 1.5, size = 3, color = "darkgreen") +
labs(
title = "Figure 1: Age at First Calving - Complete Distribution",
subtitle = paste0("Total N = ", format(nrow(afc_data), big.mark = ","),
" | Raw Mean = ", round(mean(afc_data$afc_months), 1), " months",
" | Note: Y-axis is log scale to show outliers"),
x = "AFC (months)",
y = "Count (log scale)",
fill = "Category"
) +
theme(legend.position = "bottom") +
scale_y_log10(labels = comma)
# Plot 2: Cleaned data only
afc_clean <- afc_data %>% filter(afc_months >= 18, afc_months <= 72)
p2 <- ggplot(afc_clean, aes(x = afc_months)) +
geom_histogram(bins = 40, fill = "steelblue", color = "black", linewidth = 0.2) +
geom_vline(xintercept = mean(afc_clean$afc_months), linetype = "dashed", color = "red", linewidth = 1) +
annotate("text", x = mean(afc_clean$afc_months) + 2, y = Inf,
label = paste0("Clean Mean = ", round(mean(afc_clean$afc_months), 1), " months"),
vjust = 2, hjust = 0, color = "red", fontface = "bold") +
labs(
title = "Figure 2: AFC - Cleaned Data Only (18-72 months)",
subtitle = paste0("Clean N = ", format(nrow(afc_clean), big.mark = ","),
" (", round(100*nrow(afc_clean)/nrow(afc_data), 1), "% of total)",
" | Mean = ", round(mean(afc_clean$afc_months), 1),
" Β± ", round(sd(afc_clean$afc_months), 1), " months"),
x = "AFC (months)",
y = "Count"
)
grid.arrange(p1, p2, ncol = 1, heights = c(1.2, 1))π How to Read These Figures:
Figure 1 (All Data):
Figure 2 (Cleaned Data):
Key Insight: The raw mean is pulled upward by extreme outliers. Always report the CLEANED mean.
# AFC by breed - CLEANED
afc_by_breed <- lactation %>%
filter(parity == 1, !is.na(afc_months), !is.na(main_breed_name),
afc_months >= 18, afc_months <= 72) %>%
group_by(main_breed_name) %>%
summarise(
N = n(),
Min = round(min(afc_months), 1),
Max = round(max(afc_months), 1),
Mean = round(mean(afc_months), 2),
SD = round(sd(afc_months), 2),
SE = round(sd(afc_months) / sqrt(n()), 2),
Median = round(median(afc_months), 1),
.groups = "drop"
) %>%
filter(N >= 30) %>%
arrange(Mean)
kable(afc_by_breed,
caption = "Table 5: AFC by Breed (Cleaned: 18-72 months, N β₯ 30)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| main_breed_name | N | Min | Max | Mean | SD | SE | Median |
|---|---|---|---|---|---|---|---|
| Scandinavian Red | 50 | 18 | 38 | 26.84 | 4.51 | 0.64 | 25.5 |
| Jersey crosses | 70 | 18 | 52 | 30.11 | 6.17 | 0.74 | 29.5 |
| Gir | 199 | 21 | 72 | 31.07 | 7.48 | 0.53 | 30.0 |
| Fleckvieh | 55 | 21 | 62 | 31.49 | 8.06 | 1.09 | 30.0 |
| Jersey | 1,144 | 18 | 72 | 31.50 | 10.19 | 0.30 | 28.0 |
| Holstein-Friesian crosses | 1,255 | 18 | 72 | 35.78 | 7.02 | 0.20 | 35.0 |
| Ayrshire crosses | 88 | 18 | 72 | 36.75 | 10.89 | 1.16 | 34.0 |
| Holstein Friesian | 15,944 | 18 | 72 | 37.21 | 11.06 | 0.09 | 34.0 |
| Sahiwal crosses | 237 | 22 | 60 | 38.36 | 8.24 | 0.54 | 37.0 |
| Boran | 101 | 19 | 69 | 38.67 | 9.09 | 0.90 | 37.0 |
| Unknown | 43 | 24 | 71 | 40.09 | 14.00 | 2.13 | 36.0 |
| Guernsey | 574 | 19 | 72 | 41.44 | 12.62 | 0.53 | 38.0 |
| Ayrshire | 5,188 | 18 | 72 | 43.49 | 11.60 | 0.16 | 41.0 |
| Brown Swiss | 147 | 19 | 72 | 44.18 | 13.94 | 1.15 | 42.0 |
| Sahiwal | 2,875 | 18 | 72 | 49.27 | 11.44 | 0.21 | 49.0 |
π Column Descriptions for Table 5:
| Column | Description |
|---|---|
| main_breed_name | Breed of the animal |
| N | Number of first-parity records (after filtering 18-72 mo) |
| Min | Minimum AFC for this breed (within filtered range) |
| Max | Maximum AFC for this breed (within filtered range) |
| Mean | β Average AFC - use this for reporting |
| SD | Standard Deviation - spread of AFC values |
| SE | Standard Error = SD/βN - precision of mean estimate |
| Median | Middle value - alternative to mean |
Interpretation:
ggplot(afc_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
geom_errorbar(aes(ymin = Mean - SE, ymax = Mean + SE), width = 0.2, color = "black") +
geom_text(aes(label = paste0(round(Mean, 1), " mo\n(n=", format(N, big.mark=","), ")")),
hjust = -0.1, size = 3) +
geom_hline(yintercept = 30, linetype = "dashed", color = "darkgreen", linewidth = 1) +
annotate("text", x = 0.5, y = 31, label = "Target: 30 months", hjust = 0, color = "darkgreen", size = 3) +
coord_flip() +
labs(
title = "Figure 3: Mean AFC by Breed (Cleaned Data: 18-72 months)",
subtitle = "Error bars = Β± 1 Standard Error | Green line = target AFC of 30 months",
x = "Breed",
y = "Mean AFC (months)"
) +
scale_y_continuous(limits = c(0, max(afc_by_breed$Mean + afc_by_breed$SE) * 1.5))π How to Read Figure 3:
# Daily yield categories
dmy_data <- milking %>%
filter(!is.na(daily_yield)) %>%
mutate(
yield_category = case_when(
daily_yield < 0 ~ "Negative (ERROR)",
daily_yield == 0 ~ "Zero Yield",
daily_yield > 0 & daily_yield < 0.5 ~ "Very Low (<0.5 kg)",
daily_yield >= 0.5 & daily_yield <= 10 ~ "Low (0.5-10 kg)",
daily_yield > 10 & daily_yield <= 20 ~ "Medium (10-20 kg)",
daily_yield > 20 & daily_yield <= 30 ~ "Good (20-30 kg)",
daily_yield > 30 & daily_yield <= 50 ~ "High (30-50 kg)",
daily_yield > 50 ~ "Extreme (>50 kg)"
),
yield_category = factor(yield_category, levels = c("Negative (ERROR)", "Zero Yield",
"Very Low (<0.5 kg)", "Low (0.5-10 kg)",
"Medium (10-20 kg)", "Good (20-30 kg)",
"High (30-50 kg)", "Extreme (>50 kg)"))
)
# Summary
dmy_summary <- dmy_data %>%
group_by(yield_category) %>%
summarise(
N = n(),
Min = round(min(daily_yield), 2),
Max = round(max(daily_yield), 2),
Mean = round(mean(daily_yield), 2),
SD = round(sd(daily_yield), 2),
.groups = "drop"
) %>%
mutate(
Percentage = round(100 * N / sum(N), 2),
Status = case_when(
yield_category == "Negative (ERROR)" ~ "π΄ ERROR",
yield_category == "Zero Yield" ~ "π CHECK",
yield_category == "Very Low (<0.5 kg)" ~ "π WARNING",
yield_category %in% c("Low (0.5-10 kg)", "Medium (10-20 kg)", "Good (20-30 kg)") ~ "π’ NORMAL",
yield_category == "High (30-50 kg)" ~ "π΅ HIGH",
yield_category == "Extreme (>50 kg)" ~ "π£ VERIFY"
)
)
kable(dmy_summary,
caption = "Table 6: Daily Milk Yield Distribution by Category",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| yield_category | N | Min | Max | Mean | SD | Percentage | Status |
|---|---|---|---|---|---|---|---|
| Zero Yield | 333,069 | 0.00 | 0.00 | 0.00 | 0.00 | 31.76 | π CHECK | |
| Very Low (<0.5 kg) | 40,284 | 0.01 | 0.49 | 0.28 | 0.12 | 3.84 | π WARNING | |
| Low (0.5-10 kg) | 393,426 | 0.50 | 10.00 | 3.43 | 3.11 | 37.52 | π’ NORMAL | |
| Medium (10-20 kg) | 237,650 | 10.01 | 20.00 | 14.23 | 2.69 | 22.66 | π’ NORMAL | |
| Good (20-30 kg) | 42,181 | 20.10 | 30.00 | 23.08 | 2.45 | 4.02 | π’ NORMAL | |
| High (30-50 kg) | 1,936 | 30.10 | 50.00 | 32.68 | 2.83 | 0.18 | π΅ HIGH | |
π Understanding Milk Yield Categories:
| Category | Range | Status | Explanation |
|---|---|---|---|
| Negative | < 0 kg | π΄ ERROR | Impossible - data entry error |
| Zero | = 0 kg | π CHECK | May be: dry cow, sick, or missing data recorded as 0 |
| Very Low | 0-0.5 kg | π WARNING | Unusual - verify recording |
| Low | 0.5-10 kg | π’ NORMAL | Typical for local/indigenous breeds or late lactation |
| Medium | 10-20 kg | π’ NORMAL | Good production for crossbreds |
| Good | 20-30 kg | π’ NORMAL | High production crossbreds or pure exotics |
| High | 30-50 kg | π΅ HIGH | Elite producers - verify if correct |
| Extreme | > 50 kg | π£ VERIFY | Very rare - likely error unless elite Holstein |
Common Reasons for Zero Yields:
# Plot 1: All data
p1 <- ggplot(dmy_data, aes(x = daily_yield)) +
geom_histogram(aes(fill = yield_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "#D32F2F",
"Zero Yield" = "#757575",
"Very Low (<0.5 kg)" = "#FF9800",
"Low (0.5-10 kg)" = "#8BC34A",
"Medium (10-20 kg)" = "#4CAF50",
"Good (20-30 kg)" = "#2E7D32",
"High (30-50 kg)" = "#2196F3",
"Extreme (>50 kg)" = "#9C27B0"
)) +
labs(
title = "Figure 4: Daily Milk Yield - Complete Distribution",
subtitle = paste0("Total N = ", format(nrow(dmy_data), big.mark = ","),
" | Note: Large spike at zero - investigate these records"),
x = "Daily Yield (kg)",
y = "Count (log scale)",
fill = "Category"
) +
scale_y_log10(labels = comma) +
theme(legend.position = "bottom")
# Plot 2: Cleaned data
clean_dmy <- dmy_data %>% filter(daily_yield >= 0.5, daily_yield <= 50)
p2 <- ggplot(clean_dmy, aes(x = daily_yield)) +
geom_histogram(bins = 50, fill = "steelblue", color = "black", linewidth = 0.1) +
geom_vline(xintercept = mean(clean_dmy$daily_yield), linetype = "dashed", color = "red", linewidth = 1) +
annotate("text", x = mean(clean_dmy$daily_yield) + 1, y = Inf,
label = paste0("Clean Mean = ", round(mean(clean_dmy$daily_yield), 2), " kg/day"),
vjust = 2, hjust = 0, color = "red", fontface = "bold", size = 4) +
labs(
title = "Figure 5: Daily Milk Yield - Cleaned Data (0.5-50 kg)",
subtitle = paste0("Clean N = ", format(nrow(clean_dmy), big.mark = ","),
" (", round(100*nrow(clean_dmy)/nrow(dmy_data), 1), "% retained)",
" | Mean = ", round(mean(clean_dmy$daily_yield), 2),
" Β± ", round(sd(clean_dmy$daily_yield), 2), " kg/day"),
x = "Daily Yield (kg)",
y = "Count"
)
grid.arrange(p1, p2, ncol = 1)β οΈ Important Finding: High Percentage of Zero Yields
n_zeros <- sum(dmy_data$daily_yield == 0)
pct_zeros <- round(100 * n_zeros / nrow(dmy_data), 1)
cat(paste0("
**", format(n_zeros, big.mark=","), " records (", pct_zeros, "%) have zero daily yield.**
This needs investigation:
1. **Are these dry cows?** Check if DIM is near end of lactation
2. **Are these sick animals?** Cross-reference with health records
3. **Is this missing data?** Should these be NA instead of 0?
4. **Project-specific issue?** Check if zeros concentrated in certain projects
**Recommendation:** Do NOT simply delete zeros - investigate the cause first!
"))333,069 records (31.8%) have zero daily yield.
This needs investigation:
Recommendation: Do NOT simply delete zeros - investigate the cause first!
# Create status indicators
comparison <- comparison %>%
mutate(
Quality_Status = case_when(
Pct_Lost < 5 ~ "π’ Good",
Pct_Lost < 10 ~ "π‘ Acceptable",
Pct_Lost < 20 ~ "π Warning",
TRUE ~ "π΄ Major Issue"
)
)
kable(comparison %>% select(Trait, Raw_N, Raw_Min, Raw_Max, Raw_Mean,
Filter_Applied, Clean_N, Clean_Mean, Clean_SD,
Records_Lost, Pct_Lost, Quality_Status),
caption = "Table 7: Final Data Quality Summary",
col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean",
"Filter", "Clean N", "Clean Mean", "Clean SD",
"N Lost", "% Lost", "Status"),
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(3, color = ifelse(comparison$Min_Flag, "red", "black"),
bold = ifelse(comparison$Min_Flag, TRUE, FALSE)) %>%
column_spec(4, color = ifelse(comparison$Max_Flag, "red", "black"),
bold = ifelse(comparison$Max_Flag, TRUE, FALSE)) %>%
column_spec(11, color = ifelse(comparison$Loss_Flag, "red", "darkgreen"),
bold = TRUE)| Trait | Raw N | Raw Min | Raw Max | Raw Mean | Filter | Clean N | Clean Mean | Clean SD | N Lost | % Lost | Status |
|---|---|---|---|---|---|---|---|---|---|---|---|
| AFC (months) | 33,015 | -394 | 24,295.0 | 50.96 | 18 to 72 | 28,229 | 39.36 | 11.89 | 4,786 | 14.50 | π Warning | |
| Calving Interval (days) | 50,463 | 0 | 16,530.0 | 521.70 | 300 to 700 | 41,248 | 433.45 | 92.15 | 9,215 | 18.26 | π Warning | |
| Lactation Length (days) | 50,707 | 0 | 669,342.0 | 535.21 | 60 to 500 | 33,573 | 386.08 | 60.86 | 17,134 | 33.79 | π΄ Major Issue | |
| Total Lactation Yield (kg) | 88,347 | 0 | 23,186.4 | 112.02 | >0 to 15,000 | 37,184 | 261.79 | 927.82 | 51,163 | 57.91 | π΄ Major Issue | |
| Days Open | 50,710 | -280 | 669,062.0 | 255.21 | 21 to 400 | 40,802 | 150.28 | 87.90 | 9,908 | 19.54 | π Warning | |
| Daily Milk Yield (kg) | 1,048,546 | 0 | 50.0 | 5.51 | 0.5 to 50 | 675,193 | 8.54 | 7.08 | 373,353 | 35.61 | π΄ Major Issue | |
| Days in Milk (DIM) | 1,048,575 | -729,939 | 7,752.0 | 220.35 | 5 to 500 | 957,939 | 169.61 | 112.75 | 90,636 | 8.64 | π‘ Acceptable | |
| Parity | 87,990 | 1 | 32.0 | 2.42 | 1 to 12 | 87,927 | 2.41 | 1.76 | 63 | 0.07 | π’ Good | |
π How to Use Table 7:
| What to Report | Column to Use | Notes |
|---|---|---|
| Sample size | Clean N | After removing outliers |
| Average value | Clean Mean | NOT Raw Mean (biased by outliers) |
| Variation | Clean SD | Standard deviation of cleaned data |
| Data quality | % Lost & Status | Document in methods section |
Status Legend:
π¨ Critical Issues Requiring Attention:
cat("
| Issue | Severity | Records Affected | Action Required |
|-------|----------|------------------|-----------------|
| **Negative AFC values** | π΄ Critical | Check birthdate/calving date | Fix or remove |
| **Negative DIM values** | π΄ Critical | Milking date before calving | Fix dates |
| **Zero total lactation yields** | π΄ Major | ~58% of records | Investigate - incomplete data? |
| **Zero daily yields** | π Warning | ~32% of records | Check if dry cows or missing |
| **Extreme AFC (>100 mo)** | π΄ Critical | Data entry errors | Remove |
| **Negative Days Open** | π΄ Critical | Conception before calving | Fix dates |
")| Issue | Severity | Records Affected | Action Required |
|---|---|---|---|
| Negative AFC values | π΄ Critical | Check birthdate/calving date | Fix or remove |
| Negative DIM values | π΄ Critical | Milking date before calving | Fix dates |
| Zero total lactation yields | π΄ Major | ~58% of records | Investigate - incomplete data? |
| Zero daily yields | π Warning | ~32% of records | Check if dry cows or missing |
| Extreme AFC (>100 mo) | π΄ Critical | Data entry errors | Remove |
| Negative Days Open | π΄ Critical | Conception before calving | Fix dates |
β Recommended Data Cleaning Workflow:
Step 1: Remove Impossible Values
# Remove biologically impossible values
data_clean <- data %>%
filter(
afc_months >= 0, # No negative ages
dim >= 0, # No negative DIM
days_open >= 0, # No negative days open
daily_yield >= 0 # No negative yields
)Step 2: Apply Biological Filters
# Keep only biologically plausible values
data_clean <- data_clean %>%
filter(
afc_months >= 18 & afc_months <= 72,
calving_interval_days >= 300 & calving_interval_days <= 700,
daily_yield >= 0.5 & daily_yield <= 50,
dim >= 5 & dim <= 500,
days_open >= 21 & days_open <= 400,
total_lactation_yield > 0 & total_lactation_yield <= 15000
)Step 3: Document What You Did
π What to Report in Publications:
When reporting results, always include:
Example Methods Statement:
βAge at first calving (AFC) was calculated for first-parity animals. Records with AFC outside the biologically plausible range of 18-72 months were excluded (N = X, Y% of records), resulting in a final sample of N = Z animals. Mean AFC was X.X Β± Y.Y months.β
## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_Kenya.utf8 LC_CTYPE=English_Kenya.utf8
## [3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Kenya.utf8
##
## time zone: Africa/Nairobi
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] viridis_0.6.5 viridisLite_0.4.2 kableExtra_1.4.0 knitr_1.50
## [5] gridExtra_2.3 scales_1.4.0 ggplot2_3.5.2 tidyr_1.3.1
## [9] dplyr_1.1.4 data.table_1.17.8
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 tidyselect_1.2.1
## [5] xml2_1.4.0 stringr_1.5.1 jquerylib_0.1.4 textshaping_1.0.2
## [9] systemfonts_1.2.3 yaml_2.3.10 fastmap_1.2.0 R6_2.6.1
## [13] labeling_0.4.3 generics_0.1.4 tibble_3.3.0 svglite_2.2.1
## [17] bslib_0.9.0 pillar_1.11.0 RColorBrewer_1.1-3 rlang_1.1.6
## [21] stringi_1.8.7 cachem_1.1.0 xfun_0.52 sass_0.4.10
## [25] cli_3.6.5 withr_3.0.2 magrittr_2.0.3 digest_0.6.37
## [29] grid_4.5.1 rstudioapi_0.17.1 lifecycle_1.0.4 vctrs_0.6.5
## [33] evaluate_1.0.5 glue_1.8.0 farver_2.1.2 rmarkdown_2.29
## [37] purrr_1.1.0 tools_4.5.1 pkgconfig_2.0.3 htmltools_0.5.8.1
Report generated on 08 December 2025 at 12:39