# Load required packages
library(data.table)
library(dplyr)
library(tidyr)
library(ggplot2)
library(scales)
library(gridExtra)
library(knitr)
library(kableExtra)
library(viridis)
# Set theme
theme_set(theme_bw(base_size = 12))This report provides a comprehensive data quality assessment for the Kenya ADGG/TDGG dairy datasets. It identifies:
# SET YOUR FILE PATH HERE
data_path <- "C:/Users/SAlemu/OneDrive - CGIAR/Documents/KenyaData/New_Kenya_data/"
# Read data - treating "NULL" as NA
lactation <- fread(paste0(data_path, "lactation_calving.csv"),
na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
stringsAsFactors = FALSE)
milking <- fread(paste0(data_path, "milking_data.csv"),
na.strings = c("NULL", "null", "NA", "N/A", "", " ", "NaN"),
stringsAsFactors = FALSE)
# Convert numeric columns
lactation <- lactation %>%
mutate(
parity = as.numeric(parity),
afc_months = as.numeric(afc_months),
afc_days = as.numeric(afc_days),
calving_interval_days = as.numeric(calving_interval_days),
calving_interval_months = as.numeric(calving_interval_months),
lactation_length_days = as.numeric(lactation_length_days),
total_lactation_yield = as.numeric(total_lactation_yield),
testday_count = as.numeric(testday_count),
days_open = as.numeric(days_open)
)
milking <- milking %>%
mutate(
parity = as.numeric(parity),
afc_months = as.numeric(afc_months),
afc_days = as.numeric(afc_days),
dim = as.numeric(dim),
daily_yield = as.numeric(daily_yield),
milk_morning = as.numeric(milk_morning),
milk_evening = as.numeric(milk_evening),
milk_midday = as.numeric(milk_midday),
testday_no = as.numeric(testday_no),
calving_age_months = as.numeric(calving_age_months)
)
cat("Lactation data:", format(nrow(lactation), big.mark = ","), "records\n")## Lactation data: 88,352 records
## Milking data: 1,048,575 records
This section shows the raw statistics including outliers to highlight data quality issues.
# Function to calculate comprehensive stats with NA handling
calc_stats <- function(x, var_name) {
x_valid <- x[!is.na(x)]
n_total <- length(x)
n_valid <- length(x_valid)
n_missing <- sum(is.na(x))
if (n_valid == 0) {
return(data.frame(
Variable = var_name,
N_Total = n_total,
N_Valid = 0,
N_Missing = n_missing,
Pct_Missing = 100,
Min = NA, Max = NA, Mean = NA, SD = NA, Median = NA
))
}
data.frame(
Variable = var_name,
N_Total = n_total,
N_Valid = n_valid,
N_Missing = n_missing,
Pct_Missing = round(100 * n_missing / n_total, 2),
Min = round(min(x_valid), 2),
Max = round(max(x_valid), 2),
Mean = round(mean(x_valid), 2),
SD = round(sd(x_valid), 2),
Median = round(median(x_valid), 2)
)
}
# Lactation traits - RAW (including outliers)
lact_stats_raw <- rbind(
calc_stats(lactation$afc_months[lactation$parity == 1], "AFC (months) - Parity 1"),
calc_stats(lactation$afc_days[lactation$parity == 1], "AFC (days) - Parity 1"),
calc_stats(lactation$calving_interval_days[lactation$parity > 1], "Calving Interval (days)"),
calc_stats(lactation$lactation_length_days, "Lactation Length (days)"),
calc_stats(lactation$total_lactation_yield, "Total Lactation Yield (kg)"),
calc_stats(lactation$days_open, "Days Open"),
calc_stats(lactation$parity, "Parity"),
calc_stats(lactation$testday_count, "Test-day Count")
)
kable(lact_stats_raw,
caption = "Lactation Data - RAW Statistics (Including Outliers)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(6, color = "red", bold = TRUE) %>%
column_spec(7, color = "red", bold = TRUE)| Variable | N_Total | N_Valid | N_Missing | Pct_Missing | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|---|---|
| AFC (months) - Parity 1 | 37,598 | 33,015 | 4,583 | 12.19 | -394 | 24,295.0 | 50.96 | 364.55 | 38 |
| AFC (days) - Parity 1 | 37,598 | 33,015 | 4,583 | 12.19 | -11,956 | 738,588.0 | 1,563.98 | 11,082.36 | 1,158 |
| Calving Interval (days) | 51,116 | 50,463 | 653 | 1.28 | 0 | 16,530.0 | 521.70 | 342.46 | 424 |
| Lactation Length (days) | 88,352 | 50,707 | 37,645 | 42.61 | 0 | 669,342.0 | 535.21 | 2,990.38 | 424 |
| Total Lactation Yield (kg) | 88,352 | 88,347 | 5 | 0.01 | 0 | 23,186.4 | 112.02 | 642.21 | 0 |
| Days Open | 88,352 | 50,710 | 37,642 | 42.60 | -280 | 669,062.0 | 255.21 | 2,990.29 | 144 |
| Parity | 88,352 | 87,990 | 362 | 0.41 | 1 | 32.0 | 2.42 | 1.82 | 2 |
| Test-day Count | 88,352 | 88,352 | 0 | 0.00 | 0 | 2,362.0 | 19.61 | 68.67 | 0 |
⚠️ KEY OUTLIERS VISIBLE IN MIN/MAX:
# Milking traits - RAW
milk_stats_raw <- rbind(
calc_stats(milking$daily_yield, "Daily Milk Yield (kg)"),
calc_stats(milking$milk_morning, "Morning Milk (kg)"),
calc_stats(milking$milk_evening, "Evening Milk (kg)"),
calc_stats(milking$dim, "Days in Milk (DIM)"),
calc_stats(milking$parity, "Parity"),
calc_stats(milking$afc_months, "AFC (months)"),
calc_stats(milking$testday_no, "Test-day Number")
)
kable(milk_stats_raw,
caption = "Milking Data - RAW Statistics (Including Outliers)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(6, color = "red", bold = TRUE) %>%
column_spec(7, color = "red", bold = TRUE)| Variable | N_Total | N_Valid | N_Missing | Pct_Missing | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|---|---|
| Daily Milk Yield (kg) | 1,048,575 | 1,048,546 | 29 | 0.00 | 0 | 50 | 5.51 | 6.99 | 1.38 |
| Morning Milk (kg) | 1,048,575 | 1,048,415 | 160 | 0.02 | 0 | 459 | 1.90 | 3.84 | 0.00 |
| Evening Milk (kg) | 1,048,575 | 1,048,413 | 162 | 0.02 | -1 | 29 | 1.81 | 3.18 | 0.00 |
| Days in Milk (DIM) | 1,048,575 | 1,048,575 | 0 | 0.00 | -729,939 | 7,752 | 220.35 | 2,008.49 | 168.00 |
| Parity | 1,048,575 | 1,048,403 | 172 | 0.02 | 1 | 32 | 2.66 | 1.99 | 2.00 |
| AFC (months) | 1,048,575 | 389,867 | 658,708 | 62.82 | -288 | 270 | 38.29 | 19.43 | 33.00 |
| Test-day Number | 1,048,575 | 1,017,335 | 31,240 | 2.98 | 1 | 1,514 | 63.66 | 123.64 | 18.00 |
⚠️ KEY OUTLIERS VISIBLE IN MIN/MAX:
Comparing raw vs cleaned statistics to show the impact of outliers.
# Define cleaning thresholds
# AFC: 18-72 months
# Calving Interval: 300-700 days
# Lactation Length: 60-500 days
# Total Lactation Yield: > 0 and <= 15000 kg
# Days Open: 21-400 days
# Parity: 1-12
# Daily Yield: 0.5-50 kg
# DIM: 5-500 days
# Lactation - CLEANED stats
lact_stats_clean <- rbind(
calc_stats(lactation$afc_months[lactation$parity == 1 &
lactation$afc_months >= 18 &
lactation$afc_months <= 72],
"AFC (months) - CLEANED"),
calc_stats(lactation$calving_interval_days[lactation$parity > 1 &
lactation$calving_interval_days >= 300 &
lactation$calving_interval_days <= 700],
"Calving Interval (days) - CLEANED"),
calc_stats(lactation$lactation_length_days[lactation$lactation_length_days >= 60 &
lactation$lactation_length_days <= 500],
"Lactation Length (days) - CLEANED"),
calc_stats(lactation$total_lactation_yield[lactation$total_lactation_yield > 0 &
lactation$total_lactation_yield <= 15000],
"Total Lactation Yield (kg) - CLEANED"),
calc_stats(lactation$days_open[lactation$days_open >= 21 &
lactation$days_open <= 400],
"Days Open - CLEANED"),
calc_stats(lactation$parity[lactation$parity >= 1 & lactation$parity <= 12],
"Parity - CLEANED")
)
kable(lact_stats_clean,
caption = "Lactation Data - CLEANED Statistics (Outliers Removed)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)| Variable | N_Total | N_Valid | N_Missing | Pct_Missing | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|---|---|
| AFC (months) - CLEANED | 32,804 | 28,229 | 4,575 | 13.95 | 18.00 | 72.0 | 39.36 | 11.89 | 36 |
| Calving Interval (days) - CLEANED | 41,843 | 41,248 | 595 | 1.42 | 300.00 | 700.0 | 433.45 | 92.15 | 408 |
| Lactation Length (days) - CLEANED | 71,218 | 33,573 | 37,645 | 52.86 | 60.00 | 500.0 | 386.08 | 60.86 | 383 |
| Total Lactation Yield (kg) - CLEANED | 37,189 | 37,184 | 5 | 0.01 | 0.02 | 14,973.5 | 261.79 | 927.82 | 19 |
| Days Open - CLEANED | 78,444 | 40,802 | 37,642 | 47.99 | 21.00 | 400.0 | 150.28 | 87.90 | 126 |
| Parity - CLEANED | 88,289 | 87,927 | 362 | 0.41 | 1.00 | 12.0 | 2.41 | 1.76 | 2 |
# Milking - CLEANED stats
milk_stats_clean <- rbind(
calc_stats(milking$daily_yield[milking$daily_yield >= 0.5 & milking$daily_yield <= 50],
"Daily Milk Yield (kg) - CLEANED"),
calc_stats(milking$milk_morning[milking$milk_morning > 0 & milking$milk_morning <= 30],
"Morning Milk (kg) - CLEANED"),
calc_stats(milking$milk_evening[milking$milk_evening > 0 & milking$milk_evening <= 20],
"Evening Milk (kg) - CLEANED"),
calc_stats(milking$dim[milking$dim >= 5 & milking$dim <= 500],
"DIM - CLEANED"),
calc_stats(milking$afc_months[milking$afc_months >= 18 & milking$afc_months <= 72],
"AFC (months) - CLEANED")
)
kable(milk_stats_clean,
caption = "Milking Data - CLEANED Statistics (Outliers Removed)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)| Variable | N_Total | N_Valid | N_Missing | Pct_Missing | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|---|---|
| Daily Milk Yield (kg) - CLEANED | 675,222 | 675,193 | 29 | 0.00 | 0.50 | 50 | 8.54 | 7.08 | 8.1 |
| Morning Milk (kg) - CLEANED | 493,797 | 493,637 | 160 | 0.03 | 0.01 | 28 | 4.00 | 3.59 | 2.9 |
| Evening Milk (kg) - CLEANED | 453,383 | 453,221 | 162 | 0.04 | 0.01 | 20 | 4.19 | 3.67 | 3.4 |
| DIM - CLEANED | 957,939 | 957,939 | 0 | 0.00 | 5.00 | 500 | 169.61 | 112.75 | 156.0 |
| AFC (months) - CLEANED | 1,024,502 | 365,794 | 658,708 | 64.30 | 18.00 | 72 | 35.23 | 9.72 | 33.0 |
# Create comparison table
comparison <- data.frame(
Trait = c("AFC (months)", "Calving Interval (days)", "Lactation Length (days)",
"Total Lactation Yield (kg)", "Days Open", "Daily Milk Yield (kg)", "DIM"),
Raw_N = c(
sum(!is.na(lactation$afc_months) & lactation$parity == 1),
sum(!is.na(lactation$calving_interval_days) & lactation$parity > 1),
sum(!is.na(lactation$lactation_length_days)),
sum(!is.na(lactation$total_lactation_yield)),
sum(!is.na(lactation$days_open)),
sum(!is.na(milking$daily_yield)),
sum(!is.na(milking$dim))
),
Raw_Min = c(
round(min(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
round(min(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
round(min(lactation$lactation_length_days, na.rm = TRUE), 1),
round(min(lactation$total_lactation_yield, na.rm = TRUE), 1),
round(min(lactation$days_open, na.rm = TRUE), 1),
round(min(milking$daily_yield, na.rm = TRUE), 2),
round(min(milking$dim, na.rm = TRUE), 0)
),
Raw_Max = c(
round(max(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
round(max(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
round(max(lactation$lactation_length_days, na.rm = TRUE), 1),
round(max(lactation$total_lactation_yield, na.rm = TRUE), 1),
round(max(lactation$days_open, na.rm = TRUE), 1),
round(max(milking$daily_yield, na.rm = TRUE), 2),
round(max(milking$dim, na.rm = TRUE), 0)
),
Raw_Mean = c(
round(mean(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 2),
round(mean(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 2),
round(mean(lactation$lactation_length_days, na.rm = TRUE), 2),
round(mean(lactation$total_lactation_yield, na.rm = TRUE), 2),
round(mean(lactation$days_open, na.rm = TRUE), 2),
round(mean(milking$daily_yield, na.rm = TRUE), 2),
round(mean(milking$dim, na.rm = TRUE), 2)
),
Clean_N = c(
sum(lactation$parity == 1 & lactation$afc_months >= 18 & lactation$afc_months <= 72, na.rm = TRUE),
sum(lactation$parity > 1 & lactation$calving_interval_days >= 300 & lactation$calving_interval_days <= 700, na.rm = TRUE),
sum(lactation$lactation_length_days >= 60 & lactation$lactation_length_days <= 500, na.rm = TRUE),
sum(lactation$total_lactation_yield > 0 & lactation$total_lactation_yield <= 15000, na.rm = TRUE),
sum(lactation$days_open >= 21 & lactation$days_open <= 400, na.rm = TRUE),
sum(milking$daily_yield >= 0.5 & milking$daily_yield <= 50, na.rm = TRUE),
sum(milking$dim >= 5 & milking$dim <= 500, na.rm = TRUE)
),
Clean_Mean = c(
round(mean(lactation$afc_months[lactation$parity == 1 & lactation$afc_months >= 18 & lactation$afc_months <= 72], na.rm = TRUE), 2),
round(mean(lactation$calving_interval_days[lactation$parity > 1 & lactation$calving_interval_days >= 300 & lactation$calving_interval_days <= 700], na.rm = TRUE), 2),
round(mean(lactation$lactation_length_days[lactation$lactation_length_days >= 60 & lactation$lactation_length_days <= 500], na.rm = TRUE), 2),
round(mean(lactation$total_lactation_yield[lactation$total_lactation_yield > 0 & lactation$total_lactation_yield <= 15000], na.rm = TRUE), 2),
round(mean(lactation$days_open[lactation$days_open >= 21 & lactation$days_open <= 400], na.rm = TRUE), 2),
round(mean(milking$daily_yield[milking$daily_yield >= 0.5 & milking$daily_yield <= 50], na.rm = TRUE), 2),
round(mean(milking$dim[milking$dim >= 5 & milking$dim <= 500], na.rm = TRUE), 2)
),
Filter_Applied = c(
"18-72 months",
"300-700 days",
"60-500 days",
">0 & ≤15,000 kg",
"21-400 days",
"0.5-50 kg",
"5-500 days"
)
)
comparison <- comparison %>%
mutate(
Records_Lost = Raw_N - Clean_N,
Pct_Lost = round(100 * Records_Lost / Raw_N, 2)
)
kable(comparison,
caption = "Comparison: Raw vs Cleaned Statistics",
col.names = c("Trait", "Raw N", "Raw Min", "Raw Max", "Raw Mean",
"Clean N", "Clean Mean", "Filter", "Records Lost", "% Lost"),
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) %>%
column_spec(3, color = "red") %>%
column_spec(4, color = "red") %>%
column_spec(10, color = "blue", bold = TRUE)| Trait | Raw N | Raw Min | Raw Max | Raw Mean | Clean N | Clean Mean | Filter | Records Lost | % Lost |
|---|---|---|---|---|---|---|---|---|---|
| AFC (months) | NA | -394 | 24,295.0 | 50.96 | 28,229 | 39.36 | 18-72 months | NA | NA |
| Calving Interval (days) | NA | 0 | 16,530.0 | 521.70 | 41,248 | 433.45 | 300-700 days | NA | NA |
| Lactation Length (days) | 50,707 | 0 | 669,342.0 | 535.21 | 33,573 | 386.08 | 60-500 days | 17,134 | 33.79 |
| Total Lactation Yield (kg) | 88,347 | 0 | 23,186.4 | 112.02 | 37,184 | 261.79 | >0 & ≤15,000 kg | 51,163 | 57.91 |
| Days Open | 50,710 | -280 | 669,062.0 | 255.21 | 40,802 | 150.28 | 21-400 days | 9,908 | 19.54 |
| Daily Milk Yield (kg) | 1,048,546 | 0 | 50.0 | 5.51 | 675,193 | 8.54 | 0.5-50 kg | 373,353 | 35.61 |
| DIM | 1,048,575 | -729,939 | 7,752.0 | 220.35 | 957,939 | 169.61 | 5-500 days | 90,636 | 8.64 |
# Create summary of issues
issues_summary <- data.frame(
Issue_Category = c(
"Negative AFC values",
"AFC > 100 months (8+ years)",
"AFC < 18 months (too young)",
"Negative calving interval",
"Calving interval > 1000 days",
"Negative days in milk",
"DIM > 1000 days",
"Daily yield = 0",
"Daily yield > 50 kg",
"Negative daily yield",
"Total lactation yield = 0",
"Total lactation yield > 20,000 kg",
"Negative days open",
"Sex field contains dates",
"Animal type = 'FEMALE' (misplaced)"
),
Dataset = c(rep("Lactation", 5), rep("Milking", 6), rep("Lactation", 4)),
Count = c(
sum(lactation$afc_months < 0, na.rm = TRUE),
sum(lactation$afc_months > 100, na.rm = TRUE),
sum(lactation$afc_months > 0 & lactation$afc_months < 18, na.rm = TRUE),
sum(lactation$calving_interval_days < 0, na.rm = TRUE),
sum(lactation$calving_interval_days > 1000, na.rm = TRUE),
sum(milking$dim < 0, na.rm = TRUE),
sum(milking$dim > 1000, na.rm = TRUE),
sum(milking$daily_yield == 0, na.rm = TRUE),
sum(milking$daily_yield > 50, na.rm = TRUE),
sum(milking$daily_yield < 0, na.rm = TRUE),
sum(lactation$total_lactation_yield == 0, na.rm = TRUE),
sum(lactation$total_lactation_yield > 20000, na.rm = TRUE),
sum(lactation$days_open < 0, na.rm = TRUE),
sum(grepl("/", lactation$sex), na.rm = TRUE),
sum(lactation$animal_type == "FEMALE", na.rm = TRUE)
)
)
issues_summary <- issues_summary %>%
mutate(
Percentage = round(100 * Count / ifelse(Dataset == "Lactation", nrow(lactation), nrow(milking)), 3)
) %>%
arrange(desc(Count))
kable(issues_summary,
caption = "Summary of Data Quality Issues",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
row_spec(which(issues_summary$Count > 1000), bold = TRUE, color = "red")| Issue_Category | Dataset | Count | Percentage |
|---|---|---|---|
| Daily yield = 0 | Milking | 333,069 | 31.764 |
| Total lactation yield = 0 | Milking | 51,154 | 4.878 |
| DIM > 1000 days | Milking | 22,267 | 2.124 |
| Calving interval > 1000 days | Lactation | 3,091 | 3.499 |
| AFC > 100 months (8+ years) | Lactation | 2,280 | 2.581 |
| Negative days open | Lactation | 1,531 | 1.733 |
| AFC < 18 months (too young) | Lactation | 723 | 0.818 |
| Negative AFC values | Lactation | 480 | 0.543 |
| Negative days in milk | Milking | 10 | 0.001 |
| Sex field contains dates | Lactation | 8 | 0.009 |
| Animal type = ‘FEMALE’ (misplaced) | Lactation | 8 | 0.009 |
| Total lactation yield > 20,000 kg | Lactation | 1 | 0.001 |
| Negative calving interval | Lactation | 0 | 0.000 |
| Daily yield > 50 kg | Milking | 0 | 0.000 |
| Negative daily yield | Milking | 0 | 0.000 |
# AFC distribution showing all values including outliers
afc_data <- lactation %>%
filter(parity == 1, !is.na(afc_months)) %>%
mutate(
afc_category = case_when(
afc_months < 0 ~ "Negative (ERROR)",
afc_months >= 0 & afc_months < 18 ~ "Too Young (<18 mo)",
afc_months >= 18 & afc_months <= 72 ~ "Normal (18-72 mo)",
afc_months > 72 & afc_months <= 100 ~ "Late (72-100 mo)",
afc_months > 100 ~ "Extreme (>100 mo)"
)
)
# Summary table with Min, Max, Mean, SD
afc_summary <- afc_data %>%
group_by(afc_category) %>%
summarise(
N = n(),
Min = round(min(afc_months, na.rm = TRUE), 1),
Max = round(max(afc_months, na.rm = TRUE), 1),
Mean = round(mean(afc_months, na.rm = TRUE), 2),
SD = round(sd(afc_months, na.rm = TRUE), 2),
Median = round(median(afc_months, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Percentage = round(100 * N / sum(N), 2))
kable(afc_summary,
caption = "AFC Value Categories with Min/Max/Mean",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| afc_category | N | Min | Max | Mean | SD | Median | Percentage |
|---|---|---|---|---|---|---|---|
| Extreme (>100 mo) | 1,481 | 101 | 24,295 | 249.59 | 1,706.99 | 117 | 4.49 |
| Late (72-100 mo) | 2,495 | 73 | 100 | 84.47 | 8.11 | 84 | 7.56 |
| Negative (ERROR) | 127 | -394 | -1 | -102.76 | 106.54 | -56 | 0.38 |
| Normal (18-72 mo) | 28,229 | 18 | 72 | 39.36 | 11.89 | 36 | 85.50 |
| Too Young (<18 mo) | 683 | 0 | 17 | 5.74 | 6.30 | 3 | 2.07 |
# Plot 1: Full distribution with log scale
p1 <- ggplot(afc_data, aes(x = afc_months)) +
geom_histogram(aes(fill = afc_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "red",
"Too Young (<18 mo)" = "orange",
"Normal (18-72 mo)" = "darkgreen",
"Late (72-100 mo)" = "yellow",
"Extreme (>100 mo)" = "purple"
)) +
geom_vline(xintercept = c(0, 18, 72), linetype = "dashed", color = "black", linewidth = 0.8) +
annotate("text", x = 0, y = Inf, label = "0", vjust = 2, hjust = 0.5, size = 3) +
annotate("text", x = 18, y = Inf, label = "18", vjust = 2, hjust = 0.5, size = 3) +
annotate("text", x = 72, y = Inf, label = "72", vjust = 2, hjust = 0.5, size = 3) +
labs(
title = "Age at First Calving - Full Distribution",
subtitle = paste0("Min = ", round(min(afc_data$afc_months), 1), " | Max = ",
format(round(max(afc_data$afc_months), 1), big.mark = ","),
" | Mean = ", round(mean(afc_data$afc_months), 1), " months"),
x = "AFC (months)",
y = "Count",
fill = "Category"
) +
theme(legend.position = "bottom") +
scale_y_log10(labels = comma)
# Plot 2: Boxplot showing outliers
p2 <- ggplot(afc_data, aes(y = afc_months)) +
geom_boxplot(fill = "steelblue", outlier.color = "red", outlier.size = 1) +
labs(
title = "AFC Boxplot - Outliers in Red",
y = "AFC (months)"
) +
coord_flip()
grid.arrange(p1, p2, ncol = 1, heights = c(2, 1))# Show records with extreme AFC values
extreme_afc <- lactation %>%
filter(parity == 1, !is.na(afc_months)) %>%
filter(afc_months < 0 | afc_months > 100) %>%
select(animal_id, farm_id, project_name, main_breed_name,
afc_months, afc_days, birthdate, calving_date) %>%
arrange(afc_months) %>%
head(50)
kable(extreme_afc,
caption = "Sample of Extreme AFC Values (Negative or > 100 months)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = TRUE) %>%
scroll_box(height = "400px")| animal_id | farm_id | project_name | main_breed_name | afc_months | afc_days | birthdate | calving_date |
|---|---|---|---|---|---|---|---|
| 176081 | 514151 | NA | Holstein Friesian | -394 | -11956 | 04/09/2018 | 7/15/1985 |
| 175712 | 514149 | NA | Holstein Friesian | -358 | -10868 | 04/09/2018 | 07/07/1988 |
| 175675 | 515076 | NA | Local Zebu | -344 | -10445 | 9/20/2017 | 2/14/1989 |
| 187786 | 510092 | NA | Sahiwal | -309 | -9378 | 7/16/2016 | 11/12/1990 |
| 178409 | 515077 | NA | Unknown | -306 | -9297 | 09/10/2019 | 3/28/1994 |
| 195491 | 598285 | NA | Ayrshire | -296 | -8998 | 02/01/2019 | 6/14/1994 |
| 178911 | 598285 | NA | Holstein Friesian | -292 | -8848 | 02/01/2020 | 11/11/1995 |
| 170328 | 595412 | NA | Ayrshire | -291 | -8822 | 05/08/2018 | 3/13/1994 |
| 178404 | 598285 | NA | Holstein Friesian | -288 | -8748 | 02/01/2020 | 2/19/1996 |
| 170285 | 515160 | NA | Ayrshire | -273 | -8282 | 7/27/2016 | 11/23/1993 |
| 171698 | 616608 | NA | Holstein Friesian | -273 | -8278 | 6/15/2016 | 10/16/1993 |
| 176871 | 597275 | NA | Local Zebu | -272 | -8256 | 10/21/2010 | 3/14/1988 |
| 170515 | 510092 | NA | Sahiwal | -261 | -7917 | 01/07/2018 | 05/05/1996 |
| 746443 | 601235 | KLBA | Ayrshire | -258 | -7831 | 05/04/2005 | 11/25/1983 |
| 335377 | 347826 | STANLEY AND SON LIMITED - KENYA | Boran | -257 | -7801 | 4/28/2011 | 12/18/1989 |
| 339751 | 347826 | STANLEY AND SON LIMITED - KENYA | Boran | -254 | -7698 | 1/15/2011 | 12/18/1989 |
| 170557 | 597358 | NA | Holstein Friesian | -241 | -7320 | 04/01/2020 | 3/17/2000 |
| 170506 | 514950 | NA | Local Zebu | -240 | -7272 | 05/04/2017 | 06/06/1997 |
| 170315 | 598316 | NA | Ayrshire | -238 | -7223 | 5/18/2018 | 08/08/1998 |
| 171861 | 514852 | NA | NA | -237 | -7202 | 03/07/2017 | 6/18/1997 |
| 171864 | 514855 | NA | NA | -227 | -6892 | 03/08/2017 | 4/25/1998 |
| 171859 | 514850 | NA | NA | -226 | -6858 | 03/07/2017 | 5/28/1998 |
| 174152 | 403781 | AVCD | Holstein Friesian | -224 | -6790 | 9/21/2016 | 2/18/1998 |
| 171863 | 514854 | NA | NA | -223 | -6771 | 03/08/2017 | 8/24/1998 |
| 171328 | 515320 | NA | Holstein Friesian | -216 | -6545 | 4/20/2019 | 5/19/2001 |
| 203420 | 514952 | NA | Local Zebu | -212 | -6432 | 05/04/2017 | 9/24/1999 |
| 171887 | 515079 | NA | Local Zebu | -208 | -6302 | 09/11/2019 | 06/10/2002 |
| 179369 | 598285 | NA | Holstein Friesian | -206 | -6235 | 02/01/2021 | 01/07/2004 |
| 792916 | 601289 | KALRO | Ayrshire | -206 | -6261 | 12/01/2022 | 10/10/2005 |
| 171888 | 403011 | AVCD | Unknown | -205 | -6223 | 5/18/2019 | 05/04/2002 |
| 170513 | 515024 | NA | Unknown | -204 | -6175 | 5/29/2019 | 07/02/2002 |
| 813818 | 601252 | KLBA | Holstein Friesian | -204 | -6193 | 4/18/2012 | 05/05/1995 |
| 175682 | 510092 | NA | Unknown | -200 | -6051 | 7/16/2016 | 12/22/1999 |
| 171299 | 514952 | NA | Local Zebu | -196 | -5944 | 05/04/2017 | 1/24/2001 |
| 743491 | 601217 | KLBA | Jersey | -185 | -5619 | 09/02/1997 | 4/15/1982 |
| 758995 | 601397 | KLBA | Holstein Friesian | -185 | -5621 | 11/15/2017 | 6/26/2002 |
| 754668 | 601289 | KALRO | Ayrshire | -183 | -5548 | 8/13/2019 | 06/04/2004 |
| 786263 | 601210 | KLBA | Holstein Friesian | -179 | -5417 | 04/04/2007 | 06/04/1992 |
| 801205 | 601289 | KALRO | Holstein Friesian | -169 | -5127 | 11/28/2022 | 11/14/2008 |
| 170559 | 616901 | NA | Ayrshire | -166 | -5027 | 10/17/2016 | 01/12/2003 |
| 177867 | 616942 | NA | Ayrshire | -166 | -5020 | 10/25/2016 | 1/27/2003 |
| 186024 | 598297 | NA | Holstein Friesian | -156 | -4738 | 05/07/2020 | 5/18/2007 |
| 813828 | 601252 | KLBA | Holstein Friesian | -148 | -4477 | 02/09/2012 | 11/07/1999 |
| 186411 | 515711 | NA | Ayrshire | -147 | -4468 | 4/20/2020 | 1/26/2008 |
| 286489 | 347826 | STANLEY AND SON LIMITED - KENYA | Holstein Friesian | -140 | -4252 | 5/22/1978 | 9/30/1966 |
| 748626 | 601246 | KLBA | Ayrshire | -135 | -4082 | 03/01/2006 | 12/27/1994 |
| 187716 | 598291 | NA | Ayrshire | -126 | -3817 | 10/07/2017 | 4/26/2007 |
| 187128 | 513219 | NA | Unknown | -123 | -3732 | 08/01/2019 | 5/13/2009 |
| 187718 | 598285 | NA | Holstein Friesian | -114 | -3437 | 02/01/2020 | 09/04/2010 |
| 195487 | 598285 | NA | Ayrshire | -110 | -3315 | 5/27/2018 | 4/29/2009 |
# AFC by breed - with proper filtering
afc_by_breed <- lactation %>%
filter(parity == 1, !is.na(afc_months), !is.na(main_breed_name),
afc_months >= 18, afc_months <= 72) %>% # Clean data only
group_by(main_breed_name) %>%
summarise(
N = n(),
Min = round(min(afc_months, na.rm = TRUE), 1),
Max = round(max(afc_months, na.rm = TRUE), 1),
Mean = round(mean(afc_months, na.rm = TRUE), 2),
SD = round(sd(afc_months, na.rm = TRUE), 2),
Median = round(median(afc_months, na.rm = TRUE), 1),
.groups = "drop"
) %>%
filter(N >= 30) %>%
arrange(Mean)
kable(afc_by_breed,
caption = "AFC by Breed (Cleaned: 18-72 months, N≥30)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| main_breed_name | N | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|
| Scandinavian Red | 50 | 18 | 38 | 26.84 | 4.51 | 25.5 |
| Jersey crosses | 70 | 18 | 52 | 30.11 | 6.17 | 29.5 |
| Gir | 199 | 21 | 72 | 31.07 | 7.48 | 30.0 |
| Fleckvieh | 55 | 21 | 62 | 31.49 | 8.06 | 30.0 |
| Jersey | 1,144 | 18 | 72 | 31.50 | 10.19 | 28.0 |
| Holstein-Friesian crosses | 1,255 | 18 | 72 | 35.78 | 7.02 | 35.0 |
| Ayrshire crosses | 88 | 18 | 72 | 36.75 | 10.89 | 34.0 |
| Holstein Friesian | 15,944 | 18 | 72 | 37.21 | 11.06 | 34.0 |
| Sahiwal crosses | 237 | 22 | 60 | 38.36 | 8.24 | 37.0 |
| Boran | 101 | 19 | 69 | 38.67 | 9.09 | 37.0 |
| Unknown | 43 | 24 | 71 | 40.09 | 14.00 | 36.0 |
| Guernsey | 574 | 19 | 72 | 41.44 | 12.62 | 38.0 |
| Ayrshire | 5,188 | 18 | 72 | 43.49 | 11.60 | 41.0 |
| Brown Swiss | 147 | 19 | 72 | 44.18 | 13.94 | 42.0 |
| Sahiwal | 2,875 | 18 | 72 | 49.27 | 11.44 | 49.0 |
# Plot
ggplot(afc_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
geom_bar(stat = "identity", fill = "steelblue", width = 0.7) +
geom_errorbar(aes(ymin = Mean - SD, ymax = Mean + SD), width = 0.2, color = "black") +
geom_text(aes(label = paste0(round(Mean, 1), "\n(n=", N, ")")),
hjust = -0.1, size = 3) +
coord_flip() +
labs(
title = "Mean Age at First Calving by Breed",
subtitle = "Error bars = ±1 SD | Filtered: 18-72 months",
x = "Breed",
y = "Mean AFC (months)"
) +
scale_y_continuous(limits = c(0, max(afc_by_breed$Mean + afc_by_breed$SD, na.rm = TRUE) * 1.3))# Daily yield categories
dmy_data <- milking %>%
filter(!is.na(daily_yield)) %>%
mutate(
yield_category = case_when(
daily_yield < 0 ~ "Negative (ERROR)",
daily_yield == 0 ~ "Zero Yield",
daily_yield > 0 & daily_yield < 0.5 ~ "Very Low (<0.5 kg)",
daily_yield >= 0.5 & daily_yield <= 30 ~ "Normal (0.5-30 kg)",
daily_yield > 30 & daily_yield <= 50 ~ "High (30-50 kg)",
daily_yield > 50 ~ "Extreme (>50 kg)"
)
)
# Summary with Min, Max, Mean
dmy_summary <- dmy_data %>%
group_by(yield_category) %>%
summarise(
N = n(),
Min = round(min(daily_yield, na.rm = TRUE), 2),
Max = round(max(daily_yield, na.rm = TRUE), 2),
Mean = round(mean(daily_yield, na.rm = TRUE), 2),
SD = round(sd(daily_yield, na.rm = TRUE), 2),
Median = round(median(daily_yield, na.rm = TRUE), 2),
.groups = "drop"
) %>%
mutate(Percentage = round(100 * N / sum(N), 2))
kable(dmy_summary,
caption = "Daily Milk Yield Categories with Min/Max/Mean",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| yield_category | N | Min | Max | Mean | SD | Median | Percentage |
|---|---|---|---|---|---|---|---|
| High (30-50 kg) | 1,936 | 30.10 | 50.00 | 32.68 | 2.83 | 31.9 | 0.18 |
| Normal (0.5-30 kg) | 673,257 | 0.50 | 30.00 | 8.47 | 6.96 | 8.1 | 64.21 |
| Very Low (<0.5 kg) | 40,284 | 0.01 | 0.49 | 0.28 | 0.12 | 0.3 | 3.84 |
| Zero Yield | 333,069 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 31.76 |
# Plot 1: Histogram
p1 <- ggplot(dmy_data, aes(x = daily_yield)) +
geom_histogram(aes(fill = yield_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "red",
"Zero Yield" = "gray40",
"Very Low (<0.5 kg)" = "orange",
"Normal (0.5-30 kg)" = "darkgreen",
"High (30-50 kg)" = "blue",
"Extreme (>50 kg)" = "purple"
)) +
labs(
title = "Daily Milk Yield - Full Distribution",
subtitle = paste0("Min = ", round(min(dmy_data$daily_yield), 2), " | Max = ",
round(max(dmy_data$daily_yield), 2), " | Mean = ",
round(mean(dmy_data$daily_yield), 2), " kg"),
x = "Daily Yield (kg)",
y = "Count (log scale)",
fill = "Category"
) +
scale_y_log10(labels = comma) +
theme(legend.position = "bottom")
# Plot 2: Focus on normal range with cleaned mean
clean_dmy <- dmy_data %>% filter(daily_yield >= 0.5, daily_yield <= 50)
p2 <- ggplot(clean_dmy, aes(x = daily_yield)) +
geom_histogram(bins = 50, fill = "steelblue", color = "black", linewidth = 0.1) +
geom_vline(xintercept = mean(clean_dmy$daily_yield),
linetype = "dashed", color = "red", linewidth = 1) +
annotate("text", x = mean(clean_dmy$daily_yield), y = Inf,
label = paste0("Mean = ", round(mean(clean_dmy$daily_yield), 2), " kg"),
vjust = 2, hjust = -0.1, color = "red", size = 4) +
labs(
title = "Daily Milk Yield - Cleaned (0.5-50 kg)",
subtitle = paste0("N = ", format(nrow(clean_dmy), big.mark = ","),
" | Mean = ", round(mean(clean_dmy$daily_yield), 2),
" | SD = ", round(sd(clean_dmy$daily_yield), 2), " kg"),
x = "Daily Yield (kg)",
y = "Count"
) +
scale_y_continuous(labels = comma)
# Plot 3: Boxplot
p3 <- ggplot(dmy_data, aes(y = daily_yield)) +
geom_boxplot(fill = "lightblue", outlier.color = "red", outlier.size = 0.5) +
labs(title = "Daily Yield Boxplot", y = "Daily Yield (kg)") +
coord_flip()
grid.arrange(p1, p2, p3, ncol = 1, heights = c(2, 2, 1))# Daily yield by breed - cleaned
dmy_by_breed <- milking %>%
filter(!is.na(daily_yield), !is.na(main_breed_name),
daily_yield >= 0.5, daily_yield <= 50) %>%
group_by(main_breed_name) %>%
summarise(
N_Records = n(),
N_Animals = n_distinct(animal_id),
Min = round(min(daily_yield, na.rm = TRUE), 2),
Max = round(max(daily_yield, na.rm = TRUE), 2),
Mean = round(mean(daily_yield, na.rm = TRUE), 2),
SD = round(sd(daily_yield, na.rm = TRUE), 2),
Median = round(median(daily_yield, na.rm = TRUE), 2),
.groups = "drop"
) %>%
filter(N_Records >= 500) %>%
arrange(desc(Mean))
kable(dmy_by_breed,
caption = "Daily Milk Yield by Breed (Cleaned: 0.5-50 kg, N≥500)",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| main_breed_name | N_Records | N_Animals | Min | Max | Mean | SD | Median |
|---|---|---|---|---|---|---|---|
| Jersey crosses | 25,699 | 55 | 0.5 | 31.4 | 14.61 | 4.97 | 14.20 |
| Gir | 82,442 | 184 | 0.5 | 39.5 | 14.14 | 5.28 | 13.60 |
| Ayrshire crosses | 16,660 | 41 | 0.5 | 34.0 | 13.90 | 4.83 | 13.70 |
| Sahiwal crosses | 24,438 | 41 | 1.0 | 37.1 | 13.22 | 5.23 | 12.65 |
| Holstein-Friesian crosses | 69,743 | 394 | 0.8 | 37.5 | 13.19 | 5.40 | 12.20 |
| Scandinavian Red | 23,510 | 49 | 1.0 | 34.0 | 12.84 | 4.50 | 12.30 |
| Brown Swiss | 21,789 | 126 | 0.5 | 36.0 | 12.54 | 6.59 | 12.30 |
| Girolando | 12,472 | 22 | 1.4 | 31.1 | 12.30 | 4.53 | 12.30 |
| Fleckvieh | 16,331 | 46 | 0.5 | 28.9 | 12.05 | 4.54 | 11.60 |
| Boran | 3,280 | 39 | 0.5 | 25.2 | 10.44 | 4.39 | 9.60 |
| Holstein Friesian | 250,367 | 9,107 | 0.5 | 50.0 | 5.56 | 6.38 | 2.00 |
| Jersey | 14,583 | 537 | 0.5 | 30.0 | 5.12 | 6.01 | 1.42 |
| Sahiwal | 26,437 | 1,422 | 0.5 | 30.0 | 3.26 | 3.69 | 2.00 |
| Ayrshire | 74,726 | 3,557 | 0.5 | 36.3 | 2.83 | 3.94 | 1.37 |
| Guernsey | 8,131 | 419 | 0.5 | 27.6 | 1.74 | 2.45 | 0.95 |
# Plot
ggplot(dmy_by_breed, aes(x = reorder(main_breed_name, Mean), y = Mean)) +
geom_bar(stat = "identity", fill = "darkgreen", width = 0.7) +
geom_errorbar(aes(ymin = Mean - SD/sqrt(N_Records)*1.96,
ymax = Mean + SD/sqrt(N_Records)*1.96),
width = 0.2, color = "black") +
geom_text(aes(label = paste0(round(Mean, 1), " kg")),
hjust = -0.1, size = 3) +
coord_flip() +
labs(
title = "Mean Daily Milk Yield by Breed",
subtitle = "Error bars = 95% CI | Filtered: 0.5-50 kg",
x = "Breed",
y = "Mean Daily Yield (kg)"
) +
scale_y_continuous(limits = c(0, max(dmy_by_breed$Mean, na.rm = TRUE) * 1.3))# DIM categories
dim_data <- milking %>%
filter(!is.na(dim)) %>%
mutate(
dim_category = case_when(
dim < 0 ~ "Negative (ERROR)",
dim == 0 ~ "Zero DIM",
dim >= 1 & dim <= 5 ~ "Very Early (1-5 d)",
dim > 5 & dim <= 305 ~ "Normal (5-305 d)",
dim > 305 & dim <= 500 ~ "Extended (305-500 d)",
dim > 500 & dim <= 1000 ~ "Very Extended (500-1000 d)",
dim > 1000 ~ "Extreme (>1000 d)"
)
)
# Summary
dim_summary <- dim_data %>%
group_by(dim_category) %>%
summarise(
N = n(),
Min = min(dim, na.rm = TRUE),
Max = max(dim, na.rm = TRUE),
Mean = round(mean(dim, na.rm = TRUE), 1),
SD = round(sd(dim, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Percentage = round(100 * N / sum(N), 2))
kable(dim_summary,
caption = "Days in Milk Categories with Min/Max/Mean",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| dim_category | N | Min | Max | Mean | SD | Percentage |
|---|---|---|---|---|---|---|
| Extended (305-500 d) | 117,863 | 306 | 500 | 376.9 | 55.0 | 11.24 |
| Extreme (>1000 d) | 22,267 | 1,001 | 7,752 | 1,596.5 | 718.3 | 2.12 |
| Negative (ERROR) | 10 | -729,939 | -298,437 | -633,728.4 | 119,639.4 | 0.00 |
| Normal (5-305 d) | 828,097 | 6 | 305 | 142.5 | 83.9 | 78.97 |
| Very Early (1-5 d) | 19,458 | 1 | 5 | 4.3 | 1.1 | 1.86 |
| Very Extended (500-1000 d) | 57,618 | 501 | 1,000 | 682.7 | 138.3 | 5.49 |
| Zero DIM | 3,262 | 0 | 0 | 0.0 | 0.0 | 0.31 |
# Plot
p1 <- ggplot(dim_data, aes(x = dim)) +
geom_histogram(aes(fill = dim_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "red",
"Zero DIM" = "gray40",
"Very Early (1-5 d)" = "orange",
"Normal (5-305 d)" = "darkgreen",
"Extended (305-500 d)" = "yellow",
"Very Extended (500-1000 d)" = "purple",
"Extreme (>1000 d)" = "black"
)) +
labs(
title = "Days in Milk - Full Distribution",
subtitle = paste0("Min = ", format(min(dim_data$dim), big.mark = ","),
" | Max = ", format(max(dim_data$dim), big.mark = ","),
" (Note extreme negative!)"),
x = "DIM (days)",
y = "Count (log scale)",
fill = "Category"
) +
scale_y_log10(labels = comma) +
theme(legend.position = "bottom")
print(p1)# TLY data
tly_data <- lactation %>%
filter(!is.na(total_lactation_yield)) %>%
mutate(
tly_category = case_when(
total_lactation_yield < 0 ~ "Negative (ERROR)",
total_lactation_yield == 0 ~ "Zero Yield",
total_lactation_yield > 0 & total_lactation_yield < 100 ~ "Very Low (<100 kg)",
total_lactation_yield >= 100 & total_lactation_yield <= 5000 ~ "Normal (100-5000 kg)",
total_lactation_yield > 5000 & total_lactation_yield <= 10000 ~ "High (5000-10000 kg)",
total_lactation_yield > 10000 & total_lactation_yield <= 15000 ~ "Very High (10-15k kg)",
total_lactation_yield > 15000 ~ "Extreme (>15000 kg)"
)
)
# Summary
tly_summary <- tly_data %>%
group_by(tly_category) %>%
summarise(
N = n(),
Min = round(min(total_lactation_yield, na.rm = TRUE), 1),
Max = round(max(total_lactation_yield, na.rm = TRUE), 1),
Mean = round(mean(total_lactation_yield, na.rm = TRUE), 1),
SD = round(sd(total_lactation_yield, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Percentage = round(100 * N / sum(N), 2))
kable(tly_summary,
caption = "Total Lactation Yield Categories with Min/Max/Mean",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| tly_category | N | Min | Max | Mean | SD | Percentage |
|---|---|---|---|---|---|---|
| Extreme (>15000 kg) | 9 | 15,205.2 | 23,186.4 | 18,090.9 | 2,309.1 | 0.01 |
| High (5000-10000 kg) | 361 | 5,004.2 | 9,979.2 | 6,615.5 | 1,290.3 | 0.41 |
| Normal (100-5000 kg) | 9,199 | 100.0 | 4,999.0 | 694.1 | 953.7 | 10.41 |
| Very High (10-15k kg) | 39 | 10,000.7 | 14,973.5 | 11,598.2 | 1,589.4 | 0.04 |
| Very Low (<100 kg) | 27,585 | 0.0 | 99.9 | 18.5 | 19.0 | 31.22 |
| Zero Yield | 51,154 | 0.0 | 0.0 | 0.0 | 0.0 | 57.90 |
# Major issue: Many zero yields!
zero_tly_pct <- round(100 * sum(tly_data$total_lactation_yield == 0) / nrow(tly_data), 1)
# Plot
p1 <- ggplot(tly_data, aes(x = total_lactation_yield)) +
geom_histogram(aes(fill = tly_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "red",
"Zero Yield" = "gray40",
"Very Low (<100 kg)" = "orange",
"Normal (100-5000 kg)" = "darkgreen",
"High (5000-10000 kg)" = "blue",
"Very High (10-15k kg)" = "purple",
"Extreme (>15000 kg)" = "black"
)) +
labs(
title = "Total Lactation Yield - Full Distribution",
subtitle = paste0("WARNING: ", zero_tly_pct, "% of records have ZERO yield!"),
x = "Total Lactation Yield (kg)",
y = "Count (log scale)",
fill = "Category"
) +
scale_y_log10(labels = comma) +
theme(legend.position = "bottom")
# Non-zero distribution with mean
tly_nonzero <- tly_data %>% filter(total_lactation_yield > 0, total_lactation_yield <= 15000)
p2 <- ggplot(tly_nonzero, aes(x = total_lactation_yield)) +
geom_histogram(bins = 50, fill = "steelblue", color = "black") +
geom_vline(xintercept = mean(tly_nonzero$total_lactation_yield),
linetype = "dashed", color = "red", linewidth = 1) +
annotate("text", x = mean(tly_nonzero$total_lactation_yield), y = Inf,
label = paste0("Mean = ", round(mean(tly_nonzero$total_lactation_yield), 1), " kg"),
vjust = 2, hjust = -0.1, color = "red") +
labs(
title = "Total Lactation Yield - Non-Zero Values Only",
subtitle = paste0("N = ", format(nrow(tly_nonzero), big.mark = ","),
" | Mean = ", round(mean(tly_nonzero$total_lactation_yield), 1),
" | SD = ", round(sd(tly_nonzero$total_lactation_yield), 1), " kg"),
x = "Total Lactation Yield (kg)",
y = "Count"
)
grid.arrange(p1, p2, ncol = 1)# CI data (parity > 1)
ci_data <- lactation %>%
filter(parity > 1, !is.na(calving_interval_days)) %>%
mutate(
ci_category = case_when(
calving_interval_days < 0 ~ "Negative (ERROR)",
calving_interval_days >= 0 & calving_interval_days < 300 ~ "Too Short (<300 d)",
calving_interval_days >= 300 & calving_interval_days <= 450 ~ "Optimal (300-450 d)",
calving_interval_days > 450 & calving_interval_days <= 600 ~ "Long (450-600 d)",
calving_interval_days > 600 & calving_interval_days <= 730 ~ "Very Long (600-730 d)",
calving_interval_days > 730 ~ "Extreme (>730 d / 2 yrs)"
)
)
# Summary
ci_summary <- ci_data %>%
group_by(ci_category) %>%
summarise(
N = n(),
Min = round(min(calving_interval_days, na.rm = TRUE), 1),
Max = round(max(calving_interval_days, na.rm = TRUE), 1),
Mean = round(mean(calving_interval_days, na.rm = TRUE), 1),
SD = round(sd(calving_interval_days, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Percentage = round(100 * N / sum(N), 2))
kable(ci_summary,
caption = "Calving Interval Categories with Min/Max/Mean",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| ci_category | N | Min | Max | Mean | SD | Percentage |
|---|---|---|---|---|---|---|
| Extreme (>730 d / 2 yrs) | 6,784 | 731 | 16,530 | 1,135.2 | 595.6 | 13.44 |
| Long (450-600 d) | 11,011 | 451 | 600 | 512.1 | 41.8 | 21.82 |
| Optimal (300-450 d) | 27,179 | 300 | 450 | 377.6 | 36.5 | 53.86 |
| Too Short (<300 d) | 1,761 | 0 | 299 | 151.7 | 108.3 | 3.49 |
| Very Long (600-730 d) | 3,728 | 601 | 730 | 659.3 | 37.6 | 7.39 |
# Plots
p1 <- ggplot(ci_data, aes(x = calving_interval_days)) +
geom_histogram(aes(fill = ci_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "red",
"Too Short (<300 d)" = "orange",
"Optimal (300-450 d)" = "darkgreen",
"Long (450-600 d)" = "yellow",
"Very Long (600-730 d)" = "purple",
"Extreme (>730 d / 2 yrs)" = "black"
)) +
labs(
title = "Calving Interval - Full Distribution",
subtitle = paste0("Min = ", round(min(ci_data$calving_interval_days), 1),
" | Max = ", format(round(max(ci_data$calving_interval_days), 1), big.mark = ","),
" | Mean = ", round(mean(ci_data$calving_interval_days), 1), " days"),
x = "Calving Interval (days)",
y = "Count (log scale)",
fill = "Category"
) +
scale_y_log10(labels = comma) +
theme(legend.position = "bottom")
print(p1)# Days open data
do_data <- lactation %>%
filter(!is.na(days_open)) %>%
mutate(
do_category = case_when(
days_open < 0 ~ "Negative (ERROR)",
days_open >= 0 & days_open < 21 ~ "Too Short (<21 d)",
days_open >= 21 & days_open <= 120 ~ "Optimal (21-120 d)",
days_open > 120 & days_open <= 200 ~ "Acceptable (120-200 d)",
days_open > 200 & days_open <= 365 ~ "Long (200-365 d)",
days_open > 365 ~ "Extreme (>365 d)"
)
)
# Summary
do_summary <- do_data %>%
group_by(do_category) %>%
summarise(
N = n(),
Min = round(min(days_open, na.rm = TRUE), 1),
Max = round(max(days_open, na.rm = TRUE), 1),
Mean = round(mean(days_open, na.rm = TRUE), 1),
SD = round(sd(days_open, na.rm = TRUE), 1),
.groups = "drop"
) %>%
mutate(Percentage = round(100 * N / sum(N), 2))
kable(do_summary,
caption = "Days Open Categories with Min/Max/Mean",
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE)| do_category | N | Min | Max | Mean | SD | Percentage |
|---|---|---|---|---|---|---|
| Acceptable (120-200 d) | 11,082 | 121 | 200 | 155.7 | 22.8 | 21.85 |
| Extreme (>365 d) | 9,053 | 366 | 669,062 | 821.1 | 7,046.7 | 17.85 |
| Long (200-365 d) | 9,364 | 201 | 365 | 266.3 | 45.8 | 18.47 |
| Negative (ERROR) | 1,531 | -280 | -1 | -159.2 | 97.2 | 3.02 |
| Optimal (21-120 d) | 19,353 | 21 | 120 | 79.0 | 24.0 | 38.16 |
| Too Short (<21 d) | 327 | 0 | 20 | 11.6 | 6.0 | 0.64 |
# Plot
p1 <- ggplot(do_data, aes(x = days_open)) +
geom_histogram(aes(fill = do_category), bins = 100, color = "black", linewidth = 0.1) +
scale_fill_manual(values = c(
"Negative (ERROR)" = "red",
"Too Short (<21 d)" = "orange",
"Optimal (21-120 d)" = "darkgreen",
"Acceptable (120-200 d)" = "lightgreen",
"Long (200-365 d)" = "yellow",
"Extreme (>365 d)" = "purple"
)) +
labs(
title = "Days Open - Full Distribution",
subtitle = paste0("Min = ", round(min(do_data$days_open), 1), " (NEGATIVE!) | Max = ",
format(round(max(do_data$days_open), 1), big.mark = ",")),
x = "Days Open",
y = "Count (log scale)",
fill = "Category"
) +
scale_y_log10(labels = comma) +
scale_x_continuous(limits = c(-500, 2000)) +
theme(legend.position = "bottom")
print(p1)# Merge data for scatter plots
scatter_data <- milking %>%
filter(!is.na(afc_months), !is.na(daily_yield)) %>%
filter(afc_months > -500, afc_months < 500, daily_yield >= 0, daily_yield <= 60)
# Sample if too large
if(nrow(scatter_data) > 50000) {
scatter_data <- scatter_data %>% sample_n(50000)
}
ggplot(scatter_data, aes(x = afc_months, y = daily_yield)) +
geom_point(alpha = 0.3, size = 0.5, color = "steelblue") +
geom_vline(xintercept = c(0, 18, 72), linetype = "dashed", color = "red") +
geom_hline(yintercept = c(0, 50), linetype = "dashed", color = "red") +
annotate("rect", xmin = 18, xmax = 72, ymin = 0.5, ymax = 50,
alpha = 0.1, fill = "green") +
labs(
title = "AFC vs Daily Milk Yield - Outlier Detection",
subtitle = "Green zone = expected normal range; Red lines = thresholds",
x = "Age at First Calving (months)",
y = "Daily Milk Yield (kg)"
)scatter_dim <- milking %>%
filter(!is.na(dim), !is.na(daily_yield)) %>%
filter(dim > -1000, dim < 1000, daily_yield >= 0, daily_yield <= 60)
# Sample if too large
if(nrow(scatter_dim) > 50000) {
scatter_dim <- scatter_dim %>% sample_n(50000)
}
ggplot(scatter_dim, aes(x = dim, y = daily_yield)) +
geom_point(alpha = 0.3, size = 0.5, color = "darkgreen") +
geom_vline(xintercept = c(0, 305, 500), linetype = "dashed", color = "red") +
geom_hline(yintercept = c(0, 50), linetype = "dashed", color = "red") +
annotate("rect", xmin = 5, xmax = 305, ymin = 0.5, ymax = 40,
alpha = 0.1, fill = "green") +
labs(
title = "Days in Milk vs Daily Yield - Outlier Detection",
subtitle = "Green zone = expected normal range",
x = "Days in Milk (DIM)",
y = "Daily Milk Yield (kg)"
)# Create final summary table with proper NA handling
quality_summary <- data.frame(
Trait = c("Age at First Calving", "Calving Interval", "Total Lactation Yield",
"Daily Milk Yield", "Days in Milk", "Days Open", "Parity"),
N_Total = c(
sum(!is.na(lactation$afc_months) & lactation$parity == 1),
sum(!is.na(lactation$calving_interval_days) & lactation$parity > 1),
sum(!is.na(lactation$total_lactation_yield)),
sum(!is.na(milking$daily_yield)),
sum(!is.na(milking$dim)),
sum(!is.na(lactation$days_open)),
sum(!is.na(lactation$parity))
),
Min_Value = c(
round(min(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
round(min(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
round(min(lactation$total_lactation_yield, na.rm = TRUE), 1),
round(min(milking$daily_yield, na.rm = TRUE), 2),
round(min(milking$dim, na.rm = TRUE), 0),
round(min(lactation$days_open, na.rm = TRUE), 1),
round(min(lactation$parity, na.rm = TRUE), 0)
),
Max_Value = c(
round(max(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 1),
round(max(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 1),
round(max(lactation$total_lactation_yield, na.rm = TRUE), 1),
round(max(milking$daily_yield, na.rm = TRUE), 2),
round(max(milking$dim, na.rm = TRUE), 0),
round(max(lactation$days_open, na.rm = TRUE), 1),
round(max(lactation$parity, na.rm = TRUE), 0)
),
Raw_Mean = c(
round(mean(lactation$afc_months[lactation$parity == 1], na.rm = TRUE), 2),
round(mean(lactation$calving_interval_days[lactation$parity > 1], na.rm = TRUE), 2),
round(mean(lactation$total_lactation_yield, na.rm = TRUE), 2),
round(mean(milking$daily_yield, na.rm = TRUE), 2),
round(mean(milking$dim, na.rm = TRUE), 2),
round(mean(lactation$days_open, na.rm = TRUE), 2),
round(mean(lactation$parity, na.rm = TRUE), 2)
),
Recommended_Range = c(
"18-72 months",
"300-700 days",
"100-15,000 kg",
"0.5-50 kg",
"5-500 days",
"21-400 days",
"1-12"
),
Problem_Count = c(
sum((lactation$afc_months < 0 | lactation$afc_months > 100) & lactation$parity == 1, na.rm = TRUE),
sum((lactation$calving_interval_days < 280 | lactation$calving_interval_days > 1000) & lactation$parity > 1, na.rm = TRUE),
sum(lactation$total_lactation_yield == 0 | lactation$total_lactation_yield > 20000, na.rm = TRUE),
sum(milking$daily_yield <= 0 | milking$daily_yield > 50, na.rm = TRUE),
sum(milking$dim < 0 | milking$dim > 1000, na.rm = TRUE),
sum(lactation$days_open < 0 | lactation$days_open > 500, na.rm = TRUE),
sum(lactation$parity < 1 | lactation$parity > 15, na.rm = TRUE)
)
)
# Calculate percentages
quality_summary <- quality_summary %>%
mutate(
Problem_Pct = round(100 * Problem_Count / N_Total, 2),
Quality_Status = case_when(
Problem_Pct < 1 ~ "Good",
Problem_Pct < 5 ~ "Minor Issues",
Problem_Pct < 20 ~ "Moderate Issues",
TRUE ~ "Major Issues"
)
)
kable(quality_summary,
caption = "Data Quality Summary with Min/Max Values",
col.names = c("Trait", "N Total", "Min", "Max", "Raw Mean",
"Recommended Range", "Problem N", "Problem %", "Status"),
format.args = list(big.mark = ",")) %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE)| Trait | N Total | Min | Max | Raw Mean | Recommended Range | Problem N | Problem % | Status |
|---|---|---|---|---|---|---|---|---|
| Age at First Calving | NA | -394 | 24,295.0 | 50.96 | 18-72 months | 1,608 | NA | Major Issues |
| Calving Interval | NA | 0 | 16,530.0 | 521.70 | 300-700 days | 4,529 | NA | Major Issues |
| Total Lactation Yield | 88,347 | 0 | 23,186.4 | 112.02 | 100-15,000 kg | 51,155 | 57.90 | Major Issues |
| Daily Milk Yield | 1,048,546 | 0 | 50.0 | 5.51 | 0.5-50 kg | 333,069 | 31.76 | Major Issues |
| Days in Milk | 1,048,575 | -729,939 | 7,752.0 | 220.35 | 5-500 days | 22,277 | 2.12 | Minor Issues |
| Days Open | 50,710 | -280 | 669,062.0 | 255.21 | 21-400 days | 7,307 | 14.41 | Moderate Issues |
| Parity | 87,990 | 1 | 32.0 | 2.42 | 1-12 | 39 | 0.04 | Good |
Summary of Outliers Detected:
| Trait | Min Value | Max Value | Issue |
|---|---|---|---|
| AFC | -394 months | 24,295 months | Negative values & extreme highs |
| DIM | -729,939 days | 7,752 days | Extreme negative values |
| Days Open | -280 days | 669,062 days | Negative values |
| Total Lact. Yield | 0 kg | 23,186.4 kg | 57.9% zeros |
# Lactation data cleaning
lactation_clean <- lactation %>%
filter(
is.na(afc_months) | (afc_months >= 18 & afc_months <= 72),
is.na(calving_interval_days) | (calving_interval_days >= 300 & calving_interval_days <= 700),
total_lactation_yield > 0 & total_lactation_yield <= 15000,
is.na(days_open) | (days_open >= 21 & days_open <= 400),
parity >= 1 & parity <= 12
)
# Milking data cleaning
milking_clean <- milking %>%
filter(
dim >= 5 & dim <= 500,
daily_yield >= 0.5 & daily_yield <= 50,
is.na(afc_months) | (afc_months >= 18 & afc_months <= 72),
is.na(parity) | (parity >= 1 & parity <= 12)
)## R version 4.5.1 (2025-06-13 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
## LAPACK version 3.12.1
##
## locale:
## [1] LC_COLLATE=English_Kenya.utf8 LC_CTYPE=English_Kenya.utf8
## [3] LC_MONETARY=English_Kenya.utf8 LC_NUMERIC=C
## [5] LC_TIME=English_Kenya.utf8
##
## time zone: Africa/Nairobi
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] viridis_0.6.5 viridisLite_0.4.2 kableExtra_1.4.0 knitr_1.50
## [5] gridExtra_2.3 scales_1.4.0 ggplot2_3.5.2 tidyr_1.3.1
## [9] dplyr_1.1.4 data.table_1.17.8
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 compiler_4.5.1 tidyselect_1.2.1
## [5] xml2_1.4.0 stringr_1.5.1 jquerylib_0.1.4 textshaping_1.0.2
## [9] systemfonts_1.2.3 yaml_2.3.10 fastmap_1.2.0 R6_2.6.1
## [13] labeling_0.4.3 generics_0.1.4 tibble_3.3.0 svglite_2.2.1
## [17] bslib_0.9.0 pillar_1.11.0 RColorBrewer_1.1-3 rlang_1.1.6
## [21] stringi_1.8.7 cachem_1.1.0 xfun_0.52 sass_0.4.10
## [25] cli_3.6.5 withr_3.0.2 magrittr_2.0.3 digest_0.6.37
## [29] grid_4.5.1 rstudioapi_0.17.1 lifecycle_1.0.4 vctrs_0.6.5
## [33] evaluate_1.0.5 glue_1.8.0 farver_2.1.2 rmarkdown_2.29
## [37] purrr_1.1.0 tools_4.5.1 pkgconfig_2.0.3 htmltools_0.5.8.1
Report generated on 05 December 2025 at 04:48