This analysis is conducted based on Raphael’s comments regarding the need to analyze lactation curves by taurine gene proportion groups. We examine milk yield patterns across four taurine genetic composition categories to understand how genetic background influences lactation performance.
# Load data from the Excel file
file_name <- "kenya_merged_analysis-data_20250708.xlsx"
milk_data <- read_excel(file_name, sheet = 4)
# Data preprocessing
milk_data <- milk_data %>%
mutate(
calving_date = as.Date(calving_date),
birth_date = as.Date(birth_date),
milking_date = as.Date(milking_date),
cowno_final = as.character(cowno_final),
herd = factor(herd),
parity = as.numeric(parity),
yield = as.numeric(yield),
dim = as.numeric(dim),
breed_numeric = as.numeric(breed)
) %>%
# Filter for valid data
filter(!is.na(yield), !is.na(dim), dim > 0, dim <= 500, yield > 0)
# Create taurine gene proportion groups
milk_data <- milk_data %>%
mutate(
taurine_group = case_when(
is.na(breed_numeric) ~ NA_character_,
breed_numeric <= 0.25 ~ "0-0.25 (0-25%)",
breed_numeric <= 0.5 ~ "0.25-0.5 (25-50%)",
breed_numeric <= 0.75 ~ "0.5-0.75 (50-75%)",
breed_numeric <= 1.0 ~ "0.75-1.0 (75-100%)",
TRUE ~ NA_character_
),
taurine_group = factor(taurine_group,
levels = c("0-0.25 (0-25%)", "0.25-0.5 (25-50%)",
"0.5-0.75 (50-75%)", "0.75-1.0 (75-100%)"))
) %>%
filter(!is.na(taurine_group))## Data availability by taurine group:
##
## 0-0.25 (0-25%) 0.25-0.5 (25-50%) 0.5-0.75 (50-75%) 0.75-1.0 (75-100%)
## 1085 55244 68014 77942
# Sample sizes by taurine group with standard deviations
sample_summary <- milk_data %>%
group_by(taurine_group) %>%
summarise(
n_records = n(),
n_cows = n_distinct(cowno_final),
n_herds = n_distinct(herd),
mean_yield = round(mean(yield, na.rm = TRUE), 2),
sd_yield = round(sd(yield, na.rm = TRUE), 2),
mean_dim = round(mean(dim, na.rm = TRUE), 1),
sd_dim = round(sd(dim, na.rm = TRUE), 1),
.groups = 'drop'
) %>%
mutate(
yield_summary = paste0(mean_yield, " ± ", sd_yield),
dim_summary = paste0(mean_dim, " ± ", sd_dim)
) %>%
select(taurine_group, n_records, n_cows, n_herds, yield_summary, dim_summary)
kable(sample_summary,
caption = "Sample sizes by taurine group",
col.names = c("Taurine Group", "Records", "Cows", "Herds", "Mean Yield ± SD", "Mean DIM ± SD")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Taurine Group | Records | Cows | Herds | Mean Yield ± SD | Mean DIM ± SD |
|---|---|---|---|---|---|
| 0-0.25 (0-25%) | 1085 | 4 | 1 | 8.06 ± 2.92 | 132.7 ± 95.8 |
| 0.25-0.5 (25-50%) | 55244 | 94 | 1 | 14.47 ± 5.29 | 171.5 ± 106.6 |
| 0.5-0.75 (50-75%) | 68014 | 158 | 1 | 14.39 ± 5.18 | 164.5 ± 104.8 |
| 0.75-1.0 (75-100%) | 77942 | 232 | 1 | 13.91 ± 4.99 | 156.7 ± 102.7 |
# =============================================================================
# WEEKLY LACTATION CURVES (PER WEEK - 7-day periods)
# =============================================================================
# Calculate weekly averages - PER WEEK
weekly_data <- milk_data %>%
filter(dim <= 280) %>% # Limit to 40 weeks
mutate(week = ceiling(dim / 7)) %>%
filter(week >= 1 & week <= 40) %>%
group_by(taurine_group, week) %>%
summarise(
yield_mean = mean(yield, na.rm = TRUE),
yield_se = sd(yield, na.rm = TRUE) / sqrt(n()),
n_records = n(),
n_cows = n_distinct(cowno_final),
.groups = 'drop'
) %>%
filter(n_records >= 3) # Minimum 3 records for estimate
# Display weekly summary statistics
weekly_summary <- weekly_data %>%
group_by(taurine_group) %>%
summarise(
weeks_with_data = n(),
avg_records_per_week = round(mean(n_records), 1),
peak_yield = round(max(yield_mean, na.rm = TRUE), 2),
peak_week = week[which.max(yield_mean)],
.groups = 'drop'
)
kable(weekly_summary,
caption = "Weekly analysis summary by taurine group",
col.names = c("Taurine Group", "Weeks with Data", "Avg Records/Week", "Peak Yield", "Peak Week")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Taurine Group | Weeks with Data | Avg Records/Week | Peak Yield | Peak Week |
|---|---|---|---|---|
| 0-0.25 (0-25%) | 40 | 24.4 | 12.15 | 2 |
| 0.25-0.5 (25-50%) | 40 | 1119.3 | 17.32 | 7 |
| 0.5-0.75 (50-75%) | 40 | 1414.3 | 17.24 | 6 |
| 0.75-1.0 (75-100%) | 40 | 1661.3 | 16.20 | 5 |
# Create weekly lactation curves plot
p_weekly <- ggplot(weekly_data, aes(x = week, y = yield_mean, color = taurine_group)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
geom_ribbon(aes(ymin = yield_mean - yield_se, ymax = yield_mean + yield_se, fill = taurine_group),
alpha = 0.2, color = NA) +
scale_color_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
scale_fill_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
labs(
title = "Weekly Lactation Curves by Taurine Gene Proportion",
x = "Week of Lactation",
y = "Mean Milk Yield (kg/day)",
color = "Taurine Group",
fill = "Taurine Group"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
legend.position = "bottom"
) +
scale_x_continuous(breaks = seq(0, 40, 5))
print(p_weekly)Weekly lactation curves by taurine gene proportion
# =============================================================================
# 30-DAY LACTATION CURVES (PER 30-DAY PERIOD)
# =============================================================================
# Calculate 30-day period averages - PER 30 DAYS
monthly_data <- milk_data %>%
filter(dim <= 450) %>% # Limit to 15 months
mutate(
period = case_when(
dim <= 30 ~ 1, dim <= 60 ~ 2, dim <= 90 ~ 3, dim <= 120 ~ 4,
dim <= 150 ~ 5, dim <= 180 ~ 6, dim <= 210 ~ 7, dim <= 240 ~ 8,
dim <= 270 ~ 9, dim <= 300 ~ 10, dim <= 330 ~ 11, dim <= 360 ~ 12,
dim <= 390 ~ 13, dim <= 420 ~ 14, dim <= 450 ~ 15,
TRUE ~ 16
),
period_mid = case_when(
dim <= 30 ~ 15, dim <= 60 ~ 45, dim <= 90 ~ 75, dim <= 120 ~ 105,
dim <= 150 ~ 135, dim <= 180 ~ 165, dim <= 210 ~ 195, dim <= 240 ~ 225,
dim <= 270 ~ 255, dim <= 300 ~ 285, dim <= 330 ~ 315, dim <= 360 ~ 345,
dim <= 390 ~ 375, dim <= 420 ~ 405, dim <= 450 ~ 435,
TRUE ~ 465
),
period_label = case_when(
dim <= 30 ~ "Days 1-30", dim <= 60 ~ "Days 31-60", dim <= 90 ~ "Days 61-90",
dim <= 120 ~ "Days 91-120", dim <= 150 ~ "Days 121-150", dim <= 180 ~ "Days 151-180",
dim <= 210 ~ "Days 181-210", dim <= 240 ~ "Days 211-240", dim <= 270 ~ "Days 241-270",
dim <= 300 ~ "Days 271-300", dim <= 330 ~ "Days 301-330", dim <= 360 ~ "Days 331-360",
dim <= 390 ~ "Days 361-390", dim <= 420 ~ "Days 391-420", dim <= 450 ~ "Days 421-450",
TRUE ~ "Days >450"
)
) %>%
filter(period <= 15) %>%
group_by(taurine_group, period, period_mid, period_label) %>%
summarise(
yield_mean = mean(yield, na.rm = TRUE),
yield_se = sd(yield, na.rm = TRUE) / sqrt(n()),
n_records = n(),
n_cows = n_distinct(cowno_final),
.groups = 'drop'
) %>%
filter(n_records >= 3) # Minimum 3 records for estimate
# Display 30-day summary statistics
monthly_summary <- monthly_data %>%
group_by(taurine_group) %>%
summarise(
periods_with_data = n(),
avg_records_per_period = round(mean(n_records), 1),
peak_yield = round(max(yield_mean, na.rm = TRUE), 2),
peak_period = period[which.max(yield_mean)],
peak_days = period_mid[which.max(yield_mean)],
.groups = 'drop'
)
kable(monthly_summary,
caption = "30-day analysis summary by taurine group",
col.names = c("Taurine Group", "Periods with Data", "Avg Records/Period", "Peak Yield", "Peak Period", "Peak Days")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Taurine Group | Periods with Data | Avg Records/Period | Peak Yield | Peak Period | Peak Days |
|---|---|---|---|---|---|
| 0-0.25 (0-25%) | 13 | 83.5 | 10.60 | 1 | 15 |
| 0.25-0.5 (25-50%) | 14 | 3946.0 | 17.23 | 2 | 45 |
| 0.5-0.75 (50-75%) | 14 | 4858.1 | 17.16 | 2 | 45 |
| 0.75-1.0 (75-100%) | 14 | 5567.3 | 16.01 | 2 | 45 |
# Create 30-day lactation curves plot
p_monthly <- ggplot(monthly_data, aes(x = period_mid, y = yield_mean, color = taurine_group)) +
geom_line(size = 1.2) +
geom_point(size = 2) +
geom_ribbon(aes(ymin = yield_mean - yield_se, ymax = yield_mean + yield_se, fill = taurine_group),
alpha = 0.2, color = NA) +
scale_color_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
scale_fill_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
labs(
title = "30-Day Lactation Curves by Taurine Gene Proportion",
x = "Days in Milk (midpoint of 30-day period)",
y = "Mean Milk Yield (kg/day)",
color = "Taurine Group",
fill = "Taurine Group"
) +
theme_minimal() +
theme(
plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
legend.position = "bottom"
) +
scale_x_continuous(breaks = seq(0, 450, 60))
print(p_monthly)30-day lactation curves by taurine gene proportion
# Overall group comparisons
group_stats <- milk_data %>%
group_by(taurine_group) %>%
summarise(
n_records = n(),
n_cows = n_distinct(cowno_final),
mean_yield = round(mean(yield, na.rm = TRUE), 2),
sd_yield = round(sd(yield, na.rm = TRUE), 2),
mean_dim = round(mean(dim, na.rm = TRUE), 1),
max_dim = max(dim, na.rm = TRUE),
.groups = 'drop'
)
kable(group_stats,
caption = "Overall group statistics",
col.names = c("Taurine Group", "Records", "Cows", "Mean Yield", "SD Yield", "Mean DIM", "Max DIM")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Taurine Group | Records | Cows | Mean Yield | SD Yield | Mean DIM | Max DIM |
|---|---|---|---|---|---|---|
| 0-0.25 (0-25%) | 1085 | 4 | 8.06 | 2.92 | 132.7 | 389 |
| 0.25-0.5 (25-50%) | 55244 | 94 | 14.47 | 5.29 | 171.5 | 400 |
| 0.5-0.75 (50-75%) | 68014 | 158 | 14.39 | 5.18 | 164.5 | 400 |
| 0.75-1.0 (75-100%) | 77942 | 232 | 13.91 | 4.99 | 156.7 | 400 |
# Peak yield comparison
peak_comparison <- bind_rows(
weekly_summary %>%
select(taurine_group, peak_yield, peak_week) %>%
mutate(analysis = "Weekly", peak_timing = paste0("Week ", peak_week)),
monthly_summary %>%
select(taurine_group, peak_yield, peak_days) %>%
rename(peak_timing = peak_days) %>%
mutate(analysis = "30-Day", peak_timing = paste0("Day ", peak_timing))
) %>%
select(taurine_group, analysis, peak_yield, peak_timing)
kable(peak_comparison,
caption = "Peak yield comparison between weekly and 30-day analyses",
col.names = c("Taurine Group", "Analysis", "Peak Yield", "Peak Timing")) %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Taurine Group | Analysis | Peak Yield | Peak Timing |
|---|---|---|---|
| 0-0.25 (0-25%) | Weekly | 12.15 | Week 2 |
| 0.25-0.5 (25-50%) | Weekly | 17.32 | Week 7 |
| 0.5-0.75 (50-75%) | Weekly | 17.24 | Week 6 |
| 0.75-1.0 (75-100%) | Weekly | 16.20 | Week 5 |
| 0-0.25 (0-25%) | 30-Day | 10.60 | Day 15 |
| 0.25-0.5 (25-50%) | 30-Day | 17.23 | Day 45 |
| 0.5-0.75 (50-75%) | 30-Day | 17.16 | Day 45 |
| 0.75-1.0 (75-100%) | 30-Day | 16.01 | Day 45 |
This analysis is conducted based on Raphael’s comments. The next step will be to model lactation curves using various mathematical models and machine learning approaches to better understand and predict milk production patterns across different taurine gene proportion groups.
The classic gamma function model: \[Y_t = at^b e^{-ct}\]
Where: - \(Y_t\) = milk yield at time t - \(a\) = scale parameter (related to initial yield) - \(b\) = shape parameter (pre-peak slope) - \(c\) = decay parameter (post-peak decline)
Implementation approach:
Incorporates exponential and polynomial terms: \[Y_t = a + bt + ce^{-kt}\]
Advantages: Better fit for extended lactations and multiphasic curves
Polynomial-based approach: \[Y_t = a_0 + a_1t + a_2t^2 + a_3\ln(t) + a_4(\ln(t))^2\]
Advantages: Flexible functional form, good for irregular curve shapes
Mechanistic approach based on mammary cell dynamics: \[Y_t = as^{b-1}e^{-st}\]
Where \(s = ct/(b-1)\)
Advantages: - Provides uncertainty quantification - Handles irregular time intervals - Non-parametric approach - Can incorporate genetic covariates
Implementation:
Features to include: - Days in milk (DIM) - Taurine gene proportion - Parity - Herd effects - Season/month - Previous lactation performance
Implementation:
Architecture suggestions: - Long Short-Term Memory (LSTM) networks for sequential data - Convolutional Neural Networks for pattern recognition - Attention mechanisms for important time periods
Implementation:
Kernel options: - Radial Basis Function (RBF) - Polynomial kernels - Custom kernels incorporating genetic similarity
Options: - XGBoost with time-series features - LightGBM for large datasets - CatBoost for categorical variables (breed groups)
Incorporate random effects for: - Individual cow variations - Herd effects - Genetic group effects
Advantages: - Incorporate prior knowledge - Handle missing data naturally - Quantify uncertainty in parameters
Treat entire lactation curves as functional objects: - Principal Component Analysis of curves - Functional regression models - Curve clustering approaches
Combine multiple models:
Simultaneously predict: - Daily milk yield - Total lactation yield - Peak yield timing - Persistency measures
Genetic features: - Taurine gene proportion (continuous) - Breed composition vectors - Heterosis effects - Genomic relationship matrices
Environmental features: - Temperature-humidity index - Feed composition - Management practices - Disease records
Techniques: - Grid search - Random search - Bayesian
optimization (using packages like mlr3tuning) - Genetic
algorithms
Accuracy metrics: - Root Mean Square Error (RMSE) - Mean Absolute Error (MAE) - Mean Absolute Percentage Error (MAPE)
Biological relevance metrics: - Peak yield prediction accuracy - Lactation persistency correlation - Total milk yield prediction
Techniques: - SHAP (SHapley Additive exPlanations) values - LIME (Local Interpretable Model-agnostic Explanations) - Partial dependence plots - Feature importance rankings
This comprehensive modeling approach will leverage both traditional dairy science knowledge and modern machine learning techniques to understand lactation patterns across taurine gene proportion groups. The combination of multiple modeling approaches will provide robust predictions and insights into the genetic basis of lactation curve variations in Kenyan dairy cattle.
This analysis framework is developed based on Raphael’s comments and will be implemented using the lactation curve data analyzed above.