Background

This analysis is conducted based on Raphael’s comments regarding the need to analyze lactation curves by taurine gene proportion groups. We examine milk yield patterns across four taurine genetic composition categories to understand how genetic background influences lactation performance.

Data Loading and Preprocessing

# Load data from the Excel file
file_name <- "kenya_merged_analysis-data_20250708.xlsx"
milk_data <- read_excel(file_name, sheet = 4)

# Data preprocessing
milk_data <- milk_data %>%
  mutate(
    calving_date = as.Date(calving_date),
    birth_date = as.Date(birth_date),
    milking_date = as.Date(milking_date),
    cowno_final = as.character(cowno_final),
    herd = factor(herd),
    parity = as.numeric(parity),
    yield = as.numeric(yield),
    dim = as.numeric(dim),
    breed_numeric = as.numeric(breed)
  ) %>%
  # Filter for valid data
  filter(!is.na(yield), !is.na(dim), dim > 0, dim <= 500, yield > 0)

# Create taurine gene proportion groups
milk_data <- milk_data %>%
  mutate(
    taurine_group = case_when(
      is.na(breed_numeric) ~ NA_character_,
      breed_numeric <= 0.25 ~ "0-0.25 (0-25%)",
      breed_numeric <= 0.5 ~ "0.25-0.5 (25-50%)",
      breed_numeric <= 0.75 ~ "0.5-0.75 (50-75%)",
      breed_numeric <= 1.0 ~ "0.75-1.0 (75-100%)",
      TRUE ~ NA_character_
    ),
    taurine_group = factor(taurine_group, 
                          levels = c("0-0.25 (0-25%)", "0.25-0.5 (25-50%)", 
                                   "0.5-0.75 (50-75%)", "0.75-1.0 (75-100%)"))
  ) %>%
  filter(!is.na(taurine_group))

Data Summary

# Check data availability by group
cat("Data availability by taurine group:\n")
## Data availability by taurine group:
print(table(milk_data$taurine_group))
## 
##     0-0.25 (0-25%)  0.25-0.5 (25-50%)  0.5-0.75 (50-75%) 0.75-1.0 (75-100%) 
##               1085              55244              68014              77942
# Sample sizes by taurine group with standard deviations
sample_summary <- milk_data %>%
  group_by(taurine_group) %>%
  summarise(
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    n_herds = n_distinct(herd),
    mean_yield = round(mean(yield, na.rm = TRUE), 2),
    sd_yield = round(sd(yield, na.rm = TRUE), 2),
    mean_dim = round(mean(dim, na.rm = TRUE), 1),
    sd_dim = round(sd(dim, na.rm = TRUE), 1),
    .groups = 'drop'
  ) %>%
  mutate(
    yield_summary = paste0(mean_yield, " ± ", sd_yield),
    dim_summary = paste0(mean_dim, " ± ", sd_dim)
  ) %>%
  select(taurine_group, n_records, n_cows, n_herds, yield_summary, dim_summary)

kable(sample_summary, 
      caption = "Sample sizes by taurine group",
      col.names = c("Taurine Group", "Records", "Cows", "Herds", "Mean Yield ± SD", "Mean DIM ± SD")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Sample sizes by taurine group
Taurine Group Records Cows Herds Mean Yield ± SD Mean DIM ± SD
0-0.25 (0-25%) 1085 4 1 8.06 ± 2.92 132.7 ± 95.8
0.25-0.5 (25-50%) 55244 94 1 14.47 ± 5.29 171.5 ± 106.6
0.5-0.75 (50-75%) 68014 158 1 14.39 ± 5.18 164.5 ± 104.8
0.75-1.0 (75-100%) 77942 232 1 13.91 ± 4.99 156.7 ± 102.7

Weekly Lactation Curves Analysis

# =============================================================================
# WEEKLY LACTATION CURVES (PER WEEK - 7-day periods)
# =============================================================================

# Calculate weekly averages - PER WEEK
weekly_data <- milk_data %>%
  filter(dim <= 280) %>%  # Limit to 40 weeks
  mutate(week = ceiling(dim / 7)) %>%
  filter(week >= 1 & week <= 40) %>%
  group_by(taurine_group, week) %>%
  summarise(
    yield_mean = mean(yield, na.rm = TRUE),
    yield_se = sd(yield, na.rm = TRUE) / sqrt(n()),
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    .groups = 'drop'
  ) %>%
  filter(n_records >= 3)  # Minimum 3 records for estimate

# Display weekly summary statistics
weekly_summary <- weekly_data %>%
  group_by(taurine_group) %>%
  summarise(
    weeks_with_data = n(),
    avg_records_per_week = round(mean(n_records), 1),
    peak_yield = round(max(yield_mean, na.rm = TRUE), 2),
    peak_week = week[which.max(yield_mean)],
    .groups = 'drop'
  )

kable(weekly_summary,
      caption = "Weekly analysis summary by taurine group",
      col.names = c("Taurine Group", "Weeks with Data", "Avg Records/Week", "Peak Yield", "Peak Week")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Weekly analysis summary by taurine group
Taurine Group Weeks with Data Avg Records/Week Peak Yield Peak Week
0-0.25 (0-25%) 40 24.4 12.15 2
0.25-0.5 (25-50%) 40 1119.3 17.32 7
0.5-0.75 (50-75%) 40 1414.3 17.24 6
0.75-1.0 (75-100%) 40 1661.3 16.20 5
# Create weekly lactation curves plot
p_weekly <- ggplot(weekly_data, aes(x = week, y = yield_mean, color = taurine_group)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = yield_mean - yield_se, ymax = yield_mean + yield_se, fill = taurine_group), 
              alpha = 0.2, color = NA) +
  scale_color_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  scale_fill_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  labs(
    title = "Weekly Lactation Curves by Taurine Gene Proportion",
    x = "Week of Lactation",
    y = "Mean Milk Yield (kg/day)",
    color = "Taurine Group",
    fill = "Taurine Group"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
    legend.position = "bottom"
  ) +
  scale_x_continuous(breaks = seq(0, 40, 5))

print(p_weekly)
Weekly lactation curves by taurine gene proportion

Weekly lactation curves by taurine gene proportion

30-Day Lactation Curves Analysis

# =============================================================================
# 30-DAY LACTATION CURVES (PER 30-DAY PERIOD)
# =============================================================================

# Calculate 30-day period averages - PER 30 DAYS
monthly_data <- milk_data %>%
  filter(dim <= 450) %>%  # Limit to 15 months
  mutate(
    period = case_when(
      dim <= 30 ~ 1, dim <= 60 ~ 2, dim <= 90 ~ 3, dim <= 120 ~ 4,
      dim <= 150 ~ 5, dim <= 180 ~ 6, dim <= 210 ~ 7, dim <= 240 ~ 8,
      dim <= 270 ~ 9, dim <= 300 ~ 10, dim <= 330 ~ 11, dim <= 360 ~ 12,
      dim <= 390 ~ 13, dim <= 420 ~ 14, dim <= 450 ~ 15,
      TRUE ~ 16
    ),
    period_mid = case_when(
      dim <= 30 ~ 15, dim <= 60 ~ 45, dim <= 90 ~ 75, dim <= 120 ~ 105,
      dim <= 150 ~ 135, dim <= 180 ~ 165, dim <= 210 ~ 195, dim <= 240 ~ 225,
      dim <= 270 ~ 255, dim <= 300 ~ 285, dim <= 330 ~ 315, dim <= 360 ~ 345,
      dim <= 390 ~ 375, dim <= 420 ~ 405, dim <= 450 ~ 435,
      TRUE ~ 465
    ),
    period_label = case_when(
      dim <= 30 ~ "Days 1-30", dim <= 60 ~ "Days 31-60", dim <= 90 ~ "Days 61-90", 
      dim <= 120 ~ "Days 91-120", dim <= 150 ~ "Days 121-150", dim <= 180 ~ "Days 151-180",
      dim <= 210 ~ "Days 181-210", dim <= 240 ~ "Days 211-240", dim <= 270 ~ "Days 241-270",
      dim <= 300 ~ "Days 271-300", dim <= 330 ~ "Days 301-330", dim <= 360 ~ "Days 331-360",
      dim <= 390 ~ "Days 361-390", dim <= 420 ~ "Days 391-420", dim <= 450 ~ "Days 421-450",
      TRUE ~ "Days >450"
    )
  ) %>%
  filter(period <= 15) %>%
  group_by(taurine_group, period, period_mid, period_label) %>%
  summarise(
    yield_mean = mean(yield, na.rm = TRUE),
    yield_se = sd(yield, na.rm = TRUE) / sqrt(n()),
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    .groups = 'drop'
  ) %>%
  filter(n_records >= 3)  # Minimum 3 records for estimate

# Display 30-day summary statistics
monthly_summary <- monthly_data %>%
  group_by(taurine_group) %>%
  summarise(
    periods_with_data = n(),
    avg_records_per_period = round(mean(n_records), 1),
    peak_yield = round(max(yield_mean, na.rm = TRUE), 2),
    peak_period = period[which.max(yield_mean)],
    peak_days = period_mid[which.max(yield_mean)],
    .groups = 'drop'
  )

kable(monthly_summary,
      caption = "30-day analysis summary by taurine group",
      col.names = c("Taurine Group", "Periods with Data", "Avg Records/Period", "Peak Yield", "Peak Period", "Peak Days")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
30-day analysis summary by taurine group
Taurine Group Periods with Data Avg Records/Period Peak Yield Peak Period Peak Days
0-0.25 (0-25%) 13 83.5 10.60 1 15
0.25-0.5 (25-50%) 14 3946.0 17.23 2 45
0.5-0.75 (50-75%) 14 4858.1 17.16 2 45
0.75-1.0 (75-100%) 14 5567.3 16.01 2 45
# Create 30-day lactation curves plot
p_monthly <- ggplot(monthly_data, aes(x = period_mid, y = yield_mean, color = taurine_group)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = yield_mean - yield_se, ymax = yield_mean + yield_se, fill = taurine_group), 
              alpha = 0.2, color = NA) +
  scale_color_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  scale_fill_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  labs(
    title = "30-Day Lactation Curves by Taurine Gene Proportion",
    x = "Days in Milk (midpoint of 30-day period)",
    y = "Mean Milk Yield (kg/day)",
    color = "Taurine Group",
    fill = "Taurine Group"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
    legend.position = "bottom"
  ) +
  scale_x_continuous(breaks = seq(0, 450, 60))

print(p_monthly)
30-day lactation curves by taurine gene proportion

30-day lactation curves by taurine gene proportion

Summary Statistics

# Overall group comparisons
group_stats <- milk_data %>%
  group_by(taurine_group) %>%
  summarise(
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    mean_yield = round(mean(yield, na.rm = TRUE), 2),
    sd_yield = round(sd(yield, na.rm = TRUE), 2),
    mean_dim = round(mean(dim, na.rm = TRUE), 1),
    max_dim = max(dim, na.rm = TRUE),
    .groups = 'drop'
  )

kable(group_stats,
      caption = "Overall group statistics",
      col.names = c("Taurine Group", "Records", "Cows", "Mean Yield", "SD Yield", "Mean DIM", "Max DIM")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Overall group statistics
Taurine Group Records Cows Mean Yield SD Yield Mean DIM Max DIM
0-0.25 (0-25%) 1085 4 8.06 2.92 132.7 389
0.25-0.5 (25-50%) 55244 94 14.47 5.29 171.5 400
0.5-0.75 (50-75%) 68014 158 14.39 5.18 164.5 400
0.75-1.0 (75-100%) 77942 232 13.91 4.99 156.7 400
# Peak yield comparison
peak_comparison <- bind_rows(
  weekly_summary %>% 
    select(taurine_group, peak_yield, peak_week) %>%
    mutate(analysis = "Weekly", peak_timing = paste0("Week ", peak_week)),
  monthly_summary %>% 
    select(taurine_group, peak_yield, peak_days) %>%
    rename(peak_timing = peak_days) %>%
    mutate(analysis = "30-Day", peak_timing = paste0("Day ", peak_timing))
) %>%
  select(taurine_group, analysis, peak_yield, peak_timing)

kable(peak_comparison,
      caption = "Peak yield comparison between weekly and 30-day analyses",
      col.names = c("Taurine Group", "Analysis", "Peak Yield", "Peak Timing")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))
Peak yield comparison between weekly and 30-day analyses
Taurine Group Analysis Peak Yield Peak Timing
0-0.25 (0-25%) Weekly 12.15 Week 2
0.25-0.5 (25-50%) Weekly 17.32 Week 7
0.5-0.75 (50-75%) Weekly 17.24 Week 6
0.75-1.0 (75-100%) Weekly 16.20 Week 5
0-0.25 (0-25%) 30-Day 10.60 Day 15
0.25-0.5 (25-50%) 30-Day 17.23 Day 45
0.5-0.75 (50-75%) 30-Day 17.16 Day 45
0.75-1.0 (75-100%) 30-Day 16.01 Day 45

Next Steps: Lactation Curve Modeling Plan

Introduction

This analysis is conducted based on Raphael’s comments. The next step will be to model lactation curves using various mathematical models and machine learning approaches to better understand and predict milk production patterns across different taurine gene proportion groups.

Traditional Lactation Curve Models

1. Wood’s Model (1967)

The classic gamma function model: \[Y_t = at^b e^{-ct}\]

Where: - \(Y_t\) = milk yield at time t - \(a\) = scale parameter (related to initial yield) - \(b\) = shape parameter (pre-peak slope) - \(c\) = decay parameter (post-peak decline)

Implementation approach:

# Non-linear least squares fitting
wood_model <- nls(yield ~ a * (dim^b) * exp(-c * dim), 
                  data = milk_data, 
                  start = list(a = 10, b = 0.1, c = 0.002))

2. Wilmink’s Model (1987)

Incorporates exponential and polynomial terms: \[Y_t = a + bt + ce^{-kt}\]

Advantages: Better fit for extended lactations and multiphasic curves

3. Ali-Schaeffer Model (1987)

Polynomial-based approach: \[Y_t = a_0 + a_1t + a_2t^2 + a_3\ln(t) + a_4(\ln(t))^2\]

Advantages: Flexible functional form, good for irregular curve shapes

4. Dijkstra Model (1997)

Mechanistic approach based on mammary cell dynamics: \[Y_t = as^{b-1}e^{-st}\]

Where \(s = ct/(b-1)\)

Machine Learning Approaches

1. Gaussian Process Regression

Advantages: - Provides uncertainty quantification - Handles irregular time intervals - Non-parametric approach - Can incorporate genetic covariates

Implementation:

library(kernlab)
# GP with RBF kernel
gp_model <- gausspr(yield ~ dim + taurine_group, 
                    data = milk_data, 
                    kernel = "rbfdot")

2. Random Forest for Lactation Curves

Features to include: - Days in milk (DIM) - Taurine gene proportion - Parity - Herd effects - Season/month - Previous lactation performance

Implementation:

library(randomForest)
rf_model <- randomForest(yield ~ dim + I(dim^2) + I(log(dim)) + 
                         taurine_group + parity + herd + month,
                         data = milk_data, 
                         ntree = 500)

3. Neural Networks (Deep Learning)

Architecture suggestions: - Long Short-Term Memory (LSTM) networks for sequential data - Convolutional Neural Networks for pattern recognition - Attention mechanisms for important time periods

Implementation:

library(torch)
# LSTM for time series prediction
lstm_model <- nn_sequential(
  nn_lstm(input_size = n_features, hidden_size = 64, batch_first = TRUE),
  nn_dropout(0.2),
  nn_linear(64, 1)
)

4. Support Vector Regression (SVR)

Kernel options: - Radial Basis Function (RBF) - Polynomial kernels - Custom kernels incorporating genetic similarity

5. Gradient Boosting Methods

Options: - XGBoost with time-series features - LightGBM for large datasets - CatBoost for categorical variables (breed groups)

Advanced Modeling Approaches

1. Mixed-Effects Models

Incorporate random effects for: - Individual cow variations - Herd effects - Genetic group effects

library(nlme)
mixed_wood <- nlme(yield ~ a * (dim^b) * exp(-c * dim),
                   data = milk_data,
                   fixed = a + b + c ~ taurine_group,
                   random = a + b + c ~ 1 | cowno_final,
                   start = c(a = 10, b = 0.1, c = 0.002))

2. Bayesian Hierarchical Models

Advantages: - Incorporate prior knowledge - Handle missing data naturally - Quantify uncertainty in parameters

library(brms)
bayesian_model <- brm(yield ~ dim + I(dim^2) + I(log(dim)) + 
                      (1 + dim | taurine_group) + (1 | cowno_final),
                      data = milk_data,
                      family = gaussian(),
                      chains = 4, iter = 2000)

3. Functional Data Analysis

Treat entire lactation curves as functional objects: - Principal Component Analysis of curves - Functional regression models - Curve clustering approaches

Customization with Machine Learning

1. Ensemble Methods

Combine multiple models:

# Weighted ensemble
ensemble_pred <- 0.3 * wood_pred + 0.3 * gp_pred + 0.4 * rf_pred

2. Transfer Learning

  • Pre-train models on large dairy datasets
  • Fine-tune for specific genetic groups
  • Domain adaptation techniques

3. Multi-task Learning

Simultaneously predict: - Daily milk yield - Total lactation yield - Peak yield timing - Persistency measures

4. Feature Engineering

Genetic features: - Taurine gene proportion (continuous) - Breed composition vectors - Heterosis effects - Genomic relationship matrices

Environmental features: - Temperature-humidity index - Feed composition - Management practices - Disease records

5. Hyperparameter Optimization

Techniques: - Grid search - Random search - Bayesian optimization (using packages like mlr3tuning) - Genetic algorithms

Model Evaluation Framework

1. Cross-validation Strategies

  • Time series cross-validation
  • Leave-one-cow-out cross-validation
  • Leave-one-herd-out cross-validation

2. Performance Metrics

Accuracy metrics: - Root Mean Square Error (RMSE) - Mean Absolute Error (MAE) - Mean Absolute Percentage Error (MAPE)

Biological relevance metrics: - Peak yield prediction accuracy - Lactation persistency correlation - Total milk yield prediction

3. Model Interpretability

Techniques: - SHAP (SHapley Additive exPlanations) values - LIME (Local Interpretable Model-agnostic Explanations) - Partial dependence plots - Feature importance rankings

Implementation Strategy

Traditional Models Phase

  • Fit Wood’s, Wilmink’s, and Ali-Schaeffer models
  • Compare model performance by taurine group
  • Estimate genetic group-specific parameters

Machine Learning Models Phase

  • Implement Random Forest and SVR models
  • Feature engineering and selection
  • Cross-validation and hyperparameter tuning

Advanced Methods Phase

  • Gaussian Process regression
  • Neural network implementations
  • Mixed-effects and Bayesian models

Ensemble and Evaluation Phase

  • Combine models using ensemble methods
  • Comprehensive evaluation across genetic groups
  • Generate predictions and confidence intervals

Expected Outcomes

1. Model Comparison

  • Ranking of models by performance for each taurine group
  • Identification of best-performing approaches
  • Assessment of genetic group-specific model requirements

2. Biological Insights

  • Genetic effects on lactation curve parameters
  • Identification of critical lactation phases
  • Quantification of taurine gene proportion effects

3. Practical Applications

  • Improved yield predictions for management decisions
  • Early identification of poor-performing animals
  • Optimization of feeding and breeding strategies

Conclusion

This comprehensive modeling approach will leverage both traditional dairy science knowledge and modern machine learning techniques to understand lactation patterns across taurine gene proportion groups. The combination of multiple modeling approaches will provide robust predictions and insights into the genetic basis of lactation curve variations in Kenyan dairy cattle.

This analysis framework is developed based on Raphael’s comments and will be implemented using the lactation curve data analyzed above.