Background

This analysis is conducted based on Raphael’s comments regarding the need to analyze lactation curves by taurine gene proportion groups. We examine milk yield patterns across four taurine genetic composition categories to understand how genetic background influences lactation performance.

Data Loading and Preprocessing

# Load data from the Excel file
file_name <- "kenya_merged_analysis-data_20250708.xlsx"
milk_data <- read_excel(file_name, sheet = 4)

# Data preprocessing
milk_data <- milk_data %>%
  mutate(
    calving_date = as.Date(calving_date),
    birth_date = as.Date(birth_date),
    milking_date = as.Date(milking_date),
    cowno_final = as.character(cowno_final),
    herd = factor(herd),
    parity = as.numeric(parity),
    yield = as.numeric(yield),
    dim = as.numeric(dim),
    breed_numeric = as.numeric(breed)
  ) %>%
  # Filter for valid data
  filter(!is.na(yield), !is.na(dim), dim > 0, dim <= 500, yield > 0)

# Create taurine gene proportion groups
milk_data <- milk_data %>%
  mutate(
    taurine_group = case_when(
      is.na(breed_numeric) ~ NA_character_,
      breed_numeric <= 0.25 ~ "0-0.25 (0-25%)",
      breed_numeric <= 0.5 ~ "0.25-0.5 (25-50%)",
      breed_numeric <= 0.75 ~ "0.5-0.75 (50-75%)",
      breed_numeric <= 1.0 ~ "0.75-1.0 (75-100%)",
      TRUE ~ NA_character_
    ),
    taurine_group = factor(taurine_group, 
                          levels = c("0-0.25 (0-25%)", "0.25-0.5 (25-50%)", 
                                   "0.5-0.75 (50-75%)", "0.75-1.0 (75-100%)"))
  ) %>%
  filter(!is.na(taurine_group))

Data Summary

# Check data availability by group
cat("Data availability by taurine group:\n")

## Data availability by taurine group:

print(table(milk_data$taurine_group))

## 
##     0-0.25 (0-25%)  0.25-0.5 (25-50%)  0.5-0.75 (50-75%) 0.75-1.0 (75-100%) 
##               1085              55244              68014              77942

# Sample sizes by taurine group with standard deviations
sample_summary <- milk_data %>%
  group_by(taurine_group) %>%
  summarise(
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    n_herds = n_distinct(herd),
    mean_yield = round(mean(yield, na.rm = TRUE), 2),
    sd_yield = round(sd(yield, na.rm = TRUE), 2),
    mean_dim = round(mean(dim, na.rm = TRUE), 1),
    sd_dim = round(sd(dim, na.rm = TRUE), 1),
    .groups = 'drop'
  ) %>%
  mutate(
    yield_summary = paste0(mean_yield, " ± ", sd_yield),
    dim_summary = paste0(mean_dim, " ± ", sd_dim)
  ) %>%
  select(taurine_group, n_records, n_cows, n_herds, yield_summary, dim_summary)

kable(sample_summary, 
      caption = "Sample sizes by taurine group",
      col.names = c("Taurine Group", "Records", "Cows", "Herds", "Mean Yield ± SD", "Mean DIM ± SD")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Sample sizes by taurine group
Taurine Group	Records	Cows	Herds	Mean Yield ± SD	Mean DIM ± SD
0-0.25 (0-25%)	1085	4	1	8.06 ± 2.92	132.7 ± 95.8
0.25-0.5 (25-50%)	55244	94	1	14.47 ± 5.29	171.5 ± 106.6
0.5-0.75 (50-75%)	68014	158	1	14.39 ± 5.18	164.5 ± 104.8
0.75-1.0 (75-100%)	77942	232	1	13.91 ± 4.99	156.7 ± 102.7

Weekly Lactation Curves Analysis

# =============================================================================
# WEEKLY LACTATION CURVES (PER WEEK - 7-day periods)
# =============================================================================

# Calculate weekly averages - PER WEEK
weekly_data <- milk_data %>%
  filter(dim <= 280) %>%  # Limit to 40 weeks
  mutate(week = ceiling(dim / 7)) %>%
  filter(week >= 1 & week <= 40) %>%
  group_by(taurine_group, week) %>%
  summarise(
    yield_mean = mean(yield, na.rm = TRUE),
    yield_se = sd(yield, na.rm = TRUE) / sqrt(n()),
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    .groups = 'drop'
  ) %>%
  filter(n_records >= 3)  # Minimum 3 records for estimate

# Display weekly summary statistics
weekly_summary <- weekly_data %>%
  group_by(taurine_group) %>%
  summarise(
    weeks_with_data = n(),
    avg_records_per_week = round(mean(n_records), 1),
    peak_yield = round(max(yield_mean, na.rm = TRUE), 2),
    peak_week = week[which.max(yield_mean)],
    .groups = 'drop'
  )

kable(weekly_summary,
      caption = "Weekly analysis summary by taurine group",
      col.names = c("Taurine Group", "Weeks with Data", "Avg Records/Week", "Peak Yield", "Peak Week")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Weekly analysis summary by taurine group
Taurine Group	Weeks with Data	Avg Records/Week	Peak Yield	Peak Week
0-0.25 (0-25%)	40	24.4	12.15	2
0.25-0.5 (25-50%)	40	1119.3	17.32	7
0.5-0.75 (50-75%)	40	1414.3	17.24	6
0.75-1.0 (75-100%)	40	1661.3	16.20	5

# Create weekly lactation curves plot
p_weekly <- ggplot(weekly_data, aes(x = week, y = yield_mean, color = taurine_group)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = yield_mean - yield_se, ymax = yield_mean + yield_se, fill = taurine_group), 
              alpha = 0.2, color = NA) +
  scale_color_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  scale_fill_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  labs(
    title = "Weekly Lactation Curves by Taurine Gene Proportion",
    x = "Week of Lactation",
    y = "Mean Milk Yield (kg/day)",
    color = "Taurine Group",
    fill = "Taurine Group"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
    legend.position = "bottom"
  ) +
  scale_x_continuous(breaks = seq(0, 40, 5))

print(p_weekly)

Weekly lactation curves by taurine gene proportion

30-Day Lactation Curves Analysis

# =============================================================================
# 30-DAY LACTATION CURVES (PER 30-DAY PERIOD)
# =============================================================================

# Calculate 30-day period averages - PER 30 DAYS
monthly_data <- milk_data %>%
  filter(dim <= 450) %>%  # Limit to 15 months
  mutate(
    period = case_when(
      dim <= 30 ~ 1, dim <= 60 ~ 2, dim <= 90 ~ 3, dim <= 120 ~ 4,
      dim <= 150 ~ 5, dim <= 180 ~ 6, dim <= 210 ~ 7, dim <= 240 ~ 8,
      dim <= 270 ~ 9, dim <= 300 ~ 10, dim <= 330 ~ 11, dim <= 360 ~ 12,
      dim <= 390 ~ 13, dim <= 420 ~ 14, dim <= 450 ~ 15,
      TRUE ~ 16
    ),
    period_mid = case_when(
      dim <= 30 ~ 15, dim <= 60 ~ 45, dim <= 90 ~ 75, dim <= 120 ~ 105,
      dim <= 150 ~ 135, dim <= 180 ~ 165, dim <= 210 ~ 195, dim <= 240 ~ 225,
      dim <= 270 ~ 255, dim <= 300 ~ 285, dim <= 330 ~ 315, dim <= 360 ~ 345,
      dim <= 390 ~ 375, dim <= 420 ~ 405, dim <= 450 ~ 435,
      TRUE ~ 465
    ),
    period_label = case_when(
      dim <= 30 ~ "Days 1-30", dim <= 60 ~ "Days 31-60", dim <= 90 ~ "Days 61-90", 
      dim <= 120 ~ "Days 91-120", dim <= 150 ~ "Days 121-150", dim <= 180 ~ "Days 151-180",
      dim <= 210 ~ "Days 181-210", dim <= 240 ~ "Days 211-240", dim <= 270 ~ "Days 241-270",
      dim <= 300 ~ "Days 271-300", dim <= 330 ~ "Days 301-330", dim <= 360 ~ "Days 331-360",
      dim <= 390 ~ "Days 361-390", dim <= 420 ~ "Days 391-420", dim <= 450 ~ "Days 421-450",
      TRUE ~ "Days >450"
    )
  ) %>%
  filter(period <= 15) %>%
  group_by(taurine_group, period, period_mid, period_label) %>%
  summarise(
    yield_mean = mean(yield, na.rm = TRUE),
    yield_se = sd(yield, na.rm = TRUE) / sqrt(n()),
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    .groups = 'drop'
  ) %>%
  filter(n_records >= 3)  # Minimum 3 records for estimate

# Display 30-day summary statistics
monthly_summary <- monthly_data %>%
  group_by(taurine_group) %>%
  summarise(
    periods_with_data = n(),
    avg_records_per_period = round(mean(n_records), 1),
    peak_yield = round(max(yield_mean, na.rm = TRUE), 2),
    peak_period = period[which.max(yield_mean)],
    peak_days = period_mid[which.max(yield_mean)],
    .groups = 'drop'
  )

kable(monthly_summary,
      caption = "30-day analysis summary by taurine group",
      col.names = c("Taurine Group", "Periods with Data", "Avg Records/Period", "Peak Yield", "Peak Period", "Peak Days")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

30-day analysis summary by taurine group
Taurine Group	Periods with Data	Avg Records/Period	Peak Yield	Peak Period	Peak Days
0-0.25 (0-25%)	13	83.5	10.60	1	15
0.25-0.5 (25-50%)	14	3946.0	17.23	2	45
0.5-0.75 (50-75%)	14	4858.1	17.16	2	45
0.75-1.0 (75-100%)	14	5567.3	16.01	2	45

# Create 30-day lactation curves plot
p_monthly <- ggplot(monthly_data, aes(x = period_mid, y = yield_mean, color = taurine_group)) +
  geom_line(size = 1.2) +
  geom_point(size = 2) +
  geom_ribbon(aes(ymin = yield_mean - yield_se, ymax = yield_mean + yield_se, fill = taurine_group), 
              alpha = 0.2, color = NA) +
  scale_color_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  scale_fill_manual(values = c("#e74c3c", "#f39c12", "#27ae60", "#3498db")) +
  labs(
    title = "30-Day Lactation Curves by Taurine Gene Proportion",
    x = "Days in Milk (midpoint of 30-day period)",
    y = "Mean Milk Yield (kg/day)",
    color = "Taurine Group",
    fill = "Taurine Group"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, hjust = 0.5, face = "bold"),
    legend.position = "bottom"
  ) +
  scale_x_continuous(breaks = seq(0, 450, 60))

print(p_monthly)

30-day lactation curves by taurine gene proportion

Summary Statistics

# Overall group comparisons
group_stats <- milk_data %>%
  group_by(taurine_group) %>%
  summarise(
    n_records = n(),
    n_cows = n_distinct(cowno_final),
    mean_yield = round(mean(yield, na.rm = TRUE), 2),
    sd_yield = round(sd(yield, na.rm = TRUE), 2),
    mean_dim = round(mean(dim, na.rm = TRUE), 1),
    max_dim = max(dim, na.rm = TRUE),
    .groups = 'drop'
  )

kable(group_stats,
      caption = "Overall group statistics",
      col.names = c("Taurine Group", "Records", "Cows", "Mean Yield", "SD Yield", "Mean DIM", "Max DIM")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Overall group statistics
Taurine Group	Records	Cows	Mean Yield	SD Yield	Mean DIM	Max DIM
0-0.25 (0-25%)	1085	4	8.06	2.92	132.7	389
0.25-0.5 (25-50%)	55244	94	14.47	5.29	171.5	400
0.5-0.75 (50-75%)	68014	158	14.39	5.18	164.5	400
0.75-1.0 (75-100%)	77942	232	13.91	4.99	156.7	400

# Peak yield comparison
peak_comparison <- bind_rows(
  weekly_summary %>% 
    select(taurine_group, peak_yield, peak_week) %>%
    mutate(analysis = "Weekly", peak_timing = paste0("Week ", peak_week)),
  monthly_summary %>% 
    select(taurine_group, peak_yield, peak_days) %>%
    rename(peak_timing = peak_days) %>%
    mutate(analysis = "30-Day", peak_timing = paste0("Day ", peak_timing))
) %>%
  select(taurine_group, analysis, peak_yield, peak_timing)

kable(peak_comparison,
      caption = "Peak yield comparison between weekly and 30-day analyses",
      col.names = c("Taurine Group", "Analysis", "Peak Yield", "Peak Timing")) %>%
  kable_styling(bootstrap_options = c("striped", "hover"))

Peak yield comparison between weekly and 30-day analyses
Taurine Group	Analysis	Peak Yield	Peak Timing
0-0.25 (0-25%)	Weekly	12.15	Week 2
0.25-0.5 (25-50%)	Weekly	17.32	Week 7
0.5-0.75 (50-75%)	Weekly	17.24	Week 6
0.75-1.0 (75-100%)	Weekly	16.20	Week 5
0-0.25 (0-25%)	30-Day	10.60	Day 15
0.25-0.5 (25-50%)	30-Day	17.23	Day 45
0.5-0.75 (50-75%)	30-Day	17.16	Day 45
0.75-1.0 (75-100%)	30-Day	16.01	Day 45

Next Steps: Lactation Curve Modeling Plan

Introduction

This analysis is conducted based on Raphael’s comments. The next step will be to model lactation curves using various mathematical models and machine learning approaches to better understand and predict milk production patterns across different taurine gene proportion groups.

Traditional Lactation Curve Models

1. Wood’s Model (1967)

The classic gamma function model: \[Y_t = at^b e^{-ct}\]

Where: - \(Y_t\) = milk yield at time t - \(a\) = scale parameter (related to initial yield) - \(b\) = shape parameter (pre-peak slope) - \(c\) = decay parameter (post-peak decline)

Implementation approach:

# Non-linear least squares fitting
wood_model <- nls(yield ~ a * (dim^b) * exp(-c * dim), 
                  data = milk_data, 
                  start = list(a = 10, b = 0.1, c = 0.002))

2. Wilmink’s Model (1987)

Incorporates exponential and polynomial terms: \[Y_t = a + bt + ce^{-kt}\]

Advantages: Better fit for extended lactations and multiphasic curves

3. Ali-Schaeffer Model (1987)

Polynomial-based approach: \[Y_t = a_0 + a_1t + a_2t^2 + a_3\ln(t) + a_4(\ln(t))^2\]

Advantages: Flexible functional form, good for irregular curve shapes

4. Dijkstra Model (1997)

Mechanistic approach based on mammary cell dynamics: \[Y_t = as^{b-1}e^{-st}\]

Where \(s = ct/(b-1)\)

Machine Learning Approaches

1. Gaussian Process Regression

Advantages: - Provides uncertainty quantification - Handles irregular time intervals - Non-parametric approach - Can incorporate genetic covariates

Implementation:

library(kernlab)
# GP with RBF kernel
gp_model <- gausspr(yield ~ dim + taurine_group, 
                    data = milk_data, 
                    kernel = "rbfdot")

2. Random Forest for Lactation Curves

Features to include: - Days in milk (DIM) - Taurine gene proportion - Parity - Herd effects - Season/month - Previous lactation performance

Implementation:

library(randomForest)
rf_model <- randomForest(yield ~ dim + I(dim^2) + I(log(dim)) + 
                         taurine_group + parity + herd + month,
                         data = milk_data, 
                         ntree = 500)

3. Neural Networks (Deep Learning)

Architecture suggestions: - Long Short-Term Memory (LSTM) networks for sequential data - Convolutional Neural Networks for pattern recognition - Attention mechanisms for important time periods

Implementation:

library(torch)
# LSTM for time series prediction
lstm_model <- nn_sequential(
  nn_lstm(input_size = n_features, hidden_size = 64, batch_first = TRUE),
  nn_dropout(0.2),
  nn_linear(64, 1)
)

4. Support Vector Regression (SVR)

Kernel options: - Radial Basis Function (RBF) - Polynomial kernels - Custom kernels incorporating genetic similarity

5. Gradient Boosting Methods

Options: - XGBoost with time-series features - LightGBM for large datasets - CatBoost for categorical variables (breed groups)

Advanced Modeling Approaches

1. Mixed-Effects Models

Incorporate random effects for: - Individual cow variations - Herd effects - Genetic group effects

library(nlme)
mixed_wood <- nlme(yield ~ a * (dim^b) * exp(-c * dim),
                   data = milk_data,
                   fixed = a + b + c ~ taurine_group,
                   random = a + b + c ~ 1 | cowno_final,
                   start = c(a = 10, b = 0.1, c = 0.002))

2. Bayesian Hierarchical Models

Advantages: - Incorporate prior knowledge - Handle missing data naturally - Quantify uncertainty in parameters

library(brms)
bayesian_model <- brm(yield ~ dim + I(dim^2) + I(log(dim)) + 
                      (1 + dim | taurine_group) + (1 | cowno_final),
                      data = milk_data,
                      family = gaussian(),
                      chains = 4, iter = 2000)

3. Functional Data Analysis

Treat entire lactation curves as functional objects: - Principal Component Analysis of curves - Functional regression models - Curve clustering approaches

Customization with Machine Learning

1. Ensemble Methods

Combine multiple models:

# Weighted ensemble
ensemble_pred <- 0.3 * wood_pred + 0.3 * gp_pred + 0.4 * rf_pred

2. Transfer Learning

Pre-train models on large dairy datasets
Fine-tune for specific genetic groups
Domain adaptation techniques

3. Multi-task Learning

Simultaneously predict: - Daily milk yield - Total lactation yield - Peak yield timing - Persistency measures

4. Feature Engineering

Genetic features: - Taurine gene proportion (continuous) - Breed composition vectors - Heterosis effects - Genomic relationship matrices

Environmental features: - Temperature-humidity index - Feed composition - Management practices - Disease records

5. Hyperparameter Optimization

Techniques: - Grid search - Random search - Bayesian optimization (using packages like mlr3tuning) - Genetic algorithms

Model Evaluation Framework

1. Cross-validation Strategies

Time series cross-validation
Leave-one-cow-out cross-validation
Leave-one-herd-out cross-validation

2. Performance Metrics

Accuracy metrics: - Root Mean Square Error (RMSE) - Mean Absolute Error (MAE) - Mean Absolute Percentage Error (MAPE)

Biological relevance metrics: - Peak yield prediction accuracy - Lactation persistency correlation - Total milk yield prediction

3. Model Interpretability

Techniques: - SHAP (SHapley Additive exPlanations) values - LIME (Local Interpretable Model-agnostic Explanations) - Partial dependence plots - Feature importance rankings

Implementation Strategy

Traditional Models Phase

Fit Wood’s, Wilmink’s, and Ali-Schaeffer models
Compare model performance by taurine group
Estimate genetic group-specific parameters

Machine Learning Models Phase

Implement Random Forest and SVR models
Feature engineering and selection
Cross-validation and hyperparameter tuning

Advanced Methods Phase

Gaussian Process regression
Neural network implementations
Mixed-effects and Bayesian models

Ensemble and Evaluation Phase

Combine models using ensemble methods
Comprehensive evaluation across genetic groups
Generate predictions and confidence intervals

Expected Outcomes

1. Model Comparison

Ranking of models by performance for each taurine group
Identification of best-performing approaches
Assessment of genetic group-specific model requirements

2. Biological Insights

Genetic effects on lactation curve parameters
Identification of critical lactation phases
Quantification of taurine gene proportion effects

3. Practical Applications

Improved yield predictions for management decisions
Early identification of poor-performing animals
Optimization of feeding and breeding strategies

Conclusion

This comprehensive modeling approach will leverage both traditional dairy science knowledge and modern machine learning techniques to understand lactation patterns across taurine gene proportion groups. The combination of multiple modeling approaches will provide robust predictions and insights into the genetic basis of lactation curve variations in Kenyan dairy cattle.

This analysis framework is developed based on Raphael’s comments and will be implemented using the lactation curve data analyzed above.

Lactation Curves Analysis by Taurine Gene Proportion

Based on Raphael’s comments

2025-09-12

Background

Data Loading and Preprocessing

Data Summary

Weekly Lactation Curves Analysis

30-Day Lactation Curves Analysis

Summary Statistics

Next Steps: Lactation Curve Modeling Plan

Introduction

Traditional Lactation Curve Models

1. Wood’s Model (1967)

2. Wilmink’s Model (1987)

3. Ali-Schaeffer Model (1987)

4. Dijkstra Model (1997)

Machine Learning Approaches

1. Gaussian Process Regression

2. Random Forest for Lactation Curves

3. Neural Networks (Deep Learning)

4. Support Vector Regression (SVR)

5. Gradient Boosting Methods

Advanced Modeling Approaches

1. Mixed-Effects Models

2. Bayesian Hierarchical Models

3. Functional Data Analysis

Customization with Machine Learning

1. Ensemble Methods

2. Transfer Learning

3. Multi-task Learning

4. Feature Engineering

5. Hyperparameter Optimization

Model Evaluation Framework

1. Cross-validation Strategies

2. Performance Metrics

3. Model Interpretability

Implementation Strategy

Traditional Models Phase

Machine Learning Models Phase

Advanced Methods Phase

Ensemble and Evaluation Phase

Expected Outcomes

1. Model Comparison

2. Biological Insights

3. Practical Applications

Conclusion