Objective: The goal is to use mathematical and statistical techniques to analyze milk yield in dairy cows, optimize feed intake, and predict trends in cow health using synthetic or open-source data. The project will demonstrate skills in linear algebra, statistics, and mathematical modeling using R.
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)
library(MASS) # For synthetic data generation
library(stats) # For PCA and ANOVA
library(caret) # For Linear Regression
library(corrplot) # For correlation matrix visualization
corrplot 0.95 loaded
# 1.1 Load open-source data (if available) from Kaggle or FAO
# Here we simulate the data as an example using synthetic data
# Synthetic data generation for cow feed intake and milk yield
set.seed(123)
# Create synthetic data for 100 cows over 365 days
n_cows <- 100
n_days <- 365
cow_ids <- rep(1:n_cows, each = n_days)
# Generate synthetic feed intake data (in kg) and milk yield (liters)
feed_protein <- rnorm(n_cows * n_days, mean = 16, sd = 2)
feed_fat <- rnorm(n_cows * n_days, mean = 4, sd = 1)
feed_energy <- rnorm(n_cows * n_days, mean = 2800, sd = 500) # in kcal
milk_yield <- 25 + 0.5 * feed_protein + 0.3 * feed_fat + rnorm(n_cows * n_days, mean = 0, sd = 5) # Milk yield in liters
# Combine into a data frame
synthetic_data <- data.frame(
cow_id = cow_ids,
feed_protein = feed_protein,
feed_fat = feed_fat,
feed_energy = feed_energy,
milk_yield = milk_yield
)
# View the first few rows
head(synthetic_data)
cow_id <int> | feed_protein <dbl> | feed_fat <dbl> | feed_energy <dbl> | milk_yield <dbl> | |
---|---|---|---|---|---|
1 | 1 | 14.87905 | 2.510811 | 2848.702 | 28.20588 |
2 | 1 | 15.53965 | 3.895453 | 4234.106 | 36.41569 |
3 | 1 | 19.11742 | 4.125529 | 2889.797 | 31.95770 |
4 | 1 | 16.14102 | 2.951716 | 2543.285 | 33.59276 |
5 | 1 | 16.25858 | 2.966957 | 2142.224 | 38.55488 |
6 | 1 | 19.43013 | 4.359533 | 1981.631 | 33.60346 |
# Summary of the synthetic data
summary(synthetic_data)
cow_id feed_protein feed_fat
Min. : 1.00 Min. : 8.309 Min. :-0.1291
1st Qu.: 25.75 1st Qu.:14.646 1st Qu.: 3.3328
Median : 50.50 Median :15.978 Median : 4.0118
Mean : 50.50 Mean :15.988 Mean : 4.0069
3rd Qu.: 75.25 3rd Qu.:17.348 3rd Qu.: 4.6770
Max. :100.00 Max. :24.646 Max. : 7.8689
feed_energy milk_yield
Min. : 609 Min. :13.36
1st Qu.:2469 1st Qu.:30.76
Median :2803 Median :34.17
Mean :2803 Mean :34.20
3rd Qu.:3138 3rd Qu.:37.63
Max. :4903 Max. :54.57
# 2.1 Summary statistics
summary(synthetic_data)
cow_id feed_protein feed_fat feed_energy milk_yield
Min. : 1.00 Min. : 8.309 Min. :-0.1291 Min. : 609 Min. :13.36
1st Qu.: 25.75 1st Qu.:14.646 1st Qu.: 3.3328 1st Qu.:2469 1st Qu.:30.76
Median : 50.50 Median :15.978 Median : 4.0118 Median :2803 Median :34.17
Mean : 50.50 Mean :15.988 Mean : 4.0069 Mean :2803 Mean :34.20
3rd Qu.: 75.25 3rd Qu.:17.348 3rd Qu.: 4.6770 3rd Qu.:3138 3rd Qu.:37.63
Max. :100.00 Max. :24.646 Max. : 7.8689 Max. :4903 Max. :54.57
# 2.2 Correlation matrix
cor_matrix <- cor(synthetic_data[, c("feed_protein", "feed_fat", "feed_energy", "milk_yield")])
print(cor_matrix)
feed_protein feed_fat feed_energy milk_yield
feed_protein 1.000000000 -0.002192113 0.002754189 0.195083280
feed_fat -0.002192113 1.000000000 0.001116323 0.053837849
feed_energy 0.002754189 0.001116323 1.000000000 0.004458438
milk_yield 0.195083280 0.053837849 0.004458438 1.000000000
# Visualize the correlation matrix
corrplot(cor_matrix, method = "color", type = "upper")
# 2.3 Visualizations
# Plot milk yield vs feed protein intake
ggplot(synthetic_data, aes(x = feed_protein, y = milk_yield)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "blue") +
labs(title = "Milk Yield vs Protein Intake", x = "Protein Intake (kg)", y = "Milk Yield (liters)")
# Plot milk yield vs feed fat intake
ggplot(synthetic_data, aes(x = feed_fat, y = milk_yield)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", color = "red") +
labs(title = "Milk Yield vs Fat Intake", x = "Fat Intake (kg)", y = "Milk Yield (liters)")
NA
NA
We will use linear regression to predict milk yield based on the feed intake of protein, fat, and energy.
# 3.1 Linear regression: Milk yield as a function of feed intake
lm_model <- lm(milk_yield ~ feed_protein + feed_fat + feed_energy, data = synthetic_data)
summary(lm_model)
Call:
lm(formula = milk_yield ~ feed_protein + feed_fat + feed_energy,
data = synthetic_data)
Residuals:
Min 1Q Median 3Q Max
-20.9205 -3.3634 -0.0308 3.3950 20.0594
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.499e+01 2.778e-01 89.961 <2e-16 ***
feed_protein 4.999e-01 1.313e-02 38.078 <2e-16 ***
feed_fat 2.767e-01 2.614e-02 10.585 <2e-16 ***
feed_energy 3.943e-05 5.237e-05 0.753 0.451
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.008 on 36496 degrees of freedom
Multiple R-squared: 0.04102, Adjusted R-squared: 0.04094
F-statistic: 520.3 on 3 and 36496 DF, p-value: < 2.2e-16
# Visualize regression diagnostics
par(mfrow = c(2, 2))
plot(lm_model)
# 3.2 Predictions
predicted_milk_yield <- predict(lm_model, synthetic_data)
# Add the predicted values to the dataset
synthetic_data$predicted_milk_yield <- predicted_milk_yield
# 3.3 Visualize the actual vs predicted milk yield
ggplot(synthetic_data, aes(x = milk_yield, y = predicted_milk_yield)) +
geom_point(alpha = 0.5, color = "green") +
geom_abline(slope = 1, intercept = 0, color = "red") +
labs(title = "Actual vs Predicted Milk Yield", x = "Actual Milk Yield (liters)", y = "Predicted Milk Yield (liters)")
Perform PCA on the feed intake data to identify the most important features contributing to milk yield.
# 4.1 Perform PCA on feed intake data
pca_data <- synthetic_data[, c("feed_protein", "feed_fat", "feed_energy")]
pca_result <- prcomp(pca_data, scale. = TRUE)
# 4.2 Summary of PCA results
summary(pca_result)
Importance of components:
PC1 PC2 PC3
Standard deviation 1.0015 1.0005 0.9979
Proportion of Variance 0.3343 0.3337 0.3320
Cumulative Proportion 0.3343 0.6680 1.0000
# 4.3 Visualize PCA (Scree plot)
screeplot(pca_result, type = "lines", main = "Scree Plot")
# 4.4 Biplot of PCA
biplot(pca_result)
We will use matrix operations to calculate the total nutrient intake for a group of cows.
# 5.1 Matrix operations for total nutrient intake
nutrient_matrix <- as.matrix(synthetic_data[, c("feed_protein", "feed_fat", "feed_energy")])
nutrient_totals <- colSums(nutrient_matrix)
# 5.2 Display the total nutrient intake
print(nutrient_totals)
feed_protein feed_fat feed_energy
583553.2 146252.0 102321116.6
We will test whether there are significant differences in milk yield between different levels of protein intake using ANOVA.
# 6.1 Categorize protein intake into low, medium, high categories
synthetic_data$protein_level <- cut(synthetic_data$feed_protein, breaks = 3, labels = c("Low", "Medium", "High"))
# 6.2 Perform ANOVA on milk yield by protein level
anova_result <- aov(milk_yield ~ protein_level, data = synthetic_data)
summary(anova_result)
Df Sum Sq Mean Sq F value Pr(>F)
protein_level 2 20932 10466 409.2 <2e-16 ***
Residuals 36497 933541 26
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
# 6.3 Visualize milk yield by protein level
ggplot(synthetic_data, aes(x = protein_level, y = milk_yield, fill = protein_level)) +
geom_boxplot() +
labs(title = "Milk Yield by Protein Level", x = "Protein Level", y = "Milk Yield (liters)")