# Load the data
milk_data <- fread("milk4d.txt",
col.names = c("AnimID", "Ward", "Farm", "CYKsea", "MYMM",
"Class", "Lacest", "Lac", "DIM", "HTD",
"HYcalv", "HYprod", "PYM", "Ksea",
"wilk1", "wilk2", "xdim", "bprop", "age",
"milk", "bwt1", "bwt2", "bsc"),
fill = TRUE)
# Display data dimensions
cat("Dataset loaded: ", nrow(milk_data), "records from",
length(unique(milk_data$AnimID)), "animals\n")## Dataset loaded: 49221 records from 2603 animals
# Key statistics
total_records <- nrow(milk_data)
total_animals <- length(unique(milk_data$AnimID))
mean_milk <- mean(milk_data$milk, na.rm = TRUE)
sd_milk <- sd(milk_data$milk, na.rm = TRUE)
# High yielders
high_yielders <- milk_data %>% filter(milk > 20)
n_high_yield_records <- nrow(high_yielders)
n_high_yield_animals <- length(unique(high_yielders$AnimID))
pct_high_yield_records <- (n_high_yield_records / total_records) * 100
pct_high_yield_animals <- (n_high_yield_animals / total_animals) * 100
# Breed proportion anomalies
breed_anomaly <- milk_data %>% filter(bprop > 100)
n_breed_anomaly_records <- nrow(breed_anomaly)
n_breed_anomaly_animals <- length(unique(breed_anomaly$AnimID))
# Combined high yield and breed anomaly
high_yield_breed_anomaly <- milk_data %>% filter(milk > 20 & bprop > 100)
summary_df <- data.frame(
Metric = c("Total Records", "Total Animals",
"Mean Milk Yield (kg)", "SD Milk Yield (kg)",
"Records with Milk > 20 kg", "Animals with Milk > 20 kg",
"% Records with Milk > 20 kg", "% Animals with Milk > 20 kg",
"Records with Breed Prop > 100%", "Animals with Breed Prop > 100%",
"Records with Both Milk > 20 kg AND Breed > 100%"),
Value = c(total_records, total_animals,
round(mean_milk, 2), round(sd_milk, 2),
n_high_yield_records, n_high_yield_animals,
round(pct_high_yield_records, 2), round(pct_high_yield_animals, 2),
n_breed_anomaly_records, n_breed_anomaly_animals,
nrow(high_yield_breed_anomaly))
)
kable(summary_df, caption = "Executive Summary Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE) %>%
row_spec(c(5:8), background = "#e8f4ea") %>%
row_spec(c(9:11), background = "#ffe8e8")| Metric | Value |
|---|---|
| Total Records | 49221.00 |
| Total Animals | 2603.00 |
| Mean Milk Yield (kg) | 9.33 |
| SD Milk Yield (kg) | 7.63 |
| Records with Milk > 20 kg | 194.00 |
| Animals with Milk > 20 kg | 96.00 |
| % Records with Milk > 20 kg | 0.39 |
| % Animals with Milk > 20 kg | 3.69 |
| Records with Breed Prop > 100% | 41.00 |
| Animals with Breed Prop > 100% | 14.00 |
| Records with Both Milk > 20 kg AND Breed > 100% | 13.00 |
# Normality (QQ) plot with y-axis constrained to the range of the data
qq_df <- milk_data %>% filter(!is.na(milk))
ggplot(qq_df, aes(sample = milk)) +
stat_qq(size = 1, alpha = 0.7, color = "steelblue") +
stat_qq_line(color = "red", linewidth = 1) +
coord_cartesian(ylim = range(qq_df$milk, na.rm = TRUE)) +
labs(
title = "Normality Check (QQ Plot) for Milk Yield",
subtitle = sprintf("y-axis limited to data range: [%.2f, %.2f]",
min(qq_df$milk, na.rm = TRUE), max(qq_df$milk, na.rm = TRUE)),
x = "Theoretical Quantiles",
y = "Sample Quantiles (Milk)"
) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))# Identify outliers using IQR method
Q1 <- quantile(milk_data$milk, 0.25, na.rm = TRUE)
Q3 <- quantile(milk_data$milk, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
milk_data_no_outliers <- milk_data %>%
filter(milk >= lower_bound & milk <= upper_bound)
n_outliers <- nrow(milk_data) - nrow(milk_data_no_outliers)
pct_outliers <- (n_outliers / nrow(milk_data)) * 100
# Plot 1: Distribution with all data
p1 <- ggplot(milk_data, aes(x = milk)) +
geom_histogram(bins = 50, fill = "steelblue", color = "black", alpha = 0.7) +
geom_vline(aes(xintercept = mean(milk)), color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20), color = "darkgreen", linetype = "solid", size = 1.5) +
annotate("text", x = 20, y = Inf, label = "20 kg threshold",
hjust = -0.1, vjust = 2, color = "darkgreen", fontface = "bold") +
labs(title = "Milk Yield Distribution - WITH Outliers",
subtitle = sprintf("n = %d records | Mean = %.2f kg | SD = %.2f kg | %d outliers (%.1f%%)",
nrow(milk_data), mean(milk_data$milk), sd(milk_data$milk),
n_outliers, pct_outliers),
x = "Milk Yield (kg)", y = "Frequency") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
# Plot 2: Distribution without outliers
p2 <- ggplot(milk_data_no_outliers, aes(x = milk)) +
geom_histogram(bins = 50, fill = "coral", color = "black", alpha = 0.7) +
geom_vline(aes(xintercept = mean(milk)), color = "red", linetype = "dashed", size = 1) +
geom_vline(aes(xintercept = 20), color = "darkgreen", linetype = "solid", size = 1.5) +
annotate("text", x = 20, y = Inf, label = "20 kg threshold",
hjust = -0.1, vjust = 2, color = "darkgreen", fontface = "bold") +
labs(title = "Milk Yield Distribution - WITHOUT Outliers",
subtitle = sprintf("n = %d records | Mean = %.2f kg | SD = %.2f kg | Outliers removed: %d",
nrow(milk_data_no_outliers),
mean(milk_data_no_outliers$milk),
sd(milk_data_no_outliers$milk),
n_outliers),
x = "Milk Yield (kg)", y = "Frequency") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))
# Plot 3: Boxplot comparison
milk_data_combined <- rbind(
milk_data %>% mutate(Dataset = "With Outliers"),
milk_data_no_outliers %>% mutate(Dataset = "Without Outliers")
)
p3 <- ggplot(milk_data_combined, aes(x = Dataset, y = milk, fill = Dataset)) +
geom_boxplot(alpha = 0.7) +
geom_hline(yintercept = 20, color = "darkgreen", linetype = "solid", size = 1.5) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
annotate("text", x = 0.5, y = 20, label = "20 kg threshold",
hjust = 0, color = "darkgreen", fontface = "bold") +
labs(title = "Milk Yield Distribution Comparison",
subtitle = "Red diamond = Mean | Green line = 20 kg threshold",
y = "Milk Yield (kg)", x = "") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "none") +
scale_fill_manual(values = c("With Outliers" = "steelblue", "Without Outliers" = "coral"))
grid.arrange(p1, p2, p3, ncol = 1)# Build outlier-only data
outliers_df <- milk_data %>%
filter(!is.na(milk)) %>%
filter(milk < lower_bound | milk > upper_bound)
if (nrow(outliers_df) > 0) {
ggplot(outliers_df, aes(x = milk)) +
geom_histogram(bins = 40, fill = "tomato", color = "black", alpha = 0.8) +
geom_vline(xintercept = c(lower_bound, upper_bound),
linetype = "dashed", color = "black", linewidth = 1) +
labs(
title = "Histogram of Outlier Milk Yields (IQR Rule)",
subtitle = sprintf("Outliers: %d (%.1f%%) | Bounds: [%.2f, %.2f]",
n_outliers, pct_outliers, lower_bound, upper_bound),
x = "Milk Yield (kg)", y = "Frequency"
) +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
} else {
cat("No outliers detected by the IQR rule (nice!).")
}
# High Yielders Analysis (>20 kg)
# Get all animals with any record > 20 kg
high_yield_animals <- milk_data %>%
filter(milk > 20) %>%
group_by(AnimID) %>%
summarise(
`Records > 20kg` = n(),
`Total Records` = n(),
`Mean Milk` = round(mean(milk), 2),
`Max Milk` = round(max(milk), 2),
`Min Milk` = round(min(milk), 2),
`SD Milk` = round(sd(milk), 2),
`Mean Breed%` = round(mean(bprop), 2),
`Mean DIM` = round(mean(DIM), 1),
.groups = 'drop'
) %>%
arrange(desc(`Max Milk`))
# Also get their total records from full dataset
total_records_per_animal <- milk_data %>%
filter(AnimID %in% high_yield_animals$AnimID) %>%
group_by(AnimID) %>%
summarise(`All Records` = n(), .groups = 'drop')
high_yield_animals <- high_yield_animals %>%
left_join(total_records_per_animal, by = "AnimID") %>%
mutate(`% Records > 20kg` = round((`Records > 20kg` / `All Records`) * 100, 1)) %>%
select(AnimID, `Records > 20kg`, `All Records`, `% Records > 20kg`,
`Mean Milk`, `Max Milk`, `Mean Breed%`, everything())
# Display top 20 high yielders
kable(head(high_yield_animals, 20),
caption = "Top 20 Animals with Milk Yield > 20 kg") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
font_size = 12) %>%
column_spec(5:6, background = "#e8f4ea", bold = TRUE)| AnimID | Records > 20kg | All Records | % Records > 20kg | Mean Milk | Max Milk | Mean Breed% | Total Records | Min Milk | SD Milk | Mean DIM |
|---|---|---|---|---|---|---|---|---|---|---|
| 1414 | 1 | 54 | 1.9 | 455.00 | 455.0 | 123.31 | 1 | 455.0 | NA | 35404757.0 |
| 1454 | 1 | 33 | 3.0 | 455.00 | 455.0 | 144.21 | 1 | 455.0 | NA | 35977524.0 |
| 2090 | 2 | 50 | 4.0 | 455.00 | 455.0 | 144.74 | 2 | 455.0 | 0.00 | 46810154.5 |
| 1462 | 1 | 49 | 2.0 | 422.00 | 422.0 | 145.66 | 1 | 422.0 | NA | 34076605.0 |
| 1622 | 4 | 48 | 8.3 | 361.00 | 361.0 | 230.74 | 4 | 361.0 | 0.00 | 21316971.0 |
| 39 | 3 | 25 | 12.0 | 347.33 | 348.0 | 122.19 | 3 | 346.0 | 1.15 | 18999411.7 |
| 693 | 1 | 11 | 9.1 | 337.00 | 337.0 | 162.10 | 1 | 337.0 | NA | 49438462.0 |
| 1440 | 1 | 9 | 11.1 | 39.50 | 39.5 | 0.76 | 1 | 39.5 | NA | 410.0 |
| 2087 | 6 | 52 | 11.5 | 28.17 | 38.0 | 0.78 | 6 | 22.0 | 5.42 | 282.2 |
| 224 | 1 | 3 | 33.3 | 30.00 | 30.0 | 0.96 | 1 | 30.0 | NA | 371.0 |
| 1070 | 2 | 46 | 4.3 | 26.00 | 30.0 | 0.93 | 2 | 22.0 | 5.66 | 85.5 |
| 767 | 7 | 9 | 77.8 | 28.29 | 29.0 | 0.97 | 7 | 27.0 | 0.95 | 204.4 |
| 315 | 1 | 15 | 6.7 | 28.00 | 28.0 | 0.86 | 1 | 28.0 | NA | 116.0 |
| 529 | 3 | 5 | 60.0 | 24.33 | 28.0 | 0.99 | 3 | 22.0 | 3.21 | 451.7 |
| 996 | 7 | 33 | 21.2 | 25.71 | 28.0 | 0.99 | 7 | 24.0 | 2.14 | 98.0 |
| 1492 | 1 | 16 | 6.2 | 28.00 | 28.0 | 0.57 | 1 | 28.0 | NA | 30.0 |
| 2259 | 10 | 29 | 34.5 | 26.20 | 28.0 | 0.78 | 10 | 26.0 | 0.63 | 279.3 |
| 1719 | 4 | 11 | 36.4 | 25.00 | 27.0 | 0.78 | 4 | 23.0 | 1.63 | 257.0 |
| 718 | 4 | 36 | 11.1 | 23.25 | 26.0 | 0.87 | 4 | 22.0 | 1.89 | 192.8 |
| 1559 | 3 | 46 | 6.5 | 24.33 | 26.0 | 0.97 | 3 | 22.0 | 2.08 | 78.3 |
##
## Total unique animals with at least one record > 20 kg: 96
# Select top 15 high yielders for visualization
top_high_yielders <- head(high_yield_animals, 15)
# Get all records for these animals
high_yielder_records <- milk_data %>%
filter(AnimID %in% top_high_yielders$AnimID)
# Plot 1: Box plot of top high yielders
p1 <- ggplot(high_yielder_records, aes(x = reorder(factor(AnimID), milk, median),
y = milk, fill = factor(AnimID))) +
geom_boxplot(alpha = 0.7) +
geom_hline(yintercept = 20, color = "red", linetype = "dashed", size = 1) +
coord_flip() +
labs(title = "Milk Yield Distribution for Top 15 High-Yielding Animals",
subtitle = "All their test-day records shown | Red line = 20 kg threshold",
x = "Animal ID", y = "Milk Yield (kg)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "none")
# Plot 2: Scatter plot - milk yield vs DIM for high yielders
p2 <- ggplot(high_yielder_records %>% filter(AnimID %in% head(high_yield_animals$AnimID, 10)),
aes(x = DIM, y = milk, color = factor(AnimID))) +
geom_point(size = 3, alpha = 0.7) +
geom_smooth(method = "loess", se = FALSE, size = 0.8) +
geom_hline(yintercept = 20, color = "red", linetype = "dashed", size = 1) +
labs(title = "Lactation Curves of Top 10 High-Yielding Animals",
subtitle = "Individual lactation patterns",
x = "Days in Milk (DIM)", y = "Milk Yield (kg)",
color = "Animal ID") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "right")
grid.arrange(p1, p2, ncol = 1)# Animals with breed proportion > 100
breed_anomaly_animals <- milk_data %>%
filter(bprop > 100) %>%
group_by(AnimID) %>%
summarise(
`Records` = n(),
`Mean Breed%` = round(mean(bprop), 2),
`Max Breed%` = round(max(bprop), 2),
`Min Breed%` = round(min(bprop), 2),
`Mean Milk` = round(mean(milk), 2),
`Max Milk` = round(max(milk), 2),
`Mean DIM` = round(mean(DIM), 1),
Farm = first(Farm),
Ward = first(Ward),
.groups = 'drop'
) %>%
arrange(desc(`Max Breed%`))
kable(breed_anomaly_animals,
caption = "Animals with Breed Proportion > 100% (Data Quality Issue)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(3:5, background = "#ffe8e8", bold = TRUE)| AnimID | Records | Mean Breed% | Max Breed% | Min Breed% | Mean Milk | Max Milk | Mean DIM | Farm | Ward |
|---|---|---|---|---|---|---|---|---|---|
| 1622 | 6 | 230.74 | 230.74 | 230.74 | 240.67 | 361 | 21317033 | 819 | 48 |
| 620 | 1 | 173.08 | 173.08 | 173.08 | 0.00 | 0 | 9286851 | 357 | 46 |
| 693 | 1 | 162.10 | 162.10 | 162.10 | 337.00 | 337 | 49438462 | 1899 | 105 |
| 955 | 3 | 151.64 | 151.64 | 151.64 | 0.00 | 0 | 11083562 | 426 | 117 |
| 2084 | 2 | 147.14 | 147.14 | 147.14 | 0.00 | 0 | 46810302 | 1798 | 117 |
| 1460 | 2 | 147.04 | 147.04 | 147.04 | 0.00 | 0 | 3766180 | 145 | 117 |
| 1486 | 2 | 146.75 | 146.75 | 146.75 | 0.00 | 0 | 20822397 | 800 | 117 |
| 1462 | 2 | 145.66 | 145.66 | 145.66 | 211.00 | 422 | 34076718 | 1309 | 117 |
| 2090 | 4 | 144.74 | 144.74 | 144.74 | 227.50 | 455 | 46810210 | 1798 | 117 |
| 1454 | 5 | 144.21 | 144.21 | 144.21 | 91.00 | 455 | 35977690 | 1382 | 117 |
| 867 | 1 | 133.50 | 133.50 | 133.50 | 0.00 | 0 | 19885067 | 764 | 129 |
| 929 | 2 | 125.41 | 125.41 | 125.41 | 0.00 | 0 | 36758896 | 1412 | 117 |
| 1414 | 5 | 123.31 | 123.31 | 123.31 | 91.00 | 455 | 35404845 | 1360 | 116 |
| 39 | 5 | 122.19 | 122.19 | 122.19 | 208.40 | 348 | 18999451 | 730 | 135 |
##
## Total animals with breed proportion > 100%: 14
##
## Total records with breed proportion > 100%: 41
# Create breed proportion categories
breed_categories <- milk_data %>%
mutate(BreedCat = case_when(
bprop <= 25 ~ "0-25%",
bprop > 25 & bprop <= 50 ~ "25-50%",
bprop > 50 & bprop <= 75 ~ "50-75%",
bprop > 75 & bprop <= 100 ~ "75-100%",
bprop > 100 ~ ">100% (Anomaly)"
)) %>%
group_by(BreedCat) %>%
summarise(
Records = n(),
Animals = n_distinct(AnimID),
`Mean Milk` = round(mean(milk), 2),
`SD Milk` = round(sd(milk), 2),
.groups = 'drop'
)
# Plot 1: Histogram of breed proportion
p1 <- ggplot(milk_data, aes(x = bprop)) +
geom_histogram(bins = 50, fill = "purple", color = "black", alpha = 0.7) +
geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1.5) +
annotate("text", x = 100, y = Inf, label = "100% threshold",
hjust = -0.1, vjust = 2, color = "red", fontface = "bold") +
labs(title = "Distribution of Breed Proportion",
subtitle = sprintf("Mean = %.1f%% | Values > 100%%: %d records",
mean(milk_data$bprop), sum(milk_data$bprop > 100)),
x = "Exotic Breed Proportion (%)", y = "Frequency") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
# Plot 2: Milk yield by breed proportion
p2 <- ggplot(milk_data, aes(x = bprop, y = milk)) +
geom_point(alpha = 0.3, aes(color = milk > 20)) +
geom_smooth(method = "loess", color = "blue", size = 1.2) +
geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1) +
geom_hline(yintercept = 20, color = "darkgreen", linetype = "dashed", size = 1) +
scale_color_manual(values = c("FALSE" = "gray60", "TRUE" = "darkgreen"),
labels = c("≤20 kg", ">20 kg")) +
labs(title = "Milk Yield vs Breed Proportion",
subtitle = "Green points = Milk > 20 kg | Red line = 100% breed | Green line = 20 kg milk",
x = "Exotic Breed Proportion (%)", y = "Milk Yield (kg)",
color = "Milk Yield") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold"))
grid.arrange(p1, p2, ncol = 1)# Animals with both conditions
combined_animals <- milk_data %>%
filter(milk > 20 & bprop > 100) %>%
group_by(AnimID) %>%
summarise(
`Records` = n(),
`Mean Milk` = round(mean(milk), 2),
`Max Milk` = round(max(milk), 2),
`Mean Breed%` = round(mean(bprop), 2),
`Max Breed%` = round(max(bprop), 2),
`Mean DIM` = round(mean(DIM), 1),
Farm = first(Farm),
Ward = first(Ward),
Class = first(Class),
.groups = 'drop'
) %>%
arrange(desc(`Max Milk`))
if(nrow(combined_animals) > 0) {
kable(combined_animals,
caption = "Animals with BOTH Milk > 20 kg AND Breed Proportion > 100%") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(3:4, background = "#e8f4ea") %>%
column_spec(5:6, background = "#ffe8e8")
} else {
cat("No animals found with both conditions (Milk > 20 kg AND Breed > 100%)")
}| AnimID | Records | Mean Milk | Max Milk | Mean Breed% | Max Breed% | Mean DIM | Farm | Ward | Class |
|---|---|---|---|---|---|---|---|---|---|
| 1414 | 1 | 455.00 | 455 | 123.31 | 123.31 | 35404757 | 1360 | 116 | 2 |
| 1454 | 1 | 455.00 | 455 | 144.21 | 144.21 | 35977524 | 1382 | 117 | 2 |
| 2090 | 2 | 455.00 | 455 | 144.74 | 144.74 | 46810155 | 1798 | 117 | 2 |
| 1462 | 1 | 422.00 | 422 | 145.66 | 145.66 | 34076605 | 1309 | 117 | 1 |
| 1622 | 4 | 361.00 | 361 | 230.74 | 230.74 | 21316971 | 819 | 48 | 1 |
| 39 | 3 | 347.33 | 348 | 122.19 | 122.19 | 18999412 | 730 | 135 | 1 |
| 693 | 1 | 337.00 | 337 | 162.10 | 162.10 | 49438462 | 1899 | 105 | 2 |
# Get all records matching both criteria
combined_records <- milk_data %>%
filter(milk > 20 & bprop > 100) %>%
select(AnimID, milk, bprop, DIM, Farm, Ward, Class, Lac, age) %>%
arrange(desc(milk))
if(nrow(combined_records) > 0) {
kable(head(combined_records, 20),
caption = "Top 20 Records with Both Milk > 20 kg AND Breed > 100%") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
column_spec(2, background = "#e8f4ea") %>%
column_spec(3, background = "#ffe8e8")
cat("\nTotal records meeting both criteria:", nrow(combined_records))
} else {
cat("No records found with both conditions")
}##
## Total records meeting both criteria: 13
# Create comprehensive summary for high yielders
high_yield_summary <- milk_data %>%
mutate(HighYield = milk > 20) %>%
group_by(HighYield) %>%
summarise(
`Records` = n(),
`Animals` = n_distinct(AnimID),
`Mean Milk (kg)` = round(mean(milk), 2),
`SD Milk (kg)` = round(sd(milk), 2),
`Mean Breed (%)` = round(mean(bprop), 2),
`Mean DIM` = round(mean(DIM), 1),
`Mean Age (months)` = round(mean(age), 1),
.groups = 'drop'
) %>%
mutate(Category = ifelse(HighYield, "Milk > 20 kg", "Milk ≤ 20 kg")) %>%
select(Category, everything(), -HighYield)
kable(high_yield_summary,
caption = "Comparison: High Yielders vs Normal Yielders") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(which(high_yield_summary$Category == "Milk > 20 kg"),
background = "#e8f4ea", bold = TRUE)| Category | Records | Animals | Mean Milk (kg) | SD Milk (kg) | Mean Breed (%) | Mean DIM | Mean Age (months) |
|---|---|---|---|---|---|---|---|
| Milk ≤ 20 kg | 49027 | 2603 | 9.17 | 4.33 | 0.88 | 15979.4 | 71.9 |
| Milk > 20 kg | 194 | 96 | 47.89 | 92.69 | 11.90 | 2014497.9 | 68.1 |
# Create comprehensive summary for breed anomalies
breed_anomaly_summary <- milk_data %>%
mutate(BreedAnomaly = bprop > 100) %>%
group_by(BreedAnomaly) %>%
summarise(
`Records` = n(),
`Animals` = n_distinct(AnimID),
`Mean Breed (%)` = round(mean(bprop), 2),
`Max Breed (%)` = round(max(bprop), 2),
`Mean Milk (kg)` = round(mean(milk), 2),
`Mean DIM` = round(mean(DIM), 1),
.groups = 'drop'
) %>%
mutate(Category = ifelse(BreedAnomaly, "Breed > 100%", "Breed ≤ 100%")) %>%
select(Category, everything(), -BreedAnomaly)
kable(breed_anomaly_summary,
caption = "Comparison: Breed Proportion Anomalies vs Normal") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(which(breed_anomaly_summary$Category == "Breed > 100%"),
background = "#ffe8e8", bold = TRUE)| Category | Records | Animals | Mean Breed (%) | Max Breed (%) | Mean Milk (kg) | Mean DIM |
|---|---|---|---|---|---|---|
| Breed ≤ 100% | 49180 | 2603 | 0.80 | 1.00 | 9.23 | 220.6 |
| Breed > 100% | 41 | 14 | 152.67 | 230.74 | 123.54 | 28375195.6 |
# Create categories for visualization
milk_data_viz <- milk_data %>%
mutate(Category = case_when(
milk > 20 & bprop > 100 ~ "Both High Milk & Breed Anomaly",
milk > 20 & bprop <= 100 ~ "High Milk Only",
milk <= 20 & bprop > 100 ~ "Breed Anomaly Only",
TRUE ~ "Normal"
))
category_summary <- milk_data_viz %>%
group_by(Category) %>%
summarise(Count = n(), .groups = 'drop')
p <- ggplot(milk_data_viz, aes(x = bprop, y = milk, color = Category)) +
geom_point(alpha = 0.5, size = 2) +
geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1) +
geom_hline(yintercept = 20, color = "darkgreen", linetype = "dashed", size = 1) +
scale_color_manual(values = c("Both High Milk & Breed Anomaly" = "purple",
"High Milk Only" = "darkgreen",
"Breed Anomaly Only" = "orange",
"Normal" = "gray60")) +
labs(title = "Comprehensive View: Milk Yield vs Breed Proportion",
subtitle = paste("Categories - Normal:", category_summary$Count[category_summary$Category == "Normal"],
"| High Milk:", category_summary$Count[category_summary$Category == "High Milk Only"],
"| Breed Anomaly:", category_summary$Count[category_summary$Category == "Breed Anomaly Only"],
"| Both:", sum(category_summary$Count[category_summary$Category == "Both High Milk & Breed Anomaly"], na.rm = TRUE)),
x = "Exotic Breed Proportion (%)",
y = "Milk Yield (kg)") +
theme_bw() +
theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
legend.position = "bottom",
legend.title = element_text(face = "bold"))
print(p)# Calculate key statistics
total_animals <- length(unique(milk_data$AnimID))
high_yield_pct <- (length(unique(high_yield_animals$AnimID)) / total_animals) * 100
breed_anomaly_pct <- (nrow(breed_anomaly_animals) / total_animals) * 100
key_findings <- data.frame(
Finding = c(
"Total animals in dataset",
"Animals with at least one record > 20 kg milk",
"Percentage of animals that are high yielders",
"Animals with breed proportion > 100%",
"Percentage of animals with breed anomaly",
"Maximum milk yield recorded",
"Maximum breed proportion recorded",
"Mean milk yield for high yielders (>20 kg records)",
"Mean milk yield for breed anomaly animals"
),
Value = c(
total_animals,
nrow(high_yield_animals),
paste0(round(high_yield_pct, 2), "%"),
nrow(breed_anomaly_animals),
paste0(round(breed_anomaly_pct, 2), "%"),
paste0(round(max(milk_data$milk), 2), " kg"),
paste0(round(max(milk_data$bprop), 2), "%"),
paste0(round(mean(high_yielders$milk), 2), " kg"),
paste0(round(mean(breed_anomaly$milk), 2), " kg")
)
)
kable(key_findings, caption = "Key Findings Summary") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
column_spec(1, bold = TRUE, width = "60%") %>%
column_spec(2, width = "40%")| Finding | Value |
|---|---|
| Total animals in dataset | 2603 |
| Animals with at least one record > 20 kg milk | 96 |
| Percentage of animals that are high yielders | 3.69% |
| Animals with breed proportion > 100% | 14 |
| Percentage of animals with breed anomaly | 0.54% |
| Maximum milk yield recorded | 455 kg |
| Maximum breed proportion recorded | 230.74% |
| Mean milk yield for high yielders (>20 kg records) | 47.89 kg |
| Mean milk yield for breed anomaly animals | 123.54 kg |
Based on the analysis:
Data Quality: There are 14 animals with breed proportion > 100%, indicating data entry errors that should be investigated.
High Yielders: 3.69% of animals have achieved milk yields > 20 kg, representing elite performers that should be prioritized for breeding programs.
Outlier Management: 102 outlier records were identified (0.21% of data). These should be verified for accuracy.
Breed Proportion Impact: The relationship between exotic breed proportion and milk yield shows positive correlation, supporting crossbreeding strategies.
Further Investigation: Animals with both high milk yield AND breed proportion anomalies require immediate data verification.
Report generated on 2025-10-14