1 Data Loading and Preparation

# Load the data
milk_data <- fread("milk4d.txt", 
                   col.names = c("AnimID", "Ward", "Farm", "CYKsea", "MYMM",
                                 "Class", "Lacest", "Lac", "DIM", "HTD",
                                 "HYcalv", "HYprod", "PYM", "Ksea",
                                 "wilk1", "wilk2", "xdim", "bprop", "age",
                                 "milk", "bwt1", "bwt2", "bsc"),
                   fill = TRUE)

# Display data dimensions
cat("Dataset loaded: ", nrow(milk_data), "records from", 
    length(unique(milk_data$AnimID)), "animals\n")
## Dataset loaded:  49221 records from 2603 animals

2 Executive Summary

# Key statistics
total_records <- nrow(milk_data)
total_animals <- length(unique(milk_data$AnimID))
mean_milk <- mean(milk_data$milk, na.rm = TRUE)
sd_milk <- sd(milk_data$milk, na.rm = TRUE)

# High yielders
high_yielders <- milk_data %>% filter(milk > 20)
n_high_yield_records <- nrow(high_yielders)
n_high_yield_animals <- length(unique(high_yielders$AnimID))
pct_high_yield_records <- (n_high_yield_records / total_records) * 100
pct_high_yield_animals <- (n_high_yield_animals / total_animals) * 100

# Breed proportion anomalies
breed_anomaly <- milk_data %>% filter(bprop > 100)
n_breed_anomaly_records <- nrow(breed_anomaly)
n_breed_anomaly_animals <- length(unique(breed_anomaly$AnimID))

# Combined high yield and breed anomaly
high_yield_breed_anomaly <- milk_data %>% filter(milk > 20 & bprop > 100)

summary_df <- data.frame(
  Metric = c("Total Records", "Total Animals", 
             "Mean Milk Yield (kg)", "SD Milk Yield (kg)",
             "Records with Milk > 20 kg", "Animals with Milk > 20 kg",
             "% Records with Milk > 20 kg", "% Animals with Milk > 20 kg",
             "Records with Breed Prop > 100%", "Animals with Breed Prop > 100%",
             "Records with Both Milk > 20 kg AND Breed > 100%"),
  Value = c(total_records, total_animals,
            round(mean_milk, 2), round(sd_milk, 2),
            n_high_yield_records, n_high_yield_animals,
            round(pct_high_yield_records, 2), round(pct_high_yield_animals, 2),
            n_breed_anomaly_records, n_breed_anomaly_animals,
            nrow(high_yield_breed_anomaly))
)

kable(summary_df, caption = "Executive Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE) %>%
  row_spec(c(5:8), background = "#e8f4ea") %>%
  row_spec(c(9:11), background = "#ffe8e8")
Executive Summary Statistics
Metric Value
Total Records 49221.00
Total Animals 2603.00
Mean Milk Yield (kg) 9.33
SD Milk Yield (kg) 7.63
Records with Milk > 20 kg 194.00
Animals with Milk > 20 kg 96.00
% Records with Milk > 20 kg 0.39
% Animals with Milk > 20 kg 3.69
Records with Breed Prop > 100% 41.00
Animals with Breed Prop > 100% 14.00
Records with Both Milk > 20 kg AND Breed > 100% 13.00

3 Milk Yield Analysis

3.1 Overall Distribution

# Normality (QQ) plot with y-axis constrained to the range of the data
qq_df <- milk_data %>% filter(!is.na(milk))

ggplot(qq_df, aes(sample = milk)) +
  stat_qq(size = 1, alpha = 0.7, color = "steelblue") +
  stat_qq_line(color = "red", linewidth = 1) +
  coord_cartesian(ylim = range(qq_df$milk, na.rm = TRUE)) +
  labs(
    title = "Normality Check (QQ Plot) for Milk Yield",
    subtitle = sprintf("y-axis limited to data range: [%.2f, %.2f]",
                       min(qq_df$milk, na.rm = TRUE), max(qq_df$milk, na.rm = TRUE)),
    x = "Theoretical Quantiles",
    y = "Sample Quantiles (Milk)"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

3.2 Distribution Plots: With and Without Outliers

# Identify outliers using IQR method
Q1 <- quantile(milk_data$milk, 0.25, na.rm = TRUE)
Q3 <- quantile(milk_data$milk, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

milk_data_no_outliers <- milk_data %>% 
  filter(milk >= lower_bound & milk <= upper_bound)

n_outliers <- nrow(milk_data) - nrow(milk_data_no_outliers)
pct_outliers <- (n_outliers / nrow(milk_data)) * 100

# Plot 1: Distribution with all data
p1 <- ggplot(milk_data, aes(x = milk)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(milk)), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20), color = "darkgreen", linetype = "solid", size = 1.5) +
  annotate("text", x = 20, y = Inf, label = "20 kg threshold", 
           hjust = -0.1, vjust = 2, color = "darkgreen", fontface = "bold") +
  labs(title = "Milk Yield Distribution - WITH Outliers",
       subtitle = sprintf("n = %d records | Mean = %.2f kg | SD = %.2f kg | %d outliers (%.1f%%)",
                         nrow(milk_data), mean(milk_data$milk), sd(milk_data$milk),
                         n_outliers, pct_outliers),
       x = "Milk Yield (kg)", y = "Frequency") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

# Plot 2: Distribution without outliers
p2 <- ggplot(milk_data_no_outliers, aes(x = milk)) +
  geom_histogram(bins = 50, fill = "coral", color = "black", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(milk)), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20), color = "darkgreen", linetype = "solid", size = 1.5) +
  annotate("text", x = 20, y = Inf, label = "20 kg threshold", 
           hjust = -0.1, vjust = 2, color = "darkgreen", fontface = "bold") +
  labs(title = "Milk Yield Distribution - WITHOUT Outliers",
       subtitle = sprintf("n = %d records | Mean = %.2f kg | SD = %.2f kg | Outliers removed: %d",
                         nrow(milk_data_no_outliers), 
                         mean(milk_data_no_outliers$milk), 
                         sd(milk_data_no_outliers$milk),
                         n_outliers),
       x = "Milk Yield (kg)", y = "Frequency") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

# Plot 3: Boxplot comparison
milk_data_combined <- rbind(
  milk_data %>% mutate(Dataset = "With Outliers"),
  milk_data_no_outliers %>% mutate(Dataset = "Without Outliers")
)

p3 <- ggplot(milk_data_combined, aes(x = Dataset, y = milk, fill = Dataset)) +
  geom_boxplot(alpha = 0.7) +
  geom_hline(yintercept = 20, color = "darkgreen", linetype = "solid", size = 1.5) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
  annotate("text", x = 0.5, y = 20, label = "20 kg threshold", 
           hjust = 0, color = "darkgreen", fontface = "bold") +
  labs(title = "Milk Yield Distribution Comparison",
       subtitle = "Red diamond = Mean | Green line = 20 kg threshold",
       y = "Milk Yield (kg)", x = "") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none") +
  scale_fill_manual(values = c("With Outliers" = "steelblue", "Without Outliers" = "coral"))

grid.arrange(p1, p2, p3, ncol = 1)

# Build outlier-only data
outliers_df <- milk_data %>%
  filter(!is.na(milk)) %>%
  filter(milk < lower_bound | milk > upper_bound)

if (nrow(outliers_df) > 0) {
  ggplot(outliers_df, aes(x = milk)) +
    geom_histogram(bins = 40, fill = "tomato", color = "black", alpha = 0.8) +
    geom_vline(xintercept = c(lower_bound, upper_bound),
               linetype = "dashed", color = "black", linewidth = 1) +
    labs(
      title = "Histogram of Outlier Milk Yields (IQR Rule)",
      subtitle = sprintf("Outliers: %d (%.1f%%) | Bounds: [%.2f, %.2f]",
                         n_outliers, pct_outliers, lower_bound, upper_bound),
      x = "Milk Yield (kg)", y = "Frequency"
    ) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
} else {
  cat("No outliers detected by the IQR rule (nice!).")
}

# High Yielders Analysis (>20 kg)

3.3 Animals with Milk Yield > 20 kg

# Get all animals with any record > 20 kg
high_yield_animals <- milk_data %>%
  filter(milk > 20) %>%
  group_by(AnimID) %>%
  summarise(
    `Records > 20kg` = n(),
    `Total Records` = n(),
    `Mean Milk` = round(mean(milk), 2),
    `Max Milk` = round(max(milk), 2),
    `Min Milk` = round(min(milk), 2),
    `SD Milk` = round(sd(milk), 2),
    `Mean Breed%` = round(mean(bprop), 2),
    `Mean DIM` = round(mean(DIM), 1),
    .groups = 'drop'
  ) %>%
  arrange(desc(`Max Milk`))

# Also get their total records from full dataset
total_records_per_animal <- milk_data %>%
  filter(AnimID %in% high_yield_animals$AnimID) %>%
  group_by(AnimID) %>%
  summarise(`All Records` = n(), .groups = 'drop')

high_yield_animals <- high_yield_animals %>%
  left_join(total_records_per_animal, by = "AnimID") %>%
  mutate(`% Records > 20kg` = round((`Records > 20kg` / `All Records`) * 100, 1)) %>%
  select(AnimID, `Records > 20kg`, `All Records`, `% Records > 20kg`, 
         `Mean Milk`, `Max Milk`, `Mean Breed%`, everything())

# Display top 20 high yielders
kable(head(high_yield_animals, 20), 
      caption = "Top 20 Animals with Milk Yield > 20 kg") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                font_size = 12) %>%
  column_spec(5:6, background = "#e8f4ea", bold = TRUE)
Top 20 Animals with Milk Yield > 20 kg
AnimID Records > 20kg All Records % Records > 20kg Mean Milk Max Milk Mean Breed% Total Records Min Milk SD Milk Mean DIM
1414 1 54 1.9 455.00 455.0 123.31 1 455.0 NA 35404757.0
1454 1 33 3.0 455.00 455.0 144.21 1 455.0 NA 35977524.0
2090 2 50 4.0 455.00 455.0 144.74 2 455.0 0.00 46810154.5
1462 1 49 2.0 422.00 422.0 145.66 1 422.0 NA 34076605.0
1622 4 48 8.3 361.00 361.0 230.74 4 361.0 0.00 21316971.0
39 3 25 12.0 347.33 348.0 122.19 3 346.0 1.15 18999411.7
693 1 11 9.1 337.00 337.0 162.10 1 337.0 NA 49438462.0
1440 1 9 11.1 39.50 39.5 0.76 1 39.5 NA 410.0
2087 6 52 11.5 28.17 38.0 0.78 6 22.0 5.42 282.2
224 1 3 33.3 30.00 30.0 0.96 1 30.0 NA 371.0
1070 2 46 4.3 26.00 30.0 0.93 2 22.0 5.66 85.5
767 7 9 77.8 28.29 29.0 0.97 7 27.0 0.95 204.4
315 1 15 6.7 28.00 28.0 0.86 1 28.0 NA 116.0
529 3 5 60.0 24.33 28.0 0.99 3 22.0 3.21 451.7
996 7 33 21.2 25.71 28.0 0.99 7 24.0 2.14 98.0
1492 1 16 6.2 28.00 28.0 0.57 1 28.0 NA 30.0
2259 10 29 34.5 26.20 28.0 0.78 10 26.0 0.63 279.3
1719 4 11 36.4 25.00 27.0 0.78 4 23.0 1.63 257.0
718 4 36 11.1 23.25 26.0 0.87 4 22.0 1.89 192.8
1559 3 46 6.5 24.33 26.0 0.97 3 22.0 2.08 78.3
cat("\nTotal unique animals with at least one record > 20 kg:", nrow(high_yield_animals))
## 
## Total unique animals with at least one record > 20 kg: 96

3.4 High Yielders Performance Plot

# Select top 15 high yielders for visualization
top_high_yielders <- head(high_yield_animals, 15)

# Get all records for these animals
high_yielder_records <- milk_data %>%
  filter(AnimID %in% top_high_yielders$AnimID)

# Plot 1: Box plot of top high yielders
p1 <- ggplot(high_yielder_records, aes(x = reorder(factor(AnimID), milk, median), 
                                        y = milk, fill = factor(AnimID))) +
  geom_boxplot(alpha = 0.7) +
  geom_hline(yintercept = 20, color = "red", linetype = "dashed", size = 1) +
  coord_flip() +
  labs(title = "Milk Yield Distribution for Top 15 High-Yielding Animals",
       subtitle = "All their test-day records shown | Red line = 20 kg threshold",
       x = "Animal ID", y = "Milk Yield (kg)") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none")

# Plot 2: Scatter plot - milk yield vs DIM for high yielders
p2 <- ggplot(high_yielder_records %>% filter(AnimID %in% head(high_yield_animals$AnimID, 10)), 
             aes(x = DIM, y = milk, color = factor(AnimID))) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "loess", se = FALSE, size = 0.8) +
  geom_hline(yintercept = 20, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Lactation Curves of Top 10 High-Yielding Animals",
       subtitle = "Individual lactation patterns",
       x = "Days in Milk (DIM)", y = "Milk Yield (kg)",
       color = "Animal ID") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "right")

grid.arrange(p1, p2, ncol = 1)

4 Breed Proportion Analysis

4.1 Animals with Breed Proportion > 100%

# Animals with breed proportion > 100
breed_anomaly_animals <- milk_data %>%
  filter(bprop > 100) %>%
  group_by(AnimID) %>%
  summarise(
    `Records` = n(),
    `Mean Breed%` = round(mean(bprop), 2),
    `Max Breed%` = round(max(bprop), 2),
    `Min Breed%` = round(min(bprop), 2),
    `Mean Milk` = round(mean(milk), 2),
    `Max Milk` = round(max(milk), 2),
    `Mean DIM` = round(mean(DIM), 1),
    Farm = first(Farm),
    Ward = first(Ward),
    .groups = 'drop'
  ) %>%
  arrange(desc(`Max Breed%`))

kable(breed_anomaly_animals, 
      caption = "Animals with Breed Proportion > 100% (Data Quality Issue)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(3:5, background = "#ffe8e8", bold = TRUE)
Animals with Breed Proportion > 100% (Data Quality Issue)
AnimID Records Mean Breed% Max Breed% Min Breed% Mean Milk Max Milk Mean DIM Farm Ward
1622 6 230.74 230.74 230.74 240.67 361 21317033 819 48
620 1 173.08 173.08 173.08 0.00 0 9286851 357 46
693 1 162.10 162.10 162.10 337.00 337 49438462 1899 105
955 3 151.64 151.64 151.64 0.00 0 11083562 426 117
2084 2 147.14 147.14 147.14 0.00 0 46810302 1798 117
1460 2 147.04 147.04 147.04 0.00 0 3766180 145 117
1486 2 146.75 146.75 146.75 0.00 0 20822397 800 117
1462 2 145.66 145.66 145.66 211.00 422 34076718 1309 117
2090 4 144.74 144.74 144.74 227.50 455 46810210 1798 117
1454 5 144.21 144.21 144.21 91.00 455 35977690 1382 117
867 1 133.50 133.50 133.50 0.00 0 19885067 764 129
929 2 125.41 125.41 125.41 0.00 0 36758896 1412 117
1414 5 123.31 123.31 123.31 91.00 455 35404845 1360 116
39 5 122.19 122.19 122.19 208.40 348 18999451 730 135
cat("\nTotal animals with breed proportion > 100%:", nrow(breed_anomaly_animals))
## 
## Total animals with breed proportion > 100%: 14
cat("\nTotal records with breed proportion > 100%:", nrow(milk_data %>% filter(bprop > 100)))
## 
## Total records with breed proportion > 100%: 41

4.2 Breed Proportion Distribution

# Create breed proportion categories
breed_categories <- milk_data %>%
  mutate(BreedCat = case_when(
    bprop <= 25 ~ "0-25%",
    bprop > 25 & bprop <= 50 ~ "25-50%",
    bprop > 50 & bprop <= 75 ~ "50-75%",
    bprop > 75 & bprop <= 100 ~ "75-100%",
    bprop > 100 ~ ">100% (Anomaly)"
  )) %>%
  group_by(BreedCat) %>%
  summarise(
    Records = n(),
    Animals = n_distinct(AnimID),
    `Mean Milk` = round(mean(milk), 2),
    `SD Milk` = round(sd(milk), 2),
    .groups = 'drop'
  )

# Plot 1: Histogram of breed proportion
p1 <- ggplot(milk_data, aes(x = bprop)) +
  geom_histogram(bins = 50, fill = "purple", color = "black", alpha = 0.7) +
  geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1.5) +
  annotate("text", x = 100, y = Inf, label = "100% threshold", 
           hjust = -0.1, vjust = 2, color = "red", fontface = "bold") +
  labs(title = "Distribution of Breed Proportion",
       subtitle = sprintf("Mean = %.1f%% | Values > 100%%: %d records", 
                         mean(milk_data$bprop), sum(milk_data$bprop > 100)),
       x = "Exotic Breed Proportion (%)", y = "Frequency") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

# Plot 2: Milk yield by breed proportion
p2 <- ggplot(milk_data, aes(x = bprop, y = milk)) +
  geom_point(alpha = 0.3, aes(color = milk > 20)) +
  geom_smooth(method = "loess", color = "blue", size = 1.2) +
  geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1) +
  geom_hline(yintercept = 20, color = "darkgreen", linetype = "dashed", size = 1) +
  scale_color_manual(values = c("FALSE" = "gray60", "TRUE" = "darkgreen"),
                     labels = c("≤20 kg", ">20 kg")) +
  labs(title = "Milk Yield vs Breed Proportion",
       subtitle = "Green points = Milk > 20 kg | Red line = 100% breed | Green line = 20 kg milk",
       x = "Exotic Breed Proportion (%)", y = "Milk Yield (kg)",
       color = "Milk Yield") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

grid.arrange(p1, p2, ncol = 1)

5 Combined Analysis: High Milk Yield AND Breed Anomalies

5.1 Animals with Both Milk > 20 kg AND Breed > 100%

# Animals with both conditions
combined_animals <- milk_data %>%
  filter(milk > 20 & bprop > 100) %>%
  group_by(AnimID) %>%
  summarise(
    `Records` = n(),
    `Mean Milk` = round(mean(milk), 2),
    `Max Milk` = round(max(milk), 2),
    `Mean Breed%` = round(mean(bprop), 2),
    `Max Breed%` = round(max(bprop), 2),
    `Mean DIM` = round(mean(DIM), 1),
    Farm = first(Farm),
    Ward = first(Ward),
    Class = first(Class),
    .groups = 'drop'
  ) %>%
  arrange(desc(`Max Milk`))

if(nrow(combined_animals) > 0) {
  kable(combined_animals, 
        caption = "Animals with BOTH Milk > 20 kg AND Breed Proportion > 100%") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    column_spec(3:4, background = "#e8f4ea") %>%
    column_spec(5:6, background = "#ffe8e8")
} else {
  cat("No animals found with both conditions (Milk > 20 kg AND Breed > 100%)")
}
Animals with BOTH Milk > 20 kg AND Breed Proportion > 100%
AnimID Records Mean Milk Max Milk Mean Breed% Max Breed% Mean DIM Farm Ward Class
1414 1 455.00 455 123.31 123.31 35404757 1360 116 2
1454 1 455.00 455 144.21 144.21 35977524 1382 117 2
2090 2 455.00 455 144.74 144.74 46810155 1798 117 2
1462 1 422.00 422 145.66 145.66 34076605 1309 117 1
1622 4 361.00 361 230.74 230.74 21316971 819 48 1
39 3 347.33 348 122.19 122.19 18999412 730 135 1
693 1 337.00 337 162.10 162.10 49438462 1899 105 2

5.2 Detailed Records for Combined Criteria

# Get all records matching both criteria
combined_records <- milk_data %>%
  filter(milk > 20 & bprop > 100) %>%
  select(AnimID, milk, bprop, DIM, Farm, Ward, Class, Lac, age) %>%
  arrange(desc(milk))

if(nrow(combined_records) > 0) {
  kable(head(combined_records, 20), 
        caption = "Top 20 Records with Both Milk > 20 kg AND Breed > 100%") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    column_spec(2, background = "#e8f4ea") %>%
    column_spec(3, background = "#ffe8e8")
  
  cat("\nTotal records meeting both criteria:", nrow(combined_records))
} else {
  cat("No records found with both conditions")
}
## 
## Total records meeting both criteria: 13

6 Summary Tables

6.1 Table 1: High Milk Yielders (>20 kg) Summary

# Create comprehensive summary for high yielders
high_yield_summary <- milk_data %>%
  mutate(HighYield = milk > 20) %>%
  group_by(HighYield) %>%
  summarise(
    `Records` = n(),
    `Animals` = n_distinct(AnimID),
    `Mean Milk (kg)` = round(mean(milk), 2),
    `SD Milk (kg)` = round(sd(milk), 2),
    `Mean Breed (%)` = round(mean(bprop), 2),
    `Mean DIM` = round(mean(DIM), 1),
    `Mean Age (months)` = round(mean(age), 1),
    .groups = 'drop'
  ) %>%
  mutate(Category = ifelse(HighYield, "Milk > 20 kg", "Milk ≤ 20 kg")) %>%
  select(Category, everything(), -HighYield)

kable(high_yield_summary, 
      caption = "Comparison: High Yielders vs Normal Yielders") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(high_yield_summary$Category == "Milk > 20 kg"), 
           background = "#e8f4ea", bold = TRUE)
Comparison: High Yielders vs Normal Yielders
Category Records Animals Mean Milk (kg) SD Milk (kg) Mean Breed (%) Mean DIM Mean Age (months)
Milk ≤ 20 kg 49027 2603 9.17 4.33 0.88 15979.4 71.9
Milk > 20 kg 194 96 47.89 92.69 11.90 2014497.9 68.1

6.2 Table 2: Breed Proportion Anomaly Summary

# Create comprehensive summary for breed anomalies
breed_anomaly_summary <- milk_data %>%
  mutate(BreedAnomaly = bprop > 100) %>%
  group_by(BreedAnomaly) %>%
  summarise(
    `Records` = n(),
    `Animals` = n_distinct(AnimID),
    `Mean Breed (%)` = round(mean(bprop), 2),
    `Max Breed (%)` = round(max(bprop), 2),
    `Mean Milk (kg)` = round(mean(milk), 2),
    `Mean DIM` = round(mean(DIM), 1),
    .groups = 'drop'
  ) %>%
  mutate(Category = ifelse(BreedAnomaly, "Breed > 100%", "Breed ≤ 100%")) %>%
  select(Category, everything(), -BreedAnomaly)

kable(breed_anomaly_summary, 
      caption = "Comparison: Breed Proportion Anomalies vs Normal") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(breed_anomaly_summary$Category == "Breed > 100%"), 
           background = "#ffe8e8", bold = TRUE)
Comparison: Breed Proportion Anomalies vs Normal
Category Records Animals Mean Breed (%) Max Breed (%) Mean Milk (kg) Mean DIM
Breed ≤ 100% 49180 2603 0.80 1.00 9.23 220.6
Breed > 100% 41 14 152.67 230.74 123.54 28375195.6

7 Final Visualizations

7.1 Combined Scatter Plot

# Create categories for visualization
milk_data_viz <- milk_data %>%
  mutate(Category = case_when(
    milk > 20 & bprop > 100 ~ "Both High Milk & Breed Anomaly",
    milk > 20 & bprop <= 100 ~ "High Milk Only",
    milk <= 20 & bprop > 100 ~ "Breed Anomaly Only",
    TRUE ~ "Normal"
  ))

category_summary <- milk_data_viz %>%
  group_by(Category) %>%
  summarise(Count = n(), .groups = 'drop')

p <- ggplot(milk_data_viz, aes(x = bprop, y = milk, color = Category)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1) +
  geom_hline(yintercept = 20, color = "darkgreen", linetype = "dashed", size = 1) +
  scale_color_manual(values = c("Both High Milk & Breed Anomaly" = "purple",
                                "High Milk Only" = "darkgreen",
                                "Breed Anomaly Only" = "orange",
                                "Normal" = "gray60")) +
  labs(title = "Comprehensive View: Milk Yield vs Breed Proportion",
       subtitle = paste("Categories - Normal:", category_summary$Count[category_summary$Category == "Normal"],
                       "| High Milk:", category_summary$Count[category_summary$Category == "High Milk Only"],
                       "| Breed Anomaly:", category_summary$Count[category_summary$Category == "Breed Anomaly Only"],
                       "| Both:", sum(category_summary$Count[category_summary$Category == "Both High Milk & Breed Anomaly"], na.rm = TRUE)),
       x = "Exotic Breed Proportion (%)",
       y = "Milk Yield (kg)") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
        legend.position = "bottom",
        legend.title = element_text(face = "bold"))

print(p)

8 Key Findings

# Calculate key statistics
total_animals <- length(unique(milk_data$AnimID))
high_yield_pct <- (length(unique(high_yield_animals$AnimID)) / total_animals) * 100
breed_anomaly_pct <- (nrow(breed_anomaly_animals) / total_animals) * 100

key_findings <- data.frame(
  Finding = c(
    "Total animals in dataset",
    "Animals with at least one record > 20 kg milk",
    "Percentage of animals that are high yielders",
    "Animals with breed proportion > 100%",
    "Percentage of animals with breed anomaly",
    "Maximum milk yield recorded",
    "Maximum breed proportion recorded",
    "Mean milk yield for high yielders (>20 kg records)",
    "Mean milk yield for breed anomaly animals"
  ),
  Value = c(
    total_animals,
    nrow(high_yield_animals),
    paste0(round(high_yield_pct, 2), "%"),
    nrow(breed_anomaly_animals),
    paste0(round(breed_anomaly_pct, 2), "%"),
    paste0(round(max(milk_data$milk), 2), " kg"),
    paste0(round(max(milk_data$bprop), 2), "%"),
    paste0(round(mean(high_yielders$milk), 2), " kg"),
    paste0(round(mean(breed_anomaly$milk), 2), " kg")
  )
)

kable(key_findings, caption = "Key Findings Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  column_spec(1, bold = TRUE, width = "60%") %>%
  column_spec(2, width = "40%")
Key Findings Summary
Finding Value
Total animals in dataset 2603
Animals with at least one record > 20 kg milk 96
Percentage of animals that are high yielders 3.69%
Animals with breed proportion > 100% 14
Percentage of animals with breed anomaly 0.54%
Maximum milk yield recorded 455 kg
Maximum breed proportion recorded 230.74%
Mean milk yield for high yielders (>20 kg records) 47.89 kg
Mean milk yield for breed anomaly animals 123.54 kg

9 Recommendations

Based on the analysis:

  1. Data Quality: There are 14 animals with breed proportion > 100%, indicating data entry errors that should be investigated.

  2. High Yielders: 3.69% of animals have achieved milk yields > 20 kg, representing elite performers that should be prioritized for breeding programs.

  3. Outlier Management: 102 outlier records were identified (0.21% of data). These should be verified for accuracy.

  4. Breed Proportion Impact: The relationship between exotic breed proportion and milk yield shows positive correlation, supporting crossbreeding strategies.

  5. Further Investigation: Animals with both high milk yield AND breed proportion anomalies require immediate data verification.


Report generated on 2025-10-14