1 Data Loading and Preparation

# Load the data
milk_data <- fread("milk4d.txt", 
                   col.names = c("AnimID", "Ward", "Farm", "CYKsea", "MYMM",
                                 "Class", "Lacest", "Lac", "DIM", "HTD",
                                 "HYcalv", "HYprod", "PYM", "Ksea",
                                 "wilk1", "wilk2", "xdim", "bprop", "age",
                                 "milk", "bwt1", "bwt2", "bsc"),
                   fill = TRUE)

# Display data dimensions
cat("Dataset loaded: ", nrow(milk_data), "records from", 
    length(unique(milk_data$AnimID)), "animals\n")

## Dataset loaded:  49221 records from 2603 animals

2 Executive Summary

# Key statistics
total_records <- nrow(milk_data)
total_animals <- length(unique(milk_data$AnimID))
mean_milk <- mean(milk_data$milk, na.rm = TRUE)
sd_milk <- sd(milk_data$milk, na.rm = TRUE)

# High yielders
high_yielders <- milk_data %>% filter(milk > 20)
n_high_yield_records <- nrow(high_yielders)
n_high_yield_animals <- length(unique(high_yielders$AnimID))
pct_high_yield_records <- (n_high_yield_records / total_records) * 100
pct_high_yield_animals <- (n_high_yield_animals / total_animals) * 100

# Breed proportion anomalies
breed_anomaly <- milk_data %>% filter(bprop > 100)
n_breed_anomaly_records <- nrow(breed_anomaly)
n_breed_anomaly_animals <- length(unique(breed_anomaly$AnimID))

# Combined high yield and breed anomaly
high_yield_breed_anomaly <- milk_data %>% filter(milk > 20 & bprop > 100)

summary_df <- data.frame(
  Metric = c("Total Records", "Total Animals", 
             "Mean Milk Yield (kg)", "SD Milk Yield (kg)",
             "Records with Milk > 20 kg", "Animals with Milk > 20 kg",
             "% Records with Milk > 20 kg", "% Animals with Milk > 20 kg",
             "Records with Breed Prop > 100%", "Animals with Breed Prop > 100%",
             "Records with Both Milk > 20 kg AND Breed > 100%"),
  Value = c(total_records, total_animals,
            round(mean_milk, 2), round(sd_milk, 2),
            n_high_yield_records, n_high_yield_animals,
            round(pct_high_yield_records, 2), round(pct_high_yield_animals, 2),
            n_breed_anomaly_records, n_breed_anomaly_animals,
            nrow(high_yield_breed_anomaly))
)

kable(summary_df, caption = "Executive Summary Statistics") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                full_width = FALSE) %>%
  row_spec(c(5:8), background = "#e8f4ea") %>%
  row_spec(c(9:11), background = "#ffe8e8")

Executive Summary Statistics
Metric	Value
Total Records	49221.00
Total Animals	2603.00
Mean Milk Yield (kg)	9.33
SD Milk Yield (kg)	7.63
Records with Milk > 20 kg	194.00
Animals with Milk > 20 kg	96.00
% Records with Milk > 20 kg	0.39
% Animals with Milk > 20 kg	3.69
Records with Breed Prop > 100%	41.00
Animals with Breed Prop > 100%	14.00
Records with Both Milk > 20 kg AND Breed > 100%	13.00

3 Milk Yield Analysis

3.1 Overall Distribution

# Normality (QQ) plot with y-axis constrained to the range of the data
qq_df <- milk_data %>% filter(!is.na(milk))

ggplot(qq_df, aes(sample = milk)) +
  stat_qq(size = 1, alpha = 0.7, color = "steelblue") +
  stat_qq_line(color = "red", linewidth = 1) +
  coord_cartesian(ylim = range(qq_df$milk, na.rm = TRUE)) +
  labs(
    title = "Normality Check (QQ Plot) for Milk Yield",
    subtitle = sprintf("y-axis limited to data range: [%.2f, %.2f]",
                       min(qq_df$milk, na.rm = TRUE), max(qq_df$milk, na.rm = TRUE)),
    x = "Theoretical Quantiles",
    y = "Sample Quantiles (Milk)"
  ) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

3.2 Distribution Plots: With and Without Outliers

# Identify outliers using IQR method
Q1 <- quantile(milk_data$milk, 0.25, na.rm = TRUE)
Q3 <- quantile(milk_data$milk, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR

milk_data_no_outliers <- milk_data %>% 
  filter(milk >= lower_bound & milk <= upper_bound)

n_outliers <- nrow(milk_data) - nrow(milk_data_no_outliers)
pct_outliers <- (n_outliers / nrow(milk_data)) * 100

# Plot 1: Distribution with all data
p1 <- ggplot(milk_data, aes(x = milk)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "black", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(milk)), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20), color = "darkgreen", linetype = "solid", size = 1.5) +
  annotate("text", x = 20, y = Inf, label = "20 kg threshold", 
           hjust = -0.1, vjust = 2, color = "darkgreen", fontface = "bold") +
  labs(title = "Milk Yield Distribution - WITH Outliers",
       subtitle = sprintf("n = %d records | Mean = %.2f kg | SD = %.2f kg | %d outliers (%.1f%%)",
                         nrow(milk_data), mean(milk_data$milk), sd(milk_data$milk),
                         n_outliers, pct_outliers),
       x = "Milk Yield (kg)", y = "Frequency") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

# Plot 2: Distribution without outliers
p2 <- ggplot(milk_data_no_outliers, aes(x = milk)) +
  geom_histogram(bins = 50, fill = "coral", color = "black", alpha = 0.7) +
  geom_vline(aes(xintercept = mean(milk)), color = "red", linetype = "dashed", size = 1) +
  geom_vline(aes(xintercept = 20), color = "darkgreen", linetype = "solid", size = 1.5) +
  annotate("text", x = 20, y = Inf, label = "20 kg threshold", 
           hjust = -0.1, vjust = 2, color = "darkgreen", fontface = "bold") +
  labs(title = "Milk Yield Distribution - WITHOUT Outliers",
       subtitle = sprintf("n = %d records | Mean = %.2f kg | SD = %.2f kg | Outliers removed: %d",
                         nrow(milk_data_no_outliers), 
                         mean(milk_data_no_outliers$milk), 
                         sd(milk_data_no_outliers$milk),
                         n_outliers),
       x = "Milk Yield (kg)", y = "Frequency") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 14))

# Plot 3: Boxplot comparison
milk_data_combined <- rbind(
  milk_data %>% mutate(Dataset = "With Outliers"),
  milk_data_no_outliers %>% mutate(Dataset = "Without Outliers")
)

p3 <- ggplot(milk_data_combined, aes(x = Dataset, y = milk, fill = Dataset)) +
  geom_boxplot(alpha = 0.7) +
  geom_hline(yintercept = 20, color = "darkgreen", linetype = "solid", size = 1.5) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, fill = "red") +
  annotate("text", x = 0.5, y = 20, label = "20 kg threshold", 
           hjust = 0, color = "darkgreen", fontface = "bold") +
  labs(title = "Milk Yield Distribution Comparison",
       subtitle = "Red diamond = Mean | Green line = 20 kg threshold",
       y = "Milk Yield (kg)", x = "") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none") +
  scale_fill_manual(values = c("With Outliers" = "steelblue", "Without Outliers" = "coral"))

grid.arrange(p1, p2, p3, ncol = 1)

# Build outlier-only data
outliers_df <- milk_data %>%
  filter(!is.na(milk)) %>%
  filter(milk < lower_bound | milk > upper_bound)

if (nrow(outliers_df) > 0) {
  ggplot(outliers_df, aes(x = milk)) +
    geom_histogram(bins = 40, fill = "tomato", color = "black", alpha = 0.8) +
    geom_vline(xintercept = c(lower_bound, upper_bound),
               linetype = "dashed", color = "black", linewidth = 1) +
    labs(
      title = "Histogram of Outlier Milk Yields (IQR Rule)",
      subtitle = sprintf("Outliers: %d (%.1f%%) | Bounds: [%.2f, %.2f]",
                         n_outliers, pct_outliers, lower_bound, upper_bound),
      x = "Milk Yield (kg)", y = "Frequency"
    ) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5, face = "bold"))
} else {
  cat("No outliers detected by the IQR rule (nice!).")
}

# High Yielders Analysis (>20 kg)

3.3 Animals with Milk Yield > 20 kg

# Get all animals with any record > 20 kg
high_yield_animals <- milk_data %>%
  filter(milk > 20) %>%
  group_by(AnimID) %>%
  summarise(
    `Records > 20kg` = n(),
    `Total Records` = n(),
    `Mean Milk` = round(mean(milk), 2),
    `Max Milk` = round(max(milk), 2),
    `Min Milk` = round(min(milk), 2),
    `SD Milk` = round(sd(milk), 2),
    `Mean Breed%` = round(mean(bprop), 2),
    `Mean DIM` = round(mean(DIM), 1),
    .groups = 'drop'
  ) %>%
  arrange(desc(`Max Milk`))

# Also get their total records from full dataset
total_records_per_animal <- milk_data %>%
  filter(AnimID %in% high_yield_animals$AnimID) %>%
  group_by(AnimID) %>%
  summarise(`All Records` = n(), .groups = 'drop')

high_yield_animals <- high_yield_animals %>%
  left_join(total_records_per_animal, by = "AnimID") %>%
  mutate(`% Records > 20kg` = round((`Records > 20kg` / `All Records`) * 100, 1)) %>%
  select(AnimID, `Records > 20kg`, `All Records`, `% Records > 20kg`, 
         `Mean Milk`, `Max Milk`, `Mean Breed%`, everything())

# Display top 20 high yielders
kable(head(high_yield_animals, 20), 
      caption = "Top 20 Animals with Milk Yield > 20 kg") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), 
                font_size = 12) %>%
  column_spec(5:6, background = "#e8f4ea", bold = TRUE)

Top 20 Animals with Milk Yield > 20 kg
AnimID	Records > 20kg	All Records	% Records > 20kg	Mean Milk	Max Milk	Mean Breed%	Total Records	Min Milk	SD Milk	Mean DIM
1414	1	54	1.9	455.00	455.0	123.31	1	455.0	NA	35404757.0
1454	1	33	3.0	455.00	455.0	144.21	1	455.0	NA	35977524.0
2090	2	50	4.0	455.00	455.0	144.74	2	455.0	0.00	46810154.5
1462	1	49	2.0	422.00	422.0	145.66	1	422.0	NA	34076605.0
1622	4	48	8.3	361.00	361.0	230.74	4	361.0	0.00	21316971.0
39	3	25	12.0	347.33	348.0	122.19	3	346.0	1.15	18999411.7
693	1	11	9.1	337.00	337.0	162.10	1	337.0	NA	49438462.0
1440	1	9	11.1	39.50	39.5	0.76	1	39.5	NA	410.0
2087	6	52	11.5	28.17	38.0	0.78	6	22.0	5.42	282.2
224	1	3	33.3	30.00	30.0	0.96	1	30.0	NA	371.0
1070	2	46	4.3	26.00	30.0	0.93	2	22.0	5.66	85.5
767	7	9	77.8	28.29	29.0	0.97	7	27.0	0.95	204.4
315	1	15	6.7	28.00	28.0	0.86	1	28.0	NA	116.0
529	3	5	60.0	24.33	28.0	0.99	3	22.0	3.21	451.7
996	7	33	21.2	25.71	28.0	0.99	7	24.0	2.14	98.0
1492	1	16	6.2	28.00	28.0	0.57	1	28.0	NA	30.0
2259	10	29	34.5	26.20	28.0	0.78	10	26.0	0.63	279.3
1719	4	11	36.4	25.00	27.0	0.78	4	23.0	1.63	257.0
718	4	36	11.1	23.25	26.0	0.87	4	22.0	1.89	192.8
1559	3	46	6.5	24.33	26.0	0.97	3	22.0	2.08	78.3

cat("\nTotal unique animals with at least one record > 20 kg:", nrow(high_yield_animals))

## 
## Total unique animals with at least one record > 20 kg: 96

3.4 High Yielders Performance Plot

# Select top 15 high yielders for visualization
top_high_yielders <- head(high_yield_animals, 15)

# Get all records for these animals
high_yielder_records <- milk_data %>%
  filter(AnimID %in% top_high_yielders$AnimID)

# Plot 1: Box plot of top high yielders
p1 <- ggplot(high_yielder_records, aes(x = reorder(factor(AnimID), milk, median), 
                                        y = milk, fill = factor(AnimID))) +
  geom_boxplot(alpha = 0.7) +
  geom_hline(yintercept = 20, color = "red", linetype = "dashed", size = 1) +
  coord_flip() +
  labs(title = "Milk Yield Distribution for Top 15 High-Yielding Animals",
       subtitle = "All their test-day records shown | Red line = 20 kg threshold",
       x = "Animal ID", y = "Milk Yield (kg)") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "none")

# Plot 2: Scatter plot - milk yield vs DIM for high yielders
p2 <- ggplot(high_yielder_records %>% filter(AnimID %in% head(high_yield_animals$AnimID, 10)), 
             aes(x = DIM, y = milk, color = factor(AnimID))) +
  geom_point(size = 3, alpha = 0.7) +
  geom_smooth(method = "loess", se = FALSE, size = 0.8) +
  geom_hline(yintercept = 20, color = "red", linetype = "dashed", size = 1) +
  labs(title = "Lactation Curves of Top 10 High-Yielding Animals",
       subtitle = "Individual lactation patterns",
       x = "Days in Milk (DIM)", y = "Milk Yield (kg)",
       color = "Animal ID") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"),
        legend.position = "right")

grid.arrange(p1, p2, ncol = 1)

4 Breed Proportion Analysis

4.1 Animals with Breed Proportion > 100%

# Animals with breed proportion > 100
breed_anomaly_animals <- milk_data %>%
  filter(bprop > 100) %>%
  group_by(AnimID) %>%
  summarise(
    `Records` = n(),
    `Mean Breed%` = round(mean(bprop), 2),
    `Max Breed%` = round(max(bprop), 2),
    `Min Breed%` = round(min(bprop), 2),
    `Mean Milk` = round(mean(milk), 2),
    `Max Milk` = round(max(milk), 2),
    `Mean DIM` = round(mean(DIM), 1),
    Farm = first(Farm),
    Ward = first(Ward),
    .groups = 'drop'
  ) %>%
  arrange(desc(`Max Breed%`))

kable(breed_anomaly_animals, 
      caption = "Animals with Breed Proportion > 100% (Data Quality Issue)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  column_spec(3:5, background = "#ffe8e8", bold = TRUE)

Animals with Breed Proportion > 100% (Data Quality Issue)
AnimID	Records	Mean Breed%	Max Breed%	Min Breed%	Mean Milk	Max Milk	Mean DIM	Farm	Ward
1622	6	230.74	230.74	230.74	240.67	361	21317033	819	48
620	1	173.08	173.08	173.08	0.00	0	9286851	357	46
693	1	162.10	162.10	162.10	337.00	337	49438462	1899	105
955	3	151.64	151.64	151.64	0.00	0	11083562	426	117
2084	2	147.14	147.14	147.14	0.00	0	46810302	1798	117
1460	2	147.04	147.04	147.04	0.00	0	3766180	145	117
1486	2	146.75	146.75	146.75	0.00	0	20822397	800	117
1462	2	145.66	145.66	145.66	211.00	422	34076718	1309	117
2090	4	144.74	144.74	144.74	227.50	455	46810210	1798	117
1454	5	144.21	144.21	144.21	91.00	455	35977690	1382	117
867	1	133.50	133.50	133.50	0.00	0	19885067	764	129
929	2	125.41	125.41	125.41	0.00	0	36758896	1412	117
1414	5	123.31	123.31	123.31	91.00	455	35404845	1360	116
39	5	122.19	122.19	122.19	208.40	348	18999451	730	135

cat("\nTotal animals with breed proportion > 100%:", nrow(breed_anomaly_animals))

## 
## Total animals with breed proportion > 100%: 14

cat("\nTotal records with breed proportion > 100%:", nrow(milk_data %>% filter(bprop > 100)))

## 
## Total records with breed proportion > 100%: 41

4.2 Breed Proportion Distribution

# Create breed proportion categories
breed_categories <- milk_data %>%
  mutate(BreedCat = case_when(
    bprop <= 25 ~ "0-25%",
    bprop > 25 & bprop <= 50 ~ "25-50%",
    bprop > 50 & bprop <= 75 ~ "50-75%",
    bprop > 75 & bprop <= 100 ~ "75-100%",
    bprop > 100 ~ ">100% (Anomaly)"
  )) %>%
  group_by(BreedCat) %>%
  summarise(
    Records = n(),
    Animals = n_distinct(AnimID),
    `Mean Milk` = round(mean(milk), 2),
    `SD Milk` = round(sd(milk), 2),
    .groups = 'drop'
  )

# Plot 1: Histogram of breed proportion
p1 <- ggplot(milk_data, aes(x = bprop)) +
  geom_histogram(bins = 50, fill = "purple", color = "black", alpha = 0.7) +
  geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1.5) +
  annotate("text", x = 100, y = Inf, label = "100% threshold", 
           hjust = -0.1, vjust = 2, color = "red", fontface = "bold") +
  labs(title = "Distribution of Breed Proportion",
       subtitle = sprintf("Mean = %.1f%% | Values > 100%%: %d records", 
                         mean(milk_data$bprop), sum(milk_data$bprop > 100)),
       x = "Exotic Breed Proportion (%)", y = "Frequency") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

# Plot 2: Milk yield by breed proportion
p2 <- ggplot(milk_data, aes(x = bprop, y = milk)) +
  geom_point(alpha = 0.3, aes(color = milk > 20)) +
  geom_smooth(method = "loess", color = "blue", size = 1.2) +
  geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1) +
  geom_hline(yintercept = 20, color = "darkgreen", linetype = "dashed", size = 1) +
  scale_color_manual(values = c("FALSE" = "gray60", "TRUE" = "darkgreen"),
                     labels = c("≤20 kg", ">20 kg")) +
  labs(title = "Milk Yield vs Breed Proportion",
       subtitle = "Green points = Milk > 20 kg | Red line = 100% breed | Green line = 20 kg milk",
       x = "Exotic Breed Proportion (%)", y = "Milk Yield (kg)",
       color = "Milk Yield") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold"))

grid.arrange(p1, p2, ncol = 1)

5 Combined Analysis: High Milk Yield AND Breed Anomalies

5.1 Animals with Both Milk > 20 kg AND Breed > 100%

# Animals with both conditions
combined_animals <- milk_data %>%
  filter(milk > 20 & bprop > 100) %>%
  group_by(AnimID) %>%
  summarise(
    `Records` = n(),
    `Mean Milk` = round(mean(milk), 2),
    `Max Milk` = round(max(milk), 2),
    `Mean Breed%` = round(mean(bprop), 2),
    `Max Breed%` = round(max(bprop), 2),
    `Mean DIM` = round(mean(DIM), 1),
    Farm = first(Farm),
    Ward = first(Ward),
    Class = first(Class),
    .groups = 'drop'
  ) %>%
  arrange(desc(`Max Milk`))

if(nrow(combined_animals) > 0) {
  kable(combined_animals, 
        caption = "Animals with BOTH Milk > 20 kg AND Breed Proportion > 100%") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    column_spec(3:4, background = "#e8f4ea") %>%
    column_spec(5:6, background = "#ffe8e8")
} else {
  cat("No animals found with both conditions (Milk > 20 kg AND Breed > 100%)")
}

Animals with BOTH Milk > 20 kg AND Breed Proportion > 100%
AnimID	Records	Mean Milk	Max Milk	Mean Breed%	Max Breed%	Mean DIM	Farm	Ward	Class
1414	1	455.00	455	123.31	123.31	35404757	1360	116	2
1454	1	455.00	455	144.21	144.21	35977524	1382	117	2
2090	2	455.00	455	144.74	144.74	46810155	1798	117	2
1462	1	422.00	422	145.66	145.66	34076605	1309	117	1
1622	4	361.00	361	230.74	230.74	21316971	819	48	1
39	3	347.33	348	122.19	122.19	18999412	730	135	1
693	1	337.00	337	162.10	162.10	49438462	1899	105	2

5.2 Detailed Records for Combined Criteria

# Get all records matching both criteria
combined_records <- milk_data %>%
  filter(milk > 20 & bprop > 100) %>%
  select(AnimID, milk, bprop, DIM, Farm, Ward, Class, Lac, age) %>%
  arrange(desc(milk))

if(nrow(combined_records) > 0) {
  kable(head(combined_records, 20), 
        caption = "Top 20 Records with Both Milk > 20 kg AND Breed > 100%") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    column_spec(2, background = "#e8f4ea") %>%
    column_spec(3, background = "#ffe8e8")
  
  cat("\nTotal records meeting both criteria:", nrow(combined_records))
} else {
  cat("No records found with both conditions")
}

## 
## Total records meeting both criteria: 13

6 Summary Tables

6.1 Table 1: High Milk Yielders (>20 kg) Summary

# Create comprehensive summary for high yielders
high_yield_summary <- milk_data %>%
  mutate(HighYield = milk > 20) %>%
  group_by(HighYield) %>%
  summarise(
    `Records` = n(),
    `Animals` = n_distinct(AnimID),
    `Mean Milk (kg)` = round(mean(milk), 2),
    `SD Milk (kg)` = round(sd(milk), 2),
    `Mean Breed (%)` = round(mean(bprop), 2),
    `Mean DIM` = round(mean(DIM), 1),
    `Mean Age (months)` = round(mean(age), 1),
    .groups = 'drop'
  ) %>%
  mutate(Category = ifelse(HighYield, "Milk > 20 kg", "Milk ≤ 20 kg")) %>%
  select(Category, everything(), -HighYield)

kable(high_yield_summary, 
      caption = "Comparison: High Yielders vs Normal Yielders") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(high_yield_summary$Category == "Milk > 20 kg"), 
           background = "#e8f4ea", bold = TRUE)

Comparison: High Yielders vs Normal Yielders
Category	Records	Animals	Mean Milk (kg)	SD Milk (kg)	Mean Breed (%)	Mean DIM	Mean Age (months)
Milk ≤ 20 kg	49027	2603	9.17	4.33	0.88	15979.4	71.9
Milk > 20 kg	194	96	47.89	92.69	11.90	2014497.9	68.1

6.2 Table 2: Breed Proportion Anomaly Summary

# Create comprehensive summary for breed anomalies
breed_anomaly_summary <- milk_data %>%
  mutate(BreedAnomaly = bprop > 100) %>%
  group_by(BreedAnomaly) %>%
  summarise(
    `Records` = n(),
    `Animals` = n_distinct(AnimID),
    `Mean Breed (%)` = round(mean(bprop), 2),
    `Max Breed (%)` = round(max(bprop), 2),
    `Mean Milk (kg)` = round(mean(milk), 2),
    `Mean DIM` = round(mean(DIM), 1),
    .groups = 'drop'
  ) %>%
  mutate(Category = ifelse(BreedAnomaly, "Breed > 100%", "Breed ≤ 100%")) %>%
  select(Category, everything(), -BreedAnomaly)

kable(breed_anomaly_summary, 
      caption = "Comparison: Breed Proportion Anomalies vs Normal") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(which(breed_anomaly_summary$Category == "Breed > 100%"), 
           background = "#ffe8e8", bold = TRUE)

Comparison: Breed Proportion Anomalies vs Normal
Category	Records	Animals	Mean Breed (%)	Max Breed (%)	Mean Milk (kg)	Mean DIM
Breed ≤ 100%	49180	2603	0.80	1.00	9.23	220.6
Breed > 100%	41	14	152.67	230.74	123.54	28375195.6

7 Final Visualizations

7.1 Combined Scatter Plot

# Create categories for visualization
milk_data_viz <- milk_data %>%
  mutate(Category = case_when(
    milk > 20 & bprop > 100 ~ "Both High Milk & Breed Anomaly",
    milk > 20 & bprop <= 100 ~ "High Milk Only",
    milk <= 20 & bprop > 100 ~ "Breed Anomaly Only",
    TRUE ~ "Normal"
  ))

category_summary <- milk_data_viz %>%
  group_by(Category) %>%
  summarise(Count = n(), .groups = 'drop')

p <- ggplot(milk_data_viz, aes(x = bprop, y = milk, color = Category)) +
  geom_point(alpha = 0.5, size = 2) +
  geom_vline(xintercept = 100, color = "red", linetype = "dashed", size = 1) +
  geom_hline(yintercept = 20, color = "darkgreen", linetype = "dashed", size = 1) +
  scale_color_manual(values = c("Both High Milk & Breed Anomaly" = "purple",
                                "High Milk Only" = "darkgreen",
                                "Breed Anomaly Only" = "orange",
                                "Normal" = "gray60")) +
  labs(title = "Comprehensive View: Milk Yield vs Breed Proportion",
       subtitle = paste("Categories - Normal:", category_summary$Count[category_summary$Category == "Normal"],
                       "| High Milk:", category_summary$Count[category_summary$Category == "High Milk Only"],
                       "| Breed Anomaly:", category_summary$Count[category_summary$Category == "Breed Anomaly Only"],
                       "| Both:", sum(category_summary$Count[category_summary$Category == "Both High Milk & Breed Anomaly"], na.rm = TRUE)),
       x = "Exotic Breed Proportion (%)",
       y = "Milk Yield (kg)") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5, face = "bold", size = 16),
        legend.position = "bottom",
        legend.title = element_text(face = "bold"))

print(p)

8 Key Findings

# Calculate key statistics
total_animals <- length(unique(milk_data$AnimID))
high_yield_pct <- (length(unique(high_yield_animals$AnimID)) / total_animals) * 100
breed_anomaly_pct <- (nrow(breed_anomaly_animals) / total_animals) * 100

key_findings <- data.frame(
  Finding = c(
    "Total animals in dataset",
    "Animals with at least one record > 20 kg milk",
    "Percentage of animals that are high yielders",
    "Animals with breed proportion > 100%",
    "Percentage of animals with breed anomaly",
    "Maximum milk yield recorded",
    "Maximum breed proportion recorded",
    "Mean milk yield for high yielders (>20 kg records)",
    "Mean milk yield for breed anomaly animals"
  ),
  Value = c(
    total_animals,
    nrow(high_yield_animals),
    paste0(round(high_yield_pct, 2), "%"),
    nrow(breed_anomaly_animals),
    paste0(round(breed_anomaly_pct, 2), "%"),
    paste0(round(max(milk_data$milk), 2), " kg"),
    paste0(round(max(milk_data$bprop), 2), "%"),
    paste0(round(mean(high_yielders$milk), 2), " kg"),
    paste0(round(mean(breed_anomaly$milk), 2), " kg")
  )
)

kable(key_findings, caption = "Key Findings Summary") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) %>%
  column_spec(1, bold = TRUE, width = "60%") %>%
  column_spec(2, width = "40%")

Key Findings Summary
Finding	Value
Total animals in dataset	2603
Animals with at least one record > 20 kg milk	96
Percentage of animals that are high yielders	3.69%
Animals with breed proportion > 100%	14
Percentage of animals with breed anomaly	0.54%
Maximum milk yield recorded	455 kg
Maximum breed proportion recorded	230.74%
Mean milk yield for high yielders (>20 kg records)	47.89 kg
Mean milk yield for breed anomaly animals	123.54 kg

9 Recommendations

Based on the analysis:

Data Quality: There are 14 animals with breed proportion > 100%, indicating data entry errors that should be investigated.
High Yielders: 3.69% of animals have achieved milk yields > 20 kg, representing elite performers that should be prioritized for breeding programs.
Outlier Management: 102 outlier records were identified (0.21% of data). These should be verified for accuracy.
Breed Proportion Impact: The relationship between exotic breed proportion and milk yield shows positive correlation, supporting crossbreeding strategies.
Further Investigation: Animals with both high milk yield AND breed proportion anomalies require immediate data verification.

Report generated on 2025-10-14

Milk Yield and Breed Proportion Analysis

Focus on High Yielders (>20 kg) and Breed Proportion Anomalies

Dairy Data Analysis

2025-10-14

1 Data Loading and Preparation

2 Executive Summary

3 Milk Yield Analysis

3.1 Overall Distribution

3.2 Distribution Plots: With and Without Outliers

3.3 Animals with Milk Yield > 20 kg

3.4 High Yielders Performance Plot

4 Breed Proportion Analysis

4.1 Animals with Breed Proportion > 100%

4.2 Breed Proportion Distribution

5 Combined Analysis: High Milk Yield AND Breed Anomalies

5.1 Animals with Both Milk > 20 kg AND Breed > 100%

5.2 Detailed Records for Combined Criteria

6 Summary Tables

6.1 Table 1: High Milk Yielders (>20 kg) Summary

6.2 Table 2: Breed Proportion Anomaly Summary

7 Final Visualizations

7.1 Combined Scatter Plot

8 Key Findings

9 Recommendations