1 Objective

This analysis identifies the most representative plants for each donor-genotype combination by calculating centroids in PC space.

2 Load Required Libraries

library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)
library(broom)
library(knitr)
library(kableExtra)

3 Data Import and Preprocessing

# Data import and preprocessing
data <- read.csv("~/Desktop/CLY25_Inv4m.csv", na.strings = c("", "NA", NA)) 

# Include spatial coordinates in metadata
metadata <- data %>%
  select(plant:group, X_pos, Y_pos)

phytomere <- read.csv("~/Desktop/CLY25_Inv4m_Internode.csv", 
                     na.strings = c("", "na", "NA", NA)) 

# Check data structure
cat("Columns in metadata:\n")
## Columns in metadata:
print(colnames(metadata))
##  [1] "plant"    "row_id"   "plot_id"  "plot_row" "plot_col" "rep"     
##  [7] "Y_pos"    "X_pos"    "donor"    "inv4m_gt" "group"
cat("\nColumns in phytomere:\n")
## 
## Columns in phytomere:
print(colnames(phytomere))
## [1] "plant"     "internode" "has_ear"   "length"    "length_1"  "length_2" 
## [7] "notes"

4 Data Cleaning and Preparation

# Data cleaning and preparation
phytomere_processed <- metadata %>%
  inner_join(phytomere, by = "plant") %>%
  rename(genotype = "inv4m_gt") %>%
  filter(!is.na(length)) %>%
  filter(!(grepl("no tassel", notes))) %>%
  group_by(plant) %>%
  arrange(-internode) %>%
  mutate(from_top = row_number()) %>%
  filter(donor == "MI21" | donor == "TMEX" & from_top <= 10)

cat("Columns in phytomere_processed:\n")
## Columns in phytomere_processed:
print(colnames(phytomere_processed))
##  [1] "plant"     "row_id"    "plot_id"   "plot_row"  "plot_col"  "rep"      
##  [7] "Y_pos"     "X_pos"     "donor"     "genotype"  "group"     "internode"
## [13] "has_ear"   "length"    "length_1"  "length_2"  "notes"     "from_top"
# Create PCA input data
for_pca <- phytomere_processed %>%
  select(plant, internode, length) %>% 
  pivot_wider(names_from = "internode", 
              values_from = "length", 
              names_prefix = "top") %>%
  arrange(plant)

cat("\nPCA data dimensions:", dim(for_pca))
## 
## PCA data dimensions: 424 17
cat("\nColumns in PCA data (plant ID will be kept for identification but excluded from PCA):\n")
## 
## Columns in PCA data (plant ID will be kept for identification but excluded from PCA):
print(colnames(for_pca))
##  [1] "plant" "top16" "top15" "top14" "top13" "top12" "top11" "top10" "top9" 
## [10] "top8"  "top7"  "top6"  "top5"  "top4"  "top3"  "top2"  "top1"
# Check which columns are actually internode measurements (exclude plant ID)
internode_cols <- colnames(for_pca)[colnames(for_pca) != "plant"]
cat("\nInternode measurement columns to be used in PCA:", length(internode_cols), "columns\n")
## 
## Internode measurement columns to be used in PCA: 16 columns
# Calculate overall missing data percentage
pca_matrix <- for_pca %>% select(all_of(internode_cols))
total_cells <- nrow(pca_matrix) * ncol(pca_matrix)
missing_cells <- sum(is.na(pca_matrix))
missing_percentage <- round((missing_cells / total_cells) * 100, 1)

cat("Missing data: ", missing_percentage, "% of matrix positions\n")
## Missing data:  25.5 % of matrix positions
cat("(", missing_cells, " missing out of ", total_cells, " total values)\n")
## ( 1836  missing out of  7208  total values)

5 PCA Analysis Functions

# Function to detect and remove outliers post-hoc from PC space
remove_outliers_posthoc <- function(pc_scores, threshold_sd = 3) {
  # Calculate distances from center in PC space
  pc1_mean <- mean(pc_scores$PC1, na.rm = TRUE)
  pc2_mean <- mean(pc_scores$PC2, na.rm = TRUE)
  pc1_sd <- sd(pc_scores$PC1, na.rm = TRUE)
  pc2_sd <- sd(pc_scores$PC2, na.rm = TRUE)
  
  # Identify outliers as points beyond threshold_sd standard deviations
  pc1_outliers <- abs(pc_scores$PC1 - pc1_mean) > (threshold_sd * pc1_sd)
  pc2_outliers <- abs(pc_scores$PC2 - pc2_mean) > (threshold_sd * pc2_sd)
  
  # Plants that are outliers in either PC1 or PC2
  outlier_mask <- pc1_outliers | pc2_outliers
  outlier_plants <- pc_scores$plant[outlier_mask]
  
  # Return clean data
  clean_pc_scores <- pc_scores[!outlier_mask, ]
  
  list(
    clean_pc_scores = clean_pc_scores,
    outlier_plants = outlier_plants,
    n_outliers = length(outlier_plants)
  )
}

# Function to perform PCA and find centroid representatives
perform_pca_analysis <- function(pca_data, metadata_df, remove_outliers_flag = FALSE) {
  
  # Skip pre-PCA outlier removal
  outlier_info <- NULL
  
  # Prepare data for PCA with mean imputation
  pca_matrix <- pca_data %>%
    select(-plant) %>%
    as.matrix()
  
  # Mean imputation for missing values
  for (col in 1:ncol(pca_matrix)) {
    col_mean <- mean(pca_matrix[, col], na.rm = TRUE)
    pca_matrix[is.na(pca_matrix[, col]), col] <- col_mean
  }
  
  # Get all plants (no removal due to missing data)
  plants_all <- pca_data$plant
  
  # Get metadata for all plants - check for genotype column
  metadata_clean <- metadata_df %>%
    filter(plant %in% plants_all) %>%
    arrange(match(plant, plants_all))
  
  # Check if genotype column exists, if not use inv4m_gt
  if (!"genotype" %in% colnames(metadata_clean)) {
    if ("inv4m_gt" %in% colnames(metadata_clean)) {
      metadata_clean <- metadata_clean %>%
        rename(genotype = inv4m_gt)
    } else {
      stop("Neither 'genotype' nor 'inv4m_gt' column found in metadata")
    }
  }
  
  # Create combined PC scores dataframe first
  all_pc_scores <- data.frame(plant = plants_all) %>%
    left_join(metadata_clean, by = "plant")
  
  # Perform separate PCA for each donor
  donors <- unique(all_pc_scores$donor)
  pca_results_by_donor <- list()
  
  for (donor_name in donors) {
    cat("\nPerforming PCA for donor:", donor_name, "\n")
    
    # Filter data for this donor
    donor_plants <- all_pc_scores %>%
      filter(donor == donor_name) %>%
      pull(plant)
    
    donor_indices <- which(plants_all %in% donor_plants)
    donor_matrix <- pca_matrix[donor_indices, ]
    
    # Check for constant columns (zero variance)
    col_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
    constant_cols <- which(col_vars == 0 | is.na(col_vars))
    
    if (length(constant_cols) > 0) {
      cat("Removing", length(constant_cols), "constant/zero variance columns for", donor_name, ":\n")
      cat("Columns removed:", colnames(donor_matrix)[constant_cols], "\n")
      donor_matrix <- donor_matrix[, -constant_cols, drop = FALSE]
    }
    
    # Check if we have enough columns left for PCA
    if (ncol(donor_matrix) < 2) {
      cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
      next
    }
    
    # Check for near-constant columns (very low variance)
    remaining_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
    low_var_threshold <- 1e-10
    near_constant <- which(remaining_vars < low_var_threshold)
    
    if (length(near_constant) > 0) {
      cat("Removing", length(near_constant), "near-constant columns for", donor_name, "\n")
      donor_matrix <- donor_matrix[, -near_constant, drop = FALSE]
    }
    
    # Final check
    if (ncol(donor_matrix) < 2) {
      cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
      next
    }
    
    # Perform PCA for this donor
    pca_result <- prcomp(donor_matrix, center = TRUE, scale. = TRUE)
    
    # Extract PC scores
    pc_scores <- pca_result$x %>%
      as.data.frame() %>%
      mutate(plant = donor_plants) %>%
      left_join(metadata_clean, by = "plant")
    
    # Post-hoc outlier removal in PC space
    if (remove_outliers_flag) {
      cat("Removing post-hoc outliers in PC space for", donor_name, "\n")
      outlier_results <- remove_outliers_posthoc(pc_scores, threshold_sd = 3)
      pc_scores_clean <- outlier_results$clean_pc_scores
      outlier_plants <- outlier_results$outlier_plants
      cat("  - Outliers removed:", length(outlier_plants), "\n")
      if (length(outlier_plants) > 0) {
        cat("  - Outlier plants:", paste(outlier_plants, collapse = ", "), "\n")
      }
    } else {
      pc_scores_clean <- pc_scores
      outlier_plants <- character(0)
    }
    
    # Calculate centroids for each genotype within this donor (using clean data)
    centroids <- pc_scores_clean %>%
      group_by(genotype) %>%
      summarise(
        centroid_pc1 = mean(PC1, na.rm = TRUE),
        centroid_pc2 = mean(PC2, na.rm = TRUE),
        n_plants = n(),
        .groups = "drop"
      ) %>%
      mutate(donor = donor_name)
    
    # Find plant closest to each centroid (using clean data)
    closest_plants <- pc_scores_clean %>%
      left_join(centroids, by = "genotype") %>%
      mutate(
        distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
      ) %>%
      group_by(genotype) %>%
      slice_min(distance_to_centroid, n = 1) %>%
      ungroup()
    
    # Find top 4 plants closest to each centroid (for extended table)
    top4_closest_plants <- pc_scores_clean %>%
      left_join(centroids, by = "genotype") %>%
      mutate(
        distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
      ) %>%
      group_by(genotype) %>%
      slice_min(distance_to_centroid, n = 4) %>%
      mutate(rank = row_number()) %>%
      ungroup()
    
    # Store results for this donor
    pca_results_by_donor[[donor_name]] <- list(
      pca_result = pca_result,
      pc_scores = pc_scores,  # Keep original for plotting
      pc_scores_clean = pc_scores_clean,  # Clean for centroids
      centroids = centroids,
      closest_plants = closest_plants,
      top4_closest_plants = top4_closest_plants,
      variance_explained = summary(pca_result)$importance,
      removed_columns = c(constant_cols, near_constant),
      n_variables_used = ncol(donor_matrix),
      outlier_plants = outlier_plants
    )
  }
  
  # Combine results
  all_centroids <- bind_rows(lapply(pca_results_by_donor, function(x) x$centroids))
  all_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$closest_plants))
  all_top4_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$top4_closest_plants))
  
  # Return results with imputation and outlier info
  list(
    pca_results_by_donor = pca_results_by_donor,
    all_centroids = all_centroids,
    all_closest_plants = all_closest_plants,
    all_top4_closest_plants = all_top4_closest_plants,
    n_plants_analyzed = nrow(all_pc_scores),
    donors = donors,
    outlier_info = outlier_info,
    imputation_summary = data.frame(
      variable = colnames(pca_matrix),
      n_missing = sapply(1:ncol(pca_data %>% select(-plant)), 
                        function(i) sum(is.na(pca_data %>% select(-plant) %>% pull(i)))),
      mean_imputed = sapply(1:ncol(pca_matrix), function(i) mean(pca_matrix[, i]))
    )
  )
}

# Function to create PCA visualization
plot_pca_results <- function(analysis_results) {
  donors <- analysis_results$donors
  plot_list <- list()
  
  for (donor_name in donors) {
    # Check if PCA was actually performed for this donor
    if (donor_name %in% names(analysis_results$pca_results_by_donor)) {
      donor_results <- analysis_results$pca_results_by_donor[[donor_name]]
      pc_scores <- donor_results$pc_scores
      centroids <- donor_results$centroids
      closest_plants <- donor_results$closest_plants
      var_exp <- donor_results$variance_explained
      
      # Calculate variance explained percentages
      pc1_var <- round(var_exp[2, 1] * 100, 1)
      pc2_var <- round(var_exp[2, 2] * 100, 1)
      
      # Create PCA plot for this donor (using cleaned data for plotting)
      pc_scores_to_plot <- if(length(donor_results$outlier_plants) > 0) {
        donor_results$pc_scores_clean
      } else {
        donor_results$pc_scores
      }
      
      p <- ggplot(pc_scores_to_plot, aes(x = PC1, y = PC2, color = genotype)) +
        geom_point(size = 3, alpha = 0.8) +
        geom_point(data = centroids, 
                   aes(x = centroid_pc1, y = centroid_pc2, color = genotype),
                   size = 12, shape = 15, alpha = 0.9) +
        geom_point(data = closest_plants,
                   aes(x = PC1, y = PC2, color = genotype),
                   size = 4, shape = 1, stroke = 2) +
        geom_text(data = closest_plants,
                  aes(x = PC1, y = PC2, label = plant),
                  nudge_y = 0.3, size = 6, fontface = "bold",
                  show.legend = FALSE) +
        scale_color_manual(
          values = c("INV4M" = "purple4", "CTRL" = "gold"),
          labels = c("INV4M" = "Inv4m", "CTRL" = "Control"),
          name = "Genotype"
        ) +
        labs(
          title = paste("PCA for Donor:", donor_name),
          subtitle = paste("Squares = centroids, Circles = closest plants", 
                          if(length(donor_results$outlier_plants) > 0) 
                            paste("(", length(donor_results$outlier_plants), "outliers removed)")
                          else ""),
          x = paste0("PC1 (", pc1_var, "% variance)"),
          y = paste0("PC2 (", pc2_var, "% variance)")
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 12, face = "bold"),
          plot.subtitle = element_text(size = 10),
          legend.position = "bottom"
        )
      
      plot_list[[donor_name]] <- p
    } else {
      # Create placeholder plot for skipped donors
      p <- ggplot() +
        annotate("text", x = 0, y = 0, 
                label = paste("PCA analysis skipped for", donor_name, "\n(insufficient variable variance)"),
                size = 5, hjust = 0.5, vjust = 0.5) +
        labs(title = paste("PCA for Donor:", donor_name)) +
        theme_void() +
        theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5))
      
      plot_list[[donor_name]] <- p
    }
  }
  
  # Combine plots
  if (length(plot_list) == 2) {
    combined_plot <- ggarrange(plotlist = plot_list, ncol = 2, nrow = 1, 
                              common.legend = TRUE, legend = "bottom")
  } else {
    combined_plot <- plot_list[[1]]
  }
  
  return(combined_plot)
}

6 Execute PCA Analysis

# Perform the analysis with more stringent outlier removal
pca_analysis <- perform_pca_analysis(for_pca, metadata, remove_outliers_flag = TRUE)
## 
## Performing PCA for donor: TMEX 
## Removing 2 constant/zero variance columns for TMEX :
## Columns removed: top16 top15 
## Removing post-hoc outliers in PC space for TMEX 
##   - Outliers removed: 2 
##   - Outlier plants: 344, 414 
## 
## Performing PCA for donor: MI21 
## Removing post-hoc outliers in PC space for MI21 
##   - Outliers removed: 1 
##   - Outlier plants: 8

7 Results

7.1 Outlier Detection Summary

if (!is.null(pca_analysis$outlier_info)) {
  cat("Outlier Detection Results by Donor:\n")
  cat("==================================\n")
  cat("Method: Mahalanobis Distance (per donor)\n")
  cat("Threshold: χ² 97.5th percentile\n\n")
  
  # Create summary table for all donors
  outlier_summary_list <- list()
  
  for (donor_name in names(pca_analysis$outlier_info)) {
    donor_outliers <- pca_analysis$outlier_info[[donor_name]]
    
    cat("DONOR:", donor_name, "\n")
    cat("  - Original plants:", donor_outliers$n_plants_original, "\n")
    cat("  - Outliers detected:", donor_outliers$n_outliers, "\n")
    cat("  - Threshold used:", round(donor_outliers$threshold, 3), "\n")
    
    if (donor_outliers$n_outliers > 0) {
      cat("  - Outlier plants:", paste(donor_outliers$outlier_plants, collapse = ", "), "\n")
      
      # Find indices of outlier plants in the distance vector
      outlier_indices <- which(names(donor_outliers$mahalanobis_distances) %in% donor_outliers$outlier_plants)
      
      # If names are not available, use plant order
      if (length(outlier_indices) == 0) {
        # Create a mapping based on original plant order
        donor_data_temp <- combined_data %>% filter(donor == donor_name)
        outlier_indices <- which(donor_data_temp$plant %in% donor_outliers$outlier_plants)
      }
      
      # Create data frame with matching lengths
      if (length(outlier_indices) > 0 && length(outlier_indices) == length(donor_outliers$outlier_plants)) {
        outlier_summary_list[[donor_name]] <- data.frame(
          donor = rep(donor_name, length(donor_outliers$outlier_plants)),
          outlier_plant = donor_outliers$outlier_plants,
          mahalanobis_distance = donor_outliers$mahalanobis_distances[outlier_indices]
        )
      } else {
        # Fallback: create data frame without distances if matching fails
        outlier_summary_list[[donor_name]] <- data.frame(
          donor = rep(donor_name, length(donor_outliers$outlier_plants)),
          outlier_plant = donor_outliers$outlier_plants,
          mahalanobis_distance = rep(NA, length(donor_outliers$outlier_plants))
        )
      }
    }
    cat("\n")
  }
  
  # Create combined outlier table if there are outliers
  if (length(outlier_summary_list) > 0) {
    all_outliers_table <- bind_rows(outlier_summary_list) %>%
      arrange(donor, desc(mahalanobis_distance)) %>%
      mutate(mahalanobis_distance = round(mahalanobis_distance, 3))
    
    if (nrow(all_outliers_table) > 0) {
      kable(all_outliers_table, 
            caption = "All Outliers Detected by Donor",
            col.names = c("Donor", "Plant ID", "Mahalanobis Distance"),
            align = c('c', 'c', 'c')) %>%
        kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
        group_rows("MI21", 1, sum(all_outliers_table$donor == "MI21")) %>%
        group_rows("TMEX", sum(all_outliers_table$donor == "MI21") + 1, nrow(all_outliers_table))
    }
  }
  
  total_removed <- sum(sapply(pca_analysis$outlier_info, function(x) x$n_outliers))
  cat("Total plants removed across all donors:", total_removed, "\n")
  
} else {
  cat("No outlier detection performed.")
}
## No outlier detection performed.

7.2 Missing Data Summary

kable(pca_analysis$imputation_summary, 
      caption = "Missing Data and Mean Imputation Summary",
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Missing Data and Mean Imputation Summary
variable n_missing mean_imputed
plant 0 218.222
top16 417 21.286
top15 378 22.130
top14 264 22.637
top13 206 19.950
top12 195 14.328
top11 195 12.459
top10 145 14.989
top9 13 18.664
top8 3 15.919
top7 3 14.435
top6 2 14.659
top5 1 14.170
top4 3 12.926
top3 3 12.012
top2 3 11.494
top1 5 9.692
cat("Total plants analyzed (after outlier removal):", pca_analysis$n_plants_analyzed)
## Total plants analyzed (after outlier removal): 424

7.3 PCA Results by Donor

# Display results for each donor
for (donor_name in pca_analysis$donors) {
  if (donor_name %in% names(pca_analysis$pca_results_by_donor)) {
    cat("\n\n### DONOR:", donor_name, "\n")
    
    donor_results <- pca_analysis$pca_results_by_donor[[donor_name]]
    
    # Report removed columns if any
    if (!is.null(donor_results$removed_columns) && length(donor_results$removed_columns) > 0) {
      cat("\n**Note:** Removed", length(donor_results$removed_columns), 
          "constant/low-variance columns for this donor\n")
      cat("Variables used in PCA:", donor_results$n_variables_used, "\n")
    }
    
    cat("\n#### Variance Explained\n")
    var_table <- donor_results$variance_explained[1:3, 1:5] %>%
      as.data.frame() %>%
      round(4)
    print(kable(var_table, caption = paste("Variance Explained -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n#### Genotype Centroids\n")
    centroid_table <- donor_results$centroids %>%
      select(-donor) %>%
      mutate(across(where(is.numeric), ~round(.x, 3)))
    print(kable(centroid_table, caption = paste("Genotype Centroids -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n#### Closest Plants to Centroids\n")
    closest_table <- donor_results$closest_plants %>%
      select(plant, genotype, PC1, PC2, distance_to_centroid) %>%
      arrange(genotype) %>%
      mutate(across(where(is.numeric), ~round(.x, 3)))
    print(kable(closest_table, caption = paste("Representative Plants -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n")
  } else {
    cat("\n\n### DONOR:", donor_name, "\n")
    cat("**PCA analysis skipped due to insufficient variable variance**\n\n")
  }
}

7.3.1 DONOR: TMEX

Note: Removed 2 constant/low-variance columns for this donor Variables used in PCA: 15

7.3.1.1 Variance Explained

Variance Explained - TMEX
PC1 PC2 PC3 PC4 PC5
Standard deviation 1.9603 1.6366 1.4744 1.0899 1.0271
Proportion of Variance 0.2562 0.1786 0.1449 0.0792 0.0703
Cumulative Proportion 0.2562 0.4348 0.5797 0.6589 0.7292

7.3.1.2 Genotype Centroids

Genotype Centroids - TMEX
genotype centroid_pc1 centroid_pc2 n_plants
CTRL 0.668 -0.862 100
INV4M -1.019 0.729 95

7.3.1.3 Closest Plants to Centroids

Representative Plants - TMEX
plant genotype PC1 PC2 distance_to_centroid
373 CTRL 0.775 -0.733 0.168
213 INV4M -0.915 0.930 0.227

7.3.2 DONOR: MI21

7.3.2.1 Variance Explained

Variance Explained - MI21
PC1 PC2 PC3 PC4 PC5
Standard deviation 2.1030 1.3756 1.3032 1.1607 1.0693
Proportion of Variance 0.2602 0.1113 0.0999 0.0792 0.0673
Cumulative Proportion 0.2602 0.3715 0.4714 0.5506 0.6179

7.3.2.2 Genotype Centroids

Genotype Centroids - MI21
genotype centroid_pc1 centroid_pc2 n_plants
CTRL -0.280 0.263 112
INV4M 0.234 -0.306 114

7.3.2.3 Closest Plants to Centroids

Representative Plants - MI21
plant genotype PC1 PC2 distance_to_centroid
353 CTRL -0.103 0.392 0.219
327 INV4M 0.207 -0.469 0.165

7.4 PCA Visualizations

# Create and display plots
pca_plot <- plot_pca_results(pca_analysis)
print(pca_plot)

7.5 Summary: All Representative Plants with Plant Height

# Debug: Check what columns are in the closest plants data
cat("Columns in all_closest_plants:\n")
## Columns in all_closest_plants:
print(colnames(pca_analysis$all_closest_plants))
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"
# Get plant height data from original metadata (PH column)
plant_height_data <- data %>%
  select(plant, plant_height_cm = PH) %>%
  filter(!is.na(plant_height_cm))

# Show some basic stats about plant height data
cat("\nPlant height data summary:\n")
## 
## Plant height data summary:
cat("Total plants with height data:", nrow(plant_height_data), "\n")
## Total plants with height data: 442
cat("Height range:", min(plant_height_data$plant_height_cm, na.rm = TRUE), "-", 
    max(plant_height_data$plant_height_cm, na.rm = TRUE), "cm\n")
## Height range: 159 - 225 cm
cat("Mean height:", round(mean(plant_height_data$plant_height_cm, na.rm = TRUE), 1), "cm\n\n")
## Mean height: 191.6 cm
# Get metadata for merging donor information
cat("Columns in metadata:\n")
## Columns in metadata:
print(colnames(metadata))
##  [1] "plant"    "row_id"   "plot_id"  "plot_row" "plot_col" "rep"     
##  [7] "Y_pos"    "X_pos"    "donor"    "inv4m_gt" "group"
metadata_for_merge <- metadata %>%
  select(plant, donor, inv4m_gt)

cat("\nColumns in metadata_for_merge:\n")
## 
## Columns in metadata_for_merge:
print(colnames(metadata_for_merge))
## [1] "plant"    "donor"    "inv4m_gt"
# Combine with closest plants data - first check what we have
cat("\nFirst few rows of all_closest_plants:\n")
## 
## First few rows of all_closest_plants:
print(head(pca_analysis$all_closest_plants))
## # A tibble: 4 × 33
##      PC1    PC2    PC3     PC4    PC5      PC6      PC7     PC8    PC9   PC10
##    <dbl>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>    <dbl>   <dbl>  <dbl>  <dbl>
## 1  0.775 -0.733  2.12   1.90   -0.726 -0.415   -0.902   -0.837  -0.738  0.124
## 2 -0.915  0.930 -0.571 -0.208   0.161 -0.316   -1.17    -1.40    0.342 -1.12 
## 3 -0.103  0.392  1.43  -0.357  -0.494  2.38     0.219   -0.113  -0.117  0.185
## 4  0.207 -0.469  1.34  -0.0304 -0.316  0.00175 -0.00317  0.0982 -0.780  1.26 
## # ℹ 23 more variables: PC11 <dbl>, PC12 <dbl>, PC13 <dbl>, PC14 <dbl>,
## #   PC15 <dbl>, plant <int>, row_id <int>, plot_id <int>, plot_row <int>,
## #   plot_col <int>, rep <int>, Y_pos <dbl>, X_pos <dbl>, donor.x <chr>,
## #   genotype <chr>, group <chr>, centroid_pc1 <dbl>, centroid_pc2 <dbl>,
## #   n_plants <int>, donor.y <chr>, distance_to_centroid <dbl>, PC16 <dbl>,
## #   PC17 <dbl>
# Create the merged dataset step by step
step1 <- pca_analysis$all_closest_plants %>%
  left_join(metadata_for_merge, by = "plant")

cat("\nColumns after joining with metadata:\n")
## 
## Columns after joining with metadata:
print(colnames(step1))
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"                
## [34] "donor"                "inv4m_gt"
step2 <- step1 %>%
  left_join(plant_height_data, by = "plant")

cat("\nColumns after joining with height data:\n")
## 
## Columns after joining with height data:
print(colnames(step2))
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"                
## [34] "donor"                "inv4m_gt"             "plant_height_cm"
# Now safely select and rename columns
available_cols <- colnames(step2)
select_cols <- c("plant")

if ("donor" %in% available_cols) select_cols <- c(select_cols, "donor")
if ("inv4m_gt" %in% available_cols) select_cols <- c(select_cols, "inv4m_gt")
if ("genotype" %in% available_cols) select_cols <- c(select_cols, "genotype")
if ("PC1" %in% available_cols) select_cols <- c(select_cols, "PC1")
if ("PC2" %in% available_cols) select_cols <- c(select_cols, "PC2")
if ("distance_to_centroid" %in% available_cols) select_cols <- c(select_cols, "distance_to_centroid")
if ("plant_height_cm" %in% available_cols) select_cols <- c(select_cols, "plant_height_cm")

cat("\nColumns to select:", paste(select_cols, collapse = ", "), "\n")
## 
## Columns to select: plant, donor, inv4m_gt, genotype, PC1, PC2, distance_to_centroid, plant_height_cm
final_summary_with_height <- step2 %>%
  select(all_of(select_cols))

# Rename genotype column if needed
if ("inv4m_gt" %in% colnames(final_summary_with_height) && !"genotype" %in% colnames(final_summary_with_height)) {
  final_summary_with_height <- final_summary_with_height %>%
    rename(genotype = inv4m_gt)
}

# Round numeric columns
numeric_cols <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(final_summary_with_height))
if (length(numeric_cols) > 0) {
  final_summary_with_height <- final_summary_with_height %>%
    mutate(across(all_of(numeric_cols), ~round(.x, 3)))
}

if ("plant_height_cm" %in% colnames(final_summary_with_height)) {
  final_summary_with_height <- final_summary_with_height %>%
    mutate(plant_height_cm = round(plant_height_cm, 1))
}

# Arrange if we have both donor and genotype
if (all(c("donor", "genotype") %in% colnames(final_summary_with_height))) {
  final_summary_with_height <- final_summary_with_height %>%
    arrange(donor, genotype)
}

# Show the final data structure
cat("\nFinal summary structure:\n")
## 
## Final summary structure:
print(str(final_summary_with_height))
## tibble [4 × 8] (S3: tbl_df/tbl/data.frame)
##  $ plant               : int [1:4] 353 327 373 213
##  $ donor               : chr [1:4] "MI21" "MI21" "TMEX" "TMEX"
##  $ inv4m_gt            : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
##  $ genotype            : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
##  $ PC1                 : num [1:4] -0.103 0.207 0.775 -0.915
##  $ PC2                 : num [1:4] 0.392 -0.469 -0.733 0.93
##  $ distance_to_centroid: num [1:4] 0.219 0.165 0.168 0.227
##  $ plant_height_cm     : num [1:4] 170 183 195 205
## NULL
cat("\nFirst few rows:\n")
## 
## First few rows:
print(head(final_summary_with_height))
## # A tibble: 4 × 8
##   plant donor inv4m_gt genotype    PC1    PC2 distance_to_centroid
##   <int> <chr> <chr>    <chr>     <dbl>  <dbl>                <dbl>
## 1   353 MI21  CTRL     CTRL     -0.103  0.392                0.219
## 2   327 MI21  INV4M    INV4M     0.207 -0.469                0.165
## 3   373 TMEX  CTRL     CTRL      0.775 -0.733                0.168
## 4   213 TMEX  INV4M    INV4M    -0.915  0.93                 0.227
## # ℹ 1 more variable: plant_height_cm <dbl>
# Create the table
kable(final_summary_with_height, 
      caption = "Most Representative Plants by Donor-Genotype with Plant Height") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Most Representative Plants by Donor-Genotype with Plant Height
plant donor inv4m_gt genotype PC1 PC2 distance_to_centroid plant_height_cm
353 MI21 CTRL CTRL -0.103 0.392 0.219 170
327 MI21 INV4M INV4M 0.207 -0.469 0.165 183
373 TMEX CTRL CTRL 0.775 -0.733 0.168 195
213 TMEX INV4M INV4M -0.915 0.930 0.227 205

7.6 Extended Table: Top 4 Plants Closest to Each Centroid

# Extended table with top 4 closest plants per genotype
cat("Creating extended summary with top 4 plants closest to each centroid...\n")
## Creating extended summary with top 4 plants closest to each centroid...
# Check if the top4 data exists
if ("all_top4_closest_plants" %in% names(pca_analysis)) {
  cat("Top 4 data found. Number of rows:", nrow(pca_analysis$all_top4_closest_plants), "\n")
  
  # Create extended summary with top 4 plants
  extended_summary <- pca_analysis$all_top4_closest_plants %>%
    left_join(metadata_for_merge, by = "plant") %>%
    left_join(plant_height_data, by = "plant")
  
  cat("Columns in extended summary:\n")
  print(colnames(extended_summary))
  
  # Select and arrange columns safely
  available_cols_ext <- colnames(extended_summary)
  select_cols_ext <- c("plant")
  
  if ("donor" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "donor")
  if ("inv4m_gt" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "inv4m_gt")
  if ("genotype" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "genotype")
  if ("rank" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "rank")
  if ("PC1" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC1")
  if ("PC2" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC2")
  if ("distance_to_centroid" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "distance_to_centroid")
  if ("plant_height_cm" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "plant_height_cm")
  
  cat("Columns to select for extended table:", paste(select_cols_ext, collapse = ", "), "\n")
  
  extended_summary_clean <- extended_summary %>%
    select(all_of(select_cols_ext))
  
  # Rename genotype column if needed
  if ("inv4m_gt" %in% colnames(extended_summary_clean) && !"genotype" %in% colnames(extended_summary_clean)) {
    extended_summary_clean <- extended_summary_clean %>%
      rename(genotype = inv4m_gt)
  }
  
  # Round numeric columns
  numeric_cols_ext <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(extended_summary_clean))
  if (length(numeric_cols_ext) > 0) {
    extended_summary_clean <- extended_summary_clean %>%
      mutate(across(all_of(numeric_cols_ext), ~round(.x, 3)))
  }
  
  if ("plant_height_cm" %in% colnames(extended_summary_clean)) {
    extended_summary_clean <- extended_summary_clean %>%
      mutate(plant_height_cm = round(plant_height_cm, 1))
  }
  
  # Arrange if we have required columns
  if (all(c("donor", "genotype", "rank") %in% colnames(extended_summary_clean))) {
    extended_summary_clean <- extended_summary_clean %>%
      arrange(donor, genotype, rank)
  }
  
  cat("Final extended table dimensions:", nrow(extended_summary_clean), "x", ncol(extended_summary_clean), "\n")
  
  # Display the extended table
  kable(extended_summary_clean, 
        caption = "Top 4 Plants Closest to Each Centroid by Donor-Genotype",
        align = "c") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    group_rows("MI21 - CTRL", 1, 4) %>%
    group_rows("MI21 - INV4M", 5, 8) %>%
    group_rows("TMEX - CTRL", 9, 12) %>%
    group_rows("TMEX - INV4M", 13, 16)
  
} else {
  cat("Error: Top 4 closest plants data not found in analysis results.\n")
}
## Top 4 data found. Number of rows: 16 
## Columns in extended summary:
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "rank"                 "PC16"                
## [34] "PC17"                 "donor"                "inv4m_gt"            
## [37] "plant_height_cm"     
## Columns to select for extended table: plant, donor, inv4m_gt, genotype, rank, PC1, PC2, distance_to_centroid, plant_height_cm 
## Final extended table dimensions: 16 x 9
Top 4 Plants Closest to Each Centroid by Donor-Genotype
plant donor inv4m_gt genotype rank PC1 PC2 distance_to_centroid plant_height_cm
MI21 - CTRL
353 MI21 CTRL CTRL 1 -0.103 0.392 0.219 170
43 MI21 CTRL CTRL 2 -0.426 0.524 0.299 199
245 MI21 CTRL CTRL 3 -0.260 -0.039 0.303 186
109 MI21 CTRL CTRL 4 -0.592 0.294 0.314 192
MI21 - INV4M
327 MI21 INV4M INV4M 1 0.207 -0.469 0.165 183
9 MI21 INV4M INV4M 2 0.104 -0.486 0.221 185
187 MI21 INV4M INV4M 3 0.434 -0.063 0.316 167
135 MI21 INV4M INV4M 4 0.596 -0.457 0.392 186
TMEX - CTRL
373 TMEX CTRL CTRL 1 0.775 -0.733 0.168 195
74 TMEX CTRL CTRL 2 0.839 -0.810 0.178 207
335 TMEX CTRL CTRL 3 0.868 -0.916 0.208 204
100 TMEX CTRL CTRL 4 0.593 -1.060 0.211 184
TMEX - INV4M
213 TMEX INV4M INV4M 1 -0.915 0.930 0.227 205
130 TMEX INV4M INV4M 2 -1.057 0.953 0.228 214
174 TMEX INV4M INV4M 3 -1.146 0.956 0.259 205
363 TMEX INV4M INV4M 4 -1.099 0.480 0.261 190

7.7 Mean Plant Height by Donor-Genotype Group

# Calculate mean plant height for all plants by donor-genotype combination
all_plants_with_metadata <- data %>%
  select(plant, donor, inv4m_gt, PH) %>%
  rename(genotype = inv4m_gt, plant_height_cm = PH) %>%
  filter(!is.na(plant_height_cm))

# Calculate summary statistics
height_summary_all <- all_plants_with_metadata %>%
  group_by(donor, genotype) %>%
  summarise(
    n_plants = n(),
    mean_height = round(mean(plant_height_cm, na.rm = TRUE), 1),
    sd_height = round(sd(plant_height_cm, na.rm = TRUE), 1),
    min_height = round(min(plant_height_cm, na.rm = TRUE), 1),
    max_height = round(max(plant_height_cm, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  arrange(donor, genotype)

kable(height_summary_all,
      caption = "Plant Height Summary for All Plants by Donor-Genotype",
      col.names = c("Donor", "Genotype", "N Plants", "Mean Height (cm)", 
                   "SD Height (cm)", "Min Height (cm)", "Max Height (cm)"),
      align = c('c', 'c', 'c', 'c', 'c', 'c', 'c')) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Plant Height Summary for All Plants by Donor-Genotype
Donor Genotype N Plants Mean Height (cm) SD Height (cm) Min Height (cm) Max Height (cm)
MI21 CTRL 113 187.1 7.9 163 206
MI21 INV4M 119 182.9 7.9 159 197
TMEX CTRL 105 193.8 10.1 172 221
TMEX INV4M 105 204.1 11.4 167 225
# Statistical comparison between genotypes within each donor
cat("\nHeight Differences Between Genotypes:\n")
## 
## Height Differences Between Genotypes:
height_comparison_all <- all_plants_with_metadata %>%
  group_by(donor) %>%
  summarise(
    inv4m_mean = round(mean(plant_height_cm[genotype == "INV4M"], na.rm = TRUE), 1),
    ctrl_mean = round(mean(plant_height_cm[genotype == "CTRL"], na.rm = TRUE), 1),
    height_difference = round(inv4m_mean - ctrl_mean, 1),
    .groups = "drop"
  )

kable(height_comparison_all,
      caption = "Height Comparison: Inv4m vs Control by Donor (All Plants)",
      col.names = c("Donor", "Inv4m Mean (cm)", "Control Mean (cm)", "Difference (cm)"),
      align = c('c', 'c', 'c', 'c')) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Height Comparison: Inv4m vs Control by Donor (All Plants)
Donor Inv4m Mean (cm) Control Mean (cm) Difference (cm)
MI21 182.9 187.1 -4.2
TMEX 204.1 193.8 10.3

Analysis completed on: 2025-07-09 20:26:51.424791