Data Cleaning and
Preparation
# Data cleaning and preparation
phytomere_processed <- metadata %>%
  inner_join(phytomere, by = "plant") %>%
  rename(genotype = "inv4m_gt") %>%
  filter(!is.na(length)) %>%
  filter(!(grepl("no tassel", notes))) %>%
  group_by(plant) %>%
  arrange(-internode) %>%
  mutate(from_top = row_number()) %>%
  filter(donor == "MI21" | donor == "TMEX" & from_top <= 10)
cat("Columns in phytomere_processed:\n")
## Columns in phytomere_processed:
print(colnames(phytomere_processed))
##  [1] "plant"     "row_id"    "plot_id"   "plot_row"  "plot_col"  "rep"      
##  [7] "Y_pos"     "X_pos"     "donor"     "genotype"  "group"     "internode"
## [13] "has_ear"   "length"    "length_1"  "length_2"  "notes"     "from_top"
# Create PCA input data
for_pca <- phytomere_processed %>%
  select(plant, internode, length) %>% 
  pivot_wider(names_from = "internode", 
              values_from = "length", 
              names_prefix = "top") %>%
  arrange(plant)
cat("\nPCA data dimensions:", dim(for_pca))
## 
## PCA data dimensions: 424 17
cat("\nColumns in PCA data (plant ID will be kept for identification but excluded from PCA):\n")
## 
## Columns in PCA data (plant ID will be kept for identification but excluded from PCA):
print(colnames(for_pca))
##  [1] "plant" "top16" "top15" "top14" "top13" "top12" "top11" "top10" "top9" 
## [10] "top8"  "top7"  "top6"  "top5"  "top4"  "top3"  "top2"  "top1"
# Check which columns are actually internode measurements (exclude plant ID)
internode_cols <- colnames(for_pca)[colnames(for_pca) != "plant"]
cat("\nInternode measurement columns to be used in PCA:", length(internode_cols), "columns\n")
## 
## Internode measurement columns to be used in PCA: 16 columns
# Calculate overall missing data percentage
pca_matrix <- for_pca %>% select(all_of(internode_cols))
total_cells <- nrow(pca_matrix) * ncol(pca_matrix)
missing_cells <- sum(is.na(pca_matrix))
missing_percentage <- round((missing_cells / total_cells) * 100, 1)
cat("Missing data: ", missing_percentage, "% of matrix positions\n")
## Missing data:  25.5 % of matrix positions
cat("(", missing_cells, " missing out of ", total_cells, " total values)\n")
## ( 1836  missing out of  7208  total values)
 
 PCA Analysis
Functions
# Function to detect and remove outliers post-hoc from PC space
remove_outliers_posthoc <- function(pc_scores, threshold_sd = 3) {
  # Calculate distances from center in PC space
  pc1_mean <- mean(pc_scores$PC1, na.rm = TRUE)
  pc2_mean <- mean(pc_scores$PC2, na.rm = TRUE)
  pc1_sd <- sd(pc_scores$PC1, na.rm = TRUE)
  pc2_sd <- sd(pc_scores$PC2, na.rm = TRUE)
  
  # Identify outliers as points beyond threshold_sd standard deviations
  pc1_outliers <- abs(pc_scores$PC1 - pc1_mean) > (threshold_sd * pc1_sd)
  pc2_outliers <- abs(pc_scores$PC2 - pc2_mean) > (threshold_sd * pc2_sd)
  
  # Plants that are outliers in either PC1 or PC2
  outlier_mask <- pc1_outliers | pc2_outliers
  outlier_plants <- pc_scores$plant[outlier_mask]
  
  # Return clean data
  clean_pc_scores <- pc_scores[!outlier_mask, ]
  
  list(
    clean_pc_scores = clean_pc_scores,
    outlier_plants = outlier_plants,
    n_outliers = length(outlier_plants)
  )
}
# Function to perform PCA and find centroid representatives
perform_pca_analysis <- function(pca_data, metadata_df, remove_outliers_flag = FALSE) {
  
  # Skip pre-PCA outlier removal
  outlier_info <- NULL
  
  # Prepare data for PCA with mean imputation
  pca_matrix <- pca_data %>%
    select(-plant) %>%
    as.matrix()
  
  # Mean imputation for missing values
  for (col in 1:ncol(pca_matrix)) {
    col_mean <- mean(pca_matrix[, col], na.rm = TRUE)
    pca_matrix[is.na(pca_matrix[, col]), col] <- col_mean
  }
  
  # Get all plants (no removal due to missing data)
  plants_all <- pca_data$plant
  
  # Get metadata for all plants - check for genotype column
  metadata_clean <- metadata_df %>%
    filter(plant %in% plants_all) %>%
    arrange(match(plant, plants_all))
  
  # Check if genotype column exists, if not use inv4m_gt
  if (!"genotype" %in% colnames(metadata_clean)) {
    if ("inv4m_gt" %in% colnames(metadata_clean)) {
      metadata_clean <- metadata_clean %>%
        rename(genotype = inv4m_gt)
    } else {
      stop("Neither 'genotype' nor 'inv4m_gt' column found in metadata")
    }
  }
  
  # Create combined PC scores dataframe first
  all_pc_scores <- data.frame(plant = plants_all) %>%
    left_join(metadata_clean, by = "plant")
  
  # Perform separate PCA for each donor
  donors <- unique(all_pc_scores$donor)
  pca_results_by_donor <- list()
  
  for (donor_name in donors) {
    cat("\nPerforming PCA for donor:", donor_name, "\n")
    
    # Filter data for this donor
    donor_plants <- all_pc_scores %>%
      filter(donor == donor_name) %>%
      pull(plant)
    
    donor_indices <- which(plants_all %in% donor_plants)
    donor_matrix <- pca_matrix[donor_indices, ]
    
    # Check for constant columns (zero variance)
    col_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
    constant_cols <- which(col_vars == 0 | is.na(col_vars))
    
    if (length(constant_cols) > 0) {
      cat("Removing", length(constant_cols), "constant/zero variance columns for", donor_name, ":\n")
      cat("Columns removed:", colnames(donor_matrix)[constant_cols], "\n")
      donor_matrix <- donor_matrix[, -constant_cols, drop = FALSE]
    }
    
    # Check if we have enough columns left for PCA
    if (ncol(donor_matrix) < 2) {
      cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
      next
    }
    
    # Check for near-constant columns (very low variance)
    remaining_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
    low_var_threshold <- 1e-10
    near_constant <- which(remaining_vars < low_var_threshold)
    
    if (length(near_constant) > 0) {
      cat("Removing", length(near_constant), "near-constant columns for", donor_name, "\n")
      donor_matrix <- donor_matrix[, -near_constant, drop = FALSE]
    }
    
    # Final check
    if (ncol(donor_matrix) < 2) {
      cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
      next
    }
    
    # Perform PCA for this donor
    pca_result <- prcomp(donor_matrix, center = TRUE, scale. = TRUE)
    
    # Extract PC scores
    pc_scores <- pca_result$x %>%
      as.data.frame() %>%
      mutate(plant = donor_plants) %>%
      left_join(metadata_clean, by = "plant")
    
    # Post-hoc outlier removal in PC space
    if (remove_outliers_flag) {
      cat("Removing post-hoc outliers in PC space for", donor_name, "\n")
      outlier_results <- remove_outliers_posthoc(pc_scores, threshold_sd = 3)
      pc_scores_clean <- outlier_results$clean_pc_scores
      outlier_plants <- outlier_results$outlier_plants
      cat("  - Outliers removed:", length(outlier_plants), "\n")
      if (length(outlier_plants) > 0) {
        cat("  - Outlier plants:", paste(outlier_plants, collapse = ", "), "\n")
      }
    } else {
      pc_scores_clean <- pc_scores
      outlier_plants <- character(0)
    }
    
    # Calculate centroids for each genotype within this donor (using clean data)
    centroids <- pc_scores_clean %>%
      group_by(genotype) %>%
      summarise(
        centroid_pc1 = mean(PC1, na.rm = TRUE),
        centroid_pc2 = mean(PC2, na.rm = TRUE),
        n_plants = n(),
        .groups = "drop"
      ) %>%
      mutate(donor = donor_name)
    
    # Find plant closest to each centroid (using clean data)
    closest_plants <- pc_scores_clean %>%
      left_join(centroids, by = "genotype") %>%
      mutate(
        distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
      ) %>%
      group_by(genotype) %>%
      slice_min(distance_to_centroid, n = 1) %>%
      ungroup()
    
    # Find top 4 plants closest to each centroid (for extended table)
    top4_closest_plants <- pc_scores_clean %>%
      left_join(centroids, by = "genotype") %>%
      mutate(
        distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
      ) %>%
      group_by(genotype) %>%
      slice_min(distance_to_centroid, n = 4) %>%
      mutate(rank = row_number()) %>%
      ungroup()
    
    # Store results for this donor
    pca_results_by_donor[[donor_name]] <- list(
      pca_result = pca_result,
      pc_scores = pc_scores,  # Keep original for plotting
      pc_scores_clean = pc_scores_clean,  # Clean for centroids
      centroids = centroids,
      closest_plants = closest_plants,
      top4_closest_plants = top4_closest_plants,
      variance_explained = summary(pca_result)$importance,
      removed_columns = c(constant_cols, near_constant),
      n_variables_used = ncol(donor_matrix),
      outlier_plants = outlier_plants
    )
  }
  
  # Combine results
  all_centroids <- bind_rows(lapply(pca_results_by_donor, function(x) x$centroids))
  all_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$closest_plants))
  all_top4_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$top4_closest_plants))
  
  # Return results with imputation and outlier info
  list(
    pca_results_by_donor = pca_results_by_donor,
    all_centroids = all_centroids,
    all_closest_plants = all_closest_plants,
    all_top4_closest_plants = all_top4_closest_plants,
    n_plants_analyzed = nrow(all_pc_scores),
    donors = donors,
    outlier_info = outlier_info,
    imputation_summary = data.frame(
      variable = colnames(pca_matrix),
      n_missing = sapply(1:ncol(pca_data %>% select(-plant)), 
                        function(i) sum(is.na(pca_data %>% select(-plant) %>% pull(i)))),
      mean_imputed = sapply(1:ncol(pca_matrix), function(i) mean(pca_matrix[, i]))
    )
  )
}
# Function to create PCA visualization
plot_pca_results <- function(analysis_results) {
  donors <- analysis_results$donors
  plot_list <- list()
  
  for (donor_name in donors) {
    # Check if PCA was actually performed for this donor
    if (donor_name %in% names(analysis_results$pca_results_by_donor)) {
      donor_results <- analysis_results$pca_results_by_donor[[donor_name]]
      pc_scores <- donor_results$pc_scores
      centroids <- donor_results$centroids
      closest_plants <- donor_results$closest_plants
      var_exp <- donor_results$variance_explained
      
      # Calculate variance explained percentages
      pc1_var <- round(var_exp[2, 1] * 100, 1)
      pc2_var <- round(var_exp[2, 2] * 100, 1)
      
      # Create PCA plot for this donor (using cleaned data for plotting)
      pc_scores_to_plot <- if(length(donor_results$outlier_plants) > 0) {
        donor_results$pc_scores_clean
      } else {
        donor_results$pc_scores
      }
      
      p <- ggplot(pc_scores_to_plot, aes(x = PC1, y = PC2, color = genotype)) +
        geom_point(size = 3, alpha = 0.8) +
        geom_point(data = centroids, 
                   aes(x = centroid_pc1, y = centroid_pc2, color = genotype),
                   size = 12, shape = 15, alpha = 0.9) +
        geom_point(data = closest_plants,
                   aes(x = PC1, y = PC2, color = genotype),
                   size = 4, shape = 1, stroke = 2) +
        geom_text(data = closest_plants,
                  aes(x = PC1, y = PC2, label = plant),
                  nudge_y = 0.3, size = 6, fontface = "bold",
                  show.legend = FALSE) +
        scale_color_manual(
          values = c("INV4M" = "purple4", "CTRL" = "gold"),
          labels = c("INV4M" = "Inv4m", "CTRL" = "Control"),
          name = "Genotype"
        ) +
        labs(
          title = paste("PCA for Donor:", donor_name),
          subtitle = paste("Squares = centroids, Circles = closest plants", 
                          if(length(donor_results$outlier_plants) > 0) 
                            paste("(", length(donor_results$outlier_plants), "outliers removed)")
                          else ""),
          x = paste0("PC1 (", pc1_var, "% variance)"),
          y = paste0("PC2 (", pc2_var, "% variance)")
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 12, face = "bold"),
          plot.subtitle = element_text(size = 10),
          legend.position = "bottom"
        )
      
      plot_list[[donor_name]] <- p
    } else {
      # Create placeholder plot for skipped donors
      p <- ggplot() +
        annotate("text", x = 0, y = 0, 
                label = paste("PCA analysis skipped for", donor_name, "\n(insufficient variable variance)"),
                size = 5, hjust = 0.5, vjust = 0.5) +
        labs(title = paste("PCA for Donor:", donor_name)) +
        theme_void() +
        theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5))
      
      plot_list[[donor_name]] <- p
    }
  }
  
  # Combine plots
  if (length(plot_list) == 2) {
    combined_plot <- ggarrange(plotlist = plot_list, ncol = 2, nrow = 1, 
                              common.legend = TRUE, legend = "bottom")
  } else {
    combined_plot <- plot_list[[1]]
  }
  
  return(combined_plot)
}
 
 Results
 Outlier Detection
Summary
if (!is.null(pca_analysis$outlier_info)) {
  cat("Outlier Detection Results by Donor:\n")
  cat("==================================\n")
  cat("Method: Mahalanobis Distance (per donor)\n")
  cat("Threshold: χ² 97.5th percentile\n\n")
  
  # Create summary table for all donors
  outlier_summary_list <- list()
  
  for (donor_name in names(pca_analysis$outlier_info)) {
    donor_outliers <- pca_analysis$outlier_info[[donor_name]]
    
    cat("DONOR:", donor_name, "\n")
    cat("  - Original plants:", donor_outliers$n_plants_original, "\n")
    cat("  - Outliers detected:", donor_outliers$n_outliers, "\n")
    cat("  - Threshold used:", round(donor_outliers$threshold, 3), "\n")
    
    if (donor_outliers$n_outliers > 0) {
      cat("  - Outlier plants:", paste(donor_outliers$outlier_plants, collapse = ", "), "\n")
      
      # Find indices of outlier plants in the distance vector
      outlier_indices <- which(names(donor_outliers$mahalanobis_distances) %in% donor_outliers$outlier_plants)
      
      # If names are not available, use plant order
      if (length(outlier_indices) == 0) {
        # Create a mapping based on original plant order
        donor_data_temp <- combined_data %>% filter(donor == donor_name)
        outlier_indices <- which(donor_data_temp$plant %in% donor_outliers$outlier_plants)
      }
      
      # Create data frame with matching lengths
      if (length(outlier_indices) > 0 && length(outlier_indices) == length(donor_outliers$outlier_plants)) {
        outlier_summary_list[[donor_name]] <- data.frame(
          donor = rep(donor_name, length(donor_outliers$outlier_plants)),
          outlier_plant = donor_outliers$outlier_plants,
          mahalanobis_distance = donor_outliers$mahalanobis_distances[outlier_indices]
        )
      } else {
        # Fallback: create data frame without distances if matching fails
        outlier_summary_list[[donor_name]] <- data.frame(
          donor = rep(donor_name, length(donor_outliers$outlier_plants)),
          outlier_plant = donor_outliers$outlier_plants,
          mahalanobis_distance = rep(NA, length(donor_outliers$outlier_plants))
        )
      }
    }
    cat("\n")
  }
  
  # Create combined outlier table if there are outliers
  if (length(outlier_summary_list) > 0) {
    all_outliers_table <- bind_rows(outlier_summary_list) %>%
      arrange(donor, desc(mahalanobis_distance)) %>%
      mutate(mahalanobis_distance = round(mahalanobis_distance, 3))
    
    if (nrow(all_outliers_table) > 0) {
      kable(all_outliers_table, 
            caption = "All Outliers Detected by Donor",
            col.names = c("Donor", "Plant ID", "Mahalanobis Distance"),
            align = c('c', 'c', 'c')) %>%
        kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
        group_rows("MI21", 1, sum(all_outliers_table$donor == "MI21")) %>%
        group_rows("TMEX", sum(all_outliers_table$donor == "MI21") + 1, nrow(all_outliers_table))
    }
  }
  
  total_removed <- sum(sapply(pca_analysis$outlier_info, function(x) x$n_outliers))
  cat("Total plants removed across all donors:", total_removed, "\n")
  
} else {
  cat("No outlier detection performed.")
}
## No outlier detection performed.
 
 Missing Data
Summary
kable(pca_analysis$imputation_summary, 
      caption = "Missing Data and Mean Imputation Summary",
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Missing Data and Mean Imputation Summary
| variable | n_missing | mean_imputed | 
| plant | 0 | 218.222 | 
| top16 | 417 | 21.286 | 
| top15 | 378 | 22.130 | 
| top14 | 264 | 22.637 | 
| top13 | 206 | 19.950 | 
| top12 | 195 | 14.328 | 
| top11 | 195 | 12.459 | 
| top10 | 145 | 14.989 | 
| top9 | 13 | 18.664 | 
| top8 | 3 | 15.919 | 
| top7 | 3 | 14.435 | 
| top6 | 2 | 14.659 | 
| top5 | 1 | 14.170 | 
| top4 | 3 | 12.926 | 
| top3 | 3 | 12.012 | 
| top2 | 3 | 11.494 | 
| top1 | 5 | 9.692 | 
cat("Total plants analyzed (after outlier removal):", pca_analysis$n_plants_analyzed)
## Total plants analyzed (after outlier removal): 424
 
 PCA Results by
Donor
# Display results for each donor
for (donor_name in pca_analysis$donors) {
  if (donor_name %in% names(pca_analysis$pca_results_by_donor)) {
    cat("\n\n### DONOR:", donor_name, "\n")
    
    donor_results <- pca_analysis$pca_results_by_donor[[donor_name]]
    
    # Report removed columns if any
    if (!is.null(donor_results$removed_columns) && length(donor_results$removed_columns) > 0) {
      cat("\n**Note:** Removed", length(donor_results$removed_columns), 
          "constant/low-variance columns for this donor\n")
      cat("Variables used in PCA:", donor_results$n_variables_used, "\n")
    }
    
    cat("\n#### Variance Explained\n")
    var_table <- donor_results$variance_explained[1:3, 1:5] %>%
      as.data.frame() %>%
      round(4)
    print(kable(var_table, caption = paste("Variance Explained -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n#### Genotype Centroids\n")
    centroid_table <- donor_results$centroids %>%
      select(-donor) %>%
      mutate(across(where(is.numeric), ~round(.x, 3)))
    print(kable(centroid_table, caption = paste("Genotype Centroids -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n#### Closest Plants to Centroids\n")
    closest_table <- donor_results$closest_plants %>%
      select(plant, genotype, PC1, PC2, distance_to_centroid) %>%
      arrange(genotype) %>%
      mutate(across(where(is.numeric), ~round(.x, 3)))
    print(kable(closest_table, caption = paste("Representative Plants -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n")
  } else {
    cat("\n\n### DONOR:", donor_name, "\n")
    cat("**PCA analysis skipped due to insufficient variable variance**\n\n")
  }
}
 DONOR: TMEX
Note: Removed 2 constant/low-variance columns for
this donor Variables used in PCA: 15
 Variance
Explained
Variance Explained - TMEX
|  | PC1 | PC2 | PC3 | PC4 | PC5 | 
| Standard deviation | 1.9603 | 1.6366 | 1.4744 | 1.0899 | 1.0271 | 
| Proportion of Variance | 0.2562 | 0.1786 | 0.1449 | 0.0792 | 0.0703 | 
| Cumulative Proportion | 0.2562 | 0.4348 | 0.5797 | 0.6589 | 0.7292 | 
 
 Genotype
Centroids
Genotype Centroids - TMEX
| genotype | centroid_pc1 | centroid_pc2 | n_plants | 
| CTRL | 0.668 | -0.862 | 100 | 
| INV4M | -1.019 | 0.729 | 95 | 
 
 Closest Plants to
Centroids
Representative Plants - TMEX
| plant | genotype | PC1 | PC2 | distance_to_centroid | 
| 373 | CTRL | 0.775 | -0.733 | 0.168 | 
| 213 | INV4M | -0.915 | 0.930 | 0.227 | 
 
 
 DONOR: MI21
 Variance
Explained
Variance Explained - MI21
|  | PC1 | PC2 | PC3 | PC4 | PC5 | 
| Standard deviation | 2.1030 | 1.3756 | 1.3032 | 1.1607 | 1.0693 | 
| Proportion of Variance | 0.2602 | 0.1113 | 0.0999 | 0.0792 | 0.0673 | 
| Cumulative Proportion | 0.2602 | 0.3715 | 0.4714 | 0.5506 | 0.6179 | 
 
 Genotype
Centroids
Genotype Centroids - MI21
| genotype | centroid_pc1 | centroid_pc2 | n_plants | 
| CTRL | -0.280 | 0.263 | 112 | 
| INV4M | 0.234 | -0.306 | 114 | 
 
 Closest Plants to
Centroids
Representative Plants - MI21
| plant | genotype | PC1 | PC2 | distance_to_centroid | 
| 353 | CTRL | -0.103 | 0.392 | 0.219 | 
| 327 | INV4M | 0.207 | -0.469 | 0.165 | 
 
 
 
 PCA
Visualizations
# Create and display plots
pca_plot <- plot_pca_results(pca_analysis)
print(pca_plot)

 
 Summary: All
Representative Plants with Plant Height
# Debug: Check what columns are in the closest plants data
cat("Columns in all_closest_plants:\n")
## Columns in all_closest_plants:
print(colnames(pca_analysis$all_closest_plants))
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"
# Get plant height data from original metadata (PH column)
plant_height_data <- data %>%
  select(plant, plant_height_cm = PH) %>%
  filter(!is.na(plant_height_cm))
# Show some basic stats about plant height data
cat("\nPlant height data summary:\n")
## 
## Plant height data summary:
cat("Total plants with height data:", nrow(plant_height_data), "\n")
## Total plants with height data: 442
cat("Height range:", min(plant_height_data$plant_height_cm, na.rm = TRUE), "-", 
    max(plant_height_data$plant_height_cm, na.rm = TRUE), "cm\n")
## Height range: 159 - 225 cm
cat("Mean height:", round(mean(plant_height_data$plant_height_cm, na.rm = TRUE), 1), "cm\n\n")
## Mean height: 191.6 cm
# Get metadata for merging donor information
cat("Columns in metadata:\n")
## Columns in metadata:
print(colnames(metadata))
##  [1] "plant"    "row_id"   "plot_id"  "plot_row" "plot_col" "rep"     
##  [7] "Y_pos"    "X_pos"    "donor"    "inv4m_gt" "group"
metadata_for_merge <- metadata %>%
  select(plant, donor, inv4m_gt)
cat("\nColumns in metadata_for_merge:\n")
## 
## Columns in metadata_for_merge:
print(colnames(metadata_for_merge))
## [1] "plant"    "donor"    "inv4m_gt"
# Combine with closest plants data - first check what we have
cat("\nFirst few rows of all_closest_plants:\n")
## 
## First few rows of all_closest_plants:
print(head(pca_analysis$all_closest_plants))
## # A tibble: 4 × 33
##      PC1    PC2    PC3     PC4    PC5      PC6      PC7     PC8    PC9   PC10
##    <dbl>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>    <dbl>   <dbl>  <dbl>  <dbl>
## 1  0.775 -0.733  2.12   1.90   -0.726 -0.415   -0.902   -0.837  -0.738  0.124
## 2 -0.915  0.930 -0.571 -0.208   0.161 -0.316   -1.17    -1.40    0.342 -1.12 
## 3 -0.103  0.392  1.43  -0.357  -0.494  2.38     0.219   -0.113  -0.117  0.185
## 4  0.207 -0.469  1.34  -0.0304 -0.316  0.00175 -0.00317  0.0982 -0.780  1.26 
## # ℹ 23 more variables: PC11 <dbl>, PC12 <dbl>, PC13 <dbl>, PC14 <dbl>,
## #   PC15 <dbl>, plant <int>, row_id <int>, plot_id <int>, plot_row <int>,
## #   plot_col <int>, rep <int>, Y_pos <dbl>, X_pos <dbl>, donor.x <chr>,
## #   genotype <chr>, group <chr>, centroid_pc1 <dbl>, centroid_pc2 <dbl>,
## #   n_plants <int>, donor.y <chr>, distance_to_centroid <dbl>, PC16 <dbl>,
## #   PC17 <dbl>
# Create the merged dataset step by step
step1 <- pca_analysis$all_closest_plants %>%
  left_join(metadata_for_merge, by = "plant")
cat("\nColumns after joining with metadata:\n")
## 
## Columns after joining with metadata:
print(colnames(step1))
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"                
## [34] "donor"                "inv4m_gt"
step2 <- step1 %>%
  left_join(plant_height_data, by = "plant")
cat("\nColumns after joining with height data:\n")
## 
## Columns after joining with height data:
print(colnames(step2))
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"                
## [34] "donor"                "inv4m_gt"             "plant_height_cm"
# Now safely select and rename columns
available_cols <- colnames(step2)
select_cols <- c("plant")
if ("donor" %in% available_cols) select_cols <- c(select_cols, "donor")
if ("inv4m_gt" %in% available_cols) select_cols <- c(select_cols, "inv4m_gt")
if ("genotype" %in% available_cols) select_cols <- c(select_cols, "genotype")
if ("PC1" %in% available_cols) select_cols <- c(select_cols, "PC1")
if ("PC2" %in% available_cols) select_cols <- c(select_cols, "PC2")
if ("distance_to_centroid" %in% available_cols) select_cols <- c(select_cols, "distance_to_centroid")
if ("plant_height_cm" %in% available_cols) select_cols <- c(select_cols, "plant_height_cm")
cat("\nColumns to select:", paste(select_cols, collapse = ", "), "\n")
## 
## Columns to select: plant, donor, inv4m_gt, genotype, PC1, PC2, distance_to_centroid, plant_height_cm
final_summary_with_height <- step2 %>%
  select(all_of(select_cols))
# Rename genotype column if needed
if ("inv4m_gt" %in% colnames(final_summary_with_height) && !"genotype" %in% colnames(final_summary_with_height)) {
  final_summary_with_height <- final_summary_with_height %>%
    rename(genotype = inv4m_gt)
}
# Round numeric columns
numeric_cols <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(final_summary_with_height))
if (length(numeric_cols) > 0) {
  final_summary_with_height <- final_summary_with_height %>%
    mutate(across(all_of(numeric_cols), ~round(.x, 3)))
}
if ("plant_height_cm" %in% colnames(final_summary_with_height)) {
  final_summary_with_height <- final_summary_with_height %>%
    mutate(plant_height_cm = round(plant_height_cm, 1))
}
# Arrange if we have both donor and genotype
if (all(c("donor", "genotype") %in% colnames(final_summary_with_height))) {
  final_summary_with_height <- final_summary_with_height %>%
    arrange(donor, genotype)
}
# Show the final data structure
cat("\nFinal summary structure:\n")
## 
## Final summary structure:
print(str(final_summary_with_height))
## tibble [4 × 8] (S3: tbl_df/tbl/data.frame)
##  $ plant               : int [1:4] 353 327 373 213
##  $ donor               : chr [1:4] "MI21" "MI21" "TMEX" "TMEX"
##  $ inv4m_gt            : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
##  $ genotype            : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
##  $ PC1                 : num [1:4] -0.103 0.207 0.775 -0.915
##  $ PC2                 : num [1:4] 0.392 -0.469 -0.733 0.93
##  $ distance_to_centroid: num [1:4] 0.219 0.165 0.168 0.227
##  $ plant_height_cm     : num [1:4] 170 183 195 205
## NULL
cat("\nFirst few rows:\n")
## 
## First few rows:
print(head(final_summary_with_height))
## # A tibble: 4 × 8
##   plant donor inv4m_gt genotype    PC1    PC2 distance_to_centroid
##   <int> <chr> <chr>    <chr>     <dbl>  <dbl>                <dbl>
## 1   353 MI21  CTRL     CTRL     -0.103  0.392                0.219
## 2   327 MI21  INV4M    INV4M     0.207 -0.469                0.165
## 3   373 TMEX  CTRL     CTRL      0.775 -0.733                0.168
## 4   213 TMEX  INV4M    INV4M    -0.915  0.93                 0.227
## # ℹ 1 more variable: plant_height_cm <dbl>
# Create the table
kable(final_summary_with_height, 
      caption = "Most Representative Plants by Donor-Genotype with Plant Height") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Most Representative Plants by Donor-Genotype with Plant Height
| plant | donor | inv4m_gt | genotype | PC1 | PC2 | distance_to_centroid | plant_height_cm | 
| 353 | MI21 | CTRL | CTRL | -0.103 | 0.392 | 0.219 | 170 | 
| 327 | MI21 | INV4M | INV4M | 0.207 | -0.469 | 0.165 | 183 | 
| 373 | TMEX | CTRL | CTRL | 0.775 | -0.733 | 0.168 | 195 | 
| 213 | TMEX | INV4M | INV4M | -0.915 | 0.930 | 0.227 | 205 | 
 
 Extended Table: Top 4
Plants Closest to Each Centroid
# Extended table with top 4 closest plants per genotype
cat("Creating extended summary with top 4 plants closest to each centroid...\n")
## Creating extended summary with top 4 plants closest to each centroid...
# Check if the top4 data exists
if ("all_top4_closest_plants" %in% names(pca_analysis)) {
  cat("Top 4 data found. Number of rows:", nrow(pca_analysis$all_top4_closest_plants), "\n")
  
  # Create extended summary with top 4 plants
  extended_summary <- pca_analysis$all_top4_closest_plants %>%
    left_join(metadata_for_merge, by = "plant") %>%
    left_join(plant_height_data, by = "plant")
  
  cat("Columns in extended summary:\n")
  print(colnames(extended_summary))
  
  # Select and arrange columns safely
  available_cols_ext <- colnames(extended_summary)
  select_cols_ext <- c("plant")
  
  if ("donor" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "donor")
  if ("inv4m_gt" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "inv4m_gt")
  if ("genotype" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "genotype")
  if ("rank" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "rank")
  if ("PC1" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC1")
  if ("PC2" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC2")
  if ("distance_to_centroid" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "distance_to_centroid")
  if ("plant_height_cm" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "plant_height_cm")
  
  cat("Columns to select for extended table:", paste(select_cols_ext, collapse = ", "), "\n")
  
  extended_summary_clean <- extended_summary %>%
    select(all_of(select_cols_ext))
  
  # Rename genotype column if needed
  if ("inv4m_gt" %in% colnames(extended_summary_clean) && !"genotype" %in% colnames(extended_summary_clean)) {
    extended_summary_clean <- extended_summary_clean %>%
      rename(genotype = inv4m_gt)
  }
  
  # Round numeric columns
  numeric_cols_ext <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(extended_summary_clean))
  if (length(numeric_cols_ext) > 0) {
    extended_summary_clean <- extended_summary_clean %>%
      mutate(across(all_of(numeric_cols_ext), ~round(.x, 3)))
  }
  
  if ("plant_height_cm" %in% colnames(extended_summary_clean)) {
    extended_summary_clean <- extended_summary_clean %>%
      mutate(plant_height_cm = round(plant_height_cm, 1))
  }
  
  # Arrange if we have required columns
  if (all(c("donor", "genotype", "rank") %in% colnames(extended_summary_clean))) {
    extended_summary_clean <- extended_summary_clean %>%
      arrange(donor, genotype, rank)
  }
  
  cat("Final extended table dimensions:", nrow(extended_summary_clean), "x", ncol(extended_summary_clean), "\n")
  
  # Display the extended table
  kable(extended_summary_clean, 
        caption = "Top 4 Plants Closest to Each Centroid by Donor-Genotype",
        align = "c") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    group_rows("MI21 - CTRL", 1, 4) %>%
    group_rows("MI21 - INV4M", 5, 8) %>%
    group_rows("TMEX - CTRL", 9, 12) %>%
    group_rows("TMEX - INV4M", 13, 16)
  
} else {
  cat("Error: Top 4 closest plants data not found in analysis results.\n")
}
## Top 4 data found. Number of rows: 16 
## Columns in extended summary:
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "rank"                 "PC16"                
## [34] "PC17"                 "donor"                "inv4m_gt"            
## [37] "plant_height_cm"     
## Columns to select for extended table: plant, donor, inv4m_gt, genotype, rank, PC1, PC2, distance_to_centroid, plant_height_cm 
## Final extended table dimensions: 16 x 9
Top 4 Plants Closest to Each Centroid by Donor-Genotype
| plant | donor | inv4m_gt | genotype | rank | PC1 | PC2 | distance_to_centroid | plant_height_cm | 
| MI21 - CTRL | 
| 353 | MI21 | CTRL | CTRL | 1 | -0.103 | 0.392 | 0.219 | 170 | 
| 43 | MI21 | CTRL | CTRL | 2 | -0.426 | 0.524 | 0.299 | 199 | 
| 245 | MI21 | CTRL | CTRL | 3 | -0.260 | -0.039 | 0.303 | 186 | 
| 109 | MI21 | CTRL | CTRL | 4 | -0.592 | 0.294 | 0.314 | 192 | 
| MI21 - INV4M | 
| 327 | MI21 | INV4M | INV4M | 1 | 0.207 | -0.469 | 0.165 | 183 | 
| 9 | MI21 | INV4M | INV4M | 2 | 0.104 | -0.486 | 0.221 | 185 | 
| 187 | MI21 | INV4M | INV4M | 3 | 0.434 | -0.063 | 0.316 | 167 | 
| 135 | MI21 | INV4M | INV4M | 4 | 0.596 | -0.457 | 0.392 | 186 | 
| TMEX - CTRL | 
| 373 | TMEX | CTRL | CTRL | 1 | 0.775 | -0.733 | 0.168 | 195 | 
| 74 | TMEX | CTRL | CTRL | 2 | 0.839 | -0.810 | 0.178 | 207 | 
| 335 | TMEX | CTRL | CTRL | 3 | 0.868 | -0.916 | 0.208 | 204 | 
| 100 | TMEX | CTRL | CTRL | 4 | 0.593 | -1.060 | 0.211 | 184 | 
| TMEX - INV4M | 
| 213 | TMEX | INV4M | INV4M | 1 | -0.915 | 0.930 | 0.227 | 205 | 
| 130 | TMEX | INV4M | INV4M | 2 | -1.057 | 0.953 | 0.228 | 214 | 
| 174 | TMEX | INV4M | INV4M | 3 | -1.146 | 0.956 | 0.259 | 205 | 
| 363 | TMEX | INV4M | INV4M | 4 | -1.099 | 0.480 | 0.261 | 190 | 
 
 Mean Plant Height by
Donor-Genotype Group
# Calculate mean plant height for all plants by donor-genotype combination
all_plants_with_metadata <- data %>%
  select(plant, donor, inv4m_gt, PH) %>%
  rename(genotype = inv4m_gt, plant_height_cm = PH) %>%
  filter(!is.na(plant_height_cm))
# Calculate summary statistics
height_summary_all <- all_plants_with_metadata %>%
  group_by(donor, genotype) %>%
  summarise(
    n_plants = n(),
    mean_height = round(mean(plant_height_cm, na.rm = TRUE), 1),
    sd_height = round(sd(plant_height_cm, na.rm = TRUE), 1),
    min_height = round(min(plant_height_cm, na.rm = TRUE), 1),
    max_height = round(max(plant_height_cm, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  arrange(donor, genotype)
kable(height_summary_all,
      caption = "Plant Height Summary for All Plants by Donor-Genotype",
      col.names = c("Donor", "Genotype", "N Plants", "Mean Height (cm)", 
                   "SD Height (cm)", "Min Height (cm)", "Max Height (cm)"),
      align = c('c', 'c', 'c', 'c', 'c', 'c', 'c')) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Plant Height Summary for All Plants by Donor-Genotype
| Donor | Genotype | N Plants | Mean Height (cm) | SD Height (cm) | Min Height (cm) | Max Height (cm) | 
| MI21 | CTRL | 113 | 187.1 | 7.9 | 163 | 206 | 
| MI21 | INV4M | 119 | 182.9 | 7.9 | 159 | 197 | 
| TMEX | CTRL | 105 | 193.8 | 10.1 | 172 | 221 | 
| TMEX | INV4M | 105 | 204.1 | 11.4 | 167 | 225 | 
# Statistical comparison between genotypes within each donor
cat("\nHeight Differences Between Genotypes:\n")
## 
## Height Differences Between Genotypes:
height_comparison_all <- all_plants_with_metadata %>%
  group_by(donor) %>%
  summarise(
    inv4m_mean = round(mean(plant_height_cm[genotype == "INV4M"], na.rm = TRUE), 1),
    ctrl_mean = round(mean(plant_height_cm[genotype == "CTRL"], na.rm = TRUE), 1),
    height_difference = round(inv4m_mean - ctrl_mean, 1),
    .groups = "drop"
  )
kable(height_comparison_all,
      caption = "Height Comparison: Inv4m vs Control by Donor (All Plants)",
      col.names = c("Donor", "Inv4m Mean (cm)", "Control Mean (cm)", "Difference (cm)"),
      align = c('c', 'c', 'c', 'c')) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Height Comparison: Inv4m vs Control by Donor (All Plants)
| Donor | Inv4m Mean (cm) | Control Mean (cm) | Difference (cm) | 
| MI21 | 182.9 | 187.1 | -4.2 | 
| TMEX | 204.1 | 193.8 | 10.3 | 
Analysis completed on: 2025-07-09
20:26:51.424791