1 Objective

This analysis identifies the most representative plants for each donor-genotype combination by calculating centroids in PC space.

2 Load Required Libraries

library(dplyr)
library(ggplot2)
library(ggpubr)
library(tidyr)
library(broom)
library(knitr)
library(kableExtra)

3 Data Import and Preprocessing

# Data import and preprocessing
data <- read.csv("~/Desktop/CLY25_Inv4m.csv", na.strings = c("", "NA", NA)) 

# Include spatial coordinates in metadata
metadata <- data %>%
  select(plant:group, X_pos, Y_pos)

phytomere <- read.csv("~/Desktop/CLY25_Inv4m_Internode.csv", 
                     na.strings = c("", "na", "NA", NA)) 

# Check data structure
cat("Columns in metadata:\n")

## Columns in metadata:

print(colnames(metadata))

##  [1] "plant"    "row_id"   "plot_id"  "plot_row" "plot_col" "rep"     
##  [7] "Y_pos"    "X_pos"    "donor"    "inv4m_gt" "group"

cat("\nColumns in phytomere:\n")

## 
## Columns in phytomere:

print(colnames(phytomere))

## [1] "plant"     "internode" "has_ear"   "length"    "length_1"  "length_2" 
## [7] "notes"

4 Data Cleaning and Preparation

# Data cleaning and preparation
phytomere_processed <- metadata %>%
  inner_join(phytomere, by = "plant") %>%
  rename(genotype = "inv4m_gt") %>%
  filter(!is.na(length)) %>%
  filter(!(grepl("no tassel", notes))) %>%
  group_by(plant) %>%
  arrange(-internode) %>%
  mutate(from_top = row_number()) %>%
  filter(donor == "MI21" | donor == "TMEX" & from_top <= 10)

cat("Columns in phytomere_processed:\n")

## Columns in phytomere_processed:

print(colnames(phytomere_processed))

##  [1] "plant"     "row_id"    "plot_id"   "plot_row"  "plot_col"  "rep"      
##  [7] "Y_pos"     "X_pos"     "donor"     "genotype"  "group"     "internode"
## [13] "has_ear"   "length"    "length_1"  "length_2"  "notes"     "from_top"

# Create PCA input data
for_pca <- phytomere_processed %>%
  select(plant, internode, length) %>% 
  pivot_wider(names_from = "internode", 
              values_from = "length", 
              names_prefix = "top") %>%
  arrange(plant)

cat("\nPCA data dimensions:", dim(for_pca))

## 
## PCA data dimensions: 424 17

cat("\nColumns in PCA data (plant ID will be kept for identification but excluded from PCA):\n")

## 
## Columns in PCA data (plant ID will be kept for identification but excluded from PCA):

print(colnames(for_pca))

##  [1] "plant" "top16" "top15" "top14" "top13" "top12" "top11" "top10" "top9" 
## [10] "top8"  "top7"  "top6"  "top5"  "top4"  "top3"  "top2"  "top1"

# Check which columns are actually internode measurements (exclude plant ID)
internode_cols <- colnames(for_pca)[colnames(for_pca) != "plant"]
cat("\nInternode measurement columns to be used in PCA:", length(internode_cols), "columns\n")

## 
## Internode measurement columns to be used in PCA: 16 columns

# Calculate overall missing data percentage
pca_matrix <- for_pca %>% select(all_of(internode_cols))
total_cells <- nrow(pca_matrix) * ncol(pca_matrix)
missing_cells <- sum(is.na(pca_matrix))
missing_percentage <- round((missing_cells / total_cells) * 100, 1)

cat("Missing data: ", missing_percentage, "% of matrix positions\n")

## Missing data:  25.5 % of matrix positions

cat("(", missing_cells, " missing out of ", total_cells, " total values)\n")

## ( 1836  missing out of  7208  total values)

5 PCA Analysis Functions

# Function to detect and remove outliers post-hoc from PC space
remove_outliers_posthoc <- function(pc_scores, threshold_sd = 3) {
  # Calculate distances from center in PC space
  pc1_mean <- mean(pc_scores$PC1, na.rm = TRUE)
  pc2_mean <- mean(pc_scores$PC2, na.rm = TRUE)
  pc1_sd <- sd(pc_scores$PC1, na.rm = TRUE)
  pc2_sd <- sd(pc_scores$PC2, na.rm = TRUE)
  
  # Identify outliers as points beyond threshold_sd standard deviations
  pc1_outliers <- abs(pc_scores$PC1 - pc1_mean) > (threshold_sd * pc1_sd)
  pc2_outliers <- abs(pc_scores$PC2 - pc2_mean) > (threshold_sd * pc2_sd)
  
  # Plants that are outliers in either PC1 or PC2
  outlier_mask <- pc1_outliers | pc2_outliers
  outlier_plants <- pc_scores$plant[outlier_mask]
  
  # Return clean data
  clean_pc_scores <- pc_scores[!outlier_mask, ]
  
  list(
    clean_pc_scores = clean_pc_scores,
    outlier_plants = outlier_plants,
    n_outliers = length(outlier_plants)
  )
}

# Function to perform PCA and find centroid representatives
perform_pca_analysis <- function(pca_data, metadata_df, remove_outliers_flag = FALSE) {
  
  # Skip pre-PCA outlier removal
  outlier_info <- NULL
  
  # Prepare data for PCA with mean imputation
  pca_matrix <- pca_data %>%
    select(-plant) %>%
    as.matrix()
  
  # Mean imputation for missing values
  for (col in 1:ncol(pca_matrix)) {
    col_mean <- mean(pca_matrix[, col], na.rm = TRUE)
    pca_matrix[is.na(pca_matrix[, col]), col] <- col_mean
  }
  
  # Get all plants (no removal due to missing data)
  plants_all <- pca_data$plant
  
  # Get metadata for all plants - check for genotype column
  metadata_clean <- metadata_df %>%
    filter(plant %in% plants_all) %>%
    arrange(match(plant, plants_all))
  
  # Check if genotype column exists, if not use inv4m_gt
  if (!"genotype" %in% colnames(metadata_clean)) {
    if ("inv4m_gt" %in% colnames(metadata_clean)) {
      metadata_clean <- metadata_clean %>%
        rename(genotype = inv4m_gt)
    } else {
      stop("Neither 'genotype' nor 'inv4m_gt' column found in metadata")
    }
  }
  
  # Create combined PC scores dataframe first
  all_pc_scores <- data.frame(plant = plants_all) %>%
    left_join(metadata_clean, by = "plant")
  
  # Perform separate PCA for each donor
  donors <- unique(all_pc_scores$donor)
  pca_results_by_donor <- list()
  
  for (donor_name in donors) {
    cat("\nPerforming PCA for donor:", donor_name, "\n")
    
    # Filter data for this donor
    donor_plants <- all_pc_scores %>%
      filter(donor == donor_name) %>%
      pull(plant)
    
    donor_indices <- which(plants_all %in% donor_plants)
    donor_matrix <- pca_matrix[donor_indices, ]
    
    # Check for constant columns (zero variance)
    col_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
    constant_cols <- which(col_vars == 0 | is.na(col_vars))
    
    if (length(constant_cols) > 0) {
      cat("Removing", length(constant_cols), "constant/zero variance columns for", donor_name, ":\n")
      cat("Columns removed:", colnames(donor_matrix)[constant_cols], "\n")
      donor_matrix <- donor_matrix[, -constant_cols, drop = FALSE]
    }
    
    # Check if we have enough columns left for PCA
    if (ncol(donor_matrix) < 2) {
      cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
      next
    }
    
    # Check for near-constant columns (very low variance)
    remaining_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
    low_var_threshold <- 1e-10
    near_constant <- which(remaining_vars < low_var_threshold)
    
    if (length(near_constant) > 0) {
      cat("Removing", length(near_constant), "near-constant columns for", donor_name, "\n")
      donor_matrix <- donor_matrix[, -near_constant, drop = FALSE]
    }
    
    # Final check
    if (ncol(donor_matrix) < 2) {
      cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
      next
    }
    
    # Perform PCA for this donor
    pca_result <- prcomp(donor_matrix, center = TRUE, scale. = TRUE)
    
    # Extract PC scores
    pc_scores <- pca_result$x %>%
      as.data.frame() %>%
      mutate(plant = donor_plants) %>%
      left_join(metadata_clean, by = "plant")
    
    # Post-hoc outlier removal in PC space
    if (remove_outliers_flag) {
      cat("Removing post-hoc outliers in PC space for", donor_name, "\n")
      outlier_results <- remove_outliers_posthoc(pc_scores, threshold_sd = 3)
      pc_scores_clean <- outlier_results$clean_pc_scores
      outlier_plants <- outlier_results$outlier_plants
      cat("  - Outliers removed:", length(outlier_plants), "\n")
      if (length(outlier_plants) > 0) {
        cat("  - Outlier plants:", paste(outlier_plants, collapse = ", "), "\n")
      }
    } else {
      pc_scores_clean <- pc_scores
      outlier_plants <- character(0)
    }
    
    # Calculate centroids for each genotype within this donor (using clean data)
    centroids <- pc_scores_clean %>%
      group_by(genotype) %>%
      summarise(
        centroid_pc1 = mean(PC1, na.rm = TRUE),
        centroid_pc2 = mean(PC2, na.rm = TRUE),
        n_plants = n(),
        .groups = "drop"
      ) %>%
      mutate(donor = donor_name)
    
    # Find plant closest to each centroid (using clean data)
    closest_plants <- pc_scores_clean %>%
      left_join(centroids, by = "genotype") %>%
      mutate(
        distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
      ) %>%
      group_by(genotype) %>%
      slice_min(distance_to_centroid, n = 1) %>%
      ungroup()
    
    # Find top 4 plants closest to each centroid (for extended table)
    top4_closest_plants <- pc_scores_clean %>%
      left_join(centroids, by = "genotype") %>%
      mutate(
        distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
      ) %>%
      group_by(genotype) %>%
      slice_min(distance_to_centroid, n = 4) %>%
      mutate(rank = row_number()) %>%
      ungroup()
    
    # Store results for this donor
    pca_results_by_donor[[donor_name]] <- list(
      pca_result = pca_result,
      pc_scores = pc_scores,  # Keep original for plotting
      pc_scores_clean = pc_scores_clean,  # Clean for centroids
      centroids = centroids,
      closest_plants = closest_plants,
      top4_closest_plants = top4_closest_plants,
      variance_explained = summary(pca_result)$importance,
      removed_columns = c(constant_cols, near_constant),
      n_variables_used = ncol(donor_matrix),
      outlier_plants = outlier_plants
    )
  }
  
  # Combine results
  all_centroids <- bind_rows(lapply(pca_results_by_donor, function(x) x$centroids))
  all_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$closest_plants))
  all_top4_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$top4_closest_plants))
  
  # Return results with imputation and outlier info
  list(
    pca_results_by_donor = pca_results_by_donor,
    all_centroids = all_centroids,
    all_closest_plants = all_closest_plants,
    all_top4_closest_plants = all_top4_closest_plants,
    n_plants_analyzed = nrow(all_pc_scores),
    donors = donors,
    outlier_info = outlier_info,
    imputation_summary = data.frame(
      variable = colnames(pca_matrix),
      n_missing = sapply(1:ncol(pca_data %>% select(-plant)), 
                        function(i) sum(is.na(pca_data %>% select(-plant) %>% pull(i)))),
      mean_imputed = sapply(1:ncol(pca_matrix), function(i) mean(pca_matrix[, i]))
    )
  )
}

# Function to create PCA visualization
plot_pca_results <- function(analysis_results) {
  donors <- analysis_results$donors
  plot_list <- list()
  
  for (donor_name in donors) {
    # Check if PCA was actually performed for this donor
    if (donor_name %in% names(analysis_results$pca_results_by_donor)) {
      donor_results <- analysis_results$pca_results_by_donor[[donor_name]]
      pc_scores <- donor_results$pc_scores
      centroids <- donor_results$centroids
      closest_plants <- donor_results$closest_plants
      var_exp <- donor_results$variance_explained
      
      # Calculate variance explained percentages
      pc1_var <- round(var_exp[2, 1] * 100, 1)
      pc2_var <- round(var_exp[2, 2] * 100, 1)
      
      # Create PCA plot for this donor (using cleaned data for plotting)
      pc_scores_to_plot <- if(length(donor_results$outlier_plants) > 0) {
        donor_results$pc_scores_clean
      } else {
        donor_results$pc_scores
      }
      
      p <- ggplot(pc_scores_to_plot, aes(x = PC1, y = PC2, color = genotype)) +
        geom_point(size = 3, alpha = 0.8) +
        geom_point(data = centroids, 
                   aes(x = centroid_pc1, y = centroid_pc2, color = genotype),
                   size = 12, shape = 15, alpha = 0.9) +
        geom_point(data = closest_plants,
                   aes(x = PC1, y = PC2, color = genotype),
                   size = 4, shape = 1, stroke = 2) +
        geom_text(data = closest_plants,
                  aes(x = PC1, y = PC2, label = plant),
                  nudge_y = 0.3, size = 6, fontface = "bold",
                  show.legend = FALSE) +
        scale_color_manual(
          values = c("INV4M" = "purple4", "CTRL" = "gold"),
          labels = c("INV4M" = "Inv4m", "CTRL" = "Control"),
          name = "Genotype"
        ) +
        labs(
          title = paste("PCA for Donor:", donor_name),
          subtitle = paste("Squares = centroids, Circles = closest plants", 
                          if(length(donor_results$outlier_plants) > 0) 
                            paste("(", length(donor_results$outlier_plants), "outliers removed)")
                          else ""),
          x = paste0("PC1 (", pc1_var, "% variance)"),
          y = paste0("PC2 (", pc2_var, "% variance)")
        ) +
        theme_minimal() +
        theme(
          plot.title = element_text(size = 12, face = "bold"),
          plot.subtitle = element_text(size = 10),
          legend.position = "bottom"
        )
      
      plot_list[[donor_name]] <- p
    } else {
      # Create placeholder plot for skipped donors
      p <- ggplot() +
        annotate("text", x = 0, y = 0, 
                label = paste("PCA analysis skipped for", donor_name, "\n(insufficient variable variance)"),
                size = 5, hjust = 0.5, vjust = 0.5) +
        labs(title = paste("PCA for Donor:", donor_name)) +
        theme_void() +
        theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5))
      
      plot_list[[donor_name]] <- p
    }
  }
  
  # Combine plots
  if (length(plot_list) == 2) {
    combined_plot <- ggarrange(plotlist = plot_list, ncol = 2, nrow = 1, 
                              common.legend = TRUE, legend = "bottom")
  } else {
    combined_plot <- plot_list[[1]]
  }
  
  return(combined_plot)
}

6 Execute PCA Analysis

# Perform the analysis with more stringent outlier removal
pca_analysis <- perform_pca_analysis(for_pca, metadata, remove_outliers_flag = TRUE)

## 
## Performing PCA for donor: TMEX 
## Removing 2 constant/zero variance columns for TMEX :
## Columns removed: top16 top15 
## Removing post-hoc outliers in PC space for TMEX 
##   - Outliers removed: 2 
##   - Outlier plants: 344, 414 
## 
## Performing PCA for donor: MI21 
## Removing post-hoc outliers in PC space for MI21 
##   - Outliers removed: 1 
##   - Outlier plants: 8

7 Results

7.1 Outlier Detection Summary

if (!is.null(pca_analysis$outlier_info)) {
  cat("Outlier Detection Results by Donor:\n")
  cat("==================================\n")
  cat("Method: Mahalanobis Distance (per donor)\n")
  cat("Threshold: χ² 97.5th percentile\n\n")
  
  # Create summary table for all donors
  outlier_summary_list <- list()
  
  for (donor_name in names(pca_analysis$outlier_info)) {
    donor_outliers <- pca_analysis$outlier_info[[donor_name]]
    
    cat("DONOR:", donor_name, "\n")
    cat("  - Original plants:", donor_outliers$n_plants_original, "\n")
    cat("  - Outliers detected:", donor_outliers$n_outliers, "\n")
    cat("  - Threshold used:", round(donor_outliers$threshold, 3), "\n")
    
    if (donor_outliers$n_outliers > 0) {
      cat("  - Outlier plants:", paste(donor_outliers$outlier_plants, collapse = ", "), "\n")
      
      # Find indices of outlier plants in the distance vector
      outlier_indices <- which(names(donor_outliers$mahalanobis_distances) %in% donor_outliers$outlier_plants)
      
      # If names are not available, use plant order
      if (length(outlier_indices) == 0) {
        # Create a mapping based on original plant order
        donor_data_temp <- combined_data %>% filter(donor == donor_name)
        outlier_indices <- which(donor_data_temp$plant %in% donor_outliers$outlier_plants)
      }
      
      # Create data frame with matching lengths
      if (length(outlier_indices) > 0 && length(outlier_indices) == length(donor_outliers$outlier_plants)) {
        outlier_summary_list[[donor_name]] <- data.frame(
          donor = rep(donor_name, length(donor_outliers$outlier_plants)),
          outlier_plant = donor_outliers$outlier_plants,
          mahalanobis_distance = donor_outliers$mahalanobis_distances[outlier_indices]
        )
      } else {
        # Fallback: create data frame without distances if matching fails
        outlier_summary_list[[donor_name]] <- data.frame(
          donor = rep(donor_name, length(donor_outliers$outlier_plants)),
          outlier_plant = donor_outliers$outlier_plants,
          mahalanobis_distance = rep(NA, length(donor_outliers$outlier_plants))
        )
      }
    }
    cat("\n")
  }
  
  # Create combined outlier table if there are outliers
  if (length(outlier_summary_list) > 0) {
    all_outliers_table <- bind_rows(outlier_summary_list) %>%
      arrange(donor, desc(mahalanobis_distance)) %>%
      mutate(mahalanobis_distance = round(mahalanobis_distance, 3))
    
    if (nrow(all_outliers_table) > 0) {
      kable(all_outliers_table, 
            caption = "All Outliers Detected by Donor",
            col.names = c("Donor", "Plant ID", "Mahalanobis Distance"),
            align = c('c', 'c', 'c')) %>%
        kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
        group_rows("MI21", 1, sum(all_outliers_table$donor == "MI21")) %>%
        group_rows("TMEX", sum(all_outliers_table$donor == "MI21") + 1, nrow(all_outliers_table))
    }
  }
  
  total_removed <- sum(sapply(pca_analysis$outlier_info, function(x) x$n_outliers))
  cat("Total plants removed across all donors:", total_removed, "\n")
  
} else {
  cat("No outlier detection performed.")
}

## No outlier detection performed.

7.2 Missing Data Summary

kable(pca_analysis$imputation_summary, 
      caption = "Missing Data and Mean Imputation Summary",
      digits = 3) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Missing Data and Mean Imputation Summary
variable	n_missing	mean_imputed
plant	0	218.222
top16	417	21.286
top15	378	22.130
top14	264	22.637
top13	206	19.950
top12	195	14.328
top11	195	12.459
top10	145	14.989
top9	13	18.664
top8	3	15.919
top7	3	14.435
top6	2	14.659
top5	1	14.170
top4	3	12.926
top3	3	12.012
top2	3	11.494
top1	5	9.692

cat("Total plants analyzed (after outlier removal):", pca_analysis$n_plants_analyzed)

## Total plants analyzed (after outlier removal): 424

7.3 PCA Results by Donor

# Display results for each donor
for (donor_name in pca_analysis$donors) {
  if (donor_name %in% names(pca_analysis$pca_results_by_donor)) {
    cat("\n\n### DONOR:", donor_name, "\n")
    
    donor_results <- pca_analysis$pca_results_by_donor[[donor_name]]
    
    # Report removed columns if any
    if (!is.null(donor_results$removed_columns) && length(donor_results$removed_columns) > 0) {
      cat("\n**Note:** Removed", length(donor_results$removed_columns), 
          "constant/low-variance columns for this donor\n")
      cat("Variables used in PCA:", donor_results$n_variables_used, "\n")
    }
    
    cat("\n#### Variance Explained\n")
    var_table <- donor_results$variance_explained[1:3, 1:5] %>%
      as.data.frame() %>%
      round(4)
    print(kable(var_table, caption = paste("Variance Explained -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n#### Genotype Centroids\n")
    centroid_table <- donor_results$centroids %>%
      select(-donor) %>%
      mutate(across(where(is.numeric), ~round(.x, 3)))
    print(kable(centroid_table, caption = paste("Genotype Centroids -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n#### Closest Plants to Centroids\n")
    closest_table <- donor_results$closest_plants %>%
      select(plant, genotype, PC1, PC2, distance_to_centroid) %>%
      arrange(genotype) %>%
      mutate(across(where(is.numeric), ~round(.x, 3)))
    print(kable(closest_table, caption = paste("Representative Plants -", donor_name)) %>%
          kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
    
    cat("\n")
  } else {
    cat("\n\n### DONOR:", donor_name, "\n")
    cat("**PCA analysis skipped due to insufficient variable variance**\n\n")
  }
}

7.3.1 DONOR: TMEX

Note: Removed 2 constant/low-variance columns for this donor Variables used in PCA: 15

7.3.1.1 Variance Explained

Variance Explained - TMEX
	PC1	PC2	PC3	PC4	PC5
Standard deviation	1.9603	1.6366	1.4744	1.0899	1.0271
Proportion of Variance	0.2562	0.1786	0.1449	0.0792	0.0703
Cumulative Proportion	0.2562	0.4348	0.5797	0.6589	0.7292

7.3.1.2 Genotype Centroids

Genotype Centroids - TMEX
genotype	centroid_pc1	centroid_pc2	n_plants
CTRL	0.668	-0.862	100
INV4M	-1.019	0.729	95

7.3.1.3 Closest Plants to Centroids

Representative Plants - TMEX
plant	genotype	PC1	PC2	distance_to_centroid
373	CTRL	0.775	-0.733	0.168
213	INV4M	-0.915	0.930	0.227

7.3.2 DONOR: MI21

7.3.2.1 Variance Explained

Variance Explained - MI21
	PC1	PC2	PC3	PC4	PC5
Standard deviation	2.1030	1.3756	1.3032	1.1607	1.0693
Proportion of Variance	0.2602	0.1113	0.0999	0.0792	0.0673
Cumulative Proportion	0.2602	0.3715	0.4714	0.5506	0.6179

7.3.2.2 Genotype Centroids

Genotype Centroids - MI21
genotype	centroid_pc1	centroid_pc2	n_plants
CTRL	-0.280	0.263	112
INV4M	0.234	-0.306	114

7.3.2.3 Closest Plants to Centroids

Representative Plants - MI21
plant	genotype	PC1	PC2	distance_to_centroid
353	CTRL	-0.103	0.392	0.219
327	INV4M	0.207	-0.469	0.165

7.4 PCA Visualizations

# Create and display plots
pca_plot <- plot_pca_results(pca_analysis)
print(pca_plot)

7.5 Summary: All Representative Plants with Plant Height

# Debug: Check what columns are in the closest plants data
cat("Columns in all_closest_plants:\n")

## Columns in all_closest_plants:

print(colnames(pca_analysis$all_closest_plants))

##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"

# Get plant height data from original metadata (PH column)
plant_height_data <- data %>%
  select(plant, plant_height_cm = PH) %>%
  filter(!is.na(plant_height_cm))

# Show some basic stats about plant height data
cat("\nPlant height data summary:\n")

## 
## Plant height data summary:

cat("Total plants with height data:", nrow(plant_height_data), "\n")

## Total plants with height data: 442

cat("Height range:", min(plant_height_data$plant_height_cm, na.rm = TRUE), "-", 
    max(plant_height_data$plant_height_cm, na.rm = TRUE), "cm\n")

## Height range: 159 - 225 cm

cat("Mean height:", round(mean(plant_height_data$plant_height_cm, na.rm = TRUE), 1), "cm\n\n")

## Mean height: 191.6 cm

# Get metadata for merging donor information
cat("Columns in metadata:\n")

## Columns in metadata:

print(colnames(metadata))

##  [1] "plant"    "row_id"   "plot_id"  "plot_row" "plot_col" "rep"     
##  [7] "Y_pos"    "X_pos"    "donor"    "inv4m_gt" "group"

metadata_for_merge <- metadata %>%
  select(plant, donor, inv4m_gt)

cat("\nColumns in metadata_for_merge:\n")

## 
## Columns in metadata_for_merge:

print(colnames(metadata_for_merge))

## [1] "plant"    "donor"    "inv4m_gt"

# Combine with closest plants data - first check what we have
cat("\nFirst few rows of all_closest_plants:\n")

## 
## First few rows of all_closest_plants:

print(head(pca_analysis$all_closest_plants))

## # A tibble: 4 × 33
##      PC1    PC2    PC3     PC4    PC5      PC6      PC7     PC8    PC9   PC10
##    <dbl>  <dbl>  <dbl>   <dbl>  <dbl>    <dbl>    <dbl>   <dbl>  <dbl>  <dbl>
## 1  0.775 -0.733  2.12   1.90   -0.726 -0.415   -0.902   -0.837  -0.738  0.124
## 2 -0.915  0.930 -0.571 -0.208   0.161 -0.316   -1.17    -1.40    0.342 -1.12 
## 3 -0.103  0.392  1.43  -0.357  -0.494  2.38     0.219   -0.113  -0.117  0.185
## 4  0.207 -0.469  1.34  -0.0304 -0.316  0.00175 -0.00317  0.0982 -0.780  1.26 
## # ℹ 23 more variables: PC11 <dbl>, PC12 <dbl>, PC13 <dbl>, PC14 <dbl>,
## #   PC15 <dbl>, plant <int>, row_id <int>, plot_id <int>, plot_row <int>,
## #   plot_col <int>, rep <int>, Y_pos <dbl>, X_pos <dbl>, donor.x <chr>,
## #   genotype <chr>, group <chr>, centroid_pc1 <dbl>, centroid_pc2 <dbl>,
## #   n_plants <int>, donor.y <chr>, distance_to_centroid <dbl>, PC16 <dbl>,
## #   PC17 <dbl>

# Create the merged dataset step by step
step1 <- pca_analysis$all_closest_plants %>%
  left_join(metadata_for_merge, by = "plant")

cat("\nColumns after joining with metadata:\n")

## 
## Columns after joining with metadata:

print(colnames(step1))

##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"                
## [34] "donor"                "inv4m_gt"

step2 <- step1 %>%
  left_join(plant_height_data, by = "plant")

cat("\nColumns after joining with height data:\n")

## 
## Columns after joining with height data:

print(colnames(step2))

##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "PC16"                 "PC17"                
## [34] "donor"                "inv4m_gt"             "plant_height_cm"

# Now safely select and rename columns
available_cols <- colnames(step2)
select_cols <- c("plant")

if ("donor" %in% available_cols) select_cols <- c(select_cols, "donor")
if ("inv4m_gt" %in% available_cols) select_cols <- c(select_cols, "inv4m_gt")
if ("genotype" %in% available_cols) select_cols <- c(select_cols, "genotype")
if ("PC1" %in% available_cols) select_cols <- c(select_cols, "PC1")
if ("PC2" %in% available_cols) select_cols <- c(select_cols, "PC2")
if ("distance_to_centroid" %in% available_cols) select_cols <- c(select_cols, "distance_to_centroid")
if ("plant_height_cm" %in% available_cols) select_cols <- c(select_cols, "plant_height_cm")

cat("\nColumns to select:", paste(select_cols, collapse = ", "), "\n")

## 
## Columns to select: plant, donor, inv4m_gt, genotype, PC1, PC2, distance_to_centroid, plant_height_cm

final_summary_with_height <- step2 %>%
  select(all_of(select_cols))

# Rename genotype column if needed
if ("inv4m_gt" %in% colnames(final_summary_with_height) && !"genotype" %in% colnames(final_summary_with_height)) {
  final_summary_with_height <- final_summary_with_height %>%
    rename(genotype = inv4m_gt)
}

# Round numeric columns
numeric_cols <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(final_summary_with_height))
if (length(numeric_cols) > 0) {
  final_summary_with_height <- final_summary_with_height %>%
    mutate(across(all_of(numeric_cols), ~round(.x, 3)))
}

if ("plant_height_cm" %in% colnames(final_summary_with_height)) {
  final_summary_with_height <- final_summary_with_height %>%
    mutate(plant_height_cm = round(plant_height_cm, 1))
}

# Arrange if we have both donor and genotype
if (all(c("donor", "genotype") %in% colnames(final_summary_with_height))) {
  final_summary_with_height <- final_summary_with_height %>%
    arrange(donor, genotype)
}

# Show the final data structure
cat("\nFinal summary structure:\n")

## 
## Final summary structure:

print(str(final_summary_with_height))

## tibble [4 × 8] (S3: tbl_df/tbl/data.frame)
##  $ plant               : int [1:4] 353 327 373 213
##  $ donor               : chr [1:4] "MI21" "MI21" "TMEX" "TMEX"
##  $ inv4m_gt            : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
##  $ genotype            : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
##  $ PC1                 : num [1:4] -0.103 0.207 0.775 -0.915
##  $ PC2                 : num [1:4] 0.392 -0.469 -0.733 0.93
##  $ distance_to_centroid: num [1:4] 0.219 0.165 0.168 0.227
##  $ plant_height_cm     : num [1:4] 170 183 195 205
## NULL

cat("\nFirst few rows:\n")

## 
## First few rows:

print(head(final_summary_with_height))

## # A tibble: 4 × 8
##   plant donor inv4m_gt genotype    PC1    PC2 distance_to_centroid
##   <int> <chr> <chr>    <chr>     <dbl>  <dbl>                <dbl>
## 1   353 MI21  CTRL     CTRL     -0.103  0.392                0.219
## 2   327 MI21  INV4M    INV4M     0.207 -0.469                0.165
## 3   373 TMEX  CTRL     CTRL      0.775 -0.733                0.168
## 4   213 TMEX  INV4M    INV4M    -0.915  0.93                 0.227
## # ℹ 1 more variable: plant_height_cm <dbl>

# Create the table
kable(final_summary_with_height, 
      caption = "Most Representative Plants by Donor-Genotype with Plant Height") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Most Representative Plants by Donor-Genotype with Plant Height
plant	donor	inv4m_gt	genotype	PC1	PC2	distance_to_centroid	plant_height_cm
353	MI21	CTRL	CTRL	-0.103	0.392	0.219	170
327	MI21	INV4M	INV4M	0.207	-0.469	0.165	183
373	TMEX	CTRL	CTRL	0.775	-0.733	0.168	195
213	TMEX	INV4M	INV4M	-0.915	0.930	0.227	205

7.6 Extended Table: Top 4 Plants Closest to Each Centroid

# Extended table with top 4 closest plants per genotype
cat("Creating extended summary with top 4 plants closest to each centroid...\n")

## Creating extended summary with top 4 plants closest to each centroid...

# Check if the top4 data exists
if ("all_top4_closest_plants" %in% names(pca_analysis)) {
  cat("Top 4 data found. Number of rows:", nrow(pca_analysis$all_top4_closest_plants), "\n")
  
  # Create extended summary with top 4 plants
  extended_summary <- pca_analysis$all_top4_closest_plants %>%
    left_join(metadata_for_merge, by = "plant") %>%
    left_join(plant_height_data, by = "plant")
  
  cat("Columns in extended summary:\n")
  print(colnames(extended_summary))
  
  # Select and arrange columns safely
  available_cols_ext <- colnames(extended_summary)
  select_cols_ext <- c("plant")
  
  if ("donor" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "donor")
  if ("inv4m_gt" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "inv4m_gt")
  if ("genotype" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "genotype")
  if ("rank" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "rank")
  if ("PC1" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC1")
  if ("PC2" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC2")
  if ("distance_to_centroid" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "distance_to_centroid")
  if ("plant_height_cm" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "plant_height_cm")
  
  cat("Columns to select for extended table:", paste(select_cols_ext, collapse = ", "), "\n")
  
  extended_summary_clean <- extended_summary %>%
    select(all_of(select_cols_ext))
  
  # Rename genotype column if needed
  if ("inv4m_gt" %in% colnames(extended_summary_clean) && !"genotype" %in% colnames(extended_summary_clean)) {
    extended_summary_clean <- extended_summary_clean %>%
      rename(genotype = inv4m_gt)
  }
  
  # Round numeric columns
  numeric_cols_ext <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(extended_summary_clean))
  if (length(numeric_cols_ext) > 0) {
    extended_summary_clean <- extended_summary_clean %>%
      mutate(across(all_of(numeric_cols_ext), ~round(.x, 3)))
  }
  
  if ("plant_height_cm" %in% colnames(extended_summary_clean)) {
    extended_summary_clean <- extended_summary_clean %>%
      mutate(plant_height_cm = round(plant_height_cm, 1))
  }
  
  # Arrange if we have required columns
  if (all(c("donor", "genotype", "rank") %in% colnames(extended_summary_clean))) {
    extended_summary_clean <- extended_summary_clean %>%
      arrange(donor, genotype, rank)
  }
  
  cat("Final extended table dimensions:", nrow(extended_summary_clean), "x", ncol(extended_summary_clean), "\n")
  
  # Display the extended table
  kable(extended_summary_clean, 
        caption = "Top 4 Plants Closest to Each Centroid by Donor-Genotype",
        align = "c") %>%
    kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
    group_rows("MI21 - CTRL", 1, 4) %>%
    group_rows("MI21 - INV4M", 5, 8) %>%
    group_rows("TMEX - CTRL", 9, 12) %>%
    group_rows("TMEX - INV4M", 13, 16)
  
} else {
  cat("Error: Top 4 closest plants data not found in analysis results.\n")
}

## Top 4 data found. Number of rows: 16 
## Columns in extended summary:
##  [1] "PC1"                  "PC2"                  "PC3"                 
##  [4] "PC4"                  "PC5"                  "PC6"                 
##  [7] "PC7"                  "PC8"                  "PC9"                 
## [10] "PC10"                 "PC11"                 "PC12"                
## [13] "PC13"                 "PC14"                 "PC15"                
## [16] "plant"                "row_id"               "plot_id"             
## [19] "plot_row"             "plot_col"             "rep"                 
## [22] "Y_pos"                "X_pos"                "donor.x"             
## [25] "genotype"             "group"                "centroid_pc1"        
## [28] "centroid_pc2"         "n_plants"             "donor.y"             
## [31] "distance_to_centroid" "rank"                 "PC16"                
## [34] "PC17"                 "donor"                "inv4m_gt"            
## [37] "plant_height_cm"     
## Columns to select for extended table: plant, donor, inv4m_gt, genotype, rank, PC1, PC2, distance_to_centroid, plant_height_cm 
## Final extended table dimensions: 16 x 9

Top 4 Plants Closest to Each Centroid by Donor-Genotype
plant	donor	inv4m_gt	genotype	rank	PC1	PC2	distance_to_centroid	plant_height_cm
MI21 - CTRL
353	MI21	CTRL	CTRL	1	-0.103	0.392	0.219	170
43	MI21	CTRL	CTRL	2	-0.426	0.524	0.299	199
245	MI21	CTRL	CTRL	3	-0.260	-0.039	0.303	186
109	MI21	CTRL	CTRL	4	-0.592	0.294	0.314	192
MI21 - INV4M
327	MI21	INV4M	INV4M	1	0.207	-0.469	0.165	183
9	MI21	INV4M	INV4M	2	0.104	-0.486	0.221	185
187	MI21	INV4M	INV4M	3	0.434	-0.063	0.316	167
135	MI21	INV4M	INV4M	4	0.596	-0.457	0.392	186
TMEX - CTRL
373	TMEX	CTRL	CTRL	1	0.775	-0.733	0.168	195
74	TMEX	CTRL	CTRL	2	0.839	-0.810	0.178	207
335	TMEX	CTRL	CTRL	3	0.868	-0.916	0.208	204
100	TMEX	CTRL	CTRL	4	0.593	-1.060	0.211	184
TMEX - INV4M
213	TMEX	INV4M	INV4M	1	-0.915	0.930	0.227	205
130	TMEX	INV4M	INV4M	2	-1.057	0.953	0.228	214
174	TMEX	INV4M	INV4M	3	-1.146	0.956	0.259	205
363	TMEX	INV4M	INV4M	4	-1.099	0.480	0.261	190

7.7 Mean Plant Height by Donor-Genotype Group

# Calculate mean plant height for all plants by donor-genotype combination
all_plants_with_metadata <- data %>%
  select(plant, donor, inv4m_gt, PH) %>%
  rename(genotype = inv4m_gt, plant_height_cm = PH) %>%
  filter(!is.na(plant_height_cm))

# Calculate summary statistics
height_summary_all <- all_plants_with_metadata %>%
  group_by(donor, genotype) %>%
  summarise(
    n_plants = n(),
    mean_height = round(mean(plant_height_cm, na.rm = TRUE), 1),
    sd_height = round(sd(plant_height_cm, na.rm = TRUE), 1),
    min_height = round(min(plant_height_cm, na.rm = TRUE), 1),
    max_height = round(max(plant_height_cm, na.rm = TRUE), 1),
    .groups = "drop"
  ) %>%
  arrange(donor, genotype)

kable(height_summary_all,
      caption = "Plant Height Summary for All Plants by Donor-Genotype",
      col.names = c("Donor", "Genotype", "N Plants", "Mean Height (cm)", 
                   "SD Height (cm)", "Min Height (cm)", "Max Height (cm)"),
      align = c('c', 'c', 'c', 'c', 'c', 'c', 'c')) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Plant Height Summary for All Plants by Donor-Genotype
Donor	Genotype	N Plants	Mean Height (cm)	SD Height (cm)	Min Height (cm)	Max Height (cm)
MI21	CTRL	113	187.1	7.9	163	206
MI21	INV4M	119	182.9	7.9	159	197
TMEX	CTRL	105	193.8	10.1	172	221
TMEX	INV4M	105	204.1	11.4	167	225

# Statistical comparison between genotypes within each donor
cat("\nHeight Differences Between Genotypes:\n")

## 
## Height Differences Between Genotypes:

height_comparison_all <- all_plants_with_metadata %>%
  group_by(donor) %>%
  summarise(
    inv4m_mean = round(mean(plant_height_cm[genotype == "INV4M"], na.rm = TRUE), 1),
    ctrl_mean = round(mean(plant_height_cm[genotype == "CTRL"], na.rm = TRUE), 1),
    height_difference = round(inv4m_mean - ctrl_mean, 1),
    .groups = "drop"
  )

kable(height_comparison_all,
      caption = "Height Comparison: Inv4m vs Control by Donor (All Plants)",
      col.names = c("Donor", "Inv4m Mean (cm)", "Control Mean (cm)", "Difference (cm)"),
      align = c('c', 'c', 'c', 'c')) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))

Height Comparison: Inv4m vs Control by Donor (All Plants)
Donor	Inv4m Mean (cm)	Control Mean (cm)	Difference (cm)
MI21	182.9	187.1	-4.2
TMEX	204.1	193.8	10.3

Analysis completed on: 2025-07-09 20:26:51.424791

PCA Analysis of Internode Length Data for Inv4m Study

Maize Genetics Lab

2025-07-09