Data Cleaning and
Preparation
# Data cleaning and preparation
phytomere_processed <- metadata %>%
inner_join(phytomere, by = "plant") %>%
rename(genotype = "inv4m_gt") %>%
filter(!is.na(length)) %>%
filter(!(grepl("no tassel", notes))) %>%
group_by(plant) %>%
arrange(-internode) %>%
mutate(from_top = row_number()) %>%
filter(donor == "MI21" | donor == "TMEX" & from_top <= 10)
cat("Columns in phytomere_processed:\n")
## Columns in phytomere_processed:
print(colnames(phytomere_processed))
## [1] "plant" "row_id" "plot_id" "plot_row" "plot_col" "rep"
## [7] "Y_pos" "X_pos" "donor" "genotype" "group" "internode"
## [13] "has_ear" "length" "length_1" "length_2" "notes" "from_top"
# Create PCA input data
for_pca <- phytomere_processed %>%
select(plant, internode, length) %>%
pivot_wider(names_from = "internode",
values_from = "length",
names_prefix = "top") %>%
arrange(plant)
cat("\nPCA data dimensions:", dim(for_pca))
##
## PCA data dimensions: 424 17
cat("\nColumns in PCA data (plant ID will be kept for identification but excluded from PCA):\n")
##
## Columns in PCA data (plant ID will be kept for identification but excluded from PCA):
print(colnames(for_pca))
## [1] "plant" "top16" "top15" "top14" "top13" "top12" "top11" "top10" "top9"
## [10] "top8" "top7" "top6" "top5" "top4" "top3" "top2" "top1"
# Check which columns are actually internode measurements (exclude plant ID)
internode_cols <- colnames(for_pca)[colnames(for_pca) != "plant"]
cat("\nInternode measurement columns to be used in PCA:", length(internode_cols), "columns\n")
##
## Internode measurement columns to be used in PCA: 16 columns
# Calculate overall missing data percentage
pca_matrix <- for_pca %>% select(all_of(internode_cols))
total_cells <- nrow(pca_matrix) * ncol(pca_matrix)
missing_cells <- sum(is.na(pca_matrix))
missing_percentage <- round((missing_cells / total_cells) * 100, 1)
cat("Missing data: ", missing_percentage, "% of matrix positions\n")
## Missing data: 25.5 % of matrix positions
cat("(", missing_cells, " missing out of ", total_cells, " total values)\n")
## ( 1836 missing out of 7208 total values)
PCA Analysis
Functions
# Function to detect and remove outliers post-hoc from PC space
remove_outliers_posthoc <- function(pc_scores, threshold_sd = 3) {
# Calculate distances from center in PC space
pc1_mean <- mean(pc_scores$PC1, na.rm = TRUE)
pc2_mean <- mean(pc_scores$PC2, na.rm = TRUE)
pc1_sd <- sd(pc_scores$PC1, na.rm = TRUE)
pc2_sd <- sd(pc_scores$PC2, na.rm = TRUE)
# Identify outliers as points beyond threshold_sd standard deviations
pc1_outliers <- abs(pc_scores$PC1 - pc1_mean) > (threshold_sd * pc1_sd)
pc2_outliers <- abs(pc_scores$PC2 - pc2_mean) > (threshold_sd * pc2_sd)
# Plants that are outliers in either PC1 or PC2
outlier_mask <- pc1_outliers | pc2_outliers
outlier_plants <- pc_scores$plant[outlier_mask]
# Return clean data
clean_pc_scores <- pc_scores[!outlier_mask, ]
list(
clean_pc_scores = clean_pc_scores,
outlier_plants = outlier_plants,
n_outliers = length(outlier_plants)
)
}
# Function to perform PCA and find centroid representatives
perform_pca_analysis <- function(pca_data, metadata_df, remove_outliers_flag = FALSE) {
# Skip pre-PCA outlier removal
outlier_info <- NULL
# Prepare data for PCA with mean imputation
pca_matrix <- pca_data %>%
select(-plant) %>%
as.matrix()
# Mean imputation for missing values
for (col in 1:ncol(pca_matrix)) {
col_mean <- mean(pca_matrix[, col], na.rm = TRUE)
pca_matrix[is.na(pca_matrix[, col]), col] <- col_mean
}
# Get all plants (no removal due to missing data)
plants_all <- pca_data$plant
# Get metadata for all plants - check for genotype column
metadata_clean <- metadata_df %>%
filter(plant %in% plants_all) %>%
arrange(match(plant, plants_all))
# Check if genotype column exists, if not use inv4m_gt
if (!"genotype" %in% colnames(metadata_clean)) {
if ("inv4m_gt" %in% colnames(metadata_clean)) {
metadata_clean <- metadata_clean %>%
rename(genotype = inv4m_gt)
} else {
stop("Neither 'genotype' nor 'inv4m_gt' column found in metadata")
}
}
# Create combined PC scores dataframe first
all_pc_scores <- data.frame(plant = plants_all) %>%
left_join(metadata_clean, by = "plant")
# Perform separate PCA for each donor
donors <- unique(all_pc_scores$donor)
pca_results_by_donor <- list()
for (donor_name in donors) {
cat("\nPerforming PCA for donor:", donor_name, "\n")
# Filter data for this donor
donor_plants <- all_pc_scores %>%
filter(donor == donor_name) %>%
pull(plant)
donor_indices <- which(plants_all %in% donor_plants)
donor_matrix <- pca_matrix[donor_indices, ]
# Check for constant columns (zero variance)
col_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
constant_cols <- which(col_vars == 0 | is.na(col_vars))
if (length(constant_cols) > 0) {
cat("Removing", length(constant_cols), "constant/zero variance columns for", donor_name, ":\n")
cat("Columns removed:", colnames(donor_matrix)[constant_cols], "\n")
donor_matrix <- donor_matrix[, -constant_cols, drop = FALSE]
}
# Check if we have enough columns left for PCA
if (ncol(donor_matrix) < 2) {
cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
next
}
# Check for near-constant columns (very low variance)
remaining_vars <- apply(donor_matrix, 2, var, na.rm = TRUE)
low_var_threshold <- 1e-10
near_constant <- which(remaining_vars < low_var_threshold)
if (length(near_constant) > 0) {
cat("Removing", length(near_constant), "near-constant columns for", donor_name, "\n")
donor_matrix <- donor_matrix[, -near_constant, drop = FALSE]
}
# Final check
if (ncol(donor_matrix) < 2) {
cat("Warning: Less than 2 variables remaining for", donor_name, ". Skipping PCA.\n")
next
}
# Perform PCA for this donor
pca_result <- prcomp(donor_matrix, center = TRUE, scale. = TRUE)
# Extract PC scores
pc_scores <- pca_result$x %>%
as.data.frame() %>%
mutate(plant = donor_plants) %>%
left_join(metadata_clean, by = "plant")
# Post-hoc outlier removal in PC space
if (remove_outliers_flag) {
cat("Removing post-hoc outliers in PC space for", donor_name, "\n")
outlier_results <- remove_outliers_posthoc(pc_scores, threshold_sd = 3)
pc_scores_clean <- outlier_results$clean_pc_scores
outlier_plants <- outlier_results$outlier_plants
cat(" - Outliers removed:", length(outlier_plants), "\n")
if (length(outlier_plants) > 0) {
cat(" - Outlier plants:", paste(outlier_plants, collapse = ", "), "\n")
}
} else {
pc_scores_clean <- pc_scores
outlier_plants <- character(0)
}
# Calculate centroids for each genotype within this donor (using clean data)
centroids <- pc_scores_clean %>%
group_by(genotype) %>%
summarise(
centroid_pc1 = mean(PC1, na.rm = TRUE),
centroid_pc2 = mean(PC2, na.rm = TRUE),
n_plants = n(),
.groups = "drop"
) %>%
mutate(donor = donor_name)
# Find plant closest to each centroid (using clean data)
closest_plants <- pc_scores_clean %>%
left_join(centroids, by = "genotype") %>%
mutate(
distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
) %>%
group_by(genotype) %>%
slice_min(distance_to_centroid, n = 1) %>%
ungroup()
# Find top 4 plants closest to each centroid (for extended table)
top4_closest_plants <- pc_scores_clean %>%
left_join(centroids, by = "genotype") %>%
mutate(
distance_to_centroid = sqrt((PC1 - centroid_pc1)^2 + (PC2 - centroid_pc2)^2)
) %>%
group_by(genotype) %>%
slice_min(distance_to_centroid, n = 4) %>%
mutate(rank = row_number()) %>%
ungroup()
# Store results for this donor
pca_results_by_donor[[donor_name]] <- list(
pca_result = pca_result,
pc_scores = pc_scores, # Keep original for plotting
pc_scores_clean = pc_scores_clean, # Clean for centroids
centroids = centroids,
closest_plants = closest_plants,
top4_closest_plants = top4_closest_plants,
variance_explained = summary(pca_result)$importance,
removed_columns = c(constant_cols, near_constant),
n_variables_used = ncol(donor_matrix),
outlier_plants = outlier_plants
)
}
# Combine results
all_centroids <- bind_rows(lapply(pca_results_by_donor, function(x) x$centroids))
all_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$closest_plants))
all_top4_closest_plants <- bind_rows(lapply(pca_results_by_donor, function(x) x$top4_closest_plants))
# Return results with imputation and outlier info
list(
pca_results_by_donor = pca_results_by_donor,
all_centroids = all_centroids,
all_closest_plants = all_closest_plants,
all_top4_closest_plants = all_top4_closest_plants,
n_plants_analyzed = nrow(all_pc_scores),
donors = donors,
outlier_info = outlier_info,
imputation_summary = data.frame(
variable = colnames(pca_matrix),
n_missing = sapply(1:ncol(pca_data %>% select(-plant)),
function(i) sum(is.na(pca_data %>% select(-plant) %>% pull(i)))),
mean_imputed = sapply(1:ncol(pca_matrix), function(i) mean(pca_matrix[, i]))
)
)
}
# Function to create PCA visualization
plot_pca_results <- function(analysis_results) {
donors <- analysis_results$donors
plot_list <- list()
for (donor_name in donors) {
# Check if PCA was actually performed for this donor
if (donor_name %in% names(analysis_results$pca_results_by_donor)) {
donor_results <- analysis_results$pca_results_by_donor[[donor_name]]
pc_scores <- donor_results$pc_scores
centroids <- donor_results$centroids
closest_plants <- donor_results$closest_plants
var_exp <- donor_results$variance_explained
# Calculate variance explained percentages
pc1_var <- round(var_exp[2, 1] * 100, 1)
pc2_var <- round(var_exp[2, 2] * 100, 1)
# Create PCA plot for this donor (using cleaned data for plotting)
pc_scores_to_plot <- if(length(donor_results$outlier_plants) > 0) {
donor_results$pc_scores_clean
} else {
donor_results$pc_scores
}
p <- ggplot(pc_scores_to_plot, aes(x = PC1, y = PC2, color = genotype)) +
geom_point(size = 3, alpha = 0.8) +
geom_point(data = centroids,
aes(x = centroid_pc1, y = centroid_pc2, color = genotype),
size = 12, shape = 15, alpha = 0.9) +
geom_point(data = closest_plants,
aes(x = PC1, y = PC2, color = genotype),
size = 4, shape = 1, stroke = 2) +
geom_text(data = closest_plants,
aes(x = PC1, y = PC2, label = plant),
nudge_y = 0.3, size = 6, fontface = "bold",
show.legend = FALSE) +
scale_color_manual(
values = c("INV4M" = "purple4", "CTRL" = "gold"),
labels = c("INV4M" = "Inv4m", "CTRL" = "Control"),
name = "Genotype"
) +
labs(
title = paste("PCA for Donor:", donor_name),
subtitle = paste("Squares = centroids, Circles = closest plants",
if(length(donor_results$outlier_plants) > 0)
paste("(", length(donor_results$outlier_plants), "outliers removed)")
else ""),
x = paste0("PC1 (", pc1_var, "% variance)"),
y = paste0("PC2 (", pc2_var, "% variance)")
) +
theme_minimal() +
theme(
plot.title = element_text(size = 12, face = "bold"),
plot.subtitle = element_text(size = 10),
legend.position = "bottom"
)
plot_list[[donor_name]] <- p
} else {
# Create placeholder plot for skipped donors
p <- ggplot() +
annotate("text", x = 0, y = 0,
label = paste("PCA analysis skipped for", donor_name, "\n(insufficient variable variance)"),
size = 5, hjust = 0.5, vjust = 0.5) +
labs(title = paste("PCA for Donor:", donor_name)) +
theme_void() +
theme(plot.title = element_text(size = 12, face = "bold", hjust = 0.5))
plot_list[[donor_name]] <- p
}
}
# Combine plots
if (length(plot_list) == 2) {
combined_plot <- ggarrange(plotlist = plot_list, ncol = 2, nrow = 1,
common.legend = TRUE, legend = "bottom")
} else {
combined_plot <- plot_list[[1]]
}
return(combined_plot)
}
Results
Outlier Detection
Summary
if (!is.null(pca_analysis$outlier_info)) {
cat("Outlier Detection Results by Donor:\n")
cat("==================================\n")
cat("Method: Mahalanobis Distance (per donor)\n")
cat("Threshold: χ² 97.5th percentile\n\n")
# Create summary table for all donors
outlier_summary_list <- list()
for (donor_name in names(pca_analysis$outlier_info)) {
donor_outliers <- pca_analysis$outlier_info[[donor_name]]
cat("DONOR:", donor_name, "\n")
cat(" - Original plants:", donor_outliers$n_plants_original, "\n")
cat(" - Outliers detected:", donor_outliers$n_outliers, "\n")
cat(" - Threshold used:", round(donor_outliers$threshold, 3), "\n")
if (donor_outliers$n_outliers > 0) {
cat(" - Outlier plants:", paste(donor_outliers$outlier_plants, collapse = ", "), "\n")
# Find indices of outlier plants in the distance vector
outlier_indices <- which(names(donor_outliers$mahalanobis_distances) %in% donor_outliers$outlier_plants)
# If names are not available, use plant order
if (length(outlier_indices) == 0) {
# Create a mapping based on original plant order
donor_data_temp <- combined_data %>% filter(donor == donor_name)
outlier_indices <- which(donor_data_temp$plant %in% donor_outliers$outlier_plants)
}
# Create data frame with matching lengths
if (length(outlier_indices) > 0 && length(outlier_indices) == length(donor_outliers$outlier_plants)) {
outlier_summary_list[[donor_name]] <- data.frame(
donor = rep(donor_name, length(donor_outliers$outlier_plants)),
outlier_plant = donor_outliers$outlier_plants,
mahalanobis_distance = donor_outliers$mahalanobis_distances[outlier_indices]
)
} else {
# Fallback: create data frame without distances if matching fails
outlier_summary_list[[donor_name]] <- data.frame(
donor = rep(donor_name, length(donor_outliers$outlier_plants)),
outlier_plant = donor_outliers$outlier_plants,
mahalanobis_distance = rep(NA, length(donor_outliers$outlier_plants))
)
}
}
cat("\n")
}
# Create combined outlier table if there are outliers
if (length(outlier_summary_list) > 0) {
all_outliers_table <- bind_rows(outlier_summary_list) %>%
arrange(donor, desc(mahalanobis_distance)) %>%
mutate(mahalanobis_distance = round(mahalanobis_distance, 3))
if (nrow(all_outliers_table) > 0) {
kable(all_outliers_table,
caption = "All Outliers Detected by Donor",
col.names = c("Donor", "Plant ID", "Mahalanobis Distance"),
align = c('c', 'c', 'c')) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
group_rows("MI21", 1, sum(all_outliers_table$donor == "MI21")) %>%
group_rows("TMEX", sum(all_outliers_table$donor == "MI21") + 1, nrow(all_outliers_table))
}
}
total_removed <- sum(sapply(pca_analysis$outlier_info, function(x) x$n_outliers))
cat("Total plants removed across all donors:", total_removed, "\n")
} else {
cat("No outlier detection performed.")
}
## No outlier detection performed.
Missing Data
Summary
kable(pca_analysis$imputation_summary,
caption = "Missing Data and Mean Imputation Summary",
digits = 3) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Missing Data and Mean Imputation Summary
variable
|
n_missing
|
mean_imputed
|
plant
|
0
|
218.222
|
top16
|
417
|
21.286
|
top15
|
378
|
22.130
|
top14
|
264
|
22.637
|
top13
|
206
|
19.950
|
top12
|
195
|
14.328
|
top11
|
195
|
12.459
|
top10
|
145
|
14.989
|
top9
|
13
|
18.664
|
top8
|
3
|
15.919
|
top7
|
3
|
14.435
|
top6
|
2
|
14.659
|
top5
|
1
|
14.170
|
top4
|
3
|
12.926
|
top3
|
3
|
12.012
|
top2
|
3
|
11.494
|
top1
|
5
|
9.692
|
cat("Total plants analyzed (after outlier removal):", pca_analysis$n_plants_analyzed)
## Total plants analyzed (after outlier removal): 424
PCA Results by
Donor
# Display results for each donor
for (donor_name in pca_analysis$donors) {
if (donor_name %in% names(pca_analysis$pca_results_by_donor)) {
cat("\n\n### DONOR:", donor_name, "\n")
donor_results <- pca_analysis$pca_results_by_donor[[donor_name]]
# Report removed columns if any
if (!is.null(donor_results$removed_columns) && length(donor_results$removed_columns) > 0) {
cat("\n**Note:** Removed", length(donor_results$removed_columns),
"constant/low-variance columns for this donor\n")
cat("Variables used in PCA:", donor_results$n_variables_used, "\n")
}
cat("\n#### Variance Explained\n")
var_table <- donor_results$variance_explained[1:3, 1:5] %>%
as.data.frame() %>%
round(4)
print(kable(var_table, caption = paste("Variance Explained -", donor_name)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
cat("\n#### Genotype Centroids\n")
centroid_table <- donor_results$centroids %>%
select(-donor) %>%
mutate(across(where(is.numeric), ~round(.x, 3)))
print(kable(centroid_table, caption = paste("Genotype Centroids -", donor_name)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
cat("\n#### Closest Plants to Centroids\n")
closest_table <- donor_results$closest_plants %>%
select(plant, genotype, PC1, PC2, distance_to_centroid) %>%
arrange(genotype) %>%
mutate(across(where(is.numeric), ~round(.x, 3)))
print(kable(closest_table, caption = paste("Representative Plants -", donor_name)) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")))
cat("\n")
} else {
cat("\n\n### DONOR:", donor_name, "\n")
cat("**PCA analysis skipped due to insufficient variable variance**\n\n")
}
}
DONOR: TMEX
Note: Removed 2 constant/low-variance columns for
this donor Variables used in PCA: 15
Variance
Explained
Variance Explained - TMEX
|
PC1
|
PC2
|
PC3
|
PC4
|
PC5
|
Standard deviation
|
1.9603
|
1.6366
|
1.4744
|
1.0899
|
1.0271
|
Proportion of Variance
|
0.2562
|
0.1786
|
0.1449
|
0.0792
|
0.0703
|
Cumulative Proportion
|
0.2562
|
0.4348
|
0.5797
|
0.6589
|
0.7292
|
Genotype
Centroids
Genotype Centroids - TMEX
genotype
|
centroid_pc1
|
centroid_pc2
|
n_plants
|
CTRL
|
0.668
|
-0.862
|
100
|
INV4M
|
-1.019
|
0.729
|
95
|
Closest Plants to
Centroids
Representative Plants - TMEX
plant
|
genotype
|
PC1
|
PC2
|
distance_to_centroid
|
373
|
CTRL
|
0.775
|
-0.733
|
0.168
|
213
|
INV4M
|
-0.915
|
0.930
|
0.227
|
DONOR: MI21
Variance
Explained
Variance Explained - MI21
|
PC1
|
PC2
|
PC3
|
PC4
|
PC5
|
Standard deviation
|
2.1030
|
1.3756
|
1.3032
|
1.1607
|
1.0693
|
Proportion of Variance
|
0.2602
|
0.1113
|
0.0999
|
0.0792
|
0.0673
|
Cumulative Proportion
|
0.2602
|
0.3715
|
0.4714
|
0.5506
|
0.6179
|
Genotype
Centroids
Genotype Centroids - MI21
genotype
|
centroid_pc1
|
centroid_pc2
|
n_plants
|
CTRL
|
-0.280
|
0.263
|
112
|
INV4M
|
0.234
|
-0.306
|
114
|
Closest Plants to
Centroids
Representative Plants - MI21
plant
|
genotype
|
PC1
|
PC2
|
distance_to_centroid
|
353
|
CTRL
|
-0.103
|
0.392
|
0.219
|
327
|
INV4M
|
0.207
|
-0.469
|
0.165
|
PCA
Visualizations
# Create and display plots
pca_plot <- plot_pca_results(pca_analysis)
print(pca_plot)

Summary: All
Representative Plants with Plant Height
# Debug: Check what columns are in the closest plants data
cat("Columns in all_closest_plants:\n")
## Columns in all_closest_plants:
print(colnames(pca_analysis$all_closest_plants))
## [1] "PC1" "PC2" "PC3"
## [4] "PC4" "PC5" "PC6"
## [7] "PC7" "PC8" "PC9"
## [10] "PC10" "PC11" "PC12"
## [13] "PC13" "PC14" "PC15"
## [16] "plant" "row_id" "plot_id"
## [19] "plot_row" "plot_col" "rep"
## [22] "Y_pos" "X_pos" "donor.x"
## [25] "genotype" "group" "centroid_pc1"
## [28] "centroid_pc2" "n_plants" "donor.y"
## [31] "distance_to_centroid" "PC16" "PC17"
# Get plant height data from original metadata (PH column)
plant_height_data <- data %>%
select(plant, plant_height_cm = PH) %>%
filter(!is.na(plant_height_cm))
# Show some basic stats about plant height data
cat("\nPlant height data summary:\n")
##
## Plant height data summary:
cat("Total plants with height data:", nrow(plant_height_data), "\n")
## Total plants with height data: 442
cat("Height range:", min(plant_height_data$plant_height_cm, na.rm = TRUE), "-",
max(plant_height_data$plant_height_cm, na.rm = TRUE), "cm\n")
## Height range: 159 - 225 cm
cat("Mean height:", round(mean(plant_height_data$plant_height_cm, na.rm = TRUE), 1), "cm\n\n")
## Mean height: 191.6 cm
# Get metadata for merging donor information
cat("Columns in metadata:\n")
## Columns in metadata:
print(colnames(metadata))
## [1] "plant" "row_id" "plot_id" "plot_row" "plot_col" "rep"
## [7] "Y_pos" "X_pos" "donor" "inv4m_gt" "group"
metadata_for_merge <- metadata %>%
select(plant, donor, inv4m_gt)
cat("\nColumns in metadata_for_merge:\n")
##
## Columns in metadata_for_merge:
print(colnames(metadata_for_merge))
## [1] "plant" "donor" "inv4m_gt"
# Combine with closest plants data - first check what we have
cat("\nFirst few rows of all_closest_plants:\n")
##
## First few rows of all_closest_plants:
print(head(pca_analysis$all_closest_plants))
## # A tibble: 4 × 33
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.775 -0.733 2.12 1.90 -0.726 -0.415 -0.902 -0.837 -0.738 0.124
## 2 -0.915 0.930 -0.571 -0.208 0.161 -0.316 -1.17 -1.40 0.342 -1.12
## 3 -0.103 0.392 1.43 -0.357 -0.494 2.38 0.219 -0.113 -0.117 0.185
## 4 0.207 -0.469 1.34 -0.0304 -0.316 0.00175 -0.00317 0.0982 -0.780 1.26
## # ℹ 23 more variables: PC11 <dbl>, PC12 <dbl>, PC13 <dbl>, PC14 <dbl>,
## # PC15 <dbl>, plant <int>, row_id <int>, plot_id <int>, plot_row <int>,
## # plot_col <int>, rep <int>, Y_pos <dbl>, X_pos <dbl>, donor.x <chr>,
## # genotype <chr>, group <chr>, centroid_pc1 <dbl>, centroid_pc2 <dbl>,
## # n_plants <int>, donor.y <chr>, distance_to_centroid <dbl>, PC16 <dbl>,
## # PC17 <dbl>
# Create the merged dataset step by step
step1 <- pca_analysis$all_closest_plants %>%
left_join(metadata_for_merge, by = "plant")
cat("\nColumns after joining with metadata:\n")
##
## Columns after joining with metadata:
print(colnames(step1))
## [1] "PC1" "PC2" "PC3"
## [4] "PC4" "PC5" "PC6"
## [7] "PC7" "PC8" "PC9"
## [10] "PC10" "PC11" "PC12"
## [13] "PC13" "PC14" "PC15"
## [16] "plant" "row_id" "plot_id"
## [19] "plot_row" "plot_col" "rep"
## [22] "Y_pos" "X_pos" "donor.x"
## [25] "genotype" "group" "centroid_pc1"
## [28] "centroid_pc2" "n_plants" "donor.y"
## [31] "distance_to_centroid" "PC16" "PC17"
## [34] "donor" "inv4m_gt"
step2 <- step1 %>%
left_join(plant_height_data, by = "plant")
cat("\nColumns after joining with height data:\n")
##
## Columns after joining with height data:
print(colnames(step2))
## [1] "PC1" "PC2" "PC3"
## [4] "PC4" "PC5" "PC6"
## [7] "PC7" "PC8" "PC9"
## [10] "PC10" "PC11" "PC12"
## [13] "PC13" "PC14" "PC15"
## [16] "plant" "row_id" "plot_id"
## [19] "plot_row" "plot_col" "rep"
## [22] "Y_pos" "X_pos" "donor.x"
## [25] "genotype" "group" "centroid_pc1"
## [28] "centroid_pc2" "n_plants" "donor.y"
## [31] "distance_to_centroid" "PC16" "PC17"
## [34] "donor" "inv4m_gt" "plant_height_cm"
# Now safely select and rename columns
available_cols <- colnames(step2)
select_cols <- c("plant")
if ("donor" %in% available_cols) select_cols <- c(select_cols, "donor")
if ("inv4m_gt" %in% available_cols) select_cols <- c(select_cols, "inv4m_gt")
if ("genotype" %in% available_cols) select_cols <- c(select_cols, "genotype")
if ("PC1" %in% available_cols) select_cols <- c(select_cols, "PC1")
if ("PC2" %in% available_cols) select_cols <- c(select_cols, "PC2")
if ("distance_to_centroid" %in% available_cols) select_cols <- c(select_cols, "distance_to_centroid")
if ("plant_height_cm" %in% available_cols) select_cols <- c(select_cols, "plant_height_cm")
cat("\nColumns to select:", paste(select_cols, collapse = ", "), "\n")
##
## Columns to select: plant, donor, inv4m_gt, genotype, PC1, PC2, distance_to_centroid, plant_height_cm
final_summary_with_height <- step2 %>%
select(all_of(select_cols))
# Rename genotype column if needed
if ("inv4m_gt" %in% colnames(final_summary_with_height) && !"genotype" %in% colnames(final_summary_with_height)) {
final_summary_with_height <- final_summary_with_height %>%
rename(genotype = inv4m_gt)
}
# Round numeric columns
numeric_cols <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(final_summary_with_height))
if (length(numeric_cols) > 0) {
final_summary_with_height <- final_summary_with_height %>%
mutate(across(all_of(numeric_cols), ~round(.x, 3)))
}
if ("plant_height_cm" %in% colnames(final_summary_with_height)) {
final_summary_with_height <- final_summary_with_height %>%
mutate(plant_height_cm = round(plant_height_cm, 1))
}
# Arrange if we have both donor and genotype
if (all(c("donor", "genotype") %in% colnames(final_summary_with_height))) {
final_summary_with_height <- final_summary_with_height %>%
arrange(donor, genotype)
}
# Show the final data structure
cat("\nFinal summary structure:\n")
##
## Final summary structure:
print(str(final_summary_with_height))
## tibble [4 × 8] (S3: tbl_df/tbl/data.frame)
## $ plant : int [1:4] 353 327 373 213
## $ donor : chr [1:4] "MI21" "MI21" "TMEX" "TMEX"
## $ inv4m_gt : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
## $ genotype : chr [1:4] "CTRL" "INV4M" "CTRL" "INV4M"
## $ PC1 : num [1:4] -0.103 0.207 0.775 -0.915
## $ PC2 : num [1:4] 0.392 -0.469 -0.733 0.93
## $ distance_to_centroid: num [1:4] 0.219 0.165 0.168 0.227
## $ plant_height_cm : num [1:4] 170 183 195 205
## NULL
cat("\nFirst few rows:\n")
##
## First few rows:
print(head(final_summary_with_height))
## # A tibble: 4 × 8
## plant donor inv4m_gt genotype PC1 PC2 distance_to_centroid
## <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 353 MI21 CTRL CTRL -0.103 0.392 0.219
## 2 327 MI21 INV4M INV4M 0.207 -0.469 0.165
## 3 373 TMEX CTRL CTRL 0.775 -0.733 0.168
## 4 213 TMEX INV4M INV4M -0.915 0.93 0.227
## # ℹ 1 more variable: plant_height_cm <dbl>
# Create the table
kable(final_summary_with_height,
caption = "Most Representative Plants by Donor-Genotype with Plant Height") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Most Representative Plants by Donor-Genotype with Plant Height
plant
|
donor
|
inv4m_gt
|
genotype
|
PC1
|
PC2
|
distance_to_centroid
|
plant_height_cm
|
353
|
MI21
|
CTRL
|
CTRL
|
-0.103
|
0.392
|
0.219
|
170
|
327
|
MI21
|
INV4M
|
INV4M
|
0.207
|
-0.469
|
0.165
|
183
|
373
|
TMEX
|
CTRL
|
CTRL
|
0.775
|
-0.733
|
0.168
|
195
|
213
|
TMEX
|
INV4M
|
INV4M
|
-0.915
|
0.930
|
0.227
|
205
|
Extended Table: Top 4
Plants Closest to Each Centroid
# Extended table with top 4 closest plants per genotype
cat("Creating extended summary with top 4 plants closest to each centroid...\n")
## Creating extended summary with top 4 plants closest to each centroid...
# Check if the top4 data exists
if ("all_top4_closest_plants" %in% names(pca_analysis)) {
cat("Top 4 data found. Number of rows:", nrow(pca_analysis$all_top4_closest_plants), "\n")
# Create extended summary with top 4 plants
extended_summary <- pca_analysis$all_top4_closest_plants %>%
left_join(metadata_for_merge, by = "plant") %>%
left_join(plant_height_data, by = "plant")
cat("Columns in extended summary:\n")
print(colnames(extended_summary))
# Select and arrange columns safely
available_cols_ext <- colnames(extended_summary)
select_cols_ext <- c("plant")
if ("donor" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "donor")
if ("inv4m_gt" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "inv4m_gt")
if ("genotype" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "genotype")
if ("rank" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "rank")
if ("PC1" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC1")
if ("PC2" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "PC2")
if ("distance_to_centroid" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "distance_to_centroid")
if ("plant_height_cm" %in% available_cols_ext) select_cols_ext <- c(select_cols_ext, "plant_height_cm")
cat("Columns to select for extended table:", paste(select_cols_ext, collapse = ", "), "\n")
extended_summary_clean <- extended_summary %>%
select(all_of(select_cols_ext))
# Rename genotype column if needed
if ("inv4m_gt" %in% colnames(extended_summary_clean) && !"genotype" %in% colnames(extended_summary_clean)) {
extended_summary_clean <- extended_summary_clean %>%
rename(genotype = inv4m_gt)
}
# Round numeric columns
numeric_cols_ext <- intersect(c("PC1", "PC2", "distance_to_centroid"), colnames(extended_summary_clean))
if (length(numeric_cols_ext) > 0) {
extended_summary_clean <- extended_summary_clean %>%
mutate(across(all_of(numeric_cols_ext), ~round(.x, 3)))
}
if ("plant_height_cm" %in% colnames(extended_summary_clean)) {
extended_summary_clean <- extended_summary_clean %>%
mutate(plant_height_cm = round(plant_height_cm, 1))
}
# Arrange if we have required columns
if (all(c("donor", "genotype", "rank") %in% colnames(extended_summary_clean))) {
extended_summary_clean <- extended_summary_clean %>%
arrange(donor, genotype, rank)
}
cat("Final extended table dimensions:", nrow(extended_summary_clean), "x", ncol(extended_summary_clean), "\n")
# Display the extended table
kable(extended_summary_clean,
caption = "Top 4 Plants Closest to Each Centroid by Donor-Genotype",
align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed")) %>%
group_rows("MI21 - CTRL", 1, 4) %>%
group_rows("MI21 - INV4M", 5, 8) %>%
group_rows("TMEX - CTRL", 9, 12) %>%
group_rows("TMEX - INV4M", 13, 16)
} else {
cat("Error: Top 4 closest plants data not found in analysis results.\n")
}
## Top 4 data found. Number of rows: 16
## Columns in extended summary:
## [1] "PC1" "PC2" "PC3"
## [4] "PC4" "PC5" "PC6"
## [7] "PC7" "PC8" "PC9"
## [10] "PC10" "PC11" "PC12"
## [13] "PC13" "PC14" "PC15"
## [16] "plant" "row_id" "plot_id"
## [19] "plot_row" "plot_col" "rep"
## [22] "Y_pos" "X_pos" "donor.x"
## [25] "genotype" "group" "centroid_pc1"
## [28] "centroid_pc2" "n_plants" "donor.y"
## [31] "distance_to_centroid" "rank" "PC16"
## [34] "PC17" "donor" "inv4m_gt"
## [37] "plant_height_cm"
## Columns to select for extended table: plant, donor, inv4m_gt, genotype, rank, PC1, PC2, distance_to_centroid, plant_height_cm
## Final extended table dimensions: 16 x 9
Top 4 Plants Closest to Each Centroid by Donor-Genotype
plant
|
donor
|
inv4m_gt
|
genotype
|
rank
|
PC1
|
PC2
|
distance_to_centroid
|
plant_height_cm
|
MI21 - CTRL
|
353
|
MI21
|
CTRL
|
CTRL
|
1
|
-0.103
|
0.392
|
0.219
|
170
|
43
|
MI21
|
CTRL
|
CTRL
|
2
|
-0.426
|
0.524
|
0.299
|
199
|
245
|
MI21
|
CTRL
|
CTRL
|
3
|
-0.260
|
-0.039
|
0.303
|
186
|
109
|
MI21
|
CTRL
|
CTRL
|
4
|
-0.592
|
0.294
|
0.314
|
192
|
MI21 - INV4M
|
327
|
MI21
|
INV4M
|
INV4M
|
1
|
0.207
|
-0.469
|
0.165
|
183
|
9
|
MI21
|
INV4M
|
INV4M
|
2
|
0.104
|
-0.486
|
0.221
|
185
|
187
|
MI21
|
INV4M
|
INV4M
|
3
|
0.434
|
-0.063
|
0.316
|
167
|
135
|
MI21
|
INV4M
|
INV4M
|
4
|
0.596
|
-0.457
|
0.392
|
186
|
TMEX - CTRL
|
373
|
TMEX
|
CTRL
|
CTRL
|
1
|
0.775
|
-0.733
|
0.168
|
195
|
74
|
TMEX
|
CTRL
|
CTRL
|
2
|
0.839
|
-0.810
|
0.178
|
207
|
335
|
TMEX
|
CTRL
|
CTRL
|
3
|
0.868
|
-0.916
|
0.208
|
204
|
100
|
TMEX
|
CTRL
|
CTRL
|
4
|
0.593
|
-1.060
|
0.211
|
184
|
TMEX - INV4M
|
213
|
TMEX
|
INV4M
|
INV4M
|
1
|
-0.915
|
0.930
|
0.227
|
205
|
130
|
TMEX
|
INV4M
|
INV4M
|
2
|
-1.057
|
0.953
|
0.228
|
214
|
174
|
TMEX
|
INV4M
|
INV4M
|
3
|
-1.146
|
0.956
|
0.259
|
205
|
363
|
TMEX
|
INV4M
|
INV4M
|
4
|
-1.099
|
0.480
|
0.261
|
190
|
Mean Plant Height by
Donor-Genotype Group
# Calculate mean plant height for all plants by donor-genotype combination
all_plants_with_metadata <- data %>%
select(plant, donor, inv4m_gt, PH) %>%
rename(genotype = inv4m_gt, plant_height_cm = PH) %>%
filter(!is.na(plant_height_cm))
# Calculate summary statistics
height_summary_all <- all_plants_with_metadata %>%
group_by(donor, genotype) %>%
summarise(
n_plants = n(),
mean_height = round(mean(plant_height_cm, na.rm = TRUE), 1),
sd_height = round(sd(plant_height_cm, na.rm = TRUE), 1),
min_height = round(min(plant_height_cm, na.rm = TRUE), 1),
max_height = round(max(plant_height_cm, na.rm = TRUE), 1),
.groups = "drop"
) %>%
arrange(donor, genotype)
kable(height_summary_all,
caption = "Plant Height Summary for All Plants by Donor-Genotype",
col.names = c("Donor", "Genotype", "N Plants", "Mean Height (cm)",
"SD Height (cm)", "Min Height (cm)", "Max Height (cm)"),
align = c('c', 'c', 'c', 'c', 'c', 'c', 'c')) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Plant Height Summary for All Plants by Donor-Genotype
Donor
|
Genotype
|
N Plants
|
Mean Height (cm)
|
SD Height (cm)
|
Min Height (cm)
|
Max Height (cm)
|
MI21
|
CTRL
|
113
|
187.1
|
7.9
|
163
|
206
|
MI21
|
INV4M
|
119
|
182.9
|
7.9
|
159
|
197
|
TMEX
|
CTRL
|
105
|
193.8
|
10.1
|
172
|
221
|
TMEX
|
INV4M
|
105
|
204.1
|
11.4
|
167
|
225
|
# Statistical comparison between genotypes within each donor
cat("\nHeight Differences Between Genotypes:\n")
##
## Height Differences Between Genotypes:
height_comparison_all <- all_plants_with_metadata %>%
group_by(donor) %>%
summarise(
inv4m_mean = round(mean(plant_height_cm[genotype == "INV4M"], na.rm = TRUE), 1),
ctrl_mean = round(mean(plant_height_cm[genotype == "CTRL"], na.rm = TRUE), 1),
height_difference = round(inv4m_mean - ctrl_mean, 1),
.groups = "drop"
)
kable(height_comparison_all,
caption = "Height Comparison: Inv4m vs Control by Donor (All Plants)",
col.names = c("Donor", "Inv4m Mean (cm)", "Control Mean (cm)", "Difference (cm)"),
align = c('c', 'c', 'c', 'c')) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Height Comparison: Inv4m vs Control by Donor (All Plants)
Donor
|
Inv4m Mean (cm)
|
Control Mean (cm)
|
Difference (cm)
|
MI21
|
182.9
|
187.1
|
-4.2
|
TMEX
|
204.1
|
193.8
|
10.3
|
Analysis completed on: 2025-07-09
20:26:51.424791