Phenotype data process and transformation

Field canopy (temperature and other traits)

Load data

### Load entry 
DB <- read.csv("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/01_Canopy/DIV_BLUPS_12Days-Canopy.csv", 
               stringsAsFactors = FALSE, header = T, row.names = 1)
nrow(DB)

## [1] 382

head(DB)

Time <- c("temp_189","temp_196", "temp_204", "temp_210", "temp_217", "temp_224", 
          "temp_231","temp_238", "temp_245", "temp_253", "temp_259", "temp_267", 
          "temp_33_189", "temp_33_196", "temp_33_204", "temp_33_210", "temp_33_217", "temp_33_224", 
          "temp_33_231","temp_33_238", "temp_33_245", "temp_33_253", "temp_33_259", "temp_33_267",
          "temp_10_189", "temp_10_196", "temp_10_204", "temp_10_210", "temp_10_217", "temp_10_224", 
          "temp_10_231","temp_10_238", "temp_10_245", "temp_10_253", "temp_10_259", "temp_10_267",
          "CTD_189", "CTD_196", "CTD_204", "CTD_210", "CTD_217", "CTD_224", 
          "CTD_231","CTD_238", "CTD_245", "CTD_253", "CTD_259", "CTD_267")

Plot all raw data and transformed data together

for (i in 1:length(Time)){

  temp.df <- DB[, grep(paste0(Time[i],"_"), colnames(DB), value = F)]
  temp.df$Accession <- rownames(temp.df)
  melt.df <- melt(temp.df, id.vars = "Accession", 
     variable.name = "condition", value.name = "Sum.Area")
  ### Add the calculation
  melt.df$Log10.Sum.area <- log10(abs(melt.df$Sum.Area))
  melt.df$Sqrt.Sum.area <- sqrt(abs(melt.df$Sum.Area))
  
  ### Calculate the mean for each group
  temp.mean1 <- ddply(melt.df %>% na.omit(), "condition", summarise, grp.mean=mean(Sum.Area))
  temp.mean2 <- ddply(melt.df %>% na.omit(), "condition", summarise, grp.mean.log=mean(Log10.Sum.area))
  temp.mean3 <- ddply(melt.df %>% na.omit(), "condition", summarise, grp.mean.sqrt=mean(Sqrt.Sum.area))
  
 A <-  ggplot(melt.df %>% filter(!grepl("both", condition, ignore.case = TRUE)) %>% na.omit(), 
              aes(x=Sum.Area, color = condition, fill = condition)) +
       geom_histogram(aes(y=..density..),alpha = 0.8)+
       scale_fill_manual(values=c("lightblue", "lightpink"))+
       scale_color_manual(values=c("darkblue", "red")) +
       theme_classic() +
       theme(legend.position = "bottom") +
       geom_density(alpha = 0.2)
 
 
  B  <-  ggplot(melt.df %>% filter(!grepl("both", condition, ignore.case = TRUE)) %>% na.omit(), 
              aes(x=Log10.Sum.area, color = condition, fill = condition)) +
       geom_histogram(aes(y=..density..),alpha = 0.8)+
       scale_fill_manual(values=c("lightblue", "lightpink"))+
       scale_color_manual(values=c("darkblue", "red")) +
       theme_classic() +
       theme(legend.position = "bottom") +
       geom_density(alpha = 0.2)
  
    C  <-  ggplot(melt.df %>% filter(!grepl("both", condition, ignore.case = TRUE)) %>% na.omit(), 
              aes(x=Sqrt.Sum.area, color = condition, fill = condition)) +
       geom_histogram(aes(y=..density..),alpha = 0.8)+
       scale_fill_manual(values=c("lightblue", "lightpink"))+
       scale_color_manual(values=c("darkblue", "red")) +
       theme_classic() +
       theme(legend.position = "bottom") +
       geom_density(alpha = 0.2)

   
   D <- ggplot(melt.df %>% filter(grepl("both", condition, ignore.case = TRUE)) %>% na.omit(), 
              aes(x=Sum.Area, color = condition, fill = condition)) +
       geom_histogram(aes(y=..density..),alpha = 0.8)+
       scale_fill_manual(values=c("grey70"))+
       scale_color_manual(values=c("grey30")) +
       theme_classic() +
       theme(legend.position = "bottom") +
       geom_density(alpha = 0.2)
   
   E <-  ggplot(melt.df %>% filter(grepl("both", condition, ignore.case = TRUE)) %>% na.omit(), 
              aes(x=Log10.Sum.area, color = condition, fill = condition)) +
       geom_histogram(aes(y=..density..),alpha = 0.8)+
       scale_fill_manual(values=c("grey70"))+
       scale_color_manual(values=c("grey30")) +
       theme_classic() +
       theme(legend.position = "bottom") +
       geom_density(alpha = 0.2)
   
   G <- ggplot(melt.df %>% filter(grepl("both", condition, ignore.case = TRUE)) %>% na.omit(), 
              aes(x=Sqrt.Sum.area, color = condition, fill = condition)) +
       geom_histogram(aes(y=..density..),alpha = 0.8)+
       scale_fill_manual(values=c("grey70"))+
       scale_color_manual(values=c("grey30")) +
       theme_classic() +
       theme(legend.position = "bottom") +
       geom_density(alpha = 0.2)
   
   ### Combine all plots for each day
   pdf(paste0("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/01_Canopy/02_Pheno-Nomality/Canopy_",Time[i], ".pdf"), 
       height = 10, width = 15)
   
    print(plot_grid(A, B, C, D, E, G, labels = c('A', 'B', 'C', 'D', 'E', 'G'), label_size = 12))
    dev.off()
   
}

Export statistical profile

for (i in 1:length(Time)){
  
  temp.df <- DB[, grep(paste0(Time[i],"_"), colnames(DB), value = F)]
  temp.df$Accession <- rownames(temp.df)

    ### Calculate the transformation
   temp.df$WL.sqrt <- sqrt(abs(temp.df[,1]))
   temp.df$WW.sqrt <- sqrt(abs(temp.df[,2]))
   temp.df$WL.log10 <- log10(abs(temp.df[,1]))
   temp.df$WW.log10 <- log10(abs(temp.df[,2]))
   temp.df$both.log10 <- log10(abs(temp.df[,3]))
   temp.df$both.sqrt <- sqrt(abs(temp.df[,3]))
   
   temp.clean <- temp.df[, colnames(temp.df) != "Accession"]
  
    ### Create an empty dataframe with specified row numbers and column names
    sum.df <- data.frame(matrix(nrow = 9, ncol = 7))
    colnames(sum.df) <- c("term", "shapiro.test(P)", "skewness", "kurtosis",
                    "Mean", "Media", "Delta")
    
    for (x in 1: nrow(sum.df)){
      sum.df[x,1] <- colnames(temp.clean[,x, drop = F])
      S.test <- shapiro.test(temp.clean[,x] %>% na.omit())
      sum.df[x,2] <- S.test[2]$p.value
      sum.df[x,3] <- skewness(temp.clean[,1] %>% na.omit())
      sum.df[x,4] <- kurtosis(temp.clean[,x] %>% na.omit())
      sum.df[x,5] <- mean(temp.clean[,x] %>% na.omit())
      sum.df[x,6] <- median(temp.clean[,x] %>% na.omit())
      sum.df[x,7] <- sum.df[x,6] - sum.df[x,5]
    }
    
    sum.df$Day <- Time[i]
    assign(paste0("Normal-distribution_statistics_", Time[i]), sum.df)
}

### Combine all list 
Stats_list <- lapply(ls(pattern = "Normal-distribution_statistics_"), get)
Normal_distribution_Stats <- bind_rows(Stats_list)
head(Normal_distribution_Stats, 30)

### Export report files 
write.csv(Normal_distribution_Stats, 
          "~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/01_Canopy/01_Normality-stats-canopy-temperature-daily.csv",quote = F)

Fiber quality and yield-related traits

Prepare dataset

### Load entry 
Fiber <- read.csv("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/02_Fiber/seed_fiber_blups_results-Normality.csv", header = T, row.names = 1)
head(Fiber)

Fiber.trait <- c("area","perimeter", "width", "height", "avg_oil", "pre_gin", "total_fiber",
           "total_seed", "seed_25_count_weight", "seed_index", "lint_percentage",
           "boll_size", "seed_per_boll", "lint_index", "NepSize","Neps_g",
           "L.w.", "L.w.CV", "UQL.w.", "SFC.w.", "L.n.", "L.n.CV", "SFC","L5",
           "Total", "TrashSize", "Dust", "Trash", "VFM", "SCNSize", "SCN", "Fine", 
           "IFC", "Mat.Ratio", "Mic", "UHM", "UI", "Str", "Elo", "ShortFiberIndex")

Plot all raw data and transformed data together

for (i in 1:length(Fiber.trait)){
  
  temp.df <- Fiber[,Fiber.trait[i]]
  melt.df <- data.frame(
   "Accession" = rownames(Fiber),       # Replace with your desired values
   "Trait" = temp.df)
  
  ### Calculate the mean for each group
  melt.df$log  <- log10(melt.df$Trait)
  melt.df$sqrt <- sqrt(melt.df$Trait)
  
  temp.mean1 <- mean(melt.df$Trait)
  temp.mean2 <- mean(melt.df$log)
  temp.mean3 <- mean(melt.df$sqrt)
  
  A <-  ggplot(melt.df %>% na.omit(), aes(x=Trait)) +
    geom_histogram(aes(y=..density..),alpha = 0.8, color = "darkblue", fill = "lightblue")+
    theme_classic() +
    theme(legend.position = "none")+
    geom_vline(xintercept = temp.mean1,
               linetype="dashed", color = "darkblue") +
    geom_density(alpha = 0.2) + xlab(Fiber.trait[i])
  
  B <- ggplot(melt.df %>% na.omit(), aes(x=log)) +
    geom_histogram(aes(y=..density..),alpha = 0.8, color = "red", fill = "lightpink")+
    theme_classic() +
    theme(legend.position = "none")+
    geom_vline(xintercept = temp.mean2,
               linetype="dashed", color = "red") +
    geom_density(alpha = 0.2) + xlab(paste0("log10.",Fiber.trait[i]))
  
  
  C <-  ggplot(melt.df %>% na.omit(), aes(x=sqrt)) +
    geom_histogram(aes(y=..density..),alpha = 0.8, color = "black", fill = "grey")+
    theme_classic() +
    theme(legend.position = "none")+
    geom_vline(xintercept = temp.mean3,
               linetype="dashed", color = "black") +
    geom_density(alpha = 0.2) + xlab(paste0("Square.root.",Fiber.trait[i]))
  
  ### Combine all plots for each day
  pdf(paste0("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/02_Fiber/Normality/Fiber-traits-",Fiber.trait[i],".pdf"), 
      height = 15, width = 5)
  
  print(plot_grid(A, B, C, labels = c('A', 'B', 'C'), label_size = 12, nrow = 3))
  dev.off()
  
}

Export statistical profile

Fiber.trait

##  [1] "area"                 "perimeter"            "width"               
##  [4] "height"               "avg_oil"              "pre_gin"             
##  [7] "total_fiber"          "total_seed"           "seed_25_count_weight"
## [10] "seed_index"           "lint_percentage"      "boll_size"           
## [13] "seed_per_boll"        "lint_index"           "NepSize"             
## [16] "Neps_g"               "L.w."                 "L.w.CV"              
## [19] "UQL.w."               "SFC.w."               "L.n."                
## [22] "L.n.CV"               "SFC"                  "L5"                  
## [25] "Total"                "TrashSize"            "Dust"                
## [28] "Trash"                "VFM"                  "SCNSize"             
## [31] "SCN"                  "Fine"                 "IFC"                 
## [34] "Mat.Ratio"            "Mic"                  "UHM"                 
## [37] "UI"                   "Str"                  "Elo"                 
## [40] "ShortFiberIndex"

for (i in 1:length(Fiber.trait)){
  
  temp.df <- Fiber[,Fiber.trait[i], drop = F]

    ### Calculate the transformation
   temp.df$Sqrt <- sqrt(temp.df[,1])
   temp.df$Log10 <- log10(temp.df[,1])
   colnames(temp.df) <- c(Fiber.trait[i], paste0(Fiber.trait[i],".sqrt"),paste0(Fiber.trait[i],".log10"))
   
    ### Create an empty dataframe with specified row numbers and column names
    sum.df <- data.frame(matrix(nrow = 3, ncol = 7))
    colnames(sum.df) <- c("term", "shapiro.test(P)", "skewness", "kurtosis",
                    "Mean", "Media", "Delta")
    
     temp.clean <- temp.df
    
    for (x in 1: nrow(sum.df)){
      sum.df[x,1] <- colnames(temp.clean[,x, drop = F])
      S.test <- shapiro.test(temp.clean[,x] %>% na.omit())
      sum.df[x,2] <- S.test[2]$p.value
      sum.df[x,3] <- skewness(temp.clean[,x] %>% na.omit())
      sum.df[x,4] <- kurtosis(temp.clean[,x] %>% na.omit())
      sum.df[x,5] <- mean(temp.clean[,x] %>% na.omit())
      sum.df[x,6] <- median(temp.clean[,x] %>% na.omit())
      sum.df[x,7] <- sum.df[x,6] - sum.df[x,5]
    }
    
    sum.df$Term <- Fiber.trait[i]
    assign(paste0("Normality_Fiber-statistics_", Fiber.trait[i]), sum.df)
    assign(paste0("Fiber_traits_EMMAX", Fiber.trait[i]), temp.df)
}

### Combine all list 
Stats_list <- lapply(ls(pattern = "Normality_Fiber-statistics_"), get)
Normal_distribution_Stats <- bind_rows(Stats_list)
head(Normal_distribution_Stats, 30)

### Export report files 
write.csv(Normal_distribution_Stats, 
          "~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/02_Fiber/02_Normality-stats-Fiber-traits.csv",
          quote = F)

Prepare phenotype files for GWAS

FIBER DATA

Reformat the data

data <- read.csv("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/02_Fiber/Seed-Fiber_blups_results-clean.csv", header = F)

fiber_index <- read.csv("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/02_Fiber/Traits/fiber-index.csv", header = T)
fiber_index

base_columns <- c(1, 2)

# Loop through the additional columns and save subtables
for (i in 1:nrow(fiber_index)) {
  # Extract the subtable with base columns and the current additional column
  subtable <- data[, c(base_columns, fiber_index[i,1])]
  
  # Generate the file name dynamically
  file_name <- paste0(fiber_index[i,2], ".txt")
  
  # Save the subtable to a text file
  write.table(subtable, file =     
              paste0("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/02_Fiber/", file_name), 
              sep = "\t",  row.names = FALSE, col.names = FALSE, quote = FALSE)
}

CANOPY DATA

Reformat the data

data.canopy <- read.csv("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/01_Canopy/DIV_BLUPS_12Days-Canopy-clean.csv", header = F)
head(data.canopy)

canopy_index <- read.csv("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/01_Canopy/03_Traits/canopy-index.csv", header = T)
canopy_index

base_columns <- c(1, 2)

# Loop through the additional columns and save subtables
for (i in 1:nrow(canopy_index)) {
  # Extract the subtable with base columns and the current additional column
  subtable <- data.canopy[, c(base_columns, canopy_index[i,1])]
  
  # Generate the file name dynamically
  file_name <- paste0(canopy_index[i,2], ".txt")
  
  # Save the subtable to a text file
  write.table(subtable, file =     
              paste0("~/Library/CloudStorage/Box-Box/Projects/11_DIV/02_Phenotype/01_Canopy/03_Traits/", file_name), 
              sep = "\t",  row.names = FALSE, col.names = FALSE, quote = FALSE)
}

Cotton-DIV-panel-Field-phenotypes-data

Li’ang Yu, Andrew Nelson

2025-01-03

Load libraries

Phenotype data process and transformation

Field canopy (temperature and other traits)

Load data

Plot all raw data and transformed data together

Export statistical profile

Prepare phenotype files for GWAS

FIBER DATA

CANOPY DATA