Genomes-to-Fields (G2F) is a cooperative public sector project with broad goals encompassing the collection of multi-environment maize hybrid data to enhance the underlying genomic and phenomic understanding of complex phenotypic traits. The project was initiated in 2013 and has since evaluated approximately 180,000 plots for over 2,500 hybrids across 162 environments within the U.S. The same set of hybrids is grown for two consecutive years; here we look at the hybrids grown in 2020 and 2021.
Some phenotypic traits of interest include: plant height, grain yield, and grain moisture.
Plant breeders are interested in identifying superior performing hybrids that are a) suited to their particular environment and b) broadly adapted to multiple environments. Grain yield (tons/hect.) is a phenotypic trait of particular interest for breeders because of its economic implications. We will be exploring grain yield for the 2020–2021 G2F hybrid dataset in the upcoming analyses.
# Base URL of the GitHub repository that hosts the raw data files
git_path <- "https://raw.githubusercontent.com/fatmaoz25/ReactionNormGenomicPrediction/refs/heads/main"

# ---- G2F 2020 phenotypes ----
df20_file <- "g2f_2020_phenotypic_clean_data.csv"
df20_path <- file.path(git_path, df20_file)
# Field locations retained for 2020
env_2020 <- c(
  "DEH1", "GAH1", "INH1", "MNH1", "NEH1", "NEH2",
  "NEH3", "OHH1", "TXH1", "TXH2", "WIH1", "WIH2"
)
df20 <- read.csv(df20_path)
df20 <- df20[is.element(df20$Field.Location, env_2020), ]

# ---- G2F 2021 phenotypes ----
df21_file <- "g2f_2021_phenotypic_clean_data.csv"
df21_path <- file.path(git_path, df21_file)
# Field locations retained for 2021
env_2021 <- c(
  "DEH1", "GAH1", "ILH1", "MNH1", "NCH1", "NEH1",
  "SCH1", "TXH2", "TXH3", "WIH1", "WIH2", "WIH3",
  "IAH1", "IAH2", "IAH3", "IAH4", "MIH1"
)
df21 <- read.csv(df21_path)
df21 <- df21[is.element(df21$Field.Location, env_2021), ]

# Stack both years into a single data frame (2020 rows first)
df <- do.call(rbind, list(df20, df21))

# Label each row with its environment: field location and year joined by "."
# (e.g. "DEH1.2020")
df[["Env"]] <- with(df, paste(Field.Location, Year, sep = "."))

# Preview the first rows of the combined data
head(df)
## Year Field.Location State City Plot.length..center.center.in.feet.
## 1 2020 DEH1 DE Georgetown 17.5
## 2 2020 DEH1 DE Georgetown 17.5
## 3 2020 DEH1 DE Georgetown 17.5
## 4 2020 DEH1 DE Georgetown 17.5
## 5 2020 DEH1 DE Georgetown 17.5
## 6 2020 DEH1 DE Georgetown 17.5
## Plot.area..ft2. Alley.length..in.inches. Row.spacing..in.inches.
## 1 75 30 30
## 2 75 30 30
## 3 75 30 30
## 4 75 30 30
## 5 75 30 30
## 6 75 30 30
## Rows.per.plot X..Seed.per.plot Experiment Source Pedigree
## 1 2 75 G2F_PHZ51 WIPV19/50151B W10004_0308/PHZ51
## 2 2 75 G2F_PHZ51 WIPV19/50249B W10004_0858/PHZ51
## 3 2 75 G2F_PHZ51 WIPV19/50052B W10004_0178/PHZ51
## 4 2 75 G2F_PHZ51 WIPV19/50280B W10004_0747/PHZ51
## 5 2 75 G2F_PHZ51 WIPV19/50096B W10004_0393/PHZ51
## 6 2 75 G2F_PHZ51 WIPV19/50281B W10004_0756/PHZ51
## Family Tester Replicate Block Plot Plot_ID Range Pass
## 1 W10004 PHZ51 1 1 1 1003603 2 1
## 2 W10004 PHZ51 1 1 2 1003604 2 2
## 3 W10004 PHZ51 1 1 3 1003605 2 3
## 4 W10004 PHZ51 1 1 4 1003606 2 4
## 5 W10004 PHZ51 1 1 5 1003607 2 5
## 6 W10004 PHZ51 1 1 6 1003608 2 6
## Date.Plot.Planted..MM.DD.YY. Date.Plot.Harvested..MM.DD.YY.
## 1 5/14/2020 9/15/2020
## 2 5/14/2020 9/15/2020
## 3 5/14/2020 9/15/2020
## 4 5/14/2020 9/15/2020
## 5 5/14/2020 9/15/2020
## 6 5/14/2020 9/15/2020
## Anthesis..MM.DD.YY. Silking..MM.DD.YY. Anthesis..days. Silking..days.
## 1 7/17/2020 7/16/2020 64 63
## 2 7/15/2020 7/16/2020 62 63
## 3 7/15/2020 7/16/2020 62 63
## 4 7/15/2020 7/15/2020 62 62
## 5 7/14/2020 7/15/2020 61 62
## 6 7/15/2020 7/16/2020 62 63
## Plant.Height..cm. Ear.Height..cm. Stand.Count....of.plants.
## 1 260 134 59
## 2 252 134 53
## 3 243 119 58
## 4 267 150 60
## 5 239 114 59
## 6 269 151 64
## Root.Lodging....of.plants. Stalk.Lodging....of.plants. Grain.Moisture....
## 1 1 14 21.8
## 2 0 8 17.9
## 3 0 18 20.8
## 4 0 23 19.9
## 5 3 3 19.5
## 6 0 44 18.8
## Test.Weight..lbs. Plot.Weight..lbs. Grain.Yield..bu.A.
## 1 55.8 13.9 133.4146
## 2 56.0 10.6 106.8147
## 3 53.8 12.0 116.6510
## 4 54.4 15.7 154.3526
## 5 56.2 15.4 152.1593
## 6 53.4 10.9 108.6337
## Plot.Discarded..enter..yes..or.blank. Comments Filler Snap....of.plants.
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
## Env
## 1 DEH1.2020
## 2 DEH1.2020
## 3 DEH1.2020
## 4 DEH1.2020
## 5 DEH1.2020
## 6 DEH1.2020
# List every column name of the combined data frame
names(df)
## [1] "Year"
## [2] "Field.Location"
## [3] "State"
## [4] "City"
## [5] "Plot.length..center.center.in.feet."
## [6] "Plot.area..ft2."
## [7] "Alley.length..in.inches."
## [8] "Row.spacing..in.inches."
## [9] "Rows.per.plot"
## [10] "X..Seed.per.plot"
## [11] "Experiment"
## [12] "Source"
## [13] "Pedigree"
## [14] "Family"
## [15] "Tester"
## [16] "Replicate"
## [17] "Block"
## [18] "Plot"
## [19] "Plot_ID"
## [20] "Range"
## [21] "Pass"
## [22] "Date.Plot.Planted..MM.DD.YY."
## [23] "Date.Plot.Harvested..MM.DD.YY."
## [24] "Anthesis..MM.DD.YY."
## [25] "Silking..MM.DD.YY."
## [26] "Anthesis..days."
## [27] "Silking..days."
## [28] "Plant.Height..cm."
## [29] "Ear.Height..cm."
## [30] "Stand.Count....of.plants."
## [31] "Root.Lodging....of.plants."
## [32] "Stalk.Lodging....of.plants."
## [33] "Grain.Moisture...."
## [34] "Test.Weight..lbs."
## [35] "Plot.Weight..lbs."
## [36] "Grain.Yield..bu.A."
## [37] "Plot.Discarded..enter..yes..or.blank."
## [38] "Comments"
## [39] "Filler"
## [40] "Snap....of.plants."
## [41] "Env"
# Distinct environment labels present in the combined data
env_labels <- unique(df[["Env"]])
print(env_labels)
## [1] "DEH1.2020" "GAH1.2020" "INH1.2020" "MNH1.2020" "NEH1.2020" "NEH2.2020"
## [7] "NEH3.2020" "OHH1.2020" "TXH1.2020" "TXH2.2020" "WIH1.2020" "WIH2.2020"
## [13] "DEH1.2021" "GAH1.2021" "ILH1.2021" "MNH1.2021" "NCH1.2021" "NEH1.2021"
## [19] "SCH1.2021" "TXH2.2021" "TXH3.2021" "WIH1.2021" "WIH2.2021" "WIH3.2021"
## [25] "IAH1.2021" "IAH2.2021" "IAH3.2021" "IAH4.2021" "MIH1.2021"
# Number of distinct environments
length(env_labels)
## [1] 29
# Number of distinct values in the Family column
length(unique(df[["Family"]]))
## [1] 5
# Number of distinct tester parents (Tester column)
length(unique(df[["Tester"]]))
## [1] 4
Now, let’s visualize the environments for the 2020-2021 dataset. The same hybrids were grown across both years. We will subset for only the hybrids with either PHK76, PHP02, or PHZ51 Tester parents.
#load libraries
pacman::p_load(readr, ggplot2, maps, ggrepel)
# Base URL of the GitHub repository that hosts the data files
git_path <- "https://raw.githubusercontent.com/fatmaoz25/ReactionNormGenomicPrediction/refs/heads/main"
locations_file <- "envRtype.txt"
locations_path <- file.path(git_path, locations_file)
# Per-environment summary table; the first line of the file holds the column
# names. `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how the file is parsed.
locations <- read.table(locations_path, header = TRUE)
# Preview the first rows of the environment table
head(locations)
## env lat lon start end100 end125 location year
## 1 DEH1.2020 38.63014 -75.46610 2020-05-14 2020-08-22 2020-09-16 DEH1 2020
## 2 DEH1.2021 38.63500 -75.45500 2021-04-27 2021-08-05 2021-08-30 DEH1 2021
## 3 GAH1.2020 31.50728 -83.55828 2020-04-01 2020-07-10 2020-08-04 GAH1 2020
## 4 GAH1.2021 31.50728 -83.55828 2021-03-25 2021-07-03 2021-07-28 GAH1 2021
## 5 IAH1.2021 41.21390 -91.53710 2021-05-01 2021-08-09 2021-09-03 IAH1 2021
## 6 IAH2.2021 42.06830 -94.85980 2021-04-27 2021-08-05 2021-08-30 IAH2 2021
## Tester
## 1 PHK76,PHP02,PHZ51
## 2 PHK76,PHP02,PHZ51
## 3 PHZ51
## 4 PHZ51
## 5 PHK76,PHZ51
## 6 PHK76,PHP02,PHZ51
dim(locations)
## [1] 29 9
# Generate U.S. map data with state boundaries
us_map <- map_data("state")
# Keep the environment id and its comma-separated tester list. Columns are
# selected by name rather than by position (1 and 9) so that the selection
# does not break if the column order of the table changes.
tester_counts <- locations[, c("env", "Tester")]
# For each tester, count the environments whose Tester string contains it.
# `fixed = TRUE` matches the tester name literally instead of as a regex.
PHK76_count <- sum(grepl("PHK76", tester_counts$Tester, fixed = TRUE))
print(paste("Number of Environments with PHK76:", PHK76_count))
## [1] "Number of Environments with PHK76: 11"
PHP02_count <- sum(grepl("PHP02", tester_counts$Tester, fixed = TRUE))
print(paste("Number of Environments with PHP02:", PHP02_count))
## [1] "Number of Environments with PHP02: 14"
PHZ51_count <- sum(grepl("PHZ51", tester_counts$Tester, fixed = TRUE))
print(paste("Number of Environments with PHZ51:", PHZ51_count))
## [1] "Number of Environments with PHZ51: 20"
# Map of the trial environments: one panel per year, points coloured by the
# tester combination grown at each environment, labelled with the location code.
p <- ggplot() +
  # State outlines as a light background layer
  geom_polygon(
    data = us_map, aes(x = long, y = lat, group = group),
    fill = "white", color = "gray90"
  ) +
  # One point per environment
  geom_point(data = locations, aes(x = lon, y = lat, color = Tester), size = 2) +
  # Location labels, repelled from each other and jittered
  geom_text_repel(
    data = locations, aes(x = lon, y = lat, label = location), size = 3,
    position = position_jitter(width = 0.5, height = 0.5)
  ) +
  theme_bw() +
  labs(title = "") +
  facet_grid(~year) +
  scale_color_brewer(palette = "Set1") +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(size = 12), # x-axis tick label size
    axis.text.y = element_text(size = 12), # y-axis tick label size
    axis.title.x = element_text(size = 20), # x-axis title size
    axis.title.y = element_text(size = 20), # y-axis title size
    # `linewidth` replaces the `size` argument of element_rect(), which is
    # deprecated as of ggplot2 3.4.0. Alpha must lie in [0, 1]; the previous
    # value of 5 was out of range. NOTE(review): lower it (e.g. 0.5) if a
    # translucent legend box is wanted.
    legend.background = element_rect(
      fill = alpha("gray80", 1), linewidth = 0.5, linetype = "solid"
    ),
    legend.key.height = unit(0.1, "cm"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    legend.position = c(0.95, 0.15)
  ) +
  scale_x_continuous("Longitude") +
  scale_y_continuous("Latitude")
# Display the plot
print(p)
Further information regarding G2F can be found on the official website: https://www.genomes2fields.org/home/