This file will take Jeff’s latest CDBN experiment data (V1.9) from 2018-03-22 and do the following:

* Clean/tidy/transform 22 phenotypes: add lodging, emergence/early vigor, rust, seed appearance/desirability, white mold, biomass, and harvest index. Join DGF/SF with seedfill_duration, perhaps? Incorporate plant_height cleanup thoughts by Jeff?
* Filter the data to just keep the 22 phenotypes that I cleaned.
* Check that the pre-V1.7 data enrichment with germplasm, Location, Planting_date_est, and weather data still works. Enrich the data with determinacy information for 304 varieties.

----------

# Load the raw inputs used by the cleaning steps below.
# NOTE(review): loadWorkbook()/readWorksheet()/getSheets() are presumably the
# XLConnect functions — confirm which package is attached earlier in the file.

# Experiment data: read every sheet listed by getSheets() into `lst`.
wb <- loadWorkbook("All_experiments_V1.9.1.xlsx")
lst <- readWorksheet(wb, sheet = getSheets(wb))

# Variety metadata / sequencing workbook: read every sheet into `metadata`.
metadata1 <- loadWorkbook("../../CDBN Variety Info/CDBN_Metadata_sequencing_2018-03-27.xlsx")
metadata <- readWorksheet(metadata1, sheet = getSheets(metadata1))

# The weather data load is currently disabled.
# all_wea <- readRDS("../../CDBN Site Data/Weather Data/R_Weather/All_Weather_Data.rds")

# Planting dates (v4 workbook), converted to a tibble.
# NOTE(review): as_tibble() on the readWorksheet() result presumably relies on
# this workbook having a single sheet — confirm.
wbp4 <- loadWorkbook("../../CDBN Site Data/CDBN_Planting_dates_1975_2015_v4.xlsx")
Plantings_est <- as_tibble(readWorksheet(wbp4, sheet = getSheets(wbp4)))

1. Add CDBN_IDs to `lst$Germplasm`

There are many synonymous variety names; the following code cleans the names and standardizes punctuation to underscores so that each CDBN_ID consistently refers to the same CDBN entry.

I added the `echo = FALSE` parameter to most of the code for this step, so the HTML file does not show the several hundred regular expression substitutions that generate the CDBN_IDs from the Genotypes. If you want to see these, refer to the .Rmd document.

# Map synonymous germplasm names to their canonical CDBN_ID
# (numeric IDs through "BillZ").
#
# `synonyms` is a lookup table: each name is an anchored regular expression
# and each value is the CDBN_ID written in its place. The rules are applied
# one at a time, top to bottom, with gsub() on Germplasm$CDBN_ID, so the row
# order is the order of the substitutions.
# Rows that are commented out are inactive and are kept for reference.
# Rows that appear twice are kept exactly as they were listed.
#
# Fix: the "." in "Co.80-1744" is now escaped ("\\.") so the rule matches only
# a literal period; unescaped, "." matches any single character.
Germplasm$CDBN_ID <- local({
  synonyms <- c(
    # "^1140$" = "US1140",
    # "^1062-V98$" = "1062-V98",
    # "^1132-V96$" = "1132-V96",
    # "^115m$" = "115M",
    # "^115M \\(Black Rhino)$" = "115M",
    # "^115M \\(Black Rhino)$" = "115M",
    # "^Black Rhino$" = "115M",
    # "^Black_Rhino$" = "115M",
    # "^black_rhino$" = "115M",
    # "^BlackRhino$" = "115M",
    # "^18-V96$" = "18-V96",
    "^6I1$" = "6I-1",
    "^6I13$" = "6I-13",
    "^6I15$" = "6I-15",
    "^6I3$" = "6I-3",
    "^6I4$" = "6I-4",
    "^6I5$" = "6I-5",
    "^6I7$" = "6I-7",
    # "^6r_42$" = "6R-42",
    "^6R42$" = "6R-42",
    "^6R-42$" = "6R-42",
    "^91 ABN 1551$" = "91 ABN 551",
    # "^AC Calmont$" = "AC_Calmont",
    # "^AC Calmont$" = "AC_Calmont",
    # "^AC Compass$" = "AC_Compass",
    # "^AC Compass$" = "AC_Compass",
    # "^ac_compass$" = "AC_Compass",
    # "^AC Elk$" = "AC_Elk",
    # "^AC Elk$" = "AC_Elk",
    # "^AC_ELK$" = "AC_Elk",
    # "^Elk$" = "AC_Elk",
    # "^AC Mast$" = "AC_Mast",
    # "^AC Ole$" = "AC_Ole",
    # "^AC Ole$" = "AC_Ole",
    # "^ac_ole$" = "AC_Ole",
    # "^AC Pintoba$" = "AC_Pintoba",
    # "^ac_pintoba$" = "AC_Pintoba",
    # "^AC Skipper$" = "AC_Skipper",
    # "^AC Trident$" = "AC_Trident",
    "^D77213$" = "Agate",
    # "^Alpine$" = "Alpine",
    "^D77133$" = "Amber",
    # "^Apache$" = "Apache",
    # "^apache$" = "Apache",
    "^80-1744$" = "Arapaho",
    # "^Arapaho$" = "Arapaho",
    # "^arapaho$" = "Arapaho",
    "^Co\\.80-1744$" = "Arapaho", # "." escaped: literal period only
    "^CO80-1744$" = "Arapaho",
    "^Csu 80-1744$" = "Arapaho",
    # "^Arthur$" = "Arthur",
    "^ND 91-076-01$" = "Arthur",
    # "^Avalanche$" = "Avalanche",
    # "^avalanche$" = "Avalanche",
    # "^Aztec$" = "Aztec",
    # "^aztec$" = "Aztec",
    # "^Bandit$" = "Bandit",
    # "^bandit$" = "Bandit",
    "^BD1002$" = "BD-1002",
    "^BD 1003$" = "BD-1003",
    "^BD 1003$" = "BD-1003",
    "^BD1003$" = "BD-1003",
    "^BD1004$" = "BD-1004",
    # "^Beluga$" = "Beluga",
    # "^Beryl$" = "Beryl",
    # "^beryl$" = "Beryl",
    # "^Bighorn$" = "Bighorn",
    # "^bighorn$" = "Bighorn",
    "^0081-13197$" = "BillZ",
    "^81-13197$" = "BillZ",
    "^Bill Z$" = "BillZ",
    "^Bill Z$" = "BillZ",
    "^bill_z$" = "BillZ",
    "^Co81-13197$" = "BillZ",
    "^CO81-13197$" = "BillZ",
    "^Co81-13197    $" = "BillZ",
    "^Csu 81-13197$" = "BillZ"
  )
  patterns <- names(synonyms)
  ids <- Germplasm$CDBN_ID
  for (i in seq_along(synonyms)) {
    ids <- gsub(patterns[i], synonyms[[i]], ids)
  }
  ids
})
# Map synonymous germplasm names to their canonical CDBN_ID
# ("Black Jack" through "Desert Rose").
#
# `synonyms` is a lookup table: each name is an anchored regular expression
# and each value is the CDBN_ID written in its place. The rules are applied
# one at a time, top to bottom, with gsub() on Germplasm$CDBN_ID, so the row
# order is the order of the substitutions.
# Rows that are commented out are inactive and are kept for reference.
# Rows that appear twice are kept exactly as they were listed.
#
# Fix: the "." in "Calif. Dark Red Kidney 82" is now escaped ("\\.") so the
# rule matches only a literal period; unescaped, "." matches any character.
Germplasm$CDBN_ID <- local({
  synonyms <- c(
    # "^blackjack$" = "Black Jack",
    # "^Blackhawk$" = "Blackhawk",
    # "^blackhawk$" = "Blackhawk",
    "^Blush$" = "Blush",
    "^USWA-33$" = "Blush",
    "^Bos'n$" = "Bosn",
    "^Burke$" = "Burke",
    "^burke$" = "Burke",
    "^USWA-19$" = "Burke",
    "^USWA-19$" = "Burke",
    "^USWA-19$" = "Burke",
    # "^Buster$" = "Buster",
    # "^buster$" = "Buster",
    "^c_20$" = "C-20",
    "^C20$" = "C-20",
    "^C-20$" = "C-20",
    "^Canario 707$" = "Canario707",
    # "^Capri$" = "Capri",
    # "^Cardinal$" = "Cardinal",
    "^CB82-11\\(CB82-11\\(WM84-11))$" = "CB82-11",
    "^CB82-11\\(WM84-11)$" = "CB82-11",
    "^WM84-11$" = "CB82-11",
    # "^CDC Crocus$" = "CDC_Crocus",
    # "^CDC Crocus$" = "CDC_Crocus",
    # "^cdc_crocus$" = "CDC_Crocus",
    # "^cdc_expresso$" = "CDC_Expression",
    # "^CDC Pinnacle$" = "CDC_Pinnacle",
    # "^CDC Pinnacle$" = "CDC_Pinnacle",
    # "^cdc_pinnacle$" = "CDC_Pinnacle",
    # "^CDC Pintium$" = "CDC_Pintium",
    # "^CDC Pintium$" = "CDC_Pintium",
    # "^cdc_pintium$" = "CDC_Pintium",
    # "^CDC Rosalee$" = "CDC_Rosalee",
    # "^CDC Rosalee$" = "CDC_Rosalee",
    # "^cdc_rosalee$" = "CDC_Rosalee",
    "^Calif\\. Dark Red Kidney 82$" = "CDRK82", # "." escaped: literal period only
    "^CDRK 82$" = "CDRK82",
    "^CELRK$" = "CELRK",
    # "^CERLK$" = "CELRK",
    # "^Chase$" = "Chase",
    # "^chase$" = "Chase",
    "^Chinook 2000$" = "Chinook2000",
    "^Chinook 2000$" = "Chinook2000",
    "^D81125$" = "Cinnabar",
    "^D81-125$" = "Cinnabar",
    # "^Claret$" = "Claret",
    "^R93-365$" = "Claret",
    "^CO 11113$" = "CO-11113",
    "^CO11113$" = "CO-11113",
    "^CO12601$" = "CO-12601",
    "^Co-12601$" = "CO-12601",
    "^Csu 12601$" = "CO-12601",
    "^CO1760$" = "CO-1760",
    "^CO 27864$" = "CO-27864",
    "^CO27864$" = "CO-27864",
    "^33142-Rres$" = "CO-33142",
    "^CO33142$" = "CO-33142",
    "^CO33875$" = "CO-33875",
    "^CO64342$" = "CO-64342",
    "^CO75511$" = "CO-75511",
    "^CO91212-4$" = "CO-91212-4",
    # "^Condor$" = "Condor",
    # "^condor$" = "Condor",
    # "^Coyne$" = "Coyne",
    # "^coyne$" = "Coyne",
    "^CPC00125$" = "CPC-00125",
    "^CPC00152$" = "CPC-00152",
    "^CPC00153$" = "CPC-00153",
    "^CPC00247$" = "CPC-00247",
    "^CPC 00250$" = "CPC-00250",
    "^CPC00250$" = "CPC-00250",
    "^CPC 1406$" = "CPC-12406",
    "^CPC12406$" = "CPC-12406",
    "^CPC-1406$" = "CPC-12406",
    "^CPC1406$" = "CPC-12406",
    "^CPC99814$" = "CPC-99814",
    "^Cran 09$" = "Cran09",
    "^Cran-09$" = "Cran09",
    "^Cran 34$" = "Cran34",
    # "^Crestwood$" = "Crestwood",
    # "^crestwood$" = "Crestwood",
    # "^Croissant$" = "Croissant",
    # "^croissant$" = "Croissant",
    "^d000264$" = "D000264",
    "^77196$" = "D77196",
    "^79054$" = "D79054",
    "^D81-122$" = "D81122",
    # "^Desert Rose$" = "Desert Rose",
    "^desert_rose$" = "Desert Rose",
    "^K124467$" = "Desert Rose"
  )
  patterns <- names(synonyms)
  ids <- Germplasm$CDBN_ID
  for (i in seq_along(synonyms)) {
    ids <- gsub(patterns[i], synonyms[[i]], ids)
  }
  ids
})
# Map synonymous germplasm names to their canonical CDBN_ID
# ("Eclipse" through "IP08-2").
#
# `synonyms` is a lookup table: each name is an anchored regular expression
# and each value is the CDBN_ID written in its place. The rules are applied
# one at a time, top to bottom, with gsub() on Germplasm$CDBN_ID, so the row
# order is the order of the substitutions.
# Rows that are commented out are inactive and are kept for reference.
Germplasm$CDBN_ID <- local({
  synonyms <- c(
    # "^Eclipse$" = "Eclipse",
    # "^eclipse$" = "Eclipse",
    # "^Emerson$" = "Emerson",
    # "^emerson$" = "Emerson",
    "^Envoy$" = "Envoy",
    "^envoy$" = "Envoy",
    "^EP1$" = "EP-1",
    "^NE EP-1$" = "EP-1",
    "^NEEP-1$" = "EP-1",
    "^USWA-39$" = "Fiero",
    # "^Fiesta$" = "Fiesta",
    # "^fiesta$" = "Fiesta",
    # "^Fleetwood$" = "Fleetwood",
    # "^fleetwood$" = "Fleetwood",
    # "^Frontier$" = "Frontier",
    # "^frontier$" = "Frontier",
    # "^Gala$" = "Gala",
    # "^gala$" = "Gala",
    "^GH482$" = "GH432",
    "^gn9_1$" = "GN9-1",
    "^GN9-1$" = "GN9-1",
    "^USGN9-1$" = "GN9-1",
    "^gn9_4$" = "GN9-4",
    "^GN9-4$" = "GN9-4",
    "^USGN9-4$" = "GN9-4",
    "^Grand Mea$" = "Grand Mesa",
    "^Grand Mesa$" = "Grand Mesa",
    "^grand_mesa$" = "Grand Mesa",
    "^GTS 0786-2$" = "GTS-0786-2",
    "^ac_harblack$" = "Harblack",
    "^HR21-893$" = "Harblack",
    # "^GH1053$" = "Harold",
    # "^Harold$" = "Harold",
    # "^harold$" = "Harold",
    "^Nw-1053$" = "Harold",
    "^HR 17-827$" = "Harowood",
    "^HR17-827$" = "Harowood",
    "^GN_Harris$" = "Harris",
    "^gn_harris$" = "Harris",
    # "^Hatton$" = "Hatton",
    # "^hatton$" = "Hatton",
    # "^Holberg$" = "Holberg",
    # "^holberg$" = "Holberg",
    # "^ND364$" = "Holberg",
    # "^Hooter$" = "Hooter",
    "^HR 18-675$" = "HR18-675",
    "^HR 49$" = "HR49",
    # "^Huron$" = "Huron",
    # "^huron$" = "Huron",
    # "^Hyden$" = "Hyden",
    # "^hyden$" = "Hyden",
    "^NW230$" = "Hyden",
    "^usbk_cbb_5$" = "I9606-6",
    "^USBK-CBB-5$" = "I9606-6",
    "^icb_10$" = "ICB-10",
    "^ICB10$" = "ICB-10",
    "^ICB-10$" = "ICB-10",
    "^ICB 10-5$" = "ICB-10-5",
    "^ICB10-5$" = "ICB-10-5",
    "^IG GND$" = "IG_GND",
    "^ip08_2$" = "IP08-2",
    "^IP08-2$" = "IP08-2"
  )
  patterns <- names(synonyms)
  ids <- Germplasm$CDBN_ID
  for (i in seq_along(synonyms)) {
    ids <- gsub(patterns[i], synonyms[[i]], ids)
  }
  ids
})
# Map synonymous germplasm names to their canonical CDBN_ID
# ("Isabella" and the ISB-* lines through "Kamiakin").
#
# `synonyms` is a lookup table: each name is an anchored regular expression
# and each value is the CDBN_ID written in its place. The rules are applied
# one at a time, top to bottom, with gsub() on Germplasm$CDBN_ID, so the row
# order is the order of the substitutions.
# Rows that are commented out are inactive and are kept for reference.
# Rows that appear twice are kept exactly as they were listed.
Germplasm$CDBN_ID <- local({
  synonyms <- c(
    "^2602$" = "Isabella",
    # "^Isabella$" = "Isabella",
    "^ISB1$" = "ISB-1",
    "^ISB11$" = "ISB-11",
    "^ISB1145$" = "ISB-1145",
    "^ISB1194-2$" = "ISB-1194-2",
    "^ISB1218$" = "ISB-1218",
    "^ISB1252$" = "ISB-1252",
    "^ISB1256$" = "ISB-1256",
    "^ISB16$" = "ISB-16",
    "^ISB1614$" = "ISB-1614",
    "^ISB1618$" = "ISB-1614",
    "^ISB-1618$" = "ISB-1614",
    "^ISB-1618 \\*\\(or ISB-1614)$" = "ISB-1614",
    "^ISB-1618 \\*\\(or ISB-1614) \\*\\(or ISB-1614)$" = "ISB-1614",
    "^ISB18$" = "ISB-18",
    "^ISB1814$" = "ISB-1814",
    "^ISB1816$" = "ISB-1816",
    "^ISB19$" = "ISB-19",
    "^ISB20$" = "ISB-20",
    "^ISB2001$" = "ISB-2001",
    "^ISB21$" = "ISB-21",
    "^ISB23$" = "ISB-23",
    "^ISB24$" = "ISB-24",
    "^ISB254-1$" = "ISB-254-1",
    "^ISB254-4$" = "ISB-254-4",
    "^ISB2598$" = "ISB-2598",
    "^ISB3156$" = "ISB-3156",
    "^ISB3156$" = "ISB-3156",
    "^LSB3156$" = "ISB-3156",
    "^LSB 3156$" = "ISB-3156",
    "^ISB4071$" = "ISB-4071",
    "^ISB456$" = "ISB-456",
    "^ISB459$" = "ISB-459",
    "^ISB486$" = "ISB-486",
    "^ISB5172$" = "ISB-5172",
    "^ISB565$" = "ISB-565",
    "^ISB5893$" = "ISB-5893",
    "^ISB5893-1$" = "ISB-5893-1",
    "^ISB672$" = "ISB-672",
    "^ISB721$" = "ISB-721",
    "^ISB730$" = "ISB-730",
    "^ISB756-1$" = "ISB-756-1",
    "^ISB782$" = "ISB-782",
    "^ISB782-1$" = "ISB-782-1",
    "^ISB82-1024$" = "ISB-82-1024",
    "^ISB82-258$" = "ISB-82-258",
    "^ISB82-354$" = "ISB-82-354",
    "^ISB82-772$" = "ISB-82-772",
    "^ISB82-865$" = "ISB-82-865",
    "^ISB84-114$" = "ISB-84-114",
    "^ISB84-244$" = "ISB-84-244",
    "^ISB84-245$" = "ISB-84-245",
    "^ISB85-672$" = "ISB-85-672",
    "^ISB97-4071$" = "ISB-97-4071",
    "^ISB99-1815-2$" = "ISB-99-1815-2",
    "^ISB Ba 2-11$" = "ISB-BA-2-11",
    "^ISBa 2-11$" = "ISB-BA-2-11",
    "^ISB-Ba 2-11$" = "ISB-BA-2-11",
    "^ISB-Ba-2-11$" = "ISB-BA-2-11",
    "^ISB BTR-26$" = "ISB-BTR-26",
    "^ISB-d-2-11$" = "ISB-D-2-11",
    "^ISBTR-13$" = "ISB-TR-13",
    "^Isles$" = "Isles",
    "^D80152$" = "Ivory",
    # "^Ivory$" = "Ivory",
    # "^ivory$" = "Ivory",
    "^ISB777$" = "Jackpot",
    "^ISB-777$" = "Jackpot",
    # "^Jackpot$" = "Jackpot",
    # "^jackpot$" = "Jackpot",
    # "^Jaguar$" = "Jaguar",
    # "^jaguar$" = "Jaguar",
    "^jm_126$" = "JM126",
    "^JM-126$" = "JM126",
    "^jm_24$" = "JM24",
    "^JM-24$" = "JM24",
    "^K1441$" = "K0441",
    "^K1441$" = "K0441",
    "^K-407$" = "K407",
    "^K-42$" = "K42",
    "^K-59$" = "K59",
    "^K-59-7$" = "K59-7",
    "^K279$" = "Kamiakin"
  )
  patterns <- names(synonyms)
  ids <- Germplasm$CDBN_ID
  for (i in seq_along(synonyms)) {
    ids <- gsub(patterns[i], synonyms[[i]], ids)
  }
  ids
})
# Map synonymous germplasm names to their canonical CDBN_ID
# ("Kamiakin" through "PI-8").
#
# `synonyms` is a lookup table: each name is an anchored regular expression
# and each value is the CDBN_ID written in its place. The rules are applied
# one at a time, top to bottom, with gsub() on Germplasm$CDBN_ID, so the row
# order is the order of the substitutions.
# Rows that are commented out are inactive and are kept for reference.
#
# Fix: the "." in "Othello (rust res.)" is now escaped ("\\.") so the rule
# matches only a literal period; unescaped, "." matches any single character.
Germplasm$CDBN_ID <- local({
  synonyms <- c(
    # "^Kamiakin$" = "Kamiakin",
    # "^Kardinal$" = "Kardinal",
    # "^Kimberly$" = "Kimberly",
    # "^kimberly$" = "Kimberly",
    # "^Kodiak$" = "Kodiak",
    # "^kodiak$" = "Kodiak",
    "^C15$" = "Laker",
    "^C-15$" = "Laker",
    # "^Laker$" = "Laker",
    # "^laker$" = "Laker",
    # "^Lariat$" = "Lariat",
    # "^lariat$" = "Lariat",
    # "^Lassen$" = "Lassen",
    "^LB-4803$" = "LeBaron",
    "^LB-4803-B$" = "LeBaron",
    "^Le Baron$" = "LeBaron",
    "^le_baron$" = "LeBaron",
    # "^Lightning$" = "Lightning",
    # "^lightning$" = "Lightning",
    # "^Lisa$" = "Lisa",
    "^Long's Peak$" = "Long's Peak",
    "^Longs Peak$" = "Long's Peak",
    "^longs_peak$" = "Long's Peak",
    "^LongsPeak$" = "Long's Peak",
    "^Long'sPeak$" = "Long's Peak",
    # "^Mackinac$" = "Mackinac",
    # "^mackinac$" = "Mackinac",
    # "^Majesty$" = "Majesty",
    # "^Matterhorn$" = "Matterhorn",
    # "^matterhorn$" = "Matterhorn",
    # "^88-048-03$" = "Maverick",
    # "^Maverick$" = "Maverick",
    # "^maverick$" = "Maverick",
    "^ISB1131$" = "Max",
    "^ISB-1131$" = "Max",
    # "^Max$" = "Max",
    # "^max$" = "Max",
    # "^Mayflower$" = "Mayflower",
    # "^mayflower$" = "Mayflower",
    # "^Merlot$" = "Merlot",
    # "^merlot$" = "Merlot",
    # "^Midnight$" = "Midnight",
    # "^midnight$" = "Midnight",
    "^Midnite$" = "Midnight",
    # "^Montcalm$" = "Montcalm",
    # "^Montrose$" = "Montrose",
    # "^montrose$" = "Montrose",
    "^N05324$" = "N05324",
    "^n05324$" = "N05324",
    "^ND060197$" = "ND060197",
    "^ND-307$" = "ND307",
    # "^Newport$" = "Newport",
    # "^newport$" = "Newport",
    "^NDP912$" = "Nodak",
    # "^Nodak$" = "Nodak",
    # "^nodak$" = "Nodak",
    # "^Norstar$" = "Norstar",
    # "^norstar$" = "Norstar",
    "^NX 041$" = "Norstar",
    "^NX-041$" = "Norstar",
    "^GH11$" = "NW11",
    "^nw_63$" = "NW63",
    "^NW-63$" = "NW63",
    "^NY104$" = "NY-104",
    "^NY105$" = "NY-105",
    "^Dublin$" = "OAC_Dublin",
    # "^OAC Dublin$" = "OAC_Dublin",
    # "^OAC Gryphon$" = "OAC_Gryphon",
    # "^oac_gryphon$" = "OAC_Gryphon",
    "^Inferno$" = "OAC_Inferno",
    # "^OAC Inferno$" = "OAC_Inferno",
    # "^OAC Laser$" = "OAC_Laser",
    # "^oac_laser$" = "OAC_Laser",
    # "^OAC Lyrik$" = "OAC_Lyrik",
    # "^OACLyrik$" = "OAC_Lyrik",
    # "^OAC Rex$" = "OAC_Rex",
    # "^oac_rex$" = "OAC_Rex",
    # "^OAC Seaforth$" = "OAC_Seaforth",
    # "^OAC Seaforth$" = "OAC_Seaforth",
    "^Seaforth$" = "OAC_Seaforth",
    # "^OAC Thunder$" = "OAC_Thunder",
    # "^OAC Thunder$" = "OAC_Thunder",
    # "^Olathe$" = "Olathe",
    # "^olathe$" = "Olathe",
    "^D76035$" = "Opal",
    "^GH215$" = "Othello",
    # "^Othello$" = "Othello",
    # "^othello$" = "Othello",
    # Changed from Othello to Othello-RR in V1.6; "." escaped: literal period only
    "^Othello \\(rust res\\.)$" = "Othello-RR",
    "^Othello\\(GH215)$" = "Othello",
    # "^Othello-RR$" = "Othello",  # Changed from Othello to Othello-RR in V1.6
    "^PI11$" = "PI-11",
    "^PI8$" = "PI-8"
  )
  patterns <- names(synonyms)
  ids <- Germplasm$CDBN_ID
  for (i in seq_along(synonyms)) {
    ids <- gsub(patterns[i], synonyms[[i]], ids)
  }
  ids
})
# Map synonymous germplasm names to their canonical CDBN_ID
# ("Pindak" through "Spinel").
#
# `synonyms` is a lookup table: each name is an anchored regular expression
# and each value is the CDBN_ID written in its place. The rules are applied
# one at a time, top to bottom, with gsub() on Germplasm$CDBN_ID, so the row
# order is the order of the substitutions.
# Rows that are commented out are inactive and are kept for reference.
# Rows that appear twice are kept exactly as they were listed.
Germplasm$CDBN_ID <- local({
  synonyms <- c(
    # "^Pindak$" = "Pindak",
    # "^pindak$" = "Pindak",
    "^PVD023$" = "PVD-023",
    # "^Quincy$" = "Quincy",
    # "^quincy$" = "Quincy",
    "^raven$" = "Raven",
    "^Red Hawk$" = "Redhawk",
    "^Red Hawk$" = "Redhawk",
    "^NY10195$" = "RedKanner",
    "^Red Kanner$" = "RedKanner",
    "^2204$" = "Redkloud",
    "^Red Kloud$" = "Redkloud",
    "^Red Kloud$" = "Redkloud",
    "^RedKloud$" = "Redkloud",
    "^OAC 07-2$" = "Rexeter",
    "^OAC_07-2$" = "Rexeter",
    "^ND061106$" = "Rosie",
    "^Royal Red$" = "RoyalRed",
    "^roza$" = "Roza",
    "^Sacramento$" = "Sacramento",
    # "^Sanilac$" = "Sanilac",
    # "^sanilac$" = "Sanilac",
    "^P04205$" = "Santa Fe",
    "^Santa Fe$" = "Santa Fe",
    "^santa_fe$" = "Santa Fe",
    # "^Sapphire$" = "Sapphire",
    # "^sapphire$" = "Sapphire",
    # "^Sawtooth$" = "Sawtooth",
    # "^sawtooth$" = "Sawtooth",
    # "^Seabiskit$" = "Seabiskit",
    # "^seabiskit$" = "Seabiskit",
    "^N97774$" = "Seahawk",
    # "^Seahawk$" = "Seahawk",
    # "^seahawk$" = "Seahawk",
    "^S00809$" = "Sedona",
    # "^Sedona$" = "Sedona",
    # "^sedona$" = "Sedona",
    "^ISB620-1$" = "Sequoia",
    "^ISB-620-1$" = "Sequoia",
    # "^Sequoia$" = "Sequoia",
    # "^sequoia$" = "Sequoia",
    "^B201240$" = "Shania",
    # "^Shania$" = "Shania",
    # "^shania$" = "Shania",
    # "^Shania $" = "Shania",
    # "^Shiny Crow$" = "Shiny Crow",
    # "^shiny_crow$" = "Shiny Crow",
    # "^Shoshone$" = "Shoshone",
    # "^shoshone$" = "Shoshone",
    "^P86299$" = "Sierra",
    # "^sierra$" = "Sierra",
    # "^Sierra  $" = "Sierra",
    "^Silver Cloud$" = "SilverCloud",
    "^Silver Cloud$" = "SilverCloud",
    "^D78174$" = "Spinel"
  )
  patterns <- names(synonyms)
  ids <- Germplasm$CDBN_ID
  for (i in seq_along(synonyms)) {
    ids <- gsub(patterns[i], synonyms[[i]], ids)
  }
  ids
})
Germplasm$CDBN_ID <- gsub("^SR10-4$", "SR10-20", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ND020351-R$", "Stampede", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^stampede$", "Stampede", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Stampede $", "Stampede", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^gn_star$", "Star", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^Great Northern Tara$", "Tara", Germplasm$CDBN_ID) # Changed from Star to Tara for V1.6
#Germplasm$CDBN_ID <- gsub("^Tara$", "Tara", Germplasm$CDBN_ID) # Changed from Star to Tara for V1.6
#Germplasm$CDBN_ID <- gsub("^tara$", "Tara", Germplasm$CDBN_ID) # Changed from Star to Tara for V1.6
Germplasm$CDBN_ID <- gsub("^GN-WM-85-43$", "Starlight", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Starlight$", "Starlight", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^starlight$", "Starlight", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^SW LRK 7$", "SW-LRK-7", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^SW LRK-7$", "SW-LRK-7", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^t_39$", "T-39", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^T39$", "T-39", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^T-39$", "T-39", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ND061210$", "Talon", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^D81123$", "Topaz", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^D81-123$", "Topaz", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Topaz$", "Topaz", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^topaz$", "Topaz", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ucd_96114$", "UC BTS 96114", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 0801$", "UCD-0801", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 0801$", "UCD-0801", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD0801$", "UCD-0801", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UC Flor 9623$", "UCD-9623", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCFlor9623$", "UCD-9623", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 9623$", "UCD-9623", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 9623 $", "UCD-9623", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ucd_9623$", "UCD-9623", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UC Pink 9634$", "UCD-9634", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 9634$", "UCD-9634", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ucd_9634$", "UCD-9634", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 9830$", "UCD-9830", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD 9830$", "UCD-9830", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UCD9830$", "UCD-9830", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_111$", "UI111", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-111$", "UI111", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_114$", "UI114", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-114$", "UI114", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^K0125$", "UI125", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_126$", "UI126", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-126$", "UI126", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^6137$", "UI137", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^K0158$", "UI158", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^GH196-2$", "UI196", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_196$", "UI196", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-196$", "UI196", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^93:220$", "UI320", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^93:220$", "UI320", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI320 \\(93:220)$", "UI320", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^88:539$", "UI259", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_239$", "UI239", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-239$", "UI239", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI259 \\(88:539)$", "UI259", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^KR42$", "UI42", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^K0425$", "UI425", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_425$", "UI425", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-425$", "UI425", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^90:465$", "UI465", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^90:465$", "UI465", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI465 \\(90:465)$", "UI465", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^55037$", "UI537", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_537$", "UI537", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-537$", "UI537", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_59$", "UI59", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-59$", "UI59", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^KL10$", "UI60", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_906$", "UI906", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-906$", "UI906", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ui_911$", "UI911", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UI-911$", "UI911", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UIP7-24P-lP$", "UIP7-24P-1P", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UNS117$", "UNS-117", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^76067$", "US1140", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^us_1140$", "US1140", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^US-1140$", "US1140", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^I9606-6$", "USBK-CBB-5", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USGN 5$", "USGN-5", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USGN5$", "USGN-5", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-7-8$", "USPT-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPK7-8$", "USPK-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT7-8$", "USPT-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK-7-4$", "USPK-7-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPK7-4$", "USPK-7-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK-7-4$", "USPK-7-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK7-4$", "USPK-7-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPK7-5$", "USPK-7-5", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK7-5$", "USPK-7-5", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK-9-4$", "USPK-9-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPK9-4$", "USPK-9-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPK9-4$", "USPK-9-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK-9-4$", "USPK-9-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK9-4$", "USPK-9-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT11-9$", "USPT-11-9", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT11-9$", "USPT-11-9", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT11-9$", "USPT-11-9", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT7-1$", "USPT-7-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK7-1$", "USPT-7-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK-7-1$", "USPT-7-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT7-1$", "USPT-7-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT-72$", "USPT-7-2", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-72$", "USPT-7-2", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-73$", "USPT-7-3", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT-73$", "USPT-7-3", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-73$", "USPT-7-3", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT-74$", "USPT-7-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-74$", "USPT-7-4", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK7-8$", "USPK-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PK-7-8$", "USPK-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPK-7-8$", "USPK-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT7-8$", "USPT-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT-7-8$", "USPT-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-7-8$", "USPT-7-8", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT 8-15$", "USPT-8-15", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT8-15$", "USPT-8-15", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT8-15$", "USPT-8-15", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT 8-6$", "USPT-8-6", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT8-6$", "USPT-8-6", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT8-6$", "USPT-8-6", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT9-18$", "USPT-9-18", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT9-18$", "USPT-9-18", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT9-18$", "USPT-9-18", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT 9-6$", "USPT-9-6", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^PT9-6$", "USPT-9-6", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT9-6$", "USPT-9-6", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-ANT-1$", "USPT-ANT-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^uspt_ant_1$", "USPT-ANT-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USPT-CBB-1$", "USPT-CBB-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^uspt_cbb_1$", "USPT-CBB-1", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USRM 20$", "USRM-20", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^usrm_20$", "USRM-20", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USRM20$", "USRM-20", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USRM-20$", "USRM-20", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^uswa_12$", "USWA-12", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USWA-12$", "USWA-12", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^93LB1803$", "USWA-50", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^6R122$", "Victor", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^6R-122$", "Victor", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Victor$", "Victor", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^victor$", "Victor", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Vista$", "Vista", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^vista$", "Vista", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^viva$", "Viva", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Viva $", "Viva", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^abc_weihing$", "Weihing", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Weihing$", "Weihing", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^WK 380$", "WK380", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^WSB 101$", "WSB101", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^Wyo166$", "WYO166", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^Wyo166D$", "WYO166", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^WYO166D$", "WYO166", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^WYO166D$", "WYO166", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^WYOMING 166$", "WYO166", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^Wyo167$", "WYO167", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Yolano$", "Yolano", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^yolano$", "Yolano", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^D76063$", "Zircon", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Zorro$", "Zorro", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^zorro$", "Zorro", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UIG4-6P-3P$", "Hungerford", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ABL2$", "Hungerford", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ABL 2$", "Hungerford", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^06I6$", "Hungerford", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UIG4-53P-2P$", "Sawtooth", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ABL6$", "Sawtooth", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ABL 6$", "Sawtooth", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^06I1$", "Sawtooth", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^D77216$", "Garnet", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^77125$", "Sapphire", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^D77125$", "Sapphire", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^IS-4913-B$", "Burke", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^IS-4931-B$", "Rojo Chiquito", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^IS-4931$", "Rojo Chiquito", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^USWA-6$", "Rojo Chiquito", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^GR-122$", "Victor", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UIP15-53G-4G$", "Shoshone", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^UIP15-53G-4G-1$", "Shoshone", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ABL8$", "Shoshone", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^ABL 8$", "Shoshone", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^06I4$", "Shoshone", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("^KP97$", "UI129", Germplasm$CDBN_ID)

—————————-

Here are a few punctuation fixes, as examples:

# Standardise separators: every hyphen, colon and space in a CDBN_ID becomes
# an underscore (one character-for-character translation).
Germplasm$CDBN_ID <- chartr("-: ", "___", Germplasm$CDBN_ID)

# Name fixes that are written against the underscore form produced above.
Germplasm$CDBN_ID <- gsub("Long's_Peak", "Longs_Peak", Germplasm$CDBN_ID, fixed = TRUE)
Germplasm$CDBN_ID <- gsub(pattern = "^ICB_10_5$", replacement = "ICB_10", x = Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub(pattern = "^CO_75511$", replacement = "Grand_Mesa", x = Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub(pattern = "^CDC_Expresso$", replacement = "CDC_Expression", x = Germplasm$CDBN_ID)
# Disabled fixes, kept for reference:
#Germplasm$CDBN_ID <- gsub("^Grand Mesa$", "Grand_Mesa", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^OAC_Dubli$", "OAC_Dublin", Germplasm$CDBN_ID)
#Germplasm$CDBN_ID <- gsub("^Kamiakan$", "Kamiakin", Germplasm$CDBN_ID)

Are these regex fixes unnecessary for V1.9? That is, is Jeff’s V1.6 already corrected for these synonyms and standardized? Both have 547 varieties now.

Well, some of them certainly are, but it’s hard to tell which and I don’t think it’s necessary to go into this code and remove those exhaustively. Cargo cult code go!

# Number of distinct CDBN_ID values. Counts noted here previously: 567, 546, 547, 537.
sum(!duplicated(Germplasm$CDBN_ID))

## [1] 549

# Number of distinct Genotype values. Counts noted here previously: 581, 558, 547, 537.
sum(!duplicated(Germplasm$Genotype))

## [1] 549

1a. Clean the kinship matrix generated from the GBS data

# Read the tab-separated kinship file, skipping its first three lines. The
# first column (V1) is split after its 9th character into Seq_ID and CDBN_ID,
# and an underscore left at the end of Seq_ID is removed.
kinship <- read.table(
  "../../CDBN Genomics/CDBN_SNPs/8_Tassel/Filter_159Ct_MAF1per_CDBN_001to324_kinship.txt",
  skip = 3,
  sep = "\t"
) %>%
  separate(V1, into = c("Seq_ID", "CDBN_ID"), sep = 9)
kinship$Seq_ID <- sub("_$", "", kinship$Seq_ID)

# Name the remaining columns after the CDBN_ID of each row, in row order.
colnames(kinship) <- c("Seq_ID", "CDBN_ID", kinship$CDBN_ID)

# Diagnostic: kinship entries whose CDBN_ID has no match in Germplasm.
dplyr::select(anti_join(kinship, Germplasm, by = "CDBN_ID"), CDBN_ID)

# Drop six entries from the matrix, both as rows and as columns.
kinship <- kinship %>%
  filter(!(CDBN_ID %in% c("UNS_117_2", "Canario", "Canario707_2", "L94C356_2", "Stampede_2", "Beryl_2"))) %>%
  dplyr::select(-UNS_117_2, -Canario, -Canario707_2, -L94C356_2, -Stampede_2, -Beryl_2)

# write.table(kinship, "../../CDBN Genomics/CDBN SNPs/8_Tassel/Filter_159Ct_MAF1per_CDBN_312_kinship.txt", sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)

1b. Test that the second round of punctuation fixes actually ties the CDBN_ID keys to the Germplasm table.

The kinship table below is empty, which means that there are no remaining CDBN_ID’s in the sequenced kinship matrix or metadata for sequenced lines that are unaccounted for.

# Kinship entries whose CDBN_ID is still absent from Germplasm.
dplyr::select(anti_join(kinship, Germplasm, by = "CDBN_ID"), CDBN_ID)

# Sequencing-metadata entries whose CDBN_ID is still absent from Germplasm.
dplyr::select(anti_join(metadata, Germplasm, by = "CDBN_ID"), Seq_ID, CDBN_ID)

2. Join Germplasm and Metadata

Coalesce Jeff’s Market_class with my Market.class, which I have just for the sequenced CDBN entries. Standardize market class names.

# Tidy the Germplasm table:
#   1. turn empty strings in character columns into NA;
#   2. within each CDBN_ID, fill the descriptive columns downwards and then
#      upwards so every row of an entry carries whatever value is available;
#   3. drop the per-trial columns and remove duplicated rows.
# The result is still grouped by CDBN_ID.
Germplasm <- Germplasm %>%
  #dplyr::select(-(Location_code:Prev_names), -Race) %>%
  # na_if is passed directly (with "" as its second argument) instead of via
  # the deprecated funs() wrapper.
  mutate_if(is.character, na_if, "") %>%
  group_by(CDBN_ID) %>%
  fill(Market_class, Orig_name, Prev_names, Race, Institution, Year_released, Source) %>%
  fill(Market_class, Orig_name, Prev_names, Race, Institution, Year_released, Source, .direction = "up") %>%
  dplyr::select(-Race, -Location_code, -Year, -Orig_name, -Prev_names) %>%
  unique()

# Diagnostic for remaining Germplasm inconsistencies: fold "Kidney" and
# "Red Kidney" into "Light Red Kidney", then count the distinct
# Genotype / Gene_pool / Market_class rows per CDBN_ID, largest count first.
Germplasm %>%
  dplyr::select(Genotype, Gene_pool, Market_class, CDBN_ID) %>%
  mutate(
    Market_class = ifelse(
      Market_class %in% c("Kidney", "Red Kidney"),
      "Light Red Kidney",
      Market_class
    )
  ) %>%
  unique() %>%
  group_by(CDBN_ID) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

# Genotype / Market_class / CDBN_ID lookup with "Kidney" and "Red Kidney"
# both folded into "Light Red Kidney"; duplicated rows removed.
Germplasm2 <- Germplasm %>%
  dplyr::select(Genotype, Market_class, CDBN_ID) %>%
  mutate(
    Market_class = ifelse(
      Market_class %in% c("Kidney", "Red Kidney"),
      "Light Red Kidney",
      Market_class
    )
  ) %>%
  unique()

# Attach the sequencing metadata to every germplasm row, move the identifying
# columns to the front, and drop rows whose CDBN_ID is a summary-statistic
# label (LSD, CV, mean, ...) rather than a variety. The outer parentheses
# print the result.
# The join key is named explicitly so that a column later shared by both
# tables cannot silently become part of the key.
(Germplasm_ahm <- Germplasm2 %>%
  left_join(metadata, by = "CDBN_ID") %>%
  dplyr::select(Genotype, CDBN_ID, Seq_ID, Gene_pool:Market.class, Market_class, everything()) %>%
  filter(!(CDBN_ID %in% c("5%_LSD", "CV", "CV_%", "CV_(%)", "Grand_Mean", "LSD", "LSD_0.05", "LSD_05", "LSD_5", "X", "MEAN", "Mean", "CV (%)", "LSD (P < .05)", "LSD_(P_<_.05)"))))

## Joining, by = "CDBN_ID"

# Sanity check on the number of Germplasm rows after the join: distinct rows
# per CDBN_ID / Seq_ID pair, ordered by Seq_ID. (An earlier run noted 546
# varieties in total.)
arrange(
  summarise(
    group_by(unique(Germplasm_ahm), CDBN_ID, Seq_ID),
    count = n()
  ),
  Seq_ID
)

# Tabulate the market classes recorded in the sequencing metadata.
Germplasm_ahm$Market.class %>%
  as.factor() %>%
  summary()

##                                      Black 
##                                         28 
##                                  Cranberry 
##                                          8 
##                            Dark Red Kidney 
##                                         19 
##                               Flor de Mayo 
##                                          3 
##                             Great Northern 
##                                         35 
## Kidney (Light Red Kidney unless very dark) 
##                                          1 
##                           Light Red Kidney 
##                                         21 
##                                       Navy 
##                                         22 
##                           Navy/Small White 
##                                         29 
##                                       Pink 
##                                         21 
##                                      Pinto 
##                                         78 
##                                  Small Red 
##                                         13 
##                                Small White 
##                                         22 
##                               White Kidney 
##                                         10 
##                                     Yellow 
##                                          3 
##                                       NA's 
##                                        233

# Tabulate the market classes recorded in the Germplasm worksheet.
Germplasm_ahm$Market_class %>%
  as.factor() %>%
  summary()

##             (misc)            Anasazi              Black 
##                  1                  1                 35 
##          Cranberry    Dark Red Kidney       Flor de Mayo 
##                 12                 16                  4 
##     Great Northern   Light Red Kidney               Navy 
##                 52                 43                 28 
##               NaVY   Navy/Small White               Pink 
##                  1                 44                 31 
##              Pinto                Red Red Soldier Kidney 
##                131                 10                  1 
##           Red/Pink          Small Red        Small White 
##                  3                 11                 30 
##               Tebo       White Kidney             Yellow 
##                  1                 11                  4 
##               NA's 
##                 76

Use Market_class_ahm as that has fewer typos and less cleaning to do, and then fill from Jeff’s Market_class as needed. Then correct those typos. After that, fill Race and Gene_pool for different Market classes.

# Market_class_ahm takes Market.class where it is present and falls back to
# Market_class otherwise.
Germplasm_ahm$Market_class_ahm <- coalesce(
  Germplasm_ahm$Market.class,
  Germplasm_ahm$Market_class
)

# Standardise Market_class_ahm, then derive Race from the market class and
# Gene_pool from Race. Rows whose market class is not listed keep their
# existing Race / Gene_pool values.
Germplasm_ahm <- Germplasm_ahm %>%
  dplyr::select(Genotype, CDBN_ID, Seq_ID, Market_class_ahm, Gene_pool:Market.class, Market_class, everything()) %>%
  mutate(
    # Merge synonymous labels and blank out placeholder values.
    # NOTE(review): "Ana" is matched exactly, so the "Anasazi" class is left
    # untouched by this rule - confirm that is intended.
    Market_class_ahm = case_when(
      Market_class_ahm == "Navy/Small White" ~ "Navy",
      Market_class_ahm %in% c("Pinks", "Red/Pink") ~ "Pink",
      Market_class_ahm == "Small Red" ~ "Red",
      Market_class_ahm == "Kidney (Light Red Kidney unless very dark)" ~ "Light Red Kidney",
      Market_class_ahm == "Red Soldier" ~ "Red Soldier Kidney",
      Market_class_ahm %in% c("", "(misc)", "Ana") ~ NA_character_,
      TRUE ~ Market_class_ahm
    ),
    # Race by market class; the three class lists do not overlap.
    Race = ifelse(
      Market_class_ahm %in% c("Pinto", "Pink", "Great Northern", "Red", "Flor de Mayo"),
      "DurangoJalisco",
      ifelse(
        Market_class_ahm %in% c("Navy", "Small White", "Black"),
        "Mesoamerican",
        ifelse(
          Market_class_ahm %in% c("Yellow", "Red Kidney", "Light Red Kidney", "Dark Red Kidney", "White Kidney", "Red Soldier Kidney", "Cranberry"),
          "Nueva Granada",
          Race
        )
      )
    ),
    # Gene pool by race.
    Gene_pool = ifelse(
      Race %in% c("DurangoJalisco", "Mesoamerican"),
      "MA",
      ifelse(Race %in% c("Nueva Granada"), "Andean", Gene_pool)
    )
  )

# Tabulate the cleaned market classes.
Germplasm_ahm$Market_class_ahm %>%
  as.factor() %>%
  summary()

##            Anasazi              Black          Cranberry 
##                  1                 34                 12 
##    Dark Red Kidney       Flor de Mayo     Great Northern 
##                 21                  4                 55 
##   Light Red Kidney               Navy               Pink 
##                 40                 72                 35 
##              Pinto                Red Red Soldier Kidney 
##                132                 22                  1 
##        Small White               Tebo       White Kidney 
##                 34                  1                 11 
##             Yellow               NA's 
##                  4                 67

# Tabulate the assigned races.
Germplasm_ahm$Race %>%
  as.factor() %>%
  summary()

## DurangoJalisco   Mesoamerican  Nueva Granada           NA's 
##            248            140             89             69

3. Make a Germplasm table to add to the Phenotypes worksheet

# Keep only the germplasm columns that are joined onto the Phenotypes sheet.
Germplasm_small <- dplyr::select(
  Germplasm_ahm,
  Genotype, CDBN_ID, Seq_ID, Gene_pool, Race, Market_class_ahm, Det_scr
)
  
# Germplasm_small$CDBN_ID

3a. Join small Genotypes table to Phenotypes worksheet

# Pull the Phenotypes worksheet out of the workbook list.
Phenotypes <- lst$Phenotypes

# Add the germplasm descriptors to each phenotype row (matched on Genotype)
# and move the identifying columns to the front.
Phenotypes_ahm <- dplyr::select(
  left_join(Phenotypes, Germplasm_small, by = "Genotype"),
  Genotype, CDBN_ID, Seq_ID, Location_code, Year, Gene_pool, Race, Market_class_ahm, everything()
)

4. Add Climate_bin to Locations worksheet

Using the new file for the Location data: CDBN_weather_stations_V1.0_GL_JW1.xlsx which has new complete latitudes, longitudes, and elevations for 83 sites, finished in early November 2017 by Greg Lohrey (Thanks!). So now this step will:

* combine it with the Locations in the All_experiments datasheet 
* combine that with the location code translations that I have come up with for my climate_bins.

# Location-code translations, including the climate bins.
loc_transl_wb <- loadWorkbook("../CDBN Phenotype Summaries/Location_code_translations_w_JW_check.xlsx")
loc_climatebin <- readWorksheet(loc_transl_wb, sheet = getSheets(loc_transl_wb))
# All we really care about are State and Climate_bin here, and we need to be
# able to link it to the other two Location tables by Location_code.
small_climatebin <- dplyr::select(loc_climatebin, Location_code, State, Climate_bin)

# Weather-station workbook; the CDBN_locations sheet holds the site table.
weather_station_wb <- loadWorkbook("../CDBN Phenotype Summaries/CDBN_weather_stations_V1.0_GL_JW1.xlsx")
station_lst <- readWorksheet(weather_station_wb, sheet = getSheets(weather_station_wb))
stations <- station_lst$CDBN_locations

# Locations worksheet from the experiments workbook.
Locations <- lst$Locations

# Climate-bin translations with the station table attached by Location_code.
weather_climate <- left_join(loc_climatebin, stations, by = "Location_code")

# Location-by-year table enriched with climate bin and station coordinates.
# Numeric values of -99 are turned into NA, rows are ordered by descending
# Latitude, and rows with an empty Location_code are dropped.
Locations_ahm <- Locations %>%
  left_join(weather_climate, by = c("Location_code")) %>%
  dplyr::select(Location_code, Climate_bin, Latitude, Lat_best, Longitude, Long_best, Elev_best, everything()) %>%
  # na_if is passed directly (with -99 as its second argument) instead of via
  # the deprecated funs() wrapper.
  mutate_if(is.numeric, na_if, -99) %>%
  arrange(desc(Latitude)) %>%
  filter(Location_code != "")

Fill missing data for locations

# Turn empty strings into NA, then, within each Location_code, fill the
# site-level columns downwards and upwards so every year of a location
# carries whatever value is available. The result stays grouped by
# Location_code.
Locations_ahm <- Locations_ahm %>%
  #dplyr::select(-(Location_code:Prev_names), -Race) %>%
  # na_if is passed directly (with "" as its second argument) instead of via
  # the deprecated funs() wrapper.
  mutate_if(is.character, na_if, "") %>%
  group_by(Location_code) %>%
  fill(Institution, Research_sta, location_country, loc_2nd_level, loc_3rd_level, Latitude, Lat_best, Longitude, Long_best, Elev_best, Elev_original) %>%
  fill(Institution, Research_sta, location_country, loc_2nd_level, loc_3rd_level, Latitude, Lat_best, Longitude, Long_best, Elev_best, Elev_original, .direction = "up")

# Diagnostic: Year / Location_code combinations of Locations that are missing
# from Locations_ahm.
anti_join(Locations, Locations_ahm, by = c("Year", "Location_code"))

4a. Some locations don’t have associated Latitude and Longitude

But, because the unique locations don’t have any missing values for lat/long, these can be filled in from other datapoints that share the same location code. So here fix Lat/Long for the 11 year*location combinations.

2 locations have missing Lat_best/Long_best: MISA and MNRA

6 locations have missing Latitude/Longitude but Lat_best values to fill from: TXCS TXVE TXMU MOPO COF2 CAD2

IDK2 is missing Lat_best and Latitude, but these values can be filled from IDKI, the same site with a different treatment.

# Rows with no weather-station latitude.
filter(Locations_ahm, is.na(Lat_best)) # IDK2 MIS2 MISA

# Rows with no site latitude.
filter(Locations_ahm, is.na(Latitude)) # MIS2 TXCS TXVE TXMU MOPO COF2 CAD2 IDK2

4b. Fix IDK2

IDK2 should have the same lat/long as IDKI.

# Show the IDKI and IDK2 rows side by side.
filter(Locations_ahm, Location_code %in% c("IDKI", "IDK2"))

# Just give IDK2 IDKI's data.

# Overwrite the IDK2 rows with fixed coordinates, elevation and climate bin;
# every other row is left as it is.
Locations_ahm <- Locations_ahm %>%
  mutate(
    Climate_bin = ifelse(Location_code == "IDK2", "RockiesWest", Climate_bin),
    Latitude    = ifelse(Location_code == "IDK2", 42.55103, Latitude),
    Lat_best    = ifelse(Location_code == "IDK2", 42.55103, Lat_best),
    Longitude   = ifelse(Location_code == "IDK2", -114.34, Longitude),
    Long_best   = ifelse(Location_code == "IDK2", -114.34, Long_best),
    Elev_best   = ifelse(Location_code == "IDK2", 1200, Elev_best)
  )

4c. Supply missing Lat/Long or Lat_best/Long_best

Fix issues where only one of the two lat/long column pairs - site or weather station - has information. To fix this, use the lat/long information from the other pair.

# Copy coordinates across where only one of the two column pairs is filled:
#   - six sites take their site Latitude / Longitude from Lat_best / Long_best;
#   - MISA takes Lat_best / Long_best from its site Latitude / Longitude.
Locations_ahm <- Locations_ahm %>%
  mutate(
    Latitude = ifelse(
      Location_code %in% c("MOPO", "TXCS", "TXMU", "TXVE", "COF2", "CAD2"),
      Lat_best,
      Latitude
    ),
    Longitude = ifelse(
      Location_code %in% c("MOPO", "TXCS", "TXMU", "TXVE", "COF2", "CAD2"),
      Long_best,
      Longitude
    ),
    Lat_best = ifelse(Location_code %in% c("MISA"), Latitude, Lat_best),
    Long_best = ifelse(Location_code %in% c("MISA"), Longitude, Long_best)
  )

4d. Fix MIS2 to have Lat/Long of MISA

# Overwrite the MIS2 rows with a fixed climate bin and coordinates; every
# other row is left as it is.
Locations_ahm <- Locations_ahm %>%
  # filter(Location.x == "Saginaw")
  mutate(
    Climate_bin = ifelse(Location_code == "MIS2", "GreatLakes", Climate_bin),
    Latitude    = ifelse(Location_code == "MIS2", 43.39521, Latitude),
    Lat_best    = ifelse(Location_code == "MIS2", 43.39521, Lat_best),
    Longitude   = ifelse(Location_code == "MIS2", -83.68696, Longitude),
    Long_best   = ifelse(Location_code == "MIS2", -83.68696, Long_best)
  )

5. Make a unique Locations sheet.

Locations_ahm currently has all of the location-by-year combinations, which is too repetitive and is not tidy, given that the latitudes, longitudes, and elevations don’t change by year. Make a location-by-years datasheet with all of the location and year combinations, and a locations datasheet that has only location-specific information.

# Collapse the location-by-year table to one row per Location_code.
#   Num_Year   = number of rows (location-year records) for the location
#   First_Year = earliest Year recorded for the location
# First_Year is computed over the whole group before the group is reduced to a
# single row; taking Year from the retained row would depend on the row order
# of Locations_ahm rather than on the actual earliest year.
Locations_unique <- Locations_ahm %>%
  group_by(Location_code) %>%
  mutate(Num_Year = n(),
         First_Year = min(Year, na.rm = TRUE)) %>%
  filter(row_number(State.x) == 1) %>%
  # Drop the grouping so downstream verbs operate on the whole table.
  ungroup() %>%
  arrange(Location_code) %>%
  mutate(State = State.x,
         Location = Location.x) %>%
  dplyr::select(Location_code, Climate_bin, State, Latitude, Longitude, Lat_best, Long_best, Elev_best, Location, First_Year, Num_Year, Institution, Research_sta, Soil_series, Soil_class, NOAA, Notes)

6. Clean and enrich the `Locations_ahm` sheet

The Locations_ahm sheet has the Locations_by_year data.

Add estimated or actual planting dates to this sheet, and ensure names are synced between the planting data estimates file and the Locations_by_Year sheet.

Clean up estimated planting date tibble

First, which are the problem locations with estimated planting dates that don’t match the locations_by_years table?

# Planting-date rows that have no matching Location_code/Year in Locations_ahm.
anti_join(Plantings_est, Locations_ahm, by = c("Location_code", "Year"))

  #filter(!(Location_code %in% c("COGR", "MNRO", "NYLE", "NYRO", "NYVA", "NES3"))) # CAD2 MIFR MIMO NEMI NES3 WAPR

# Location-by-year rows that have no planting-date record (expected: none).
dplyr::select(
  anti_join(Locations_ahm, Plantings_est, by = c("Location_code", "Year")),
  Location_code, Year
) # 0 this time!! Woo!!!!

Now, make a DSSATwthcode variable to merge this data with the weather data. Mostly, sites that end in “2” are renamed. Usually these sites were treatment plots at the same site as the one with the first three letters of the code, so the weather at the treatment plot is the same, only the treatment - e.g. white mold control, drought - differs.

# Keep only the planting-date columns and derive DSSATwthCode: six location
# codes are mapped to the code of another site; every other location keeps its
# own code.
Plantings_est <- Plantings_est %>%
  dplyr::select(Location_code, Year, Planting_date_est, PD_check) %>%
  mutate(
    DSSATwthCode = case_when(
      Location_code == "CAD2" ~ "CADV",
      Location_code == "COF2" ~ "COFC",
      Location_code == "IDK2" ~ "IDKI",
      Location_code == "IDP2" ~ "IDPA",
      Location_code == "NES2" ~ "NESB",
      Location_code == "NES3" ~ "NEMI",
      TRUE ~ Location_code
    )
  )


# Expand the one-letter PD_check codes into descriptive labels; any other
# value is passed through unchanged.
Plantings_est <- Plantings_est %>%
  mutate(
    PD_check = case_when(
      PD_check == "M" ~ "Not_estimated",
      PD_check == "R" ~ "Known",
      PD_check == "E" ~ "Estimated",
      TRUE ~ PD_check
    )
  )

Now that the problem locations are fixed or dropped, join the estimated planting dates to the location dataset.

Also, format the planting date estimate so that R knows it is a date.

# Attach the planting-date columns by location and year, move them next to the
# key columns, and coerce the planting date to class Date.
Locations_ahm <- Locations_ahm %>%
  left_join(Plantings_est, by = c("Location_code", "Year")) %>%
  dplyr::select(Location_code:Year, DSSATwthCode, Planting_date_est, PD_check, everything()) %>%
  mutate(Planting_date_est = as.Date(Planting_date_est))

——————

7. Clean phenotypes and add new phenotypes to `Phenotypes_ahm`

- Clean yield, DTM, DTF, and plant height, and remove or examine outliers.
- Clean unit seed weight to add this as a phenotype for the three subsequent GxE analyses.
- Clean growth habit to add this as a phenotype for the three bean GxE analyses.
- Add duration of grainfill (DTM-DTF), as Jeff requested this as a new phenotype.

7a. Basic phenotype tidying

Basic phenotype tidying is needed first. Start by cleaning up Phenotypes table. Remove “Genotypes” that are actually statistical quantities, and replace blank entries and “-99” entries with NA’s.

# Convert the -99 sentinel (double and integer columns) and empty strings
# (character columns) to NA, drop rows whose "Genotype" is a summary statistic
# or check label, and sort newest year first.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate_if(is.double, na_if, -99) %>%
  mutate_if(is.integer, na_if, -99) %>%
  mutate_if(is.character, na_if, "") %>%
  filter(!Genotype %in% c("CV", "CV %", "CV (%)", "Grand Mean", "LSD", "LSD 0.05", "5% LSD", "LSD 5", "LSD 05", "MEAN", "X", "E.Gallatin", "RUST_CHECK", "LSD (P < .05)" , "Mean", "RUST CHECK")) %>%
  arrange(desc(Year))

# Number of non-missing values in each column.
vapply(Phenotypes_ahm, function(column) sum(!is.na(column)), numeric(1))

##                  Genotype                   CDBN_ID 
##                     19415                     19415 
##                    Seq_ID             Location_code 
##                     14780                     19415 
##                      Year                 Gene_pool 
##                     19415                     18646 
##                      Race          Market_class_ahm 
##                     18646                     18666 
##               Yield_kg_ha            DAYS_TO_FLOWER 
##                     18242                      8916 
##          DAYS_TO_MATURITY           Unit_seed_wt_mg 
##                     12005                     14358 
##               Test_weight        Days_to_full_bloom 
##                       350                         0 
##      DAYS_TO_BLOOM_50_PER         Seedfill_duration 
##                       170                      2629 
##           DAYS_TO_HARVEST          Harvest_maturity 
##                       119                       339 
##         Ripening_date_scr              Maturity_scr 
##                        36                        74 
##              Plant_height             Canopy_height 
##                      2627                       481 
##              Growth_habit          Growth_habit_scr 
##                      2602                        32 
##           Plant_type_eval      Internode_length_scr 
##                        74                       530 
##             Pod_clearance               Plant_width 
##                       250                        78 
##         Pod_position_eval                Pod_height 
##                        34                        64 
##        Branch_length_eval               Lodging_scr 
##                        62                       241 
##               Lodging_1_5               Lodging_0_9 
##                      2203                        74 
##               Lodging_1_9               Lodging_per 
##                       975                        35 
##                 Yield_day        Yield_Day_Seedfill 
##                      4172                      3460 
##                   Biomass               Biomass_day 
##                      2899                      1459 
##             Harvest_index             Emergence_scr 
##                      3193                        60 
##           Early_vigor_scr                 Stand_per 
##                       763                        64 
##                Stand_code          Seed_quality_scr 
##                        20                       147 
##       Seed_appearance_scr         Seed_appear_desir 
##                       314                         0 
##     Seed_appear_desir_scr          Desirability_scr 
##                       559                       218 
##    Field_desirability_scr        Harvestability_scr 
##                        72                       232 
##           Adaptation_eval              Desirability 
##                        20                         0 
##               Pod_set_scr        MN_deficiency_eval 
##                        39                        11 
##             Zinc_dwarfing            Zinc_yellowing 
##                        36                        36 
##           Zinc_defic_eval            Zinc_defic_scr 
##                        24                        38 
##            Air_pollut_scr                 Ozone_scr 
##                       387                        24 
##              Bronzing_scr               Disease_scr 
##                        83                        23 
##          Anthracnose_eval                 BCMV_eval 
##                         0                       124 
##                   CBB_scr                   CBB_per 
##                       556                       450 
##                  CBB_eval     CBB_Foliage_per_innoc 
##                        36                        40 
## CBB_Foliage_per_non_innoc           CBB_pustule_scr 
##                        40                        20 
##            Blight_Pod_per             Curly_Top_per 
##                        40                        88 
##       Curly_top_virus_scr            Fusarium_emerg 
##                        96                        32 
##     Fusarium_seedling_vig       Fusarium_seed_yield 
##                        32                        32 
##          Fusarium_wilt_GH           Halo_blight_scr 
##                        33                       134 
##                Halo_B_per        Powdery_mildew_scr 
##                        70                        37 
##                Rhizoc_scr        Root_rot_emerg_scr 
##                        30                        36 
##    Root_rot_early_vig_scr       Root_rot_seed_yield 
##                        36                        36 
##              Root_rot_scr             Root_rot_eval 
##                        58                        43 
##                  Rust_scr                Rust_scr_c 
##                       316                        80 
##             Rust_CIAT_scr                  Rust_per 
##                        23                       369 
##                 Rust_eval          Rust_Foliage_per 
##                       489                        40 
##         Rust_Pustule_Type            White_mold_scr 
##                        32                       128 
##            White_mold_per           White_mold_eval 
##                       353                         6 
##       White_mold_scr_val2             White_mold_GH 
##                         0                        33 
##           Cooking_quality        Seed_color_uniform 
##                        34                        36 
##           Seed_wt_cul_dry       Seed_wt_cul_imbibed 
##                        36                        36 
##        Seed_dry_imb_ratio        Seed_cooked_appear 
##                        36                       100 
##           Seed_splits_cul           Seed_luminosity 
##                        36                        72 
##               Seed_chroma                  Seed_hue 
##                        72                        72 
##         Halo_blight_scr_1         Halo_blight_scr_2 
##                        45                        46 
##         Halo_blight_scr_3            Air_pollut_per 
##                        46                        46 
##                 Merit_scr          Fe_chlorosis_scr 
##                        69                        46 
##              Seed_L_color         Stand_uniform_scr 
##                        28                        34 
##           Row_closure_scr     Phenology_stage_51DAP 
##                        34                        34 
##        Growth_density_scr         Tunnel_effect_scr 
##                        34                        34 
##          Architecture_scr            Vine_habit_scr 
##                        34                        34 
##          Pod_maturity_scr             Plant_density 
##                        34                        36 
##            Pod_height_scr                Podset_scr 
##                        34                        34 
##             Flat_vine_scr                 Bush_Type 
##                        34                        34 
##             Pods_Peduncle             Seeds_per_Pod 
##                        34                        34 
##              Flower_color        General_appearance 
##                        41                        41 
##           Air_pollut_rate          Spray_injury_scr 
##                        41                        40 
##     Plant_vigor_bloom_scr            Plant_type_scr 
##                        41                        41 
##               Quality_scr                Blight_scr 
##                        41                        44 
##        Zinc_reaction_eval             Plant_width_2 
##                        13                        10 
##    Plant_architecture_scr              Fusarium_scr 
##                         5                        40 
##      White_mold_stems_per       Halo_blight_lvs_per 
##                         0                        32 
##      Halo_blight_pods_per               Rust_eval_1 
##                        32                        40 
##              Rust_eval_2_               Rust_eval_3 
##                         0                         0 
##                Plant_type                Virus_eval 
##                        64                        84 
##            Curly_top_eval          Drydown_duration 
##                        73                        28 
##          Frost_damage_scr       CBB_Innoculated_per 
##                        28                        36 
##                       GH2        Leaf_retention_scr 
##                         0                        26 
##               Rust_eval_2       White_mold_porosity 
##                        40                        40 
##                  Num_miss              Num_numerics 
##                     18530                     19415 
##                 Pref_name                Prev_names 
##                      1869                      4643 
##                 Orig_name                   Det_scr 
##                      1869                     14801

Make a small locations table to join with phenotypes also.

# Slim per-location lookup (state, climate bin, coordinates) ...
Locations_small <- dplyr::select(Locations_unique, Location_code, State, Climate_bin, Latitude, Longitude)

# ... joined onto the phenotypes (natural join on the shared column), with the
# identifying columns moved to the front.
Phenotypes_ahm <- left_join(Phenotypes_ahm, Locations_small)
Phenotypes_ahm <- dplyr::select(Phenotypes_ahm, Genotype, CDBN_ID, Seq_ID, Location_code, Year, Gene_pool, Race, Market_class_ahm, State, Climate_bin, Latitude, Longitude, everything())

## Joining, by = "Location_code"

Kusmec et al 2017 (Nature Plants) said that they removed outliers like so:

First, phenotypes measured in only one or two environments were removed. Second, the interquartile ranges (IQRs) were calculated for each RIL across environments and for each environment across RILs within a phenotype. Any trait measurements of RILs that were more than 1.5 times larger or smaller than either of the IQRs was removed. Finally, for a given phenotype any RIL that was not measured in at least three environments was removed.

Jeff thinks I should not do this for yield, however. But if I do, he thinks there’s more argument for removing high outliers than low outliers.

7b. Clean yield

What does yield look like? Are there any obvious outliers?

There is what looks like an excess of small values - a heavy left tail. Also, there are outliers more than 1.5 × IQR above Q3 or below Q1 (the dots in the boxplot).

# Histogram of raw yield.
ggplot(Phenotypes_ahm, aes(x = Yield_kg_ha)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1173 rows containing non-finite values (stat_bin).

# Single boxplot of raw yield over all records.
ggplot(Phenotypes_ahm, aes(x = 1, y = Yield_kg_ha)) +
  geom_boxplot()

## Warning: Removed 1173 rows containing non-finite values (stat_boxplot).

# Yield by climate bin.
ggplot(Phenotypes_ahm, aes(x = Climate_bin, y = Yield_kg_ha)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 1173 rows containing non-finite values (stat_boxplot).

# Yield by state.
ggplot(Phenotypes_ahm, aes(x = State, y = Yield_kg_ha)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 1173 rows containing non-finite values (stat_boxplot).

# Yield by market class, with slanted x labels for readability.
ggplot(Phenotypes_ahm, aes(x = Market_class_ahm, y = Yield_kg_ha)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 1173 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

# Working copy of yield; the cleaning below edits this column and leaves
# Yield_kg_ha untouched.
Phenotypes_ahm[["Yield_ahm"]] <- Phenotypes_ahm[["Yield_kg_ha"]]

7bi. Drop rare varieties

Drop varieties without yields measured in at least three environments, and environments that don’t measure at least three yields.

# Varieties (CDBN_ID x Seq_ID) with few yield observations.
# NOTE(review): the accompanying text says "at least three", but the cutoff
# here flags groups with 4 or fewer observations - confirm the intended value.
Var_drop_yield <- Phenotypes_ahm %>%
  group_by(CDBN_ID, Seq_ID) %>%
  summarise(yield_median = median(Yield_kg_ha, na.rm = TRUE),
            yield_IQR = IQR(Yield_kg_ha, na.rm = TRUE),
            yield_datapoints = sum(!is.na(Yield_kg_ha))) %>%
  filter(yield_datapoints <= 4) %>%
  arrange(yield_datapoints)

# Locations with few yield observations (same cutoff).
Env_drop_yield <- Phenotypes_ahm %>%
  group_by(Location_code) %>%
  summarise(yield_median = median(Yield_kg_ha, na.rm = TRUE),
            yield_IQR = IQR(Yield_kg_ha, na.rm = TRUE),
            yield_datapoints = sum(!is.na(Yield_kg_ha))) %>%
  filter(yield_datapoints <= 4) %>%
  arrange(yield_datapoints)

# Blank the cleaned yield for any record from a flagged variety or location.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(Yield_ahm = ifelse(CDBN_ID %in% Var_drop_yield$CDBN_ID |
                              Location_code %in% Env_drop_yield$Location_code,
                            NA,
                            Yield_ahm))

# Inspect the records that now have no cleaned yield.
filter(Phenotypes_ahm, is.na(Yield_ahm))

7c. Clean unit seed weight

What does seed weight look like? Are there outliers?

There are values of about -1000 (actually -990), which is clearly a problem. Also, some outliers seem surprisingly high, particularly in Alberta/ABBR.

# Working copy of unit seed weight for cleaning.
Phenotypes_ahm[["Unit_seed_wt_mg_ahm"]] <- Phenotypes_ahm[["Unit_seed_wt_mg"]]

# Histogram of seed weight in 50 mg bins.
ggplot(Phenotypes_ahm, aes(x = Unit_seed_wt_mg_ahm)) +
  geom_histogram(binwidth = 50)

## Warning: Removed 5057 rows containing non-finite values (stat_bin).

# Single boxplot of seed weight over all records.
ggplot(Phenotypes_ahm, aes(x = 1, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot()

## Warning: Removed 5057 rows containing non-finite values (stat_boxplot).

# Seed weight by climate bin.
ggplot(Phenotypes_ahm, aes(x = Climate_bin, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 5057 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

# Seed weight by state.
ggplot(Phenotypes_ahm, aes(x = State, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 5057 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

# Seed weight by location code, with slanted x labels.
ggplot(Phenotypes_ahm, aes(x = Location_code, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 5057 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

# Seed weight by race.
ggplot(Phenotypes_ahm, aes(x = Race, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 5057 rows containing non-finite values (stat_boxplot).

# Seed weight by market class, with slanted x labels.
ggplot(Phenotypes_ahm, aes(x = Market_class_ahm, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 5057 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

7ci. Get rid of negative weights

The problem here is another typo. Values of -99 (which are Jeff’s NA’s) have accidentally been entered as -990. Well, that’s easy to fix.

# List records whose seed weight is zero or negative.
filter(Phenotypes_ahm, Unit_seed_wt_mg_ahm <= 0)

# Clean unit seed weight. The raw column only loses the mis-keyed -990 values;
# all other removals are applied to the working copy Unit_seed_wt_mg_ahm.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # -990 is a mis-keyed -99 (the missing-value code).
    Unit_seed_wt_mg = ifelse(Unit_seed_wt_mg == -990,
                             NA,
                             Unit_seed_wt_mg),
    # The working copy was taken before the -990 fix above, so it still holds
    # those values; remove every non-positive weight from it as well.
    Unit_seed_wt_mg_ahm = ifelse(Unit_seed_wt_mg_ahm <= 0,
                                 NA,
                                 Unit_seed_wt_mg_ahm),
    # Site-years whose seed weights are not trusted.
    Unit_seed_wt_mg_ahm = ifelse((Location_code == "ABBR" & Year == 1997),
                                 NA,
                                 Unit_seed_wt_mg_ahm),
    # Implausibly heavy seeds.
    Unit_seed_wt_mg_ahm = ifelse(Unit_seed_wt_mg_ahm > 800,
                                 NA,
                                 Unit_seed_wt_mg_ahm),
    Unit_seed_wt_mg_ahm = ifelse((Location_code == "WYPO" & Year == 1999),
                                 NA,
                                 Unit_seed_wt_mg_ahm),
    Unit_seed_wt_mg_ahm = ifelse((Location_code == "WYTO" & Year == 2003),
                                 NA,
                                 Unit_seed_wt_mg_ahm),
    # Race-specific ceiling. %in% is used instead of == so that records with a
    # missing Race are left alone; with ==, NA & TRUE evaluates to NA and
    # ifelse() would blank any weight above 550 whose Race is unknown.
    Unit_seed_wt_mg_ahm = ifelse((Race %in% "DurangoJalisco" & Unit_seed_wt_mg_ahm > 550),
                                 NA,
                                 Unit_seed_wt_mg_ahm)
  )

7cii. What weights are > 750mg?

Weights above 750mg for a single seed seem pretty problematic and are limited to 2 sites. Look at these for errors.

39 rows are >750 mg, 90 rows are > 700 mg.

ABBR in 1997 seems to have measured seed weight very weirdly. Drop this site*year, I think, or check it for how it measured unit seed weight. Yeah, it’s pretty clear that there was a change in, well, management between 1987 and 1997, and in 1997 ABBR didn’t know what it was doing for measuring seed weight. There’s not even a consistent fold-difference in weights… maaaaybe double. Drop seed weights for this year for sure, and be wary of this site for other variables…

# Records with raw seed weight above 760 mg, ordered by location.
arrange(filter(Phenotypes_ahm, Unit_seed_wt_mg > 760), Location_code)

# Raw seed weight at ABBR by market class, coloured by year.
# geom_jitter() has no `notch` parameter (it only produced an "unknown
# parameters" warning), so it is not passed here.
Phenotypes_ahm %>%
  filter(Location_code == "ABBR") %>%
  arrange(Market_class_ahm) %>%
  dplyr::select(Year, Unit_seed_wt_mg, Market_class_ahm, everything()) %>%
  ggplot(aes(x = Market_class_ahm, y = Unit_seed_wt_mg)) +
  geom_jitter(aes(color = as.factor(Year))) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Ignoring unknown parameters: notch

## Warning: Removed 44 rows containing missing values (geom_point).

The other sites with outliers I think are more likely to just be typos of some kind.

# Raw seed weight at MISA by market class, coloured by year (semi-transparent
# points). geom_jitter() has no `notch` parameter, so it is not passed here.
Phenotypes_ahm %>%
  filter(Location_code == "MISA") %>%
  arrange(Market_class_ahm) %>%
  dplyr::select(Year, Unit_seed_wt_mg, Market_class_ahm, everything()) %>%
  ggplot(aes(x = Market_class_ahm, y = Unit_seed_wt_mg)) +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.2) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Ignoring unknown parameters: notch

## Warning: Removed 60 rows containing missing values (geom_point).

7ciii. Cleanup by Race

As seed size is so bean-type specific, I thought I’d center the remainder of cleanup on Race & Market class.

WYPO is a bit of a mystery in 1999 - why are its Nueva Granada so bad? But many of the other outliers I could believe as representations of places where the environment is not good to grow beans (as there are also very low/poor yields there).

WYPO 1999 just seems to have measured its seed weights differently… they’re unusually low across the board for anything that isn’t ~200mg weight already. Drop this Year*Location also.

# Nueva Granada records with cleaned seed weight below 250 mg, key columns
# first, ordered by location.
arrange(
  dplyr::select(
    filter(Phenotypes_ahm, Race == "Nueva Granada" & Unit_seed_wt_mg_ahm < 250),
    Unit_seed_wt_mg_ahm, Year, Location_code, Race, Yield_ahm, CDBN_ID, Seq_ID, everything()
  ),
  Location_code
)

# Raw seed weight at WYPO, 1995-2001, by market class and coloured by year.
ggplot(
  Phenotypes_ahm %>%
    filter(Location_code == "WYPO" & Year %in% c(1995:2001)) %>%
    arrange(Market_class_ahm) %>%
    dplyr::select(Year, Unit_seed_wt_mg, Market_class_ahm, everything()),
  aes(x = Market_class_ahm, y = Unit_seed_wt_mg)
) +
  geom_point(aes(color = as.factor(Year))) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 2 rows containing missing values (geom_point).

A variety IG_GND seems to be a bit funny.

# DurangoJalisco records with cleaned seed weight above 550 mg, key columns
# first, ordered by location.
arrange(
  dplyr::select(
    filter(Phenotypes_ahm, Race == "DurangoJalisco" & Unit_seed_wt_mg_ahm > 550),
    Unit_seed_wt_mg_ahm, Year, Location_code, Race, Yield_ahm, CDBN_ID, Seq_ID, everything()
  ),
  Location_code
)

# Cleaned seed weight per variety within the Great Northern market class.
ggplot(
  Phenotypes_ahm %>%
    filter(Market_class_ahm == "Great Northern") %>%
    dplyr::select(Year, Unit_seed_wt_mg, Market_class_ahm, everything()),
  aes(x = CDBN_ID, y = Unit_seed_wt_mg_ahm)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 627 rows containing non-finite values (stat_boxplot).

Hmm, check out WYTO 2003…. Only high for Black and Small White beans in 2003, but now I don’t trust it. Remove.

# Mesoamerican records with cleaned seed weight above 330 mg, key columns
# first, ordered by location.
arrange(
  dplyr::select(
    filter(Phenotypes_ahm, Race == "Mesoamerican" & Unit_seed_wt_mg_ahm > 330),
    Unit_seed_wt_mg_ahm, Year, Location_code, Race, Yield_ahm, CDBN_ID, Seq_ID, everything()
  ),
  Location_code
)

# Raw seed weight at WYTO, 1999-2007, by market class and coloured by year.
ggplot(
  Phenotypes_ahm %>%
    filter(Location_code == "WYTO" & Year %in% c(1999:2007)) %>%
    arrange(Market_class_ahm) %>%
    dplyr::select(Year, Unit_seed_wt_mg, Market_class_ahm, everything()),
  aes(x = Market_class_ahm, y = Unit_seed_wt_mg)
) +
  geom_point(aes(color = as.factor(Year))) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Cleaned seed weight per variety within the Mesoamerican race.
ggplot(
  Phenotypes_ahm %>%
    filter(Race == "Mesoamerican") %>%
    dplyr::select(Year, Unit_seed_wt_mg, Market_class_ahm, everything()),
  aes(x = CDBN_ID, y = Unit_seed_wt_mg_ahm)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 1533 rows containing non-finite values (stat_boxplot).

# Cleaned seed weight by race, after the removals above.
ggplot(Phenotypes_ahm, aes(x = Race, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 5185 rows containing non-finite values (stat_boxplot).

# Cleaned seed weight by market class, after the removals above.
ggplot(Phenotypes_ahm, aes(x = Market_class_ahm, y = Unit_seed_wt_mg_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Warning: Removed 5185 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

7d. Clean growth habit

This isn’t going to be pretty. First, mutate the data so there are actually NA’s.

Looking at GH first. What do each of these scores mean? How do I simplify scores?

Two ways to define: CIAT classification: 1 = determinate bush; 2 = upright vine; 3 = prostrate vine; a=short/absent vine; b=long/present vine. Or, the first number can correspond to the CIAT classification; second number indicates internode length with 1 as shortest and 5 as longest. For my classification I’ll group 1 and 2 into absent vine and 3-5 to present vine.

Plant_type_eval may also be translatable into growth habit. I should make a column that’s “architecture” on CIAT’s scale and “vine” which is the a vs b or the second number, which I don’t always have.

CIAT_ahm: 1 (bush), 2 (upright vine), 3 (prostrate vine). Vine_ahm: absent (a, or 1-2) or present (b, or 3-5).

Translating some of the various plant type evaluations: I did this in Excel. Because, seriously. The key is called “Fixing GH 2018-02-12.xlsx” and can be provided upon request. Examples from the key: B: X; b-v, viny: 1 b; comp.b-v: 1 b; erect vine: 2 b.

What do the determinacy scores look like by Race? Pretty skewed, except for the Mesoamerican set.

# Count of metadata entries for each Race x Det_scr combination.
summarise(group_by(metadata, Race, Det_scr), count = n())

# Metadata entries with no determinacy score.
filter(metadata, is.na(Det_scr))

# Create the growth-habit working columns in one step: copies of the three
# source columns with empty strings turned into NA, plus the two empty
# classification columns that the next step fills in.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    GH_ahm = ifelse(Growth_habit == "", NA, Growth_habit),
    GHS_ahm = ifelse(Growth_habit_scr == "", NA, Growth_habit_scr),
    PT_ahm = ifelse(Plant_type_eval == "", NA, Plant_type_eval),
    CIAT_ahm = NA,
    Vine_ahm = NA
  )

# Translate the three growth-habit sources into two columns:
#   CIAT_ahm: 1, 2 or 3
#   Vine_ahm: "a" or "b"
# Sources are applied in the order GHS_ahm, PT_ahm, GH_ahm, so a later source
# overrides an earlier one when both are present. Values not listed in any set
# are left as NA.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # --- numeric growth-habit score (GHS_ahm) ---
    CIAT_ahm = ifelse(GHS_ahm %in% c(10,11,12),
                      1,
                      CIAT_ahm),
    CIAT_ahm = ifelse(GHS_ahm %in% c(20,21,22),
                      2,
                      CIAT_ahm),
    CIAT_ahm = ifelse(GHS_ahm %in% c(30,31,32),
                      3,
                      CIAT_ahm),
    # The fallback must be Vine_ahm itself. Falling back to CIAT_ahm wrote
    # 1/2/3 into the vine column and wiped the "a" labels set just before.
    Vine_ahm = ifelse(GHS_ahm %in% c(10,11,21,30,31),
                      "a",
                      Vine_ahm),
    Vine_ahm = ifelse(GHS_ahm %in% c(12,22,32),
                      "b",
                      Vine_ahm),
    # --- free-text plant type evaluation (PT_ahm) ---
    # "b-v,viny" (no space) is the spelling that appears in the data summary;
    # the spaced spelling "b-v, viny" is kept as well.
    CIAT_ahm = ifelse(PT_ahm %in% c("up bu", "up comp bu", "vig bu", "vig.bu", "wide bu", "Wk bu", "b-v, viny", "b-v,viny", "comp.b-v", "tall open b-v", "tall up b-v", "un up b-v", "vig b-v", "vig,b-v", "Wk,var b-v", "ex up bu", "up wide bu"),
                      1,
                      CIAT_ahm),
    CIAT_ahm = ifelse(PT_ahm %in% c("up s-v", "up s v", "up sv", "v erect sv", "erect vine", "up mod vine", "up vine"),
                      2,
                      CIAT_ahm),
    CIAT_ahm = ifelse(PT_ahm %in% c("vig,viny", "vine"),
                      3,
                      CIAT_ahm),
    Vine_ahm = ifelse(PT_ahm %in% c("up bu", "up comp bu", "vig bu", "vig.bu", "wide bu", "Wk bu", "up s-v", "up s v", "up sv", "v erect sv"),
                      "a",
                      Vine_ahm),
    Vine_ahm = ifelse(PT_ahm %in% c("b-v, viny", "b-v,viny", "comp.b-v", "tall open b-v", "tall up b-v", "un up b-v", "vig b-v", "vig,b-v", "Wk,var b-v", "erect vine", "up mod vine", "up vine", "V", "vig,viny", "vine", "vlg,viny", "Wk,viny"),
                      "b",
                      Vine_ahm),
    # --- CIAT-style growth habit code (GH_ahm) ---
    CIAT_ahm = ifelse(GH_ahm %in% c("1", "1A", "1B", "1C"),
                      1,
                      CIAT_ahm),
    CIAT_ahm = ifelse(GH_ahm %in% c("2", "2-3", "2,3", "2a", "2A", "2A-2B", "2A/B", "2AB", "2b", "2B"),
                      2,
                      CIAT_ahm),
    CIAT_ahm = ifelse(GH_ahm %in% c("3", "3A", "3a-3b", "3A-3B", "3b", "3B"),
                      3,
                      CIAT_ahm),
    Vine_ahm = ifelse(GH_ahm %in% c("2a", "2A", "3A", "2/3A", "2A-3A"),
                      "a",
                      Vine_ahm),
    Vine_ahm = ifelse(GH_ahm %in% c("2b", "2B", "3b", "3B", "2B/3"),
                      "b",
                      Vine_ahm)
  )



## Used the tallies below to work out a conservative key (i.e. values I could
## not interpret were left out).
Phenotypes_ahm$PT_ahm %>%
  as.factor() %>%
  summary()

##             B      b-v,viny comp b-v viny      comp.b-v      ex up bu 
##            13             2             1             2             1 
##            PV            SB           Seg            SV      tall b-v 
##             1             5             1             8             3 
##    tall ex bu tall open b-v   tall up b-v           TSB     un up b-v 
##             1             1             2             2             2 
##         up bu    up comp bu    up open by             V       vig b-v 
##             1             1             1            10             2 
##        vig bu       vig,b-v      vig,viny        vig.bu      vlg,viny 
##             1             1             2             1             1 
##            wk         Wk bu        Wk,var    Wk,var b-v       Wk,viny 
##             1             1             1             1             3 
##        Wk.var          NA's 
##             1         19341

# Tally of growth-habit score values.
Phenotypes_ahm$GHS_ahm %>%
  as.factor() %>%
  summary()

##    10    11    12    16    21    22    26    30    31    32  NA's 
##     8     1     4     2     1     1     2     1     6     6 19383

# Tally of growth-habit code values.
Phenotypes_ahm$GH_ahm %>%
  as.factor() %>%
  summary()

##     .     1   1-2   1 2    1A    1B    1C     2   2-3   2 3   2,3  2/3A 
##    33   654     1     2    35    28     2   553     4     4     7     1 
##    2a    2A 2A-2B 2A-3A  2A/B   2AB    2b    2B 2b-3a 2B-3A  2B/3 2B/3A 
##    13   138     1     1     1     1     6   233     1     3     2     3 
##     3    3A 3a-3b 3A-3B    3b    3B     4    4A     B    EV    PV  NA's 
##   639    75     3     5     3   110     2     1    10    13    14 16813

#  filter(!(is.na(Growth_habit_scr) & is.na(Growth_habit)))
#  dplyr::select(CDBN_ID, Seq_ID, Location_code, Year, GH_ahm, GHS_ahm, everything()) %>%

7e. Clean days to flowering

For days to flowering, I had previously added days to bloom 50% and removed the tails…

From the plot of CADV and CAOR, it looks like values of 33 or less are below the lower boundary of possible flowering times, so remove these.

# Build DTF_ahm (days to flowering): use DAYS_TO_FLOWER, fall back to
# DAYS_TO_BLOOM_50_PER where it is missing, then set values of 30 or less to NA.
# NOTE(review): the accompanying text talks about a cutoff of 33, but the code
# uses 30 - confirm which one is intended.
Phenotypes_ahm <- mutate(
  Phenotypes_ahm,
  DTF_ahm = coalesce(DAYS_TO_FLOWER, DAYS_TO_BLOOM_50_PER),
  DTF_ahm = ifelse(DTF_ahm <= 30, NA, DTF_ahm)
)
 
# Histogram of the cleaned days-to-flowering values, default binning.
ggplot(Phenotypes_ahm, mapping = aes(DTF_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 10331 rows containing non-finite values (stat_bin).

# Notched boxplots of days to flowering per CIAT growth-habit class, restricted
# to rows that have a CIAT class.
ggplot(
  data = Phenotypes_ahm %>%
    filter(!is.na(CIAT_ahm)) %>%
    dplyr::select(CDBN_ID, Seq_ID, Location_code, Year, CIAT_ahm, Vine_ahm,
                  GH_ahm, GHS_ahm, PT_ahm, everything()),
  mapping = aes(x = as.factor(CIAT_ahm), y = DTF_ahm)
) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 825 rows containing non-finite values (stat_boxplot).

# Days to flowering by market class at CADV and CAOR only; point shape marks
# the location and colour marks the year.
ggplot(
  data = filter(Phenotypes_ahm, Location_code %in% c("CADV", "CAOR")),
  mapping = aes(x = Market_class_ahm, y = DTF_ahm)
) +
  geom_point(mapping = aes(shape = Location_code, color = as.factor(Year)))

## Warning: Removed 331 rows containing missing values (geom_point).

This looks ok and looks like I don’t actually have to mutate any outliers to NA’s.

# Show every record for the entry "Gala".
filter(Phenotypes_ahm, CDBN_ID == "Gala")

# Show any rows whose days to flowering is still 30 or less.
filter(Phenotypes_ahm, DTF_ahm <= 30)

# Histogram of days to flowering in 5-day bins.
# A 50-day bin width lumps almost the whole distribution of a trait measured
# in days into one or two bars, so a narrower width is used here.
# NOTE(review): adjust the width if a different resolution is wanted.
Phenotypes_ahm %>%
  ggplot(aes(x = DTF_ahm)) +
  geom_histogram(binwidth = 5)

## Warning: Removed 10331 rows containing non-finite values (stat_bin).

# Single boxplot of all days-to-flowering values.
ggplot(Phenotypes_ahm, mapping = aes(x = 1, y = DTF_ahm)) +
  geom_boxplot()

## Warning: Removed 10331 rows containing non-finite values (stat_boxplot).

# Notched boxplots of days to flowering per climate bin.
ggplot(Phenotypes_ahm, mapping = aes(x = Climate_bin, y = DTF_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 10331 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

# Notched boxplots of days to flowering per state.
ggplot(Phenotypes_ahm, mapping = aes(x = State, y = DTF_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 10331 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

# Notched boxplots of days to flowering per location, with slanted x labels.
ggplot(Phenotypes_ahm, mapping = aes(x = Location_code, y = DTF_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))

## Warning: Removed 10331 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

# Notched boxplots of days to flowering per race.
ggplot(Phenotypes_ahm, mapping = aes(x = Race, y = DTF_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 10331 rows containing non-finite values (stat_boxplot).

# Notched boxplots of days to flowering per market class, with slanted x labels.
ggplot(Phenotypes_ahm, mapping = aes(x = Market_class_ahm, y = DTF_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))

## Warning: Removed 10331 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

7f. Clean plant height

Clean Plant Height by (with some discrimination) combining plant height and canopy height and removing outliers.

Use Canopy_height when possible, because Plant_height can sometimes mean plant length. Move outliers that are actually plant lengths to a new column, Plant_length_ahm.

With Jeff, we thought to remove AZBO 1992, IDNA 1991, IDNA 1990, IDTF 1988, MBMO 2001 from plant height, and keep the rest of the data.

# Number of rows per Seq_ID that have a Plant_height value.
Phenotypes_ahm %>%
  filter(!is.na(Plant_height)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n())

# Number of rows per Seq_ID that have a Canopy_height value.
Phenotypes_ahm %>%
  filter(!is.na(Canopy_height)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n())

I checked through the low outliers (~<25): ABBI and MOCO didn’t look like they had any statistical outliers. ABBR, maybe, but they only measured plant height for three years, so we can’t rule out two of the years being terrible ones to grow beans there…

# Build the cleaned plant-height column and split off likely plant-length data.
#   1. Plant_height_ahm takes Canopy_height and falls back to Plant_height.
#   2. Values of 10 or less are set to NA.
#   3. For five site-years (IDTF 1988, MBMO 2001, AZBO 1992, IDNA 1990,
#      IDNA 1991) the value is moved into Plant_length_ahm and removed from
#      Plant_height_ahm.
# The site-year list is written once (it used to be spelled out twice) and is
# matched with %in%, so the two columns cannot drift apart when the list is
# edited and a missing Location_code/Year can no longer blank a valid height.
# No inch-to-cm conversion (x 2.54 for MOCO 1997 and MIEN 1989) is applied
# here: Jeff fixed the values that were in inches for V1.7.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(Plant_height_ahm = coalesce(Canopy_height, Plant_height),
         Plant_height_ahm = ifelse(Plant_height_ahm <= 10,
                                   NA,
                                   Plant_height_ahm),
         # Temporary flag: TRUE for the site-years treated as plant length.
         is_plant_length = paste(Location_code, Year) %in%
           c("IDTF 1988", "MBMO 2001", "AZBO 1992", "IDNA 1990", "IDNA 1991"),
         Plant_length_ahm = ifelse(is_plant_length,
                                   Plant_height_ahm,
                                   NA),
         Plant_height_ahm = ifelse(is_plant_length,
                                   NA,
                                   Plant_height_ahm)
         ) %>%
  dplyr::select(-is_plant_length)
 
# Histogram of the cleaned plant heights, default binning.
ggplot(Phenotypes_ahm, mapping = aes(Plant_height_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 16519 rows containing non-finite values (stat_bin).

# Histogram of the values moved to Plant_length_ahm, default binning.
ggplot(Phenotypes_ahm, mapping = aes(Plant_length_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 19235 rows containing non-finite values (stat_bin).

# WAOT, 1991-2001: plant height by market class, coloured by year, with the
# y axis limited to 1-75.
Phenotypes_ahm %>%
  filter(Location_code == "WAOT", Year %in% 1991:2001) %>%
  arrange(Market_class_ahm) %>%
  dplyr::select(Year, Plant_height_ahm, Market_class_ahm, everything()) %>%
  ggplot(mapping = aes(x = Market_class_ahm, y = Plant_height_ahm)) +
  geom_point(mapping = aes(color = as.factor(Year))) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  ylim(1, 75)

## Warning: Removed 142 rows containing missing values (geom_point).

# ABBR rows that have a plant height, with the height, yield, location and
# year columns moved to the front for inspection.
Phenotypes_ahm %>%
  filter(Location_code == "ABBR", !is.na(Plant_height_ahm)) %>%
  arrange(Location_code) %>%
  dplyr::select(Plant_height_ahm, Yield_kg_ha, Location_code, Year,
                everything())

# Histogram of the cleaned plant heights, default binning.
ggplot(Phenotypes_ahm, mapping = aes(x = Plant_height_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 16519 rows containing non-finite values (stat_bin).

# Single boxplot of all cleaned plant heights.
ggplot(Phenotypes_ahm, mapping = aes(x = 1, y = Plant_height_ahm)) +
  geom_boxplot()

## Warning: Removed 16519 rows containing non-finite values (stat_boxplot).

# Notched boxplots of plant height per climate bin.
ggplot(Phenotypes_ahm, mapping = aes(x = Climate_bin, y = Plant_height_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16519 rows containing non-finite values (stat_boxplot).

# Notched boxplots of plant height per state.
ggplot(Phenotypes_ahm, mapping = aes(x = State, y = Plant_height_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16519 rows containing non-finite values (stat_boxplot).

# Notched boxplots of plant height per location, with slanted x labels.
ggplot(Phenotypes_ahm, mapping = aes(x = Location_code, y = Plant_height_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))

## Warning: Removed 16519 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

## notch went outside hinges. Try setting notch=FALSE.

# Notched boxplots of plant height per race.
ggplot(Phenotypes_ahm, mapping = aes(x = Race, y = Plant_height_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16519 rows containing non-finite values (stat_boxplot).

# Notched boxplots of plant height per market class, with slanted x labels.
ggplot(Phenotypes_ahm,
       mapping = aes(x = Market_class_ahm, y = Plant_height_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))

## Warning: Removed 16519 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

AB, ND, WA, MB may measure plant length sometimes for this value.

7g. Clean days to maturity

Coalesce DAYS_TO_HARVEST and DAYS_TO_MATURITY

# DAYS_TO_HARVEST by year, one panel per location, for rows that have a value.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(DAYS_TO_HARVEST)),
  mapping = aes(x = as.factor(Year), y = DAYS_TO_HARVEST)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# DAYS_TO_MATURITY by year at MTSI, NES2 and NESB, for rows that have a value.
ggplot(
  data = filter(Phenotypes_ahm,
                !is.na(DAYS_TO_MATURITY),
                Location_code %in% c("MTSI", "NES2", "NESB")),
  mapping = aes(x = as.factor(Year), y = DAYS_TO_MATURITY)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# Preview the combined maturity value (DAYS_TO_MATURITY, falling back to
# DAYS_TO_HARVEST) at MTSI, NES2 and NESB before committing it.
Phenotypes_ahm %>%
  mutate(DTM_2 = coalesce(DAYS_TO_MATURITY, DAYS_TO_HARVEST)) %>%
  filter(!is.na(DTM_2), Location_code %in% c("MTSI", "NES2", "NESB")) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = DTM_2)) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# Store the combined value as DTM_ahm (days to maturity).
Phenotypes_ahm <- mutate(
  Phenotypes_ahm,
  DTM_ahm = coalesce(DAYS_TO_MATURITY, DAYS_TO_HARVEST)
)

# Histogram of days to maturity, default binning.
ggplot(Phenotypes_ahm, mapping = aes(x = DTM_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 7370 rows containing non-finite values (stat_bin).

# Single boxplot of all days-to-maturity values.
ggplot(Phenotypes_ahm, mapping = aes(x = 1, y = DTM_ahm)) +
  geom_boxplot()

## Warning: Removed 7370 rows containing non-finite values (stat_boxplot).

# Notched boxplots of days to maturity per climate bin.
ggplot(Phenotypes_ahm, mapping = aes(x = Climate_bin, y = DTM_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 7370 rows containing non-finite values (stat_boxplot).

# Notched boxplots of days to maturity per state.
ggplot(Phenotypes_ahm, mapping = aes(x = State, y = DTM_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 7370 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

# Notched boxplots of days to maturity per location, with slanted x labels.
ggplot(Phenotypes_ahm, mapping = aes(x = Location_code, y = DTM_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))

## Warning: Removed 7370 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

# Notched boxplots of days to maturity per race.
ggplot(Phenotypes_ahm, mapping = aes(x = Race, y = DTM_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 7370 rows containing non-finite values (stat_boxplot).

# Notched boxplots of days to maturity per market class, with slanted x labels.
ggplot(Phenotypes_ahm, mapping = aes(x = Market_class_ahm, y = DTM_ahm)) +
  geom_boxplot(notch = TRUE) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45))

## Warning: Removed 7370 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

ABBR in 1987 seems to have been a very late season… Most of the right tail seems to be clustered in a few site*years that could have had weird weather. Without looking at the weather, it seems reasonable to include all of these values, so keep them for now.

# Rows with more than 130 days to maturity, sorted by location, with the most
# relevant columns moved to the front.
Phenotypes_ahm %>%
  filter(DTM_ahm > 130) %>%
  arrange(Location_code) %>%
  dplyr::select(DTM_ahm, DTF_ahm, Year, Location_code, Race, Yield_ahm,
                CDBN_ID, Seq_ID, everything())

# ABBR, 1981-2001: days to maturity by market class, coloured by year, with the
# y axis limited to 60-160.
Phenotypes_ahm %>%
  filter(Location_code == "ABBR", Year %in% 1981:2001) %>%
  arrange(Market_class_ahm) %>%
  dplyr::select(Year, DTM_ahm, Market_class_ahm, everything()) %>%
  ggplot(mapping = aes(x = Market_class_ahm, y = DTM_ahm)) +
  geom_point(mapping = aes(color = as.factor(Year))) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  ylim(60, 160)

## Warning: Removed 18 rows containing missing values (geom_point).

# Days to flowering by year, one panel per location, for Gene_pool "Andean".
ggplot(
  data = filter(Phenotypes_ahm, Gene_pool == "Andean"),
  mapping = aes(x = as.factor(Year), y = DTF_ahm)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

## Warning: Removed 1823 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1823 rows containing missing values (geom_point).

# Save the last displayed plot as a 400 dpi BMP sized in inches.
ggsave("DTF_Loc_code_and_Year_Andean.bmp", units = "in", width = 3 * 16.18, height = 30, dpi = 400)

## Warning: Removed 1823 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1823 rows containing missing values (geom_point).

# Days to flowering by year, one panel per location, for Gene_pool "MA".
ggplot(
  data = filter(Phenotypes_ahm, Gene_pool == "MA"),
  mapping = aes(x = as.factor(Year), y = DTF_ahm)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

## Warning: Removed 7990 rows containing non-finite values (stat_boxplot).

## Warning: Removed 7990 rows containing missing values (geom_point).

# Save the last displayed plot as a 400 dpi BMP sized in inches.
ggsave("DTF_Loc_code_and_Year_MA.bmp", units = "in", width = 3 * 16.18, height = 30, dpi = 400)

## Warning: Removed 7990 rows containing non-finite values (stat_boxplot).

## Warning: Removed 7990 rows containing missing values (geom_point).

# Days to maturity by year, one panel per location, for Gene_pool "Andean".
ggplot(
  data = filter(Phenotypes_ahm, Gene_pool == "Andean"),
  mapping = aes(x = as.factor(Year), y = DTM_ahm)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

## Warning: Removed 1437 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1437 rows containing missing values (geom_point).

# Save the last displayed plot as a 400 dpi BMP sized in inches.
ggsave("DTM_Loc_code_and_Year_Andean.bmp", units = "in", width = 3 * 16.18, height = 30, dpi = 400)

## Warning: Removed 1437 rows containing non-finite values (stat_boxplot).

## Warning: Removed 1437 rows containing missing values (geom_point).

# Days to maturity by year, one panel per location, for Gene_pool "MA".
ggplot(
  data = filter(Phenotypes_ahm, Gene_pool == "MA"),
  mapping = aes(x = as.factor(Year), y = DTM_ahm)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

## Warning: Removed 5651 rows containing non-finite values (stat_boxplot).

## Warning: Removed 5651 rows containing missing values (geom_point).

# Save the last displayed plot as a 400 dpi BMP sized in inches.
ggsave("DTM_Loc_code_and_Year_MA.bmp", units = "in", width = 3 * 16.18, height = 30, dpi = 400)

## Warning: Removed 5651 rows containing non-finite values (stat_boxplot).

## Warning: Removed 5651 rows containing missing values (geom_point).

7h. Calculate duration of grainfill

This is simply days to maturity minus days to flowering, where both of those values are available.

Join the DGF with the seedfill_duration quantity. I do this because it’s the same range and looks identical in many cases where I calculated DGF and there is a seedfill_duration. However, there are some additional seedfill_duration year*sites that are added here.

# DG_ahm: days to maturity minus days to flowering (NA when either is missing).
# SF_ahm: DG_ahm where it exists, otherwise the recorded Seedfill_duration.
Phenotypes_ahm <- mutate(
  Phenotypes_ahm,
  DG_ahm = DTM_ahm - DTF_ahm,
  SF_ahm = coalesce(DG_ahm, Seedfill_duration)
)

# Histogram of the calculated grain-fill duration, default binning.
ggplot(Phenotypes_ahm, mapping = aes(x = DG_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 12000 rows containing non-finite values (stat_bin).

# Recorded Seedfill_duration by year, one panel per location.
ggplot(Phenotypes_ahm,
       mapping = aes(x = as.factor(Year), y = Seedfill_duration)) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

## Warning: Removed 16786 rows containing non-finite values (stat_boxplot).

## Warning: Removed 16786 rows containing missing values (geom_point).

# Calculated grain-fill duration (DG_ahm) by year, one panel per location.
ggplot(Phenotypes_ahm, mapping = aes(x = as.factor(Year), y = DG_ahm)) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

## Warning: Removed 12000 rows containing non-finite values (stat_boxplot).

## Warning: Removed 12000 rows containing missing values (geom_point).

# Combined seed-fill value (SF_ahm) by year, one panel per location, for rows
# that have a value.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(SF_ahm)),
  mapping = aes(x = as.factor(Year), y = SF_ahm)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# Number of non-missing values in every column.
colSums(!is.na(Phenotypes_ahm))

##                  Genotype                   CDBN_ID 
##                     19415                     19415 
##                    Seq_ID             Location_code 
##                     14780                     19415 
##                      Year                 Gene_pool 
##                     19415                     18646 
##                      Race          Market_class_ahm 
##                     18646                     18666 
##                     State               Climate_bin 
##                     19415                     19415 
##                  Latitude                 Longitude 
##                     19415                     19415 
##               Yield_kg_ha            DAYS_TO_FLOWER 
##                     18242                      8916 
##          DAYS_TO_MATURITY           Unit_seed_wt_mg 
##                     12005                     14358 
##               Test_weight        Days_to_full_bloom 
##                       350                         0 
##      DAYS_TO_BLOOM_50_PER         Seedfill_duration 
##                       170                      2629 
##           DAYS_TO_HARVEST          Harvest_maturity 
##                       119                       339 
##         Ripening_date_scr              Maturity_scr 
##                        36                        74 
##              Plant_height             Canopy_height 
##                      2627                       481 
##              Growth_habit          Growth_habit_scr 
##                      2602                        32 
##           Plant_type_eval      Internode_length_scr 
##                        74                       530 
##             Pod_clearance               Plant_width 
##                       250                        78 
##         Pod_position_eval                Pod_height 
##                        34                        64 
##        Branch_length_eval               Lodging_scr 
##                        62                       241 
##               Lodging_1_5               Lodging_0_9 
##                      2203                        74 
##               Lodging_1_9               Lodging_per 
##                       975                        35 
##                 Yield_day        Yield_Day_Seedfill 
##                      4172                      3460 
##                   Biomass               Biomass_day 
##                      2899                      1459 
##             Harvest_index             Emergence_scr 
##                      3193                        60 
##           Early_vigor_scr                 Stand_per 
##                       763                        64 
##                Stand_code          Seed_quality_scr 
##                        20                       147 
##       Seed_appearance_scr         Seed_appear_desir 
##                       314                         0 
##     Seed_appear_desir_scr          Desirability_scr 
##                       559                       218 
##    Field_desirability_scr        Harvestability_scr 
##                        72                       232 
##           Adaptation_eval              Desirability 
##                        20                         0 
##               Pod_set_scr        MN_deficiency_eval 
##                        39                        11 
##             Zinc_dwarfing            Zinc_yellowing 
##                        36                        36 
##           Zinc_defic_eval            Zinc_defic_scr 
##                        24                        38 
##            Air_pollut_scr                 Ozone_scr 
##                       387                        24 
##              Bronzing_scr               Disease_scr 
##                        83                        23 
##          Anthracnose_eval                 BCMV_eval 
##                         0                       124 
##                   CBB_scr                   CBB_per 
##                       556                       450 
##                  CBB_eval     CBB_Foliage_per_innoc 
##                        36                        40 
## CBB_Foliage_per_non_innoc           CBB_pustule_scr 
##                        40                        20 
##            Blight_Pod_per             Curly_Top_per 
##                        40                        88 
##       Curly_top_virus_scr            Fusarium_emerg 
##                        96                        32 
##     Fusarium_seedling_vig       Fusarium_seed_yield 
##                        32                        32 
##          Fusarium_wilt_GH           Halo_blight_scr 
##                        33                       134 
##                Halo_B_per        Powdery_mildew_scr 
##                        70                        37 
##                Rhizoc_scr        Root_rot_emerg_scr 
##                        30                        36 
##    Root_rot_early_vig_scr       Root_rot_seed_yield 
##                        36                        36 
##              Root_rot_scr             Root_rot_eval 
##                        58                        43 
##                  Rust_scr                Rust_scr_c 
##                       316                        80 
##             Rust_CIAT_scr                  Rust_per 
##                        23                       369 
##                 Rust_eval          Rust_Foliage_per 
##                       489                        40 
##         Rust_Pustule_Type            White_mold_scr 
##                        32                       128 
##            White_mold_per           White_mold_eval 
##                       353                         6 
##       White_mold_scr_val2             White_mold_GH 
##                         0                        33 
##           Cooking_quality        Seed_color_uniform 
##                        34                        36 
##           Seed_wt_cul_dry       Seed_wt_cul_imbibed 
##                        36                        36 
##        Seed_dry_imb_ratio        Seed_cooked_appear 
##                        36                       100 
##           Seed_splits_cul           Seed_luminosity 
##                        36                        72 
##               Seed_chroma                  Seed_hue 
##                        72                        72 
##         Halo_blight_scr_1         Halo_blight_scr_2 
##                        45                        46 
##         Halo_blight_scr_3            Air_pollut_per 
##                        46                        46 
##                 Merit_scr          Fe_chlorosis_scr 
##                        69                        46 
##              Seed_L_color         Stand_uniform_scr 
##                        28                        34 
##           Row_closure_scr     Phenology_stage_51DAP 
##                        34                        34 
##        Growth_density_scr         Tunnel_effect_scr 
##                        34                        34 
##          Architecture_scr            Vine_habit_scr 
##                        34                        34 
##          Pod_maturity_scr             Plant_density 
##                        34                        36 
##            Pod_height_scr                Podset_scr 
##                        34                        34 
##             Flat_vine_scr                 Bush_Type 
##                        34                        34 
##             Pods_Peduncle             Seeds_per_Pod 
##                        34                        34 
##              Flower_color        General_appearance 
##                        41                        41 
##           Air_pollut_rate          Spray_injury_scr 
##                        41                        40 
##     Plant_vigor_bloom_scr            Plant_type_scr 
##                        41                        41 
##               Quality_scr                Blight_scr 
##                        41                        44 
##        Zinc_reaction_eval             Plant_width_2 
##                        13                        10 
##    Plant_architecture_scr              Fusarium_scr 
##                         5                        40 
##      White_mold_stems_per       Halo_blight_lvs_per 
##                         0                        32 
##      Halo_blight_pods_per               Rust_eval_1 
##                        32                        40 
##              Rust_eval_2_               Rust_eval_3 
##                         0                         0 
##                Plant_type                Virus_eval 
##                        64                        84 
##            Curly_top_eval          Drydown_duration 
##                        73                        28 
##          Frost_damage_scr       CBB_Innoculated_per 
##                        28                        36 
##                       GH2        Leaf_retention_scr 
##                         0                        26 
##               Rust_eval_2       White_mold_porosity 
##                        40                        40 
##                  Num_miss              Num_numerics 
##                     18530                     19415 
##                 Pref_name                Prev_names 
##                      1869                      4643 
##                 Orig_name                   Det_scr 
##                      1869                     14801 
##                 Yield_ahm       Unit_seed_wt_mg_ahm 
##                     18227                     14230 
##                    GH_ahm                   GHS_ahm 
##                      2602                        32 
##                    PT_ahm                  CIAT_ahm 
##                        74                      2558 
##                  Vine_ahm                   DTF_ahm 
##                       642                      9084 
##          Plant_height_ahm          Plant_length_ahm 
##                      2896                       180 
##                   DTM_ahm                    DG_ahm 
##                     12045                      7415 
##                    SF_ahm 
##                      7544

7i. Clean emergence/early vigor?

# Histogram of Early_vigor_scr, default binning.
ggplot(Phenotypes_ahm, mapping = aes(x = Early_vigor_scr)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 18652 rows containing non-finite values (stat_bin).

# Histogram of Emergence_scr, default binning.
ggplot(Phenotypes_ahm, mapping = aes(x = Emergence_scr)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 19355 rows containing non-finite values (stat_bin).

# Early_vigor_scr by year, one panel per location, for rows that have a value.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Early_vigor_scr)),
  mapping = aes(x = as.factor(Year), y = Early_vigor_scr)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# Emergence_scr by year, one panel per location, for rows that have a value.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Emergence_scr)),
  mapping = aes(x = as.factor(Year), y = Emergence_scr)
) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# Preview the combined score (Emergence_scr, falling back to Early_vigor_scr)
# by year and location before committing it.
Phenotypes_ahm %>%
  mutate(EVG_ahm = coalesce(Emergence_scr, Early_vigor_scr)) %>%
  filter(!is.na(EVG_ahm)) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = EVG_ahm)) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.4) +
  theme(axis.text.x = element_text(hjust = 1, vjust = 1, angle = 45)) +
  facet_wrap(~Location_code)

# Store the combined score as EVG_ahm.
Phenotypes_ahm <- mutate(
  Phenotypes_ahm,
  EVG_ahm = coalesce(Emergence_scr, Early_vigor_scr)
)

# Show the rows that ended up with an EVG_ahm value.
filter(Phenotypes_ahm, !is.na(EVG_ahm))

7j. Clean Seed appearance/desirability

Conclusion: Don’t combine desirability, combine the SAD/SAS/SADS columns.

1. Carefully check each year/site combo as to whether they're scoring out of 1-5 or 1-9, and separate into SAD_1_5 SAD_1_7 and SAD_1_9 
2. then combine as SAD_same_scale. 
* Got each year*site with data in the pdf's organized below. Don't believe anything else.

1999: MISG 1-9 // MAMO 1-5. 2000: CACH IDPA MAMO WAOT 1-9. 2001: MAMO 1-5 // CADV NDER 1-7 // WAOT 1-9. 2002: Can’t find the scale for this, but there is only one column. Should probably drop if I can’t find the scale. CADV has to be on a 1-9 scale; MBMO could be on a 1-5 scale like in 2001, and so could NDER. WAOT could be on 1-7 or 1-9. Keep CADV and MBMO; drop NDER and WAOT. 2003: WAOT 1-5. 2004: CADV IDPA MISA 1-9 // WAOT 1-5. 2005: CADV MISA 1-9 // WAOT 1-5. 2006: CADV MISA WAOT 1-9. 2007: the pdf is missing the data for some reason… 2008: WAOT 1-9. 2009: WAOT 1-5. 2010: not scored. 2011: not scored. 2012: not scored. 2013: WA 1-5. 2014: WA 1-5.

Seed_appearance_scr -> SAS_ahm. Seed_appear_desir -> SAD_ahm: SAD may have added extra data for this one; most scores (21928) appear to be blank. OK, as of V1.9.1 this is all blank, and it looks like the data that was in this column is now in Desirability_scr, at a guess. So try re-assigning SAD1 from Desirability_scr below. Seed_appear_desir_scr -> SADS_ahm. Seed_quality_scr -> SQS_ahm.

# Copy the four raw seed appearance/desirability score columns into working
# *_ahm columns (appended in this order).
Phenotypes_ahm <- mutate(
  Phenotypes_ahm,
  SAS_ahm = Seed_appearance_scr,
  SAD1_ahm = Desirability_scr,
  SADS_ahm = Seed_appear_desir_scr,
  SQS_ahm = Seed_quality_scr
)

# Tabulate what is left in the Seed_appear_desir column.
Phenotypes_ahm$Seed_appear_desir %>%
  as.factor() %>%
  summary()

##  NA's 
## 19415

# Combine the four score columns into one numeric SAD_ahm, taking the first
# non-missing value in the order SAD1_ahm, SAS_ahm, SADS_ahm, SQS_ahm. The four
# source columns are also moved to the front of the table.
# The former `SAD_ahm = ifelse(SAD1_ahm == "", NA, SAD1_ahm)` step was removed:
# its result was overwritten by the coalesce() without ever being read, and
# as.numeric("") already yields NA.
# NOTE(review): as.numeric() assumes the score columns are numeric or
# character; a factor column would be turned into level codes - confirm.
Phenotypes_ahm <- Phenotypes_ahm %>%
  dplyr::select(SAS_ahm, SAD1_ahm, SADS_ahm, SQS_ahm, everything()) %>%
  mutate(SAD_ahm = coalesce(as.numeric(SAD1_ahm),
                            as.numeric(SAS_ahm),
                            as.numeric(SADS_ahm),
                            as.numeric(SQS_ahm)))

# Put the combined seed appearance/desirability score (SAD_ahm) on a common
# three-level scale.
#   Step 1: copy SAD_ahm into the column for the scale used at each
#           location/year (SAD_1to5, SAD_1to7 or SAD_1to9); otherwise NA.
#   Step 2: collapse those into SAD_1to3; the first matching rule wins:
#             1 = 1-5 score < 2.5, 1-7 score > 4.5, or 1-9 score < 3.5
#             3 = 1-5 score > 3.5, 1-7 score < 3.5, or 1-9 score > 6.5
#             2 = score within the remaining range (bounds inclusive)
#           Rows matching no rule stay NA. The 1-7 scale is binned in the
#           opposite direction to the other two.
# NOTE(review): the location/year lists do not fully match the written notes
# for this section (the notes say to drop NDER and WAOT for 2002, list
# CADV/NDER 2001 as 1-7 and MISG 1999 as 1-9; here NDER is 1-5 in every year,
# CADV is 1-9 in every year, WAOT 2002 is 1-9, MIEN is the only 1-7 site and
# MISG is not listed) - confirm which is intended.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    SAD_ahm = as.numeric(SAD_ahm),
    SAD_1to5 = ifelse(
      (Location_code %in% c("ABVA", "NDER", "ONEX")) |
        (Location_code == "MBMO" & Year %in% c(1999, 2001)) |
        (Location_code == "WAOT" & Year %in% c(2003, 2004, 2005, 2009, 2013, 2014)),
      SAD_ahm,
      NA
    ),
    SAD_1to7 = ifelse(Location_code == "MIEN", SAD_ahm, NA),
    SAD_1to9 = ifelse(
      (Location_code %in% c("CACH", "CADV", "IDKI", "IDPA", "MISA")) |
        (Location_code == "MBMO" & Year == 2000) |
        (Location_code == "WAOT" & Year %in% c(2000, 2001, 2002, 2006, 2008)),
      SAD_ahm,
      NA
    ),
    # NA conditions count as "no match", so each row is decided by whichever
    # of the three scale columns actually holds a value.
    SAD_1to3 = case_when(
      SAD_1to5 < 2.5 | SAD_1to7 > 4.5 | SAD_1to9 < 3.5 ~ 1,
      SAD_1to5 > 3.5 | SAD_1to7 < 3.5 | SAD_1to9 > 6.5 ~ 3,
      between(SAD_1to5, 2.5, 3.5) |
        between(SAD_1to7, 3.5, 4.5) |
        between(SAD_1to9, 3.5, 6.5) ~ 2,
      TRUE ~ NA_real_
    )
  )

# Rows that received a harmonized 1-3 seed appearance/desirability score.
filter(Phenotypes_ahm, !is.na(SAD_1to3))

# The plots below all have the same layout: one panel per Location_code, one
# box per Year, with the individual scores jittered horizontally on top.

# Harmonized 1-3 score.
ggplot(filter(Phenotypes_ahm, !is.na(SAD_1to3)),
       aes(x = as.factor(Year), y = SAD_1to3)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.6, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Raw Desirability_scr.
ggplot(filter(Phenotypes_ahm, !is.na(Desirability_scr)),
       aes(x = as.factor(Year), y = Desirability_scr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.6, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Raw Seed_appearance_scr.
ggplot(filter(Phenotypes_ahm, !is.na(Seed_appearance_scr)),
       aes(x = as.factor(Year), y = Seed_appearance_scr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.6, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Raw Seed_appear_desir_scr.
ggplot(filter(Phenotypes_ahm, !is.na(Seed_appear_desir_scr)),
       aes(x = as.factor(Year), y = Seed_appear_desir_scr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.6, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Working copy of Seed_quality_scr.
ggplot(filter(Phenotypes_ahm, !is.na(SQS_ahm)),
       aes(x = as.factor(Year), y = SQS_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.4, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Working copy of Desirability_scr, skipping missing and empty-string entries.
ggplot(filter(Phenotypes_ahm, !is.na(SAD1_ahm), SAD1_ahm != ""),
       aes(x = as.factor(Year), y = SAD1_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.4, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# ggsave(filename = "SAD-Loc-code-and-Year_2018-03-01.bmp", width = 16.18*3, height = 30, units = "in", dpi = 400)

# Working copy of Seed_appear_desir_scr.
ggplot(filter(Phenotypes_ahm, !is.na(SADS_ahm)),
       aes(x = as.factor(Year), y = SADS_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.4, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Working copy of Seed_appearance_scr.
ggplot(filter(Phenotypes_ahm, !is.na(SAS_ahm)),
       aes(x = as.factor(Year), y = SAS_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.6, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

7k. Clean biomass

The only ones that appear as outliers here are AZKS - much higher than the others - in 2003/2004. But in 2003, those are the actual values for AZKS. So keep them I guess. Still drop values < 1000 or > 10000 because these values are biologically implausible.

# Raw Biomass by year, one panel per location, to spot outlying site-years.
ggplot(filter(Phenotypes_ahm, !is.na(Biomass)),
       aes(x = as.factor(Year), y = Biomass)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# The same view restricted to AZKS (a single location, so no faceting).
ggplot(filter(Phenotypes_ahm, !is.na(Biomass), Location_code == "AZKS"),
       aes(x = as.factor(Year), y = Biomass)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.2, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Working biomass column: every AZKS row is set to NA; all other rows keep
# their Biomass value unchanged (missing values stay missing).
# NOTE(review): the section text above says to keep AZKS and to drop values
# < 1000 or > 10000, but this code does neither -- confirm which is intended.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(BMS_ahm = ifelse(Location_code != "AZKS" & !is.na(Biomass),
                          Biomass,
                          NA))

# Overall distribution of the cleaned biomass values.
ggplot(Phenotypes_ahm, aes(x = BMS_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 16569 rows containing non-finite values (stat_bin).

# Single boxplot of all cleaned biomass values.
ggplot(Phenotypes_ahm, aes(x = 1, y = BMS_ahm)) +
  geom_boxplot()

## Warning: Removed 16569 rows containing non-finite values (stat_boxplot).

# Cleaned biomass by climate bin.
ggplot(Phenotypes_ahm, aes(x = Climate_bin, y = BMS_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16569 rows containing non-finite values (stat_boxplot).

# Cleaned biomass by state.
ggplot(Phenotypes_ahm, aes(x = State, y = BMS_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16569 rows containing non-finite values (stat_boxplot).

# Cleaned biomass by location (axis labels rotated to fit).
ggplot(Phenotypes_ahm, aes(x = Location_code, y = BMS_ahm)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16569 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

## notch went outside hinges. Try setting notch=FALSE.

# Cleaned biomass by race.
ggplot(Phenotypes_ahm, aes(x = Race, y = BMS_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16569 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

# Cleaned biomass by market class (axis labels rotated to fit).
ggplot(Phenotypes_ahm, aes(x = Market_class_ahm, y = BMS_ahm)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16569 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.
## notch went outside hinges. Try setting notch=FALSE.

7l. Clean Harvest index

HIN_ahm

# Working harvest index column: every AZKS row is set to NA; all other rows
# keep their Harvest_index value unchanged (missing values stay missing).
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(HIN_ahm = ifelse(Location_code != "AZKS" & !is.na(Harvest_index),
                          Harvest_index,
                          NA))

# Overall distribution of the cleaned harvest index values.
ggplot(Phenotypes_ahm, aes(x = HIN_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 16275 rows containing non-finite values (stat_bin).

# Single boxplot of all cleaned harvest index values.
ggplot(Phenotypes_ahm, aes(x = 1, y = HIN_ahm)) +
  geom_boxplot()

## Warning: Removed 16275 rows containing non-finite values (stat_boxplot).

# Cleaned harvest index by climate bin.
ggplot(Phenotypes_ahm, aes(x = Climate_bin, y = HIN_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16275 rows containing non-finite values (stat_boxplot).

# Cleaned harvest index by state.
ggplot(Phenotypes_ahm, aes(x = State, y = HIN_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16275 rows containing non-finite values (stat_boxplot).

# Cleaned harvest index by location (axis labels rotated to fit).
ggplot(Phenotypes_ahm, aes(x = Location_code, y = HIN_ahm)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16275 rows containing non-finite values (stat_boxplot).

# Cleaned harvest index by race.
ggplot(Phenotypes_ahm, aes(x = Race, y = HIN_ahm)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16275 rows containing non-finite values (stat_boxplot).

# Cleaned harvest index by market class (axis labels rotated to fit).
ggplot(Phenotypes_ahm, aes(x = Market_class_ahm, y = HIN_ahm)) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  geom_boxplot(notch = TRUE)

## Warning: Removed 16275 rows containing non-finite values (stat_boxplot).

## notch went outside hinges. Try setting notch=FALSE.

## notch went outside hinges. Try setting notch=FALSE.

# Cleaned harvest index by year, one panel per location.
ggplot(filter(Phenotypes_ahm, !is.na(HIN_ahm)),
       aes(x = as.factor(Year), y = HIN_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

7m. Clean lodging

There are four variables in the dataset (as of 1.9.1, there are 5). Possible variables to coalesce:

Lodging_1_5 Lodging_1_9 Lodging_0_9 Lodging_per Lodging_scr

# One plot per raw lodging column: a box per Year, one panel per
# Location_code, individual scores jittered on top. Nothing is assigned here.

# Lodging_1_5, with values above 5.5 divided by 10 for display. The filter has
# already removed missing values, so the comparison needs no NA guard.
Phenotypes_ahm %>%
  filter(!is.na(Lodging_1_5)) %>%
  mutate(LDG_1_5 = ifelse(Lodging_1_5 > 5.5,
                          Lodging_1_5/10,
                          Lodging_1_5)) %>%
  ggplot(aes(x = as.factor(Year), y = LDG_1_5)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Lodging_1_9 as recorded.
ggplot(filter(Phenotypes_ahm, !is.na(Lodging_1_9)),
       aes(x = as.factor(Year), y = Lodging_1_9)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Lodging_0_9, with values above 9.5 divided by 10 for display.
Phenotypes_ahm %>%
  filter(!is.na(Lodging_0_9)) %>%
  mutate(LDG_0_9 = ifelse(Lodging_0_9 > 9.5,
                          Lodging_0_9/10,
                          Lodging_0_9)) %>%
  ggplot(aes(x = as.factor(Year), y = LDG_0_9)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Lodging_per, rescaled as Lodging_per / 10 + 1 for display.
Phenotypes_ahm %>%
  filter(!is.na(Lodging_per)) %>%
  mutate(LDG_pr = 1 + Lodging_per/10) %>%
  ggplot(aes(x = as.factor(Year), y = LDG_pr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Lodging_scr as recorded.
ggplot(filter(Phenotypes_ahm, !is.na(Lodging_scr)),
       aes(x = as.factor(Year), y = Lodging_scr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

Lodging_scr: MOCO is 1-5; ABBI is 1-5; IDKI is 1-5; IDPA is 1-5; NESB is 1-5; WARO is 1-9 NDFA is 0-9

According to my emails with CDBN cooperators, MISA is supposedly always on the 1-5 scale. I bet IDPA is also because there are many years where it’s on the 1-5 scale and only 1 where it’s supposedly on this 0-9 scale. So only NDFA is actually rightfully in the Lodging_0_9 group.

# Seed the cleaned lodging score from Lodging_1_5. Values above 5.5 cannot be
# valid on a 1-5 scale and are divided by 10; everything else, including
# missing values, is copied as is.
Phenotypes_ahm <- mutate(
  Phenotypes_ahm,
  LDG_ahm = ifelse(Lodging_1_5 > 5.5 & !is.na(Lodging_1_5),
                   Lodging_1_5 / 10,
                   Lodging_1_5)
)

# Preview of the full lodging harmonization: fill LDG_ahm from the remaining
# lodging columns and plot the result by Year and Location_code. The pipeline
# is not assigned, so Phenotypes_ahm itself is left unchanged here.
#
# Every LDG_ahm step only touches rows where LDG_ahm is still NA, so the order
# of the steps is the priority order of the source columns.
Phenotypes_ahm %>%
  # Lodging_0_9 values above 9.5 are divided by 10
  # (presumably a dropped decimal point -- TODO confirm).
  mutate(LDG_0_9 = ifelse(!is.na(Lodging_0_9) & Lodging_0_9 > 9.5,
                          Lodging_0_9 / 10,
                          Lodging_0_9),
         # MISA and IDPA: the Lodging_0_9 value is taken over unscaled.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(LDG_0_9) & Location_code %in% c("MISA", "IDPA"),
                          LDG_0_9,
                          LDG_ahm),
         # NDFA: x / 2.25 + 1 maps 0 -> 1 and 9 -> 5.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(LDG_0_9) & Location_code %in% c("NDFA"),
                          LDG_0_9/2.25 + 1,
                          LDG_ahm),
         # Lodging_1_9: x / 2 + 0.5 maps 1 -> 1 and 9 -> 5.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_1_9),
                          Lodging_1_9/2 + 0.5,
                          LDG_ahm),
         # NOTE(review): if Lodging_per is a 0-100 percentage, x / 10 + 1 yields
         # 1-11 rather than 1-5 (x / 25 + 1 would give 1-5) -- confirm the range.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_per),
                          Lodging_per/10 + 1,
                          LDG_ahm),
         # Lodging_scr is taken over unscaled for these locations. IDPA is also
         # listed in the Lodging_0_9 step above, which takes precedence.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_scr) & Location_code %in% c("MOCO", "ABBI", "IDKI", "IDPA", "NESB"),
                          Lodging_scr,
                          LDG_ahm),
         # WARO: Lodging_scr rescaled with x / 2 + 0.5 (1 -> 1, 9 -> 5).
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_scr) & Location_code %in% c("WARO"),
                          Lodging_scr/2 + 0.5,
                          LDG_ahm),
         # NDFA: Lodging_scr rescaled with x / 2.25 + 1 (0 -> 1, 9 -> 5).
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_scr) & Location_code %in% c("NDFA"),
                          Lodging_scr/2.25 + 1,
                          LDG_ahm)
         ) %>%
  filter(!is.na(LDG_ahm)) %>%
  ggplot(aes(x = as.factor(Year), y = LDG_ahm)) + 
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

# Rows that currently have a cleaned lodging score. At this point LDG_ahm in
# Phenotypes_ahm has only been filled from Lodging_1_5.
filter(Phenotypes_ahm, !is.na(LDG_ahm))

# Distribution of the cleaned lodging scores filled so far.
ggplot(Phenotypes_ahm, aes(x = LDG_ahm)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 17212 rows containing non-finite values (stat_bin).

# Fill the cleaned lodging score LDG_ahm from the remaining lodging columns
# and store the result (this also adds the helper column LDG_0_9).
#
# Every LDG_ahm step only touches rows where LDG_ahm is still NA, so the order
# of the steps is the priority order of the source columns.
Phenotypes_ahm <- Phenotypes_ahm %>%
  # Lodging_0_9 values above 9.5 are divided by 10
  # (presumably a dropped decimal point -- TODO confirm).
  mutate(LDG_0_9 = ifelse(!is.na(Lodging_0_9) & Lodging_0_9 > 9.5,
                          Lodging_0_9 / 10,
                          Lodging_0_9),
         # MISA and IDPA: the Lodging_0_9 value is taken over unscaled.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(LDG_0_9) & Location_code %in% c("MISA", "IDPA"),
                          LDG_0_9,
                          LDG_ahm),
         # NDFA: x / 2.25 + 1 maps 0 -> 1 and 9 -> 5.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(LDG_0_9) & Location_code %in% c("NDFA"),
                          LDG_0_9/2.25 + 1,
                          LDG_ahm),
         # Lodging_1_9: x / 2 + 0.5 maps 1 -> 1 and 9 -> 5.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_1_9),
                          Lodging_1_9/2 + 0.5,
                          LDG_ahm),
         # NOTE(review): if Lodging_per is a 0-100 percentage, x / 10 + 1 yields
         # 1-11 rather than 1-5 (x / 25 + 1 would give 1-5) -- confirm the range.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_per),
                          Lodging_per/10 + 1,
                          LDG_ahm),
         # Lodging_scr is taken over unscaled for these locations. IDPA is also
         # listed in the Lodging_0_9 step above, which takes precedence.
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_scr) & Location_code %in% c("MOCO", "ABBI", "IDKI", "IDPA", "NESB"),
                          Lodging_scr,
                          LDG_ahm),
         # WARO: Lodging_scr rescaled with x / 2 + 0.5 (1 -> 1, 9 -> 5).
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_scr) & Location_code %in% c("WARO"),
                          Lodging_scr/2 + 0.5,
                          LDG_ahm),
         # NDFA: Lodging_scr rescaled with x / 2.25 + 1 (0 -> 1, 9 -> 5).
         LDG_ahm = ifelse(is.na(LDG_ahm) & !is.na(Lodging_scr) & Location_code %in% c("NDFA"),
                          Lodging_scr/2.25 + 1,
                          LDG_ahm)
         )

—– Disease cleaning! ———-

First of all, which of the new disease phenotypes have enough CDBN entries with phenotypes that they’d make sense to try GWAS on?

Do: CBB (240), Rust (278) Try: BCMV (86), Curly_top_virus (122), Halo_blight (128), Root_rot (104), White_mold (167), Zinc (91)

# For each candidate trait, list the sequenced entries that have at least one
# non-missing value in any of that trait's columns. Each result has one row
# per distinct Seq_ID, so its row count is the number of sequenced entries
# with data; `count` is the number of records behind each Seq_ID.
#
# Rows without a Seq_ID are dropped before grouping. Otherwise group_by()
# keeps them as an extra NA group and the row count overstates the number of
# sequenced entries by one.
#
# The trailing remarks are the author's tallies from earlier runs; they are
# not recomputed here.

# Curly top virus
Phenotypes_ahm %>%
  filter(!is.na(Curly_top_eval) | !is.na(Curly_top_virus_scr) | !is.na(Curly_Top_per) | !is.na(Virus_eval)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 122 that were sequenced. Worth a try maybe. Actually 133.

# Common bacterial blight (CBB)
Phenotypes_ahm %>%
  filter(!is.na(CBB_eval) | !is.na(CBB_per) | !is.na(CBB_scr) | !is.na(CBB_Foliage_per_non_innoc)) %>% # 73
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 240 that were sequenced

# Halo blight
Phenotypes_ahm %>%
  filter(!is.na(Halo_B_per) | !is.na(Halo_blight_scr) | !is.na(Halo_blight_scr_1) | !is.na(Halo_blight_scr_2) | !is.na(Halo_blight_scr_3) | !is.na(Halo_blight_lvs_per) | !is.na(Halo_blight_pods_per)| !is.na(Blight_Pod_per) | !is.na(Blight_scr)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 128 that were sequenced.  Worth a try I guess.

# White mold
Phenotypes_ahm %>%
  filter(!is.na(White_mold_scr) | !is.na(White_mold_per) | !is.na(White_mold_eval) | !is.na(White_mold_scr_val2) | !is.na(White_mold_GH) | !is.na(White_mold_stems_per) | !is.na(White_mold_porosity)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 167 that were sequenced. Try it now then.

# BCMV
Phenotypes_ahm %>%
  filter(!is.na(BCMV_eval)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 86 that were sequenced... yeah no. Actually 133. Worth a try I guess.

# Rust
Phenotypes_ahm %>%
  filter(!is.na(Rust_scr) | !is.na(Rust_scr_c) | !is.na(Rust_CIAT_scr) | !is.na(Rust_per) | !is.na(Rust_eval) | !is.na(Rust_eval_1) | !is.na(Rust_eval_2_) | !is.na(Rust_eval_3) | !is.na(Rust_eval_2)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 278 that were sequenced. Niice.

# Scratch query, re-pointed at one column at a time; it currently checks
# Pod_clearance. The results the author recorded for each column are below
# (Anthracnose_eval gave 0 -- it is an empty column).
Phenotypes_ahm %>%
  filter(!is.na(Pod_clearance) ) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n())

# 0 for Anthracnose_eval. Empty column it seems.
# 108 for internode_length_scr so nope.
# 21 for Powdery_mildew_scr
# 132 for Yield_day and Yield_Day_Seedfill. No seems pointless.
# 104 for Biomass_day. No seems pointless.
# Air_pollut_scr 81 for this and the _per and _rate. So no.
# Pod_clearance 86. Ok, so no!

# Fusarium
Phenotypes_ahm %>%
  filter(!is.na(Fusarium_emerg) | !is.na(Fusarium_seedling_vig) | !is.na(Fusarium_seed_yield) | !is.na(Fusarium_scr) | !is.na(Fusarium_wilt_GH)) %>% # 73
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 64 that were sequenced. Nope.

# Root rot
Phenotypes_ahm %>%
  filter(!is.na(Root_rot_emerg_scr) | !is.na(Root_rot_early_vig_scr) | !is.na(Root_rot_seed_yield) | !is.na(Root_rot_scr) | !is.na(Root_rot_eval)| !is.na(Rhizoc_scr)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 104-105 that were sequenced.

# Zinc deficiency
Phenotypes_ahm %>%
  filter(!is.na(Zinc_dwarfing) | !is.na(Zinc_yellowing) | !is.na(Zinc_defic_eval) | !is.na(Zinc_defic_scr) | !is.na(Zinc_reaction_eval)) %>%
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 91 that were sequenced.

# Number of non-missing values in every column of Phenotypes_ahm.
colSums(!is.na(Phenotypes_ahm))

##                   SAS_ahm                  SAD1_ahm 
##                       314                       218 
##                  SADS_ahm                   SQS_ahm 
##                       559                       147 
##                  Genotype                   CDBN_ID 
##                     19415                     19415 
##                    Seq_ID             Location_code 
##                     14780                     19415 
##                      Year                 Gene_pool 
##                     19415                     18646 
##                      Race          Market_class_ahm 
##                     18646                     18666 
##                     State               Climate_bin 
##                     19415                     19415 
##                  Latitude                 Longitude 
##                     19415                     19415 
##               Yield_kg_ha            DAYS_TO_FLOWER 
##                     18242                      8916 
##          DAYS_TO_MATURITY           Unit_seed_wt_mg 
##                     12005                     14358 
##               Test_weight        Days_to_full_bloom 
##                       350                         0 
##      DAYS_TO_BLOOM_50_PER         Seedfill_duration 
##                       170                      2629 
##           DAYS_TO_HARVEST          Harvest_maturity 
##                       119                       339 
##         Ripening_date_scr              Maturity_scr 
##                        36                        74 
##              Plant_height             Canopy_height 
##                      2627                       481 
##              Growth_habit          Growth_habit_scr 
##                      2602                        32 
##           Plant_type_eval      Internode_length_scr 
##                        74                       530 
##             Pod_clearance               Plant_width 
##                       250                        78 
##         Pod_position_eval                Pod_height 
##                        34                        64 
##        Branch_length_eval               Lodging_scr 
##                        62                       241 
##               Lodging_1_5               Lodging_0_9 
##                      2203                        74 
##               Lodging_1_9               Lodging_per 
##                       975                        35 
##                 Yield_day        Yield_Day_Seedfill 
##                      4172                      3460 
##                   Biomass               Biomass_day 
##                      2899                      1459 
##             Harvest_index             Emergence_scr 
##                      3193                        60 
##           Early_vigor_scr                 Stand_per 
##                       763                        64 
##                Stand_code          Seed_quality_scr 
##                        20                       147 
##       Seed_appearance_scr         Seed_appear_desir 
##                       314                         0 
##     Seed_appear_desir_scr          Desirability_scr 
##                       559                       218 
##    Field_desirability_scr        Harvestability_scr 
##                        72                       232 
##           Adaptation_eval              Desirability 
##                        20                         0 
##               Pod_set_scr        MN_deficiency_eval 
##                        39                        11 
##             Zinc_dwarfing            Zinc_yellowing 
##                        36                        36 
##           Zinc_defic_eval            Zinc_defic_scr 
##                        24                        38 
##            Air_pollut_scr                 Ozone_scr 
##                       387                        24 
##              Bronzing_scr               Disease_scr 
##                        83                        23 
##          Anthracnose_eval                 BCMV_eval 
##                         0                       124 
##                   CBB_scr                   CBB_per 
##                       556                       450 
##                  CBB_eval     CBB_Foliage_per_innoc 
##                        36                        40 
## CBB_Foliage_per_non_innoc           CBB_pustule_scr 
##                        40                        20 
##            Blight_Pod_per             Curly_Top_per 
##                        40                        88 
##       Curly_top_virus_scr            Fusarium_emerg 
##                        96                        32 
##     Fusarium_seedling_vig       Fusarium_seed_yield 
##                        32                        32 
##          Fusarium_wilt_GH           Halo_blight_scr 
##                        33                       134 
##                Halo_B_per        Powdery_mildew_scr 
##                        70                        37 
##                Rhizoc_scr        Root_rot_emerg_scr 
##                        30                        36 
##    Root_rot_early_vig_scr       Root_rot_seed_yield 
##                        36                        36 
##              Root_rot_scr             Root_rot_eval 
##                        58                        43 
##                  Rust_scr                Rust_scr_c 
##                       316                        80 
##             Rust_CIAT_scr                  Rust_per 
##                        23                       369 
##                 Rust_eval          Rust_Foliage_per 
##                       489                        40 
##         Rust_Pustule_Type            White_mold_scr 
##                        32                       128 
##            White_mold_per           White_mold_eval 
##                       353                         6 
##       White_mold_scr_val2             White_mold_GH 
##                         0                        33 
##           Cooking_quality        Seed_color_uniform 
##                        34                        36 
##           Seed_wt_cul_dry       Seed_wt_cul_imbibed 
##                        36                        36 
##        Seed_dry_imb_ratio        Seed_cooked_appear 
##                        36                       100 
##           Seed_splits_cul           Seed_luminosity 
##                        36                        72 
##               Seed_chroma                  Seed_hue 
##                        72                        72 
##         Halo_blight_scr_1         Halo_blight_scr_2 
##                        45                        46 
##         Halo_blight_scr_3            Air_pollut_per 
##                        46                        46 
##                 Merit_scr          Fe_chlorosis_scr 
##                        69                        46 
##              Seed_L_color         Stand_uniform_scr 
##                        28                        34 
##           Row_closure_scr     Phenology_stage_51DAP 
##                        34                        34 
##        Growth_density_scr         Tunnel_effect_scr 
##                        34                        34 
##          Architecture_scr            Vine_habit_scr 
##                        34                        34 
##          Pod_maturity_scr             Plant_density 
##                        34                        36 
##            Pod_height_scr                Podset_scr 
##                        34                        34 
##             Flat_vine_scr                 Bush_Type 
##                        34                        34 
##             Pods_Peduncle             Seeds_per_Pod 
##                        34                        34 
##              Flower_color        General_appearance 
##                        41                        41 
##           Air_pollut_rate          Spray_injury_scr 
##                        41                        40 
##     Plant_vigor_bloom_scr            Plant_type_scr 
##                        41                        41 
##               Quality_scr                Blight_scr 
##                        41                        44 
##        Zinc_reaction_eval             Plant_width_2 
##                        13                        10 
##    Plant_architecture_scr              Fusarium_scr 
##                         5                        40 
##      White_mold_stems_per       Halo_blight_lvs_per 
##                         0                        32 
##      Halo_blight_pods_per               Rust_eval_1 
##                        32                        40 
##              Rust_eval_2_               Rust_eval_3 
##                         0                         0 
##                Plant_type                Virus_eval 
##                        64                        84 
##            Curly_top_eval          Drydown_duration 
##                        73                        28 
##          Frost_damage_scr       CBB_Innoculated_per 
##                        28                        36 
##                       GH2        Leaf_retention_scr 
##                         0                        26 
##               Rust_eval_2       White_mold_porosity 
##                        40                        40 
##                  Num_miss              Num_numerics 
##                     18530                     19415 
##                 Pref_name                Prev_names 
##                      1869                      4643 
##                 Orig_name                   Det_scr 
##                      1869                     14801 
##                 Yield_ahm       Unit_seed_wt_mg_ahm 
##                     18227                     14230 
##                    GH_ahm                   GHS_ahm 
##                      2602                        32 
##                    PT_ahm                  CIAT_ahm 
##                        74                      2558 
##                  Vine_ahm                   DTF_ahm 
##                       642                      9084 
##          Plant_height_ahm          Plant_length_ahm 
##                      2896                       180 
##                   DTM_ahm                    DG_ahm 
##                     12045                      7415 
##                    SF_ahm                   EVG_ahm 
##                      7544                       823 
##                   SAD_ahm                  SAD_1to5 
##                      1238                       395 
##                  SAD_1to7                  SAD_1to9 
##                        47                       784 
##                  SAD_1to3                   BMS_ahm 
##                      1226                      2846 
##                   HIN_ahm                   LDG_ahm 
##                      3140                      3313 
##                   LDG_0_9 
##                        74

7n. Clean CBB

Use CIAT’s scale that converts percents to scores for this cleanup.

Variable definitions:

Common Blight (4) Scale of 1-5 where 1 is no incidence and 5 is severely infected.

Or, on a 1-9 or percentage scale: Common bacterial blight (Xanthomonas campestris pv. phaseoli) Evaluation stages: R6, R8. Scale: (Figure 6). 1. No visible disease symptoms. 3. Approximately 2% of the leaf surface area covered with a few small lesions. Pods are generally free of lesions. 5. Approximately 5% of the leaf surface area covered by small lesions that are beginning to coalesce and sometimes encircled by yellow halos resulting in minor blight. Lesions on the pods are generally small and not coalescing. 7. Approximately 10% of the leaf surface area covered with medium and large lesions which are usually accompanied by yellow halos and necrosis. Lesions on pods are large and coalescing and often show bacterial exudate. 9. More than 25% of the leaf surface area with large coalescing and generally necrotic lesions resulting in defoliation. Lesions on pods coalesce to cover extensive areas, exhibit abundant bacterial exudation which sometimes causes pod malformation and empty pods.

Here’s the scale to percentage key, same as the Rust key actually: 1 0% RST_per <= 0.5, 1, RST_per ~2 1% between(RST_per,0.5,1.5), 2, RST_per 3 2% between(RST_per,1.5,2.5), 3, RST_per ~4 3.75% between(RST_per,2.5,4), 4, RST_per 5 5% between(RST_per,4,6), 5, RST_per ~6 7.5% between(RST_per,6,8), 6, RST_per 7 10% between(RST_per,8,14), 7, RST_per ~8 20% between(RST_per,14,25), 8, RST_per 9 25% + RST_per >= 25, 9, RST_per

Possible variables to coalesce:

CBB_scr CBB_per CBB_eval # R (1), MR (4), MS(6), S (9) CBB_Foliage_per_innoc # don’t combine not interested if they put the disease in I think CBB_Foliage_per_non_innoc CBB_pustule_scr # don’t combine just at PRIS

# Sequenced entries with at least one non-missing CBB value. The result has
# one row per distinct Seq_ID; `count` is the number of records behind it.
# Rows without a Seq_ID are dropped before grouping. Otherwise group_by()
# keeps them as an extra NA group and the row count overstates the number of
# sequenced entries by one.
Phenotypes_ahm %>%
  filter(!is.na(CBB_eval) | !is.na(CBB_per) | !is.na(CBB_scr) | !is.na(CBB_Foliage_per_non_innoc)) %>% # 73
  filter(!is.na(Seq_ID)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 240 that were sequenced

# Distinct values recorded in CBB_pustule_scr.
unique(as.factor(Phenotypes_ahm$CBB_pustule_scr))

## [1] <NA> 6    4    3    1    5   
## Levels: 1 3 4 5 6

# Show the rows that carry a categorical CBB evaluation, with the identifying
# columns listed first. Empty and single-space strings are turned into NA
# before the non-missing rows are kept. Nothing is assigned here.
dplyr::select(
  filter(
    mutate(Phenotypes_ahm,
           CBB_eval = ifelse(CBB_eval %in% c("", " "), NA, CBB_eval)),
    !is.na(CBB_eval)
  ),
  CBB_eval, CDBN_ID, Location_code, Year, everything()
)

# Build the harmonized common bacterial blight columns:
#   CBB_per_ahm - percentage values (CBB_per, else CBB_Foliage_per_non_innoc)
#   CBB_scr_ahm - score, filled in this priority order: the categorical
#                 CBB_eval, then the numeric CBB_scr, then a score derived
#                 from CBB_per_ahm.
# The steps run in order and later steps read the results of earlier ones.
Phenotypes_ahm <- Phenotypes_ahm %>%
  # NESB 1989 and 2011: the CBB_scr values are removed from the score column
  # and moved into the percentage column two steps below
  # (presumably those entries are percentages -- TODO confirm).
  mutate(CBB_scr_ahm = ifelse(!is.na(CBB_scr) & Location_code == "NESB" & Year %in% c(1989, 2011),
                          NA, 
                          CBB_scr),
         CBB_per_ahm = coalesce(CBB_per, CBB_Foliage_per_non_innoc),
         CBB_per_ahm = ifelse(!is.na(CBB_scr) &  Location_code == "NESB" & Year %in% c(1989, 2011),
                          CBB_scr, 
                          CBB_per_ahm),
         # Scores from 0 to 5 are rescaled with (x + 1) * 1.5 (0 -> 1.5, 5 -> 9).
         # NOTE(review): this applies to every score <= 5 whatever scale the
         # location used; a genuine 1-9 score of 5 would become 9 while a 6
         # stays 6 -- confirm all such scores come from a 0-5 or 1-5 scale.
         CBB_scr_ahm = ifelse(between(CBB_scr_ahm, 0, 5),
                              (CBB_scr_ahm + 1)*1.5,
                              CBB_scr_ahm),
         # Empty and single-space evaluations count as missing.
         CBB_eval = ifelse(CBB_eval %in% c("", " "),
                           NA,
                           CBB_eval),
         # A categorical evaluation overrides any numeric score on the row:
         # S -> 9, MS -> 6, MR -> 4, R -> 1.
         CBB_scr_ahm = ifelse(CBB_eval %in% c("S"),
                           9,
                           CBB_scr_ahm),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("MS"),
                           6,
                           CBB_scr_ahm),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("MR"),
                           4,
                           CBB_scr_ahm),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("R"),
                           1,
                           CBB_scr_ahm),
         CBB_scr_ahm = as.double(CBB_scr_ahm),
         # Rows still without a score get one from the percentage. between() is
         # inclusive at both ends and each step only fills rows that are still
         # NA, so a percentage exactly on a boundary gets the lower score.
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & CBB_per_ahm <= 0.5, 1, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 0.5, 1.5), 2, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 1.5, 2.5), 3, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 2.5, 4), 4, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 4, 6), 5, CBB_scr_ahm),            
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 6, 8), 6, CBB_scr_ahm),              
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 8, 14), 7, CBB_scr_ahm),             
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 14, 25), 8, CBB_scr_ahm),               
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & CBB_per_ahm >= 25, 9, CBB_scr_ahm)                 
         )

Phenotypes_ahm %>%
  mutate(CBB_scr_ahm = ifelse(!is.na(CBB_scr) & Location_code == "NESB" & Year %in% c(1989, 2011),
                          NA, 
                          CBB_scr),
         CBB_per_ahm = coalesce(CBB_per, CBB_Foliage_per_non_innoc),
         CBB_per_ahm = ifelse(!is.na(CBB_scr) &  Location_code == "NESB" & Year %in% c(1989, 2011),
                          CBB_scr, 
                          CBB_per_ahm),
         CBB_scr_ahm = ifelse(between(CBB_scr_ahm, 0, 5),
                              (CBB_scr_ahm + 1)*1.5,
                              CBB_scr_ahm),
         CBB_eval = ifelse(CBB_eval %in% c("", " "),
                           NA,
                           CBB_eval),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("S"),
                           9,
                           CBB_scr_ahm),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("MS"),
                           6,
                           CBB_scr_ahm),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("MR"),
                           4,
                           CBB_scr_ahm),
         CBB_scr_ahm = ifelse(CBB_eval %in% c("R"),
                           1,
                           CBB_scr_ahm),
         CBB_scr_ahm = as.double(CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & CBB_per_ahm <= 0.5, 1, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 0.5, 1.5), 2, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 1.5, 2.5), 3, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 2.5, 4), 4, CBB_scr_ahm),
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 4, 6), 5, CBB_scr_ahm),            
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 6, 8), 6, CBB_scr_ahm),              
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 8, 14), 7, CBB_scr_ahm),             
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & between(CBB_per_ahm, 14, 25), 8, CBB_scr_ahm),               
         CBB_scr_ahm = ifelse(is.na(CBB_scr_ahm) & !is.na(CBB_per_ahm) & CBB_per_ahm >= 25, 9, CBB_scr_ahm)                 
         ) %>%
  filter(!is.na(CBB_scr_ahm)) %>%
  ggplot(aes(x = as.factor(Year), y = CBB_scr_ahm)) + 
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

# Raw CBB_scr by year, one panel per location.
# The mutate() that used to sit in this pipeline passed CBB_scr as both the
# "yes" and the "no" value of ifelse(), so it rescaled nothing; its only
# possible effect was to blank rows whose Location_code or Year is missing.
# It is removed so the plot shows CBB_scr exactly as recorded.
Phenotypes_ahm %>%
  filter(!is.na(CBB_scr)) %>%
  ggplot(aes(x = as.factor(Year), y = CBB_scr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

# Raw CBB_per by year, one panel per location.
# A mutate() with ifelse(!is.na(CBB_per), CBB_per, CBB_per) used to follow the
# filter; both branches were the same column, so it was a no-op and is gone.
Phenotypes_ahm %>%
  filter(!is.na(CBB_per)) %>%
  ggplot(aes(x = as.factor(Year), y = CBB_per)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

# CBB_Foliage_per_non_innoc by year (boxplot plus jittered points), one panel
# per location; rows without a value are left out.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(CBB_Foliage_per_non_innoc)),
  mapping = aes(x = as.factor(Year), y = CBB_Foliage_per_non_innoc)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Store CBB_ahm and then plot the stored column. The recode used to be written
# twice (once for the plot, once for the assignment); it now exists once.
#
# CBB_ahm takes CBB_per, then CBB_Foliage_per_non_innoc, then CBB_scr, using
# the first of the three that is not NA.
# NOTE(review): this puts percentage columns and a score column into one
# variable without rescaling - confirm that is intended.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(CBB_ahm = coalesce(CBB_per, CBB_Foliage_per_non_innoc),
         CBB_ahm = ifelse(is.na(CBB_ahm) & !is.na(CBB_scr),
                          CBB_scr,
                          CBB_ahm))

# CBB_ahm by year, one panel per location.
Phenotypes_ahm %>%
  filter(!is.na(CBB_ahm)) %>%
  ggplot(aes(x = as.factor(Year), y = CBB_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

1989 for NESB CBB_scr seems to actually be CBB_per, perhaps. Divide by 5. 3 location*years seem to be on a 1-9 scale, so divide by 2 and add 0.5 to convert Lodging on a 1-9 scale to Lodging on a 1-5 scale.

7o. Clean rust

Possible variables to coalesce:

Rust_scr Rust_scr_c Rust_CIAT_scr Rust_per Rust_eval # letters w/ unknown meanings Rust_Foliage_per Rust_Pustule_Type # letters and numbers

First thoughts: Coalesce Rust_per and Rust_Foliage_per Use rust percentage to scale key from CIAT to convert these to scores Coalesce Rust_CIAT_scr with Rust_scr Convert Rust_scr_c to a scale and combine with the above. Combine scores and percentages.

Here’s the scale-to-percentage key, written as score, percentage, then the recode rule: 1 = 0%: RST_per <= 0.5, 1, RST_per; ~2 = 1%: between(RST_per,0.5,1.5), 2, RST_per; 3 = 2%: between(RST_per,1.5,2.5), 3, RST_per; ~4 = 3.75%: between(RST_per,2.5,4), 4, RST_per; 5 = 5%: between(RST_per,4,6), 5, RST_per; ~6 = 7.5%: between(RST_per,6,8), 6, RST_per; 7 = 10%: between(RST_per,8,14), 7, RST_per; ~8 = 20%: between(RST_per,14,25), 8, RST_per; 9 = 25%+: RST_per >= 25, 9, RST_per.

Unfortunately there’s not a good equation to use for this…

# Raw Rust_scr by year, one panel per location. Blank strings are converted
# to NA and dropped together with the other missing values.
Phenotypes_ahm %>%
  mutate(Rust_scr = ifelse(Rust_scr %in% c("", " "), NA, Rust_scr)) %>%
  filter(!is.na(Rust_scr)) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = Rust_scr)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Add the combined rust variables to Phenotypes_ahm:
#   RST_per - Rust_per, falling back to Rust_Foliage_per
#   RST_scr - first non-missing of Rust_scr, Rust_CIAT_scr and the cleaned
#             Rust_scr_c; where none exists, a score derived from RST_per.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    RST_per = coalesce(Rust_per, Rust_Foliage_per),
    # Rust_scr_c clean-up: blanks become NA, the two-valued entries become
    # their midpoint, then the column is converted to double.
    Rust_scr_c = ifelse(Rust_scr_c %in% c("", " "), NA, Rust_scr_c),
    Rust_scr_c = ifelse(Rust_scr_c %in% c("2,3"), 2.5, Rust_scr_c),
    Rust_scr_c = ifelse(Rust_scr_c %in% c("3/4"), 3.5, Rust_scr_c),
    Rust_scr_c = ifelse(Rust_scr_c %in% c("4,5"), 4.5, Rust_scr_c),
    Rust_scr_c = ifelse(Rust_scr_c %in% c("5,6"), 5.5, Rust_scr_c),
    Rust_scr_c = as.double(Rust_scr_c),
    RST_scr = coalesce(Rust_scr, Rust_CIAT_scr, Rust_scr_c),
    # Percentage -> score for rows that still lack a score. Bins:
    # <= 0.5 -> 1, (0.5, 1.5] -> 2, (1.5, 2.5] -> 3, (2.5, 4] -> 4,
    # (4, 6] -> 5, (6, 8] -> 6, (8, 14] -> 7, (14, 25] -> 8, > 25 -> 9.
    # A value exactly on a boundary takes the lower score.
    RST_scr = ifelse(is.na(RST_scr) & !is.na(RST_per),
                     findInterval(RST_per,
                                  c(0.5, 1.5, 2.5, 4, 6, 8, 14, 25),
                                  left.open = TRUE) + 1,
                     RST_scr)
  )

# unique(as.factor(Phenotypes_ahm$Rust_scr_c))


# Combined rust score (RST_scr) by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(RST_scr)),
  mapping = aes(x = as.factor(Year), y = RST_scr)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Rust_scr_c by year, one panel per location; blank strings are set to NA and
# missing values are dropped before plotting.
Phenotypes_ahm %>%
  mutate(Rust_scr_c = ifelse(Rust_scr_c %in% c("", " "), NA, Rust_scr_c)) %>%
  filter(!is.na(Rust_scr_c)) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = Rust_scr_c)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Rust_CIAT_scr by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Rust_CIAT_scr)),
  mapping = aes(x = as.factor(Year), y = Rust_CIAT_scr)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Rust_per by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Rust_per)),
  mapping = aes(x = as.factor(Year), y = Rust_per)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Rust_Foliage_per by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Rust_Foliage_per)),
  mapping = aes(x = as.factor(Year), y = Rust_Foliage_per)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

7p. Clean BCMV

evaluations: NE, SL, CM, 5CM, 10N, 7NE, BR, VN, NS, LL, LL/, VA, SN, BCM, BR?

I think I should score BR/SN separately from regular BCMV to try and pull out the varieties with a specific gene for resistance to BCMV. May not work at all but worth a try…

NS = no symptoms = 0 VN = veinal necrosis = 1 VA = variable response = 0.5 LL, LL/ = local lesions = 1 BR, SN, BR? == black root or systemic necrosis. The black root reaction is a systemic necrosis symptom that occurs in varieties with a specific gene for resistance to BCMV. Varieties with this gene are resistant to all strains of BCMV under most conditions. However, when plants growing at high temperature are infected with BCMV, the hypersensitive black root reaction develops.

coalesce with Virus_eval with Virus_eval first. Ignore NE, SL, 10N, 7NE, BR, VN, NS, LL, LL/, VA, SN, BR? Only include CM, 5CM, and BCM. Second round: add NS, LL, LL/, VA

Other than that I have no idea what these mean. Seems like a really dumb idea to include this, particularly with only 86 varieties.

IDPA 2001, IDKI 1999, IDPA 1988, WARO 1988, IDKI 1984, IDKI 1983. Also found data for 1989 for WAPR and IDKI to include.

Virus_eval might be for BCMV also. Which means I should possibly reevaluate BCMV. CRAP.

*Also NA’s for the site by year combinations with data indicate resistant lines, not unscored lines. For BCMV there are 11 s by y and 8 years with data:*

IDPA 1988, 1990, 1992, 2001 IDKI 1983, 1984, 1988, 1989, 1999 WARO 1988, 1989

Now there are 133 varieties, including the NA’s. Victory!

# Rows per sequenced line (Seq_ID) among rows that have a BCMV_eval or a
# Virus_eval entry.
Phenotypes_ahm %>%
  filter(!(is.na(BCMV_eval) & is.na(Virus_eval))) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 86 that were sequenced... yeah no. Actually 133. Worth a try I guess.

# Inspect every row with a BCMV_eval or Virus_eval entry, with those two
# columns moved to the front.
Phenotypes_ahm %>%
  filter(!(is.na(BCMV_eval) & is.na(Virus_eval))) %>%
  dplyr::select(BCMV_eval, Virus_eval, everything())

# Score BCMV as a 0/1 variable (CMV_ahm), store it in Phenotypes_ahm, and then
# plot the stored column.
#
# The recode used to exist twice, and the two copies had drifted apart: the
# plotting copy set BCMV_eval values "NE", "SL", "10N" and "7NE" to NA, while
# the copy that was actually stored scored "SL" as 1 and left the other three
# at the site-year default. The stored rules are kept unchanged here and the
# plot now shows exactly what is stored.
# NOTE(review): confirm that "SL" = 1 and "NE"/"10N"/"7NE" = site-year
# default is the intended final treatment.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # Site-years that have BCMV data start at 0; everything else starts at NA.
    CMV_ahm = ifelse((Location_code == "IDPA" & Year %in% c(1988, 1990, 1992, 2001)) | (Location_code == "IDKI" & Year %in% c(1983, 1984, 1988, 1989, 1999)) | (Location_code == "WARO" & Year %in% c(1988, 1989)),
                     0,
                     NA),
    # BCMV_eval codes scored as 1.
    CMV_ahm = ifelse(BCMV_eval %in% c("CM", "5CM", "BCM", "BCMV", "LL", "LL/", "VN", "VA", "SL"),
                     1,
                     CMV_ahm),
    # Virus_eval entries scored as 1.
    CMV_ahm = ifelse(Virus_eval %in% c("BCMV-VN", "BCMV-MM", "BCMV", "CTV, BCMV", "VN", "BCMV? & CTV", "BCMV?", "CTV&BCMV"),
                     1,
                     CMV_ahm),
    # 1989 entries listed by CDBN_ID for WARO and IDKI, scored as 1.
    CMV_ahm = ifelse(Location_code == "WARO" & Year == 1989 & CDBN_ID %in% c("NW63", "K0228", "Olathe", "UI114", "Sierra", "D85212", "ISB_82_354", "UI59", "Harris", "GN_WM_85_43", "GN_WM_85_55", "Viva", "55037", "UNS_117", "Flamingo"),
                     1,
                     CMV_ahm),
    CMV_ahm = ifelse(Location_code == "IDKI" & Year == 1989 & CDBN_ID %in% c("UI114", "Sierra", "GN_WM_85_43"),
                     1,
                     CMV_ahm)
  )

# Stored CMV_ahm by year, one panel per location.
Phenotypes_ahm %>%
  filter(!is.na(CMV_ahm)) %>%
  #group_by(Seq_ID) %>%
  #summarise(count = n())
  ggplot(aes(x = as.factor(Year), y = CMV_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

7p.i. Add BR phenotype

# Inspect every row with a BCMV_eval or Virus_eval entry before deriving the
# BR phenotype; the two evaluation columns are shown first.
Phenotypes_ahm %>%
  filter(!(is.na(BCMV_eval) & is.na(Virus_eval))) %>%
  dplyr::select(BCMV_eval, Virus_eval, everything())

# Score the BR phenotype as a 0/1 variable (BR_ahm), store it in
# Phenotypes_ahm, and then plot the stored column. The recode used to be
# written twice (plot copy and stored copy, identical); it now exists once.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # Site-years that have BCMV data start at 0; everything else starts at NA.
    BR_ahm = ifelse((Location_code == "IDPA" & Year %in% c(1988, 1990, 1992, 2001)) | (Location_code == "IDKI" & Year %in% c(1983, 1984, 1988, 1989, 1999)) | (Location_code == "WARO" & Year %in% c(1988, 1989)),
                    0,
                    NA),
    # BCMV_eval codes scored as 1 for BR.
    BR_ahm = ifelse(BCMV_eval %in% c("NE", "10N", "7NE", "BR", "SN"),
                    1,
                    BR_ahm),
    # Virus_eval entries scored as 1 for BR.
    BR_ahm = ifelse(Virus_eval %in% c("BYMV & BR", "BR"),
                    1,
                    BR_ahm),
    # 1989 entries listed by CDBN_ID for WARO and IDKI, scored as 1.
    BR_ahm = ifelse(Location_code == "WARO" & Year == 1989 & CDBN_ID %in% c("Aurora", "Fleetwood", "ISB_85_672", "UI114", "Sierra", "D85212", "Viva", "Midnight", "UI906", "ISB_82_772"),
                    1,
                    BR_ahm),
    BR_ahm = ifelse(Location_code == "IDKI" & Year == 1989 & CDBN_ID %in% c("Aurora", "Fleetwood", "UI137", "Mayflower", "ISB_85_672", "Yolano", "Flamingo", "Midnight", "UI906"),
                    1,
                    BR_ahm)
  )

# Stored BR_ahm by year, one panel per location.
Phenotypes_ahm %>%
  filter(!is.na(BR_ahm)) %>%
  #group_by(Seq_ID) %>%
  #summarise(count = n())
  ggplot(aes(x = as.factor(Year), y = BR_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

7q. Clean Curly top virus

Looks like some of the NA data for Curly_top_eval actually means the absence of the virus, or 0/R. That increases the number of entries with data.

Virus_eval is for BCMV also.

Make R individuals explicit in each variable first, and then coalesce.

Curly_Top_per WARO 1992 1986 WAOT 1986

Also 133 with CTV evaluation, curiously. Probably the same set.

# Look at all rows for two groups of site-years: WARO 1986/1992, then
# WAOT 1992/2002.
filter(Phenotypes_ahm, Location_code == "WARO" & Year %in% c(1986, 1992))

filter(Phenotypes_ahm, Location_code == "WAOT" & Year %in% c(2002, 1992))

# Score curly top virus as a 0/1 variable (CTV_ahm), store it together with
# its two helper columns (CTP, CTVS) in Phenotypes_ahm, and then count rows
# per sequenced line.
#
# The recode used to be written twice (once for the count, once for the
# assignment, identical); it now exists once. The count pipeline also had a
# select() with a repeated column immediately before summarise(), which had no
# effect on the result and is dropped.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # CTP: Curly_Top_per, with NA read as 0 in WARO 1986/1992 and WAOT 1986.
    CTP = Curly_Top_per,
    CTP = ifelse(((Location_code == "WARO" & Year %in% c(1986, 1992)) | (Location_code == "WAOT" & Year == 1986)) & is.na(CTP),
                 0,
                 CTP),
    # CTVS: Curly_top_virus_scr, with NA read as 1 in WARO 1994/2000 and
    # WAOT 1992/2002.
    CTVS = Curly_top_virus_scr,
    CTVS = ifelse(((Location_code == "WARO" & Year %in% c(1994, 2000)) | (Location_code == "WAOT" & Year %in% c(1992, 2002))) & is.na(CTVS),
                  1,
                  CTVS),
    # Site-years with evaluation data start at 0; everything else starts at NA.
    CTV_ahm = ifelse(((Location_code == "IDPA" & Year %in% c(1988, 1990, 1992)) | (Location_code == "IDKI" & Year %in% c(1988, 1989, 1992)) | (Location_code == "WARO" & Year %in% c(1988, 1989, 1990)) | (Location_code == "WAOT" & Year %in% c(1988))),
                     0,
                     NA),
    # Evaluation entries scored as 1.
    CTV_ahm = ifelse(Curly_top_eval %in% c("CTV", "sev CTV"),
                     1,
                     CTV_ahm),
    CTV_ahm = ifelse(Virus_eval %in% c("CTV", "sev CTV", "CTV, BCMV", "CTV & AMV"),
                     1,
                     CTV_ahm),
    # WARO 1989 entries listed by CDBN_ID, scored as 1.
    CTV_ahm = ifelse(Location_code == "WARO" & Year == 1989 & CDBN_ID %in% c("GN_WM_85_43", "ISB_85_672", "UI906", "Redkloud", "Montcalm"),
                     1,
                     CTV_ahm),
    # Rows still NA are filled from the score and the percentage. Both "1"
    # rules run before both "0" rules, so a positive reading in either
    # column wins over a negative reading in the other.
    # NOTE(review): a CTVS strictly between 1 and 1.1, or a CTP strictly
    # between 0.99 and 1, matches none of these rules and stays NA.
    CTV_ahm = ifelse(CTVS >= 1.1 & is.na(CTV_ahm),
                     1,
                     CTV_ahm),
    CTV_ahm = ifelse(CTP >= 1 & is.na(CTV_ahm),
                     1,
                     CTV_ahm),
    CTV_ahm = ifelse(CTVS <= 1 & is.na(CTV_ahm),
                     0,
                     CTV_ahm),
    CTV_ahm = ifelse(CTP <= 0.99 & is.na(CTV_ahm),
                     0,
                     CTV_ahm)
  )

# Rows per sequenced line (Seq_ID) among rows with a CTV_ahm value.
Phenotypes_ahm %>%
  filter(!is.na(CTV_ahm)) %>%
  group_by(Seq_ID) %>%
  summarise(count = n())

# Rows per sequenced line (Seq_ID) among rows with an entry in any of the
# four raw curly top / virus columns.
Phenotypes_ahm %>%
  filter(!(is.na(Curly_top_eval) & is.na(Curly_top_virus_scr) &
             is.na(Curly_Top_per) & is.na(Virus_eval))) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 122 that were sequenced. Worth a try maybe.

# Rows with a Curly_top_eval entry, identifying columns first.
dplyr::select(
  filter(Phenotypes_ahm, !is.na(Curly_top_eval)),
  Curly_top_eval, Location_code, Year, CDBN_ID, everything()
)

# Rows with a Virus_eval entry, identifying columns first.
dplyr::select(
  filter(Phenotypes_ahm, !is.na(Virus_eval)),
  Virus_eval, Location_code, Year, CDBN_ID, everything()
)

# Curly_top_virus_scr by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Curly_top_virus_scr)),
  mapping = aes(x = as.factor(Year), y = Curly_top_virus_scr)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

# Curly_Top_per by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Curly_Top_per)),
  mapping = aes(x = as.factor(Year), y = Curly_Top_per)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

7r. Clean Halo blight

136 varieties.

11 site * year combinations.

# Move the Genotype:Unit_seed_wt_mg column range to the front; all other
# columns follow in their existing order.
Phenotypes_ahm <- dplyr::select(
  Phenotypes_ahm,
  Genotype:Unit_seed_wt_mg, everything()
)

# Rows per sequenced line (Seq_ID) among rows with an entry in any of the
# nine halo blight / blight columns.
Phenotypes_ahm %>%
  filter(!(is.na(Halo_B_per) & is.na(Halo_blight_scr) &
             is.na(Halo_blight_scr_1) & is.na(Halo_blight_scr_2) &
             is.na(Halo_blight_scr_3) & is.na(Halo_blight_lvs_per) &
             is.na(Halo_blight_pods_per) & is.na(Blight_Pod_per) &
             is.na(Blight_scr))) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 128 that were sequenced.  Worth a try I guess.

# Rows with a Halo_B_per value.
filter(Phenotypes_ahm, !is.na(Halo_B_per)) # 70 scored NENP 1992 NESB 1992 no issues with NA = 0

# Rows with a Halo_blight_scr value.
filter(Phenotypes_ahm, !is.na(Halo_blight_scr)) # 134 NDHA 2015 (1-7) NESB 1995 1993 1981 (1-5)

# Halo_blight_scr by year, one panel per location.
ggplot(
  data = filter(Phenotypes_ahm, !is.na(Halo_blight_scr)),
  mapping = aes(x = as.factor(Year), y = Halo_blight_scr)
) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  facet_wrap(~Location_code) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))

## Halo_blight_scr_1 NESB 1982 (1-5)
## Halo_blight_scr_2 NESB 1982 ignore
## Halo_blight_scr_3 NESB 1982 ignore
## Blight_scr        COFC 1985 MISA 1991 (1-5)
## Halo_blight_lvs_per WYPO 1987 (0-100)
## Halo_blight_pods_per WYPO 1987 ignore
## Blight_Pod_per NENP 1986
## Halo_B_per NENP 1992 NESB 1992 (0-60)
## Halo_blight_scr NDHA 2015 (1-7) NESB 1995 1993 1981 (1-5)

# Trial run of the halo blight recode (HB_ahm), plotted by year with one panel
# per location. Nothing is stored here.
#
# Two redundant pieces were removed without changing any value or the plot:
# the Blight_scr step was written twice in a row, and a select() that only
# reordered columns sat directly before ggplot().
Phenotypes_ahm %>%
  mutate(
    # Site-years with halo blight data start at 1; everything else starts NA.
    HB_ahm = ifelse((Location_code == "NESB" & Year %in% c(1981, 1982, 1992, 1993, 1995)) | (Location_code == "COFC" & Year == 1985) | (Location_code == "MISA" & Year == 1991) | (Location_code == "WYPO" & Year == 1987) | (Location_code == "NENP" & Year %in% c(1986, 1992)) | (Location_code == "NDHA" & Year == 2015),
                    1,
                    NA),
    # Scores taken over as they are.
    HB_ahm = ifelse(!is.na(Halo_blight_scr_1),
                    Halo_blight_scr_1,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Blight_scr),
                    Blight_scr,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & Location_code == "NESB",
                    Halo_blight_scr,
                    HB_ahm),
    # NDHA scores are binned onto 1, 1.67, 2.33, 3, 3.67, 4.33, 5. The steps
    # run in order without an NA guard, so a value exactly on a shared bin
    # edge ends up with the higher output.
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 0.5, 1.5) & Location_code == "NDHA",
                    1,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 1.5, 2.5) & Location_code == "NDHA",
                    1.67,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 2.5, 3.5) & Location_code == "NDHA",
                    2.33,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 3.5, 4.5) & Location_code == "NDHA",
                    3,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 4.5, 5.5) & Location_code == "NDHA",
                    3.67,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 5.5, 6.5) & Location_code == "NDHA",
                    4.33,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 6.5, 7.5) & Location_code == "NDHA",
                    5,
                    HB_ahm),
    # Percentages are converted with x / 25 + 1. The leaf percentage always
    # overwrites; the pod and Halo_B percentages are used only where HB_ahm
    # is exactly 1 at that point.
    HB_ahm = ifelse(!is.na(Halo_blight_lvs_per),
                    Halo_blight_lvs_per / 25 + 1,
                    HB_ahm),
    HB_ahm = ifelse(HB_ahm == 1 & !is.na(Blight_Pod_per),
                    Blight_Pod_per / 25 + 1,
                    HB_ahm),
    HB_ahm = ifelse(HB_ahm == 1 & !is.na(Halo_B_per),
                    Halo_B_per / 25 + 1,
                    HB_ahm)
  ) %>%
  filter(!is.na(HB_ahm)) %>%
  ggplot(aes(x = as.factor(Year), y = HB_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

   #%>%
  #group_by(Seq_ID) %>%
  #summarise(count = n())

1, 1.67, 2.33, 3, 3.67, 4.33, 5

Dividing the percentages by 25 then adding 1 right now. Should look up a better way to translate percentage infected tissue to scores for Halo blight.

Finally, add a new HB variable to the Phenotypes datasheet:

# Store the halo blight variable HB_ahm in Phenotypes_ahm.
# The Blight_scr step was written twice in a row; the second copy could not
# change anything and is removed. All other steps and their order are kept.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # Site-years with halo blight data start at 1; everything else starts NA.
    HB_ahm = ifelse((Location_code == "NESB" & Year %in% c(1981, 1982, 1992, 1993, 1995)) | (Location_code == "COFC" & Year == 1985) | (Location_code == "MISA" & Year == 1991) | (Location_code == "WYPO" & Year == 1987) | (Location_code == "NENP" & Year %in% c(1986, 1992)) | (Location_code == "NDHA" & Year == 2015),
                    1,
                    NA),
    # Scores taken over as they are.
    HB_ahm = ifelse(!is.na(Halo_blight_scr_1),
                    Halo_blight_scr_1,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Blight_scr),
                    Blight_scr,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & Location_code == "NESB",
                    Halo_blight_scr,
                    HB_ahm),
    # NDHA scores are binned onto 1, 1.67, 2.33, 3, 3.67, 4.33, 5. The steps
    # run in order without an NA guard, so a value exactly on a shared bin
    # edge ends up with the higher output.
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 0.5, 1.5) & Location_code == "NDHA",
                    1,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 1.5, 2.5) & Location_code == "NDHA",
                    1.67,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 2.5, 3.5) & Location_code == "NDHA",
                    2.33,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 3.5, 4.5) & Location_code == "NDHA",
                    3,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 4.5, 5.5) & Location_code == "NDHA",
                    3.67,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 5.5, 6.5) & Location_code == "NDHA",
                    4.33,
                    HB_ahm),
    HB_ahm = ifelse(!is.na(Halo_blight_scr) & between(Halo_blight_scr, 6.5, 7.5) & Location_code == "NDHA",
                    5,
                    HB_ahm),
    # Percentages are converted with x / 25 + 1. The leaf percentage always
    # overwrites; the pod and Halo_B percentages are used only where HB_ahm
    # is exactly 1 at that point.
    HB_ahm = ifelse(!is.na(Halo_blight_lvs_per),
                    Halo_blight_lvs_per / 25 + 1,
                    HB_ahm),
    HB_ahm = ifelse(HB_ahm == 1 & !is.na(Blight_Pod_per),
                    Blight_Pod_per / 25 + 1,
                    HB_ahm),
    HB_ahm = ifelse(HB_ahm == 1 & !is.na(Halo_B_per),
                    Halo_B_per / 25 + 1,
                    HB_ahm)
  )

7s. Clean root rot (Fusarium)

123-126ish varieties were scored.

7s.i. First work out the site*year combinations where root rot was scored. Only 7 of these.

# Rows per sequenced line (Seq_ID) among rows with an entry in any of the ten
# root rot / Fusarium columns.
Phenotypes_ahm %>%
  filter(!(is.na(Root_rot_emerg_scr) & is.na(Root_rot_early_vig_scr) &
             is.na(Root_rot_seed_yield) & is.na(Root_rot_scr) &
             is.na(Root_rot_eval) & is.na(Fusarium_emerg) &
             is.na(Fusarium_seedling_vig) & is.na(Fusarium_seed_yield) &
             is.na(Fusarium_scr) & is.na(Fusarium_wilt_GH))) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 123 that were sequenced.

# Rows with a Root_rot_eval entry, that column first.
# L, M, S, what? According to the dataset: L: light, M: moderate, S: severe. Also this is Fusarium root rot... so add those
dplyr::select(
  filter(Phenotypes_ahm, !is.na(Root_rot_eval)),
  Root_rot_eval, everything()
)

# Boxplots with jittered points of the raw Fusarium_seedling_vig values by
# year, one panel per location. Rows without a value are dropped first.
Phenotypes_ahm %>%
  filter(!is.na(Fusarium_seedling_vig)) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = Fusarium_seedling_vig)) +
  facet_wrap(~Location_code) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

## Root_rot_emerg_scr WARO 2000 IGNORE
## Root_rot_early_vig_scr WARO 2000 (1-5)
## Root_rot_seed_yield WARO 2000 IGNORE
## Root_rot_scr NESB 1991 (1-5) WARO 1994 (1-9)
## Rhizoc_scr WARO 1994 IGNORE
## Root_rot_eval WARO 1982 blank = 1 L = 2 M = 5 S = 8
## Fusarium_emerg WARO 1999 IGNORE
## Fusarium_seedling_vig WARO 1999 (1-9) with one value that should be divided by 10
## Fusarium_scr WARO 1986 (1-9)
## Fusarium_seed_yield WARO 1999 IGNORE
## Fusarium_wilt_GH COF2 2000 (1-9)

# Now test fixes for these variables

# Dry run of the combined root rot score RR_ahm: build it on the fly and plot
# it by year and location; Phenotypes_ahm itself is not modified here.
# Each ifelse() overwrites RR_ahm only where its condition holds and otherwise
# keeps the previous value, so later rules take precedence over earlier ones.
Phenotypes_ahm %>%
  # Baseline: 1 for every row in a site-year where root rot was scored, NA elsewhere.
  mutate(RR_ahm = ifelse((Location_code == "WARO" & Year %in% c(2000, 1994, 1982, 1999, 1986)) | (Location_code == "COF2" & Year == 2000) | (Location_code == "NESB" & Year == 1991),
                         1,
                         NA),
         # Root_rot_early_vig_scr is rescaled with (x - 0.5) * 2, i.e. 1 -> 1 and 5 -> 9.
         RR_ahm = ifelse(!is.na(Root_rot_early_vig_scr),
                          (Root_rot_early_vig_scr - 0.5)*2,
                          RR_ahm),
         # Root_rot_scr: rescaled the same way at NESB, used unchanged at WARO.
         RR_ahm = ifelse(!is.na(Root_rot_scr) & Location_code == "NESB",
                          (Root_rot_scr - 0.5)*2,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Root_rot_scr) & Location_code == "WARO",
                          Root_rot_scr,
                          RR_ahm),
         # Fusarium_seedling_vig: values up to 10 are used as they are; values of
         # 10 or more are divided by 10. A value of exactly 10 matches both rules,
         # and the later one wins, giving 1.
         RR_ahm = ifelse(!is.na(Fusarium_seedling_vig) & Fusarium_seedling_vig <= 10,
                          Fusarium_seedling_vig,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Fusarium_seedling_vig) & Fusarium_seedling_vig >= 10,
                          Fusarium_seedling_vig / 10,
                          RR_ahm),
         # Fusarium_scr and Fusarium_wilt_GH are used unchanged.
         RR_ahm = ifelse(!is.na(Fusarium_scr),
                          Fusarium_scr,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Fusarium_wilt_GH),
                          Fusarium_wilt_GH,
                          RR_ahm),
         # Root_rot_eval letter codes: L = 2, M = 5, S = 8.
         RR_ahm = ifelse(!is.na(Root_rot_eval) & Root_rot_eval == "L",
                          2,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Root_rot_eval) & Root_rot_eval == "M",
                          5,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Root_rot_eval) & Root_rot_eval == "S",
                          8,
                          RR_ahm)
  ) %>%
  # Keep only rows that ended up with a score, then plot them.
  filter(!is.na(RR_ahm)) %>%
  #group_by(Seq_ID) %>%
  #summarise(count = n())
  ggplot(aes(x = as.factor(Year), y = RR_ahm)) + 
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

7s.ii. Now add RR_ahm as a new variable in the Phenotypes dataset

# Add the combined root rot score RR_ahm as a new column of Phenotypes_ahm.
# Each ifelse() overwrites RR_ahm only where its condition holds and otherwise
# keeps the previous value, so later rules take precedence over earlier ones.
Phenotypes_ahm <- Phenotypes_ahm %>%
  # Baseline: 1 for every row in a site-year where root rot was scored, NA elsewhere.
  mutate(RR_ahm = ifelse((Location_code == "WARO" & Year %in% c(2000, 1994, 1982, 1999, 1986)) | (Location_code == "COF2" & Year == 2000) | (Location_code == "NESB" & Year == 1991),
                         1,
                         NA),
         # Root_rot_early_vig_scr is rescaled with (x - 0.5) * 2, i.e. 1 -> 1 and 5 -> 9.
         RR_ahm = ifelse(!is.na(Root_rot_early_vig_scr),
                          (Root_rot_early_vig_scr - 0.5)*2,
                          RR_ahm),
         # Root_rot_scr: rescaled the same way at NESB, used unchanged at WARO.
         RR_ahm = ifelse(!is.na(Root_rot_scr) & Location_code == "NESB",
                          (Root_rot_scr - 0.5)*2,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Root_rot_scr) & Location_code == "WARO",
                          Root_rot_scr,
                          RR_ahm),
         # Fusarium_seedling_vig: values up to 10 are used as they are; values of
         # 10 or more are divided by 10. A value of exactly 10 matches both rules,
         # and the later one wins, giving 1.
         RR_ahm = ifelse(!is.na(Fusarium_seedling_vig) & Fusarium_seedling_vig <= 10,
                          Fusarium_seedling_vig,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Fusarium_seedling_vig) & Fusarium_seedling_vig >= 10,
                          Fusarium_seedling_vig / 10,
                          RR_ahm),
         # Fusarium_scr and Fusarium_wilt_GH are used unchanged.
         RR_ahm = ifelse(!is.na(Fusarium_scr),
                          Fusarium_scr,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Fusarium_wilt_GH),
                          Fusarium_wilt_GH,
                          RR_ahm),
         # Root_rot_eval letter codes: L = 2, M = 5, S = 8.
         RR_ahm = ifelse(!is.na(Root_rot_eval) & Root_rot_eval == "L",
                          2,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Root_rot_eval) & Root_rot_eval == "M",
                          5,
                          RR_ahm),
         RR_ahm = ifelse(!is.na(Root_rot_eval) & Root_rot_eval == "S",
                          8,
                          RR_ahm)
  )

7t. Clean white mold

7t.i. Which variables to coalesce?

202 that were sequenced at first glance. 19 site by year combinations.

# Count, per sequenced entry (Seq_ID), the rows that carry at least one
# white mold observation of any kind.
Phenotypes_ahm %>%
  filter(
    !(is.na(White_mold_scr) &
        is.na(White_mold_per) &
        is.na(White_mold_eval) &
        is.na(White_mold_scr_val2) &
        is.na(White_mold_GH) &
        is.na(White_mold_stems_per) &
        is.na(White_mold_porosity))
  ) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 167 that were sequenced. Try it now then.

# Show the rows that have a White_mold_eval value, with that column moved to
# the front, to check how this evaluation is coded.
Phenotypes_ahm %>%
  dplyr::filter(!is.na(White_mold_eval)) %>%
  dplyr::select(White_mold_eval, dplyr::everything())

# Boxplots with jittered points of the raw White_mold_per values by year,
# one panel per location. Rows without a value are dropped first.
Phenotypes_ahm %>%
  filter(!is.na(White_mold_per)) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = White_mold_per)) +
  facet_wrap(~Location_code) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

## White_mold_scr NDER 1997 1999 (1-9) SKOU 1994 (1-5) WAPA 2001 (1-9)
## White_mold_per NDHA 1994 NEMI 1990 1991 1992 1994 NENP 1990 1999 NES2 1988 NESB 1981 1988 SKOU 1990
## White_mold_scr_val2 All NA's IGNORE
## White_mold_stems_per All NA's IGNORE
## White_mold_GH COF2 2000 (1-9)
## White_mold_porosity WAPA 2001 (1-5) What does this mean? There is a WAPA WM DS for 2001... use this instead so IGNORE
## White_mold_eval MISA 1983 6 have plus signs. Blank: 1; + = 3; ++ = 5 on 1-5 scale 
# also there is WM in NESB in 1985 on a 1-4 scale similar to the below (maybe skip 3)
# WM NESB 1986 has a WM stems percent...

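Convert everything to a 1-5 scale.

* Scores on a 1-9 scale are converted to the 1-5 scale as (x + 1) / 2.
* Percent-of-plot-infected values (White_mold_per) are converted to the 1-5 scale in bins. The bins written down here were: 1 = between(White_mold_per, 0, 5); 2 = between(White_mold_per, 5.01, 10); 3 = between(White_mold_per, 10.01, 15); 4 = between(White_mold_per, 15.01, 25); 5 = between(White_mold_per, 25.01, 100).
* NB: the code below uses different cut points from the ones listed above: 1 = 0-3; 2 = 3-10; 3 = 10-20; 4 = 20-40; 5 = 40-100.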

# Dry run of the combined white mold score WM_ahm (1-5 scale): build it on the
# fly and plot it by year and location; Phenotypes_ahm itself is not modified.
# Each ifelse() overwrites WM_ahm only where its condition holds and otherwise
# keeps the previous value, so later rules take precedence over earlier ones.
Phenotypes_ahm %>%
  # Baseline: 1 for every row in a site-year where white mold was scored, NA elsewhere.
  mutate(WM_ahm = ifelse((Location_code == "NDER" & Year %in% c(1997, 1999)) | (Location_code == "COF2" & Year == 2000) | (Location_code == "SKOU" & Year %in% c(1994, 1990)) | (Location_code == "WAPA" & Year %in% c(2001)) | (Location_code == "NDHA" & Year %in% c(1994)) | (Location_code == "NEMI" & Year %in% c(1990, 1991, 1992, 1994)) | (Location_code == "NENP" & Year %in% c(1990, 1999)) | (Location_code == "NES2" & Year %in% c(1988)) | (Location_code == "NESB" & Year %in% c(1981, 1985, 1986, 1988)) | (Location_code == "MISA" & Year %in% c(1983)),
                         1,
                         NA),
         # White_mold_GH is converted with (x + 1) / 2.
         WM_ahm = ifelse(!is.na(White_mold_GH),
                          (White_mold_GH + 1) / 2,
                          WM_ahm),
         # White_mold_scr: converted with (x + 1) / 2 at NDER and WAPA, used unchanged at SKOU.
         WM_ahm = ifelse(!is.na(White_mold_scr) & Location_code %in% c("NDER", "WAPA"),
                          (White_mold_scr + 1) / 2,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_scr) & Location_code %in% c("SKOU"),
                          White_mold_scr,
                          WM_ahm),
         # White_mold_per is binned into scores 1-5. between() includes both end
         # points, so a value sitting on a shared cut point (3, 10, 20, 40) matches
         # two rules and ends up with the higher score because the later rule wins.
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 0, 3),
                          1,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 3, 10),
                          2,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 10, 20),
                          3,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 20, 40),
                          4,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 40, 100),
                          5,
                          WM_ahm),
         # White_mold_eval plus-sign codes: "+" = 3, "++" = 5.
         WM_ahm = ifelse(!is.na(White_mold_eval) & White_mold_eval == "+",
                          3,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_eval) & White_mold_eval == "++",
                          5,
                          WM_ahm),
         # NESB 1985: scores are hard-coded by CDBN_ID. Every entry except the six
         # listed gets 4; GH760, K42 and K407 get 2; 4533 gets 5. N81017 and
         # White_Kidney are not touched by these three rules.
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1985 & !(CDBN_ID %in% c("N81017", "White_Kidney", "GH760", "4533", "K42", "K407")),
                          4,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1985 & (CDBN_ID %in% c("GH760", "K42", "K407")),
                          2,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1985 & (CDBN_ID %in% c("4533")),
                          5,
                          WM_ahm),
         # NESB 1986: scores are hard-coded by CDBN_ID (5, 4, 3 or 2 for the listed
         # entries); entries not listed are not touched by these four rules.
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("Fleetwood", "83B17", "UI114", "GH196_2", "81_13197", "D81122", "D84344", "UI59", "CB82_11", "83B342", "Viva", "ISB_459", "UNS_117", "4533", "45030", "79146")),
                          5,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("Aurora", "ISB_730", "83B10", "83B13", "83B16", "Othello", "Cinnabar", "83B229", "Harris", "83B364")),
                          4,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("83B235")),
                          3,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("81_12034", "Kamiakin", "Kardinal", "MRK44", "MRK45")),
                          2,
                          WM_ahm)
  ) %>%
  # Keep only rows that ended up with a score, then plot them.
  filter(!is.na(WM_ahm)) %>%
  #group_by(Seq_ID) %>%
  #summarise(count = n())
  ggplot(aes(x = as.factor(Year), y = WM_ahm)) + 
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

7t.ii. Add WM_ahm to the dataset

# Add the combined white mold score WM_ahm (1-5 scale) as a new column of
# Phenotypes_ahm. Each ifelse() overwrites WM_ahm only where its condition
# holds and otherwise keeps the previous value, so later rules take
# precedence over earlier ones.
Phenotypes_ahm <- Phenotypes_ahm %>%
  # Baseline: 1 for every row in a site-year where white mold was scored, NA elsewhere.
  mutate(WM_ahm = ifelse((Location_code == "NDER" & Year %in% c(1997, 1999)) | (Location_code == "COF2" & Year == 2000) | (Location_code == "SKOU" & Year %in% c(1994, 1990)) | (Location_code == "WAPA" & Year %in% c(2001)) | (Location_code == "NDHA" & Year %in% c(1994)) | (Location_code == "NEMI" & Year %in% c(1990, 1991, 1992, 1994)) | (Location_code == "NENP" & Year %in% c(1990, 1999)) | (Location_code == "NES2" & Year %in% c(1988)) | (Location_code == "NESB" & Year %in% c(1981, 1985, 1986, 1988)) | (Location_code == "MISA" & Year %in% c(1983)),
                         1,
                         NA),
         # White_mold_GH is converted with (x + 1) / 2.
         WM_ahm = ifelse(!is.na(White_mold_GH),
                          (White_mold_GH + 1) / 2,
                          WM_ahm),
         # White_mold_scr: converted with (x + 1) / 2 at NDER and WAPA, used unchanged at SKOU.
         WM_ahm = ifelse(!is.na(White_mold_scr) & Location_code %in% c("NDER", "WAPA"),
                          (White_mold_scr + 1) / 2,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_scr) & Location_code %in% c("SKOU"),
                          White_mold_scr,
                          WM_ahm),
         # White_mold_per is binned into scores 1-5. between() includes both end
         # points, so a value sitting on a shared cut point (3, 10, 20, 40) matches
         # two rules and ends up with the higher score because the later rule wins.
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 0, 3),
                          1,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 3, 10),
                          2,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 10, 20),
                          3,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 20, 40),
                          4,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_per) & between(White_mold_per, 40, 100),
                          5,
                          WM_ahm),
         # White_mold_eval plus-sign codes: "+" = 3, "++" = 5.
         WM_ahm = ifelse(!is.na(White_mold_eval) & White_mold_eval == "+",
                          3,
                          WM_ahm),
         WM_ahm = ifelse(!is.na(White_mold_eval) & White_mold_eval == "++",
                          5,
                          WM_ahm),
         # NESB 1985: scores are hard-coded by CDBN_ID. Every entry except the six
         # listed gets 4; GH760, K42 and K407 get 2; 4533 gets 5. N81017 and
         # White_Kidney are not touched by these three rules.
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1985 & !(CDBN_ID %in% c("N81017", "White_Kidney", "GH760", "4533", "K42", "K407")),
                          4,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1985 & (CDBN_ID %in% c("GH760", "K42", "K407")),
                          2,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1985 & (CDBN_ID %in% c("4533")),
                          5,
                          WM_ahm),
         # NESB 1986: scores are hard-coded by CDBN_ID (5, 4, 3 or 2 for the listed
         # entries); entries not listed are not touched by these four rules.
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("Fleetwood", "83B17", "UI114", "GH196_2", "81_13197", "D81122", "D84344", "UI59", "CB82_11", "83B342", "Viva", "ISB_459", "UNS_117", "4533", "45030", "79146")),
                          5,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("Aurora", "ISB_730", "83B10", "83B13", "83B16", "Othello", "Cinnabar", "83B229", "Harris", "83B364")),
                          4,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("83B235")),
                          3,
                          WM_ahm),
         WM_ahm = ifelse(Location_code == "NESB" & Year == 1986 & (CDBN_ID %in% c("81_12034", "Kamiakin", "Kardinal", "MRK44", "MRK45")),
                          2,
                          WM_ahm)
  )

7u. Clean zinc

This is probably not worth it but whatever. 108 sequenced. 4 site*year combinations.

# Count, per sequenced entry (Seq_ID), the rows that carry at least one
# zinc-related observation of any kind.
Phenotypes_ahm %>%
  filter(
    !(is.na(Zinc_dwarfing) &
        is.na(Zinc_yellowing) &
        is.na(Zinc_defic_eval) &
        is.na(Zinc_defic_scr) &
        is.na(Zinc_reaction_eval))
  ) %>%
  group_by(Seq_ID) %>%
  summarise(count = n()) # 91 that were sequenced.

# Show the rows that have a Zinc_reaction_eval value, with that column moved
# to the front, to check how this evaluation is coded.
Phenotypes_ahm %>%
  dplyr::filter(!is.na(Zinc_reaction_eval)) %>%
  dplyr::select(Zinc_reaction_eval, dplyr::everything())

# Boxplots with jittered points of the raw Zinc_dwarfing values by year,
# one panel per location. Rows without a value are dropped first.
Phenotypes_ahm %>%
  filter(!is.na(Zinc_dwarfing)) %>%
  ggplot(mapping = aes(x = as.factor(Year), y = Zinc_dwarfing)) +
  facet_wrap(~Location_code) +
  geom_boxplot() +
  geom_jitter(mapping = aes(color = as.factor(Year)), height = 0, alpha = 0.1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

## Zinc_dwarfing IDKI 1999 (1-9)
## Zinc_yellowing IDKI 1999 (1-9)
# Zinc_defic_scr IDPA 2001 (1-9)
# Zinc_defic_eval IDKI 2003 R I S guessing I is intermediate. 1= R _ 5 = I _ 9 = S
# Zinc_reaction_eval NDFA 1985 R NS S MR M MS resistant to susceptible scale : 1 = R 3 = MR 5 = M 7 = MS 9 = S
# don't autofill this eval because it looks like only pintos were scored for this at NDFA in 1985.

# Dry run of the combined zinc deficiency score ZN_ahm (1-9 scale): build it
# on the fly and plot it by year and location; Phenotypes_ahm itself is not
# modified here. Each ifelse() overwrites ZN_ahm only where its condition
# holds and otherwise keeps the previous value, so later rules take
# precedence over earlier ones.
Phenotypes_ahm %>%
  mutate(
    # Baseline: 1 for every row in a site-year where zinc was scored across
    # the trial, NA elsewhere. NDFA 1985 is intentionally NOT autofilled:
    # only pintos appear to have been scored there, so entries without a
    # mapped Zinc_reaction_eval code must stay NA rather than be set to 1.
    ZN_ahm = ifelse(
      (Location_code == "IDKI" & Year %in% c(2003, 1999)) |
        (Location_code == "IDPA" & Year == 2001),
      1,
      NA
    ),
    # Numeric scores are used unchanged.
    # NOTE(review): Zinc_dwarfing is not folded into ZN_ahm - confirm that
    # this is intended.
    ZN_ahm = ifelse(!is.na(Zinc_yellowing), Zinc_yellowing, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_defic_scr), Zinc_defic_scr, ZN_ahm),
    # Zinc_defic_eval letter codes: R = 1, I = 5, S = 9.
    ZN_ahm = ifelse(!is.na(Zinc_defic_eval) & Zinc_defic_eval == "R", 1, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_defic_eval) & Zinc_defic_eval == "I", 5, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_defic_eval) & Zinc_defic_eval == "S", 9, ZN_ahm),
    # Zinc_reaction_eval codes: R = 1, MR = 3, M = 5, MS = 7, S = 9.
    # Any other code (e.g. "NS") is not mapped and leaves ZN_ahm unchanged.
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "R", 1, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "MR", 3, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "M", 5, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "MS", 7, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "S", 9, ZN_ahm)
  ) %>%
  # Keep only rows that ended up with a score, then plot them.
  filter(!is.na(ZN_ahm)) %>%
  #group_by(Seq_ID) %>%
  #summarise(count = n())
  ggplot(aes(x = as.factor(Year), y = ZN_ahm)) +
  geom_boxplot() +
  geom_jitter(aes(color = as.factor(Year)), alpha = 0.1, height = 0) +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1)) +
  facet_wrap(~Location_code)

# Add the combined zinc deficiency score ZN_ahm (1-9 scale) as a new column of
# Phenotypes_ahm. Each ifelse() overwrites ZN_ahm only where its condition
# holds and otherwise keeps the previous value, so later rules take
# precedence over earlier ones.
Phenotypes_ahm <- Phenotypes_ahm %>%
  mutate(
    # Baseline: 1 for every row in a site-year where zinc was scored across
    # the trial, NA elsewhere. NDFA 1985 is intentionally NOT autofilled:
    # only pintos appear to have been scored there, so entries without a
    # mapped Zinc_reaction_eval code must stay NA rather than be set to 1.
    ZN_ahm = ifelse(
      (Location_code == "IDKI" & Year %in% c(2003, 1999)) |
        (Location_code == "IDPA" & Year == 2001),
      1,
      NA
    ),
    # Numeric scores are used unchanged.
    # NOTE(review): Zinc_dwarfing is not folded into ZN_ahm - confirm that
    # this is intended.
    ZN_ahm = ifelse(!is.na(Zinc_yellowing), Zinc_yellowing, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_defic_scr), Zinc_defic_scr, ZN_ahm),
    # Zinc_defic_eval letter codes: R = 1, I = 5, S = 9.
    ZN_ahm = ifelse(!is.na(Zinc_defic_eval) & Zinc_defic_eval == "R", 1, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_defic_eval) & Zinc_defic_eval == "I", 5, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_defic_eval) & Zinc_defic_eval == "S", 9, ZN_ahm),
    # Zinc_reaction_eval codes: R = 1, MR = 3, M = 5, MS = 7, S = 9.
    # Any other code (e.g. "NS") is not mapped and leaves ZN_ahm unchanged.
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "R", 1, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "MR", 3, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "M", 5, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "MS", 7, ZN_ahm),
    ZN_ahm = ifelse(!is.na(Zinc_reaction_eval) & Zinc_reaction_eval == "S", 9, ZN_ahm)
  )

8. Save the new data to an Excel file.

Assign Germplasm_ahm to lst$Germplasm, Phenotypes_ahm to lst$Phenotypes, Locations_ahm to lst$Locations_by_Years, Locations_unique to lst$Locations_ahm.

Save three new xlsx files (all V2.0):

- One file has all the germplasm and location-year metadata except for the weather, and is called CDBN-metadata_V2.0.xlsx.
- One file has all the phenotype data, which can be related back to the metadata using the Location_code, Year, CDBN_ID and/or Seq_ID keys, and is called CDBN-phenotypes_V2.0.xlsx.
- Finally, there is a daily weather data file for the growing season months, called CDBN-weather_V2.0.xlsx.

NB: I commented out this code for the knitted version because it only needed to run once to generate the new Excel file.

#  createSheet(wb, name = "Kinship")
#  createSheet(wb, name = "Germplasm_ahm")
#  createSheet(wb, name = "Locations_by_Years")
#  createSheet(wb, name = "Locations_ahm")
#  writeWorksheet(wb, Germplasm_ahm, sheet = "Germplasm_ahm")
#  writeWorksheet(wb, Locations_ahm, sheet = "Locations_by_Years")
#  writeWorksheet(wb, Locations_unique, sheet = "Locations_ahm")
#  writeWorksheet(wb, kinship, sheet = "Kinship")
#  saveWorkbook(wb)

# wb1 <- loadWorkbook("CDBN-metadata_V2.0.xlsx", create = TRUE)
# createSheet(wb1, name = "Kinship")
# createSheet(wb1, name = "Germplasm_ahm")
# createSheet(wb1, name = "Locations_by_Years")
# createSheet(wb1, name = "Locations_ahm")
# writeWorksheet(wb1, Germplasm_ahm, sheet = "Germplasm_ahm")
# writeWorksheet(wb1, Locations_ahm, sheet = "Locations_by_Years")
# writeWorksheet(wb1, Locations_unique, sheet = "Locations_ahm")
# writeWorksheet(wb1, kinship, sheet = "Kinship")
# saveWorkbook(wb1)
 
# wb2 <- loadWorkbook("CDBN-phenotypes_V2.0.xlsx", create = TRUE)
# createSheet(wb2, name = "Phenotypes_ahm")
# writeWorksheet(wb2, Phenotypes_ahm, sheet = "Phenotypes_ahm")
# saveWorkbook(wb2)
 
# ?saveWorkbook
 
# wb3 <- loadWorkbook("CDBN-weather_V1.7.1.xlsx", create = TRUE)
# createSheet(wb3, name = "Daily_weather")
# writeWorksheet(wb3, all_wea, sheet = "Daily_weather")
# saveWorkbook(wb3)

All-experiments_V1.9_Cleanup

Alice MacQueen

2018-03-30