This file will take Jeff’s latest CDBN experiment data (V1.3) and do the following:

wb <- loadWorkbook("All_experiments_V1.3.2.xlsx")
lst = readWorksheet(wb, sheet = getSheets(wb))
metadata1 <- loadWorkbook("../../CDBN Variety Info/CDBN_Metadata_sequencing_2017-07-25.xlsx")
metadata = readWorksheet(metadata1, sheet = getSheets(metadata1))

1. Add CDBN_ID’s to lst$Germplasm

I added the echo = FALSE parameter to most of the code for this step, so the html file does not show several hundred regular expression substitutions that generated the CDBN_ID’s from the Genotypes. If you want to see these, refer to the .Rmd document.

Here are three punctuation fixes, as examples:

Germplasm$CDBN_ID <- gsub("-","_", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub(" ","_", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("Long's_Peak","Longs_Peak", Germplasm$CDBN_ID)

1a. Test that the second round of punctuation fixes actually ties the CDBN_ID keys to the Germplasm table.

The table below is empty, which means that there are no remaining CDBN_ID’s in my metadata that are unaccounted for.

metadata %>% 
  anti_join(Germplasm, by = "CDBN_ID") %>%
  select(CDBN_ID)
## [1] CDBN_ID
## <0 rows> (or 0-length row.names)

2. Join Germplasm and Metadata

Germplasm_ahm <- Germplasm %>% 
  left_join(metadata, by = "CDBN_ID") %>%
  select(Genotype:GBS_barcode_length.2, MDP_ID, Flower_color:Stem_thickness_below_cotyledons_width.1, Hue:Width_10..cm., Gene_pool:Market.class, Seed.From.1:Seed.From.2, BEANCAP_ID:Synonym.10)

3. Make a Germplasm table to add to the Phenotypes worksheet

Germplasm_small <- Germplasm_ahm %>%
  select(Genotype, CDBN_ID, Seq_ID, Gene_pool, Race, Market.class)

3a. Join small Genotypes table to Phenotypes worksheet

Phenotypes <- lst$Phenotypes

Phenotypes_ahm <- Phenotypes %>%
  left_join(Germplasm_small, by = "Genotype") %>%
  select(Genotype, CDBN_ID, Seq_ID, Location_code, Year, Gene_pool, Race, Market.class, everything())

4. Add Climate_bin to Locations worksheet

This step is different from what I did last month, because Jeff sent me a new file for the Location data: CDBN_weather_stations_V1.0_GL_JW1.xlsx which has new complete latitudes, longitudes, and elevations for 83 sites, finished in early November by Greg Lohrey (Thanks!). So that’s all good, but I will have to:

Which is all very doable.

loc_transl_wb <- loadWorkbook("Location_code_translations_w_JW_check.xlsx")
loc_climatebin = readWorksheet(loc_transl_wb, sheet = getSheets(loc_transl_wb))
small_climatebin <- loc_climatebin %>%
  select(Location_code, State, Climate_bin) # All we really care about are State and Climate_bin here, and we need to be able to link it to the other two Location tables by Location_code.

weather_station_wb <- loadWorkbook("CDBN_weather_stations_V1.0_GL_JW1.xlsx")
station_lst = readWorksheet(weather_station_wb, sheet = getSheets(weather_station_wb))
stations <- station_lst$CDBN_locations

Locations <- lst$Locations

weather_climate <- loc_climatebin %>%
  left_join(stations, by = "Location_code")
Locations_ahm <- Locations %>%
  left_join(weather_climate, by = c("Location_code")) %>%
  select(Location_code, Climate_bin, Latitude, Lat_best, Longitude, Long_best, Elev_best, everything()) %>%
  mutate_if(is.numeric, funs(na_if(., -99))) %>%
  arrange(desc(Latitude)) %>%
  filter(Location_code != "")

Unfortunately, this give me 11 year*location combinations without latitude and longitude information, so I need to go back through and clean these up. I suspect they are mostly merging issues due to errors in the original datasets.

4a. Some locations don’t have associated Lat and Long

But, because the unique locations don’t have any missing values for lat/long, I think that these can be filled in from other datapoints that share the same location code. I need to fix Lat/Long for the 11 year*location combinations.

Loc_to_fix <- Locations_ahm %>%
  filter(is.na(Lat_best))

sort(unique(Loc_to_fix$Location_code)) # "CAD2" "COF2" "IDK2" "MOPO" "TXCS" "TXMU" "TXVE"
## [1] "IDK2" "MISA" "MNRA"
# MISA and MNRA

4b. Fix CAD2

Fixing CAD2 to have the same lat/long as CADV.

#Locations_ahm %>%
#  filter(Location_code %in% c("CAD2", "CADV"))

Locations_ahm <- Locations_ahm %>%
#  filter(Location_code %in% c("CADV","CAD2")) %>%
    mutate(Latitude = ifelse(Location_code == "CAD2",
                           38.53782,
                           Latitude),
         Longitude = ifelse(Location_code == "CAD2",
                            -121.757,
                            Longitude),
         Institution = ifelse(Location_code %in% c("CADV","CAD2"),
                              "UC Davis",
                              Institution),
         Location_code = ifelse(Location_code == "CAD2",
                                "CADV",
                                Location_code)
         )

4c. Fix COFC

Some COFC are messed up because the locations didn’t merge correctly.

#Locations_ahm %>%
#  filter(Location_code %in% c("COF2", "COFC"))

Locations_ahm <- Locations_ahm %>%
#  filter(Location_code %in% c("COFC","COF2")) %>%
    mutate(Latitude = ifelse(Location_code == "COF2",
                           40.65167,
                           Latitude),
         Longitude = ifelse(Location_code == "COF2",
                            -104.9992,
                            Longitude),
         Location.x = ifelse(Location_code %in% c("COFC","COF2"),
                              "Ft. Collins",
                              Location.x),
         )

4d. Fix IDK2

IDK2 should have the same lat/long as IDKI.

#Locations_ahm %>%
#  filter(Location_code %in% c("IDKI", "IDK2"))

# Just give IDK2 IDKI's data.

Locations_ahm <- Locations_ahm %>%
  mutate(Latitude = ifelse(Location_code == "IDK2",
                           42.55103,
                           Latitude),
         Lat_best = ifelse(Location_code == "IDK2",
                           42.55103,
                           Lat_best),
         Longitude = ifelse(Location_code == "IDK2",
                            -114.34,
                            Longitude),
         Long_best =ifelse(Location_code == "IDK2",
                            -114.34,
                            Long_best),
         Climate_bin = ifelse(Location_code == "IDK2",
                            "RockiesWest",
                            Climate_bin),
         Elev_best = ifelse(Location_code == "IDK2",
                            1200,
                            Elev_best),
               )

4e. Supply missing Lat/Long or Lat_best/Long_best

Fix issues where only one of the two lat/long columns - site or weather station - have info. To fix this, I am using the lat/long information from the other column for now, that should be good enough.

Locations_ahm <- Locations_ahm %>%
  mutate(Latitude = ifelse(Location_code %in% c("MOPO", "TXCS", "TXMU", "TXVE"),
                           Lat_best,
                           Latitude),
         Longitude = ifelse(Location_code %in% c("MOPO", "TXCS", "TXMU", "TXVE"),
                           Long_best,
                           Longitude)
  )
      

Locations_ahm <- Locations_ahm %>%
  mutate(Lat_best = ifelse(Location_code %in% c("MISA", "MNRA"),
                           Latitude,
                           Lat_best),
         Long_best = ifelse(Location_code %in% c("MISA", "MNRA"),
                           Longitude,
                           Long_best)
  )   

5. Make a unique Locations sheet.

Locations_ahm has all of the site*year combinations, which is a little too much information, really, given that the latitudes, longitudes, and elevations don’t change by the year.

Locations_unique <- Locations_ahm %>%
  group_by(Location_code) %>% 
  mutate(Num_Year = n()) %>%
  filter(row_number(State.x) == 1) %>%
  arrange(Location_code) %>%
  mutate(First_Year = Year,
         State = State.x,
         Location = Location.x) %>%
  dplyr::select(Location_code, Climate_bin, State, Latitude, Longitude, Lat_best, Long_best, Elev_best, Location, First_Year, Num_Year, Institution, Research_sta, Soil_series, Soil_class, NOAA, Notes)

6. Save the new data to an Excel file.

Assign Germplasm_ahm to lst$Germplasm, Phenotypes_ahm to lst$Phenotypes, Locations_ahm to lst$Locations_by_Years, Locations_unique to lst$Locations_ahm, and save a new All_Experiments_V1.3.2.xlsx file.

NB: I commented out this code because it only needed to run once to generate the new Excel file.

# createSheet(wb, name = "Germplasm_ahm")
# createSheet(wb, name = "Phenotypes_ahm")
# createSheet(wb, name = "Locations_by_Years")
# createSheet(wb, name = "Locations_ahm")
# writeWorksheet(wb, Germplasm_ahm, sheet = "Germplasm_ahm")
# writeWorksheet(wb, Phenotypes_ahm, sheet = "Phenotypes_ahm")
# writeWorksheet(wb, Locations_ahm, sheet = "Locations_by_Years")
# writeWorksheet(wb, Locations_unique, sheet = "Locations_ahm")
# saveWorkbook(wb)