This file will take Jeff’s latest CDBN experiment data (V1.3) and do the following:
CDBN_ID’s and Seq_ID’s, as well as some other variety information that I have, and save a new Germplasm sheet: Germplasm_ahmCDBN_ID and Seq_ID columns from the Germplasm_ahm sheet to Jeff’s Phenotypes sheet so that it will be easier to combine the phenotypic data with the genotype data in the future. I save a new Phenotypes sheet: Phenotypes_ahmlocation*year combinations don’t have missing data. I save a new Locations sheet: Locations_ahm which has just the 77 unique locations, plus I save a Locations_by_Year sheet that has all ~690 Location by year combinations, which is the way Jeff had the data.wb <- loadWorkbook("All_experiments_V1.3.2.xlsx")
lst = readWorksheet(wb, sheet = getSheets(wb))
metadata1 <- loadWorkbook("../../CDBN Variety Info/CDBN_Metadata_sequencing_2017-07-25.xlsx")
metadata = readWorksheet(metadata1, sheet = getSheets(metadata1))
lst$GermplasmI added the echo = FALSE parameter to most of the code for this step, so the html file does not show several hundred regular expression substitutions that generated the CDBN_ID’s from the Genotypes. If you want to see these, refer to the .Rmd document.
Here are three punctuation fixes, as examples:
Germplasm$CDBN_ID <- gsub("-","_", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub(" ","_", Germplasm$CDBN_ID)
Germplasm$CDBN_ID <- gsub("Long's_Peak","Longs_Peak", Germplasm$CDBN_ID)
The table below is empty, which means that there are no remaining CDBN_ID’s in my metadata that are unaccounted for.
metadata %>%
anti_join(Germplasm, by = "CDBN_ID") %>%
select(CDBN_ID)
## [1] CDBN_ID
## <0 rows> (or 0-length row.names)
Germplasm_ahm <- Germplasm %>%
left_join(metadata, by = "CDBN_ID") %>%
select(Genotype:GBS_barcode_length.2, MDP_ID, Flower_color:Stem_thickness_below_cotyledons_width.1, Hue:Width_10..cm., Gene_pool:Market.class, Seed.From.1:Seed.From.2, BEANCAP_ID:Synonym.10)
Germplasm_small <- Germplasm_ahm %>%
select(Genotype, CDBN_ID, Seq_ID, Gene_pool, Race, Market.class)
Phenotypes <- lst$Phenotypes
Phenotypes_ahm <- Phenotypes %>%
left_join(Germplasm_small, by = "Genotype") %>%
select(Genotype, CDBN_ID, Seq_ID, Location_code, Year, Gene_pool, Race, Market.class, everything())
This step is different from what I did last month, because Jeff sent me a new file for the Location data: CDBN_weather_stations_V1.0_GL_JW1.xlsx which has new complete latitudes, longitudes, and elevations for 83 sites, finished in early November by Greg Lohrey (Thanks!). So that’s all good, but I will have to:
Which is all very doable.
loc_transl_wb <- loadWorkbook("Location_code_translations_w_JW_check.xlsx")
loc_climatebin = readWorksheet(loc_transl_wb, sheet = getSheets(loc_transl_wb))
small_climatebin <- loc_climatebin %>%
select(Location_code, State, Climate_bin) # All we really care about are State and Climate_bin here, and we need to be able to link it to the other two Location tables by Location_code.
weather_station_wb <- loadWorkbook("CDBN_weather_stations_V1.0_GL_JW1.xlsx")
station_lst = readWorksheet(weather_station_wb, sheet = getSheets(weather_station_wb))
stations <- station_lst$CDBN_locations
Locations <- lst$Locations
weather_climate <- loc_climatebin %>%
left_join(stations, by = "Location_code")
Locations_ahm <- Locations %>%
left_join(weather_climate, by = c("Location_code")) %>%
select(Location_code, Climate_bin, Latitude, Lat_best, Longitude, Long_best, Elev_best, everything()) %>%
mutate_if(is.numeric, funs(na_if(., -99))) %>%
arrange(desc(Latitude)) %>%
filter(Location_code != "")
Unfortunately, this give me 11 year*location combinations without latitude and longitude information, so I need to go back through and clean these up. I suspect they are mostly merging issues due to errors in the original datasets.
But, because the unique locations don’t have any missing values for lat/long, I think that these can be filled in from other datapoints that share the same location code. I need to fix Lat/Long for the 11 year*location combinations.
Loc_to_fix <- Locations_ahm %>%
filter(is.na(Lat_best))
sort(unique(Loc_to_fix$Location_code)) # "CAD2" "COF2" "IDK2" "MOPO" "TXCS" "TXMU" "TXVE"
## [1] "IDK2" "MISA" "MNRA"
# MISA and MNRA
CAD2Fixing CAD2 to have the same lat/long as CADV.
#Locations_ahm %>%
# filter(Location_code %in% c("CAD2", "CADV"))
Locations_ahm <- Locations_ahm %>%
# filter(Location_code %in% c("CADV","CAD2")) %>%
mutate(Latitude = ifelse(Location_code == "CAD2",
38.53782,
Latitude),
Longitude = ifelse(Location_code == "CAD2",
-121.757,
Longitude),
Institution = ifelse(Location_code %in% c("CADV","CAD2"),
"UC Davis",
Institution),
Location_code = ifelse(Location_code == "CAD2",
"CADV",
Location_code)
)
COFCSome COFC are messed up because the locations didn’t merge correctly.
#Locations_ahm %>%
# filter(Location_code %in% c("COF2", "COFC"))
Locations_ahm <- Locations_ahm %>%
# filter(Location_code %in% c("COFC","COF2")) %>%
mutate(Latitude = ifelse(Location_code == "COF2",
40.65167,
Latitude),
Longitude = ifelse(Location_code == "COF2",
-104.9992,
Longitude),
Location.x = ifelse(Location_code %in% c("COFC","COF2"),
"Ft. Collins",
Location.x),
)
IDK2 should have the same lat/long as IDKI.
#Locations_ahm %>%
# filter(Location_code %in% c("IDKI", "IDK2"))
# Just give IDK2 IDKI's data.
Locations_ahm <- Locations_ahm %>%
mutate(Latitude = ifelse(Location_code == "IDK2",
42.55103,
Latitude),
Lat_best = ifelse(Location_code == "IDK2",
42.55103,
Lat_best),
Longitude = ifelse(Location_code == "IDK2",
-114.34,
Longitude),
Long_best =ifelse(Location_code == "IDK2",
-114.34,
Long_best),
Climate_bin = ifelse(Location_code == "IDK2",
"RockiesWest",
Climate_bin),
Elev_best = ifelse(Location_code == "IDK2",
1200,
Elev_best),
)
Fix issues where only one of the two lat/long columns - site or weather station - have info. To fix this, I am using the lat/long information from the other column for now, that should be good enough.
Locations_ahm <- Locations_ahm %>%
mutate(Latitude = ifelse(Location_code %in% c("MOPO", "TXCS", "TXMU", "TXVE"),
Lat_best,
Latitude),
Longitude = ifelse(Location_code %in% c("MOPO", "TXCS", "TXMU", "TXVE"),
Long_best,
Longitude)
)
Locations_ahm <- Locations_ahm %>%
mutate(Lat_best = ifelse(Location_code %in% c("MISA", "MNRA"),
Latitude,
Lat_best),
Long_best = ifelse(Location_code %in% c("MISA", "MNRA"),
Longitude,
Long_best)
)
Locations_ahm has all of the site*year combinations, which is a little too much information, really, given that the latitudes, longitudes, and elevations don’t change by the year.
Locations_unique <- Locations_ahm %>%
group_by(Location_code) %>%
mutate(Num_Year = n()) %>%
filter(row_number(State.x) == 1) %>%
arrange(Location_code) %>%
mutate(First_Year = Year,
State = State.x,
Location = Location.x) %>%
dplyr::select(Location_code, Climate_bin, State, Latitude, Longitude, Lat_best, Long_best, Elev_best, Location, First_Year, Num_Year, Institution, Research_sta, Soil_series, Soil_class, NOAA, Notes)
Assign Germplasm_ahm to lst$Germplasm, Phenotypes_ahm to lst$Phenotypes, Locations_ahm to lst$Locations_by_Years, Locations_unique to lst$Locations_ahm, and save a new All_Experiments_V1.3.2.xlsx file.
NB: I commented out this code because it only needed to run once to generate the new Excel file.
# createSheet(wb, name = "Germplasm_ahm")
# createSheet(wb, name = "Phenotypes_ahm")
# createSheet(wb, name = "Locations_by_Years")
# createSheet(wb, name = "Locations_ahm")
# writeWorksheet(wb, Germplasm_ahm, sheet = "Germplasm_ahm")
# writeWorksheet(wb, Phenotypes_ahm, sheet = "Phenotypes_ahm")
# writeWorksheet(wb, Locations_ahm, sheet = "Locations_by_Years")
# writeWorksheet(wb, Locations_unique, sheet = "Locations_ahm")
# saveWorkbook(wb)