Load all data from CSV files
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
sites <- read.csv( "/alphadata01/bstocker/data/gcme/GCME_database_CSV_032017/Sites_tbl.csv", as.is=TRUE )
experiments <- read.csv( "/alphadata01/bstocker/data/gcme/GCME_database_CSV_032017/Experiments_tbl.csv", as.is=TRUE )
data <- read.csv( "/alphadata01/bstocker/data/gcme/GCME_database_CSV_032017/Data_tbl.csv", as.is=TRUE )
Some data is incomplete.
print("WARNING: no experiments info in sites table for:")
## [1] "WARNING: no experiments info in sites table for:"
print(sites$Site.Name[which(!is.element(sites$Site.Name, experiments$Site.Name))])
## [1] "SoyFACE"
Load sites table and clean lon, lat info (God invented negative numbers!)
sites$lat <- with( sites, ifelse( as.character(Lat)=="S", -Latitude, Latitude ) )
sites$lon <- with( sites, ifelse( as.character(Lon)=="W", -Longitude, Longitude ) )
sites <- sites %>% select( -Lat, -Lon, -Latitude, -Longitude )
Sites table should only have the following information:
This may be resolved “by machine”: - Vegetation type may be independent of site. Therefore, respective information should go into the experiments table and be dropped from the sites table.
experiments <- experiments %>% left_join( select( sites, Site.Name, C3.herb,C4.herb,Shrub.broadleaved.deciduous,Shrub.broadleaved.evergreen,Shrub.needleleaved, Tree.broadleaved.deciduous,Tree.broadleaved.evergreen,Tree.needleleaved.deciduous, Tree.needleleaved.evergreen, Climber.nonwoody,Climber.woody,Moss,AM,EcM,ErM ), by="Site.Name" )
sites <- sites %>% select( -C3.herb, -C4.herb, -Shrub.broadleaved.deciduous, -Shrub.broadleaved.evergreen, -Shrub.needleleaved, -Tree.broadleaved.deciduous, -Tree.broadleaved.evergreen, -Tree.needleleaved.deciduous, -Tree.needleleaved.evergreen, -Climber.nonwoody, -Climber.woody, -Moss, -AM, -EcM, -ErM )
To be resolved “by hand”:
This table defines which factors are manipulated, provides information about how many levels of each factors are available (at least 2 but sometimes more) and quantitative information about the magnitude of the manipulation (e.g. amount of fertiliser added).
Should only have the following information:
This may be resolved “by machine”:
experiments$exp_nam <- with( experiments, sapply( strsplit( Experiment.Name, split = "_"), function(v) {paste(rev(rev(v)[-1]), collapse ="_")} ) )
experiments_red <- data.frame()
for (thisexp in unique(experiments$exp_nam)){
sub <- filter( experiments, exp_nam==thisexp )
## complete information is givem by row that has longest string in column "Treatment".
## WARNING: THIS MAY NOT BE THE CASE. CHECK BY HAND!!!
useidx <- which.max( nchar( sub$Treatment ) )
## combining single rows to new clean data frame
experiments_red <- rbind( experiments_red, sub[useidx,] )
}
## replace experiments table with reduced one
experiments <- experiments_red
Create new column ‘factors’ that contains information about all available factors for each experiment. This is more or less equivalent to ‘Treatments_all’ in original database but can be reduced to single letters for each factor. It’s important that the letters’ order doesn’t matter! This can (partly) be done by machine.
experiments$factors <- experiments$Treatments_all
experiments$factors <- with( experiments, ifelse( Treatments_all=="Warming", "w", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="Fertilization", "f", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xFertilization", "cf", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="WarmingxFertilization", "fw", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2", "c", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="WarmingxWater", "iw", factors ) ) # i for irrigation
experiments$factors <- with( experiments, ifelse( Treatments_all=="FertilizationxWater", "fi", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xFertilizationxSoil", "cfs", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xWarmingxDrought", "cwd", factors ) ) # a for aqua=drought
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xFertilizationxShading", "cfr", factors ) ) # r for radiation
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xWaterxMulch", "cim", factors ) ) # m for mulch
experiments$factors <- with( experiments, ifelse( Treatments_all=="Drought", "d", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xWarmingxFertilization", "cwf", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="WaterxFertilization", "fi", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xOzone", "co", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xWarming", "cw", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="FertilizationxSnow", "fv", factors ) ) # v for snow
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xWater", "ci", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xFertilizationxWarmingxWater", "cfiw", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xSoil", "cs", factors ) )
experiments$factors <- with( experiments, ifelse( Treatments_all=="CO2xWarmingxWater", "cwi", factors ) )
To be resolved by hand:
experiments <- experiments %>% dplyr::rename(
lev0_water = WQuantity.control, # check what it means when not NA
lev1_water = WQuantity.treament, # TODO: is very messy now, try to come up with something that is machine readable using columns e.g. lev1_water_add, lev1_water_mult, lev1_water_abs, ...
lev0_warmair = TQuantity.air.control, # ok, only NA
lev1_warmair = TQuantity.air.treatment, # TODO: separate into soil and air and allow only numeric
lev0_warmsoil = TQuantity.soil.control, # ok, only NA
lev1_warmsoil = TQuantity.soil.treatment, # ok, only NA
lev0_co2 = CQuantity.Control, # put "amb" if "ambient" is used in control
## TODO: see below
lev0_nfert = N.Quantity.control, # TODO change to identical units, and drop the column for units, keeping it as metainformation
lev1_nfert = N.Quantity.treatment, # TODO change to identical units, and drop the column for units, keeping it as metainformation
lev0_pfert = P.Quantity.control, # TODO change to identical units, and drop the column for units, keeping it as metainformation
lev1_pfert = P.Quantity.treatment, # TODO change to identical units, and drop the column for units, keeping it as metainformation
lev0_kfert = K.Quantity.control, # TODO change to identical units, and drop the column for units, keeping it as metainformation
lev1_kfert = K.Quantity.treatment # TODO change to identical units, and drop the column for units, keeping it as metainformation
)
experiments$lev1_co2_add <- rep( NA, nrow(experiments) )
experiments$lev1_co2_abs <- rep( NA, nrow(experiments) )
experiments$lev1_co2_mlt <- rep( NA, nrow(experiments) )
## CO2: re-interpret treatment info in the form of e.g. "+300"
idxs_add <- which( is.element( "+", strsplit( experiments$CQuantity.Treatment, split = "" )[[1]] ) )
experiments$lev1_co2_add[idxs_add] <- gsub( "\\+", "", experiments$CQuantity.Treatment[idxs_add] )
By hand:
Whether this should be in long or wide format is probably mostly a matter of what the database is used for. Long is simpler when adding data to the database as it avoids duplicating the entry of the control data. An algorithm can be build to interconvert from long to wide and back. (maybe not just one line but if all else is clean this should be straight forward). The most important changes to be made to the data table is for how to deal with factors information. The undersore-plus-letters at the end of the experiment name doesn’t work well. I very warmly suggest to add columns for each factor. The table would then have the following columns (in long format):
The factors columns co2, nfer, pfert, warm, water (and more if necessary) can have 0 (=ambient), 1 (=elevated), 2 (=elevated at higher level than 1), etc. This conversion could be done partly by machine along similar lines as shown here for single factorial only (warning: this is computationally very expensive):
## Add new standard experiment name
data$exp_nam <- with( data, sapply( strsplit( Experiment.Name, split = "_"), function(v) {paste(rev(rev(v)[-1]), collapse ="_")} ) )
## Add columns for factors
tmp <- cbind( data,
co2=rep(NA,nrow(data)),
nfert=rep(NA,nrow(data)),
pfert=rep(NA,nrow(data)),
kfert=rep(NA,nrow(data)),
warm=rep(NA,nrow(data)),
water=rep(NA,nrow(data)),
ozone=rep(NA,nrow(data))
# possibly more: ntype (for type of fertiliser), species/PFT/Nfixer as factor ...
)
## Convert to long format
tmp <- tmp %>% left_join( select( experiments, exp_nam, factors, Treatment ), by="exp_nam" ) %>%
select( exp_nam, factors, treatment=Treatment, co2, nfert, pfert, kfert, warm, water, ozone, Data.type, Unit, Sampling.date, ambient, ambient.Sd, ambient.Se, elevated, elevated.Se, elevated.Sd )
long <- data.frame()
for ( idx in 1:nrow(tmp) ){
if ( nchar(tmp$factors[idx])==1 ){
## single factor
lo <- select( tmp[idx,], exp_nam, factors, treatment, co2, nfert, pfert, kfert, warm, water, ozone, Data.type, Unit, Sampling.date, value=ambient, value.sd=ambient.Sd, value.se=ambient.Se )
hi <- select( tmp[idx,], exp_nam, factors, treatment, co2, nfert, pfert, kfert, warm, water, ozone, Data.type, Unit, Sampling.date, value=elevated, value.sd=elevated.Sd, value.se=elevated.Se )
if ( is.element( "c", strsplit( tmp$factors[idx], split = "" )[[1]] ) ){
## factorial with CO2
lo$co2 <- 0
hi$co2 <- 1
} else if ( is.element( "f", strsplit( tmp$factors[idx], split = "" )[[1]] ) ){
## factorial with fertilisation
lo$nfert <- 1
hi$nfert <- 0
}
long <- rbind( long, lo, hi )
}
}
In wide format, the factors take a new meaning: E.g., when ‘elevated’ refers to CO2 x N-fert then entry co2=1 and nfert=1, and the values for ambient represents the control while, treatment represents the measurement where all factors that are 1 are at level 1 (could also be 2 if multiple levels are available).
I suggest that we keep the long format for the database and its distribution and provide a script that converts it to wide.