Thesis

Read PJ Plot Metrics Data

metrics = read.csv("plot_level_structure_vs_lidar_metrics_20220923_nodups_by_date.csv")

Outlier Removal

outliers = read.csv("outliers.csv")
outliers = as.data.frame(outliers)
outliers = subset(outliers, select = c(plot_id, Rating))
colnames(outliers)[colnames(outliers) == "plot_id"] = "plot.id" 
outliers = merge(metrics, outliers, by="plot.id") 
outliers = outliers %>% filter(!(Rating == '3'))
metrics = subset(outliers, select = -c(Rating))

VSURF for Metrics Data

#set.seed(4077)

#x.vars = as.data.frame(metrics[,13:109]) #define independent (predictor) variables in metrics dataframe
#y.var = metrics[,10] #define dependent variable (AGB) in metrics dataframe

#metrics.vsurf = VSURF(x.vars, y.var, RFimplem = "ranger", parallel = TRUE, ncores = 20, importance = TRUE) #run VSURF

Variables that will be used in RF model as determined by the VSURF

#xvarsnew = x.vars[metrics.vsurf$varselect.pred] #get names of VSURF chosen variables
#colnames(xvarsnew)
new_metrics = metrics[c("plot.id","bio","cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300","p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")] #put into dataframe the metrics to be used in RF as chosen by the VSURF

Landfire

Read and Clean Landfire (PJ Types) Data

pj_types = read.csv("PJ_types_updated.csv") #read pj types csv to get groupings of woodlands
pj_types$EVT_NAME[pj_types$EVT_NAME=="Great Basin Pinyon-Juniper Woodland "]="Great Basin Pinyon-Juniper Woodland" #fix the pj woodland great basin syntax error
#unique(pj_types$EVT_NAME) #check that there are 5 unique woodland names
colnames(pj_types)[colnames(pj_types) == "plot_id"] = "plot.id" #fix plot id syntax error
pj_types$EVT_NAME[pj_types$EVT_NAME=="Colorado Plateau Pinyon-Juniper Shrubland"]="Colorado Plateau Pinyon-Juniper Woodland" #combine Colorado Plateau shrubland and woodland because it is the same woodland, only different cover type

Combine PJ_types Landfire data and new_metrics (variables chosen by vsurf) data so you have a PJ type for each of the plots and values for the VSURF chosen metrics

mets.types = merge(new_metrics, pj_types, by="plot.id") #merge predictor variables and pj groups on plot.id
colnames(mets.types)[colnames(mets.types) == "EVT_NAME"] = "group" #rename EVT_NAME to "group". This is the PJ Woodland Type as defined by Landfire data

Create function so I can specify training group, and testing group

fun1 = function(x, y) {
  df.train = x  
  df.test = y
  mod = ranger(bio ~ cc + cd + mh.ag + vrd.0.50 + vnrd.50.100 + p.45 + p.25 
                + p.55 + vrd.250.300 + p.20 + cr.qh.20 + p.85 + plot.y + plot.x 
                + skew.ag + vnrd.0.50 + twi, data = df.train)
  pred = predict(mod, df.test)
  r2.test = summary(lm(pred$predictions ~ df.test$bio))$adj.r.squared
  return(r2.test)
}

Landfire modeling

set.seed(4379)
df = mets.types[c("bio","group","cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300", "p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")] #Landfire Data frame

#subset data to their stratas
cpsubset = subset(df, group == "Colorado Plateau Pinyon-Juniper Woodland")
masubset = subset(df, group == "Madrean Pinyon-Juniper Woodland")
srsubset = subset(df, group == "Southern Rocky Mountain Pinyon-Juniper Woodland")
gbsubset = subset(df, group == "Great Basin Pinyon-Juniper Woodland")

typeslist = list(cpsubset,masubset,srsubset,gbsubset)
cp = sapply(typeslist, y = cpsubset, fun1)
ma = sapply(typeslist, y = masubset, fun1)
sr = sapply(typeslist, y = srsubset, fun1)
gb = sapply(typeslist, y = gbsubset, fun1)

types_r2test = cbind(cp,ma,sr,gb) #make sure its cbind!
types_r2test = as.data.frame(types_r2test)
colnames = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')
types_r2test$name = colnames 
names(types_r2test) = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin', 'name')
col_order = c('name','Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')
types_r2test = types_r2test[, col_order]
types_r2test = as.data.frame(types_r2test) #Final Dataframe R^2 Test
kable(types_r2test)

name	Colorado Plateau	Madrean	Southern Rocky Mountain	Great Basin
Colorado Plateau	0.9368563	0.6031054	0.2292131	0.4635338
Madrean	0.4150234	0.9390612	0.1724020	0.5972720
Southern Rocky Mountain	0.2666712	0.3064240	0.9154455	0.3519319
Great Basin	0.4128860	0.4466395	0.1790556	0.9374206

Quantile-based bias correction for Landfire models ###### Create function so I can specify training, testing

fun2 = function(x, y) {
  dfTraining = x
  dfTest =  y
  x.train = dfTraining #all independent variables + biomass
  y.train = dfTraining[,1] #biomass
  x.test = dfTest[,-1] #remove biomass
  y.test = dfTest[,1] #keep only biomass
  mod = ranger(bio ~ ., data = x.train) #define model
  pred = predict(mod, data = x.train) #predict using model on train data
  qs = seq(0,1,0.001) #get quantiles
  q.obs = quantile(y.train, probs = qs)
  q.pred = quantile(pred$predictions, probs = qs)
  q.diff = q.obs - q.pred # get quantile differences
  pred.test = predict(mod, data = x.test) 
  pred.test.corr = pred.test$predictions + approx(q.pred, q.diff, pred.test$predictions)$y # apply
  r2.testnew = summary(lm(pred.test.corr ~ y.test))$adj.r.squared
  return(r2.testnew)
}

Apply quantile bias correction function to Landfire data

set.seed(302)
cp = sapply(typeslist, y = cpsubset, fun2)
ma = sapply(typeslist, y = masubset, fun2)
sr = sapply(typeslist, y = srsubset, fun2)
gb = sapply(typeslist, y = gbsubset, fun2)

types_r2test_new = cbind(cp,ma,sr,gb) #make sure its cbind!
types_r2test_new = as.data.frame(types_r2test_new)
colnames = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')
types_r2test_new$name = colnames 
names(types_r2test_new) = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin', 'name')
col_order = c('name','Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')
types_r2test_new = types_r2test_new[, col_order]
types_r2test_new = as.data.frame(types_r2test_new) #Final Dataframe R^2 Test
kable(types_r2test_new)

name	Colorado Plateau	Madrean	Southern Rocky Mountain	Great Basin
Colorado Plateau	0.9486089	0.6170014	0.2238475	0.4527957
Madrean	0.3925832	0.9315172	0.1736019	0.5619276
Southern Rocky Mountain	0.2577034	0.3232358	0.9269643	0.3099159
Great Basin	0.4035137	0.4063400	0.1852104	0.9569060

Landfire Heatmap

par(mar=c(5.1, 4.1, 4.1, 4.1))   # adapt margins

rownames(types_r2test_new) = types_r2test_new$name
g = types_r2test_new[, -1] #remove name variable
data = g[sapply(g,is.numeric)]
data = as.matrix(g)
data1 = melt(data)
 
#Var1 is training strata V2 is test strata

landfire_heatmap = ggplot(data1, aes(x = Var2,y = Var1, fill = value))+
  geom_tile()+
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  labs(title="Landfire", x ='Test', y = 'Train') +
  geom_text(aes(label = round(value, 2)), color = "white", size = 4) 

landfire_heatmap

Group number of plots for each dataset strata

table(mets.types$group)

## 
##        Colorado Plateau Pinyon-Juniper Woodland 
##                                             345 
##             Great Basin Pinyon-Juniper Woodland 
##                                              20 
##                 Madrean Pinyon-Juniper Woodland 
##                                             106 
## Southern Rocky Mountain Pinyon-Juniper Woodland 
##                                             106

Koppen-Geiger Climate

Read and clean data

climate = read.csv("plot_NAclimate.csv")
climate = climate %>% rename(plot.id = plot_id) #rename column plot_id to plot.id
climate = climate[(climate$plot.id %in% metrics$plot.id),] #remove data plots not in metrics
climate = data.frame(climate)

#Rename zone numbers to climate zone identifiers as outlined in the Koppen-Geiger Climate Classification Map and visualize zone frequencies 
climate$zone_num[climate$zone_num=="7"]="Bsk" #Bsk: Cold, semi-arid climate
climate$zone_num[climate$zone_num=="26"]="Dfb" #Dfb: Humid Continental Mild Summer, Wet all Year
climate$zone_num[climate$zone_num=="18"]="Dsb" #Dsb: Humid Continental Climate - Dry Cool Summer
climate$zone_num[climate$zone_num=="5"]="Bwk" #Bwk: Cold Desert Climate
climate$zone_num[climate$zone_num=="8"]="Csa" #Csa: Hot-summer Mediterranean Climate
climate$zone_num[climate$zone_num=="9"]="Csb" #Csb: Warm-Summer Mediterranean Climate

#Merge new_metrics (VSURF chosen variables) and the climate zone for each plot based on the plot id
mets.types.climate = merge(new_metrics, climate, by="plot.id") #merge predictor variables and climate groups on plot.id

colnames(mets.types.climate)[colnames(mets.types.climate) == "zone_num"] = "group" #rename zone_num to "group"

Climate modeling

df1 = mets.types.climate[c("bio","group","cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300", "p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")] #define new dataframe with climate data and vsurf metrics

#subset data to their stratas
bsksubset = subset(df1, group == "Bsk")
dfbsubset = subset(df1, group == "Dfb")
dsbsubset = subset(df1, group == "Dsb")
bwksubset = subset(df1, group == "Bwk")
csasubset = subset(df1, group == "Csa")
csbsubset = subset(df1, group == "Csb")

climatelist = list(bsksubset,dfbsubset,dsbsubset,bwksubset,csasubset,csbsubset)
bsk = sapply(climatelist, y = bsksubset, fun1)
dfb = sapply(climatelist, y = dfbsubset, fun1)
dsb = sapply(climatelist, y = dsbsubset, fun1)
bwk = sapply(climatelist, y = bwksubset, fun1)
csa = sapply(climatelist, y = csasubset, fun1)
csb = sapply(climatelist, y = csbsubset, fun1)

climate_r2test = cbind(bsk,dfb,dsb,bwk,csa,csb) #make sure its cbind!
climate_r2test = as.data.frame(climate_r2test)
colnames = c('bsk', 'dfb', 'dsb','bwk', 'csa', 'csb')
climate_r2test$name = colnames 
names(climate_r2test) = c('bsk', 'dfb', 'dsb','bwk', 'csa', 'csb', 'name')
col_order = c('name','bsk', 'dfb', 'dsb','bwk', 'csa', 'csb')
climate_r2test = climate_r2test[, col_order]
climate_r2test = as.data.frame(climate_r2test) #Final Dataframe R^2 Test
kable(climate_r2test)

name	bsk	dfb	dsb	bwk	csa	csb
bsk	0.9350923	0.8588505	0.2855222	0.6358336	0.4367911	0.5898570
dfb	0.3273430	0.9428021	0.3067462	-0.1765821	0.4866249	0.4718233
dsb	0.2959589	0.5487451	0.8816401	0.3206864	0.5107904	0.5639845
bwk	0.2447129	0.1294043	0.3072177	0.8823109	0.3877524	0.4341143
csa	0.3016368	0.4095260	0.2893984	0.2559004	0.9300191	0.5165714
csb	0.3852891	0.4697509	0.3150601	0.1957733	0.5132352	0.9386295

Apply quantile bias correction function to Climate data

set.seed(4382)
climatelistnew = list(bsksubset, dfbsubset, dsbsubset, bwksubset, csasubset,csbsubset)
bsk = sapply(climatelistnew, y = bsksubset, fun2)
dfb = sapply(climatelist, y = dfbsubset, fun2)
dsb = sapply(climatelistnew, y = dsbsubset, fun2)
bwk = sapply(climatelist, y = bwksubset, fun2)
csa = sapply(climatelistnew, y = csasubset, fun2)
csb = sapply(climatelistnew, y = csbsubset, fun2)

#dfb and bwk missing. Need to do dfb individually and bind
climate_r2test_new = cbind(bsk,dfb,dsb,bwk,csa,csb) #make sure its cbind!
climate_r2test_new = as.data.frame(climate_r2test_new)
colnames = c('bsk', 'dfb', 'dsb', 'bwk','csa', 'csb') 
climate_r2test_new$name = colnames 
col_order = c('name', 'bsk','dfb','dsb', 'bwk', 'csa', 'csb')
climate_r2test_new = climate_r2test_new[, col_order]
climate_r2test_new = as.data.frame(climate_r2test_new) #Final Dataframe R^2 Test
kable(climate_r2test_new)

name	bsk	dfb	dsb	bwk	csa	csb
bsk	0.9426097	0.8481140	0.2686701	0.7534487	0.4433002	0.5603104
dfb	0.3494935	0.9132276	0.2248574	-0.0028074	0.3883737	0.4205612
dsb	0.3147162	0.5232728	0.9157291	0.4217184	0.5240855	0.5251109
bwk	0.2342202	0.1846932	0.2813620	0.9220809	0.4055287	0.4343965
csa	0.3086427	0.3822028	0.2595292	0.3456775	0.9540287	0.5385872
csb	0.3488965	0.5651654	0.3186887	0.1842543	0.5616905	0.9239666

Climate Heatmap

par(mar=c(5.1, 4.1, 4.1, 4.1))   # adapt margins

rownames(climate_r2test_new) = climate_r2test_new$name
h = climate_r2test_new[, -1] #remove name variable
data2 = h[sapply(h,is.numeric)]
data2 = as.matrix(h)
data3 = melt(data2)
 
#Var1 is training strata V2 is test strata

climate_heatmap = ggplot(data3, aes(x = Var2,y = Var1, fill = value))+
  geom_tile()+
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  labs(title="Climate", x ='Test', y = 'Train') +
  geom_text(aes(label = round(value, 2)), color = "white", size = 4) 

climate_heatmap

Group number of plots for each dataset strata

table(mets.types.climate$group)

## 
## Bsk Bwk Csa Csb Dfb Dsb 
## 407   6  79  32  11  42

Level III Ecoregions

Read and clean ecoregions data

ecoregions = read.csv("eco_plots_table_cleaned.csv")
ecoregions = ecoregions %>% rename(plot.id = plot_id) #rename column plot_id to plot.id
ecoregions = subset(ecoregions, select = -c(L2, L1))
colnames(ecoregions)[colnames(ecoregions) == "L3"] = "group" #rename L3 to "group"
mets.types.eco = merge(new_metrics, ecoregions, by="plot.id")
ecoregions = subset(mets.types.eco, select = c(plot.id, group))
ecoregions = subset(mets.types.eco, plot.id != 'NMFWRI_NMSF_P25' | mets.types.eco$group != "Southern Rockies") #delete duplicate row
ecoregions = subset(ecoregions, select = c(plot.id, group))
mets.types.eco = merge(new_metrics, ecoregions, by="plot.id")

Ecoregions modeling

df2 = mets.types.eco[c("bio","group","cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300", "p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")] #Define new dataframe with ecoregions and VSURF metrics

#subset data to their stratas
cplsubset = subset(df2, group == "Colorado Plateaus")
wumsubset = subset(df2, group == "Wasatch and Uinta Mountains")
anmmsubset = subset(df2, group == "Arizona/New Mexico Mountains")
cbrsubset = subset(df2, group == "Central Basin and Range")
srosubset = subset(df2, group == "Southern Rockies")
anmpsubset = subset(df2, group == "Arizona/New Mexico Plateau")
nbrsubset = subset(df2, group == "Northern Basin and Range")

ecolist = list(cplsubset,wumsubset,anmmsubset,cbrsubset,srosubset,anmpsubset,nbrsubset)
cpl = sapply(ecolist, y = cplsubset, fun1)
wum = sapply(ecolist, y = wumsubset, fun1)
anmm = sapply(ecolist, y = anmmsubset, fun1)
cbr = sapply(ecolist, y = cbrsubset, fun1)
sro = sapply(ecolist, y = srosubset, fun1)
anmp = sapply(ecolist, y = anmpsubset, fun1)
nbr = sapply(ecolist, y = nbrsubset, fun1)

eco_r2test = cbind(cpl,wum,anmm,cbr,sro,anmp, nbr) #make sure its cbind!
eco_r2test = as.data.frame(eco_r2test)
colnames = c('Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'Northern Basin and Range')
eco_r2test$name = colnames 
names(eco_r2test) = c('Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'Northern Basin and Range','name')
col_order = c('name','Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'Northern Basin and Range')
eco_r2test$name = as.character(eco_r2test$name) 
eco_r2test = eco_r2test[, col_order]
eco_r2test = as.data.frame(eco_r2test) 
kable(eco_r2test)

name	Colorado Plateaus	Wasatch and Uinta Mountains	Arizona/New Mexico Mountains	Central Basin and Range	Southern Rockies	Arizona/New Mexico Plateau	Northern Basin and Range
Colorado Plateaus	0.9504143	0.7146891	0.4442224	0.7170951	0.1933612	0.3845464	0.1576704
Wasatch and Uinta Mountains	0.5323348	0.9394347	0.4211500	0.7369050	0.0654008	0.1901524	0.1912149
Arizona/New Mexico Mountains	0.5316391	0.6121587	0.9380635	0.7965292	0.1571317	0.3744238	-0.1926878
Central Basin and Range	0.5213767	0.7165116	0.4039291	0.9453706	0.1167281	0.1859021	0.3891940
Southern Rockies	0.2984375	0.3163917	0.0652367	0.4018121	0.9101661	0.0267937	0.3548266
Arizona/New Mexico Plateau	0.2850045	0.3690173	0.2383918	0.2402829	0.0135441	0.9316545	-0.3569439
Northern Basin and Range	0.5001275	0.4899798	0.4992736	0.3998993	0.5005482	0.4861693	0.0515958

Quantile bias correction Ecoregions

#Nbr excluded due to not enough points? Errors abound
set.seed(27183)
ecolist = list(cplsubset,wumsubset,anmmsubset,cbrsubset,srosubset,anmpsubset)
cpl = sapply(ecolist, y = cplsubset, fun2)
wum = sapply(ecolist, y = wumsubset, fun2)
anmm = sapply(ecolist, y = anmmsubset, fun2)
cbr = sapply(ecolist, y = cbrsubset, fun2)
sro = sapply(ecolist, y = srosubset, fun2)
anmp = sapply(ecolist, y = anmpsubset, fun2)
#nbr = sapply(ecolist, y = nbrsubset, fun2) #nbr being difficult, need to apply function separately and add to table --> "need at least two non-NA values to interpolate"

eco_r2test_new = cbind(cpl,wum,anmm,cbr,sro,anmp) #make sure its cbind!
eco_r2test_new = as.data.frame(eco_r2test_new)

colnames = c('Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau')
eco_r2test_new$name = colnames 
names(eco_r2test_new) = c('Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'name')
col_order = c('name','Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau')
eco_r2test_new$name = as.character(eco_r2test_new$name) 
eco_r2test_new = eco_r2test_new[, col_order]
eco_r2test_new = as.data.frame(eco_r2test_new)  #double check column names all correspond correctly
kable(eco_r2test_new)

name	Colorado Plateaus	Wasatch and Uinta Mountains	Arizona/New Mexico Mountains	Central Basin and Range	Southern Rockies	Arizona/New Mexico Plateau
Colorado Plateaus	0.9480693	0.7036532	0.4403688	0.7050489	0.1851302	0.3780664
Wasatch and Uinta Mountains	0.5049594	0.9714311	0.3896303	0.7027209	0.0747271	0.1966509
Arizona/New Mexico Mountains	0.5391734	0.6327346	0.9387260	0.8019751	0.1597679	0.3629285
Central Basin and Range	0.4995647	0.7036838	0.3986290	0.9605335	0.1084056	0.2007989
Southern Rockies	0.3183344	0.3014855	0.0676894	0.2653763	0.9272947	0.0866184
Arizona/New Mexico Plateau	0.2893662	0.3016727	0.2210203	0.2520933	0.0112961	0.9512367

Ecoregion Heatmap

par(mar=c(5.1, 4.1, 4.1, 4.1))   # adapt margins

rownames(eco_r2test_new) = eco_r2test_new$name
j = eco_r2test_new[, -1] #remove name variable
data4 = j[sapply(j,is.numeric)]
data4 = as.matrix(j)
data5 = melt(data4)
 
#Var1 is training strata V2 is test strata

ecoregion_heatmap = ggplot(data5, aes(x = Var2,y = Var1, fill = value))+
  geom_tile()+
  scale_fill_continuous(high = "#132B43", low = "#56B1F7") +
  labs(title="Ecoregions", x ='Test', y = 'Train') +
  scale_x_discrete(labels = label_wrap(5))+
  geom_text(aes(label = round(value, 2)), color = "white", size = 4) 
  #+theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

ecoregion_heatmap

Group number of plots for each dataset strata

table(mets.types.eco$group)

## 
## Arizona/New Mexico Mountains   Arizona/New Mexico Plateau 
##                          316                           41 
##      Central Basin and Range            Colorado Plateaus 
##                           17                          125 
##     Northern Basin and Range             Southern Rockies 
##                            3                           43 
##  Wasatch and Uinta Mountains 
##                           32

One model to rule them all - random forest and mixed effects

Add landfire, climate, ecoregion strata to one data frame

#Merge new_metrics with landfire, climate, and ecoregions variables by plot id

#pj_types #Landfire
colnames(pj_types)[colnames(pj_types) == "EVT_NAME"] = "landfire" #rename EVT_NAME to "landfire"

#climate #Koppen-Geiger 
colnames(climate)[colnames(climate) == "zone_num"] = "climate" #rename zone_num to climate

#ecoregions #L3 ecoregion 
colnames(ecoregions)[colnames(ecoregions) == "group"] = "ecoregion" #rename L3 to "group"
#ecoregions = ecoregions[c('plot.id','ecoregion')] #subset ecoregions data to plot.id and L3

#Merge all dataframes to get all strata and metrics into one dataframe (onemodel)
data3 = merge(x = new_metrics, y = pj_types, by = "plot.id", all.x = TRUE)
data4 = merge(x = data3, y = climate, by = "plot.id", all.x = TRUE)
onemodel1 = merge(x = data4, y = ecoregions, by = "plot.id", all.x = TRUE)

onemodel = onemodel1[c("plot.id", "landfire", "climate", "ecoregion", "bio", "cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300", "p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")] #define new dataframe for final model

Ranger and Mixed Effects Models

#Should basic ranger model include all strata as factors? 
set.seed(17695)
#do not want strata in x cols dataframe for mixed effects. Also excludes other strata
#Incorporate strata as model predictors in RF

#split into training and test data
sample = sample(c(TRUE, FALSE), nrow(onemodel), replace=TRUE, prob=c(0.70,0.30))
df.train  = onemodel[sample, ]
df.test   = onemodel[!sample, ]

#RF with bias correction is a ranger model with all strata as factors
# make factors
df.train$ecoregion = as.factor(df.train$ecoregion)
df.test$ecoregion = as.factor(df.test$ecoregion)
df.train$landfire = as.factor(df.train$landfire)
df.test$landfire = as.factor(df.test$landfire)
df.train$climate = as.factor(df.train$climate)
df.test$climate = as.factor(df.test$climate)

#define independent and dependent variables
y = df.train$bio
X = df.train[,c(6:22)] #regular ranger model --> does not include strata as factors in predictor variables

# build basic ranger model (does not include strata as factors in df)
mod = ranger(x = X,
              y = y,
              data = df.train)

# get quantile-based bias correction -- basic ranger model
pred.mod = predict(mod, df.train)$predictions
obs = df.train$bio
qs = seq(0,1,0.001)
q.obs.mod = quantile(obs, probs = qs)
q.pred.mod = quantile(pred.mod, probs = qs)
q.diff.mod = q.obs.mod - q.pred.mod

# make predictions on test data
pred.mod = predict(mod, df.test)$predictions
obs = df.test$bio

# perform bias correction
pred.mod.bc = pred.mod + approx(q.pred.mod, q.diff.mod, pred.mod, rule = 2)$y

#Mixed Effects Landfire Model
y = df.train$bio
X.me = df.train[,c(6:22)] #random effects model --> exclude 'landfire' from predictor variables and use as random effect. Excludes climate and ecoregion factors from model as well. 

mod.me = MERFranger(Y = y,
                     X = X.me,
                     random = "(1|landfire)", #set lf to be the random effect
                     data = df.train)

# get quantile-based bias correction -- mixed effects ranger model
pred.mod.me = predict(mod.me, df.train) #cannot use $predictions like in regular model because it is a MERF not a ranger object. Get vector of numeric predictions if you exclude the $predictions in code
obs = df.train$bio
qs = seq(0,1,0.001)
q.obs.mod.me = quantile(obs, probs = qs)
q.pred.mod.me = quantile(pred.mod.me, probs = qs)
q.diff.mod.me = q.obs.mod.me - q.pred.mod.me

# make predictions on test data
pred.mod.me = predict(mod.me, df.test)
pred.mod.me = unname(pred.mod.me)
obs = df.test$bio

# perform bias correction
pred.mod.me.bc = pred.mod.me + approx(q.pred.mod.me, q.diff.mod.me, pred.mod.me, rule = 2)$y

#Mixed Effects Climate Model
y = df.train$bio
X.me.1 = df.train[,c(6:22)] #exclude 'climate' from predictor variables and use as random effect. Excludes landfire and ecoregion factors from model as well. 

mod.me.1 = MERFranger(Y = y,
                     X = X.me.1,
                     random = "(1|climate)", #set cli to be the random effect
                     data = df.train)

# get quantile-based bias correction -- mixed effects ranger model
pred.mod.me.1 = predict(mod.me.1, df.train) #cannot use $predictions like in regular model because it is a MERF not a ranger object. Get vector of numeric predictions if you exclude the $predictions in code
obs = df.train$bio
qs = seq(0,1,0.001)
q.obs.mod.me.1 = quantile(obs, probs = qs)
q.pred.mod.me.1 = quantile(pred.mod.me.1, probs = qs)
q.diff.mod.me.1 = q.obs.mod.me.1 - q.pred.mod.me.1

# make predictions on test data
pred.mod.me.1 = predict(mod.me.1, df.test)
pred.mod.me.1= unname(pred.mod.me.1)
obs = df.test$bio

# perform bias correction
pred.mod.me.1.bc = pred.mod.me.1 + approx(q.pred.mod.me.1, q.diff.mod.me.1, pred.mod.me.1, rule = 2)$y

#Mixed Effects Ecoregion Model
y = df.train$bio
X.me.2 = df.train[,c(6:22)] #exclude 'ecoregion' from predictor variables and use as random effect. Excludes climate and landfire factors from model as well (only lidar predictor variables). 

mod.me.2 = MERFranger(Y = y,
                     X = X.me.2,
                     random = "(1|ecoregion)", #set eco to be the random effect
                     data = df.train)

# get quantile-based bias correction -- mixed effects ranger model
pred.mod.me.2 = predict(mod.me.2, df.train) #cannot use $predictions like in regular model because it is a MERF not a ranger object. Get vector of numeric predictions if you exclude the $predictions in code
obs = df.train$bio
qs = seq(0,1,0.001)
q.obs.mod.me.2 = quantile(obs, probs = qs)
q.pred.mod.me.2 = quantile(pred.mod.me.2, probs = qs)
q.diff.mod.me.2 = q.obs.mod.me.2 - q.pred.mod.me.2

# make predictions on test data
pred.mod.me.2 = predict(mod.me.2, df.test)
pred.mod.me.2= unname(pred.mod.me.2)
obs = df.test$bio

# perform bias correction
pred.mod.me.2.bc = pred.mod.me.2 + approx(q.pred.mod.me.2, q.diff.mod.me.2, pred.mod.me.2, rule = 2)$y

# define plotting function
plot.fun = function(x,y,main,xlab="Observed AGB",ylab="Predicted AGB"){
  plot(y ~ x, xlim = c(ax.min, ax.max), ylim = c(ax.min, ax.max), pch = 16,
       main = main, xlab = xlab, ylab = ylab)
  grid()
  lines(x = c(-1000,1000), y = c(-1000,1000), lty = 2)
  mod = lm(y ~ x)
  abline(mod, lwd = 2, col = "red")
  a = coef(mod)[2]
  b = coef(mod)[1]
  if (a < 0) eq = paste0("y=", round(a,2), "x-", abs(round(b,2)))
  if (a >= 0) eq = paste0("y=", round(a,2), "x+", abs(round(b,2)))
  r2 = paste0("R2=", round(summary(mod)$adj.r.squared,2))
  rmse = paste0("RMSE=", round(sqrt(mean((y-x)^2)),2))
  legend("topleft", legend = c(eq,r2,rmse), text.col = "red", bty = "n", x.intersp = 0)
}

# plot out the results
par(mfrow = c(2,4),
    las = 1,
    mar = c(5,5,2,1))
ax.min = min(c(pred.mod, pred.mod.me, obs))
ax.max = max(c(pred.mod, pred.mod.me, obs))
plot.fun(obs, pred.mod, "RF")
plot.fun(obs, pred.mod.bc, "RF Bias Corr")
plot.fun(obs, pred.mod.me, "LF MERF")
plot.fun(obs, pred.mod.me.bc, "LF MERF Bias Corr")
plot.fun(obs, pred.mod.me.1, "Cli MERF")
plot.fun(obs, pred.mod.me.1.bc, "Cli  MERF Bias Corr")
plot.fun(obs, pred.mod.me.2, "Eco MERF")
plot.fun(obs, pred.mod.me.2.bc, "Eco MERF Bias Corr")

Plot the random effects for landfire model - intercepts = 0, landfire and climate mixed effects models are straight lines

# get data frame of random effects
df.re = as.data.frame(mod.me$RandomEffects$landfire) #plot ecoregion effects
colnames(df.re) = "Intercept"

# get bar colors
cols = colorRampPalette(brewer.pal(11, "Spectral"))(nrow(df.re))

# plot the random effects
par(mar = c(8,5,1,1),
    las = 1)
b = barplot(df.re$Intercept, space = 0, col = cols, ylim = c(min(df.re - 1), max(df.re + 1)), ylab = "Intercept")

box()
mtext(text = row.names(df.re), side = 1, line = 1, at = b, las = 2, cex = 0.45)
abline(h = 0)

df.re

##                                                 Intercept
## Colorado Plateau Pinyon-Juniper Woodland                0
## Great Basin Pinyon-Juniper Woodland                     0
## Madrean Pinyon-Juniper Woodland                         0
## Southern Rocky Mountain Pinyon-Juniper Woodland         0

Plot the random effects for climate model - intercepts = 0

# get data frame of random effects
df.re = as.data.frame(mod.me.1$RandomEffects$climate) #plot ecoregion effects
colnames(df.re) = "Intercept"

# get bar colors
cols = colorRampPalette(brewer.pal(11, "Spectral"))(nrow(df.re))

# plot the random effects
par(mar = c(8,5,1,1),
    las = 1)
b = barplot(df.re$Intercept, space = 0, col = cols, ylim = c(min(df.re - 1), max(df.re + 1)), ylab = "Intercept")

box()
mtext(text = row.names(df.re), side = 1, line = 1, at = b, las = 2, cex = 0.55)
abline(h = 0)

Plot the random effects for ecoregion model

# get data frame of random effects
df.re = as.data.frame(mod.me.2$RandomEffects$ecoregion) #plot ecoregion effects
colnames(df.re) = "Intercept"

# get bar colors
cols = colorRampPalette(brewer.pal(11, "Spectral"))(nrow(df.re))

# plot the random effects
par(mar = c(8,5,1,1),
    las = 1)
b = barplot(df.re$Intercept, space = 0, col = cols, ylim = c(min(df.re - 1), max(df.re + 1)), ylab = "Intercept")

box()
mtext(text = row.names(df.re), side = 1, line = 1, at = b, las = 2, cex = 0.55)
abline(h = 0)

Bootstrap the whole damn thing

Instead of site.id in original code, use ecoregion as the random effect Is there a way to include more than one random effect in model?

set.seed(867509)

#redefine things just in case
new_metrics = metrics[c("site.id","plot.id","bio", "cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300", "p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")]

#Merge all dataframes to get all strata and metrics into one dataframe (onemodel)
data3 = merge(x = new_metrics, y = pj_types, by = "plot.id", all.x = TRUE)
data4 = merge(x = data3, y = climate, by = "plot.id", all.x = TRUE)
onemodel1 = merge(x = data4, y = ecoregions, by = "plot.id", all.x = TRUE)

df = onemodel1[c("plot.id", "landfire", "climate", "ecoregion", "bio", "cc", "cd", "mh.ag", "vrd.0.50", "vnrd.50.100", "p.45", "p.25", "p.55", "vrd.250.300", "p.20", "cr.qh.20", "p.85", "plot.y", "plot.x", "skew.ag", "vnrd.0.50", "twi")] #define new

for (i in seq(1,200)){
  df$train.test = "train"
  ecoregions1 = unique(df$ecoregion)
  for (ecoregion in ecoregions1){
    eco.rows = which(df$ecoregion == ecoregion)
    test.row = sample(eco.rows, 2)
    df$train.test[test.row] = "test"
  }
  df.train = df[df$train.test == "train",]
  df.test = df[df$train.test == "test",]
  df.train$train.test = NULL
  df.test$train.test = NULL

#regular ranger model --> does not include all strata as factors in predictor variables
  mod = ranger(x = df.train[,c(6:22)] , 
                y = df.train$bio, 
               data = df.train)
  
#should i include landfire and climate as factors in model? Columns 3 and 4, ecoregion column 5. Currently does not include landfire and climate as factors in model.
  mod.me = MERFranger(Y = df.train$bio,
                       X = df.train[,c(6:22)],  
                       random = "(1|ecoregion)",
                       data = df.train)
  pred.mod = predict(mod, df.train)$predictions
  obs = df.train$bio
  qs = seq(0,1,0.001)
  q.obs.mod = quantile(obs, probs = qs)
  q.pred.mod = quantile(pred.mod, probs = qs)
  q.diff.mod = q.obs.mod - q.pred.mod
  pred.mod.me = predict(mod, df.train)$predictions
  obs = df.train$bio
  qs = seq(0,1,0.001)
  q.obs.mod.me = quantile(obs, probs = qs)
  q.pred.mod.me = quantile(pred.mod.me, probs = qs)
  q.diff.mod.me = q.obs.mod.me - q.pred.mod.me
  pred.mod = predict(mod, df.test)$predictions
  pred.mod.me = predict(mod.me, df.test)
  obs = df.test$bio
  pred.mod.bc = pred.mod + approx(q.pred.mod, q.diff.mod, pred.mod, rule = 2)$y
  pred.mod.me.bc = pred.mod.me + approx(q.pred.mod.me, q.diff.mod.me, pred.mod.me, rule = 2)$y
  r2.mod = summary(lm(pred.mod ~ obs))$adj.r.squared
  r2.mod.me = summary(lm(pred.mod.me ~ obs))$adj.r.squared
  r2.mod.bc = summary(lm(pred.mod.bc ~ obs))$adj.r.squared
  r2.mod.me.bc = summary(lm(pred.mod.me.bc ~ obs))$adj.r.squared
  rmse.mod = sqrt(mean((pred.mod - obs)^2))
  rmse.mod.me = sqrt(mean((pred.mod.me - obs)^2))
  rmse.mod.bc = sqrt(mean((pred.mod.bc - obs)^2))
  rmse.mod.me.bc = sqrt(mean((pred.mod.me.bc - obs)^2))
  if (i == 1){
    results.df = data.frame(r2.mod = r2.mod,
                             r2.mod.me = r2.mod.me,
                             r2.mod.bc = r2.mod.bc,
                             r2.mod.me.bc = r2.mod.me.bc,
                             rmse.mod = rmse.mod,
                             rmse.mod.me = rmse.mod.me,
                             rmse.mod.bc = rmse.mod.bc,
                             rmse.mod.me.bc = rmse.mod.me.bc)
    df.re = t(as.data.frame(mod.me$RandomEffects$ecoregion))
  } else {
    results.df = rbind(results.df, data.frame(r2.mod = r2.mod,
                                               r2.mod.me = r2.mod.me,
                                               r2.mod.bc = r2.mod.bc,
                                               r2.mod.me.bc = r2.mod.me.bc,
                                               rmse.mod = rmse.mod,
                                               rmse.mod.me = rmse.mod.me,
                                               rmse.mod.bc = rmse.mod.bc,
                                               rmse.mod.me.bc = rmse.mod.me.bc))
    df.re = rbind(df.re, t(as.data.frame(mod.me$RandomEffects$ecoregion)))
  }
}

Plot the bootstrap results

set.seed(238931)
# get kernel densities
d.r2.mod = density(results.df$r2.mod)
d.r2.mod.me = density(results.df$r2.mod.me)
d.r2.mod.bc = density(results.df$r2.mod.bc)
d.r2.mod.me.bc = density(results.df$r2.mod.me.bc)
d.rmse.mod = density(results.df$rmse.mod)
d.rmse.mod.me = density(results.df$rmse.mod.me)
d.rmse.mod.bc = density(results.df$rmse.mod.bc)
d.rmse.mod.me.bc = density(results.df$rmse.mod.me.bc)

# get axis limits
xlim.r2 = c(min(c(d.r2.mod$x, d.r2.mod.me$x, d.r2.mod.bc$x, d.r2.mod.me.bc$x)), 
             max(c(d.r2.mod$x, d.r2.mod.me$x, d.r2.mod.bc$x, d.r2.mod.me.bc$x)))
xlim.rmse = c(min(c(d.rmse.mod$x, d.rmse.mod.me$x, d.rmse.mod.bc$x, d.rmse.mod.me.bc$x)), 
               max(c(d.rmse.mod$x, d.rmse.mod.me$x, d.rmse.mod.bc$x, d.rmse.mod.me.bc$x)))
ylim.r2 = c(min(c(d.r2.mod$y, d.r2.mod.me$y, d.r2.mod.bc$y, d.r2.mod.me.bc$y)), 
             max(c(d.r2.mod$y, d.r2.mod.me$y, d.r2.mod.bc$y, d.r2.mod.me.bc$y)))
ylim.rmse = c(min(c(d.rmse.mod$y, d.rmse.mod.me$y, d.rmse.mod.bc$y, d.rmse.mod.me.bc$y)), 
               max(c(d.rmse.mod$y, d.rmse.mod.me$y, d.rmse.mod.bc$y, d.rmse.mod.me.bc$y)))

# set up the plot
par(mfrow = c(2,2),
    las = 1,
    mar = c(5,1,2,1))

# plot 1 -- RF vs. MERF, R2
plot(xlim.r2, ylim.r2, type = "n", yaxt = "n", ylab = NA, xlab = "R2", main = "R2")
polygon(d.r2.mod$x, d.r2.mod$y, col = rgb(1,0,0,0.25))
polygon(d.r2.mod.me$x, d.r2.mod.me$y, col = rgb(0,0,1,0.25))
abline(v = median(results.df$r2.mod), lwd = 2, col = "red")
abline(v = median(results.df$r2.mod.me), lwd = 2, col = "blue")
r2.med.mod = paste0("Med Basic = ", round(median(results.df$r2.mod),2))
r2.med.mod.me = paste0("Med ME = ", round(median(results.df$r2.mod.me),2))
legend("topleft", legend = c(r2.med.mod, r2.med.mod.me), text.col = c("red", "blue"), bty = "n", x.intersp = 0)

# plot 2 -- RF vs. MERF, RMSE
plot(xlim.rmse, ylim.rmse, type = "n", yaxt = "n", ylab = NA, xlab = "RMSE", main = "RMSE")
polygon(d.rmse.mod$x, d.rmse.mod$y, col = rgb(1,0,0,0.25))
polygon(d.rmse.mod.me$x, d.rmse.mod.me$y, col = rgb(0,0,1,0.25))
abline(v = median(results.df$rmse.mod), lwd = 2, col = "red")
abline(v = median(results.df$rmse.mod.me), lwd = 2, col = "blue")
rmse.med.mod = paste0("Med Basic = ", round(median(results.df$rmse.mod),2))
rmse.med.mod.me = paste0("Med ME = ", round(median(results.df$rmse.mod.me),2))
legend("topleft", legend = c(rmse.med.mod, rmse.med.mod.me), text.col = c("red", "blue"), bty = "n", x.intersp = 0)

# plot 3 -- RF vs. MERF Bias Corr, R2
plot(xlim.r2, ylim.r2, type = "n", yaxt = "n", ylab = NA, xlab = "R2 Bias", main = "R2 Bias Corr")
polygon(d.r2.mod.bc$x, d.r2.mod.bc$y, col = rgb(1,0,0,0.25))
polygon(d.r2.mod.me.bc$x, d.r2.mod.me.bc$y, col = rgb(0,0,1,0.25))
abline(v = median(results.df$r2.mod.bc), lwd = 2, col = "red")
abline(v = median(results.df$r2.mod.me.bc), lwd = 2, col = "blue")
r2.med.mod.bc = paste0("Med Basic = ", round(median(results.df$r2.mod.bc),2))
r2.med.mod.me.bc = paste0("Med ME = ", round(median(results.df$r2.mod.me.bc),2))
legend("topleft", legend = c(r2.med.mod.bc, r2.med.mod.me.bc), text.col = c("red", "blue"), bty = "n", x.intersp = 0)

# plot 4 -- RF vs. MERF Bias Corr, RMSE
plot(xlim.rmse, ylim.rmse, type = "n", yaxt = "n", ylab = NA, xlab = "RMSE", main = "RMSE Bias Corr")
polygon(d.rmse.mod.bc$x, d.rmse.mod.bc$y, col = rgb(1,0,0,0.25))
polygon(d.rmse.mod.me.bc$x, d.rmse.mod.me.bc$y, col = rgb(0,0,1,0.25))
abline(v = median(results.df$rmse.mod.bc), lwd = 2, col = "red")
abline(v = median(results.df$rmse.mod.me.bc), lwd = 2, col = "blue")
rmse.med.mod.bc = paste0("Med Basic = ", round(median(results.df$rmse.mod.bc),2))
rmse.med.mod.me.bc = paste0("Med ME = ", round(median(results.df$rmse.mod.me.bc),2))
legend("topleft", legend = c(rmse.med.mod.bc, rmse.med.mod.me.bc), text.col = c("red", "blue"), bty = "n", x.intersp = 0)

# get axis limits
ymin = 100
ymax = -100
eco.ids = colnames(df.re)
for (ecoregion in eco.ids){
  d = density(df.re[,ecoregion])
  y = d$x
  if (min(y) < ymin) ymin = min(y)
  if (max(y) > ymax) ymax = max(y)
}

# set up plot
par(mar = c(8,5,1,1),
    las = 1)

# create empty plot
plot(x = c(0.5, ncol(df.re) + 0.5),
     y = c(ymin, ymax),
     type = "n",
     ylab = "Intercept",
     xaxt = "n",
     xlab = NA)
abline(h = 0)

# loop through sites and plot violins
eco.ids = colnames(df.re)
for (i in seq(1,length(eco.ids))){
  abline(v = i, lty = 3, col = "lightgray")
  ecoregion = eco.ids[i]
  d = density(df.re[,ecoregion])
  y = d$x
  x = d$y + i
  y = c(y, rev(y))
  x = c(x, rev(d$y * -1 + i))
  col = cols[i]
  polygon(x, y, col = col)
  points(i, median(df.re[,ecoregion]), pch = 16)
  mtext(ecoregion, 1, 1, at = i, las = 2, cex = 0.55)
}

Exploratory Analysis

Tree Species Barplots

###### Clean up tree data
treedata = read.csv("tree_level_data.csv") #Load tree data which specifies tree species for all plots
trees = subset(treedata, select = c(plot_id, tree_sp))
trees = trees %>% rename(plot.id = plot_id) #rename column plot_id to plot.id
trees = trees[(trees$plot.id %in% metrics$plot.id),] #remove data plots not in metrics
treesfreq = data.frame(unclass(table(trees))) #turn tree frequency table into dataframe
#treesfreq #frequency of each tree species in each plot
treesfreq = cbind(plot.id = rownames(treesfreq), treesfreq) #move plot.id from the index column to the first column in the data frame
rownames(treesfreq) = 1:nrow(treesfreq)
#subset to only P and J for fun reasons
treespj = subset(treesfreq, select = c(plot.id, JUDE2, JUMO, JUOS, JUSC2, PIED, PIMO, PIMOF)) #include PIPO??

#All trees and all variables. Only plot.ids used in metrics are in this dataframe
trees_df = merge(trees, onemodel)
trees_df = subset(trees_df, select = c(plot.id, tree_sp, landfire, climate, ecoregion))

Plot tree species count for each strata

trees_df2 = trees_df %>% filter(tree_sp == 'JUDE2' | tree_sp == 'JUMO' | tree_sp == 'JUOS' | tree_sp == 'JUSC2' | tree_sp == 'PIED' | tree_sp == 'PIMO' | tree_sp == 'PIMOF')
#lf = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')

trees_df2 %>%  
  group_by(landfire, tree_sp) %>%  
  summarize(Count = n()) %>% 
  ggplot(aes(x=landfire, y=Count, fill=tree_sp)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(5))

trees_df2 %>%  
  group_by(climate, tree_sp) %>%  
  summarize(Count = n()) %>% 
  ggplot(aes(x=climate, y=Count, fill=tree_sp)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(5))

#eco = c('Colorado Plateaus', 'Wasatch & Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'Northern Basin and Range')

trees_df2 %>%  
  group_by(ecoregion, tree_sp) %>%  
  summarize(Count = n()) %>%  
  ggplot(aes(x=ecoregion, y=Count, fill=tree_sp)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(5))

lf_trees_ct1 = trees_df2 %>% count(landfire, tree_sp)
lf_trees_ct1 = as.data.frame(lf_trees_ct1)
cli_trees_ct = trees_df2 %>% count(climate, tree_sp)
cli_trees_ct = as.data.frame(cli_trees_ct)
eco_trees_ct = trees_df2 %>% count(ecoregion, tree_sp)
eco_trees_ct = as.data.frame(eco_trees_ct)

lf_trees_ct2 = lf_trees_ct1 %>% group_by(landfire) %>% top_n(2, n)
cli_trees_ct = cli_trees_ct %>% group_by(climate) %>% top_n(2, n)
eco_trees_ct = eco_trees_ct %>% group_by(ecoregion) %>% top_n(2, n)

lf_trees_ct3 = lf_trees_ct2 %>%  
  group_by(landfire, tree_sp) %>%  
  ggplot(aes(x=landfire, y=n, fill=tree_sp)) + 
  geom_bar(stat='identity', position= "dodge") + 
  scale_x_discrete(labels = label_wrap(5))

cli_trees_ct = cli_trees_ct %>%  
  group_by(climate, tree_sp) %>%  
  ggplot(aes(x=climate, y=n, fill=tree_sp)) + 
  geom_bar(stat='identity', position= "dodge") 

eco_trees_ct = eco_trees_ct %>%  
  group_by(ecoregion, tree_sp) %>%  
  ggplot(aes(x=ecoregion, y=n, fill=tree_sp)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(5))

grid.arrange(lf_trees_ct3,landfire_heatmap)

grid.arrange(cli_trees_ct,climate_heatmap)

grid.arrange(eco_trees_ct,ecoregion_heatmap)

Tree species sums for each dataset

Create function to turn data frame into matrix and sum columns

fun3 = function(df) {
  x = df[,unlist(lapply(df, is.numeric))]
  y = as.matrix(x)
  return(colSums(y))
}

Tree species sums for each landfire cluster

mets.types.treespj = merge(treespj, pj_types, by="plot.id") #merge predictor treespj and pj_types on plot.id
colnames(mets.types.treespj)[colnames(mets.types.treespj) == "landfire"] = "group" #rename landfire to group

#subset mets.types.treespj to each group
#investigate the PJ species composition of each group
mets.types.treespj.cp = subset(mets.types.treespj, group == "Colorado Plateau Pinyon-Juniper Woodland")
mets.types.treespj.ma = subset(mets.types.treespj, group == "Madrean Pinyon-Juniper Woodland")
mets.types.treespj.sr = subset(mets.types.treespj, group == "Southern Rocky Mountain Pinyon-Juniper Woodland")
mets.types.treespj.gb = subset(mets.types.treespj, group == "Great Basin Pinyon-Juniper Woodland")

typ = list(mets.types.treespj.cp, mets.types.treespj.ma, mets.types.treespj.sr, mets.types.treespj.gb)

#rownames(pj_types_plot_sums)

pj_types_plot_sums_fun = lapply(typ, fun3)
pj_types_plot_sums = as.data.frame(do.call(cbind, pj_types_plot_sums_fun))
names(pj_types_plot_sums) = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')
pj_types_plot_sums = t(pj_types_plot_sums)
kable(pj_types_plot_sums)

	JUDE2	JUMO	JUOS	JUSC2	PIED	PIMO	PIMOF
Colorado Plateau	554	777	3916	101	6635	0	0
Madrean	1011	34	115	7	380	0	116
Southern Rocky Mountain	7	612	0	96	1284	0	0
Great Basin	0	1	532	0	55	415	0

Tree species sums for each climate cluster

mets.types.clim.treespj = merge(treespj, climate, by="plot.id") #merge predictor treespj and pj_types on plot.id
colnames(mets.types.clim.treespj)[colnames(mets.types.clim.treespj) == "climate"] = "group" #rename climate to group

#subset data to their stratas
mets.types.clim.treespj.bsk = subset(mets.types.clim.treespj, group == "Bsk")
mets.types.clim.treespj.dfb = subset(mets.types.clim.treespj, group == "Dfb")
mets.types.clim.treespj.dsb = subset(mets.types.clim.treespj, group == "Dsb")
mets.types.clim.treespj.bwk = subset(mets.types.clim.treespj, group == "Bwk")
mets.types.clim.treespj.csa = subset(mets.types.clim.treespj, group == "Csa")
mets.types.clim.treespj.csb = subset(mets.types.clim.treespj, group == "Csb")

climatelist2 = list(mets.types.clim.treespj.bsk,mets.types.clim.treespj.dfb,mets.types.clim.treespj.dsb,mets.types.clim.treespj.bwk,mets.types.clim.treespj.csa,mets.types.clim.treespj.csb)

mets.types.clim.treespj.fun = lapply(climatelist2, fun3)
mets.types.clim.treespj.sums = as.data.frame(do.call(cbind, mets.types.clim.treespj.fun))
names(mets.types.clim.treespj.sums) = c('Bsk', 'Dfb','Dsb', 'Bwk', 'Csa', 'Csb')
mets.types.clim.treespj.sums = t(mets.types.clim.treespj.sums)
kable(mets.types.clim.treespj.sums)

	JUDE2	JUMO	JUOS	JUSC2	PIED	PIMO	PIMOF
Bsk	271	1376	4448	187	8058	415	116
Dfb	0	1	16	2	45	0	0
Dsb	189	19	42	5	147	0	0
Bwk	0	5	44	0	14	0	0
Csa	836	23	13	7	88	0	0
Csb	276	0	0	3	2	0	0

Tree species sums for each ecoregion cluster

mets.types.eco.treespj = merge(treespj, ecoregions, by="plot.id") #merge predictor treespj and pj_types on plot.id
colnames(mets.types.eco.treespj)[colnames(mets.types.eco.treespj) == "ecoregion"] = "group" #rename ecoregion to group

#subset data to their stratas
mets.types.eco.treespj.cp = subset(mets.types.eco.treespj, group == "Colorado Plateaus")
mets.types.eco.treespj.wum = subset(mets.types.eco.treespj, group == "Wasatch and Uinta Mountains")
mets.types.eco.treespj.anmm = subset(mets.types.eco.treespj, group == "Arizona/New Mexico Mountains")
mets.types.eco.treespj.cbr = subset(mets.types.eco.treespj, group == "Central Basin and Range")
mets.types.eco.treespj.sro = subset(mets.types.eco.treespj, group == "Southern Rockies")
mets.types.eco.treespj.anmp = subset(mets.types.eco.treespj, group == "Arizona/New Mexico Plateau")
mets.types.eco.treespj.nbr = subset(mets.types.eco.treespj, group == "Northern Basin and Range")

ecolist2 = list(mets.types.eco.treespj.cp,mets.types.eco.treespj.wum,mets.types.eco.treespj.anmm,mets.types.eco.treespj.cbr,mets.types.eco.treespj.sro,mets.types.eco.treespj.anmp,mets.types.eco.treespj.nbr)

mets.types.eco.plot.sums.fun = lapply(ecolist2, fun3)
mets.types.eco.plot.sums = as.data.frame(do.call(cbind, mets.types.eco.plot.sums.fun))
names(mets.types.eco.plot.sums) = c('Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'Northern Basin and Range')
mets.types.eco.plot.sums = t(mets.types.eco.plot.sums)
kable(mets.types.eco.plot.sums)

	JUDE2	JUMO	JUOS	JUSC2	PIED	PIMO	PIMOF
Colorado Plateaus	0	0	3326	12	3587	0	0
Wasatch and Uinta Mountains	0	0	191	64	582	0	0
Arizona/New Mexico Mountains	1572	873	514	49	3147	0	116
Central Basin and Range	0	1	410	0	0	415	0
Southern Rockies	0	230	0	79	707	0	0
Arizona/New Mexico Plateau	0	320	0	0	276	0	0
Northern Basin and Range	0	0	122	0	55	0	0

Notes:

Based on tree species sums for each strata, strata with a higher species variability seem to be more transferable than strata with a low species variability. In other words, the Colorado Plateau PJ in the Landfire data set encompasses 5 different PJ species and its models are highly transferable to different strata, while Southern Rocky Mountain PJ encompasses 4 different PJ species and its models are less transferable to different strata. Other stronger examples of this in the ecoregions and climate datasets.

Do similar species compositions impact model transferability?

Average height metrics between different strata

Plots of mean height (mh) and mean height aboveground (mh.ag) for different strata

metricsheight = metrics[c("plot.id","mh")] #put into dataframe the metrics to be used in RF as chosen by the VSURF
heights = merge(onemodel1, metricsheight, by = "plot.id")
heights = subset(heights, select = c(plot.id, landfire, climate, ecoregion, bio, mh, mh.ag))

Mean heights and mean heights aboveground (without shrubs) for each strata in landfire data

#function to get mean heights and mean heights aboveground (without shrubs) for each strata in landfire data
fun5 = function(df){
  lf.mean.height = mean(df[,"mh"])
  lf.mean.height.ag = mean(df[,"mh.ag"])
  return(cbind(lf.mean.height, lf.mean.height.ag))}

#landfire
#subset data to their stratas
cpsubset = subset(heights, landfire == "Colorado Plateau Pinyon-Juniper Woodland")
masubset = subset(heights, landfire == "Madrean Pinyon-Juniper Woodland")
srsubset = subset(heights, landfire == "Southern Rocky Mountain Pinyon-Juniper Woodland")
gbsubset = subset(heights, landfire == "Great Basin Pinyon-Juniper Woodland")

typeslist = list(cpsubset,masubset,srsubset,gbsubset)
mean.heights.lf = lapply(typeslist, fun5)
mean.heights.lf.df = as.data.frame(do.call(rbind, mean.heights.lf))
rownames(mean.heights.lf.df) = c('Colorado Plateau', 'Madrean', 'Southern Rocky Mountain','Great Basin')
mean.heights.lf.df = tibble::rownames_to_column(mean.heights.lf.df, "VALUE")
mean.heights.lf.df = mean.heights.lf.df %>% rename(lf.strata = "VALUE")

Plots

p1 = ggplot(data=mean.heights.lf.df, aes(x=lf.strata, y=lf.mean.height)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(10)) + scale_y_continuous(limits = c(0, 4))

p2 = ggplot(data=mean.heights.lf.df, aes(x=lf.strata, y=lf.mean.height.ag)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(10)) + scale_y_continuous(limits = c(0, 4))

Get mean heights and mean heights aboveground (without shrubs) for each strata in climate data

#function to get mean heights and mean heights aboveground (without shrubs) for each strata in landfire data
fun5 = function(df){
  cli.mean.height = mean(df[,"mh"])
  cli.mean.height.ag = mean(df[,"mh.ag"])
  return(cbind(cli.mean.height, cli.mean.height.ag))}

#climate
#subset data to their stratas
bsksubset = subset(heights, climate == "Bsk")
dfbsubset = subset(heights, climate == "Dfb")
dsbsubset = subset(heights, climate == "Dsb")
bwksubset = subset(heights, climate == "Bwk")
csasubset = subset(heights, climate == "Csa")
csbsubset = subset(heights, climate == "Csb")

climatelist = list(bsksubset,dfbsubset,dsbsubset,bwksubset,csasubset,csbsubset)
mean.heights.cli = lapply(climatelist, fun5)
mean.heights.cli.df = as.data.frame(do.call(rbind, mean.heights.cli))
rownames(mean.heights.cli.df) = c('bsk', 'dfb', 'dsb','bwk', 'csa', 'csb')
mean.heights.cli.df = tibble::rownames_to_column(mean.heights.cli.df, "VALUE")
mean.heights.cli.df = mean.heights.cli.df %>% rename(climate.strata = "VALUE")

Plots

p3 = ggplot(data=mean.heights.cli.df, aes(x=climate.strata, y=cli.mean.height)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(10)) + scale_y_continuous(limits = c(0, 4))
p4 = ggplot(data=mean.heights.cli.df, aes(x=climate.strata, y=cli.mean.height.ag)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(10)) + scale_y_continuous(limits = c(0, 4))

Get mean heights and mean heights aboveground (without shrubs) for each strata in ecoregion data

#function to get mean heights and mean heights aboveground (without shrubs) for each strata in landfire data
fun5 = function(df){
  eco.mean.height = mean(df[,"mh"])
  eco.mean.height.ag = mean(df[,"mh.ag"])
  return(cbind(eco.mean.height, eco.mean.height.ag))}

#ecoregions
#subset data to their stratas
cplsubset = subset(heights, ecoregion == "Colorado Plateaus")
wumsubset = subset(heights, ecoregion == "Wasatch and Uinta Mountains")
anmmsubset = subset(heights, ecoregion == "Arizona/New Mexico Mountains")
cbrsubset = subset(heights, ecoregion == "Central Basin and Range")
srosubset = subset(heights, ecoregion == "Southern Rockies")
anmpsubset = subset(heights, ecoregion == "Arizona/New Mexico Plateau")
nbrsubset = subset(heights, ecoregion == "Northern Basin and Range")

ecolist = list(cplsubset,wumsubset,anmmsubset,cbrsubset,srosubset,anmpsubset,nbrsubset)
mean.heights.eco = lapply(ecolist, fun5)
mean.heights.eco.df = as.data.frame(do.call(rbind, mean.heights.eco))
rownames(mean.heights.eco.df) = c('Colorado Plateaus', 'Wasatch and Uinta Mountains', 'Arizona/New Mexico Mountains','Central Basin and Range', 'Southern Rockies', 'Arizona/New Mexico Plateau', 'Northern Basin and Range')
mean.heights.eco.df = tibble::rownames_to_column(mean.heights.eco.df, "VALUE")
mean.heights.eco.df = mean.heights.eco.df %>% rename(eco.strata = "VALUE")

Plots

p5= ggplot(data=mean.heights.eco.df, aes(x=eco.strata, y=eco.mean.height)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(10)) + scale_y_continuous(limits = c(0, 4))

#+ scale_x_discrete(labels = label_wrap(10))  
p6 = ggplot(data=mean.heights.eco.df, aes(x=eco.strata, y=eco.mean.height.ag)) +
  geom_bar(stat="identity")  +  scale_x_discrete(labels = label_wrap(10)) + scale_y_continuous(limits = c(0, 4))

Mean height and mean height aboveground plots

#grid.arrange(p5,p1,p2,p3,p4,p6, ncol = 3, nrow=2) 
grid.arrange(p1,p2, ncol = 1, nrow=2)

grid.arrange(p3,p4, ncol = 1, nrow=2)

grid.arrange(p5,p6, ncol = 1, nrow=2)

I don’t want no shrubs

Subtract mean height aboveground from mean height to get shrub cover. Does the presence of a lot of low cover vegetation (shrub) impact model transferability? Possibly too much error introduced in this analysis due to the taking of averages for mean height and mean height aboveground.

mean.heights.eco.df$diff=(mean.heights.eco.df$eco.mean.height.ag-mean.heights.eco.df$eco.mean.height)
p7 = ggplot(data=mean.heights.eco.df, aes(x=eco.strata, y=diff)) +
  geom_bar(stat="identity")  + scale_x_discrete(labels = label_wrap(17))  + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))


mean.heights.cli.df$diff=(mean.heights.cli.df$cli.mean.height.ag-mean.heights.cli.df$cli.mean.height)
p8 = ggplot(data=mean.heights.cli.df, aes(x=climate.strata, y=diff)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(5)) + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

mean.heights.lf.df$diff=(mean.heights.lf.df$lf.mean.height.ag-mean.heights.lf.df$lf.mean.height)
p9 = ggplot(data=mean.heights.lf.df, aes(x=lf.strata, y=diff)) +
  geom_bar(stat="identity") + scale_x_discrete(labels = label_wrap(5)) + theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

#If the diff is smaller, it indicates less ground vegetation. Higher diff indicates more ground vegetation.
#Could cross reference this with shrub counts per strata.
px = p9 + p8 + p7 & scale_y_continuous(limits = c(0, 2.1)) 
plot(px)

How many shrubs: look at shrub numbers for each strata using tree data. Could investigate average proportion of shrubs to trees?

#Double check that trees are removed from subset
#Subset trees dataframe to only shrubs
trees_df3 = trees_df %>% filter(!(tree_sp == 'JUDE2' | tree_sp == 'JUMO' | tree_sp == 'JUOS' | tree_sp == 'JUSC2' | tree_sp == 'PIED' | tree_sp == 'PIMO' | tree_sp == 'PIMOF'| tree_sp == 'PIPO' | tree_sp == 'ABCO'))
#unique(trees_df3$tree_sp) #make sure only shrubs

p10 = trees_df3 %>%  
  group_by(landfire, tree_sp) %>%  
  summarize(Count = n()) %>% 
  ggplot(aes(x=landfire, y=Count)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(17)) +
  xlab("landfire strata") + 
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))


p11 = trees_df3 %>%  
  group_by(climate, tree_sp) %>%  
  summarize(Count = n()) %>% 
  ggplot(aes(x=climate, y=Count)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(5)) +
  xlab("climate strata") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))


p12 = trees_df3 %>%  
  group_by(ecoregion, tree_sp) %>%  
  summarize(Count = n()) %>% 
  ggplot(aes(x=ecoregion, y=Count)) + 
  geom_bar(stat='identity', position= "dodge") +
  scale_x_discrete(labels = label_wrap(17)) +
  xlab("ecoregion strata") +
  theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1))

py = p10 + p11 + p12 & scale_y_continuous(limits = c(0, 400)) 

#Look into proportion of trees to shrubs?

plot(py)

Violin Plots for biomass per strata distribution visualization

ggplot(data = onemodel, aes(x = factor(landfire), y = bio)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  ggtitle("Violin Plot") +
  xlab("Landfire clusters") +
  ylab("AGB") +
  scale_x_discrete(labels = label_wrap(10))

  #theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 


ggplot(data = onemodel, aes(x = factor(climate), y = bio)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  ggtitle("Violin Plot") +
  xlab("Cli clusters") +
  ylab("AGB") +
  scale_x_discrete(labels = label_wrap(10))

  #theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 


ggplot(data = onemodel, aes(x = factor(ecoregion), y = bio)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  ggtitle("Violin Plot") +
  xlab("Eco clusters") +
  ylab("AGB") +
  scale_x_discrete(labels = label_wrap(10))

Elevation Investigation: Elevations of each plot center point plotted for each strata for each dataset

#will need to redo getData if metrics dataset changes (outliers are removed or added)
#df.onemodel = onemodel[c("plot.x","plot.y","plot.id", "landfire", "climate", "ecoregion")]
#ll_proj = "+proj=longlat +ellps=WGS84 +datum=WGS84 +no_defs"
#elev.onemodel = get_elev_point(df.onemodel, prj = ll_proj)
#elev.onemodel = as.data.frame(elev.onemodel)
elev.onemodel = read.csv("elev_onemodel.csv")
elev.onemodel = as.data.frame(elev.onemodel)

Elevation Distribution Violin Plots

ggplot(data = elev.onemodel, aes(x = landfire, y = elevation)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  ggtitle("Landfire Violin Plot") +
  xlab("Landfire clusters") +
  ylab("Elevation") +
  scale_x_discrete(labels = label_wrap(10))

  #theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 


ggplot(data = elev.onemodel, aes(x = climate, y = elevation)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  ggtitle("Climate Violin Plot") +
  xlab("Climate clusters") +
  ylab("Elevation") +
  scale_x_discrete(labels = label_wrap(10))

  #theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust=1)) 


ggplot(data = elev.onemodel, aes(x = ecoregion, y = elevation)) +
  geom_violin(draw_quantiles = c(0.25, 0.5, 0.75)) +
  ggtitle("Ecoregion Violin Plot") +
  xlab("Ecoregion clusters") +
  ylab("Elevation") +
  scale_x_discrete(labels = label_wrap(10))

Elevation Distribution Box Plots

pl.1 = ggplot(elev.onemodel, aes(landfire, elevation)) + geom_boxplot() +
   scale_x_discrete(labels = label_wrap(10))

pl.2 = ggplot(elev.onemodel, aes(climate, elevation)) + geom_boxplot() +
   scale_x_discrete(labels = label_wrap(10))

pl.3 = ggplot(elev.onemodel, aes(ecoregion, elevation)) + geom_boxplot() +
   scale_x_discrete(labels = label_wrap(10))

pl.1

pl.2

pl.3

Thesis

Jessie Eastburn

1/18/2023

VSURF for Metrics Data

Landfire

Koppen-Geiger Climate

Level III Ecoregions

One model to rule them all - random forest and mixed effects

Bootstrap the whole damn thing

Exploratory Analysis

Tree Species Barplots

Tree species sums for each dataset

Average height metrics between different strata

I don’t want no shrubs

Violin Plots for biomass per strata distribution visualization

Elevation Investigation: Elevations of each plot center point plotted for each strata for each dataset