1. Load canopy height model and generate quantiles

I generated a small, 1 m spatial resolution canopy height model from Katherine’s lidar data. I’ll read this in, and then generate a 30 m aggregate image with 101 bands, representing the 0th through the 100th quantiles of 1 m canopy height pixel values within each 30 m aggregate pixel.

library(stringr)
library(raster)
## Loading required package: sp
library(RStoolbox)
## Warning: package 'RStoolbox' was built under R version 4.0.5
# set working directory
setwd("S:/ursa/campbell/TEMP")

# load in canopy height model (don't mind the misspelling)
chm <- raster("cmh.tif")

# loop through every quantile from 0th to the 100th, aggregate, and add to stack
for (i in seq(0,1,0.01)){
  q <- aggregate(chm, 30, fun = function(x, ...) quantile(x,i, na.rm = T))
  if (i == 0){
    s <- stack(q)
  } else {
    s <- addLayer(s, q)
  }
}

# change the layer names
names(s) <- paste0("q.", str_pad(seq(0,100), 3, "left", "0"))

# plot the original chm and the 50th height quantile
par(mfrow = c(1,2))
plot(chm)
plot(s$q.050)

2. Get a random sample of pixels for plotting purposes

To get a sense for what these density plots will look like, I’ll take three random points: one in tall vegetation, one in medium vegetation, and one in low vegetation. To do this, I’ll reclassify the 50th quantile height image into three classes and generate a stratified random sample with one point in each class.

# reclassify the 50th height percentile image into three classes
rc <- matrix(data = c(-100,4,1,
                      4,8,2,
                      8,100,3),
             byrow = T,
             nrow = 3)
strata <- reclassify(s$q.050, rc)
plot(strata)

# generate random sample points
pts <- sampleStratified(strata, 1, sp = T)
plot(pts, add = T, pch = paste(pts$q.050))

3. Select low-vegetation point and plot out the density distribtion

I’ll now focus on the first point – the “low” vegetation point. First, I’ll extract the 1 m CHM within a 30 x 30 m box around the point just for visualization’s sake. I’ll then get the quantile values from the quantile image stack of the pixel in which the point falls. I’ll compare the original density distribution to that of the 100 quantiles. They should be pretty similar. I’ll also plot the quantiles themselves.

# select the first point
pt <- pts[1,]

# extract chm within 30 x 30 m box around point
pt.x <- pt@coords[1,1]
pt.y <- pt@coords[1,2]
box.xs <- c(pt.x - 15, pt.x - 15, pt.x + 15, pt.x + 15)
box.ys <- c(pt.y - 15, pt.y + 15, pt.y + 15, pt.y - 15)
box.xys <- cbind(box.xs, box.ys)
box.p <- Polygon(box.xys)
box.ps <- Polygons(list(box.p),1)
box.sps <- SpatialPolygons(list(box.ps), proj4string = crs(chm))
chm.clip <- crop(chm,box.sps)

# plot chm chip
plot(chm.clip)

# generate kernel density plot of raw data
d.orig <- stats::density(values(chm.clip), bw = 1)
par(mfrow = c(1,3))
par(mar = c(5,5,2,1))
plot(d.orig$x ~ d.orig$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "Original Data")

# extract the quantiles from the aggregate quantile image stack
df <- data.frame(q = seq(0,1,0.01),
                 h = s[pt$cell][1,])

# generate kernel density plot of quantile data
d.quant <- density(df$h, bw = 1)
plot(d.quant$x ~ d.quant$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "100 Quantiles")

# generate quantile plot
plot(h ~ q, data = df, ylim = c(min(d.orig$x), max(d.orig$x)), type = "l", lwd = 2,
     col = "blue", xlab = "Quantile", ylab = "Height (m)", las = 1, main = "Quantiles vs. Height")

4. Compare original distribution to those based on fewer quantiles

As can be seen, the 100-quantile density distribution is a very close representation of the full dataset. But, presumably, as you create density profiles from fewer and fewer quantiles, the degree to which they match the original dataset will decrease. In the interest of modeling as few quantiles as possible while retaining the highest degree of similarity to the original distribution, I’ll now create several density profiles based on smaller numbers of quantiles. To do this will require the creation of a function that generates random data from within each quantile bin. It will generate a sample of 900 points drawn from each quantile-based empirical distribution, and create a density plot based on those points.

# create a function to randomly sample within a quantile-based empirical distribution and plot the results
rand.quant.plot <- function(n.quants){
  n <- 900
  x <- rep(0,n)
  df.sub <- df[seq(1,101, length.out = n.quants),]
  for (i in seq(1,n)){
    ind <- sample(1:(nrow(df.sub)-1),1)
    x[i] <- runif(1,df.sub$h[ind],df.sub$h[ind+1])
  }
  d <- density(x,bw = 1)
  plot(d$x ~ d$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", ylim = c(min(d.orig$x), max(d.orig$x)), las = 1, 
     main = paste0(n.quants, " Quantiles"))
}

# create new multi-plot area
par(mfrow = c(2,3))
par(mar = c(5,5,2,1))

# plot the full dataset
plot(d.orig$x ~ d.orig$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "Original Data")

# plot the data
rand.quant.plot(50)
rand.quant.plot(25)
rand.quant.plot(12)
rand.quant.plot(6)
rand.quant.plot(3)

5. Analyze quantiles of medium vegetation height point

Now I’ll do the same with the second point – the medium vegetation height point.

# select the first point
pt <- pts[2,]

# extract chm within 30 x 30 m box around point
pt.x <- pt@coords[1,1]
pt.y <- pt@coords[1,2]
box.xs <- c(pt.x - 15, pt.x - 15, pt.x + 15, pt.x + 15)
box.ys <- c(pt.y - 15, pt.y + 15, pt.y + 15, pt.y - 15)
box.xys <- cbind(box.xs, box.ys)
box.p <- Polygon(box.xys)
box.ps <- Polygons(list(box.p),1)
box.sps <- SpatialPolygons(list(box.ps), proj4string = crs(chm))
chm.clip <- crop(chm,box.sps)

# plot chm chip
plot(chm.clip)

# generate kernel density plot of raw data
d.orig <- stats::density(values(chm.clip), bw = 1)
par(mfrow = c(1,3))
par(mar = c(5,5,2,1))
plot(d.orig$x ~ d.orig$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "Original Data")

# extract the quantiles from the aggregate quantile image stack
df <- data.frame(q = seq(0,1,0.01),
                 h = s[pt$cell][1,])

# generate kernel density plot of quantile data
d.quant <- density(df$h, bw = 1)
plot(d.quant$x ~ d.quant$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "100 Quantiles")

# generate quantile plot
plot(h ~ q, data = df, ylim = c(min(d.orig$x), max(d.orig$x)), type = "l", lwd = 2,
     col = "blue", xlab = "Quantile", ylab = "Height (m)", las = 1, main = "Quantiles vs. Height")

# create new multi-plot area
par(mfrow = c(2,3))
par(mar = c(5,5,2,1))

# plot the full dataset
plot(d.orig$x ~ d.orig$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "Original Data")

# plot the data
rand.quant.plot(50)
rand.quant.plot(25)
rand.quant.plot(12)
rand.quant.plot(6)
rand.quant.plot(3)

6. Analyze quantiles of tall vegetation height point

Now I’ll do the same with the third point – the tall vegetation height point.

# select the first point
pt <- pts[3,]

# extract chm within 30 x 30 m box around point
pt.x <- pt@coords[1,1]
pt.y <- pt@coords[1,2]
box.xs <- c(pt.x - 15, pt.x - 15, pt.x + 15, pt.x + 15)
box.ys <- c(pt.y - 15, pt.y + 15, pt.y + 15, pt.y - 15)
box.xys <- cbind(box.xs, box.ys)
box.p <- Polygon(box.xys)
box.ps <- Polygons(list(box.p),1)
box.sps <- SpatialPolygons(list(box.ps), proj4string = crs(chm))
chm.clip <- crop(chm,box.sps)

# plot chm chip
plot(chm.clip)

# generate kernel density plot of raw data
d.orig <- stats::density(values(chm.clip), bw = 1)
par(mfrow = c(1,3))
par(mar = c(5,5,2,1))
plot(d.orig$x ~ d.orig$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "Original Data")

# extract the quantiles from the aggregate quantile image stack
df <- data.frame(q = seq(0,1,0.01),
                 h = s[pt$cell][1,])

# generate kernel density plot of quantile data
d.quant <- density(df$h, bw = 1)
plot(d.quant$x ~ d.quant$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "100 Quantiles")

# generate quantile plot
plot(h ~ q, data = df, ylim = c(min(d.orig$x), max(d.orig$x)), type = "l", lwd = 2,
     col = "blue", xlab = "Quantile", ylab = "Height (m)", las = 1, main = "Quantiles vs. Height")

# create new multi-plot area
par(mfrow = c(2,3))
par(mar = c(5,5,2,1))

# plot the full dataset
plot(d.orig$x ~ d.orig$y, type = "l", xlab = "Density", ylab = "Height (m)", lwd = 2,
     col = "forestgreen", las = 1, main = "Original Data")

# plot the data
rand.quant.plot(50)
rand.quant.plot(25)
rand.quant.plot(12)
rand.quant.plot(6)
rand.quant.plot(3)

7. Test the effect of quantile number on density distribution representation

Looking at things visually is nice and all, but it would be more helpful to look at it quantitatively. That is, to compare the extent to which these density distributions generated using varying numbers of quantiles are similar to that of the entire dataset. To do this, I’ll generate 20 points in each of the three strata

# generate new sample points
pts <- sampleStratified(strata, 20, sp = T)

# loop through the points
for (i in seq(1,nrow(pts))){

  # define point
  pt <- pts[i,]
  
  # get the density curve of the 100-quantile distribution
  df <- data.frame(q = seq(0,1,0.01),
                   h = s[pt$cell][1,])
  d.main <- density(df$h, bw = 1)
  
  # loop through quantile counts from 2 to 99
  for (j in seq(2,99)){
    
    # generate the density curve of the lesser quantile distribution
    n <- 900
    x <- rep(0,n)
    df.sub <- df[seq(1,101, length.out = j),]
    for (k in seq(1,n)){
      ind <- sample(1:(nrow(df.sub)-1),1)
      x[k] <- runif(1,df.sub$h[ind],df.sub$h[ind+1])
    }
    d.comp <- density(x,bw = 1)
    
    # interpolate 100 values from main and comparison datasets
    y.main <- approx(x = d.main$x,
                     y = d.main$y,
                     xout = seq(min(d.comp$x), max(d.comp$x), length.out = 100))
    y.comp <- approx(x = d.comp$x,
                     y = d.comp$y,
                     xout = seq(min(d.comp$x), max(d.comp$x), length.out = 100))
    
    # calculate rmse and add to results data frame
    rmse <- sqrt(mean((y.main$y - y.comp$y) ^ 2))
    if (i == 1 & j == 2){
      results.df <- data.frame(num.quant = j,
                               rmse = rmse)
    } else {
      results.df <- rbind(results.df, data.frame(num.quant = j,
                               rmse = rmse))
    }
    
  }

}

# plot out the results
par(mar = c(5,5,1,1))
plot(rmse ~ num.quant,
     data = results.df,
     pch = 16,
     col = rgb(0,0,0,0.25),
     las = 1,
     ylab = "RMSE",
     xlab = "Number of Quantiles")

# summarize mean rmse by quantile number and add line to plot
rmse.mean <- tapply(results.df$rmse, results.df$num.quant, mean)
lines(rmse.mean ~ names(rmse.mean), lwd = 2, col = "red")