npudens vs npudist

Run npudensbw() & npudens() and npudistbw() & npudist()

the vector d is the distance to water (meters) value for a randomly selected site Each value corresponds to one of the 26 10x10-meters cells that make up this site

npudensbw() - computes a bandwidth object for a kernel unconditional density estimator

npudens() - computes kernel unconditional density estimates on evaluation data

npudistbw() - computes a bandwidth object for a kernel cumulative distribution estimator

npudist() - computes kernel unconditional cumulative distribution estimates on evaluation data

## Warning: package 'np' was built under R version 3.1.3

d <-  c(21.3,31.9, 42.5, 15.0, 23.8, 33.6, 43.8, 54.2, 23.8, 30.1, 38.3, 47.6, 57.3,
        30.1, 38.3, 45.1, 53.2, 62.0, 38.3, 45.1, 53.2, 60.2,68.1, 53.2, 60.2, 68.1)
d <- round(d,0)

bwDens <- npudensbw(d)        # compute bandwidth for KDE
fDens <- npudens(bws=bwDens)  # fit bandwith to d
pDens <- fitted(fDens)        # get the fitted densities for values of d
bwDist <- npudistbw(d)        # compute bandwidth for CDF
fDist <- npudist(bws=bwDist)  # fit bandwidth to d
pDist <- fitted(fDist)        # get the fitted values for d < x

print(pDens)                  # printed fitted density values

##  [1] 0.010948814 0.017942586 0.020953539 0.006631671 0.013111824
##  [6] 0.018835688 0.021140124 0.019900929 0.013111824 0.016910088
## [11] 0.020181747 0.021124458 0.018569916 0.016910088 0.020181747
## [16] 0.021185635 0.020228553 0.015209063 0.020181747 0.021185635
## [21] 0.020228553 0.016710012 0.010008390 0.020228553 0.016710012
## [26] 0.010008390

print(pDist)                  # print fitted cumulative probabilities

##  [1] 0.11478027 0.27101183 0.46013709 0.06085512 0.15029552 0.30621965
##  [7] 0.50052552 0.69880705 0.15029552 0.23761346 0.38109036 0.58154037
## [13] 0.75302246 0.23761346 0.38109036 0.52080736 0.67993579 0.83296168
## [19] 0.38109036 0.52080736 0.67993579 0.80277094 0.90704296 0.67993579
## [25] 0.80277094 0.90704296

plot(pDens~pDist)             # plot desnity ~ probabilities

plot(bwDens)                  # plot KDE

plot(bwDist)                  # plot CDF

Simulate a bunch of data points that lie at standard deviation distances on either side of site data mean

# inefficient routine to simualte data points for +/- 10 std devs on either side of site data mean
meanX1 <- round(mean(d),1)  # get mean
sdX1 <- round(sd(d),1)      # get SD
simX1v <- meanX1            # start simulated data with mean
for(i in 1:10){             # loop to add 10 positive StdDevs
  simX1 <- meanX1 + sdX1*i
  simX1v <- c(simX1v, simX1)# add sim data to right of mean
}
simX1vneg <- NULL
for(i in 10:1){             # loop to get 10 negative StdDevs
  simX1 <- meanX1 - sdX1*i
  simX1vneg <- c(simX1vneg, simX1)
}
simData <- c(simX1vneg, simX1v)# add sim data to left of mean 
print(simData) # this is waht it looks like

##  [1] -103.3  -88.6  -73.9  -59.2  -44.5  -29.8  -15.1   -0.4   14.3   29.0
## [11]   43.7   58.4   73.1   87.8  102.5  117.2  131.9  146.6  161.3  176.0
## [21]  190.7

Get fitted values of density and cumulative probability using bws objects created above and sim data

fDensSIM <- npudens(bws=bwDens, edat=simData) # fit KDE to sim data unsing empirical bws object
pDensSIM <- fitted(fDensSIM)                  # get fitted density values for sim data
fDistSIM <- npudist(bws=bwDist, edat=simData) # fit CDF to sim data using empirical bws object
pDistSIM <- fitted(fDistSIM)                  # get fitted probabilities for sim data

print(pDensSIM)

##  [1] 9.613303e-44 2.331940e-34 3.230724e-26 2.558396e-19 1.161284e-13
##  [6] 3.052540e-09 4.830597e-06 5.309158e-04 6.165481e-03 1.634496e-02
## [11] 2.112027e-02 1.776563e-02 5.783725e-03 3.117198e-04 1.388265e-06
## [16] 3.879342e-10 6.335864e-15 5.944015e-21 3.189029e-28 9.773005e-37
## [21] 1.710198e-46

sum(pDensSIM)

## [1] 0.06802891

print(pDistSIM)

##  [1] 0.000000e+00 0.000000e+00 2.989062e-17 5.602506e-13 1.935109e-09
##  [6] 1.286030e-06 1.768746e-04 5.768421e-03 5.600025e-02 2.216617e-01
## [11] 4.944482e-01 7.768588e-01 9.497701e-01 9.959267e-01 9.999117e-01
## [16] 9.999996e-01 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
## [21] 1.000000e+00

sum(pDistSIM)

## [1] 10.50052

simPlot <- data.frame(X1 = simData, pDensSIM = pDensSIM, pDistSIM = pDistSIM)
plot(pDensSIM~pDistSIM)   # plot simulated dens~dist

# plot KDE of simualted data (red line is empirical mean)
ggplot(simPlot, aes(x = X1, y=pDensSIM)) + geom_line() + geom_vline(xintercept = meanX1, color="red")

# plot CDF of simulated data (red line is empirical mean)
ggplot(simPlot, aes(x = X1, y=pDistSIM)) + geom_line() + geom_vline(xintercept = meanX1, color="red")

Results

From what I see, if I evaluate the density or cumulative distribution functions on the mean of the sites data I get either a density of 0.02 (the max density of the empirical data) or a cumulative probability of 0.5 (which is the mean of the empirical data). for the KDE function generated from npudensbw & npudens, simulated values higher and lower than the empirical mean drop quickly in their fitted density. Not surprising; as values get further away from that of the site, the have a lower density. For the cumulative distribution of the npudistbw & npudist functions, as values get higher than the mean,the probability approaches 1 and visa verse for lower simulated data points. This again makes sense because as the simulated values get higher, there is a greater probability that they are in the distribution; that there is greater area under the distribution for values x < x+1.

For this project, this means that we are either predicting for the density of an observation or predicting for the are under the distribution to the left of the observation (cumulative probability). When predicting for a density, the resultant value is generally small and variable based on bandwidth and sample size; therefore not very comparable across sites. If the cumulative probability is the estimate, then any Loi value greater than the mean of the site will be increasingly probable to the point the very high values (10 sigma here) have the probability of 1.

I believe what we need is the probability that a data value is between a range of values under the empirical distribution; not the density or cumulative probability. Figure (4) of the Arch Sci. paper suggests this, but I cannot find how the code gets this.

quick bi-varaite example

d1 <-  round(c(21.3,31.9, 42.5, 15.0, 23.8, 33.6, 43.8, 54.2, 23.8, 30.1, 38.3, 47.6, 57.3,
        30.1, 38.3, 45.1, 53.2, 62.0, 38.3, 45.1, 53.2, 60.2,68.1, 53.2, 60.2, 68.1),0)
d2 <- factor(sample(c(1:6),length(d1), replace=TRUE), ordered=TRUE) # simualated slope percent
d <- data.frame(X1 = d1, X2 = d2)
bwDens <- npudensbw(d)
fDens <- npudens(bws=bwDens)
pDens <- fitted(fDens)
bwDist <- npudistbw(d)
fDist <- npudist(bws=bwDist)
pDist <- fitted(fDist)

### SIMULATE DATA UP TO 10 StdDevs on EITHER SIDE OF SITE DATA MEAN
meanX1 <- round(mean(d$X1),1)
medX2 <- median(as.numeric(d$X2))
varX2 <- round(sd(as.numeric(d$X2)),0)
sdX1 <- round(sd(d$X1),1)
simX1v <- meanX1
simX2v <- medX2
for(i in 1:10){
  simX1 <- meanX1 + sdX1*i
  simX1v <- c(simX1v, simX1)
  simX2 <- medX2 + varX2*i
  simX2v <- c(simX2v, simX2)
}
simX1vneg <- NULL
simX2vneg <- NULL
for(i in 10:1){
  simX1 <- meanX1 - sdX1*i
  simX1vneg <- c(simX1vneg, simX1)
  simX2 <- medX2 - varX2*i
  simX2vneg <- c(simX2vneg, simX2)
}
simX1vec <- c(simX1vneg, simX1v)
simX2vec <- factor(c(simX2vneg, simX2v), ordered=TRUE)
simData <- data.frame(X1 = simX1vec, X2 = simX2vec)
### REFIT SIM DATA using SIET DATA bws
fDensSIM <- npudens(bws=bwDens, edat=simData)
pDensSIM <- fitted(fDensSIM)

fDistSIM <- npudist(bws=bwDist, edat=simData)
pDistSIM <- fitted(fDistSIM)

print(pDensSIM)

##  [1] 1.705805e-43 8.717659e-36 6.640691e-29 7.652090e-23 1.374821e-17
##  [6] 4.066214e-13 2.135461e-09 2.155203e-06 4.653524e-04 5.877816e-03
## [11] 5.863083e-03 4.766954e-03 5.109967e-05 1.375352e-07 8.331937e-11
## [16] 1.012497e-14 2.138351e-19 7.137551e-25 3.605642e-31 2.709989e-38
## [21] 3.010869e-46

print(pDistSIM)

##  [1] 0.000000e+00 4.169536e-45 7.466551e-38 4.567393e-31 9.308623e-25
##  [6] 6.392735e-19 1.523214e-13 1.346547e-08 5.266949e-04 1.419747e-01
## [11] 4.199394e-01 7.592057e-01 9.319599e-01 9.903323e-01 9.994176e-01
## [16] 9.999868e-01 9.999999e-01 1.000000e+00 1.000000e+00 1.000000e+00
## [21] 1.000000e+00

# plot bivariate KDE
plot(fDensSIM, view='fixed', theta=10)

# plot bivariate CDF
plot(fDistSIM, view='fixed', theta=2)