1 Introduction

I will present an application of logistic regression to a study of tree survival in a 100 m × 100 m forest plot in Alberta (using part of the data from my master's thesis project). The objective was to determine whether tree survival is affected by the stem density within a specific radius. In particular, I test whether the probability of survival of a particular tree species (aspen) is depressed by a higher stem density of other species in its neighbourhood (a circle with a radius of 5 m).

# Set up the environment
library(tidyverse)
library(caret)
library(GGally)
library(ggplot2)
library(corrplot)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans"))
library(rstanarm)
options(mc.cores = parallel::detectCores())
library(broom)
load('logistic_regression.rdata')
splist=list('Aspen','Birch','White_spruce','OD','OS')

2 Exploratory data analysis

2.1 Data structure

There are six columns in this data set. 'Common' gives the common name of the tree species; the study includes five species in total ('Aspen', 'Birch', 'White_spruce', 'OD' for other deciduous, and 'OS' for other shrubs). 'X' and 'Y' are the spatial coordinates of the tree points. 'status1' and 'STATUS2' record the living status of each tree in 2010 and 2015, respectively.

model_data = gl.dat[,c(1,3,7,8,6,10)] # keep TreeID, Common, X, Y, status1, STATUS2
head(model_data)# initial dataset
##   TreeID       Common     X    Y status1 STATUS2
## 1      1        Aspen  1.37 4.66   alive    dead
## 2      2        Aspen  3.84 0.92   alive   alive
## 3      3           OD  5.26 1.30    dead    dead
## 4      4 White_spruce  6.61 3.66   alive   alive
## 5      5 White_spruce 12.85 3.34   alive   alive
## 6      6 White_spruce 14.71 1.23   alive   alive
t=table(model_data[,c(2,6,5)])
# table of living status in 2015 of alive trees in 2010  
t[, , status1 = 'alive']
##               STATUS2
## Common         dead alive
##   Aspen          59   285
##   Birch         139  1348
##   OD             36   143
##   OS            305   598
##   White_spruce  100   295
# plot the tree points
plot(model_data$X, model_data$Y, xlim = c(0, 100), ylim = c(0, 100),
     xlab = "X", ylab = "Y", col = model_data$Common)
# match legend labels and colours to the factor levels used for plotting
legend(70, 99, levels(model_data$Common), col = 1:nlevels(model_data$Common), pch = 1)

2.2 Feature extraction

I pre-processed the original data to extract the predictors of interest for this study. Here, I calculate the number of tree stems (by species) within a radius of 5 m of every tree point.

# For each focal tree, find all neighbouring stems within maxR metres.
# all_nbs is the pre-built spatial lookup (loaded from logistic_regression.rdata)
# of tree points keyed by 10 m grid cell.
find_nbs = function(maxR){
  nb2 = list()
  for (species in splist) {
    dat_focal = gl.dat[gl.dat$Common == species, ]
    coord = as.list(as.data.frame(t(dat_focal[, c('X', 'Y', 'TreeID', 'DBH')])))
    nb2[[species]] = lapply(coord, function(x){
      data = all_nbs[[paste(x[1] %/% 10 + 11, x[2] %/% 10 + 11)]]
      # squared distance from the focal tree to every candidate neighbour
      data$dist2 = (data$X - x[1])^2 + (data$Y - x[2])^2
      # keep stems within maxR, excluding the focal tree itself
      nb = data[which(data$dist2 <= maxR^2 & data$TreeID != x[3]), ]
      nb$dist = sqrt(nb$dist2)
      nb$dist[nb$dist < .1] = .1  # floor very small distances
      nb[, c('Common', 'DBH', 'dist', 'X', 'Y', 'STATUS2')]
    })
  }
  return(nb2)
}

# Count, for every tree, the stems of each species (and all living stems)
# within a 5 m radius, stored in new columns named 'Aspen5', ..., 'OS5', 'L5'.
neighbor_sp_stems = function(){
  nb = find_nbs(maxR = 5)  # compute the neighbour lists once, outside the loops
  model_data[paste0(unlist(splist), '5')] = NA
  model_data['L5'] = NA    # all living stems within 5 m
  for (sp in splist) {
    for (i in 1:sum(model_data$Common == sp)) {
      # stems of each species in the neighbourhood of tree i
      for (spes in splist) {
        model_data[model_data$Common == sp, ][paste0(spes, '5')][i, ] =
          sum(nb[[sp]][[i]]$Common == spes)
      }
      # living stems of any species in the neighbourhood of tree i
      model_data[model_data$Common == sp, ]['L5'][i, ] =
        sum(nb[[sp]][[i]]$STATUS2 == 'alive')
    }
  }
  return(model_data)
}
# neighbor_stems = neighbor_sp_stems()
# levels(neighbor_stems$STATUS2) <- c(0, 1)  # recode dead/alive as 0/1
# This code takes a while to run, so the result was saved and is reloaded below:
# save(neighbor_stems, splist, file = 'nbs1.rdata')

2.3 Structure of the new data set

load('nbs1.rdata')
# Now, I will use the new data set for logistic regression modelling
head(neighbor_stems)
##   TreeID       Common status1     X    Y STATUS2 Aspen5 Birch5
## 1      1        Aspen   alive  1.37 4.66       0     17      0
## 2      2        Aspen   alive  3.84 0.92       1      5      9
## 3      3           OD    dead  5.26 1.30       0      3      8
## 4      4 White_spruce   alive  6.61 3.66       1      3      7
## 5      5 White_spruce   alive 12.85 3.34       1      4      2
## 6      6 White_spruce   alive 14.71 1.23       1      2      5
##   White_spruce5 OD5 OS5 L5
## 1             5   4  33 37
## 2             1   3   6 10
## 3             1   2   0  5
## 4             3   3   0  6
## 5             3   4   0  5
## 6             3   3   6  8
# in total, there are 4250 tree points
str(neighbor_stems)
## 'data.frame':    4250 obs. of  12 variables:
##  $ TreeID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Common       : Factor w/ 5 levels "Aspen","Birch",..: 1 1 3 5 5 5 1 2 1 1 ...
##  $ status1      : Factor w/ 2 levels "alive","dead": 1 1 2 1 1 1 2 1 1 1 ...
##  $ X            : num  1.37 3.84 5.26 6.61 12.85 ...
##  $ Y            : num  4.66 0.92 1.3 3.66 3.34 1.23 1.66 3.86 6.32 5.43 ...
##  $ STATUS2      : num  0 1 0 1 1 1 0 0 1 1 ...
##  $ Aspen5       : int  17 5 3 3 4 2 2 10 8 8 ...
##  $ Birch5       : int  0 9 8 7 2 5 5 1 1 1 ...
##  $ White_spruce5: int  5 1 1 3 3 3 3 1 0 2 ...
##  $ OD5          : int  4 3 2 3 4 3 3 2 3 3 ...
##  $ OS5          : int  33 6 0 0 0 6 8 2 1 0 ...
##  $ L5           : int  37 10 5 6 5 8 10 7 4 5 ...
summary(neighbor_stems)
##      TreeID              Common      status1           X         
##  Min.   :   1   Aspen       : 539   alive:3308   Min.   :  0.00  
##  1st Qu.:2036   Birch       :1665   dead : 942   1st Qu.: 30.91  
##  Median :2816   OD          : 237                Median : 57.53  
##  Mean   :2746   OS          :1310                Mean   : 54.67  
##  3rd Qu.:3925   White_spruce: 499                3rd Qu.: 80.21  
##  Max.   :5000                                    Max.   :100.00  
##        Y             STATUS2          Aspen5           Birch5     
##  Min.   :  0.00   Min.   :0.000   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 19.25   1st Qu.:0.000   1st Qu.: 0.000   1st Qu.: 4.00  
##  Median : 51.90   Median :1.000   Median : 3.000   Median :14.00  
##  Mean   : 50.92   Mean   :0.628   Mean   : 5.582   Mean   :16.86  
##  3rd Qu.: 81.35   3rd Qu.:1.000   3rd Qu.: 9.000   3rd Qu.:25.00  
##  Max.   :100.00   Max.   :1.000   Max.   :42.000   Max.   :98.00  
##  White_spruce5         OD5             OS5              L5       
##  Min.   : 0.000   Min.   : 0.00   Min.   : 0.00   Min.   : 1.00  
##  1st Qu.: 1.000   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.:18.00  
##  Median : 2.000   Median : 1.00   Median :11.00   Median :27.00  
##  Mean   : 3.242   Mean   : 1.97   Mean   :16.15   Mean   :27.34  
##  3rd Qu.: 4.000   3rd Qu.: 2.00   3rd Qu.:27.00   3rd Qu.:36.00  
##  Max.   :29.000   Max.   :26.00   Max.   :73.00   Max.   :72.00
aspen=neighbor_stems[neighbor_stems$Common=='Aspen',]
# A tree that was dead in 2010 must also be dead in 2015. Therefore, I only use tree points that were alive in 2010.
t
## , , status1 = alive
## 
##               STATUS2
## Common         dead alive
##   Aspen          59   285
##   Birch         139  1348
##   OD             36   143
##   OS            305   598
##   White_spruce  100   295
## 
## , , status1 = dead
## 
##               STATUS2
## Common         dead alive
##   Aspen         195     0
##   Birch         178     0
##   OD             58     0
##   OS            407     0
##   White_spruce  104     0
# 344 observations of aspens alive in 2010
aspen=aspen[aspen$status1=='alive',]
# check the correlation
corrplot(cor(aspen[,c(6:12)]))

# change the STATUS2 to factor
neighbor_stems$STATUS2=factor(neighbor_stems$STATUS2)

3 Multiple logistic regression

In this analysis, I used tree survival as the binary response variable in the logistic regression model. Survival is a binary variable \(z_i\) that equals one if the tree was alive and zero if it was dead in 2015, where \(z_i \sim \mathrm{Bernoulli}(p_i)\). Let \(p_i = P(z_i = 1)\) be the probability of survival of tree \(i\) for a given observation \((1, x_{i1}, \dots, x_{i6})\) of six predictors with coefficients \(\beta_1, \dots, \beta_6\) and an intercept \(\beta_0\): the stem densities of aspen, birch, white spruce, other shrubs (OS) and other deciduous (OD), plus the total number of living stems, all within a radius of 5 m (the last predictor accounting for spatial autocorrelation). So we can write: \[\mathrm{logit}(p_i) = \beta^T X_i\] \[p_i = P(z_i = 1 \mid z_{j (j \neq i)}, \beta, \gamma) = \frac{1}{1 + \exp\left(-\left(\beta^T X_i + \gamma \sum_{j=1}^{n} z_j\right)\right)}\] Holding the other \(x_k\) fixed, \(\exp(\beta_j)\) is the multiplicative effect on the odds of survival of a one-unit increase in \(x_j\), i.e. of adding one stem of the \(j\)-th species within the 5 m radius.
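As a quick numeric illustration of this odds-ratio interpretation (a sketch with a hypothetical coefficient, not a fitted value), base R's qlogis() and plogis() give the logit and its inverse:

# Hypothetical coefficient: one extra birch stem multiplies the odds by exp(-0.13)
beta_j = -0.13
exp(beta_j)                 # odds ratio per added stem, ~0.88
p = 0.8                     # a hypothetical survival probability
p / (1 - p)                 # the corresponding odds
plogis(qlogis(p) + beta_j)  # survival probability after adding one stem, ~0.78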

Tree survival is typically spatially autocorrelated, which violates the assumption of independent observations. It is therefore necessary to add a parameter \(\gamma\) to account for spatial autocorrelation. In this case, I use the total number of living stems within a 5 m radius of the subject tree as this predictor.

3.1 Maximum likelihood method

After accounting for the spatial autocorrelation, the \(z_i\) are assumed to be independent, and the joint probability function (likelihood) is: \[L(Z \mid \beta, \gamma) = \prod_{i=1}^{n} p_i^{z_i} (1 - p_i)^{1 - z_i}\] So the log-likelihood is: \[\log L(Z \mid \beta, \gamma) = \sum_{i=1}^{n} z_i \log\left(\frac{p_i}{1 - p_i}\right) + \sum_{i=1}^{n} \log(1 - p_i) = \sum_{i=1}^{n} z_i \beta^T X_i - \sum_{i=1}^{n} \log\left(1 + \exp(\beta^T X_i)\right)\] Taking the partial derivative with respect to \(\beta\), the likelihood equations can be written as: \[\frac{\partial \log L(Z \mid \beta, \gamma)}{\partial \beta} = X^T (Z - \hat{Z}),\] where \(\hat{Z}\) denotes the vector of estimated survival probabilities. The MLE of \(\beta\) is obtained by setting the right-hand side equal to zero, but there is no closed-form solution. Therefore, glm() uses an iterative method (Fisher scoring) to solve the equations approximately and obtain the MLE.
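To make the iteration concrete, here is a minimal sketch of the Fisher-scoring (iteratively reweighted least squares) update that glm() performs; irls_logistic() is a hypothetical helper assuming a design matrix X with an intercept column and a 0/1 response z:

# Minimal Fisher-scoring (IRLS) sketch for logistic regression
irls_logistic = function(X, z, tol = 1e-8, max_iter = 25){
  beta = rep(0, ncol(X))                  # start from beta = 0
  for (iter in 1:max_iter) {
    p = 1 / (1 + exp(-X %*% beta))        # current fitted probabilities
    W = as.vector(p * (1 - p))            # Bernoulli variance weights
    # solve (X^T W X) step = X^T (z - p), the Fisher-scoring system
    step = solve(t(X) %*% (W * X), t(X) %*% (z - p))
    beta = beta + step
    if (max(abs(step)) < tol) break       # stop when the update is tiny
  }
  drop(beta)
}
# Sanity check: should closely reproduce coef(fit) from the glm() call below
# irls_logistic(model.matrix(fit), fit$y)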

# Fit the logistic regression model
fit = glm(STATUS2~Aspen5+Birch5+White_spruce5+OD5+OS5+L5,data = aspen,family = binomial("logit"))
result=summary(fit)
result
## 
## Call:
## glm(formula = STATUS2 ~ Aspen5 + Birch5 + White_spruce5 + OD5 + 
##     OS5 + L5, family = binomial("logit"), data = aspen)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4623   0.3127   0.4828   0.6228   1.4747  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.82118    0.52312   3.481 0.000499 ***
## Aspen5        -0.05913    0.02929  -2.019 0.043507 *  
## Birch5        -0.12945    0.03339  -3.876 0.000106 ***
## White_spruce5 -0.13625    0.04104  -3.320 0.000901 ***
## OD5           -0.20632    0.07302  -2.825 0.004721 ** 
## OS5           -0.05413    0.01555  -3.481 0.000499 ***
## L5             0.13374    0.03103   4.310 1.63e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 315.29  on 343  degrees of freedom
## Residual deviance: 284.31  on 337  degrees of freedom
## AIC: 298.31
## 
## Number of Fisher Scoring iterations: 5
#confidence intervals for log odds ratio
round(confint(fit),2)
## Waiting for profiling to be done...
##               2.5 % 97.5 %
## (Intercept)    0.83   2.88
## Aspen5        -0.12   0.00
## Birch5        -0.20  -0.07
## White_spruce5 -0.22  -0.06
## OD5           -0.35  -0.06
## OS5           -0.09  -0.02
## L5             0.07   0.20
# 95% CI for odds ratio
round(exp(confint(fit)),2)
## Waiting for profiling to be done...
##               2.5 % 97.5 %
## (Intercept)    2.28  17.86
## Aspen5         0.89   1.00
## Birch5         0.82   0.94
## White_spruce5  0.80   0.95
## OD5            0.70   0.94
## OS5            0.92   0.98
## L5             1.08   1.22

3.2 Model diagnostics

(1) Multicollinearity

car::vif(fit) # all VIFs are below 5, so multicollinearity is not a concern
##        Aspen5        Birch5 White_spruce5           OD5           OS5 
##      2.391576      3.299899      1.603338      1.343553      2.469481 
##            L5 
##      4.843316

(2) Linearity assumption

We can check the assumption of a linear relationship between each predictor and the logit of the outcome by visually inspecting a scatter plot of each predictor against the logit values.

# Predicted probability (p) of tree survival
probabilities <- predict(fit, type = "response")
# Select predictors
mydata <- aspen[,c(7:12)]
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata <- mydata %>%
  mutate(logit = log(probabilities/(1-probabilities))) %>%
  gather(key = "predictors", value = "predictor.value", -logit)
ggplot(mydata, aes(predictor.value,logit))+
  geom_point(size = 1.3, alpha = .3,color=9) +
  geom_smooth(method = "lm",formula = y ~ splines::bs(x, 3),size = 1.4) + 
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_x")

We can detect a nonlinear trend for OS5. Therefore, the model needs some transformation to account for this nonlinear relationship; for example, adding power terms, fractional polynomials or spline functions would help. However, nonlinear terms can sometimes cause other problems, such as over-fitting.
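As an illustration (a sketch under the assumption that a spline on OS5 is the transformation of interest, not part of the original analysis), one could refit the model with a natural spline on OS5 and compare the two fits by AIC:

# Refit with a 3-df natural spline on OS5 to relax the linearity assumption
fit_ns = glm(STATUS2 ~ Aspen5 + Birch5 + White_spruce5 + OD5 +
               splines::ns(OS5, df = 3) + L5,
             data = aspen, family = binomial("logit"))
AIC(fit, fit_ns)  # a lower AIC would favour the spline model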

(3) Influential values

By plotting the Pearson residuals against each predictor individually, we can see which points are influential observations (i.e. points with an absolute residual much larger than 2).

pear.res = resid(fit, type = "pearson")
# standardized residuals (an alternative): pear.res / sqrt(1 - influence(fit)$hat)
mydata2 <- aspen[,c(7:12)] %>%
  mutate(pear.res=pear.res) %>%
  gather(key = "predictors", value = "predictor.value", -pear.res)
ggplot(mydata2, aes(predictor.value, pear.res))+
  geom_point(size = 1.5, alpha = .3,color = 9) +
  theme_bw() + 
  facet_wrap(~predictors, scales = "free_x")

3.3 Bayesian method (an alternative way to fit the logistic regression model)

In Bayesian logistic regression, we start with a prior belief about the distributions of the parameters. Here, independent priors for the components of the vector \(\alpha = [\beta_0, \beta_1, \dots, \beta_6, \gamma]\) are specified as non-informative (flat) distributions, i.e. \(p(\alpha) \propto 1\). By Bayes' theorem, the posterior distribution of \(\alpha\) can be expressed as \(f(\alpha \mid Z) \propto L(Z \mid \alpha)\, p(\alpha)\). We cannot evaluate this posterior in closed form, but we can approximate it with sampling methods, e.g. Markov chain Monte Carlo (MCMC), via stan_glm().

post1 <- stan_glm(STATUS2 ~ Aspen5 + Birch5 + White_spruce5 + OD5 +
                  OS5 + L5, data = aspen,
                  family = binomial(link = "logit"),
                  prior = NULL, QR=TRUE)
#posterior distribution of estimated coefficients
mcmc_areas(
  as.array(post1),
  pars = c("Aspen5", "Birch5", "White_spruce5", "OD5","OS5","L5"),
  prob = 0.95, # 95% intervals
  prob_outer = 0.99,
  point_est = "mean"
)
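With flat priors the posterior medians should be close to the maximum likelihood estimates from Section 3.1; a quick sanity check (a sketch, not part of the original output):

# coef() on a stanreg object returns posterior medians; compare with the MLEs
round(cbind(MLE = coef(fit), Bayes = coef(post1)), 3)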