I will present an application of logistic regression in a study of tree survival in a forest plot (100m by 100m) in Alberta (I used a part of the data from my master thesis project). The objective was to see whether tree survival would be affected by the stem density within a specific radius. In particular, I am interested to test whether the probability of survival of a particular tree species (Aspen) was depressed by a higher stem density of different species in its neighbourhood (a circle with a radius of 5 m).
# Setup envirnment
library(tidyverse)
library(caret)
library(GGally)
library(ggplot2)
library(corrplot)
library(bayesplot)
theme_set(bayesplot::theme_default(base_family = "sans"))
library(rstanarm)
options(mc.cores = parallel::detectCores())
library(broom)
load('logistic_regression.rdata')
splist=list('Aspen','Birch','White_spruce','OD','OS')
There are totally 6 columns in this data set. ‘Common’ refers the common name of the tree species. In this study, we have 5 species in total (‘Aspen’,‘Birch’,‘White_spruce’,‘OD’,‘OS’). ‘X’, ‘Y’ indicates the spatial coordinates of tree points. ‘Status1’, ‘STATUS2’: the living status of trees in 2010 and 2015, respectively.
model_data = gl.dat[,c(1,3,7,8,6,10)]
head(model_data)# initial dataset
## TreeID Common X Y status1 STATUS2
## 1 1 Aspen 1.37 4.66 alive dead
## 2 2 Aspen 3.84 0.92 alive alive
## 3 3 OD 5.26 1.30 dead dead
## 4 4 White_spruce 6.61 3.66 alive alive
## 5 5 White_spruce 12.85 3.34 alive alive
## 6 6 White_spruce 14.71 1.23 alive alive
t=table(model_data[,c(2,6,5)])
# table of living status in 2015 of alive trees in 2010
t[, , status1 = 'alive']
## STATUS2
## Common dead alive
## Aspen 59 285
## Birch 139 1348
## OD 36 143
## OS 305 598
## White_spruce 100 295
# plot the tree points
plot(model_data$X,model_data$Y,xlim=c(0,100),ylim=c(0,100),xlab = "X",ylab = "Y",col=model_data$Common)
legend(70,99,unique(model_data$Common),col=1:length(model_data$Common),pch=1)
I pre-processed the original data, to extract the predictors that I am interested in this study. Here, I calculated the number of tree stems within radius of 5 m for all tree points.
find_nbs=function(maxR){
nb2=list()
for(species in splist) {
dat_focal=gl.dat[gl.dat$Common==species,]
coord=as.list(as.data.frame(t(dat_focal[,c('X','Y','TreeID','DBH')])))
nb2[[species]]=lapply(coord,function(x){
data=all_nbs[[paste(x[1]%/%10+11,x[2]%/%10+11)]];
data$distsqrt=(data$X-x[1])*(data$X-x[1])+(data$Y-x[2])*(data$Y-x[2]);
nb=data[which(data$distsqrt<=maxR*maxR & data$TreeID!=x[3]),];
nb$dist=sqrt(nb$distsqrt);
nb$dist[nb$dist<.1]=.1
nb[,c('Common','DBH','dist','X','Y','STATUS2')]
})
}
return(nb2)
}
neighor_sp_stems=function(){
new_cols=c()
for (sp in splist) {
new_cols = append(new_cols,sp)
}
model_data[new_cols]=NA
model_data['L5']=NA
for(sp in splist){
for (i in 1:sum(model_data$Common==sp)) {
#number of aspens/birchs/white spruce within 10m
nb=find_nbs(maxR = 5)
#other deciduous and shrubs
for(spes in splist){
model_data[model_data$Common==sp,][spes][i,]=sum(nb[[sp]][[i]]$Common==spes)
model_data[model_data$Common==sp,]['L5'][i,]=sum(nb[[sp]][[i]]$STATUS2=='alive')
}
}
}
return(model_data)
}
#neighbor_stems=neighor_sp_stems()
#levels(neighbor_stems$STATUS2)<-c(0,1)
#It takes a while to run these code
#save(neighbor_stems,splist,file = 'nbs1.rdata')
load('nbs1.rdata')
# Now, I will use the new data set for logistic regression modelling
head(neighbor_stems)
## TreeID Common status1 X Y STATUS2 Aspen5 Birch5
## 1 1 Aspen alive 1.37 4.66 0 17 0
## 2 2 Aspen alive 3.84 0.92 1 5 9
## 3 3 OD dead 5.26 1.30 0 3 8
## 4 4 White_spruce alive 6.61 3.66 1 3 7
## 5 5 White_spruce alive 12.85 3.34 1 4 2
## 6 6 White_spruce alive 14.71 1.23 1 2 5
## White_spruce5 OD5 OS5 L5
## 1 5 4 33 37
## 2 1 3 6 10
## 3 1 2 0 5
## 4 3 3 0 6
## 5 3 4 0 5
## 6 3 3 6 8
# totally, we have 4250 tree points
str(neighbor_stems)
## 'data.frame': 4250 obs. of 12 variables:
## $ TreeID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Common : Factor w/ 5 levels "Aspen","Birch",..: 1 1 3 5 5 5 1 2 1 1 ...
## $ status1 : Factor w/ 2 levels "alive","dead": 1 1 2 1 1 1 2 1 1 1 ...
## $ X : num 1.37 3.84 5.26 6.61 12.85 ...
## $ Y : num 4.66 0.92 1.3 3.66 3.34 1.23 1.66 3.86 6.32 5.43 ...
## $ STATUS2 : num 0 1 0 1 1 1 0 0 1 1 ...
## $ Aspen5 : int 17 5 3 3 4 2 2 10 8 8 ...
## $ Birch5 : int 0 9 8 7 2 5 5 1 1 1 ...
## $ White_spruce5: int 5 1 1 3 3 3 3 1 0 2 ...
## $ OD5 : int 4 3 2 3 4 3 3 2 3 3 ...
## $ OS5 : int 33 6 0 0 0 6 8 2 1 0 ...
## $ L5 : int 37 10 5 6 5 8 10 7 4 5 ...
summary(neighbor_stems)
## TreeID Common status1 X
## Min. : 1 Aspen : 539 alive:3308 Min. : 0.00
## 1st Qu.:2036 Birch :1665 dead : 942 1st Qu.: 30.91
## Median :2816 OD : 237 Median : 57.53
## Mean :2746 OS :1310 Mean : 54.67
## 3rd Qu.:3925 White_spruce: 499 3rd Qu.: 80.21
## Max. :5000 Max. :100.00
## Y STATUS2 Aspen5 Birch5
## Min. : 0.00 Min. :0.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 19.25 1st Qu.:0.000 1st Qu.: 0.000 1st Qu.: 4.00
## Median : 51.90 Median :1.000 Median : 3.000 Median :14.00
## Mean : 50.92 Mean :0.628 Mean : 5.582 Mean :16.86
## 3rd Qu.: 81.35 3rd Qu.:1.000 3rd Qu.: 9.000 3rd Qu.:25.00
## Max. :100.00 Max. :1.000 Max. :42.000 Max. :98.00
## White_spruce5 OD5 OS5 L5
## Min. : 0.000 Min. : 0.00 Min. : 0.00 Min. : 1.00
## 1st Qu.: 1.000 1st Qu.: 0.00 1st Qu.: 0.00 1st Qu.:18.00
## Median : 2.000 Median : 1.00 Median :11.00 Median :27.00
## Mean : 3.242 Mean : 1.97 Mean :16.15 Mean :27.34
## 3rd Qu.: 4.000 3rd Qu.: 2.00 3rd Qu.:27.00 3rd Qu.:36.00
## Max. :29.000 Max. :26.00 Max. :73.00 Max. :72.00
aspen=neighbor_stems[neighbor_stems$Common=='Aspen',]
# when a tree is dead in 2010, its living status must be dead in 2015. Therefore, I only use tree points which was alived in 2010.
t
## , , status1 = alive
##
## STATUS2
## Common dead alive
## Aspen 59 285
## Birch 139 1348
## OD 36 143
## OS 305 598
## White_spruce 100 295
##
## , , status1 = dead
##
## STATUS2
## Common dead alive
## Aspen 195 0
## Birch 178 0
## OD 58 0
## OS 407 0
## White_spruce 104 0
# 344 obs. of live aspen
aspen=aspen[aspen$status1=='alive',]
# check the correlation
corrplot(cor(aspen[,c(6:12)]))
# change the STATUS2 to factor
neighbor_stems$STATUS2=factor(neighbor_stems$STATUS2)
In this analysis, I used tree survival as the binomial response variable in logistic regression modelling. Survival is a binary variable \(z_i\) that was assigned one if the plant was alive and zero if the plant was dead in 2015, where \(z_i \sim Bernoulli(p_i)\). Let \(p_i = P(z_i = 1)\) be the probability of survival of a plant i for a given observation \((1,x_{i1},...,x_{i6})\) of 6 preditors \(\beta_i\) and an intecept \(\beta_0\), which is stem densities of aspen, birch, white spruce, other shrubs (os), other deciduous (od) and all of living stems within a radius of 5 m (for accounting spatial autocorrelation). So write: \[logit(p_i)=\beta^TX_i\] \[p_i=P(z_i=1âz_{j(jâ i)},β,γ)=\frac{1}{1+\exp{-(β^T X_i+γâ_{j=1}^{n}z_j )}}\] When fix the levels of other \(x_k\), \(\exp(\beta_j)\) is the multiplicative effect on the odds of a 1-unit increase in \(x_j\) i.e. adding one stem of the \(j^{th}\) species within the radius of 5 m.
Tree survival is always spatially autocorrelated, which violates the assumption of independence. So, It is necassary to add a parameter \(γ\) accounting for spatial autocorrelation . In this case, I use total living stems around the subject tree within the radius of 5m as the predictor.
After accountting for the spatial autocorrelation, \(z_i\) was assumed to be independent, the joint probability function (likelihood) is: \[L(Z|β,γ)=â_{i=1}^{n}p_i^{z_i}(1-p_i)^{1-z_i}\] So, the log-likelihood is: \[\log(L(Z|β,γ))=â_{i=1}^{n}z_i\log(\frac{p_i}{1-p_i})+â_{i=1}^{n}\log(1-p_i) \\=â_{i=1}^{n}z_i\beta^TX-â_{i=1}^{n}\log(1+\exp(\beta^TX))\] Take partial derivative, the likelihood equations can be written as: \[\frac{\partial\log(L(Z|β,γ))}{\partial \beta}= X^T(Z-\hat{Z})\],where \(\hat{z}\) denote the estimated probability of tree survival, the MLE of \(\beta\) are obtained by solve the equation of RHS equal to zero. But there is no close form solution. Therefore, glm() use iterative method to approximately solve the equations and obtain the MLE estimators.
# Fit the logistic regression model
fit = glm(STATUS2~Aspen5+Birch5+White_spruce5+OD5+OS5+L5,data = aspen,family = binomial("logit"))
result=summary(fit)
result
##
## Call:
## glm(formula = STATUS2 ~ Aspen5 + Birch5 + White_spruce5 + OD5 +
## OS5 + L5, family = binomial("logit"), data = aspen)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4623 0.3127 0.4828 0.6228 1.4747
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.82118 0.52312 3.481 0.000499 ***
## Aspen5 -0.05913 0.02929 -2.019 0.043507 *
## Birch5 -0.12945 0.03339 -3.876 0.000106 ***
## White_spruce5 -0.13625 0.04104 -3.320 0.000901 ***
## OD5 -0.20632 0.07302 -2.825 0.004721 **
## OS5 -0.05413 0.01555 -3.481 0.000499 ***
## L5 0.13374 0.03103 4.310 1.63e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 315.29 on 343 degrees of freedom
## Residual deviance: 284.31 on 337 degrees of freedom
## AIC: 298.31
##
## Number of Fisher Scoring iterations: 5
#confidence intervals for log odds ratio
round(confint(fit),2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.83 2.88
## Aspen5 -0.12 0.00
## Birch5 -0.20 -0.07
## White_spruce5 -0.22 -0.06
## OD5 -0.35 -0.06
## OS5 -0.09 -0.02
## L5 0.07 0.20
# 95% CI for odds ratio
round(exp(confint(fit)),2)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 2.28 17.86
## Aspen5 0.89 1.00
## Birch5 0.82 0.94
## White_spruce5 0.80 0.95
## OD5 0.70 0.94
## OS5 0.92 0.98
## L5 1.08 1.22
car::vif(fit) # No multicollinearity
## Aspen5 Birch5 White_spruce5 OD5 OS5
## 2.391576 3.299899 1.603338 1.343553 2.469481
## L5
## 4.843316
We can check the linear relationship between predictor variables and the logit of the outcome by visually inspecting the scatter plot between each predictor and the logit values.
# Predict the probability (p) of diabete positivity
probabilities <- predict(fit, type = "response")
# Select predictors
mydata <- aspen[,c(7:12)]
predictors <- colnames(mydata)
# Bind the logit and tidying the data for plot
mydata <- mydata %>%
mutate(logit = log(probabilities/(1-probabilities))) %>%
gather(key = "predictors", value = "predictor.value", -logit)
ggplot(mydata, aes(predictor.value,logit))+
geom_point(size = 1.3, alpha = .3,color=9) +
geom_smooth(method = "lm",formula = y ~ splines::bs(x, 3),size = 1.4) +
theme_bw() +
facet_wrap(~predictors, scales = "free_x")
We can detect some nonlinear trend in OS5. Therefore, the model need some transformations to account the nonlinear relationships. For example, including some power terms, fractional polynomials and spline functions would be helpful. However, non-linear regression some times will cause other problems like over-fitting.
By plotting the Pearson residuals verses the predictors value individually, we can see which points are influential observations (i.e. point with absolute value much larger than 2).
pear.res=resid(fit, type="pearson")
# std.res=pear.res/sqrt(1 - influence(fit)$hat)
mydata2 <- aspen[,c(7:12)] %>%
mutate(pear.res=pear.res) %>%
gather(key = "predictors", value = "predictor.value", -pear.res)
ggplot(mydata2, aes(predictor.value, pear.res))+
geom_point(size = 1.5, alpha = .3,color = 9) +
theme_bw() +
facet_wrap(~predictors, scales = "free_x")
In Bayesian logistic regression, we start with an initial belief about the priors distributions of \(\beta\). So, independent priors for the components of the vector \(α=[β_0,β_1,â¦,β_6,γ]\) are specified as non-informative uniform distribution, which is \(p(α)â1.\) According to Bayes theorem, the posterior distribution of parameters \(α\) can be expressed as: \(f(αâZ)â L(Zâα)p(α)\).We can’t evaluate the closed form posterior, but can approximate the posterior by sampling methods e.g. Markov chain Monte Carlo (MCMC) methods by stan_glm().
post1 <- stan_glm(STATUS2 ~ Aspen5 + Birch5 + White_spruce5 + OD5 +
OS5 + L5, data = aspen,
family = binomial(link = "logit"),
prior = NULL, QR=TRUE)
#posterior distribution of estimated coefficients
mcmc_areas(
as.array(post1),
pars = c("Aspen5", "Birch5", "White_spruce5", "OD5","OS5","L5"),
prob = 0.95, # 95% intervals
prob_outer = 0.99,
point_est = "mean"
)
# Estimated coefficients
round(coef(post1), 2)
## (Intercept) Aspen5 Birch5 White_spruce5 OD5
## 1.84 -0.06 -0.13 -0.14 -0.21
## OS5 L5
## -0.06 0.14
# 95% Bayesian credible interval
round(posterior_interval(post1, prob = 0.95), 2)
## 2.5% 97.5%
## (Intercept) 0.82 2.93
## Aspen5 -0.12 0.00
## Birch5 -0.20 -0.07
## White_spruce5 -0.23 -0.06
## OD5 -0.36 -0.06
## OS5 -0.09 -0.02
## L5 0.08 0.20
The sign of \(\beta\) are all negative for the 5 species and positive for spatial autocorrelation, indicates the probability of survival of aspen is decreasing as the stem density of each species increases within the radius of 5m. Also, the survival rate of aspen was increasing as the number of living stems in the neighbourhood increasing, that indicates the survival was significantly spatial autocorrelated. An alternative way to fit logistic regression model is Bayesian approach. However, the estimated values of coefficients were quite similar to the MLE. The predictors of the model should be transforms to follow the logistic regression assumption.