A GLM tutorial provided by Dr. T Caughlin here. These are just the follow along notes.
halo <- read.csv("halo.season2.csv")
head(halo) %>%
gt()%>%
opt_stylize(style = 2, color="gray")
| date | month | day | year | dayofweek | sample | time | cell_count_A | cell_count_B | cell_count_C | ambientweather.F | precip_in | precip_mm | tide_ft | highest.high | lowest.just.prior | lowest.low | tide_range_calc_1 | tide_range_calc_3 | tiderange_3 | wind | wind_direction | windspeedanddirection.mph | photoalbum | collector | picker | lunar_illumination_percent | next_full_days | lunar_distance_mi | nitrate_mg | nitrate_volts | act_cond_us_cm | spc_cond_um_cm | salinity_PSU | resistivity_ohm_cm | density | total_diss_solids_ppt | ph | ph_mv | orp_mv | chl_a_flu_rfu | water_temp_c | baro_pressure_mmhg | pressure_psi | depth_ft | ext_volts | bat_capacity | lat | long | Week |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12/1/2023 | 12 | 1 | 2023 | Friday | 1 | 5:30 PM | 0 | NA | NA | 41 | 0.28 | 7.112 | 6.67 | 9.40 | 6.78 | -1.38 | -0.11 | 10.78 | 3.285744 | 6 | S | S6 | 120123 | JB | JB | 83 | 25 | 247101 | 28.262569 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | 48.725417 | -122.507581 | One |
| 12/2/2023 | 12 | 2 | 2023 | Saturday | 2 | 10:02 AM | 0 | NA | NA | 45 | 0.39 | 9.906 | 9.18 | 9.18 | -0.73 | -0.73 | 9.91 | 9.91 | 3.020568 | 5 | S | S5 | 120223 | JB | JB | 72 | 24 | 249605 | 0.91175 | NA | 29514.88924 | 43101.90371 | 27.21819 | 34.04475 | 1.0211 | 28.01624 | 7.6796 | -45.37576 | 275.2353 | 0.00662 | 8.4969 | 743.397 | 14.57792 | 33.46395 | 0 | 93 | 48.72545 | -122.50759 | One |
| 12/3/2023 | 12 | 3 | 2023 | Sunday | 3 | 10:43 AM | 0 | NA | NA | 44 | 0.11 | 2.794 | 8.97 | 8.97 | 0.01 | 0.01 | 8.96 | 8.96 | 2.731008 | 20 | S | S20 | 120323 | JB | JB | 64 | 23 | 250758 | 42.84298 | NA | 32863.91042 | 47810.85762 | 30.53366 | 30.42876 | 1.02368 | 31.07706 | 8.16124 | -43.40457 | 219.65986 | 0.00373 | 8.70334 | 743.397 | 14.65751 | 33.64568 | 0 | 92 | 48.72539 | -122.5076 | One |
| 12/4/2023 | 12 | 4 | 2023 | Monday | 4 | 10:56 AM | 0 | NA | NA | 44 | 0.15 | 3.810 | 8.43 | 8.79 | 0.83 | 0.83 | 7.60 | 7.96 | 2.426208 | 7 | N | N7 | 120423 | JB | AJ | 54 | 22 | 251148 | 21.45113 | 61.71999 | 30027.94771 | 44527.21381 | 28.13971 | 33.30287 | 1.0219 | 28.94269 | 8.22413 | -46.79762 | 190.61506 | 0.00327 | 7.95347 | 743.397 | 14.82278 | 34.01786 | 0 | 92 | 48.72537 | -122.5076 | One |
| 12/5/2023 | 12 | 5 | 2023 | Tuesday | 5 | 11:46 AM | 0 | NA | NA | 51 | 0.96 | 24.384 | 8.65 | 8.65 | 1.74 | 1.74 | 6.91 | 6.91 | 2.106168 | 3 | S | S3 | 120523 | JB | JB | 45 | 21 | 250730 | 30.08332 | 53.98064 | 33779.55717 | 48485.03633 | 31.07493 | 29.60384 | 1.02403 | 31.51527 | 8.25449 | -48.69966 | 206.55585 | 0.00144 | 9.16134 | 743.397 | 14.75961 | 33.87267 | 0 | 91 | 48.72545 | -122.50763 | One |
| 12/6/2023 | 12 | 6 | 2023 | Wednesday | 6 | 11:38 AM | 0 | NA | NA | 48 | 0.40 | 10.160 | 8.14 | 8.56 | 2.71 | 2.71 | 5.43 | 5.85 | 1.783080 | 0 | CALM | Calm | 120623 | JB | AJ | 39 | 20 | 249520 | 16.1092 | 68.63968 | 19552.55186 | 28587.31329 | 17.34768 | 51.14596 | 1.0134 | 18.58175 | 8.31576 | -52.00737 | 216.59472 | 0.00346 | 8.48911 | 743.397 | 14.64821 | 33.61533 | 0.00174 | 91 | 48.72545 | -122.50761 | One |
Explore correlations between number data for x variables.
mean(halo$cell_count_A, na.rm=T)
## [1] 3.756522
var(halo$cell_count_A, na.rm = T)
## [1] 392.5718
max(halo$cell_count_A, na.rm=T)
## [1] 206
DISTRIBUTION (Y-variable)
Variance > Mean | Zero Inflation | Big Values \(\rightarrow\) Negative Binomial
VARIABLES
Predictors: tide_ft,
lunar_illumination_percent, nitrate_mg,
water_temp_c
Response: cell_count_A
forggpairs <- halo[,c(14, 27, 30, 42)]
#nitrates and water temp are "chr" in the data frame (found by taking summary(forggpairs), so need to convert to numeric for R to process)
forggpairs.numbers <- forggpairs %>%
mutate(across(everything(), ~ as.numeric(as.character(.))))
ggpairs(forggpairs.numbers)+
scale_color_brewer(palette='Dark2', direction=-1)
Figure 1: Showing correlation between variables, however, nothing is huge…so I will leave all covariates in the mix. (-0.4 between water temperature and tidal height)
Number variables: tidal height (ft), lunar illumination, nitrates, water temperature (°C)
#built function for rescaling quick
rescale=function(x) {return((x - mean(x, na.rm = TRUE)) / (2 * sd(x, na.rm = TRUE)))}
numericsOnly_std<-apply(forggpairs.numbers,2,rescale)
head(numericsOnly_std)
## tide_ft lunar_illumination_percent nitrate_mg water_temp_c
## [1,] -0.87863063 0.46095740 1.3399314 NA
## [2,] 0.48131752 0.30233087 -0.9173286 0.13403018
## [3,] 0.36753700 0.18696613 2.5432511 0.20546199
## [4,] 0.07495851 0.04276019 0.7777841 -0.05400599
## [5,] 0.19415716 -0.08702515 1.4901978 0.36393791
## [6,] -0.08216697 -0.17354871 0.3369153 0.13133471
Add the rescaled columns back to the df.
#let's add the columns we just created (the scaling for the numeric variables) back onto our og dataframe so we can use the scaled data easily.
colnames(numericsOnly_std)<-paste(colnames(numericsOnly_std), "_scaled", sep="") #add the tag scaled to the numerics
halo_ready<-data.frame(numericsOnly_std,halo)
Again, just using priors within the brm function with
the assumption that they all have equal probability
(flat priors).
m2 <-brm(cell_count_A~tide_ft_scaled+lunar_illumination_percent_scaled+nitrate_mg_scaled+water_temp_c_scaled,family=negbinomial(),data=halo_ready, refresh=0)
summary(m2)
## Family: negbinomial
## Links: mu = log
## Formula: cell_count_A ~ tide_ft_scaled + lunar_illumination_percent_scaled + nitrate_mg_scaled + water_temp_c_scaled
## Data: halo_ready (Number of observations: 114)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat
## Intercept 0.81 0.22 0.40 1.28 1.00
## tide_ft_scaled -0.99 0.59 -2.22 0.15 1.00
## lunar_illumination_percent_scaled -1.90 0.41 -2.69 -1.08 1.00
## nitrate_mg_scaled -1.49 0.93 -3.45 0.20 1.00
## water_temp_c_scaled -1.03 0.83 -2.66 0.62 1.00
## Bulk_ESS Tail_ESS
## Intercept 5150 3020
## tide_ft_scaled 4189 3081
## lunar_illumination_percent_scaled 5220 3140
## nitrate_mg_scaled 4492 2958
## water_temp_c_scaled 4730 2958
##
## Further Distributional Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## shape 0.23 0.04 0.16 0.33 1.00 3791 2671
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
draws2 <- as.data.frame(m2)
m2_long <- draws2 %>%
select(b_tide_ft_scaled,b_lunar_illumination_percent_scaled, b_nitrate_mg_scaled, b_water_temp_c_scaled ) %>%
mutate(across(everything(), exp)) %>% # exponentiated, so this changes the no effect point on the graph to 1 instead of 0
pivot_longer(everything(),
names_to = "parameter",
values_to = "value")
p6 <- ggplot(m2_long, aes(x = value, y = parameter, fill = parameter)) +
stat_halfeye(.width = c(.8, .95), point_interval = median_qi) +
geom_vline(xintercept = 1, linetype = "dashed") + #moved to 1 because exp is no effect at 1
scale_y_discrete(labels = c(b_water_temp_c_scaled = "Temperature", b_tide_ft_scaled = "Tidal Height", b_nitrate_mg_scaled = "Nitrates", b_lunar_illumination_percent_scaled = "Illumination")) +
coord_cartesian(xlim = c(0.0002, 1.8)) +
theme(legend.position = "none")+
labs(title = "Environmental drivers of cell count",
subtitle = "Negative binomial model; effects are multiplicative (exp(β))",
x = "Multiplicative effect on expected cell count (exp(β))", y="")
p6
Figure 2: Values > 1 → increase expected cell count, Values < 1 → decrease expected cell count. The strongest effect on cell count appears to be lunar illumination. About 15% of the variance in cell count is explained by the chosen predictors (see R-squared calculation in part E below.
mean(draws2$b_water_temp_c_scaled < 0)
## [1] 0.89725
Approximately 90% of posterior probability mass lies below 0.
t_min <- min(halo_ready$water_temp_c_scaled, na.rm=T)
t_max <- max(halo_ready$water_temp_c_scaled, na.rm = T)
min_t <- exp(draws2$b_Intercept+draws2$b_water_temp_c_scaled*t_min)
max_t <- exp(draws2$b_Intercept+draws2$b_water_temp_c_scaled*t_max)
effectSize_temp<- data.frame(max_t-min_t)
median(effectSize_temp$max_t...min_t)
## [1] -27.41491
quantile(effectSize_temp$max_t...min_t, c(0.05, 0.5, 0.95))
## 5% 50% 95%
## -793.77050 -27.41491 2.65420
Across the observed range of water temperature, expected cell counts decreased by a median of 29 cells, with a 90% credible interval ranging from a decrease of 884 cells to an increase of 3 cells.
mean(draws2$b_tide_ft_scaled < 0)
## [1] 0.959
tide_min <- min(halo_ready$tide_ft_scaled, na.rm=T)
tide_max <- max(halo_ready$tide_ft_scaled, na.rm = T)
min_tide <- exp(draws2$b_Intercept+draws2$b_tide_ft_scaled*tide_min)
max_tide <- exp(draws2$b_Intercept+draws2$b_tide_ft_scaled*tide_max)
effectSize_tide<- max_tide-min_tide
quantile(effectSize_tide, c(0.05, 0.5, 0.95))
## 5% 50% 95%
## -81.0661706 -11.8625889 -0.2916524
Increased tidal height from its minimum to maximum observed value was associated with a median decrease of 12 cells (90% credible interval: −80 to −0.4 cells).
mean(draws2$b_nitrate_mg_scaled < 0)
## [1] 0.9565
nitrate_min <- min(halo_ready$nitrate_mg_scaled, na.rm=T)
nitrate_max <- max(halo_ready$nitrate_mg_scaled, na.rm = T)
min_nitrate <- exp(draws2$b_Intercept+draws2$b_nitrate_mg_scaled*nitrate_min)
max_nitrate <- exp(draws2$b_Intercept+draws2$b_nitrate_mg_scaled*nitrate_max)
effectSize_nitrate<- max_nitrate-min_nitrate
quantile(effectSize_nitrate, c(0.05, 0.5, 0.95))
## 5% 50% 95%
## -40.2519262 -8.5000138 -0.4920075
Increased nitrate concentration across its observed range was associated with a median decrease of 8 cells (90% credible interval: −40 to −0.7 cells).
mean(draws2$b_lunar_illumination_percent_scaled < 0)
## [1] 1
illum_min <- min(halo_ready$lunar_illumination_percent_scaled, na.rm=T)
illum_max <- max(halo_ready$lunar_illumination_percent_scaled, na.rm = T)
min_illum <- exp(draws2$b_Intercept+draws2$b_lunar_illumination_percent_scaled*illum_min)
max_illum <- exp(draws2$b_Intercept+draws2$b_lunar_illumination_percent_scaled*illum_max)
effectSize_illum<- data.frame(max_illum-min_illum)
quantile(effectSize_illum$max_illum...min_illum, c(0.05, 0.5, 0.95))
## 5% 50% 95%
## -15.794262 -8.334459 -4.517712
Increasing lunar illumination across the observed range was associated with a decrease in 8 cells (90% credible interval: -16 to -5 cells).
bayes_R2(m2)
## Estimate Est.Error Q2.5 Q97.5
## R2 0.1508758 0.1265522 0.01443735 0.4568486
About 15% of the variance in cell count is explained by the chosen predictors.
Showing predictive curve for strongest predictor (lunar illumination).
p7 <- ggplot(effectSize_illum, aes(x=max_illum...min_illum))+
geom_density(fill="#F0726A", color="black", linewidth = 1)+
geom_text(x=-22, y= 0.1, label="Median effect = -8 cells")+
geom_vline(xintercept = 0, linewidth=1, color="#00B4F0")+
geom_vline(xintercept = -8.33, linewidth=1, color="pink")+
geom_vline(xintercept = -15.9, linewidth = 1, color="#9F8CFF", linetype="dashed")+
geom_vline(xintercept = -4.65, linewidth = 1, color="#9F8CFF", linetype="dashed")+
coord_cartesian(xlim = c(-30,0))+
labs(title="Posterior distribution of lunar illumination effect on predicted cell count", x="Change in predicted cell count", y="Density")+
theme_minimal()
p7
Figure 3: Here you can see a strong negative effect for lunar illumination on cell counts. The entire distribution lies below zero with a long tail toward cell count decreasing, and centered around ~10. The pink line shows the median, and the purple lines show the 90% credible interval and the blue line marks 0.
Include some plots of raw data and describe the patterns in the data.
p1 <- ggplot(halo, aes(as.numeric(nitrate_mg), log(cell_count_A))) +
geom_point(color = "#8EA500", alpha = 0.6, size = 2) +
labs(x = "Nitrates (mg/L)", y = "Cell Count")
p2 <- ggplot(halo, aes(lunar_illumination_percent, log(cell_count_A))) +
geom_point(color = "#F0726A", alpha = 0.6, size = 2) +
labs(x = "Lunar Illumination (%)", y = "Cell Count")
p3 <- ggplot(halo, aes(tide_ft, log(cell_count_A))) +
geom_point(color = "#C178F7", alpha = 0.6, size = 2) +
labs(x = "Tidal Height (ft)", y = "Cell Count")
p4 <- ggplot(halo, aes(as.numeric(water_temp_c), log(cell_count_A))) +
geom_point(color = "#F75EBD", alpha = 0.6, size = 2) +
labs(x = "Water Temp (°C)", y = "Cell Count")
p5 <- ggplot(halo, aes(cell_count_A))+
geom_density(fill="#00B9BE", color="black")+
coord_cartesian(xlim=c(0, 20))
(p5 | p1 | p2)/(p3|p4)
Figure 4. Showing cell count data (response), zero-inflated, variance > mean, and highest x value of 208 cells, which are not shown for zooming to density shape (upper left). The remaining plots are showing cell counts for each covariate used in the model.
Justify your choice for a distribution for the data
DISTRIBUTION (Y-variable)
Below you can see that the variance > mean | Zero Inflation | Big Values \(\rightarrow\) Negative Binomial
mean(halo$cell_count_A, na.rm=T)
## [1] 3.756522
var(halo$cell_count_A, na.rm = T)
## [1] 392.5718
Center your covariates around the mean and divide them by two standard deviations to increase interpretability
Covariates were centered around the mean:
head(halo_ready[1:4])%>%
gt()%>%
opt_stylize(style=3, color="gray")
| tide_ft_scaled | lunar_illumination_percent_scaled | nitrate_mg_scaled | water_temp_c_scaled |
|---|---|---|---|
| -0.87863063 | 0.46095740 | 1.3399314 | NA |
| 0.48131752 | 0.30233087 | -0.9173286 | 0.13403018 |
| 0.36753700 | 0.18696613 | 2.5432511 | 0.20546199 |
| 0.07495851 | 0.04276019 | 0.7777841 | -0.05400599 |
| 0.19415716 | -0.08702515 | 1.4901978 | 0.36393791 |
| -0.08216697 | -0.17354871 | 0.3369153 | 0.13133471 |
p8 <- ggplot(halo_ready)+
geom_density(aes(halo_ready$tide_ft_scaled, fill="Tidal Height (ft)"), alpha=0.4)+
geom_density(aes(halo_ready$water_temp_c_scaled, fill="Water Temp (°C)"), alpha=0.4)+
geom_density(aes(halo_ready$nitrate_mg_scaled, fill="Nitrate (mg)"), alpha=0.4)+
geom_density(aes(halo_ready$lunar_illumination_percent_scaled, fill="Lunar Illumination"), alpha=0.4)+
labs(x="", y="Density", fill=NULL)+
theme_minimal()
ggplotly(p8)
Figure 5. Interactive plot, predictors were centered by the mean and scaled by two standard deviations to improve coefficient interpretability and ensure comparable prior influence across covariates.
Produce a coefficient plot to show relative effect size of your covariates
p6
Figure 2: Values > 1 → increase expected cell count, Values < 1 → decrease expected cell count. The strongest effect on cell count appears to be lunar illumination. About 15% of the variance in cell count is explained by the chosen predictors (see R-squared calculation in part E below.
p7
Figure 3: Showing predictive curve for strongest predictor (lunar illumination). Here you can see a strong negative effect for lunar illumination on cell counts. The entire distribution lies below zero with a long tail toward cell count decreasing, and centered around ~10. The pink line shows the median, and the purple lines show the 90% credible interval and the blue line marks 0.
Write a sentence describing the probability of direction for important effects in the model as well as effect size (with uncertainty).
Approximately >99% of the posterior probability mass lies below 0. Increasing lunar illumination across the observed range was associated with a decrease in 8 cells (90% credible interval: -16 to -5 cells).
dogs <- read.delim("dogs.csv", sep=";")
head(dogs)%>%
gt()%>%
opt_stylize(style = 2, color = "gray")
| Date | Time | Location | Transect | Dog | Handler | Assistant | InclDaysWorked | TrainingDay | TrialNumber | SessionDuration | Temperature | Humidity | Weather | Wind | Terrain | Slope | Insect.presence | Leash | SearchArea | Blindness | Site | Visibility | ToadCondition | ToadAge | ToadSex | FoundFirstAttempt | FoundSecondAttempt | FoundThirdAttempt | Alert | DogBehaviour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 25.08.2021 | 10:30:00 | Lagune | NA | dog2 | PersonA | NA | 1 | 1 | 1 | 70 | 23.0 | 47 | partly cloudy | slight | plaines, sands and rocks | slight | few | no leash | 50 | not blind | rocks | visible | alive | juvenile | unknown | 1 | NA | NA | independent strong | cooperative |
| 25.08.2021 | 10:30:00 | Lagune | NA | dog2 | PersonA | NA | 1 | 1 | 2 | 70 | 23.0 | 47 | partly cloudy | slight | plaines, sands and rocks | slight | few | no leash | 50 | not blind | rocks | visible | alive | juvenile | unknown | 0 | 0 | 1 | independent strong | cooperative |
| 25.08.2021 | 10:30:00 | Lagune | NA | dog2 | PersonA | NA | 1 | 1 | 3 | 70 | 23.0 | 47 | partly cloudy | slight | plaines, sands and rocks | slight | few | no leash | 75 | not blind | brush | invisible | alive | juvenile | unknown | 1 | NA | NA | independent strong | cooperative |
| 25.08.2021 | 10:30:00 | Lagune | NA | dog2 | PersonA | NA | 1 | 1 | 4 | 70 | 23.0 | 47 | partly cloudy | slight | plaines, sands and rocks | slight | few | no leash | 75 | not blind | brush | invisible | alive | juvenile | unknown | 1 | NA | NA | independent strong | fatigued |
| 04.02.2022 | 08:45:00 | PersonA_Garden | NA | dog2 | PersonA | NA | 1 | 6 | 5 | 45 | 7.5 | 77 | drizzle | slight | grassland, low herbs | mix | none | no leash | 100 | not blind | litter | visible | dead | subadult | unknown | 1 | NA | NA | independent strong | cooperative |
| 04.02.2022 | 08:45:00 | PersonA_Garden | NA | dog2 | PersonA | NA | 1 | 6 | 6 | 45 | 7.5 | 77 | drizzle | slight | grassland, low herbs | mix | none | no leash | 150 | not blind | litter | visible | dead | adult | female | 1 | NA | NA | independent strong | cooperative |
Predictors: TrainingDay, Temperature,
Leash, DogBehavior
Response: FoundFirstAttempt (binary)
mean(dogs$FoundFirstAttempt) #good job dogs!
## [1] 0.720524
Explore the correlations (NUMERICAL DATA ONLY in cor)
(also another plot for funsies)
#only pulling these columns out briefly to check them for fun things (correlations between covariates)
numericsOnly <- dogs[,c(9, 12, 20)]
cor(numericsOnly) #.39 is the highest and it is low enough that we can move on and assume there isnt high enough correlation between covariates to remove one.
## TrainingDay Temperature SearchArea
## TrainingDay 1.00000000 0.01699062 0.3911707
## Temperature 0.01699062 1.00000000 0.1615864
## SearchArea 0.39117067 0.16158640 1.0000000
columngrab <- dogs[,c(9, 12, 19, 27, 31)]
forggpairs <- dogs[,c(9, 12, 20, 27)]
ggpairs(forggpairs, aes(color=as.factor(FoundFirstAttempt)))
This figure shows a moderate positive linear correlation using Pearson correlations, between training day and search area. As training day increases, search area tends to increase, *** means statistically significant.
Because these covariates are all on different scales (Training Day 0-31, Temperature 1-40, and Search Area 5-900), we want to scale them by centering them around the mean so we can compare the effect sizes easily. Only the numeric covariates are rescaled.
#built function for rescaling quick
rescale=function(x) {return((x - mean(x, na.rm = TRUE)) / (2 * sd(x, na.rm = TRUE)))}
numericsOnly_std<-apply(numericsOnly,2,rescale)
head(numericsOnly_std)
## TrainingDay Temperature SearchArea
## [1,] -0.7392079 0.1231439 -0.6049086
## [2,] -0.7392079 0.1231439 -0.6049086
## [3,] -0.7392079 0.1231439 -0.5437738
## [4,] -0.7392079 0.1231439 -0.5437738
## [5,] -0.4777333 -0.7249340 -0.4826390
## [6,] -0.4777333 -0.7249340 -0.3603695
#let's add the columns we just created (the scaling for the numeric variables) back onto our og dataframe so we can use the scaled data easily.
colnames(numericsOnly_std)<-paste(colnames(numericsOnly_std), "_scaled", sep="") #add the tag scaled to the numerics
toadly_ready<-data.frame(numericsOnly_std,dogs)
In this tutorial, priors are not explicitely set. The
brm() automatically sets weak priors and once your model
runs you can see what it chose by running the code:
get_prior(m1)
COVARIATES
TrainingDay_scaled, Temperature_scaled,
Leash (category, not scaled), DogBehavior
(category, not scaled)
m1 <-brm(FoundFirstAttempt~TrainingDay_scaled+Temperature_scaled+Leash+DogBehaviour,family="bernoulli",data=toadly_ready, refresh=0)
#get_prior(FoundFirstAttempt~TrainingDay_scaled+Temperature_scaled+Leash+DogBehaviour,family="bernoulli",data=toadly_ready)
For m1 the brm function is using:
Intercept prior: student_t(3, 0, 2.5)
(default)
Slope priors (b): (flat) for all covariates
flat means that all slope values are equally
possible.
summary(m1)
summary(m1)
## Family: bernoulli
## Links: mu = logit
## Formula: FoundFirstAttempt ~ TrainingDay_scaled + Temperature_scaled + Leash + DogBehaviour
## Data: toadly_ready (Number of observations: 229)
## Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
## total post-warmup draws = 4000
##
## Regression Coefficients:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 2.47 0.65 1.23 3.81 1.00 857
## TrainingDay_scaled 0.44 0.41 -0.37 1.22 1.00 1012
## Temperature_scaled -0.14 0.38 -0.91 0.57 1.00 1148
## Leashnoleash -0.60 0.62 -1.90 0.59 1.01 923
## Leashshortleash -1.59 0.76 -3.09 -0.09 1.00 888
## DogBehaviourdistracted -3.63 0.73 -5.18 -2.35 1.00 1006
## DogBehaviourfatigued -1.04 0.42 -1.88 -0.22 1.00 1097
## DogBehaviourunresponsive -819.28 1161.00 -3168.64 -25.80 1.01 362
## Tail_ESS
## Intercept 1309
## TrainingDay_scaled 1551
## Temperature_scaled 1393
## Leashnoleash 1291
## Leashshortleash 1337
## DogBehaviourdistracted 1191
## DogBehaviourfatigued 1477
## DogBehaviourunresponsive 226
##
## Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
draws <- as.data.frame(m1)
m1_long <- draws %>%
select(b_TrainingDay_scaled, b_Temperature_scaled, b_Leashnoleash, b_Leashshortleash, b_DogBehaviourdistracted, b_DogBehaviourfatigued, b_Intercept) %>%
pivot_longer(everything(),
names_to = "parameter",
values_to = "value")
ggplot(m1_long, aes(x = value, y = parameter, fill = parameter)) +
stat_halfeye(.width = c(.8, .95), point_interval = median_qi) +
geom_vline(xintercept = 0, linetype = "dashed") +
scale_y_discrete(labels = c(
b_TrainingDay_scaled = "Training day",
b_Temperature_scaled = "Temperature",
b_Leashnoleash = "Leashed or not",
b_Leashshortleash = "Short leash",
b_DogBehaviourdistracted = "Distracted dogs",
b_DogBehaviourfatigued = "Fatigued dog",
b_Intercept = "Intercept")) +
theme_minimal(base_size = 14) +
theme(legend.position = "none")+
scale_fill_manual(values=c("#440154", "#472D7B", "#3B528B", "#2C728E", "#21918C", "#5EC962", "#FDE725"))+
labs(x="Log-odds (posterior draws)")
The further away from 0 above, the stronger the effect.
You can have a strong effect with high uncertainty and weak effects with low uncertainty.
TrainingDay_scaledTHIS MEASURE WILL NOT TELL YOU EFFECT SIZE
We can evaluate how much of the posterior distribution lies above or below zero. For example, the Training Day coefficient is not entirely positive — some posterior draws fall below zero.
To quantify this, we calculate the proportion of posterior draws that are greater than zero:
length(which(draws$b_TrainingDay_scaled>0))/length(draws$b_TrainingDay_scaled)
## [1] 0.86225
84% of the posterior probability mass lies above zero.
TrainingDay_scaledHow big is the effect in real terms?
NOTE we used family=bernoulli in our
brm model because of the binary 0s and 1s in our y data.
This puts an automatic logit link into play, and the
inverse of this is plogis. So to look at the real terms of
the effect, lets undo our logit and place the data back
into an understandable scale.
td_min <- min(toadly_ready$TrainingDay_scaled) #using the minimum value of our training day data to determine what the effect is for training day at its most minimum observed value
td_max <- max(toadly_ready$TrainingDay_scaled)
min_td <- plogis(draws$b_Intercept+draws$b_TrainingDay_scaled*td_min)
max_td <- plogis(draws$b_Intercept+draws$b_TrainingDay_scaled*td_max)
effectSize_td <- data.frame(max_td-min_td)
median(effectSize_td$max_td...min_td)
## [1] 0.0418245
quantile(effectSize_td$max_td...min_td, c(0.05,0.5,0.95))
## 5% 50% 95%
## -0.03899603 0.04182450 0.13490075
ggplot(effectSize_td, aes(x=max_td...min_td))+
geom_histogram(fill="#2C728E", color="white", bins = 30)+
geom_vline(xintercept = 0, linetype = "dashed")+
geom_vline(xintercept = -0.037, color="pink")+
geom_vline(xintercept = 0.12, color="pink")+
geom_vline(xintercept=median(effectSize_td$max_td...min_td), color="black", linewidth=1.5)+
geom_text(x=-0.2, y= 500, label="Median effect = 4.2%")+
labs(title="Posterior distribution of training day effect on toad finding success probability", x="Effect size")+
theme_classic()
Here the thicker black line is the median effect size for dog training days, the pink lines are the 90% credible intervals, and the black dotted line is centered at 0.
DogBehaviorLooking at the effect plot we can see that dog behavior has the strongest effect overall.
WHICH DOGS ARE THE INTERCEPT HERE?
unique(toadly_ready$DogBehaviour)
## [1] "cooperative" "fatigued" "unresponsive" "distracted"
Answer: cooperative
Cooperative dogs are the intercept here, and everything else is being compared to them. Let’s look at how a distracted dog compares to a cooperative dog.
length(which(draws$b_DogBehaviourdistracted<0))/length(draws$b_DogBehaviourdistracted)
## [1] 1
100% of the posterior probability mass lies below zero. (we could likely assume this from the plot though). A nice scientific sentence for this is:
“There is a >99% probability that distracted dogs have lower success in finding toads.”
cooperative_dog <- plogis(draws$b_Intercept)
distracted_dog <- plogis(draws$b_Intercept+draws$b_DogBehaviourdistracted)
effect_distracted <-data.frame(cooperative_dog-distracted_dog)
median(effect_distracted$cooperative_dog...distracted_dog)
## [1] 0.6591733
quantile(effect_distracted$cooperative_dog...distracted_dog, c(0.05,0.5,0.95))
## 5% 50% 95%
## 0.3842834 0.6591733 0.8248833
ggplot(effect_distracted, aes(x=cooperative_dog...distracted_dog))+
geom_histogram(color="white", fill="#440154", bins = 30)+
geom_vline(xintercept = 0, linetype = "dashed")+
geom_vline(xintercept = 0.38, color="#2C728E")+
geom_vline(xintercept = 0.83, color="#2C728E")+
geom_vline(xintercept=median(effect_distracted$cooperative_dog...distracted_dog), color="#5EC962", linewidth = 1.5)+
geom_text(x=0.25, y= 200, label="Median effect = 66.5%")+
labs(title="Effect of distraction on probability of toad detection", x="Effect size")+
theme_classic()
Here the green line is the median effect size for dog training days, the blue lines are the 90% credible intervals, and the black dotted line is centered at 0.
Finally, we can get the \(R^2\) for the model fit:
bayes_R2(m1)
## Estimate Est.Error Q2.5 Q97.5
## R2 0.2628747 0.03539974 0.1838872 0.3221076
\(R^2\) is \(\approx\) 26%, which is pretty great for a dog behavior study.
See convergence to normal over large sample sizes:
curve(rgamma(x, 10, 3), from =0, to=100)
curve(dgamma(x, 10, 3), from =0, to=100)
#PRACTICE SOME CODES
# NEGATIVE BINOMIAL (size=0.1, mu=100) rnbinom()
draws <- rnbinom(1000, size=0.1, mu=100)
par(mfrow=c(1, 2))
hist(draws, breaks=10, col="pink")
#FOR LOOPS
empty <- rep("i", 100)
for(i in 1:100) {
empty[i]=mean(rnbinom(1000, size=0.1, mu=100))
}
hist(as.numeric(empty), col="tomato", main="Neg binom for loop")