#installing packages
options(repos = c(CRAN = "https://cloud.r-project.org"))
install.packages("MuMIn")
## package 'MuMIn' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\Rtmp08E8sd\downloaded_packages
install.packages("car")
## package 'car' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\Rtmp08E8sd\downloaded_packages
install.packages("arm") #for glm diagnostic plot
## package 'arm' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\Rtmp08E8sd\downloaded_packages
install.packages("patchwork")
## package 'patchwork' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\Rtmp08E8sd\downloaded_packages
#loading packages
library(tidyverse)
library(ggplot2)
library(Biostatistics)
library(readxl)
library(car)
library(MuMIn)
library(car)
library(ggeffects)
Checking raw data
#loading the data
qn1data <- read.csv("C:/Users/LIEWY/Downloads/ES3307/ES3307 Assignments/subsistence.csv")
#checking data
str(qn1data)
## 'data.frame': 87 obs. of 10 variables:
## $ name : chr "control" "control" "control" "control" ...
## $ kill : int 0 0 0 0 0 0 0 0 0 0 ...
## $ habitat : chr "subsistence" "subsistence" "subsistence" "subsistence" ...
## $ water : int 40 30 30 25 20 30 10 20 5 10 ...
## $ tree : int 20 10 10 15 20 20 25 10 5 10 ...
## $ shrub : int 20 15 5 10 15 15 20 25 30 15 ...
## $ human : int 60 30 140 190 180 120 120 20 310 50 ...
## $ fruit : int 10 20 20 10 10 10 10 5 20 30 ...
## $ cable : logi NA NA NA NA NA NA ...
## $ transectID: int 0 0 0 0 0 0 0 0 0 0 ...
The data has a binomial distribution. We are only interested in a subset of the listed characteristics.
#creating new dataframe
qn1data <- qn1data[, c(2,4:8)]
#checking changes
str(qn1data)
## 'data.frame': 87 obs. of 6 variables:
## $ kill : int 0 0 0 0 0 0 0 0 0 0 ...
## $ water: int 40 30 30 25 20 30 10 20 5 10 ...
## $ tree : int 20 10 10 15 20 20 25 10 5 10 ...
## $ shrub: int 20 15 5 10 15 15 20 25 30 15 ...
## $ human: int 60 30 140 190 180 120 120 20 310 50 ...
## $ fruit: int 10 20 20 10 10 10 10 5 20 30 ...
1A. Which model(s) have you included in the final candidate set?
Checking for multicollinearity
#fitting full model with all interactions
fullglm <- glm(kill ~ water + tree + shrub + human + fruit, family = binomial, data = qn1data)
#checking VIF
vif(fullglm)
## water tree shrub human fruit
## 1.053694 1.039607 1.044163 1.179552 1.165295
All variables considered have a VIF of less than 3 and can be used in the final model.
#fitting final model
finalglm <- glm(kill ~ water + tree + shrub + human + fruit,
family = binomial, data = qn1data,
na.action = "na.fail")
#determining candidate models using dredge() function
qn1candidate <- dredge(finalglm)
print(qn1candidate)
## Global model call: glm(formula = kill ~ water + tree + shrub + human + fruit, family = binomial,
## data = qn1data, na.action = "na.fail")
## ---
## Model selection table
## (Intrc) fruit human shrub tree water df logLik AICc delta
## 16 3.9640 -0.2340 0.007149 -0.09202 -0.1346 5 -40.864 92.5 0.00
## 32 4.2160 -0.2255 0.007210 -0.09276 -0.1361 -0.02289 6 -40.679 94.4 1.94
## 12 2.9620 -0.2309 0.006524 -0.1379 4 -43.052 94.6 2.12
## 28 3.1790 -0.2237 0.006506 -0.1379 -0.02035 5 -42.879 96.5 4.03
## 8 2.6430 -0.2444 0.006595 -0.09817 4 -45.053 98.6 6.13
## 14 4.0530 -0.1931 -0.07290 -0.1099 4 -45.656 99.8 7.33
## 24 2.7710 -0.2371 0.006649 -0.09732 -0.01528 5 -44.957 100.7 8.19
## 10 3.1990 -0.1914 -0.1123 3 -47.190 100.7 8.20
## 4 1.4880 -0.2299 0.005798 3 -47.624 101.5 9.07
## 30 4.2360 -0.1838 -0.07157 -0.1114 -0.01930 5 -45.504 101.7 9.28
## 26 3.4190 -0.1821 -0.1132 -0.02113 4 -46.981 102.5 9.98
## 20 1.6710 -0.2229 0.005847 -0.01834 4 -47.459 103.4 10.94
## 6 2.9090 -0.2040 -0.07782 3 -49.071 104.4 11.96
## 2 1.9410 -0.1965 2 -50.851 105.8 13.38
## 22 2.9980 -0.1970 -0.07652 -0.01206 4 -49.002 106.5 14.02
## 31 2.7880 0.005995 -0.09172 -0.1324 -0.05648 5 -48.006 106.8 14.28
## 18 2.0950 -0.1881 -0.01660 3 -50.706 107.7 15.23
## 15 1.9900 0.005832 -0.08785 -0.1351 4 -49.907 108.3 15.83
## 27 1.7490 0.005282 -0.1293 -0.05454 4 -50.502 109.5 17.02
## 11 1.0190 0.005143 -0.1328 3 -52.421 111.1 18.66
## 29 3.0870 -0.07137 -0.1148 -0.05406 4 -51.790 112.1 19.60
## 25 2.1870 -0.1116 -0.05259 3 -53.565 113.4 20.95
## 13 2.3100 -0.06960 -0.1160 3 -53.763 113.8 21.35
## 23 1.2870 0.005258 -0.08570 -0.05509 4 -53.108 114.7 22.24
## 9 1.4660 -0.1145 2 -55.541 115.2 22.76
## 7 0.4986 0.004955 -0.08192 3 -55.332 117.0 24.49
## 19 0.3807 0.004417 -0.05371 3 -55.506 117.3 24.83
## 21 1.6860 -0.06750 -0.05185 3 -56.103 118.5 26.03
## 3 -0.3582 0.004182 2 -57.765 119.7 27.20
## 17 0.8962 -0.05168 2 -57.775 119.7 27.23
## 5 0.9397 -0.06663 2 -58.280 120.7 28.23
## 1 0.1613 1 -60.022 122.1 29.62
## weight
## 16 0.500
## 32 0.189
## 12 0.173
## 28 0.067
## 8 0.023
## 14 0.013
## 24 0.008
## 10 0.008
## 4 0.005
## 30 0.005
## 26 0.003
## 20 0.002
## 6 0.001
## 2 0.001
## 22 0.000
## 31 0.000
## 18 0.000
## 15 0.000
## 27 0.000
## 11 0.000
## 29 0.000
## 25 0.000
## 13 0.000
## 23 0.000
## 9 0.000
## 7 0.000
## 19 0.000
## 21 0.000
## 3 0.000
## 17 0.000
## 5 0.000
## 1 0.000
## Models ranked by AICc(x)
To select the best models, candidate models where the Aikake’s difference between the best and candidate models < 5 are retained (the lower the delta AIC value the better).
#selecting modles with delta AIC <5.0
subset(qn1candidate, delta <5)
## Global model call: glm(formula = kill ~ water + tree + shrub + human + fruit, family = binomial,
## data = qn1data, na.action = "na.fail")
## ---
## Model selection table
## (Intrc) fruit human shrub tree water df logLik AICc delta
## 16 3.964 -0.2340 0.007149 -0.09202 -0.1346 5 -40.864 92.5 0.00
## 32 4.216 -0.2255 0.007210 -0.09276 -0.1361 -0.02289 6 -40.679 94.4 1.94
## 12 2.962 -0.2309 0.006524 -0.1379 4 -43.052 94.6 2.12
## 28 3.179 -0.2237 0.006506 -0.1379 -0.02035 5 -42.879 96.5 4.03
## weight
## 16 0.538
## 32 0.204
## 12 0.186
## 28 0.072
## Models ranked by AICc(x)
print(subset)
## function (x, ...)
## UseMethod("subset")
## <bytecode: 0x0000016da09452e0>
## <environment: namespace:base>
Based on these results, the models included in the final candidate set are:
kill ~ fruit + human + shrub + tree
kill ~ fruit + human + shrub + tree + water
kill ~ fruit + human + tree
kill ~ fruit + human + tree + water
1B. Using the best model from your candidate set, obtain your model predicted lines and confidence intervals using ggpredict() and plot them with your original data. (Hint: Think of the most suitable way to show how the probability of roadkills changes with each variable.)
The best model is kill ~ fruit + human + shrub + tree (delta = 0.0).
#fitting best model
bestglm <- glm(kill ~ fruit + human + shrub + tree,
family = binomial, data = qn1data)
#checking diagnostic plot
arm::binnedplot(bestglm$fitted.values, bestglm$residuals)
This binned residual plot is used as a diagnostic plot for GLMs using binomial data. The residuals of the plot mostly fall within the shaded confidence bands, indicating that the model has an acceptable fit. To further demonstrate that this model fits the original data well, its predicted values are plotted against the original data.
#Using ggpredict to obtain predicted values
fruiteffect <- ggpredict(bestglm, terms = c("fruit"))
#this dataframe contains the model's predicted values (y values) and the confidence interval (conf.low/conf.high) for the specified predictor (shows you its isolated effect).
#ggpredict() holds all other predictors at their mean/reference values
print(fruiteffect)
## # Predicted probabilities of kill
##
## fruit | Predicted | 95% CI
## ------------------------------
## 5 | 0.78 | 0.62, 0.88
## 10 | 0.52 | 0.37, 0.67
## 15 | 0.25 | 0.10, 0.50
## 20 | 0.09 | 0.02, 0.36
## 30 | 0.01 | 0.00, 0.17
## 40 | 0.00 | 0.00, 0.07
## 50 | 0.00 | 0.00, 0.03
##
## Adjusted for:
## * human = 100.00
## * shrub = 10.00
## * tree = 10.00
humaneffect <- ggpredict(bestglm, terms = c("human"))
print(humaneffect)
## # Predicted probabilities of kill
##
## human | Predicted | 95% CI
## ------------------------------
## 10 | 0.36 | 0.19, 0.57
## 80 | 0.48 | 0.33, 0.64
## 155 | 0.61 | 0.46, 0.75
## 225 | 0.72 | 0.55, 0.85
## 295 | 0.81 | 0.60, 0.93
## 365 | 0.88 | 0.65, 0.97
## 435 | 0.92 | 0.69, 0.98
## 580 | 0.97 | 0.75, 1.00
##
## Adjusted for:
## * fruit = 10.00
## * shrub = 10.00
## * tree = 10.00
shrubeffect <- ggpredict(bestglm, terms = c("shrub"))
print(shrubeffect)
## # Predicted probabilities of kill
##
## shrub | Predicted | 95% CI
## ------------------------------
## 5 | 0.63 | 0.43, 0.80
## 10 | 0.52 | 0.37, 0.67
## 15 | 0.40 | 0.25, 0.58
## 20 | 0.30 | 0.13, 0.54
## 25 | 0.21 | 0.06, 0.52
## 30 | 0.15 | 0.03, 0.51
##
## Adjusted for:
## * fruit = 10.00
## * human = 100.00
## * tree = 10.00
treeeffect <- ggpredict(bestglm, terms = c("tree"))
print(treeeffect)
## # Predicted probabilities of kill
##
## tree | Predicted | 95% CI
## -----------------------------
## 5 | 0.68 | 0.47, 0.83
## 10 | 0.52 | 0.37, 0.67
## 15 | 0.35 | 0.21, 0.54
## 20 | 0.22 | 0.08, 0.46
## 25 | 0.12 | 0.03, 0.41
## 30 | 0.07 | 0.01, 0.36
##
## Adjusted for:
## * fruit = 10.00
## * human = 100.00
## * shrub = 10.00
Plotting predicted against original values.
library(patchwork) #par(mfrow) can only be used for base R plots
#Plotting fruit predictor
fruit_plot <- ggplot()+
geom_point(data = qn1data, #original data
aes(x = fruit, y = kill),
colour = "blue",
size = 0.5,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = fruiteffect, #predicted values
aes(x = x, y = predicted),
colour = "blue",
size = 0.5)+
geom_ribbon(data = fruiteffect, #confidence interval
aes(x = x, ymin = conf.low, ymax = conf.high),
fill = "blue",
alpha = 0.2)+
labs(title = "Fruit as predictor",
x = "Distance from nearest fruit tree (m)",
y = "Predicted probability of roadkills")+
theme_minimal()
#plotting human predictor
human_plot <- ggplot()+
geom_point(data = qn1data,
aes(x = human, y = kill),
colour = "purple",
size = 0.5,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = humaneffect,
aes(x = x, y = predicted),
colour = "purple",
size = 0.5)+
geom_ribbon(data = humaneffect,
aes(x = x, ymin = conf.low, ymax = conf.high),
fill = "purple",
alpha = 0.2)+
labs(title = "Human as predictor",
x = "Distance from nearest human settlement (m)",
y = "Predicted probability of roadkills")+
theme_minimal()
#plotting tree predictor
tree_plot <- ggplot()+
geom_point(data = qn1data,
aes(x = tree, y = kill),
colour = "green",
size = 0.5,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = treeeffect,
aes(x = x, y = predicted),
colour = "green",
size = 0.5)+
geom_ribbon(data = treeeffect,
aes(x = x, ymin = conf.low, ymax = conf.high),
fill = "green",
alpha = 0.2)+
labs(title = "Tree as predictor",
x = "Distance from nearest non-fruiting tree (m)",
y = "Predicted probability of roadkills")+
theme_minimal()
#plotting shrub predictor
shrub_plot <- ggplot()+
geom_point(data = qn1data,
aes(x = shrub, y = kill),
colour = "darkgreen",
size = 0.5,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = shrubeffect,
aes(x = x, y = predicted),
colour = "darkgreen",
size = 0.5)+
geom_ribbon(data = shrubeffect,
aes(x = x, ymin = conf.low, ymax = conf.high),
fill = "darkgreen",
alpha = 0.1)+
labs(title = "Shrub as predictor",
x = "Distance from nearest shrub (m)",
y = "Predicted probability of roadkills")+
theme_minimal()
#arranging plots
(fruit_plot + human_plot)/
(tree_plot + shrub_plot)+ plot_annotation(tag_levels = c('A'))
1C. Using the best model from your candidate set, report and interpret your results.
Since we are assuming that there are no interactions between the predictors, type II ANOVA is used.
#Running anova
Anova(bestglm, type = "II")
## Analysis of Deviance Table (Type II tests)
##
## Response: kill
## LR Chisq Df Pr(>Chisq)
## fruit 18.0877 1 2.11e-05 ***
## human 9.5854 1 0.001961 **
## shrub 4.3765 1 0.036437 *
## tree 8.3797 1 0.003794 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#obtaining residual df
df.residual(bestglm)
## [1] 82
From the type II ANOVA, all four specified predictors - fruit, human, tree, shrub - significantly affect the probability of roadkills in subsistence agricultural areas. When the distance to fruit trees, non-fruiting trees, and shrubs increase, the probability of roadkills decrease (fruit: χ 2 1,82 = 18.09, p < 0.001; tree: χ 2 1,82 = 8.38, p = 0.004; shrub: χ 2 1,82 = 4.38, p = 0.04). However, when the distance to human settlements increases, the probability of roadkills increases (χ 2 1,82 = 9.59, p = 0.002).
#loading the data
qn2data <- read_excel("C:/Users/LIEWY/Downloads/ES3307/ES3307 Assignments/Assignment 1 data.xls",
sheet = "Dioecious trees")
#checking data
str(qn2data)#sex is still encoded as numbers
## tibble [50 × 3] (S3: tbl_df/tbl/data.frame)
## $ SEX : num [1:50] 2 1 2 1 2 1 2 2 1 1 ...
## $ DBH : num [1:50] 174 171 212 187 227 274 86 161 285 182 ...
## $ FLOWERS: num [1:50] 234 110 512 133 1107 ...
#converting sex to factors
qn2data <- qn2data %>%
mutate(SEX = case_when(SEX == "1" ~ "Male",
SEX == "2" ~ "Female"))
str(qn2data)
## tibble [50 × 3] (S3: tbl_df/tbl/data.frame)
## $ SEX : chr [1:50] "Female" "Male" "Female" "Male" ...
## $ DBH : num [1:50] 174 171 212 187 227 274 86 161 285 182 ...
## $ FLOWERS: num [1:50] 234 110 512 133 1107 ...
2A. Plot the raw data
Plot the data to show the effect of DBH and SEX on the number of flowers in dioecious trees. What does the plot tell you about the relationship between DBH and SEX?
#plotting scatterplot where Y -> number of flowers, explanatory variables -> DBH, SEX
qn2data %>%
ggplot(aes(x = DBH, y = FLOWERS, colour = SEX))+
geom_smooth(method = lm, se = FALSE)+
geom_point(size = 0.75)+
labs(title = "Effect of tree diameter on number of flowers of 2 sexes of trees",
x = "Diameter at breast height (mm)",
y = "Number of flowers on tree")+
theme_minimal()
In the plot, the lines of female and male trees are not parallel, suggesting that there is an interaction between sex and diameter of tree.
2B. Fit linear model
Based on the plot in Question 2A, fit an appropriate linear model to determine the effect of DBH and SEX on the number of flowers.
#fitting a linear model
qn2lm <- lm(FLOWERS ~ DBH*SEX, data = qn2data)
anova(qn2lm)
## Analysis of Variance Table
##
## Response: FLOWERS
## Df Sum Sq Mean Sq F value Pr(>F)
## DBH 1 5060723 5060723 239.0916 < 2.2e-16 ***
## SEX 1 980045 980045 46.3018 1.791e-08 ***
## DBH:SEX 1 174671 174671 8.2523 0.006138 **
## Residuals 46 973657 21166
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Run diagnostic plots to check your model fit.
par(mfrow = c(2, 2))
plot(qn2lm)
Report and interpret your results.
From the ANOVA results, there is very strong evidence that DBH and SEX both have significant effects on the number of flowers produced by trees (DBH: F 1,46 = 239.09, p < 0.001; SEX: F 1,46 = 46.3, p < 0.001). There is also a signficant interaction between DBH and SEX (F 1,46 = 8.25, P = 0.006).
However, the residual vs fitted plot shows some residuals that are not randomly distributed, suggesting that there is mild heteroscedasticity. The Q-Q residuals plot also shows some deviation of the residuals from the diagonal line, showing mild non-normality in the distribution of the data. The same outliers are flagged in all 4 of the diagnostic plots, but they are not of concern because they don’t cross the Cook’s distance line.
2C. Transform your data
Based on the diagnostic plots in Question 1B, transform your y variable using an appropriate transformation and fit the linear model again.
#running new model with transformed data
newlm <- lm(sqrt(FLOWERS) ~ DBH*SEX, data = qn2data)
Explain why you have chosen this transformation.
Square root transformation can reduce the effect of outliers in count data, which are seen in 2B.
Plot the model predicted lines and confidence intervals with your original data. (Hint: Plot your predicted lines and confidence interval on the same scale as your original data.)
#obtaining new predicted values
newpred <- ggpredict(newlm, terms = c("DBH", "SEX"))
newpred
## # Predicted values of FLOWERS
##
## SEX: Female
##
## DBH | Predicted | 95% CI
## ----------------------------------
## 68 | 0.77 | 0.40, 5.72
## 110 | 50.87 | 35.72, 68.68
## 150 | 171.29 | 148.09, 196.18
## 192 | 374.10 | 340.88, 408.86
## 234 | 655.12 | 601.27, 711.27
## 316 | 1429.21 | 1296.96, 1567.87
##
## SEX: Male
##
## DBH | Predicted | 95% CI
## ---------------------------------
## 68 | 1.42 | 17.43, 3.20
## 110 | 15.12 | 2.71, 37.60
## 150 | 76.18 | 50.75, 106.76
## 192 | 190.71 | 160.79, 223.17
## 234 | 356.88 | 315.26, 401.07
## 316 | 830.16 | 705.11, 965.41
#plotting new predicted values against original values
newpred_plot <- ggplot()+
geom_point(data = qn2data,
aes(x = DBH, y = FLOWERS,
colour = SEX),
size = 1,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = newpred,
aes(x = x, y = predicted,
fill = group),
size = 0.5)+
geom_ribbon(data = newpred,
aes(x = x, ymin = conf.low, ymax = conf.high,
fill = group),
alpha = 0.2)+
labs(title = "Effect of tree diameter on the number of flowers of 2 tree sexes",
x = "Diameter at breast height (mm)",
y = "Number of flowers")+
guides(fill = FALSE) +
theme_minimal()
newpred_plot
Run diagnostic plots to check your model fit.
#running diagnostic plots for linear model
par(mfrow = c(2, 2))
plot(newlm)
In the residual vs fitted plot shows some residuals show a higher randomness in distribution, suggesting better homoscedasticity. The Q-Q residuals plot show less deviation of the residuals from the diagonal line and less outliers are flagged in the diagnostic plots. This shows that this model is a better fit after the data is transformed.
Report and interpret your results.
summary(newlm)
##
## Call:
## lm(formula = sqrt(FLOWERS) ~ DBH * SEX, data = qn2data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4767 -1.3903 -0.3699 0.9410 8.7186
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.246693 1.089126 -8.490 5.66e-11 ***
## DBH 0.148897 0.005636 26.420 < 2e-16 ***
## SEXMale -0.173783 2.368947 -0.073 0.9418
## DBH:SEXMale -0.027907 0.011118 -2.510 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.349 on 46 degrees of freedom
## Multiple R-squared: 0.9492, Adjusted R-squared: 0.9459
## F-statistic: 286.4 on 3 and 46 DF, p-value: < 2.2e-16
anova(newlm)
## Analysis of Variance Table
##
## Response: sqrt(FLOWERS)
## Df Sum Sq Mean Sq F value Pr(>F)
## DBH 1 4320.2 4320.2 782.7191 < 2.2e-16 ***
## SEX 1 387.2 387.2 70.1451 8.324e-11 ***
## DBH:SEX 1 34.8 34.8 6.3002 0.01565 *
## Residuals 46 253.9 5.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA results show strong evidence that there is a significant interaction between DBH and SEX (F 1,46 = 6.3, P = 0.016). We would hence expect that DBH’s effects on the number of flowers varies between male and female trees.
2D. Fit a generalised linear model (GLM) - Poisson
As the number of flowers is count data, we can instead fit a GLM to the data. Now, fit a GLM using a Poisson distribution to determine the effect of DBH and SEX on the number of flowers.
#fitting GLM
qn2glm1 <- glm(FLOWERS ~ DBH * SEX, family = poisson, data = qn2data)
Plot the model predicted lines and confidence intervals with your original data. (Hint: Plot your predicted lines and confidence interval on the same scale as your original data.)
#obtaining predictions and confidence intervals
glmpred <- ggpredict(qn2glm1, terms = c("DBH", "SEX"))
glmpred
## # Predicted counts of FLOWERS
##
## SEX: Female
##
## DBH | Predicted | 95% CI
## ----------------------------------
## 68 | 60.50 | 57.52, 63.64
## 110 | 105.53 | 101.34, 109.90
## 150 | 179.26 | 173.70, 184.99
## 192 | 312.66 | 305.50, 319.98
## 234 | 545.34 | 535.70, 555.15
## 316 | 1615.69 | 1577.45, 1654.85
##
## SEX: Male
##
## DBH | Predicted | 95% CI
## ----------------------------------
## 68 | 26.60 | 23.71, 29.83
## 110 | 50.06 | 45.79, 54.73
## 150 | 91.43 | 85.63, 97.62
## 192 | 172.09 | 164.96, 179.53
## 234 | 323.92 | 315.59, 332.47
## 316 | 1113.57 | 1059.21, 1170.72
#plotting GLM predicted values against original values
glm_plot <- ggplot()+
geom_point(data = qn2data,
aes(x = DBH, y = FLOWERS,
colour = SEX),
size = 1,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = glmpred,
aes(x = x, y = predicted,
fill = group),
size = 0.5)+
geom_ribbon(data = glmpred,
aes(x = x, ymin = conf.low, ymax = conf.high,
fill = group),
alpha = 0.2)+
labs(title = "Effect of tree diameter on the number of flowers of 2 tree sexes (Poisson)",
x = "Diameter at breast height (mm)",
y = "Number of flowers")+
guides(fill = FALSE) +
theme_minimal()
glm_plot
Run diagnostic plots to check your model fit.
par(mfrow = c(2, 2))
plot(qn2glm1)
From the residuals vs leverage plot, several residuals fall outside the Cook’s distance lines, indicating that these observations have a strong influence on the model and may be affecting its overall fit.
Report and interpret your results.
summary(qn2glm1)
##
## Call:
## glm(formula = FLOWERS ~ DBH * SEX, family = poisson, data = qn2data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.2020353 0.0343924 93.103 < 2e-16 ***
## DBH 0.0132452 0.0001321 100.276 < 2e-16 ***
## SEXMale -0.9453136 0.0869851 -10.868 < 2e-16 ***
## DBH:SEXMale 0.0018137 0.0003460 5.242 1.59e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 17921.6 on 49 degrees of freedom
## Residual deviance: 2458.7 on 46 degrees of freedom
## AIC: 2811.6
##
## Number of Fisher Scoring iterations: 5
#performing anova
anova(qn2glm1, test = "Chisq")
## Analysis of Deviance Table
##
## Model: poisson, link: log
##
## Response: FLOWERS
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 49 17921.6
## DBH 1 14343.9 48 3577.6 < 2.2e-16 ***
## SEX 1 1090.9 47 2486.7 < 2.2e-16 ***
## DBH:SEX 1 28.1 46 2458.7 1.168e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
From the anova results, there is a significant interaction between DBH and SEX (Χ² 1,46 = 2458.7, p <0.001).
What do you notice about the dispersion parameter in the summary() output?
From the summary output, the dispersion parameter indicates overdispersion of data, where the residual deviance is much larger than the residual degrees of freedom (a good ratio would be closer to 1).
2E. Fit a generalised linear model (GLM) - Quasipoisson
Now, fit a GLM with a Quasipoisson distribution. What has changed in your Quasipoisson model results?
#Quasipoisson GLM
qn2glm2 <- glm(FLOWERS ~ DBH * SEX, family = quasipoisson, data = qn2data)
summary(qn2glm2)
##
## Call:
## glm(formula = FLOWERS ~ DBH * SEX, family = quasipoisson, data = qn2data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.2020353 0.2450739 13.066 <2e-16 ***
## DBH 0.0132452 0.0009412 14.072 <2e-16 ***
## SEXMale -0.9453136 0.6198402 -1.525 0.134
## DBH:SEXMale 0.0018137 0.0024656 0.736 0.466
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 50.77736)
##
## Null deviance: 17921.6 on 49 degrees of freedom
## Residual deviance: 2458.7 on 46 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
#running model
anova(qn2glm2, test = "F")
## Analysis of Deviance Table
##
## Model: quasipoisson, link: log
##
## Response: FLOWERS
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev F Pr(>F)
## NULL 49 17921.6
## DBH 1 14343.9 48 3577.6 282.4870 < 2.2e-16 ***
## SEX 1 1090.9 47 2486.7 21.4837 2.958e-05 ***
## DBH:SEX 1 28.1 46 2458.7 0.5529 0.4609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
When the distribution is changed to a quasipoisson one, the significant interaction between DBH and SEX is no longer observed (F 1,46 = 0.5529, P = 0.46).
#running GLM without DBH:SEX interaction
qn2glm3 <- glm(FLOWERS ~ DBH + SEX, family = quasipoisson, data = qn2data)
summary(qn2glm3)
##
## Call:
## glm(formula = FLOWERS ~ DBH + SEX, family = quasipoisson, data = qn2data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1325813 0.2282061 13.727 < 2e-16 ***
## DBH 0.0135204 0.0008678 15.579 < 2e-16 ***
## SEXMale -0.4982782 0.1097203 -4.541 3.89e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 50.4053)
##
## Null deviance: 17921.6 on 49 degrees of freedom
## Residual deviance: 2486.7 on 47 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 5
anova(qn2glm3, test = "F")
## Analysis of Deviance Table
##
## Model: quasipoisson, link: log
##
## Response: FLOWERS
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev F Pr(>F)
## NULL 49 17921.6
## DBH 1 14343.9 48 3577.6 284.572 < 2.2e-16 ***
## SEX 1 1090.9 47 2486.7 21.642 2.698e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Plot the model predicted lines and confidence intervals with your original data. (Hint: Plot your predicted lines and confidence interval on the same scale as your original data.)
#obtaining predicted values
glm3pred <- ggpredict(qn2glm3, terms = c("DBH", "SEX"))
glm3pred
## # Predicted counts of FLOWERS
##
## SEX: Female
##
## DBH | Predicted | 95% CI
## ----------------------------------
## 68 | 57.51 | 40.67, 81.32
## 110 | 101.48 | 76.74, 134.18
## 150 | 174.28 | 139.97, 216.99
## 192 | 307.51 | 260.85, 362.52
## 234 | 542.60 | 476.53, 617.82
## 316 | 1644.25 | 1391.38, 1943.08
##
## SEX: Male
##
## DBH | Predicted | 95% CI
## ---------------------------------
## 68 | 34.94 | 24.46, 49.92
## 110 | 61.65 | 45.86, 82.89
## 150 | 105.89 | 82.92, 135.21
## 192 | 186.84 | 152.58, 228.78
## 234 | 329.67 | 274.88, 395.38
## 316 | 999.01 | 801.62, 1244.99
#plotting GLM predicted values against original values
glm_plot2 <- ggplot()+
geom_point(data = qn2data,
aes(x = DBH, y = FLOWERS,
colour = SEX),
size = 1,
position = position_jitter(w = 1, h = 0.03))+
geom_line(data = glm3pred,
aes(x = x, y = predicted,
fill = group),
size = 0.5)+
geom_ribbon(data = glm3pred,
aes(x = x, ymin = conf.low, ymax = conf.high,
fill = group),
alpha = 0.2)+
labs(title = "Effect of tree diameter on the number of flowers of 2 tree sexes (Quasipoisson)",
x = "Diameter at breast height (mm)",
y = "Number of flowers")+
guides(fill = FALSE) +
theme_minimal()
glm_plot2
Run diagnostic plots to check your model fit.
#checking diagnostic plots
par(mfrow = c(2, 2))
plot(qn2glm3)
The diagnostic plots have improved whereby there are no more significant outliers beyond the cook’s distance line. This shows that this model is a better fit.
Report and interpret your results.
The ANOVA results of this model show very strong evidence that both SEX and DBH have significant effects on the number of flowers counted on each tree (SEX: F 1,47 = 21.64, P < 0.001; DBH: F 1,48 = 284.57, P < 0.001).
2F.From the above models, which model would you select as the final model? Explain your reasoning.
The model with the quasipoisson distribution should be selected as according to its residual plots, it deals the best with outliers and is therefore the best fit for this dataset.
What are the advantages and disadvantages associated with running a linear model or a generalised linear model?
The application of LMs is limited to data with normal distributions, whereas GLMs are more flexible can be used for non-normal data with binomial and poisson distributions.
LMs assume homoscedasticity (constant variance) while GLMs can be used for data where variance increases with the mean (non-constant variance).
#loading the data
qn3df <- read.csv("C:/Users/LIEWY/Downloads/ES3307/ES3307 Assignments/dungbeetletraptype.csv")
#selecting relevant data
qn3data <- as.matrix(qn3df[,5:46])
3A. Based on the analysis you have selected, plot and interpret your ordination plot.
Since we are investigating community composition, we use the NMDS multivariate analysis method.
#goeveg and vegan package
install.packages("goeveg")
## package 'goeveg' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\LIEWY\AppData\Local\Temp\Rtmp08E8sd\downloaded_packages
library(goeveg)
library(vegan)
Performing NMDS
Step 1: Perform NMDS with 1 to 10 dimensions
Step 2: Check the stress vs dimension plot
#determining the optimal number of dimensions, set dimensions = 10
dimcheckMDS(qn3data, distance = "bray", #calculate dissimilarity using Bray-curtis distance
k = 10, trymax = 10,
autotransform = FALSE)
## | | | 0% | |======= | 10% | |============== | 20% | |===================== | 30% | |============================ | 40% | |=================================== | 50% | |========================================== | 60% | |================================================= | 70% | |======================================================== | 80% | |=============================================================== | 90% | |======================================================================| 100%
## 1 2 3 4 5 6 7 8 9 10
## 0.428 0.209 0.151 0.120 0.102 0.088 0.078 0.070 0.064 0.058
Step 3: Choose optimal number of dimensions
When the suitable level of stress = 0.2, k = 3 is the maximum number of dimensions.
Step 4: Perform final NMDS with that number of dimensions
#setting seed (can use any random number)
set.seed(3)
#performing NMDS
trapNMDS <- metaMDS(qn3data, k = 3, trymax = 100, trace = F)
#trymax -> number of times each algorithm is run for every number of K
#trace = F (False)->instructs console to only show output, no progress updates
#checking NMDS results
trapNMDS
##
## Call:
## metaMDS(comm = qn3data, k = 3, trymax = 100, trace = F)
##
## global Multidimensional Scaling using monoMDS
##
## Data: wisconsin(sqrt(qn3data))
## Distance: bray
##
## Dimensions: 3
## Stress: 0.1517875
## Stress type 1, weak ties
## Best solution was repeated 1 time in 27 tries
## The best solution was from try 20 (random start)
## Scaling: centring, PC rotation, halfchange scaling
## Species: expanded scores based on 'wisconsin(sqrt(qn3data))'
Step 5: Check for convergent solution and final stress.
#checking stress plot
stressplot(trapNMDS)
The final stress can be calculated by Stress = √(1 - R²). In this case, our final stress is 0.15, which is acceptable.
Plotting ordination plot
#extracting species scores
species.scores <- as.data.frame(scores(trapNMDS, "species"))
species.scores$species <- rownames(species.scores)
species.scores$type <- "species"
print(head(species.scores))
## NMDS1 NMDS2 NMDS3
## Catharsius.dayacus 0.3910190 -0.6666602 0.43215503
## Catharsius.renaudpauliani 1.0943895 0.0134793 0.37510934
## Copris.agnus -0.1230659 -1.3616843 0.57531934
## Copris.sinicus -0.1126617 -1.2277081 0.25817816
## Microcopris.doriae 0.1257673 -0.9969162 0.22957946
## Microcopris.hidakai -0.3479274 -1.0758146 0.08422025
## species type
## Catharsius.dayacus Catharsius.dayacus species
## Catharsius.renaudpauliani Catharsius.renaudpauliani species
## Copris.agnus Copris.agnus species
## Copris.sinicus Copris.sinicus species
## Microcopris.doriae Microcopris.doriae species
## Microcopris.hidakai Microcopris.hidakai species
#extracting site scores
site.scores <- as.data.frame(scores(trapNMDS, "sites"))
site.scores$sites <- rownames(site.scores)
site.scores$type <- "sites"
#adding environmental factors
site.scores <- cbind (site.scores,
trap = qn3df$trap)
print(head(site.scores))
## NMDS1 NMDS2 NMDS3 sites type trap
## 1 -0.8098117 0.26337581 -0.30724981 1 sites FIT
## 2 -0.6749437 0.24386316 -0.15957233 2 sites FIT
## 3 -0.7341704 0.19835953 -0.07835963 3 sites FIT
## 4 -0.7219002 0.06868809 -0.13937748 4 sites FIT
## 5 -0.6663365 0.26413346 -0.32146555 5 sites FIT
## 6 -0.6526782 0.20214436 -0.32567393 6 sites FIT
ggplot() +
geom_point(data = species.scores,
aes(x = NMDS1,
y = NMDS2),
size = 2) +
geom_point(data = site.scores,
aes(x = NMDS1,
y = NMDS2,
colour = trap,
alpha = 0.5),
shape = 17,
size = 2) +
stat_ellipse(data = site.scores, # classify sites by trap type and display using 95% confidence ellipses
aes(x = NMDS1,
y = NMDS2,
colour = trap,
fill = trap),
type = "norm") +
theme_minimal()
3B. Using appropriate statistical tests, report and interpret your results.
To statistically determine the difference between community compositions of the 2 trap types, we perform a PERMANOVA analysis using the adonis2() function from the vegan package.
#running PERMANOVA
trap_adonis <- adonis2(qn3data ~ trap, data = qn3df)
print(trap_adonis)
## Permutation test for adonis under reduced model
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = qn3data ~ trap, data = qn3df)
## Df SumOfSqs R2 F Pr(>F)
## Model 1 13.967 0.15314 54.611 0.001 ***
## Residual 302 77.235 0.84686
## Total 303 91.201 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The PERMANOVA results provide strong evidence that the dung beetle community composition between different traps are significantly different (F 1,302 = 54.61, p < 0.001).
However, as this statistical difference may be a result of variances rather than means, the dispersions of the trap types must be checked.
#creating dissimilarity matrix
dismatrix <- vegdist(qn3data)
#checking for non-equal dispersions using betadisper() and permutest() including pairise distances
distest <- betadisper(dismatrix, qn3df$trap)
permutest(distest, pairwise = TRUE)
##
## Permutation test for homogeneity of multivariate dispersions
## Permutation: free
## Number of permutations: 999
##
## Response: Distances
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 1 0.04658 0.046578 4.6154 999 0.026 *
## Residuals 302 3.04772 0.010092
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Pairwise comparisons:
## (Observed p-value below diagonal, permuted p-value above diagonal)
## BPT FIT
## BPT 0.03
## FIT 0.032481
#visualizing differences in dispersions between groups
plot(distest)
Visually, this graph supports the previous conclusion, as the clusters and centroids do not entirely overlap. It allows us to conclude that the significant differences from the PERMANOVA analysis are due to the difference in group centroids and dispersions (spatial distance from the centroids).