# Red-footed Booby in Chagos dimorphism analysis
# Peter Carr
# June 2019
# load libraries
library(ggplot2)
library(agricolae)
library(klaR)
library(irr)
library(cowplot)
# Added by RF
library(lme4)
library(lmerTest)
library(emmeans)
library(pROC)
library(reshape2)
library(caret)
library(sjPlot)#setwd("C:/RData")
# read in data
data<- read.csv("20190617-RFB_DFA-Data.csv")
data<-data.frame(data)
summary(data)## ID SEX LOC AT Mass
## GV37550: 4 FEMALE:104 BARTON POINT:63 DG :131 Min. : 700.0
## GV37510: 3 MALE : 86 CUST POINT :62 NGCB: 26 1st Qu.: 850.0
## GV37520: 3 NELSON :26 PB : 33 Median : 932.5
## GV37541: 3 GR. COQ :15 Mean : 939.2
## GV37558: 3 PARASOL : 7 3rd Qu.:1007.5
## GV37505: 2 BP NORTH : 6 Max. :1420.0
## (Other):172 (Other) :11
## WL
## Min. :362.0
## 1st Qu.:382.2
## Median :390.0
## Mean :389.1
## 3rd Qu.:395.0
## Max. :417.0
##
#------------------------------------------------------------------------------------------
### Confirming there is sexual dimorphism in Red-footed Booby
# Is there a significant difference between sexes in WL length and mass at the archipelago level? (Eva)
summary(lm(Mass ~ SEX, data))##
## Call:
## lm(formula = Mass ~ SEX, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -188.85 -64.71 -18.85 56.15 411.15
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1008.846 9.756 103.41 <2e-16 ***
## SEXMALE -153.846 14.501 -10.61 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 99.5 on 188 degrees of freedom
## Multiple R-squared: 0.3745, Adjusted R-squared: 0.3712
## F-statistic: 112.6 on 1 and 188 DF, p-value: < 2.2e-16
### Yes, there is a highly significant difference between males and females mass and wing length
# ROBIN TO CHECKRF: Appears so yes, but there is some repeated measures in the data (multiple samples per individual) and there’s multiple locations. Given the distribution of individuals is mostly single samples:
RF: this seems unlikely to have much impact, but it may be worth running the same model with ID as a random factor and including location (you could imagine the situation where all the females were from one locations and they were bigger?
# I'm using the lmerTest package here to get p-values. There's plenty of argument that we shouldn't
# be using p-values at all, but let's leave that for another day.
m1 <- lmer(Mass ~ SEX + LOC + (1|ID),data=data)
summary(m1)## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Mass ~ SEX + LOC + (1 | ID)
## Data: data
##
## REML criterion at convergence: 2181.5
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.50271 -0.32280 -0.02408 0.29981 2.07402
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 7653 87.48
## Residual 2627 51.25
## Number of obs: 190, groups: ID, 171
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1021.98 15.33 165.25 66.654 <2e-16 ***
## SEXMALE -154.51 16.12 158.92 -9.587 <2e-16 ***
## LOCBP NORTH -10.35 46.95 153.91 -0.220 0.8258
## LOCCUST POINT -17.99 18.63 179.50 -0.966 0.3355
## LOCGR. COQ 22.39 29.92 164.73 0.748 0.4553
## LOCLONGUE -84.73 43.76 164.16 -1.936 0.0545 .
## LOCMORESBY 33.02 52.96 163.83 0.623 0.5338
## LOCNELSON 2.68 24.86 165.20 0.108 0.9143
## LOCPARASOL -58.62 40.82 164.21 -1.436 0.1529
## LOCVERTE -67.47 102.84 163.77 -0.656 0.5127
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SEXMAL LOCBPN LOCCUP LOCGRC LOCLON LOCMOR LOCNEL LOCPAR
## SEXMALE -0.400
## LOCBP NORTH -0.300 0.064
## LOCCUSTPOIN -0.637 -0.062 0.212
## LOCGR. COQ -0.383 -0.118 0.133 0.346
## LOCLONGUE -0.277 -0.044 0.093 0.234 0.156
## LOCMORESBY -0.289 0.116 0.087 0.184 0.111 0.080
## LOCNELSON -0.437 -0.202 0.156 0.420 0.289 0.190 0.127
## LOCPARASOL -0.308 -0.019 0.102 0.250 0.164 0.111 0.089 0.198
## LOCVERTE -0.086 -0.097 0.035 0.105 0.076 0.048 0.025 0.097 0.049
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## SEX 241403 241403 1 158.92 91.9018 <2e-16 ***
## LOC 23036 2879 8 164.60 1.0962 0.3684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RF: This reinforces your finding that Mass is a useful predictor of SEX, the effect is even a teeny tiny bit larger when location and individual and location are taken into acccount
# Plotting graphs to view data - males v females mass
ggplot(data,
aes(SEX, Mass)) +
geom_boxplot() +
theme(panel.background=element_rect(fill="white", colour="black"),
axis.text=element_text(size=10, color="black"),
axis.text.x = element_text(angle=45, hjust = 1),
axis.title=element_text(size=24)) +
xlab("ATOLL") +
ylab("MASS (g)")##
## Call:
## lm(formula = WL ~ SEX, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23.7212 -5.5930 0.2788 6.2788 23.2788
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 393.7212 0.8956 439.603 < 2e-16 ***
## SEXMALE -10.1281 1.3312 -7.608 1.3e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.134 on 188 degrees of freedom
## Multiple R-squared: 0.2354, Adjusted R-squared: 0.2313
## F-statistic: 57.88 on 1 and 188 DF, p-value: 1.299e-12
RF: Again, this is worth doing while taking individual and location into account… and again, the effect holds
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: WL ~ SEX + LOC + (1 | ID)
## Data: data
##
## REML criterion at convergence: 1319.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.44868 -0.37943 0.01169 0.39662 1.44590
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 54.12 7.357
## Residual 28.87 5.373
## Number of obs: 190, groups: ID, 171
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 394.6671 1.3757 161.8953 286.883 < 2e-16 ***
## SEXMALE -11.0612 1.4441 162.5812 -7.659 1.58e-12 ***
## LOCBP NORTH -3.0997 4.1945 155.4318 -0.739 0.461
## LOCCUST POINT -1.7697 1.6891 176.4990 -1.048 0.296
## LOCGR. COQ 2.8363 2.6883 167.4319 1.055 0.293
## LOCLONGUE -5.8032 3.9313 168.0193 -1.476 0.142
## LOCMORESBY 1.0829 4.7583 168.0204 0.228 0.820
## LOCNELSON 2.2983 2.2335 166.8912 1.029 0.305
## LOCPARASOL 0.6448 3.6679 167.9298 0.176 0.861
## LOCVERTE 0.3941 9.2407 168.4145 0.043 0.966
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SEXMAL LOCBPN LOCCUP LOCGRC LOCLON LOCMOR LOCNEL LOCPAR
## SEXMALE -0.398
## LOCBP NORTH -0.302 0.065
## LOCCUSTPOIN -0.639 -0.063 0.214
## LOCGR. COQ -0.383 -0.119 0.134 0.347
## LOCLONGUE -0.277 -0.044 0.094 0.235 0.156
## LOCMORESBY -0.289 0.115 0.087 0.185 0.111 0.080
## LOCNELSON -0.438 -0.202 0.157 0.422 0.289 0.190 0.127
## LOCPARASOL -0.308 -0.019 0.102 0.250 0.164 0.111 0.089 0.198
## LOCVERTE -0.087 -0.097 0.035 0.105 0.076 0.048 0.025 0.097 0.049
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## SEX 1693.87 1693.87 1 162.58 58.667 1.583e-12 ***
## LOC 239.53 29.94 8 167.60 1.037 0.4103
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ggplot(data,
aes(SEX, WL)) +
geom_boxplot() +
theme(panel.background=element_rect(fill="white", colour="black"),
axis.text=element_text(size=10, color="black"),
axis.text.x = element_text(angle=45, hjust = 1),
axis.title=element_text(size=24))+
#scale_y_continuous(limits = c(360, 450))+
xlab("SEX") +
ylab("Wing length (mm)")### Confirming there is no difference in biometrics for males/females between three atolls
Male <- subset(data, data$SEX == "MALE")
WLbyAT <-aov(WL ~ AT, data = Male)
summary(WLbyAT)## Df Sum Sq Mean Sq F value Pr(>F)
## AT 2 193 96.52 1.121 0.331
## Residuals 83 7144 86.07
## Df Sum Sq Mean Sq F value Pr(>F)
## AT 2 19841 9920 1.261 0.289
## Residuals 83 652709 7864
RF: Again, I’d be tempted to just include this in the models above, so:
RF: These highlight that in a model including individual and sex to predict WL or Mass, there’s no significant effect of AT/atholl
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: Mass ~ SEX + AT + (1 | ID)
## Data: data
##
## REML criterion at convergence: 2246.9
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.45613 -0.34950 -0.08603 0.36239 2.03128
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 7703 87.77
## Residual 2689 51.86
## Number of obs: 190, groups: ID, 171
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 1012.930 11.535 160.861 87.813 <2e-16 ***
## SEXMALE -156.498 15.842 163.869 -9.879 <2e-16 ***
## ATNGCB 13.107 22.599 167.672 0.580 0.563
## ATPB -5.688 20.182 167.398 -0.282 0.778
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SEXMAL ATNGCB
## SEXMALE -0.563
## ATNGCB -0.237 -0.198
## ATPB -0.357 -0.059 0.211
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## SEX 262477 262477 1 163.87 97.5933 <2e-16 ***
## AT 1365 682 2 167.79 0.2537 0.7762
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: WL ~ SEX + AT + (1 | ID)
## Data: data
##
## REML criterion at convergence: 1352.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.42674 -0.35999 0.02849 0.41428 1.51498
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 54.02 7.350
## Residual 28.66 5.354
## Number of obs: 190, groups: ID, 171
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 393.602 1.025 163.690 384.131 < 2e-16 ***
## SEXMALE -11.042 1.409 167.584 -7.835 5.13e-13 ***
## ATNGCB 3.350 2.014 172.275 1.663 0.0981 .
## ATPB 1.570 1.799 171.941 0.873 0.3839
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) SEXMAL ATNGCB
## SEXMALE -0.563
## ATNGCB -0.236 -0.198
## ATPB -0.356 -0.059 0.210
## Type III Analysis of Variance Table with Satterthwaite's method
## Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## SEX 1759.56 1759.56 1 167.58 61.390 5.128e-13 ***
## AT 87.53 43.77 2 172.40 1.527 0.2201
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
RF: And the following p-values tell us that there’s no significant difference in mass or wing-length between atholls (including when sex and individual are taken into account)
## $emmeans
## AT emmean SE df lower.CL upper.CL
## DG 935 9.65 164 916 954
## NGCB 948 20.22 171 908 988
## PB 929 17.75 171 894 964
##
## Results are averaged over the levels of: SEX
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## DG - NGCB -13.11 22.6 170 -0.580 0.8310
## DG - PB 5.69 20.2 170 0.282 0.9572
## NGCB - PB 18.80 26.9 171 0.698 0.7651
##
## Results are averaged over the levels of: SEX
## P value adjustment: tukey method for comparing a family of 3 estimates
## contrast estimate SE df t.ratio p.value
## DG - NGCB -13.11 22.6 170 -0.580 0.8310
## DG - PB 5.69 20.2 170 0.282 0.9572
## NGCB - PB 18.80 26.9 171 0.698 0.7651
##
## Results are averaged over the levels of: SEX
## P value adjustment: tukey method for comparing a family of 3 estimates
## $emmeans
## SEX emmean SE df lower.CL upper.CL
## FEMALE 1015 12.6 169 990 1040
## MALE 859 12.1 169 835 883
##
## Results are averaged over the levels of: AT
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## FEMALE - MALE 156 15.8 167 9.876 <.0001
##
## Results are averaged over the levels of: AT
## contrast estimate SE df t.ratio p.value
## FEMALE - MALE 156 15.8 167 9.876 <.0001
##
## Results are averaged over the levels of: AT
## $emmeans
## AT emmean SE df lower.CL upper.CL
## DG 388.1 0.8574 162.8 386.4 389.8
## NGCB 391.4 1.8038 173.2 387.9 395.0
## PB 389.7 1.5831 173.3 386.5 392.8
##
## Results are averaged over the levels of: SEX
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## DG - NGCB -3.35 2.01 172 -1.663 0.2224
## DG - PB -1.57 1.80 171 -0.873 0.6581
## NGCB - PB 1.78 2.40 173 0.741 0.7394
##
## Results are averaged over the levels of: SEX
## P value adjustment: tukey method for comparing a family of 3 estimates
## contrast estimate SE df t.ratio p.value
## DG - NGCB -3.35 2.01 172 -1.663 0.2224
## DG - PB -1.57 1.80 171 -0.873 0.6581
## NGCB - PB 1.78 2.40 173 0.741 0.7394
##
## Results are averaged over the levels of: SEX
## P value adjustment: tukey method for comparing a family of 3 estimates
RF: I’m not sure what all these next bits are for….
## Df Wilks approx F num Df den Df Pr(>F)
## AT 2 0.95795 0.89015 4 164 0.4712
## Residuals 83
??manova
## No difference between sexes at locations, significant differece between sexes
# Further analysis of differences using univariate analyses
WL.model <- aov(lm(WL ~ AT * SEX, data=data))
summary(WL.model)## Df Sum Sq Mean Sq F value Pr(>F)
## AT 2 22 11 0.129 0.879
## SEX 1 5105 5105 61.065 4.1e-13 ***
## AT:SEX 2 2 1 0.012 0.988
## Residuals 184 15383 84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## $statistics
## MSerror Df Mean CV
## 83.60545 184 389.1368 2.349714
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey AT 3 3.341555 0.05
##
## $means
## WL std r Min Max Q25 Q50 Q75
## DG 388.9313 10.807541 131 363 417 382.00 389 395.5
## NGCB 389.3077 9.332820 26 370 409 384.25 387 393.0
## PB 389.8182 9.888297 33 362 410 385.00 390 395.0
##
## $comparison
## NULL
##
## $groups
## WL groups
## PB 389.8182 a
## NGCB 389.3077 a
## DG 388.9313 a
##
## attr(,"class")
## [1] "group"
## No significant difference between atolls in wing length when males and females are grouped.
Mass.model <- aov(lm(Mass ~ AT * SEX, data=data))
summary(Mass.model)## Df Sum Sq Mean Sq F value Pr(>F)
## AT 2 19184 9592 0.963 0.384
## SEX 1 1102363 1102363 110.684 <2e-16 ***
## AT:SEX 2 21120 10560 1.060 0.348
## Residuals 184 1832565 9960
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## No significant difference between atolls in MASS when males and females are grouped.
(HSD.test(Mass.model, "AT")) # ## $statistics
## MSerror Df Mean CV
## 9959.591 184 939.2105 10.62571
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey AT 3 3.341555 0.05
##
## $means
## Mass std r Min Max Q25 Q50 Q75
## DG 945.4580 130.3847 131 700 1420 860 940 1020
## NGCB 917.6923 113.0065 26 730 1220 840 900 990
## PB 931.3636 115.4832 33 730 1190 850 950 990
##
## $comparison
## NULL
##
## $groups
## Mass groups
## DG 945.4580 a
## PB 931.3636 a
## NGCB 917.6923 a
##
## attr(,"class")
## [1] "group"
# Plotting graphs to view data - WL length by atoll
ggplot(data,
aes(AT, WL)) +
geom_boxplot() +
facet_wrap(~SEX) +
theme(panel.background=element_rect(fill="white", colour="black"),
axis.text=element_text(size=10, color="black"),
axis.text.x = element_text(angle=45, hjust = 1),
axis.title=element_text(size=24)) +
xlab("ATOLL") +
ylab("WL LENGTH (mm)")## Plotting graphs to view data - Mass by atoll
ggplot(data,
aes(AT, Mass)) +
geom_boxplot() +
facet_wrap(~SEX) +
theme(panel.background=element_rect(fill="white", colour="black"),
axis.text=element_text(size=10, color="black"),
axis.text.x = element_text(angle=45, hjust = 1),
axis.title=element_text(size=24)) +
xlab("ATOLL") +
ylab("MASS (g)")### Graph for BPMS Vava report
#-------------------------------------------------------------------------------------
### DIEGO GARCIA DFA
# Subset & reorder the data
DG <- subset(data, LOC == "BARTON POINT"| LOC == "BP NORTH" | LOC == "CUST POINT")
# iterate the cross-validation process
iter <- 1000
correct.rate <- 0
for(i in 1:iter) {
# ** This is creating a random train/test set for every iteration *
DG$rand <- runif(dim(DG)[1]) # add a random number to each case
DG <- DG[order(DG$rand),] # sort the dataset by random number
DG.train <- DG[1:(0.67*dim(DG)[1]),] # use the first 2/3rds of the data to train the DFA
DG.test <- DG[round(0.67*dim(DG)[1]+1):dim(DG)[1],] # and the last third to test the DFA
# ** And building a LDA for every iteration (so 1000 times)
# Build the DFA
# First, do the selection
dfa.data <- cbind(DG.train$Mass, DG.train$WL)
dfa.svw <- stepclass(dfa.data, DG.train$SEX, method = "lda", direction = c("backward"), criterion = "CR", fold = 10) # so not much difference in the correctness rate (88.2 vs 88.4%)
# And now, the posterior probabilities of F & M
dfa <- lda(SEX ~ Mass + WL, data=DG.train, CV=TRUE)
# ** Then testing its accuracy on the training dataset
# Cross-validation TaMasse
ct <- table(DG.train$SEX, dfa$class)
correct.rate[i] <- (ct[1,1] + ct[2,2])/sum(ct)
print(i)
}
# ** So this is the average **training** accuracy of the LDA
summary(correct.rate)
quantile(correct.rate, c(0.025, 0.975))
# ** And this is building a (separate to the LDA/DFA above) logistic regression to predict
# ** sex using Mass and WL
# Logistic regression & coefficients
log.reg <- glm(SEX ~ Mass + WL, data=DG.train, family="binomial")
log.reg
#D.2 <- DG.train$Mass*log.reg$coefficients[2] + DG.train$WL*log.reg$coefficients[3] + log.reg$coefficients[1]
# * Replaced with (is the same)....
D.2 <- predict(log.reg, DG.train)
#* dfa.plot <- as.data.frame(cbind(D.2, dfa$posterior, DG.train$SEX))
#* dfa.plot$V4 <- as.factor(dfa.plot$V4)
#* colnames(dfa.plot) <- c("D2", "Pr.F", "Pr.M", "Sex") # in the Sex column, 1 = F, 2 = M
#* summary(dfa.plot)
# ** Tidier version of the above....
dfa.plot = data.frame(D.2 = D.2, Pr.F=dfa$posterior[, 1], Pr.M = dfa$posterior[, 2], Sex=DG.train$SEX)
summary(dfa.plot)
#I DO NOT UNDERSTAND THIS PLOT
# * This is converting the probability of female from the LDA into logit (logit(p) = log(p/(1-p))),
# * I'm guessing to compare to the glm value??
# Figure out the critical values (i.e., calculate D for a given p(F))
trans.post <- qlogis(dfa$posterior[,1])
log.reg2 <- glm(D.2 ~ trans.post, family="gaussian")
cutoffs <- qlogis(c(0.25, 0.33, 0.50, 0.67, 0.75))# using 25, 33, and 50% cut-offs
cutoffs <- data.frame(cutoffs)
colnames(cutoffs) = c("trans.post")
pred.model <- predict(log.reg2, cutoffs)
pred.model
# Cohen's kappa - to test whether classification was better than chance
kappa2(cbind(DG.train$SEX, dfa$class), weight = "equal") # and it is!
# Predict the testing dataset sex
#DFA.score <- predict(log.reg, DG.test)
#DG.test$ProbA <- 1/(1+exp(-(log.reg$coefficients[1] + log.reg$coefficients[2]*DG.test$Mass + log.reg$coefficients[3]*DG.test$WL)))
# Replaced with (is the same, "response" gets you the probability).....
DG.test$ProbA <- predict(log.reg, DG.test, type="response")
summary(DG.test)
#Robin
###I DO NOT UNDERSTAND THIS SUMMARY.
#IN THE SUMMARY ARE THE MALE AND FEMALE BIOMETRICS COMBINED?
#HOW MANY BIRDS WERE SEXED CORRECTLY FROM THE TRAINING DATA?
#WHAT ARE THE BIOMETRIC PARAMETERS FOR SEXING BIRDS?
#CAN THIS NOW BE USED TO SEX ALL OTHER BIRDS TRACKED ON DG (OR THE CHAGOS)?RF: So the table DG.test is this:
# ID SEX LOC AT Mass WL rand ProbA female
# GV37607 FEMALE CUST POINT DG 920 393 0.6618114 3.433514e-01 TRUERF: * Where ID, SEX, LOC, AT, Mass, WL are the original data * rand is just a random number generated to create training/test datasets * ProbA is the probability that the bird in that row is female
RF: However, I find all the above a bit confusing (the combination of the LDA and the GLM). So I’ve had a go at how I would do it below…
# Tackling prediction of sex just using a GLM
# make new variable that is TRUE for females
# (makes it a little easier to interpret later)
DG <- subset(data, LOC == "BARTON POINT"| LOC == "BP NORTH" | LOC == "CUST POINT")
DG$female = DG$SEX != "MALE"
# Split data into training test 2/3 vs 1/3 (0.66)
train_index <- sample(1:nrow(DG), 0.66 * nrow(DG))
test_index <- setdiff(1:nrow(DG), train_index)
DG.train <- DG[train_index,] # use the first 2/3rds of the data to train
DG.test <- DG[test_index,] # and the last third to test
# Make model predicting sex (female) from Mass and wing length (using only training data)
log.regB = glm(factor(female) ~ Mass*WL, data=DG.train, family="binomial")
# Extract predicted probabilities for test data (see newdata)
pred <- predict(log.regB, type="response", newdata=DG.test)
# Calculate specificity/sensitivity curve (0.5 threshold looks ok)
myroc <- roc(DG.test$female, pred, ci=T)## Setting levels: control = FALSE, case = TRUE
## Setting direction: controls < cases
## Warning in coords.roc(myroc, "all"): An upcoming version of pROC will
## set the 'transpose' argument to FALSE by default. Set transpose = TRUE
## explicitly to keep the current behavior, or transpose = FALSE to adopt
## the new one and silence this warning. Type help(coords_transpose) for
## additional information.
melted_data = melt(data.frame(t(mycoords)), id=c("threshold"))
ggplot(data = melted_data, aes(x=threshold, y=value, group=variable, color=variable)) +
geom_line() +
xlab("Threshold") +
ylab("Performance") RF: Calculate confusion matrix for test data
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 15 0
## TRUE 7 23
##
## Accuracy : 0.8444
## 95% CI : (0.7054, 0.9351)
## No Information Rate : 0.5111
## P-Value [Acc > NIR] : 3.102e-06
##
## Kappa : 0.6866
##
## Mcnemar's Test P-Value : 0.02334
##
## Sensitivity : 1.0000
## Specificity : 0.6818
## Pos Pred Value : 0.7667
## Neg Pred Value : 1.0000
## Prevalence : 0.5111
## Detection Rate : 0.5111
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.8409
##
## 'Positive' Class : TRUE
##
# Shows accuracy is 0.8444 (0.7054, 0.9351),
# 27/30 females corrected predicted (TRUE=female, FALSE=male in output)
# 12/13 males correctly predicted (in test data)
DG.test$prob_female = pred
DG.test$correct_pred = ((DG.test$prob_female > 0.5) == DG.test$female)
# Uncomment to save file
#write.csv(DG.test, "DG_testing_results.csv")RF: Can also explore training accuracy if you want
train_pred <- predict(log.regB, type="response") # If no newdata is given, this uses training data
threshold <- 0.5
confusionMatrix(factor(train_pred>threshold), factor(DG.train$female), positive="TRUE")## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 22 4
## TRUE 8 52
##
## Accuracy : 0.8605
## 95% CI : (0.7689, 0.9258)
## No Information Rate : 0.6512
## P-Value [Acc > NIR] : 1.138e-05
##
## Kappa : 0.683
##
## Mcnemar's Test P-Value : 0.3865
##
## Sensitivity : 0.9286
## Specificity : 0.7333
## Pos Pred Value : 0.8667
## Neg Pred Value : 0.8462
## Prevalence : 0.6512
## Detection Rate : 0.6047
## Detection Prevalence : 0.6977
## Balanced Accuracy : 0.8310
##
## 'Positive' Class : TRUE
##
# Shows accuracy is 0.8391 (0.7448, 0.9091),
# 44/48 females corrected predicted
# 29/39 males correctly predicted (in test data)
# ** This suggests that the prediction is good, but that the errors tend to be in predicting males are females
DG.train$prob_female = train_pred
DG.train$correct_pred = ((DG.train$prob_female > 0.5) == DG.train$female)
# Uncomment to save file
#write.csv(DG.test, "DG_testing_results.csv")RF: We can plot predictions for each fixed effect separately
## $Mass
##
## $WL
RF: But more useful is to include the interaction…
newmass = seq(min(DG$Mass), max(DG$Mass), length.out = 200)
newWL = seq(min(DG$WL), max(DG$WL), length.out = 200)
newdata = expand.grid(newmass, newWL)
colnames(newdata) = c("Mass", "WL")
newdata$prob_female = predict(log.regB, newdata=newdata, type="response")RF: And then the probability surface
ggplot(newdata, aes(x=Mass, y=WL, z=prob_female, fill=prob_female)) +
geom_tile() +
geom_contour(breaks = c(0.2, 0.3, 0.4, 0.6, 0.7, 0.8)) +
geom_contour(breaks = c(0.5), color="red", linetype="dotted", size=2) +
geom_point(data=subset(DG, SEX=="MALE"), aes(z=1, fill=1), color="blue") +
geom_point(data=subset(DG, SEX=="FEMALE"), aes(z=1, fill=1), color="orange") +
scale_fill_distiller(palette = "Greys") ## Warning: Ignoring unknown aesthetics: z
## Warning: Ignoring unknown aesthetics: z
# This does highlight a couple of interesting outliers in the data
# A male with mass > 1200g
# A female with mass ~850g##########################
#####
#FIND OUT WHAT THIS TABLE MEANS
###########
###########################
# output a file with the probability(F) for each bird in the testing dataset
write.csv(DG.test, "DG testing results.csv")
#ROBIN - How can I direct the directory where this is stored?
# * Yup, you can put the path before the file.... (use forward slashes though! e.g.)
# write.csv(DG.test, "C:/mydir/mysubdir/etc/etc/DG testing results.csv")
# Plot DFA score vs. prob(F)
tiff(filename="DG DFA.tiff", res=150, width=174, height=174, units="mm", compression=c("lzw"))
# Robin - How can I direct where tis tiff is stored?
# Yup, same as before....
# tiff(filename="C:/mydir/mysubdir/blah/blah/DG DFA.tiff", res=150, width=174, height=174, units="mm", compression=c("lzw"))
ggplot(dfa.plot,
aes(D.2, Pr.F, colour=Sex)) +
scale_colour_manual(values=c("black", "grey")) +
geom_point(size=5) +
theme(panel.background=element_rect(fill="white", colour="black"),
legend.position="none",
axis.text=element_text(size=18, color="black"),
axis.title=element_text(size=20),
strip.text.x=element_text(size=18, color="black"),
strip.background=element_rect(fill="white", colour="black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()) +
xlab("Discriminant score") +
ylab("Probability(Female)")
dev.off()
# and a figure of WL vs Mass with line to indicate separation
# Figure out the solid line in the scatterplot for 50% cutoff
Len <- DG.train$Mass
Dep <- (-log.reg$coefficients[1]-log.reg$coefficients[2]*Len)/log.reg$coefficients[3] # this uses the coefficients from log.reg
line.model <- lm(Dep ~ Len)
ggplot(DG,
aes(Mass, WL, colour=SEX)) +
scale_colour_manual(values=c("black", "grey")) +
scale_x_continuous(limits=c(700, 1500)) +
scale_y_continuous(limits=c(360, 425)) +
geom_point(size=5) +
geom_abline(intercept = line.model$coefficients[1], slope = line.model$coefficients[2]) +
theme(panel.background=element_rect(fill="white", colour="black"),
legend.position="none",
axis.text=element_text(size=18, color="black"),
axis.title=element_text(size=20),
strip.text.x=element_text(size=18, color="black"),
strip.background=element_rect(fill="white", colour="black"),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank()) +
xlab("MASS (g)") +
ylab("WING LENGTH (mm)")### Good graph for BPMS Vava report
##QUESTION ROBIN- WHAT ARE THE ACTUAL NUMBERS FOR MALE WL AND MASS AND FEMALE WL AND MASS???????
# If we go all the way back to our first model (m1b) we can get the marginal means for Male and Female:
emmeans(m1b, specs = pairwise ~ SEX)## $emmeans
## SEX emmean SE df lower.CL upper.CL
## FEMALE 1015 12.6 169 990 1040
## MALE 859 12.1 169 835 883
##
## Results are averaged over the levels of: AT
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## FEMALE - MALE 156 15.8 167 9.876 <.0001
##
## Results are averaged over the levels of: AT
# Which suggests that
# Females are on average 1015g (990 - 1040), and
# Males are 859g (835 - 883)
emmeans(m2b, specs = pairwise ~ SEX)## $emmeans
## SEX emmean SE df lower.CL upper.CL
## FEMALE 395 1.13 169 393 397
## MALE 384 1.07 171 382 386
##
## Results are averaged over the levels of: AT
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $contrasts
## contrast estimate SE df t.ratio p.value
## FEMALE - MALE 11 1.41 167 7.831 <.0001
##
## Results are averaged over the levels of: AT