output_col = FALSE # Recommend changing this to TRUE if working in Base R or RStudio, and FALSE if generating an html
source("src/clean_flight_data.R") # Script that loads and cleans up the data
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
source("src/regression_output.R") # A script that cleans up regression outputs and prints in color or black and white
source("src/center_flight_data.R")
source("src/get_warnings.R")
data <- read_flight_data("data/all_flight_data-Winter2020.csv")
data_all <- data[[1]]
data_tested <- data[[2]]
d <- data_tested %>%
group_by(ID, sex,population, site, host_plant, latitude, longitude, total_eggs,
beak, thorax, wing, body, w_morph, morph_notes, tested,
host_c, sex_c, w_morph_c, lat_c, sym_dist, sym_dist_s, total_eggs_c,
beak_c, thorax_c, thorax_s, body_c, wing_c, wing2body, wing2body_c, wing2body_s) %>%
summarise_all(funs(list(na.omit(.))))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
d$num_flew <- 0
d$num_notflew <- 0
d$average_mass <- 0
d$avg_days <- 0
for(row in 1:length(d$flew_b)){
n_flew <- sum(d$flew_b[[row]] == 1) # total number of times did fly among trails
d$num_flew[[row]] <- n_flew
n_notflew <- sum(d$flew_b[[row]] == 0) # total number of times did not fly among trails
d$num_notflew[[row]] <- n_notflew
avg_mass <- mean(d$mass[[row]]) # average mass
d$average_mass[[row]] <- avg_mass
avg_days <- mean(d$days_from_start[[row]]) # average days
d$avg_days[[row]] <- avg_days
}
d_all <- select(d, -filename, -channel_letter, -set_number) ##MC: changed the name here to avoid re-loading all data to generate male and female only centered datasets
d_all <- center_data(d_all, is_not_binded = FALSE) # AB: need this here or else average_mass_c and some other columns do not get calculated
d_all
## # A tibble: 333 x 64
## # Groups: ID, sex, population, site, host_plant, latitude, longitude,
## # total_eggs, beak, thorax, wing, body, w_morph, morph_notes, tested, host_c,
## # sex_c, w_morph_c, lat_c, sym_dist, sym_dist_s, total_eggs_c, beak_c,
## # thorax_c, thorax_s, body_c, wing_c, wing2body, wing2body_c [333]
## ID sex population site host_plant latitude longitude total_eggs beak
## <fct> <fct> <fct> <fct> <fct> <dbl> <dbl> <dbl> <dbl>
## 1 1 M Plantatio… Areg… C. corind… 25.0 -80.6 NA 5.5
## 2 2 M Plantatio… Areg… C. corind… 25.0 -80.6 NA 5.64
## 3 3 M Plantatio… Areg… C. corind… 25.0 -80.6 NA 5.75
## 4 4 M Plantatio… Areg… C. corind… 25.0 -80.6 NA 6.21
## 5 5 F Plantatio… Areg… C. corind… 25.0 -80.6 209 7.47
## 6 6 M Plantatio… Foun… C. corind… 25.0 -80.6 NA 5.76
## 7 7 F Plantatio… Foun… C. corind… 25.0 -80.6 8 8.19
## 8 8 F Plantatio… Foun… C. corind… 25.0 -80.6 92 8.39
## 9 9 M Plantatio… Foun… C. corind… 25.0 -80.6 NA 6.1
## 10 10 M Plantatio… Foun… C. corind… 25.0 -80.6 NA 6.09
## # … with 323 more rows, and 55 more variables: thorax <dbl>, wing <dbl>,
## # body <dbl>, w_morph <fct>, morph_notes <fct>, tested <fct>, host_c <dbl>,
## # sex_c <dbl>, w_morph_c <dbl>, lat_c <dbl>, sym_dist <dbl>,
## # sym_dist_s <dbl>, total_eggs_c <dbl>, beak_c <dbl>, thorax_c <dbl>,
## # thorax_s <dbl>, body_c <dbl>, wing_c <dbl>, wing2body <dbl>,
## # wing2body_c <dbl>, wing2body_s <dbl>, trial_type <list>, chamber <list>,
## # average_speed <list>, total_flight_time <list>, distance <list>,
## # shortest_flying_bout <list>, longest_flying_bout <list>,
## # total_duration <list>, max_speed <list>, test_date <list>,
## # time_start <list>, time_end <list>, flew <list>, flight_type <list>,
## # mass <list>, EWM <list>, flew_b <list>, eggs_b <list>,
## # minute_duration <list>, minute_duration_c <list>, min_from_IncStart <dbl>,
## # min_from_IncStart_c <dbl>, min_from_IncStart_s <dbl>,
## # days_from_start <list>, days_from_start_c <list>, mass_c <list>,
## # mass_s <list>, average_mass <dbl>, average_mass_c <dbl>,
## # average_mass_s <dbl>, trial_type_og <list>, num_flew <dbl>,
## # num_notflew <dbl>, avg_days <dbl>
# average days from start; this deals with the problem of individuals who were only tested early, who will have a low value here, and with any chance individuals who happened to get in early or late in both trials.
d_all$avg_days_c <- d_all$avg_days - mean(d_all$avg_days, na.rm=TRUE)
## Log-square-root transformation; this transformation gets average_mass to act normal and not give haywire effect estimates
## AB: Log-square-root transformation - does neither work on their own?
d_all$average_mass_trans <- log(sqrt(d_all$average_mass)) - mean(log(sqrt(d_all$average_mass)), na.rm=TRUE)
## Log-square-root transformation; this transformation gets wing2body ratio to act normal and not give haywire effect estimates; note that this has an inverse relationship with wing2body ratio itself, so effect estimate directions need to be flipped for interpretation.
d_all$wing2body_trans <- log(sqrt(0.85-d_all$wing2body)) - mean(log(sqrt(0.85-d_all$wing2body)), na.rm=TRUE)
# wing2body - take out a constant
The long-square-root transformation is a transformation for long right-tails. It makes them more normal.
# Compare Number of Trials a Bug Flew in to Average Days
d_all$num_trials <- 0
d_all$num_trials <- rowSums(d_all[,c("num_flew", "num_notflew")])
# Scatter Plot
# AB: some bugs we couldn't find early on in the bins and were tested into trial 2 time. Some bugs were also accidentally tested twice in trial 1.
s <- ggplot(d_all, aes(x=avg_days, y=num_trials)) +
geom_point(size=2, shape=23)
# Histogram
d_all$num_trials <- as.factor(rowSums(d_all[,c("num_flew", "num_notflew")]))
h <- ggplot(d_all, aes(x=avg_days, color=num_trials)) +
geom_histogram(binwidth=1, fill="white")+ # removed argument position="dodge"
theme(legend.position="top") +
scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9"))
grid.arrange(s, h, nrow=1)
d <- center_data(d_all, is_not_binded = FALSE)
days_model<-glm(cbind(num_flew,num_notflew)~avg_days_c, data=d, family=binomial)
summary(days_model)
##
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ avg_days_c, family = binomial,
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8429 -1.7825 -0.1593 1.5162 1.5795
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22942 0.08236 2.785 0.00535 **
## avg_days_c 0.01087 0.02354 0.462 0.64430
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 668.05 on 332 degrees of freedom
## Residual deviance: 667.84 on 331 degrees of freedom
## AIC: 759.17
##
## Number of Fisher Scoring iterations: 3
While average days does not show the same strong effect as days_from_start, this should still control for the fact that some individuals were only tested early, and some by chance had two earlier or later days within a trial. This allows us to retain the structure with R1 and R2, which controls for ID, but still converges.
R1 = d$num_flew
R2 = d$num_notflew
A = d$host_c
B = d$sex_c
C = d$sym_dist_s
D = d$average_mass_trans
E = d$avg_days_c
data<-data.frame(R1, R2, A, B, C, D, E)
source("src/compare_models.R")
errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E.R"))
## [,1] [,2] [,3]
## AICs 683.3784 683.9498 684.4483
## models 85 63 50
## probs 0.08875001 0.06669651 0.05198035
##
## m85 glm(formula = cbind(R1, R2) ~ A * D + B * D + C * D + E, family = binomial,
## data = data)
## m63 glm(formula = cbind(R1, R2) ~ A * D + C * D + B + E, family = binomial,
## data = data)
## m50 glm(formula = cbind(R1, R2) ~ A * D + B * D + E, family = binomial,
## data = data)
#errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E + noE.R"))
length(errors$warnings)
## [1] 0
#generic 4-fixed factor and 1 fixed covariate models - but why? you don't even have a null model anymore.
anova(m63, m85, test="Chisq") # Adding B*D does not improve fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * D + C * D + B + E
## Model 2: cbind(R1, R2) ~ A * D + B * D + C * D + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 325 580.61
## 2 324 578.04 1 2.5713 0.1088
anova(m26, m36, test="Chisq") # Adding C does not improve fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * D + B + E
## Model 2: cbind(R1, R2) ~ A * D + B + C + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 327 585.29
## 2 326 585.11 1 0.17375 0.6768
anova(m12, m26, test="Chisq") # Adding A*D does improve fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A + B + D + E
## Model 2: cbind(R1, R2) ~ A * D + B + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 328 590.71
## 2 327 585.29 1 5.4275 0.01982 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m63, m36, test="Chisq") # Adding C*D does improve fit | m63 is the best fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * D + C * D + B + E
## Model 2: cbind(R1, R2) ~ A * D + B + C + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 325 580.61
## 2 326 585.11 -1 -4.4994 0.03391 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mass_model_all <- glm(cbind(num_flew, num_notflew) ~ host_c * average_mass_trans
+ sym_dist_s * average_mass_trans + sex + avg_days_c, data=d, family=binomial)
summary(mass_model_all) # matches with earlier top models but earlier top model was simplier (host*avg_mass + sex_c)
##
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ host_c * average_mass_trans +
## sym_dist_s * average_mass_trans + sex + avg_days_c, family = binomial,
## data = d)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.54690 -1.08568 -0.03934 1.17713 2.41019
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.42474 0.23003 -1.846 0.06483 .
## host_c -0.14192 0.13043 -1.088 0.27653
## average_mass_trans -1.04743 0.88200 -1.188 0.23501
## sym_dist_s -0.04106 0.12803 -0.321 0.74842
## sexM 0.92157 0.33595 2.743 0.00608 **
## avg_days_c 0.01138 0.02596 0.438 0.66113
## host_c:average_mass_trans 1.85579 0.59201 3.135 0.00172 **
## average_mass_trans:sym_dist_s -1.41377 0.68680 -2.058 0.03954 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 668.05 on 332 degrees of freedom
## Residual deviance: 580.61 on 325 degrees of freedom
## AIC: 683.95
##
## Number of Fisher Scoring iterations: 4
MLC: Because mass and morphology are so dimorphic between sexes, and sex itself has a strong direct effect, let’s go now to the split-by-sex results. It may be that in wing2body and mass, we have pinpointed the reasons that sexes are different; but they differ in so many ways, that is a strong inference to try and make.
AB: significant host_c:average_mass_trans relationship where if from GRT and heavy then fly more times. (It could be a category dominated by females). This was also noticed in the top model performed without transformations and avg_days_c.
data_fem <- d_all[d_all$sex=="F",]
data_fem <- center_data(data_fem, is_not_binded = FALSE)
R1 = data_fem$num_flew
R2 = data_fem$num_notflew
A = data_fem$host_c
B = data_fem$sym_dist
C = data_fem$average_mass_trans
D = data_fem$wing2body_trans
E = data_fem$avg_days_c
data<-data.frame(R1, R2, A, B, C, D, E)
source("src/compare_models.R")
errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E.R"))
## [,1] [,2] [,3]
## AICs 238.8713 239.0635 239.8444
## models 45 25 10
## probs 0.08178881 0.0742949 0.05027895
##
## m45 glm(formula = cbind(R1, R2) ~ A * C + A * D + E, family = binomial,
## data = data)
## m25 glm(formula = cbind(R1, R2) ~ A * C + D + E, family = binomial,
## data = data)
## m10 glm(formula = cbind(R1, R2) ~ C + D + E, family = binomial, data = data)
#errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E + noE.R"))
length(errors$warnings)
## [1] 0
anova(m25, m13, test='Chisq') #adding A*C improves fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * C + D + E
## Model 2: cbind(R1, R2) ~ A + C + D + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 114 202.11
## 2 115 206.87 -1 -4.764 0.02906 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m25, m17, test="Chisq") #adding D improves fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * C + D + E
## Model 2: cbind(R1, R2) ~ A * C + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 114 202.11
## 2 115 206.35 -1 -4.243 0.03941 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m10, m13, test="Chisq") #adding A alone does not improve fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ C + D + E
## Model 2: cbind(R1, R2) ~ A + C + D + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 116 206.89
## 2 115 206.87 1 0.016934 0.8965
anova(m10, m3, test="Chisq") #adding D improves fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ C + D + E
## Model 2: cbind(R1, R2) ~ C + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 116 206.89
## 2 117 212.11 -1 -5.2157 0.02238 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m10, m4, test="Chisq") #adding C improves fit
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ C + D + E
## Model 2: cbind(R1, R2) ~ D + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 116 206.89
## 2 117 217.12 -1 -10.234 0.001379 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m25, m45, test='Chisq') #adding A*D does not improve fit | top model is m25
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * C + D + E
## Model 2: cbind(R1, R2) ~ A * C + A * D + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 114 202.11
## 2 113 199.92 1 2.1922 0.1387
mass_model_fem <- glm(cbind(num_flew, num_notflew) ~ host_c * average_mass_trans + wing2body_trans + avg_days_c, data=data_fem, family=binomial) ### sym_dist does not matter for females; AB: this matches with earlier top models (host * avg_mass + wing)
summary(mass_model_fem)
##
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ host_c * average_mass_trans +
## wing2body_trans + avg_days_c, family = binomial, data = data_fem)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1849 -1.1189 -0.7523 1.1182 2.7357
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.15913 0.34121 -0.466 0.6409
## host_c -0.61037 0.33019 -1.849 0.0645 .
## average_mass_trans -2.08700 1.45468 -1.435 0.1514
## wing2body_trans -5.37017 2.66359 -2.016 0.0438 *
## avg_days_c 0.11558 0.04757 2.430 0.0151 *
## host_c:average_mass_trans 3.02237 1.39976 2.159 0.0308 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 223.66 on 119 degrees of freedom
## Residual deviance: 202.11 on 114 degrees of freedom
## AIC: 239.06
##
## Number of Fisher Scoring iterations: 4
# AB: conflation for females between days from start and eggs laid? Check total eggs vs. days from start (this is a rough sketch)
gf_point(total_eggs ~ days_from_start, data=data_tested) +
geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 420 rows containing non-finite values (stat_smooth).
## Warning: Removed 420 rows containing missing values (geom_point).
# not a strong enough relationship
summary(lm(total_eggs ~ days_from_start, data=data_tested)) # not significant
##
## Call:
## lm(formula = total_eggs ~ days_from_start, data = data_tested)
##
## Residuals:
## Min 1Q Median 3Q Max
## -113.69 -70.72 -30.56 48.58 253.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.698 13.254 6.843 1.01e-10 ***
## days_from_start 1.090 0.981 1.112 0.268
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94.35 on 192 degrees of freedom
## (420 observations deleted due to missingness)
## Multiple R-squared: 0.006394, Adjusted R-squared: 0.001219
## F-statistic: 1.236 on 1 and 192 DF, p-value: 0.2677
data_male <- d_all[d_all$sex=="M",]
data_male <- center_data(data_male, is_not_binded = FALSE)
R1 = data_male$num_flew
R2 = data_male$num_notflew
A = data_male$host_c
B = data_male$sym_dist_s
C = data_male$average_mass_trans
D = data_male$wing2body_trans
E = data_male$avg_days_c
data<-data.frame(R1, R2, A, B, C, D, E)
source("src/compare_models.R")
errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E.R"))
## [,1] [,2] [,3]
## AICs 427.3933 427.6501 428.1158
## models 105 50 83
## probs 0.08394083 0.07382556 0.05848926
##
## m105 glm(formula = cbind(R1, R2) ~ A * D + B * C + B * D + C * D +
## E, family = binomial, data = data)
## m50 glm(formula = cbind(R1, R2) ~ A * D + B * D + E, family = binomial,
## data = data)
## m83 glm(formula = cbind(R1, R2) ~ A * D + B * C + B * D + E, family = binomial,
## data = data)
#errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E + noE.R")) # AB: maybe better to use this for males since # does not matter?
length(errors$warnings)
## [1] 0
anova(m83, m105, test="Chisq") #marginal improvement from adding C*D
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * D + B * C + B * D + E
## Model 2: cbind(R1, R2) ~ A * D + B * C + B * D + C * D + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 204 347.73
## 2 203 345.01 1 2.7225 0.09894 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m50, m62, test="Chisq") #no improvement from adding C | m50 is the top model
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * D + B * D + E
## Model 2: cbind(R1, R2) ~ A * D + B * D + C + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 206 351.27
## 2 205 351.01 1 0.25486 0.6137
anova(m83, m62, test="Chisq") #marginal improvement from B*C
## Analysis of Deviance Table
##
## Model 1: cbind(R1, R2) ~ A * D + B * C + B * D + E
## Model 2: cbind(R1, R2) ~ A * D + B * D + C + E
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 204 347.73
## 2 205 351.01 -1 -3.2794 0.07015 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mass_model_male<-glm(cbind(num_flew, num_notflew)~host_c*wing2body_trans + sym_dist_s*wing2body_trans + avg_days_c, family=binomial, data=data_male) ## AB: host showing up matches top (simpler) model from original dataset (host + mass + days + wing but the nuance is that when used ID host was not significant and when use trial_type it was). Also strange to not see mass show up. It was significant in all the top models but not when it was the only predictor variable in a model.
summary(mass_model_male)
##
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ host_c * wing2body_trans +
## sym_dist_s * wing2body_trans + avg_days_c, family = binomial,
## data = data_male)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6331 -0.7526 0.8309 1.1667 2.0725
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.56182 0.14435 3.892 9.94e-05 ***
## host_c -0.38215 0.18894 -2.023 0.04312 *
## wing2body_trans -9.75687 3.01847 -3.232 0.00123 **
## sym_dist_s 0.13262 0.16364 0.810 0.41770
## avg_days_c -0.03316 0.03421 -0.969 0.33232
## host_c:wing2body_trans -9.45796 4.27415 -2.213 0.02691 *
## wing2body_trans:sym_dist_s 7.45469 3.57473 2.085 0.03703 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 372.15 on 212 degrees of freedom
## Residual deviance: 351.27 on 206 degrees of freedom
## AIC: 427.65
##
## Number of Fisher Scoring iterations: 4
MLC: I just made this a function so it’s easy to collapse. Probably won’t stay in the final summary script, but I found it helpful for looking at these interactions. In making the data_temp summaries, data=d can be swapped for data=d[d$sex==“M”,] (or F) to look at host effects within one sex only.
six_plots<-function(){
##quick plots for y/n flight
##plot-specific grouping variables
d$mass_block<-round(d$average_mass/0.005)*0.005
d$f_prob<-d$num_flew/(d$num_flew+d$num_notflew)
d$wing2body_block<-round(d$wing2body, digits=2)
d$days_block<-round(d$avg_days, digits=0)
par(mfrow=c(2,3), mai=c(0.6, 0.6, 0.02, 0.02))
##mass by sex
data_temp<-aggregate(f_prob~sex*mass_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~sex*mass_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$mass_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight probability", xlab="Mass")
##Indeed, we can see that the effect of mass is very pronounced in females but essentially absent in males.
##Really clear to see - great image.
## AB: However, based on the delta mass analyses, it might be more nuanced (but it's definitely weaker). Mass does effect males, it might just be in a smaller window because their mass doesn't change as much? Consider the male group point at 1 flight probability - it might be an outlier? If you print the data temp you see that there is only 1 male bug in that group:
print(data_temp) # I don't think that their masses play a huge role but it does follow the trend in the overall data. I guess one way to parse this out experimentally would be to see how virgin female mass impacts flight probability. Would would have a smaller mass window and maybe more dispersed data points.
##wing2body by sex
data_temp<-aggregate(f_prob~sex*wing2body_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~sex*wing2body_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$wing2body_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight probability", xlab="wing-to-body ratio")
##Here we can see that as wing2body ratio increases, flight probability increases, but that the effect is much more pronounced in males.
##average days from start by sex
data_temp<-aggregate(f_prob~sex*days_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~sex*days_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$days_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight probability", xlab="Days from start")
##Here we can see that as the average days from start increases, flight probability increases, but that the effect is really only visible in females.
##This does not mean days from start didn't impact males - only that our experimental design successfully stopped
##that from being confounding (eg, males may have been less likely to die, or had less biased mortality by host).
## AB: Or does this mean this is a pool size issue? Also, I might argue that for both males and females they are similarly weak? Not entirely visible for females either.
##mass by host
data_temp<-aggregate(f_prob~host_c*mass_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~host_c*mass_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$mass_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight probability", xlab="Mass")
##Here, we can see that the effect of mass is clear on GRT (red) but weak on BV (blue)
print(data_temp)
##wing2body by host
data_temp<-aggregate(f_prob~host_c*wing2body_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~host_c*wing2body_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$wing2body_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], , ylab="Flight probability", xlab="wing-to-body ratio")
##Here, we can see that the positive effect of wing2body ratio is clear on GRT (red) and on BV (blue)
##average days from start by host
data_temp<-aggregate(f_prob~host_c*days_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~host_c*days_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$days_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], , ylab="Flight probability", xlab="Days from start")
##Here we can see no clear impact of days from start when separated by host
}
six_plots()
## sex mass_block f_prob n
## 1 M 0.025 0.3333333 6
## 2 M 0.030 0.6617647 34
## 3 M 0.035 0.6810345 58
## 4 F 0.040 1.0000000 1
## 5 M 0.040 0.7711864 59
## 6 F 0.045 0.2500000 6
## 7 M 0.045 0.6285714 35
## 8 F 0.050 0.2272727 11
## 9 M 0.050 0.5769231 13
## 10 F 0.055 0.3333333 6
## 11 M 0.055 0.5000000 7
## 12 F 0.060 0.5500000 10
## 13 M 0.060 1.0000000 1
## 14 F 0.065 0.2083333 12
## 15 F 0.070 0.5000000 8
## 16 F 0.075 0.3888889 9
## 17 F 0.080 0.2083333 12
## 18 F 0.085 0.3500000 10
## 19 F 0.090 0.3571429 7
## 20 F 0.095 0.2500000 4
## 21 F 0.100 0.1428571 7
## 22 F 0.105 0.3333333 6
## 23 F 0.110 0.3333333 3
## 24 F 0.115 0.0000000 2
## 25 F 0.120 0.0000000 2
## 26 F 0.125 0.0000000 1
## 27 F 0.135 0.0000000 2
## 28 F 0.180 0.0000000 1
## host_c mass_block f_prob n
## 1 -1 0.025 0.5000000 2
## 2 1 0.025 0.2500000 4
## 3 -1 0.030 0.7352941 17
## 4 1 0.030 0.5882353 17
## 5 -1 0.035 0.6627907 43
## 6 1 0.035 0.7333333 15
## 7 -1 0.040 0.8000000 45
## 8 1 0.040 0.7000000 15
## 9 -1 0.045 0.6000000 35
## 10 1 0.045 0.4166667 6
## 11 -1 0.050 0.5555556 18
## 12 1 0.050 0.0000000 6
## 13 -1 0.055 0.5000000 11
## 14 1 0.055 0.0000000 2
## 15 -1 0.060 0.6875000 8
## 16 1 0.060 0.3333333 3
## 17 -1 0.065 0.1428571 7
## 18 1 0.065 0.3000000 5
## 19 -1 0.070 0.5000000 4
## 20 1 0.070 0.5000000 4
## 21 -1 0.075 0.2500000 6
## 22 1 0.075 0.6666667 3
## 23 -1 0.080 0.2500000 8
## 24 1 0.080 0.1250000 4
## 25 -1 0.085 0.3125000 8
## 26 1 0.085 0.5000000 2
## 27 -1 0.090 0.3000000 5
## 28 1 0.090 0.5000000 2
## 29 -1 0.095 0.2500000 4
## 30 -1 0.100 0.1666667 6
## 31 1 0.100 0.0000000 1
## 32 -1 0.105 0.2000000 5
## 33 1 0.105 1.0000000 1
## 34 -1 0.110 0.3333333 3
## 35 -1 0.115 0.0000000 2
## 36 -1 0.120 0.0000000 1
## 37 1 0.120 0.0000000 1
## 38 -1 0.125 0.0000000 1
## 39 -1 0.135 0.0000000 2
## 40 -1 0.180 0.0000000 1
AB: Nuances in mass, host, sex, and flight probability goes back to those bivariate plots I made. You can more subtly see how each responds as they change. I know you wnat to see things more continuous, but I’m going to plot all points because it doesn’t fall into those uneven group pitfalls/limitations.
gf_point(num_flew ~ average_mass, data=d) +
geom_smooth(method='lm') +
ggtitle("All Data: Number Times Flew vs. Average Mass (g)") # as average mass increases, the number of times flew decreases
## `geom_smooth()` using formula 'y ~ x'
Then break it between host and average mass:
p1 <- gf_point(num_flew ~ average_mass, col=~host_plant, data=d[d$average_mass_c < 0,], title="(a) avgmass_c <0") +
geom_smooth(method='lm')
p2 <- gf_point(num_flew ~ average_mass, col=~host_plant, data=d[d$average_mass_c > 0,], title="(b) avgmass_c >0") +
geom_smooth(method='lm')
grid.arrange(p1,p2, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Consistent with the model and graphs: (a) If GRT and had an average mass below the mean of the population, then you flew less times. (b) But if were GRT and had an average mass above the mean of the population, then you flew more times.
p3 <- gf_point(num_notflew ~ average_mass, col=~host_plant, data=d[d$average_mass_c < 0,], title="(a) avgmass_c <0") +
geom_smooth(method='lm')
p4 <- gf_point(num_notflew ~ average_mass, col=~host_plant, data=d[d$average_mass_c > 0,], title="(b) avgmass_c >0") +
geom_smooth(method='lm')
grid.arrange(p3,p4, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
p5 <- gf_point(num_flew ~ average_mass, col=~sex, data=d[d$average_mass_c < 0,], title="(a) avgmass_c <0") +
geom_smooth(method='lm')
p6 <- gf_point(num_flew ~ average_mass, col=~sex, data=d[d$average_mass_c > 0,], title="(b) avgmass_c >0") +
geom_smooth(method='lm')
grid.arrange(p5,p6, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
p7 <- gf_point(num_notflew ~ average_mass, col=~sex, data=d[d$average_mass_c < 0,], title="(a) avgmass_c <0") +
geom_smooth(method='lm')
p8 <- gf_point(num_notflew ~ average_mass, col=~sex, data=d[d$average_mass_c > 0,], title="(b) avgmass_c >0") +
geom_smooth(method='lm')
grid.arrange(p7,p8, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Most males’ average mass is below the average mass of the population
gf_point(num_flew ~ average_mass, col=~sex, data=d) +
geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
Note: I did not take out bugs that only flew once. That is important in order to do delta analyses and important to consider on certain limitations in these graphs (e.g. bugs would never reach 2 num_flew or 2 num_notflew).
Note: Have ran models using mass transformed and wing2body transformed with the original dataset and have found similar convergence issues. This method with the condensed dataset is better.
Coefficients between the interactions host * average_mass_trans and sym_dist * average_mass_trans for the Males top model are conflicting. Problematic?
Coefficients between the interactions host * wing2body and sym_dist * wing2body for the Males top model are conflicting. Problematic?
noE vs. E source script? Why use the + E source script?
rm(list=ls())
output_col = FALSE
source("src/clean_flight_data.R")
source("src/regression_output.R")
source("src/center_flight_data.R")
source("src/get_warnings.R")
data <- read_flight_data("data/all_flight_data-Winter2020.csv")
data_all <- data[[1]]
data_tested <- data[[2]]
### Remove everyone who didn't fly (then remove distances = 0, if that didn't work fully)
data_flew_all <- data_tested[data_tested$flew_b == 1, ]
### Check for low speeds
low_speeds <- data_flew_all %>%
filter(average_speed < 0.05)
### Check for high speeds
high_speeds <- data_flew_all %>%
filter(average_speed > 0.65)
low_speeds$flight_type# have 7 bugs with average_speed = 0 but were marked as bursters (this could be just something very short (second burst) - not enough to grant a calculation) - I decided to remove them. But one bug was continuous and had 0 distance and 0 speeds - that was bug 196 T2 set011-3-03-2020-A3_196.txt
## [1] B B B B B B B B B C B B B
## Levels: B BC C CB
high_speeds$flight_type # 3 bugs - also bursters. Could also be short explosive bursts but not true to the biology of these bugs (more like us blowing on them).
## [1] B B B
## Levels: B BC C CB
### Remove outliers
data_flew <- data_flew_all %>%
filter(average_speed > 0.05) %>%
filter(average_speed < 0.65)
data_flew <- center_data(data_flew)
## transform mass & speed
data_flew$mass_trans<-log(data_flew$mass)-mean(log(data_flew$mass), na.rm=TRUE)
data_flew$speed_trans<-log(data_flew$average_speed)-mean(log(data_flew$average_speed), na.rm=TRUE)
#######do flight types differ?
data_flew$flight_type <- relevel(data_flew$flight_type, ref="B")
tidy_regression(lmer(speed_trans~flight_type + (1|chamber) + (1|ID), data=data_flew), is_color=output_col)
## lmer speed_trans ~ flight_type + (1 | chamber) + (1 | ID) data_flew
## AIC: 150.0831 134.0831
## (Intercept) coeff: 0.0818461 tval: 1.691962
## flight_type coeff: 0.1180218 tval: 0.7058626
## flight_typeBC coeff: -0.1442107 tval: -2.600565
## flight_typeC coeff: -0.1245013 tval: -3.594283
## flight_typeCB coeff: -0.2407188 tval: -2.302263
# yes, B and C differ distinctly in average speed; BC and CB are not different from C, so let's keep those in the continuous flight analyses.
####### Effect of chamber B-2 and B-4
data_flew$chamber <- relevel(data_flew$chamber, ref="A-4")
tidy_regression(lmer(speed_trans~chamber + (1|ID), data=data_flew), is_color=output_col)###Possibly reductions in speed
## lmer speed_trans ~ chamber + (1 | ID) data_flew
## AIC: 166.5634 146.5634
## (Intercept) coeff: 0.0465751 tval: 1.192484
## chamberA-1 coeff: 0.1317075 tval: 1.961805
## chamberA-2 coeff: 0.0714424 tval: 1.18636
## chamberA-3 coeff: -0.0614469 tval: -1.064087
## chamberB-1 coeff: 0.0383557 tval: 0.594997
## chamberB-2 coeff: -0.1540959 tval: -2.723254
## chamberB-3 coeff: -0.0745587 tval: -1.207074
## chamberB-4 coeff: -0.258447 tval: -4.257179
####### No effect of test date
tidy_regression(lmer(speed_trans~days_from_start + (1|chamber), data=data_flew), is_color=output_col)
## lmer speed_trans ~ days_from_start + (1 | chamber) data_flew
## AIC: 157.7216 149.7216
## (Intercept) coeff: -0.0131111 tval: -0.2631498
## days_from_start coeff: 0.0018488 tval: 0.7862338
####### No effect of test time
tidy_regression(lmer(average_speed~min_from_IncStart + (1|chamber), data=data_flew), is_color=output_col)
## lmer average_speed ~ min_from_IncStart + (1 | chamber) data_flew
## AIC: -579.7171 -587.7171
## (Intercept) coeff: 0.326614 tval: 21.39577
## min_from_IncStart coeff: -4.02e-05 tval: -1.098817
MLC: bursters are likely to be much less reliable than continuous flyers; so, let’s exclude them.
C, BC, and CB data for speed
data_flew <- data_flew %>%
filter(!is.na(mass))
data_flew <- center_data(data_flew)
dC<-data_flew[data_flew$flight_type=="C" | data_flew$flight_type=="BC" | data_flew$flight_type=="CB" ,]
dC <- dC %>%
filter(!is.na(body))
dC <- center_data(dC)
data<-data.frame(R=dC$speed_trans,
A=dC$host_c,
B=dC$sex_c,
C=dC$mass_trans,
D=dC$sym_dist_s, #I note it does not matter whether this is sym_dist or wing2body
X=dC$chamber,
Y=dC$ID)
source("src/compare_models.R")
model_comparisonsAIC("src/generic models-gaussian glmer 2-RF + 4-FF REMLF.R")
## [,1] [,2] [,3] [,4]
## AICs 50.57653 52.31405 52.57508 52.87968
## models 19 28 29 3
## probs 0.1990675 0.08350328 0.07328603 0.06293305
##
## m19 R ~ B * C + (1 | X) + (1 | Y)
## m28 R ~ B * C + A + (1 | X) + (1 | Y)
## m29 R ~ B * C + D + (1 | X) + (1 | Y)
## m3 R ~ C + (1 | X) + (1 | Y)
anova(m19, m28, test="Chisq") #adding A does not improve fit
## Data: data
## Models:
## m19: R ~ B * C + (1 | X) + (1 | Y)
## m28: R ~ B * C + A + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m19 7 50.577 73.005 -18.288 36.577
## m28 8 52.314 77.946 -18.157 36.314 0.2625 1 0.6084
anova(m19, m29, test="Chisq") #adding D does not improve fit
## Data: data
## Models:
## m19: R ~ B * C + (1 | X) + (1 | Y)
## m29: R ~ B * C + D + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m19 7 50.577 73.005 -18.288 36.577
## m29 8 52.575 78.207 -18.288 36.575 0.0015 1 0.9696
anova(m19, m8, test="Chisq") #adding B*C interaction does improve fit | m19 best fit
## Data: data
## Models:
## m8: R ~ B + C + (1 | X) + (1 | Y)
## m19: R ~ B * C + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m8 6 54.839 74.063 -21.420 42.839
## m19 7 50.577 73.005 -18.288 36.577 6.2629 1 0.01233 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
continuous_model<-lmer(speed_trans~sex*mass_trans + (1|ID) + (1|chamber), data=dC)
summary(continuous_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: speed_trans ~ sex * mass_trans + (1 | ID) + (1 | chamber)
## Data: dC
##
## REML criterion at convergence: 48.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.22163 -0.52173 -0.04694 0.53790 2.49311
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 0.01828 0.1352
## chamber (Intercept) 0.02271 0.1507
## Residual 0.04963 0.2228
## Number of obs: 182, groups: ID, 135; chamber, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.11733 0.10291 1.140
## sexM -0.14761 0.09177 -1.608
## mass_trans -0.18444 0.19744 -0.934
## sexM:mass_trans 0.60933 0.24341 2.503
##
## Correlation of Fixed Effects:
## (Intr) sexM mss_tr
## sexM -0.815
## mass_trans -0.732 0.814
## sxM:mss_trn 0.604 -0.574 -0.822
# QQ plot of residuals
s.test <- paste("pval: ", shapiro.test(residuals(continuous_model))$p.value)
qqnorm(resid(continuous_model))
qqline(resid(continuous_model))
text(-2, 0.1, s.test) # normal post-transform
# Diagnostic plot
dC['fitted_values'] = fitted.values(continuous_model)
dC['fitted_residuals'] = residuals(continuous_model)
ggplot(data = dC ) + geom_point(aes(x = fitted_values, y = fitted_residuals)) +
geom_hline(yintercept = 0) +
labs(title = "Plot of Fitted Residuals against Fitted Values",
x = "Fitted Values", y = "Fitted Residuals")
# Plot of the leverage points
halfnorm(hatvalues(continuous_model))
hatvalues(continuous_model)[138]
## 138
## 0.3962033
hatvalues(continuous_model)[1]
## 1
## 0.3892357
speed_summary<-aggregate(average_speed~sex, data=dC, FUN=mean)
speed_summary$se<-aggregate(average_speed~sex, data=dC, FUN=function(x)
sd(x)/sqrt(length(x)))$average_speed
MLC: There is not enough replication to break this down by sex, so we’ll leave it as it is. This could be generalized, although it’s probably not worth the time?
four_plots<-function(){
##quick plots for speed
##plot-specific grouping variables
data_flew$mass_block<-round(data_flew$mass/0.005)*0.005 # AB (self-note): rounded up so the range, or block, begins 0.05 g less than the recorded mass_block value in data_tmep
data_flew$wing2body_block<-round(data_flew$wing2body, digits=2)
par(mfrow=c(2,2), mai=c(0.8, 0.8, 0.02, 0.02))
##mass by sex
data_temp<-aggregate(average_speed~sex*mass_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~sex*mass_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$mass_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight speed", xlab="Mass")
##Unlike for flight probability, here we see a clear effect of mass on male flight speed, but not female flight speed.
##wing2body by sex
data_temp<-aggregate(average_speed~sex*wing2body_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~sex*wing2body_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$wing2body_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight speed", xlab="wing-to-body ratio")
##Here we can see that as wing2body ratio increases, flight speed increases marginally in males, but not noticeably in females
#mass by host
data_temp<-aggregate(average_speed~host_c*mass_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~host_c*mass_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$mass_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight speed", xlab="Mass")
##Here we can see a weak positive effect of mass on flight speed, that does not appear to differ between hosts
##wing2body by host
data_temp<-aggregate(average_speed~host_c*wing2body_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~host_c*wing2body_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$wing2body_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight speed", xlab="wing-to-body ratio")
##Here we can see a potential weak positive effect of wing2body ratio on flight speed that does not differ between hosts.
}
four_plots()
AB: 2 females at 0.41 m/s average speed at mass 0.35 to 0.40 g and 1 male at 0.42 m/s average speed at mass 0.60 to 0.65 g
# pwc = "Pairwise Wilcoxon test" between groups
pwc <- dC %>%
dunn_test(average_speed ~ sex, p.adjust.method = "bonferroni") %>%
add_xy_position(x = "sex")
res.kruskal <- dC %>% kruskal_test(average_speed ~ sex)
ggplot(dC, aes(sex, average_speed)) +
geom_violin() +
labs(
subtitle = get_test_label(res.kruskal, detailed = TRUE),
caption = get_pwc_label(pwc)
) +
theme(legend.position="none") +
xlab("sex") +
ylab("average speed (m/s)") +
geom_boxplot(width=0.1) +
stat_pvalue_manual(pwc, hide.ns = TRUE)
Close to being significantly different between the sexes - mostly marginal
# check out the outliers
select(dC[dC$sex == "M" & dC$average_speed > 0.5, ], average_speed, distance, chamber) # really fast moving male bugs
## average_speed distance chamber
## 81 0.5400811 12317.8215 B-2
## 118 0.5205269 1662.4818 A-1
## 161 0.5170697 749.5619 A-3
dC_ro <- dC[dC$average_speed < 0.51, ]
pwc <- dC_ro %>%
dunn_test(average_speed ~ sex, p.adjust.method = "bonferroni") %>%
add_xy_position(x = "sex")
res.kruskal <- dC_ro %>% kruskal_test(average_speed ~ sex)
pwc
## # A tibble: 1 x 13
## .y. group1 group2 n1 n2 statistic p p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 aver… F M 37 142 -2.01 0.0445 0.0445 *
## # … with 4 more variables: y.position <dbl>, groups <named list>, xmin <int>,
## # xmax <int>
ggplot(dC_ro, aes(sex, average_speed)) +
geom_violin() +
labs(
subtitle = get_test_label(res.kruskal, detailed = TRUE),
caption = get_pwc_label(pwc)
) +
theme(legend.position="none") +
xlab("sex") +
ylab("average speed (m/s)") +
geom_boxplot(width=0.1) +
stat_pvalue_manual(pwc, hide.ns = TRUE)
AB: relationship significant if you remove ‘outliers’ but I wouldn’t consider them outliers. Also important to remember the very different sex pool sizes: there are over 3.5 times as many males as females. Generally, across the literature/across species, speed doesn’t show much difference between sexes within a species but max_speed has been used as a parameter. This parameter is tricky because I would need to go back and make sure I remove any of those first fast outliers from us blowing on them. It’s possible and there are many flight parameters that we aren’t using that may show us more about how these insects perform (Jones 2015) really takes advantage of many flight parameters
Difference in pool sizes between sexes causes this hard-to-parse-out comparison?
What to do about potential “outliers”? I don’t see them as outliers, just some powerful male flyers.
Use max speed instead of average speed?
rm(list=ls())
output_col = FALSE
source("src/clean_flight_data.R")
source("src/regression_output.R")
source("src/center_flight_data.R")
source("src/get_warnings.R")
data <- read_flight_data("data/all_flight_data-Winter2020.csv")
data_all <- data[[1]]
data_tested <- data[[2]]
### Remove everyone who didn't fly (then remove distances = 0, if that didn't work fully)
data_flew <- data_tested[data_tested$flew_b == 1, ]
data_flew <- data_flew[data_flew$distance > 0, ]
data_flew <- center_data(data_flew) ##bursters are likely to be much less reliable than continuous flyers; so, let's exclude them.
data_flew <- data_flew %>%
filter(!is.na(mass))
data_flew <- center_data(data_flew)
#######do flight types differ?
data_flew$flight_type <- relevel(data_flew$flight_type, ref="B")
tidy_regression(lmer(distance~flight_type + (1|chamber) + (1|ID), data=data_flew), is_color=output_col) ###This is intensely influenced by flight_type - similar to speed, I think these are much more meaningful for continuous fliers.
## lmer distance ~ flight_type + (1 | chamber) + (1 | ID) data_flew
## AIC: 6351.763 6335.763
## (Intercept) coeff: 291.5131498 tval: 0.7138526
## flight_type coeff: 1023.3143061 tval: 0.5211221
## flight_typeBC coeff: 3665.0176043 tval: 5.704257
## flight_typeC coeff: 4662.0917622 tval: 11.56366
## flight_typeCB coeff: -14.0062226 tval: -0.01085566
C, BC, and CB data for distance
dC<-data_flew[data_flew$flight_type=="C" | data_flew$flight_type=="BC" | data_flew$flight_type=="CB" ,]
dC <- dC %>%
filter(!is.na(body))
dC <- center_data(dC)
##transform mass
dC$mass_trans<-log(dC$mass)-mean(log(dC$mass), na.rm=TRUE)
##transform distance
dC$distance_trans<-log(sqrt(100+dC$distance))-mean(log(sqrt(100+dC$distance)), na.rm=TRUE)
####### Effect of chamber B-1 and A-1
dC$chamber <- relevel(dC$chamber, ref="B-4")
tidy_regression(lmer(distance_trans~chamber + (1|ID), data=dC), is_color=output_col) ###yes, this one's an issue
## lmer distance_trans ~ chamber + (1 | ID) dC
## AIC: 341.1044 321.1044
## (Intercept) coeff: -0.1252246 tval: -1.054125
## chamberA-1 coeff: 0.4215996 tval: 2.252771
## chamberA-2 coeff: 0.1892315 tval: 1.18247
## chamberA-3 coeff: 0.1783989 tval: 1.109788
## chamberA-4 coeff: -0.1622016 tval: -0.9709388
## chamberB-1 coeff: 0.3942594 tval: 2.391899
## chamberB-2 coeff: -0.0820173 tval: -0.4996263
## chamberB-3 coeff: 0.1264823 tval: 0.7068737
####### No effect of test date
tidy_regression(lmer(distance_trans~days_from_start + (1|chamber), data=dC), is_color=output_col)
## lmer distance_trans ~ days_from_start + (1 | chamber) dC
## AIC: 338.8225 330.8225
## (Intercept) coeff: 0.0338355 tval: 0.345132
## days_from_start coeff: -0.0025523 tval: -0.4287195
####### no effect of start time
tidy_regression(lmer(distance_trans~min_from_IncStart + (1|chamber), data=dC), is_color=output_col)
## lmer distance_trans ~ min_from_IncStart + (1 | chamber) dC
## AIC: 344.3012 336.3012
## (Intercept) coeff: 0.0441264 tval: 0.4925515
## min_from_IncStart coeff: -0.0002343 tval: -0.7209203
### model transformed distance
data<-data.frame(R=dC$distance_trans,
A=dC$host_c,
B=dC$sex_c,
C=dC$mass_trans,
D=dC$wing2body, #Sym_dist showed up in nothing, reran with wing2body
X=dC$chamber,
Y=dC$ID)
source("src/compare_models.R")
model_comparisonsAIC("src/generic models-gaussian glmer 2-RF + 4-FF REMLF.R")
## [,1] [,2] [,3] [,4]
## AICs 323.0466 324.1639 324.6107 324.6699
## models 3 2 10 6
## probs 0.1221808 0.06988459 0.05589216 0.05426259
##
## m3 R ~ C + (1 | X) + (1 | Y)
## m2 R ~ B + (1 | X) + (1 | Y)
## m10 R ~ C + D + (1 | X) + (1 | Y)
## m6 R ~ A + C + (1 | X) + (1 | Y)
anova(m2, m9, test="Chisq") #adding D does not improve things
## Data: data
## Models:
## m2: R ~ B + (1 | X) + (1 | Y)
## m9: R ~ B + D + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m2 5 324.16 340.18 -157.08 314.16
## m9 6 325.07 344.29 -156.53 313.07 1.0968 1 0.295
anova(m3, m8, test="Chisq") #adding B does not improve things
## Data: data
## Models:
## m3: R ~ C + (1 | X) + (1 | Y)
## m8: R ~ B + C + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m3 5 323.05 339.07 -156.52 313.05
## m8 6 324.88 344.10 -156.44 312.88 0.1708 1 0.6794
anova(m2, m8, test="Chisq") #adding C does not improve things
## Data: data
## Models:
## m2: R ~ B + (1 | X) + (1 | Y)
## m8: R ~ B + C + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m2 5 324.16 340.18 -157.08 314.16
## m8 6 324.88 344.10 -156.44 312.88 1.2881 1 0.2564
anova(m3, m10, test="Chisq") # Adding D does not improve fit
## Data: data
## Models:
## m3: R ~ C + (1 | X) + (1 | Y)
## m10: R ~ C + D + (1 | X) + (1 | Y)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## m3 5 323.05 339.07 -156.52 313.05
## m10 6 324.61 343.83 -156.31 312.61 0.4358 1 0.5091
MLC: Once again, unclear if it’s mass or sex driving the increase in distance traveled. Unfortunately, we still don’t have the replication to split this by sex (at least, not for females). So, we just have to accept the uncertainty here - but we can use the delta stats to help dig into this for mass!
distance_model_all<-lmer(distance_trans~mass_trans + (1|chamber) + (1|ID), data=dC)
summary(distance_model_all)
## Linear mixed model fit by REML ['lmerMod']
## Formula: distance_trans ~ mass_trans + (1 | chamber) + (1 | ID)
## Data: dC
##
## REML criterion at convergence: 318.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.57080 -0.66165 0.00832 0.79299 1.75278
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 0.04723 0.2173
## chamber (Intercept) 0.02809 0.1676
## Residual 0.26978 0.5194
## Number of obs: 182, groups: ID, 135; chamber, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.005233 0.073614 0.071
## mass_trans 0.350918 0.162555 2.159
##
## Correlation of Fixed Effects:
## (Intr)
## mass_trans 0.006
alt_distance_model<-lmer(distance_trans~sex + (1|chamber) + (1|ID), data=dC)
summary(alt_distance_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: distance_trans ~ sex + (1 | chamber) + (1 | ID)
## Data: dC
##
## REML criterion at convergence: 320.2
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.41991 -0.68360 -0.00498 0.79067 1.78744
##
## Random effects:
## Groups Name Variance Std.Dev.
## ID (Intercept) 0.05958 0.2441
## chamber (Intercept) 0.02868 0.1694
## Residual 0.26030 0.5102
## Number of obs: 182, groups: ID, 135; chamber, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.1653 0.1144 1.445
## sexM -0.2028 0.1091 -1.860
##
## Correlation of Fixed Effects:
## (Intr)
## sexM -0.759
# QQ plot of residuals
s.test <- paste("pval: ", shapiro.test(residuals(distance_model_all))$p.value)
qqnorm(resid(distance_model_all))
qqline(resid(distance_model_all))
text(-2, 0.1, s.test) # normal post-transform
# Diagnostic plot
dC['fitted_values'] = fitted.values(distance_model_all)
dC['fitted_residuals'] = residuals(distance_model_all)
ggplot(data = dC ) + geom_point(aes(x = fitted_values, y = fitted_residuals)) +
geom_hline(yintercept = 0) +
labs(title = "Plot of Fitted Residuals against Fitted Values",
x = "Fitted Values", y = "Fitted Residuals")
# AB: Diagnostic plot not looking so good, there's a striking linear relationship. It might not be best to actually let it pass
# Plot of the leverage points
halfnorm(hatvalues(distance_model_all))
hatvalues(distance_model_all)[138]
## 138
## 0.225122
hatvalues(distance_model_all)[127]
## 127
## 0.2052411
# QQ plot of residuals
s.test <- paste("pval: ", shapiro.test(residuals(alt_distance_model))$p.value)
qqnorm(resid(alt_distance_model))
qqline(resid(alt_distance_model))
text(-2, 0.1, s.test) # normal post-transform
# Diagnostic plot
dC['fitted_values'] = fitted.values(alt_distance_model)
dC['fitted_residuals'] = residuals(alt_distance_model)
ggplot(data = dC ) + geom_point(aes(x = fitted_values, y = fitted_residuals)) +
geom_hline(yintercept = 0) +
labs(title = "Plot of Fitted Residuals against Fitted Values",
x = "Fitted Values", y = "Fitted Residuals")
# AB: this plot isn't looking any better
# Plot of the leverage points
halfnorm(hatvalues(alt_distance_model))
hatvalues(alt_distance_model)[167]
## 167
## 0.2353251
hatvalues(alt_distance_model)[168]
## 168
## 0.2353251
four_plots<-function(){
##quick plots for distance
##plot-specific grouping variables
dC$mass_block<-round(dC$mass/0.005)*0.005
dC$wing2body_block<-round(dC$wing2body, digits=2)
par(mfrow=c(2,2), mai=c(0.8, 0.8, 0.02, 0.02))
##mass by sex
data_temp<-aggregate(distance~sex*mass_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~sex*mass_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$mass_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight distance", xlab="Mass")
##Like flight speed, here we again see what looks like a positive effect of mass for males, but not for females
##wing2body by sex
data_temp<-aggregate(distance~sex*wing2body_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~sex*wing2body_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$wing2body_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight distance", xlab="wing-to-body ratio")
##There's not much going on here.
#mass by host
data_temp<-aggregate(distance~host_c*mass_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~host_c*mass_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$mass_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight distance", xlab="Mass")
##Not much here either
##wing2body by host
data_temp<-aggregate(distance~host_c*wing2body_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~host_c*wing2body_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$wing2body_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight distance", xlab="wing-to-body ratio")
##Not much here either
}
four_plots()
MLC pointed out that distance is not normal even after transforming it, but I’m really not sure if it’s ok to let it pass. Seems like it’s better not. We might have to use nonparametric tests like the Kruskal-Wallis.