Winter 2020 Flight Trials Modeling

Y/N Flight Reponse

The Data

Source Scripts

output_col = FALSE # Recommend changing this to TRUE if working in Base R or RStudio, and FALSE if generating an html
source("src/clean_flight_data.R") # Script that loads and cleans up the data
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
source("src/regression_output.R") # A script that cleans up regression outputs and prints in color or black and white
source("src/center_flight_data.R")
source("src/get_warnings.R")

Read and Clean the Data

data <- read_flight_data("data/all_flight_data-Winter2020.csv")
data_all <- data[[1]]
data_tested <- data[[2]]

d <- data_tested %>%
   group_by(ID, sex,population, site, host_plant, latitude, longitude, total_eggs, 
            beak, thorax, wing, body, w_morph, morph_notes, tested,
            host_c, sex_c, w_morph_c, lat_c, sym_dist, sym_dist_s, total_eggs_c, 
            beak_c, thorax_c, thorax_s, body_c, wing_c, wing2body, wing2body_c, wing2body_s) %>%
   summarise_all(funs(list(na.omit(.))))
## Warning: `funs()` is deprecated as of dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
d$num_flew <- 0
d$num_notflew <- 0
d$average_mass <- 0
d$avg_days <- 0

for(row in 1:length(d$flew_b)){
  
  n_flew <- sum(d$flew_b[[row]] == 1) # total number of times did fly among trails
  d$num_flew[[row]] <- n_flew 
  
  n_notflew <- sum(d$flew_b[[row]] == 0) # total number of times did not fly among trails
  d$num_notflew[[row]] <- n_notflew
  
  avg_mass <- mean(d$mass[[row]]) # average mass
  d$average_mass[[row]] <- avg_mass
  
  avg_days <- mean(d$days_from_start[[row]]) # average days
  d$avg_days[[row]] <- avg_days

}

d_all <- select(d, -filename, -channel_letter, -set_number) ##MC: changed the name here to avoid re-loading all data to generate male and female only centered datasets
d_all <- center_data(d_all, is_not_binded = FALSE) # AB: need this here or else average_mass_c and some other columns do not get calculated
d_all
## # A tibble: 333 x 64
## # Groups:   ID, sex, population, site, host_plant, latitude, longitude,
## #   total_eggs, beak, thorax, wing, body, w_morph, morph_notes, tested, host_c,
## #   sex_c, w_morph_c, lat_c, sym_dist, sym_dist_s, total_eggs_c, beak_c,
## #   thorax_c, thorax_s, body_c, wing_c, wing2body, wing2body_c [333]
##    ID    sex   population site  host_plant latitude longitude total_eggs  beak
##    <fct> <fct> <fct>      <fct> <fct>         <dbl>     <dbl>      <dbl> <dbl>
##  1 1     M     Plantatio… Areg… C. corind…     25.0     -80.6         NA  5.5 
##  2 2     M     Plantatio… Areg… C. corind…     25.0     -80.6         NA  5.64
##  3 3     M     Plantatio… Areg… C. corind…     25.0     -80.6         NA  5.75
##  4 4     M     Plantatio… Areg… C. corind…     25.0     -80.6         NA  6.21
##  5 5     F     Plantatio… Areg… C. corind…     25.0     -80.6        209  7.47
##  6 6     M     Plantatio… Foun… C. corind…     25.0     -80.6         NA  5.76
##  7 7     F     Plantatio… Foun… C. corind…     25.0     -80.6          8  8.19
##  8 8     F     Plantatio… Foun… C. corind…     25.0     -80.6         92  8.39
##  9 9     M     Plantatio… Foun… C. corind…     25.0     -80.6         NA  6.1 
## 10 10    M     Plantatio… Foun… C. corind…     25.0     -80.6         NA  6.09
## # … with 323 more rows, and 55 more variables: thorax <dbl>, wing <dbl>,
## #   body <dbl>, w_morph <fct>, morph_notes <fct>, tested <fct>, host_c <dbl>,
## #   sex_c <dbl>, w_morph_c <dbl>, lat_c <dbl>, sym_dist <dbl>,
## #   sym_dist_s <dbl>, total_eggs_c <dbl>, beak_c <dbl>, thorax_c <dbl>,
## #   thorax_s <dbl>, body_c <dbl>, wing_c <dbl>, wing2body <dbl>,
## #   wing2body_c <dbl>, wing2body_s <dbl>, trial_type <list>, chamber <list>,
## #   average_speed <list>, total_flight_time <list>, distance <list>,
## #   shortest_flying_bout <list>, longest_flying_bout <list>,
## #   total_duration <list>, max_speed <list>, test_date <list>,
## #   time_start <list>, time_end <list>, flew <list>, flight_type <list>,
## #   mass <list>, EWM <list>, flew_b <list>, eggs_b <list>,
## #   minute_duration <list>, minute_duration_c <list>, min_from_IncStart <dbl>,
## #   min_from_IncStart_c <dbl>, min_from_IncStart_s <dbl>,
## #   days_from_start <list>, days_from_start_c <list>, mass_c <list>,
## #   mass_s <list>, average_mass <dbl>, average_mass_c <dbl>,
## #   average_mass_s <dbl>, trial_type_og <list>, num_flew <dbl>,
## #   num_notflew <dbl>, avg_days <dbl>

Three Additional Variables

# average days from start; this deals with the problem of individuals who were only tested early, who will have a low value here, and with any chance individuals who happened to get in early or late in both trials.
d_all$avg_days_c <- d_all$avg_days - mean(d_all$avg_days, na.rm=TRUE)

## Log-square-root transformation; this transformation gets average_mass to act normal and not give haywire effect estimates
## AB: Log-square-root transformation - does neither work on their own?
d_all$average_mass_trans <- log(sqrt(d_all$average_mass)) - mean(log(sqrt(d_all$average_mass)), na.rm=TRUE)

## Log-square-root transformation; this transformation gets wing2body ratio to act normal and not give haywire effect estimates; note that this has an inverse relationship with wing2body ratio itself, so effect estimate directions need to be flipped for interpretation.
d_all$wing2body_trans <- log(sqrt(0.85-d_all$wing2body)) - mean(log(sqrt(0.85-d_all$wing2body)), na.rm=TRUE)

# wing2body - take out a constant

The long-square-root transformation is a transformation for long right-tails. It makes them more normal.

# Compare Number of Trials a Bug Flew in to Average Days
d_all$num_trials <- 0
d_all$num_trials <- rowSums(d_all[,c("num_flew", "num_notflew")])

# Scatter Plot
# AB: some bugs we couldn't find early on in the bins and were tested into trial 2 time. Some bugs were also accidentally tested twice in trial 1. 
s <- ggplot(d_all, aes(x=avg_days, y=num_trials))  +
  geom_point(size=2, shape=23)
  
# Histogram
d_all$num_trials <- as.factor(rowSums(d_all[,c("num_flew", "num_notflew")]))
h <- ggplot(d_all, aes(x=avg_days, color=num_trials)) +
  geom_histogram(binwidth=1, fill="white")+ # removed argument position="dodge"
  theme(legend.position="top") +
  scale_color_manual(values=c("#999999", "#E69F00", "#56B4E9")) +
  scale_fill_manual(values=c("#999999", "#E69F00", "#56B4E9"))  

grid.arrange(s, h, nrow=1)

d <- center_data(d_all, is_not_binded = FALSE)

All

days_model<-glm(cbind(num_flew,num_notflew)~avg_days_c, data=d, family=binomial)
summary(days_model)
## 
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ avg_days_c, family = binomial, 
##     data = d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8429  -1.7825  -0.1593   1.5162   1.5795  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  0.22942    0.08236   2.785  0.00535 **
## avg_days_c   0.01087    0.02354   0.462  0.64430   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 668.05  on 332  degrees of freedom
## Residual deviance: 667.84  on 331  degrees of freedom
## AIC: 759.17
## 
## Number of Fisher Scoring iterations: 3

While average days does not show the same strong effect as days_from_start, this should still control for the fact that some individuals were only tested early, and some by chance had two earlier or later days within a trial. This allows us to retain the structure with R1 and R2, which controls for ID, but still converges.

R1 = d$num_flew
R2 = d$num_notflew
A = d$host_c
B = d$sex_c
C = d$sym_dist_s
D = d$average_mass_trans
E = d$avg_days_c

data<-data.frame(R1, R2, A, B, C, D, E)

source("src/compare_models.R")
errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E.R"))
##        [,1]       [,2]       [,3]      
## AICs   683.3784   683.9498   684.4483  
## models 85         63         50        
## probs  0.08875001 0.06669651 0.05198035
## 
## m85  glm(formula = cbind(R1, R2) ~ A * D + B * D + C * D + E, family = binomial, 
##     data = data)
## m63  glm(formula = cbind(R1, R2) ~ A * D + C * D + B + E, family = binomial, 
##     data = data)
## m50  glm(formula = cbind(R1, R2) ~ A * D + B * D + E, family = binomial, 
##     data = data)
#errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E + noE.R"))
length(errors$warnings)
## [1] 0
#generic 4-fixed factor and 1 fixed covariate models - but why? you don't even have a null model anymore.
anova(m63, m85, test="Chisq") # Adding B*D does not improve fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * D + C * D + B + E
## Model 2: cbind(R1, R2) ~ A * D + B * D + C * D + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       325     580.61                     
## 2       324     578.04  1   2.5713   0.1088
anova(m26, m36, test="Chisq") # Adding C does not improve fit 
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * D + B + E
## Model 2: cbind(R1, R2) ~ A * D + B + C + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       327     585.29                     
## 2       326     585.11  1  0.17375   0.6768
anova(m12, m26, test="Chisq") # Adding A*D does improve fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A + B + D + E
## Model 2: cbind(R1, R2) ~ A * D + B + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       328     590.71                       
## 2       327     585.29  1   5.4275  0.01982 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m63, m36, test="Chisq") # Adding C*D does improve fit | m63 is the best fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * D + C * D + B + E
## Model 2: cbind(R1, R2) ~ A * D + B + C + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       325     580.61                       
## 2       326     585.11 -1  -4.4994  0.03391 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mass_model_all <- glm(cbind(num_flew, num_notflew) ~ host_c * average_mass_trans 
                      + sym_dist_s * average_mass_trans + sex + avg_days_c, data=d, family=binomial)

summary(mass_model_all) # matches with earlier top models but earlier top model was simplier (host*avg_mass + sex_c)
## 
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ host_c * average_mass_trans + 
##     sym_dist_s * average_mass_trans + sex + avg_days_c, family = binomial, 
##     data = d)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.54690  -1.08568  -0.03934   1.17713   2.41019  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                   -0.42474    0.23003  -1.846  0.06483 . 
## host_c                        -0.14192    0.13043  -1.088  0.27653   
## average_mass_trans            -1.04743    0.88200  -1.188  0.23501   
## sym_dist_s                    -0.04106    0.12803  -0.321  0.74842   
## sexM                           0.92157    0.33595   2.743  0.00608 **
## avg_days_c                     0.01138    0.02596   0.438  0.66113   
## host_c:average_mass_trans      1.85579    0.59201   3.135  0.00172 **
## average_mass_trans:sym_dist_s -1.41377    0.68680  -2.058  0.03954 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 668.05  on 332  degrees of freedom
## Residual deviance: 580.61  on 325  degrees of freedom
## AIC: 683.95
## 
## Number of Fisher Scoring iterations: 4

MLC: Because mass and morphology are so dimorphic between sexes, and sex itself has a strong direct effect, let’s go now to the split-by-sex results. It may be that in wing2body and mass, we have pinpointed the reasons that sexes are different; but they differ in so many ways, that is a strong inference to try and make.

AB: significant host_c:average_mass_trans relationship where if from GRT and heavy then fly more times. (It could be a category dominated by females). This was also noticed in the top model performed without transformations and avg_days_c.

Females

data_fem <- d_all[d_all$sex=="F",]
data_fem <- center_data(data_fem, is_not_binded = FALSE)
R1 = data_fem$num_flew
R2 = data_fem$num_notflew
A = data_fem$host_c
B = data_fem$sym_dist
C = data_fem$average_mass_trans
D = data_fem$wing2body_trans
E = data_fem$avg_days_c

data<-data.frame(R1, R2, A, B, C, D, E)

source("src/compare_models.R")
errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E.R"))
##        [,1]       [,2]      [,3]      
## AICs   238.8713   239.0635  239.8444  
## models 45         25        10        
## probs  0.08178881 0.0742949 0.05027895
## 
## m45  glm(formula = cbind(R1, R2) ~ A * C + A * D + E, family = binomial, 
##     data = data)
## m25  glm(formula = cbind(R1, R2) ~ A * C + D + E, family = binomial, 
##     data = data)
## m10  glm(formula = cbind(R1, R2) ~ C + D + E, family = binomial, data = data)
#errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E + noE.R"))
length(errors$warnings)
## [1] 0
anova(m25, m13, test='Chisq') #adding A*C improves fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * C + D + E
## Model 2: cbind(R1, R2) ~ A + C + D + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       114     202.11                       
## 2       115     206.87 -1   -4.764  0.02906 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m25, m17, test="Chisq") #adding D improves fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * C + D + E
## Model 2: cbind(R1, R2) ~ A * C + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       114     202.11                       
## 2       115     206.35 -1   -4.243  0.03941 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m10, m13, test="Chisq") #adding A alone does not improve fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ C + D + E
## Model 2: cbind(R1, R2) ~ A + C + D + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       116     206.89                     
## 2       115     206.87  1 0.016934   0.8965
anova(m10, m3, test="Chisq") #adding D improves fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ C + D + E
## Model 2: cbind(R1, R2) ~ C + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       116     206.89                       
## 2       117     212.11 -1  -5.2157  0.02238 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m10, m4, test="Chisq") #adding C improves fit
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ C + D + E
## Model 2: cbind(R1, R2) ~ D + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1       116     206.89                        
## 2       117     217.12 -1  -10.234 0.001379 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m25, m45, test='Chisq') #adding A*D does not improve fit | top model is m25
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * C + D + E
## Model 2: cbind(R1, R2) ~ A * C + A * D + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       114     202.11                     
## 2       113     199.92  1   2.1922   0.1387
mass_model_fem <- glm(cbind(num_flew, num_notflew) ~ host_c * average_mass_trans +  wing2body_trans + avg_days_c, data=data_fem, family=binomial) ### sym_dist does not matter for females; AB: this matches with earlier top models (host * avg_mass + wing) 

summary(mass_model_fem)
## 
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ host_c * average_mass_trans + 
##     wing2body_trans + avg_days_c, family = binomial, data = data_fem)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1849  -1.1189  -0.7523   1.1182   2.7357  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)  
## (Intercept)               -0.15913    0.34121  -0.466   0.6409  
## host_c                    -0.61037    0.33019  -1.849   0.0645 .
## average_mass_trans        -2.08700    1.45468  -1.435   0.1514  
## wing2body_trans           -5.37017    2.66359  -2.016   0.0438 *
## avg_days_c                 0.11558    0.04757   2.430   0.0151 *
## host_c:average_mass_trans  3.02237    1.39976   2.159   0.0308 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 223.66  on 119  degrees of freedom
## Residual deviance: 202.11  on 114  degrees of freedom
## AIC: 239.06
## 
## Number of Fisher Scoring iterations: 4
# AB: conflation for females between days from start and eggs laid? Check total eggs vs. days from start (this is a rough sketch)
gf_point(total_eggs ~ days_from_start, data=data_tested) + 
  geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 420 rows containing non-finite values (stat_smooth).
## Warning: Removed 420 rows containing missing values (geom_point).

# not a strong enough relationship
summary(lm(total_eggs ~ days_from_start, data=data_tested)) # not significant
## 
## Call:
## lm(formula = total_eggs ~ days_from_start, data = data_tested)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -113.69  -70.72  -30.56   48.58  253.03 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       90.698     13.254   6.843 1.01e-10 ***
## days_from_start    1.090      0.981   1.112    0.268    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 94.35 on 192 degrees of freedom
##   (420 observations deleted due to missingness)
## Multiple R-squared:  0.006394,   Adjusted R-squared:  0.001219 
## F-statistic: 1.236 on 1 and 192 DF,  p-value: 0.2677

Males

data_male <- d_all[d_all$sex=="M",]
data_male <- center_data(data_male, is_not_binded = FALSE)
R1 = data_male$num_flew
R2 = data_male$num_notflew
A = data_male$host_c
B = data_male$sym_dist_s
C = data_male$average_mass_trans 
D = data_male$wing2body_trans  
E = data_male$avg_days_c

data<-data.frame(R1, R2, A, B, C, D, E)

source("src/compare_models.R")
errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E.R"))
##        [,1]       [,2]       [,3]      
## AICs   427.3933   427.6501   428.1158  
## models 105        50         83        
## probs  0.08394083 0.07382556 0.05848926
## 
## m105     glm(formula = cbind(R1, R2) ~ A * D + B * C + B * D + C * D + 
##     E, family = binomial, data = data)
## m50  glm(formula = cbind(R1, R2) ~ A * D + B * D + E, family = binomial, 
##     data = data)
## m83  glm(formula = cbind(R1, R2) ~ A * D + B * C + B * D + E, family = binomial, 
##     data = data)
#errors <- withWarnings(model_comparisonsAIC("src/generic models-binomial glm 2R ~ 4-FF + E + noE.R")) # AB: maybe better to use this for males since # does not matter?
length(errors$warnings)
## [1] 0
anova(m83, m105, test="Chisq") #marginal improvement from adding C*D
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * D + B * C + B * D + E
## Model 2: cbind(R1, R2) ~ A * D + B * C + B * D + C * D + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       204     347.73                       
## 2       203     345.01  1   2.7225  0.09894 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m50, m62, test="Chisq") #no improvement from adding C | m50 is the top model
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * D + B * D + E
## Model 2: cbind(R1, R2) ~ A * D + B * D + C + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1       206     351.27                     
## 2       205     351.01  1  0.25486   0.6137
anova(m83, m62, test="Chisq") #marginal improvement from B*C
## Analysis of Deviance Table
## 
## Model 1: cbind(R1, R2) ~ A * D + B * C + B * D + E
## Model 2: cbind(R1, R2) ~ A * D + B * D + C + E
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       204     347.73                       
## 2       205     351.01 -1  -3.2794  0.07015 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mass_model_male<-glm(cbind(num_flew, num_notflew)~host_c*wing2body_trans + sym_dist_s*wing2body_trans + avg_days_c, family=binomial, data=data_male) ## AB: host showing up matches top (simpler) model from original dataset (host + mass + days + wing but the nuance is that when used ID host was not significant and when use trial_type it was). Also strange to not see mass show up. It was significant in all the top models but not when it was the only predictor variable in a model.

summary(mass_model_male)
## 
## Call:
## glm(formula = cbind(num_flew, num_notflew) ~ host_c * wing2body_trans + 
##     sym_dist_s * wing2body_trans + avg_days_c, family = binomial, 
##     data = data_male)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6331  -0.7526   0.8309   1.1667   2.0725  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 0.56182    0.14435   3.892 9.94e-05 ***
## host_c                     -0.38215    0.18894  -2.023  0.04312 *  
## wing2body_trans            -9.75687    3.01847  -3.232  0.00123 ** 
## sym_dist_s                  0.13262    0.16364   0.810  0.41770    
## avg_days_c                 -0.03316    0.03421  -0.969  0.33232    
## host_c:wing2body_trans     -9.45796    4.27415  -2.213  0.02691 *  
## wing2body_trans:sym_dist_s  7.45469    3.57473   2.085  0.03703 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 372.15  on 212  degrees of freedom
## Residual deviance: 351.27  on 206  degrees of freedom
## AIC: 427.65
## 
## Number of Fisher Scoring iterations: 4
  • mass does not matter but morphology does by host or sym_dist

Plots

MLC: I just made this a function so it’s easy to collapse. Probably won’t stay in the final summary script, but I found it helpful for looking at these interactions. In making the data_temp summaries, data=d can be swapped for data=d[d$sex==“M”,] (or F) to look at host effects within one sex only.

six_plots<-function(){
##quick plots for y/n flight
##plot-specific grouping variables
d$mass_block<-round(d$average_mass/0.005)*0.005
d$f_prob<-d$num_flew/(d$num_flew+d$num_notflew)
d$wing2body_block<-round(d$wing2body, digits=2)
d$days_block<-round(d$avg_days, digits=0)


par(mfrow=c(2,3), mai=c(0.6, 0.6, 0.02, 0.02))
##mass by sex
data_temp<-aggregate(f_prob~sex*mass_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~sex*mass_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$mass_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight probability", xlab="Mass")
##Indeed, we can see that the effect of mass is very pronounced in females but essentially absent in males.
##Really clear to see - great image.
## AB: However, based on the delta mass analyses, it might be more nuanced (but it's definitely weaker). Mass does effect males, it might just be in a smaller window because their mass doesn't change as much? Consider the male group point at 1 flight probability - it might be an outlier? If you print the data temp you see that there is only 1 male bug in that group:

print(data_temp) # I don't think that their masses play a huge role but it does follow the trend in the overall data. I guess one way to parse this out experimentally would be to see how virgin female mass impacts flight probability. Would would have a smaller mass window and maybe more dispersed data points.


##wing2body by sex
data_temp<-aggregate(f_prob~sex*wing2body_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~sex*wing2body_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$wing2body_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight probability", xlab="wing-to-body ratio")
##Here we can see that as wing2body ratio increases, flight probability increases, but that the effect is much more pronounced in males.  


##average days from start by sex
data_temp<-aggregate(f_prob~sex*days_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~sex*days_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$days_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight probability", xlab="Days from start")
##Here we can see that as the average days from start increases, flight probability increases, but that the effect is really only visible in females. 
##This does not mean days from start didn't impact males - only that our experimental design successfully stopped 
##that from being confounding (eg, males may have been less likely to die, or had less biased mortality by host).
## AB: Or does this mean this is a pool size issue? Also, I might argue that for both males and females they are similarly weak? Not entirely visible for females either.


##mass by host
data_temp<-aggregate(f_prob~host_c*mass_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~host_c*mass_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$mass_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight probability", xlab="Mass")
##Here, we can see that the effect of mass is clear on GRT (red) but weak on BV (blue)

print(data_temp)

##wing2body by host
data_temp<-aggregate(f_prob~host_c*wing2body_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~host_c*wing2body_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$wing2body_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], , ylab="Flight probability", xlab="wing-to-body ratio")
##Here, we can see that the positive effect of wing2body ratio is clear on GRT (red) and on BV (blue)


##average days from start by host
data_temp<-aggregate(f_prob~host_c*days_block, data=d, FUN=mean)
data_temp$n<-aggregate(f_prob~host_c*days_block, data=d, FUN=length)$f_prob
plot(data_temp$f_prob~data_temp$days_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], , ylab="Flight probability", xlab="Days from start")
##Here we can see no clear impact of days from start when separated by host
}
six_plots()
##    sex mass_block    f_prob  n
## 1    M      0.025 0.3333333  6
## 2    M      0.030 0.6617647 34
## 3    M      0.035 0.6810345 58
## 4    F      0.040 1.0000000  1
## 5    M      0.040 0.7711864 59
## 6    F      0.045 0.2500000  6
## 7    M      0.045 0.6285714 35
## 8    F      0.050 0.2272727 11
## 9    M      0.050 0.5769231 13
## 10   F      0.055 0.3333333  6
## 11   M      0.055 0.5000000  7
## 12   F      0.060 0.5500000 10
## 13   M      0.060 1.0000000  1
## 14   F      0.065 0.2083333 12
## 15   F      0.070 0.5000000  8
## 16   F      0.075 0.3888889  9
## 17   F      0.080 0.2083333 12
## 18   F      0.085 0.3500000 10
## 19   F      0.090 0.3571429  7
## 20   F      0.095 0.2500000  4
## 21   F      0.100 0.1428571  7
## 22   F      0.105 0.3333333  6
## 23   F      0.110 0.3333333  3
## 24   F      0.115 0.0000000  2
## 25   F      0.120 0.0000000  2
## 26   F      0.125 0.0000000  1
## 27   F      0.135 0.0000000  2
## 28   F      0.180 0.0000000  1
##    host_c mass_block    f_prob  n
## 1      -1      0.025 0.5000000  2
## 2       1      0.025 0.2500000  4
## 3      -1      0.030 0.7352941 17
## 4       1      0.030 0.5882353 17
## 5      -1      0.035 0.6627907 43
## 6       1      0.035 0.7333333 15
## 7      -1      0.040 0.8000000 45
## 8       1      0.040 0.7000000 15
## 9      -1      0.045 0.6000000 35
## 10      1      0.045 0.4166667  6
## 11     -1      0.050 0.5555556 18
## 12      1      0.050 0.0000000  6
## 13     -1      0.055 0.5000000 11
## 14      1      0.055 0.0000000  2
## 15     -1      0.060 0.6875000  8
## 16      1      0.060 0.3333333  3
## 17     -1      0.065 0.1428571  7
## 18      1      0.065 0.3000000  5
## 19     -1      0.070 0.5000000  4
## 20      1      0.070 0.5000000  4
## 21     -1      0.075 0.2500000  6
## 22      1      0.075 0.6666667  3
## 23     -1      0.080 0.2500000  8
## 24      1      0.080 0.1250000  4
## 25     -1      0.085 0.3125000  8
## 26      1      0.085 0.5000000  2
## 27     -1      0.090 0.3000000  5
## 28      1      0.090 0.5000000  2
## 29     -1      0.095 0.2500000  4
## 30     -1      0.100 0.1666667  6
## 31      1      0.100 0.0000000  1
## 32     -1      0.105 0.2000000  5
## 33      1      0.105 1.0000000  1
## 34     -1      0.110 0.3333333  3
## 35     -1      0.115 0.0000000  2
## 36     -1      0.120 0.0000000  1
## 37      1      0.120 0.0000000  1
## 38     -1      0.125 0.0000000  1
## 39     -1      0.135 0.0000000  2
## 40     -1      0.180 0.0000000  1

AB: Nuances in mass, host, sex, and flight probability goes back to those bivariate plots I made. You can more subtly see how each responds as they change. I know you wnat to see things more continuous, but I’m going to plot all points because it doesn’t fall into those uneven group pitfalls/limitations.

gf_point(num_flew ~ average_mass, data=d) + 
  geom_smooth(method='lm')  +
  ggtitle("All Data: Number Times Flew vs. Average Mass (g)") # as average mass increases, the number of times flew decreases
## `geom_smooth()` using formula 'y ~ x'

Then break it between host and average mass:

p1 <- gf_point(num_flew ~ average_mass, col=~host_plant, data=d[d$average_mass_c < 0,], title="(a)   avgmass_c <0") + 
  geom_smooth(method='lm')
p2 <- gf_point(num_flew ~ average_mass, col=~host_plant, data=d[d$average_mass_c > 0,], title="(b)   avgmass_c >0") + 
  geom_smooth(method='lm')
grid.arrange(p1,p2, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Consistent with the model and graphs: (a) If GRT and had an average mass below the mean of the population, then you flew less times. (b) But if were GRT and had an average mass above the mean of the population, then you flew more times.

p3 <- gf_point(num_notflew ~ average_mass, col=~host_plant, data=d[d$average_mass_c < 0,], title="(a)   avgmass_c <0") + 
  geom_smooth(method='lm')
p4 <- gf_point(num_notflew ~ average_mass, col=~host_plant, data=d[d$average_mass_c > 0,], title="(b)   avgmass_c >0") + 
  geom_smooth(method='lm')
grid.arrange(p3,p4, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

p5 <- gf_point(num_flew ~ average_mass, col=~sex, data=d[d$average_mass_c < 0,], title="(a)   avgmass_c <0") + 
  geom_smooth(method='lm')
p6 <- gf_point(num_flew ~ average_mass, col=~sex, data=d[d$average_mass_c > 0,], title="(b)   avgmass_c >0") + 
  geom_smooth(method='lm')
grid.arrange(p5,p6, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

p7 <- gf_point(num_notflew ~ average_mass, col=~sex, data=d[d$average_mass_c < 0,], title="(a)   avgmass_c <0") + 
  geom_smooth(method='lm')
p8 <- gf_point(num_notflew ~ average_mass, col=~sex, data=d[d$average_mass_c > 0,], title="(b)   avgmass_c >0") + 
  geom_smooth(method='lm')
grid.arrange(p7,p8, nrow=2)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Most males’ average mass is below the average mass of the population

gf_point(num_flew ~ average_mass, col=~sex, data=d) + 
  geom_smooth(method='lm')
## `geom_smooth()` using formula 'y ~ x'

Note: I did not take out bugs that only flew once. That is important in order to do delta analyses and important to consider on certain limitations in these graphs (e.g. bugs would never reach 2 num_flew or 2 num_notflew).

Questions

Note: Have ran models using mass transformed and wing2body transformed with the original dataset and have found similar convergence issues. This method with the condensed dataset is better.

Coefficients between the interactions host * average_mass_trans and sym_dist * average_mass_trans for the Males top model are conflicting. Problematic?

Coefficients between the interactions host * wing2body and sym_dist * wing2body for the Males top model are conflicting. Problematic?

noE vs. E source script? Why use the + E source script?

Flight Speed

The Data

rm(list=ls())

output_col = FALSE 
source("src/clean_flight_data.R") 
source("src/regression_output.R") 
source("src/center_flight_data.R")
source("src/get_warnings.R")

data <- read_flight_data("data/all_flight_data-Winter2020.csv")
data_all <- data[[1]]
data_tested <- data[[2]]
### Remove everyone who didn't fly (then remove distances = 0, if that didn't work fully)
data_flew_all <- data_tested[data_tested$flew_b == 1, ] 

### Check for low speeds
low_speeds <- data_flew_all %>%
  filter(average_speed < 0.05)

### Check for high speeds
high_speeds <- data_flew_all %>%
  filter(average_speed > 0.65)

low_speeds$flight_type# have 7 bugs with average_speed = 0 but were marked as bursters (this could be just something very short (second burst) - not enough to grant a calculation) - I decided to remove them. But one bug was continuous and had 0 distance and 0 speeds - that was bug 196 T2 set011-3-03-2020-A3_196.txt
##  [1] B B B B B B B B B C B B B
## Levels:  B BC C CB
high_speeds$flight_type # 3 bugs - also bursters. Could also be short explosive bursts but not true to the biology of these bugs (more like us blowing on them).
## [1] B B B
## Levels:  B BC C CB
### Remove outliers
data_flew <- data_flew_all %>%
  filter(average_speed > 0.05) %>%
  filter(average_speed < 0.65)
  
data_flew <- center_data(data_flew)

## transform mass & speed
data_flew$mass_trans<-log(data_flew$mass)-mean(log(data_flew$mass), na.rm=TRUE)

data_flew$speed_trans<-log(data_flew$average_speed)-mean(log(data_flew$average_speed), na.rm=TRUE)
#######do flight types differ?
data_flew$flight_type <- relevel(data_flew$flight_type, ref="B")
tidy_regression(lmer(speed_trans~flight_type + (1|chamber) + (1|ID), data=data_flew), is_color=output_col) 
## lmer speed_trans ~ flight_type + (1 | chamber) + (1 | ID) data_flew 
## AIC:  150.0831 134.0831 
## (Intercept)      coeff:  0.0818461   tval:  1.691962
## flight_type      coeff:  0.1180218   tval:  0.7058626
## flight_typeBC    coeff:  -0.1442107  tval:  -2.600565
## flight_typeC     coeff:  -0.1245013  tval:  -3.594283
## flight_typeCB    coeff:  -0.2407188  tval:  -2.302263
# yes, B and C differ distinctly in average speed; BC and CB are not different from C, so let's keep those in the continuous flight analyses.

####### Effect of chamber B-2 and B-4
data_flew$chamber <- relevel(data_flew$chamber, ref="A-4")
tidy_regression(lmer(speed_trans~chamber + (1|ID), data=data_flew), is_color=output_col)###Possibly reductions in speed 
## lmer speed_trans ~ chamber + (1 | ID) data_flew 
## AIC:  166.5634 146.5634 
## (Intercept)  coeff:  0.0465751   tval:  1.192484
## chamberA-1   coeff:  0.1317075   tval:  1.961805
## chamberA-2   coeff:  0.0714424   tval:  1.18636
## chamberA-3   coeff:  -0.0614469  tval:  -1.064087
## chamberB-1   coeff:  0.0383557   tval:  0.594997
## chamberB-2   coeff:  -0.1540959  tval:  -2.723254
## chamberB-3   coeff:  -0.0745587  tval:  -1.207074
## chamberB-4   coeff:  -0.258447   tval:  -4.257179
####### No effect of test date
tidy_regression(lmer(speed_trans~days_from_start + (1|chamber), data=data_flew), is_color=output_col)
## lmer speed_trans ~ days_from_start + (1 | chamber) data_flew 
## AIC:  157.7216 149.7216 
## (Intercept)      coeff:  -0.0131111  tval:  -0.2631498
## days_from_start  coeff:  0.0018488   tval:  0.7862338
####### No effect of test time
tidy_regression(lmer(average_speed~min_from_IncStart + (1|chamber), data=data_flew), is_color=output_col)
## lmer average_speed ~ min_from_IncStart + (1 | chamber) data_flew 
## AIC:  -579.7171 -587.7171 
## (Intercept)          coeff:  0.326614    tval:  21.39577
## min_from_IncStart    coeff:  -4.02e-05   tval:  -1.098817

MLC: bursters are likely to be much less reliable than continuous flyers; so, let’s exclude them.

Non-Bursters

C, BC, and CB data for speed

data_flew <- data_flew %>%
  filter(!is.na(mass))
data_flew <- center_data(data_flew)

dC<-data_flew[data_flew$flight_type=="C" | data_flew$flight_type=="BC" | data_flew$flight_type=="CB" ,] 
dC <- dC %>%
  filter(!is.na(body))
dC <- center_data(dC)
data<-data.frame(R=dC$speed_trans,
                 A=dC$host_c, 
                 B=dC$sex_c, 
                 C=dC$mass_trans,
                 D=dC$sym_dist_s, #I note it does not matter whether this is sym_dist or wing2body 
                 X=dC$chamber,
                 Y=dC$ID) 

source("src/compare_models.R")
model_comparisonsAIC("src/generic models-gaussian glmer 2-RF + 4-FF REMLF.R")
##        [,1]      [,2]       [,3]       [,4]      
## AICs   50.57653  52.31405   52.57508   52.87968  
## models 19        28         29         3         
## probs  0.1990675 0.08350328 0.07328603 0.06293305
## 
## m19  R ~ B * C + (1 | X) + (1 | Y)
## m28  R ~ B * C + A + (1 | X) + (1 | Y)
## m29  R ~ B * C + D + (1 | X) + (1 | Y)
## m3   R ~ C + (1 | X) + (1 | Y)
anova(m19, m28, test="Chisq") #adding A does not improve fit
## Data: data
## Models:
## m19: R ~ B * C + (1 | X) + (1 | Y)
## m28: R ~ B * C + A + (1 | X) + (1 | Y)
##     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m19    7 50.577 73.005 -18.288   36.577                     
## m28    8 52.314 77.946 -18.157   36.314 0.2625  1     0.6084
anova(m19, m29, test="Chisq") #adding D does not improve fit
## Data: data
## Models:
## m19: R ~ B * C + (1 | X) + (1 | Y)
## m29: R ~ B * C + D + (1 | X) + (1 | Y)
##     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m19    7 50.577 73.005 -18.288   36.577                     
## m29    8 52.575 78.207 -18.288   36.575 0.0015  1     0.9696
anova(m19, m8, test="Chisq") #adding B*C interaction does improve fit | m19 best fit
## Data: data
## Models:
## m8: R ~ B + C + (1 | X) + (1 | Y)
## m19: R ~ B * C + (1 | X) + (1 | Y)
##     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)  
## m8     6 54.839 74.063 -21.420   42.839                       
## m19    7 50.577 73.005 -18.288   36.577 6.2629  1    0.01233 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
continuous_model<-lmer(speed_trans~sex*mass_trans + (1|ID) + (1|chamber), data=dC)

summary(continuous_model) 
## Linear mixed model fit by REML ['lmerMod']
## Formula: speed_trans ~ sex * mass_trans + (1 | ID) + (1 | chamber)
##    Data: dC
## 
## REML criterion at convergence: 48.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.22163 -0.52173 -0.04694  0.53790  2.49311 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ID       (Intercept) 0.01828  0.1352  
##  chamber  (Intercept) 0.02271  0.1507  
##  Residual             0.04963  0.2228  
## Number of obs: 182, groups:  ID, 135; chamber, 8
## 
## Fixed effects:
##                 Estimate Std. Error t value
## (Intercept)      0.11733    0.10291   1.140
## sexM            -0.14761    0.09177  -1.608
## mass_trans      -0.18444    0.19744  -0.934
## sexM:mass_trans  0.60933    0.24341   2.503
## 
## Correlation of Fixed Effects:
##             (Intr) sexM   mss_tr
## sexM        -0.815              
## mass_trans  -0.732  0.814       
## sxM:mss_trn  0.604 -0.574 -0.822
# QQ plot of residuals 
s.test <- paste("pval: ", shapiro.test(residuals(continuous_model))$p.value)
qqnorm(resid(continuous_model))
qqline(resid(continuous_model))
text(-2, 0.1, s.test) #  normal post-transform

# Diagnostic plot
dC['fitted_values'] = fitted.values(continuous_model)
dC['fitted_residuals'] = residuals(continuous_model)

ggplot(data = dC ) + geom_point(aes(x = fitted_values, y = fitted_residuals)) +
  geom_hline(yintercept = 0) +
  labs(title = "Plot of Fitted Residuals against Fitted Values",
       x = "Fitted Values", y = "Fitted Residuals")

# Plot of the leverage points
halfnorm(hatvalues(continuous_model))

hatvalues(continuous_model)[138]
##       138 
## 0.3962033
hatvalues(continuous_model)[1]
##         1 
## 0.3892357

Plots

speed_summary<-aggregate(average_speed~sex, data=dC, FUN=mean)

speed_summary$se<-aggregate(average_speed~sex, data=dC, FUN=function(x)
  sd(x)/sqrt(length(x)))$average_speed

MLC: There is not enough replication to break this down by sex, so we’ll leave it as it is. This could be generalized, although it’s probably not worth the time?

four_plots<-function(){
##quick plots for speed
##plot-specific grouping variables
data_flew$mass_block<-round(data_flew$mass/0.005)*0.005 # AB (self-note): rounded up so the range, or block, begins 0.05 g less than the recorded mass_block value in data_tmep
data_flew$wing2body_block<-round(data_flew$wing2body, digits=2)

par(mfrow=c(2,2), mai=c(0.8, 0.8, 0.02, 0.02))
##mass by sex
data_temp<-aggregate(average_speed~sex*mass_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~sex*mass_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$mass_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight speed", xlab="Mass")
##Unlike for flight probability, here we see a clear effect of mass on male flight speed, but not female flight speed.


##wing2body by sex
data_temp<-aggregate(average_speed~sex*wing2body_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~sex*wing2body_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$wing2body_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight speed", xlab="wing-to-body ratio")
##Here we can see that as wing2body ratio increases, flight speed increases marginally in males, but not noticeably in females

#mass by host
data_temp<-aggregate(average_speed~host_c*mass_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~host_c*mass_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$mass_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight speed", xlab="Mass")
##Here we can see a weak positive effect of mass on flight speed, that does not appear to differ between hosts


##wing2body by host
data_temp<-aggregate(average_speed~host_c*wing2body_block, data=data_flew, FUN=mean)
data_temp$n<-aggregate(average_speed~host_c*wing2body_block, data=data_flew, FUN=length)$average_speed
plot(data_temp$average_speed~data_temp$wing2body_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight speed", xlab="wing-to-body ratio")
##Here we can see a potential weak positive effect of wing2body ratio on flight speed that does not differ between hosts.

}
four_plots()

AB: 2 females at 0.41 m/s average speed at mass 0.35 to 0.40 g and 1 male at 0.42 m/s average speed at mass 0.60 to 0.65 g

# pwc = "Pairwise Wilcoxon test" between groups
pwc <- dC %>% 
  dunn_test(average_speed ~ sex, p.adjust.method = "bonferroni") %>%
  add_xy_position(x = "sex")
res.kruskal <- dC %>% kruskal_test(average_speed ~ sex)

ggplot(dC, aes(sex, average_speed)) + 
  geom_violin() +
    labs(
    subtitle = get_test_label(res.kruskal, detailed = TRUE), 
    caption = get_pwc_label(pwc)
    ) + 
  theme(legend.position="none") +
  xlab("sex") +
  ylab("average speed (m/s)") +
  geom_boxplot(width=0.1) +
  stat_pvalue_manual(pwc, hide.ns = TRUE)

Close to being significantly different between the sexes - mostly marginal

# check out the outliers
select(dC[dC$sex == "M" & dC$average_speed > 0.5, ], average_speed, distance, chamber) # really fast moving male bugs
##     average_speed   distance chamber
## 81      0.5400811 12317.8215     B-2
## 118     0.5205269  1662.4818     A-1
## 161     0.5170697   749.5619     A-3
dC_ro <- dC[dC$average_speed < 0.51, ]

pwc <- dC_ro %>% 
  dunn_test(average_speed ~ sex, p.adjust.method = "bonferroni") %>%
  add_xy_position(x = "sex")
res.kruskal <- dC_ro %>% kruskal_test(average_speed ~ sex)
pwc
## # A tibble: 1 x 13
##   .y.   group1 group2    n1    n2 statistic      p  p.adj p.adj.signif
##   <chr> <chr>  <chr>  <int> <int>     <dbl>  <dbl>  <dbl> <chr>       
## 1 aver… F      M         37   142     -2.01 0.0445 0.0445 *           
## # … with 4 more variables: y.position <dbl>, groups <named list>, xmin <int>,
## #   xmax <int>
ggplot(dC_ro, aes(sex, average_speed)) + 
  geom_violin() +
    labs(
    subtitle = get_test_label(res.kruskal, detailed = TRUE), 
    caption = get_pwc_label(pwc)
    ) + 
  theme(legend.position="none") +
  xlab("sex") +
  ylab("average speed (m/s)") +
  geom_boxplot(width=0.1) +
  stat_pvalue_manual(pwc, hide.ns = TRUE)

AB: relationship significant if you remove ‘outliers’ but I wouldn’t consider them outliers. Also important to remember the very different sex pool sizes: there are over 3.5 times as many males as females. Generally, across the literature/across species, speed doesn’t show much difference between sexes within a species but max_speed has been used as a parameter. This parameter is tricky because I would need to go back and make sure I remove any of those first fast outliers from us blowing on them. It’s possible and there are many flight parameters that we aren’t using that may show us more about how these insects perform (Jones 2015) really takes advantage of many flight parameters

Questions

Difference in pool sizes between sexes causes this hard-to-parse-out comparison?

What to do about potential “outliers”? I don’t see them as outliers, just some powerful male flyers.

Use max speed instead of average speed?

Flight Distance

The Data

rm(list=ls())

output_col = FALSE 
source("src/clean_flight_data.R") 
source("src/regression_output.R") 
source("src/center_flight_data.R")
source("src/get_warnings.R")
data <- read_flight_data("data/all_flight_data-Winter2020.csv")
data_all <- data[[1]]
data_tested <- data[[2]]

### Remove everyone who didn't fly (then remove distances = 0, if that didn't work fully)
data_flew <- data_tested[data_tested$flew_b == 1, ]
data_flew <- data_flew[data_flew$distance > 0, ]
data_flew <- center_data(data_flew) ##bursters are likely to be much less reliable than continuous flyers; so, let's exclude them.

data_flew <- data_flew %>%
  filter(!is.na(mass))
data_flew <- center_data(data_flew)
#######do flight types differ?
data_flew$flight_type <- relevel(data_flew$flight_type, ref="B")
tidy_regression(lmer(distance~flight_type + (1|chamber) + (1|ID), data=data_flew), is_color=output_col) ###This is intensely influenced by flight_type - similar to speed, I think these are much more meaningful for continuous fliers.
## lmer distance ~ flight_type + (1 | chamber) + (1 | ID) data_flew 
## AIC:  6351.763 6335.763 
## (Intercept)      coeff:  291.5131498 tval:  0.7138526
## flight_type      coeff:  1023.3143061    tval:  0.5211221
## flight_typeBC    coeff:  3665.0176043    tval:  5.704257
## flight_typeC     coeff:  4662.0917622    tval:  11.56366
## flight_typeCB    coeff:  -14.0062226 tval:  -0.01085566

Non-Bursters

C, BC, and CB data for distance

dC<-data_flew[data_flew$flight_type=="C" | data_flew$flight_type=="BC" | data_flew$flight_type=="CB" ,] 
dC <- dC %>%
  filter(!is.na(body))
dC <- center_data(dC)

##transform mass
dC$mass_trans<-log(dC$mass)-mean(log(dC$mass), na.rm=TRUE)

##transform distance
dC$distance_trans<-log(sqrt(100+dC$distance))-mean(log(sqrt(100+dC$distance)), na.rm=TRUE)
####### Effect of chamber B-1 and A-1
dC$chamber <- relevel(dC$chamber, ref="B-4")
tidy_regression(lmer(distance_trans~chamber + (1|ID), data=dC), is_color=output_col) ###yes, this one's an issue 
## lmer distance_trans ~ chamber + (1 | ID) dC 
## AIC:  341.1044 321.1044 
## (Intercept)  coeff:  -0.1252246  tval:  -1.054125
## chamberA-1   coeff:  0.4215996   tval:  2.252771
## chamberA-2   coeff:  0.1892315   tval:  1.18247
## chamberA-3   coeff:  0.1783989   tval:  1.109788
## chamberA-4   coeff:  -0.1622016  tval:  -0.9709388
## chamberB-1   coeff:  0.3942594   tval:  2.391899
## chamberB-2   coeff:  -0.0820173  tval:  -0.4996263
## chamberB-3   coeff:  0.1264823   tval:  0.7068737
####### No effect of test date
tidy_regression(lmer(distance_trans~days_from_start + (1|chamber), data=dC), is_color=output_col)
## lmer distance_trans ~ days_from_start + (1 | chamber) dC 
## AIC:  338.8225 330.8225 
## (Intercept)      coeff:  0.0338355   tval:  0.345132
## days_from_start  coeff:  -0.0025523  tval:  -0.4287195
####### no effect of start time
tidy_regression(lmer(distance_trans~min_from_IncStart + (1|chamber), data=dC), is_color=output_col)
## lmer distance_trans ~ min_from_IncStart + (1 | chamber) dC 
## AIC:  344.3012 336.3012 
## (Intercept)          coeff:  0.0441264   tval:  0.4925515
## min_from_IncStart    coeff:  -0.0002343  tval:  -0.7209203
### model transformed distance
data<-data.frame(R=dC$distance_trans,
                 A=dC$host_c, 
                 B=dC$sex_c, 
                 C=dC$mass_trans,
                 D=dC$wing2body, #Sym_dist showed up in nothing, reran with wing2body
                 X=dC$chamber,
                 Y=dC$ID) 

source("src/compare_models.R")
model_comparisonsAIC("src/generic models-gaussian glmer 2-RF + 4-FF REMLF.R")
##        [,1]      [,2]       [,3]       [,4]      
## AICs   323.0466  324.1639   324.6107   324.6699  
## models 3         2          10         6         
## probs  0.1221808 0.06988459 0.05589216 0.05426259
## 
## m3   R ~ C + (1 | X) + (1 | Y)
## m2   R ~ B + (1 | X) + (1 | Y)
## m10  R ~ C + D + (1 | X) + (1 | Y)
## m6   R ~ A + C + (1 | X) + (1 | Y)
anova(m2, m9, test="Chisq") #adding D does not improve things
## Data: data
## Models:
## m2: R ~ B + (1 | X) + (1 | Y)
## m9: R ~ B + D + (1 | X) + (1 | Y)
##    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m2    5 324.16 340.18 -157.08   314.16                     
## m9    6 325.07 344.29 -156.53   313.07 1.0968  1      0.295
anova(m3, m8, test="Chisq") #adding B does not improve things
## Data: data
## Models:
## m3: R ~ C + (1 | X) + (1 | Y)
## m8: R ~ B + C + (1 | X) + (1 | Y)
##    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m3    5 323.05 339.07 -156.52   313.05                     
## m8    6 324.88 344.10 -156.44   312.88 0.1708  1     0.6794
anova(m2, m8, test="Chisq") #adding C does not improve things
## Data: data
## Models:
## m2: R ~ B + (1 | X) + (1 | Y)
## m8: R ~ B + C + (1 | X) + (1 | Y)
##    npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m2    5 324.16 340.18 -157.08   314.16                     
## m8    6 324.88 344.10 -156.44   312.88 1.2881  1     0.2564
anova(m3, m10, test="Chisq") # Adding D does not improve fit
## Data: data
## Models:
## m3: R ~ C + (1 | X) + (1 | Y)
## m10: R ~ C + D + (1 | X) + (1 | Y)
##     npar    AIC    BIC  logLik deviance  Chisq Df Pr(>Chisq)
## m3     5 323.05 339.07 -156.52   313.05                     
## m10    6 324.61 343.83 -156.31   312.61 0.4358  1     0.5091

MLC: Once again, unclear if it’s mass or sex driving the increase in distance traveled. Unfortunately, we still don’t have the replication to split this by sex (at least, not for females). So, we just have to accept the uncertainty here - but we can use the delta stats to help dig into this for mass!

distance_model_all<-lmer(distance_trans~mass_trans + (1|chamber) + (1|ID), data=dC)
summary(distance_model_all)
## Linear mixed model fit by REML ['lmerMod']
## Formula: distance_trans ~ mass_trans + (1 | chamber) + (1 | ID)
##    Data: dC
## 
## REML criterion at convergence: 318.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.57080 -0.66165  0.00832  0.79299  1.75278 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ID       (Intercept) 0.04723  0.2173  
##  chamber  (Intercept) 0.02809  0.1676  
##  Residual             0.26978  0.5194  
## Number of obs: 182, groups:  ID, 135; chamber, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 0.005233   0.073614   0.071
## mass_trans  0.350918   0.162555   2.159
## 
## Correlation of Fixed Effects:
##            (Intr)
## mass_trans 0.006
alt_distance_model<-lmer(distance_trans~sex + (1|chamber) + (1|ID), data=dC)
summary(alt_distance_model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: distance_trans ~ sex + (1 | chamber) + (1 | ID)
##    Data: dC
## 
## REML criterion at convergence: 320.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.41991 -0.68360 -0.00498  0.79067  1.78744 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  ID       (Intercept) 0.05958  0.2441  
##  chamber  (Intercept) 0.02868  0.1694  
##  Residual             0.26030  0.5102  
## Number of obs: 182, groups:  ID, 135; chamber, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   0.1653     0.1144   1.445
## sexM         -0.2028     0.1091  -1.860
## 
## Correlation of Fixed Effects:
##      (Intr)
## sexM -0.759
# QQ plot of residuals 
s.test <- paste("pval: ", shapiro.test(residuals(distance_model_all))$p.value)
qqnorm(resid(distance_model_all))
qqline(resid(distance_model_all))
text(-2, 0.1, s.test) #  normal post-transform

# Diagnostic plot
dC['fitted_values'] = fitted.values(distance_model_all)
dC['fitted_residuals'] = residuals(distance_model_all)

ggplot(data = dC ) + geom_point(aes(x = fitted_values, y = fitted_residuals)) +
  geom_hline(yintercept = 0) +
  labs(title = "Plot of Fitted Residuals against Fitted Values",
       x = "Fitted Values", y = "Fitted Residuals")

# AB: Diagnostic plot not looking so good, there's a striking linear relationship. It might not be best to actually let it pass

# Plot of the leverage points
halfnorm(hatvalues(distance_model_all))

hatvalues(distance_model_all)[138]
##      138 
## 0.225122
hatvalues(distance_model_all)[127]
##       127 
## 0.2052411
# QQ plot of residuals 
s.test <- paste("pval: ", shapiro.test(residuals(alt_distance_model))$p.value)
qqnorm(resid(alt_distance_model))
qqline(resid(alt_distance_model))
text(-2, 0.1, s.test) #  normal post-transform

# Diagnostic plot
dC['fitted_values'] = fitted.values(alt_distance_model)
dC['fitted_residuals'] = residuals(alt_distance_model)

ggplot(data = dC ) + geom_point(aes(x = fitted_values, y = fitted_residuals)) +
  geom_hline(yintercept = 0) +
  labs(title = "Plot of Fitted Residuals against Fitted Values",
       x = "Fitted Values", y = "Fitted Residuals")

# AB: this plot isn't looking any better

# Plot of the leverage points
halfnorm(hatvalues(alt_distance_model))

hatvalues(alt_distance_model)[167]
##       167 
## 0.2353251
hatvalues(alt_distance_model)[168]
##       168 
## 0.2353251

Plots

four_plots<-function(){
##quick plots for distance
##plot-specific grouping variables
dC$mass_block<-round(dC$mass/0.005)*0.005
dC$wing2body_block<-round(dC$wing2body, digits=2)


par(mfrow=c(2,2), mai=c(0.8, 0.8, 0.02, 0.02))
##mass by sex
data_temp<-aggregate(distance~sex*mass_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~sex*mass_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$mass_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight distance", xlab="Mass")
##Like flight speed, here we again see what looks like a positive effect of mass for males, but not for females


##wing2body by sex
data_temp<-aggregate(distance~sex*wing2body_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~sex*wing2body_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$wing2body_block, pch=c(2,19)[as.factor(data_temp$sex)], ylab="Flight distance", xlab="wing-to-body ratio")
##There's not much going on here.

#mass by host
data_temp<-aggregate(distance~host_c*mass_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~host_c*mass_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$mass_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight distance", xlab="Mass")
##Not much here either


##wing2body by host
data_temp<-aggregate(distance~host_c*wing2body_block, data=dC, FUN=mean)
data_temp$n<-aggregate(distance~host_c*wing2body_block, data=dC, FUN=length)$distance
plot(data_temp$distance~data_temp$wing2body_block, pch=19, col=c(rgb(1,0.1,0,0.8),rgb(0,1,0.8,0.8))[as.factor(data_temp$host_c)], ylab="Flight distance", xlab="wing-to-body ratio")
##Not much here either

}
four_plots()

Questions

MLC pointed out that distance is not normal even after transforming it, but I’m really not sure if it’s ok to let it pass. Seems like it’s better not. We might have to use nonparametric tests like the Kruskal-Wallis.