#knitr::purl("wing_stats.Rmd", output = "wing_stats.R") # convert Rmd to R script

Base statistics of all morphology for wing morph and wing2body ratio (in long-winged bugs).

Conclusion from the wing_timeseries.Rmd: Found no significant temporal dependencies. Determined that most time series were nonstationary and all ts were an AR(0) process [white noise with moving average]. Concluded that the time values were independent of each other.

Sourcing Scripts and Cleaning the Data

source_path = "~/Desktop/git_repositories/SBB-dispersal/avbernat/Rsrc/"

script_names = c("compare_models.R",
                 "regression_output.R", 
                 "clean_morph_data.R", # two functions: read_morph_data and remove_torn_wings
                 "AICprobabilities.R",
                 "remove_torn_wings.R",
                 "get_Akaike_weights.R")

for (script in script_names) { 
  path = paste0(source_path, script)
  source(path) 
}

source("~/Desktop/git_repositories/SBB-dispersal/avbernat/RSsrc/spatial_dependencies.R") # space
data_list <- read_morph_data("data/allmorphology05.10.21-coors.csv")
## morph types: L S  NA LS L/S SL 
##    recoding missing morph types...
##    S if wing2thorax <=2.2, L if wing2thorax >=2.5
## 
## ambiguous wing morph bug count:  32
raw_data = data_list[[1]]
data_long = data_list[[2]]

Checking for Outliers

dotplot = function(var, data) {
  plot(data[,var], seq(1:nrow(data)),
       col=data$color, ylab="Order of the data from text file", xlab=paste0("Value of ", var))
}
# remove NA dates
d = data_long %>%
  filter(!is.na(wing2body))

range(d$wing2body) # 0.5773723 0.8472103
## [1] 0.6623794 0.8472103
MyVar <- c("wing2body", "wing2thorax", "body", "wing", "beak", "thorax", "lat", "long") 
d$color = "red"
d$color[d$year == "2019" & d$month =="May"] = "black"

par(mfrow=c(3,3))
for (v in MyVar) {
  dotplot(v, d)
}

# remove outliers?
d2 = d %>% 
  filter(!beak<4.5) %>%
  filter(!wing2body <0.62) #%>%
  #filter(!body < 8) %>%
  #filter(!wing2body > 0.79)

d3 = remove_torn_wings(d2)
## 
## number of bugs with torn wings: 193
par(mfrow=c(3,3))

for (v in MyVar) {
  dotplot(v, d3)
}

Wing overmeasured? Body measured too small? Probably the body measured off? wing seems off.

Wing Morph

Infer missing wing morph values

raw_data_missing = raw_data %>%
  filter(w_morph=="" | is.na(w_morph)) # had 30 that are hard to identify as either S or L based on the wing2thorax thresholds.
head(raw_data_missing) # need to determine the morph of these bugs manually. In the long term...need to do this manually because the boundaries of what is defined as a morph can change over time.
##      pophost  population sex beak thorax wing body    month   year
## 1 K. elegans Gainesville   M 5.65   2.80 6.36   NA    April 2014.0
## 2 K. elegans Gainesville   M 5.62   2.79   NA   NA    April 2014.0
## 3 K. elegans    Leesburg   M 5.06   2.63 6.14   NA      May 2013.0
## 4 K. elegans    Leesburg   F 7.54   3.79 8.68   NA December 2013.6
## 5 K. elegans    Leesburg   F 7.50   3.04 6.86   NA December 2013.6
## 6 K. elegans    Leesburg   F 6.26   3.32 7.47   NA December 2013.6
##   months_since_start season w_morph      lat      long diapause
## 1                 11 spring         29.66374 -82.36091         
## 2                 11 spring    <NA> 29.66374 -82.36091         
## 3                  0 spring         28.79622 -81.87803         
## 4                  7 winter         28.79602 -81.87777         
## 5                  7 winter         28.79602 -81.87777         
## 6                  7 winter         28.79602 -81.87777         
##   field_date_collected                         notes        site sex_b
## 1                                                     23rd & 8th    -1
## 2                                                     23rd & 8th    -1
## 3           05.02.2013 date estimated; GPS estimated Mount & 8th    -1
## 4                                                    Mount & 8th     1
## 5                                                    Mount & 8th     1
## 6                                                    Mount & 8th     1
##   pophost_b            date datetime      dates month_of_year wing2thorax
## 1         1      April/2014 Apr 2014 2014-04-01             4    2.271429
## 2         1      April/2014 Apr 2014 2014-04-01             4          NA
## 3         1      April/2013 Apr 2013 2013-04-01             4    2.334601
## 4         1 December/2013.6 Dec 2013 2013-12-01            12    2.290237
## 5         1 December/2013.6 Dec 2013 2013-12-01            12    2.256579
## 6         1 December/2013.6 Dec 2013 2013-12-01            12    2.250000
##   wing_morph_b
## 1           NA
## 2           NA
## 3           NA
## 4           NA
## 5           NA
## 6           NA
par(mfrow=c(3,1))
hist(raw_data$wing[raw_data$w_morph=="L"]/raw_data$thorax[raw_data$w_morph=="L"],
     main="Histogram of wing length/thorax length for long winged SBB",
     xlab="wing length/thorax length",
      breaks=seq(0.5, 3.8, by=0.05))
hist(raw_data_missing$wing/raw_data_missing$thorax, 
      main="Histogram of wing length/thorax length for SBB w/o recorded wing morph",
      xlab="wing length/thorax length",
      breaks=seq(0.5, 3.8, by=0.05))
hist(raw_data$wing[raw_data$w_morph=="S"]/raw_data$thorax[raw_data$w_morph=="S"],
      main="Histogram of wing length/thorax length for short winged SBB",
      xlab="wing length/thorax length",
      breaks=seq(0.5,3.8,by=0.05))

Plotting Data Points Across Time and Group

Grouping the data for group counts.

group = raw_data %>% group_by(sex, 
                  datetime) %>% 
  summarize(count = n())
## `summarise()` has grouped output by 'sex'. You can override using the `.groups` argument.

Checking over bug numbers per season.

#raw_data[raw_data$year =="2020",]  #476 survived shipment, so I"m guessing that about 103 bugs did not survive shipment but were still measured for morphology?
#raw_data[raw_data$year =="2019" & raw_data$month=="October",] #similar here. 372 bugs measured but 207 survived. So that means that 165 bugs did not survive and were still measured for morph. 

Plotting the histograms.

data = raw_data
data$population[data$population=="Ft.Myers"]<-"Ft. Myers"
data$population[data$population=="Key_Largo"]<-"Key Largo"
data$population[data$population=="Lake_Placid"]<-"Lake Placid"
data$population[data$population=="Lake_Wales"]<-"Lake Wales"
data$population[data$population=="North_Key_Largo"]<-"North Key Largo"
data$population[data$population=="Plantation_Key"]<-"Plantation Key"

data$pophost[data$pophost=="C.corindum"]<-"C. corindum"
data$pophost[data$pophost=="K.elegans"]<-"K. elegans"

data = data[order(-data$lat),] 
#lat_order = unique(data$population) # should work but lat and long not right so need to do it manually:
lat_order = c("Gainesville", "Leesburg", "Lake Wales", "Lake Placid", "Ft. Myers",  "Homestead",  "North Key Largo", "Key Largo", "Plantation Key")
data$population = as.factor(data$population)
data$population = factor(data$population,levels=lat_order)

I have not removed the bugs with torn wings yet.

Binomial Regressions

What determines wing morph using all the data? Does sex? host plant? month of the year?

Testing time covariates

m = glm(wing_morph_b ~ months_since_start, data=raw_data, family="binomial")
tidy_regression(m, is_color=FALSE)
## glm wing_morph_b ~ months_since_start binomial raw_data 
## AIC:  3585.838 
## (Intercept)          coeff:  1.4966103   Pr(>|t|):  1.462966e-71 *
## months_since_start   coeff:  -0.0034902  Pr(>|t|):  0.013932 *
m = glm(wing_morph_b ~ month_of_year, data=raw_data, family="binomial")
tidy_regression(m, is_color=FALSE)
## glm wing_morph_b ~ month_of_year binomial raw_data 
## AIC:  3548.031 
## (Intercept)      coeff:  0.8065427   Pr(>|t|):  8.994927e-21 *
## month_of_year    coeff:  0.0803754   Pr(>|t|):  7.132571e-11 *
nrow(data)
## [1] 3532
data<-data.frame(R=raw_data$wing_morph_b, 
                 A=raw_data$sex_b, 
                 B=raw_data$pophost_b, 
                 C=(raw_data$month_of_year),
                 D=raw_data$months_since_start)

model_script = paste0(source_path,"generic models-binomial glm 3-FF.R")
model_comparisonsAIC(model_script)
##        [,1]      [,2]      [,3]      [,4]      [,5]      [,6]       [,7]      
## AICs   3185.995  3186.871  3187.155  3187.895  3187.898  3188.452   3189.02   
## models 15        13        11        7         17        16         14        
## probs  0.2635026 0.1700435 0.1475361 0.1019236 0.1017702 0.07714955 0.05806098
## 
## m15  glm(formula = R ~ A * B + B * C, family = binomial, data = data)
## m13  glm(formula = R ~ B * C + A, family = binomial, data = data)
## m11  glm(formula = R ~ A * B + C, family = binomial, data = data)
## m7   glm(formula = R ~ A + B + C, family = binomial, data = data)
## m17  glm(formula = R ~ A * B + A * C + B * C, family = binomial, data = data)
## m16  glm(formula = R ~ A * C + B * C, family = binomial, data = data)
## m14  glm(formula = R ~ A * B + A * C, family = binomial, data = data)
anova(m13, m15, test="Chisq") # Adding A*B marginally improves fit
anova(m7, m13, test="Chisq") # Adding B*C marginally improves fit
anova(m5, m7, test="Chisq")
anova(m6, m7, test="Chisq")
anova(m4, m7, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: R ~ B * C + A
## Model 2: R ~ A * B + B * C
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1      3480     3176.9                       
## 2      3479     3174.0  1    2.876  0.08991 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: R ~ A + B + C
## Model 2: R ~ B * C + A
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1      3481     3179.9                       
## 2      3480     3176.9  1   3.0237  0.08206 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: R ~ A + C
## Model 2: R ~ A + B + C
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      3482     3541.3                          
## 2      3481     3179.9  1   361.36 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: R ~ B + C
## Model 2: R ~ A + B + C
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1      3482     3186.3                       
## 2      3481     3179.9  1   6.4204  0.01128 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: R ~ A + B
## Model 2: R ~ A + B + C
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1      3482     3188.9                        
## 2      3481     3179.9  1   8.9915 0.002712 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = glm(wing_morph_b ~ sex_b + pophost_b + month_of_year, data=raw_data, family="binomial")
tidy_regression(m, is_color=FALSE) # m7
## glm wing_morph_b ~ sex_b + pophost_b + month_of_year binomial raw_data 
## AIC:  3187.895 
## (Intercept)      coeff:  1.1984718   Pr(>|t|):  7.734972e-36 *
## sex_b            coeff:  -0.1122176  Pr(>|t|):  0.01134836 *
## pophost_b        coeff:  0.8607509   Pr(>|t|):  1.162125e-69 *
## month_of_year    coeff:  0.0389051   Pr(>|t|):  0.00284751 *
summary(m)
## 
## Call:
## glm(formula = wing_morph_b ~ sex_b + pophost_b + month_of_year, 
##     family = "binomial", data = raw_data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3270   0.3716   0.4315   0.8476   1.0516  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.19847    0.09590  12.497  < 2e-16 ***
## sex_b         -0.11222    0.04432  -2.532  0.01135 *  
## pophost_b      0.86075    0.04879  17.642  < 2e-16 ***
## month_of_year  0.03891    0.01304   2.984  0.00285 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 3588.0  on 3484  degrees of freedom
## Residual deviance: 3179.9  on 3481  degrees of freedom
##   (47 observations deleted due to missingness)
## AIC: 3187.9
## 
## Number of Fisher Scoring iterations: 5

Top model is m7 * Males are more likely to be long-winged. * K.elegans more likely to be long-winged. * (+) effect later in the season such that long wing morphs become increasingly likely later in the year.

temp = raw_data %>% 
  filter(!is.na(wing_morph_b))

check_spatial_dependencies(m, temp, temp$long, temp$lat, zone = 16, cutoff=14000, is_glm=TRUE)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in Proj4
## definition
## Warning in proj4string(res): CRS object has comment, which is lost in output

A variogram is used to display the variability between data points as a function of distance. Here, from distances 0 to 4000 m, the variability between points starts high and then decreases (the points become more similar) until it levels out because at some point the distance between data points will be great enough that the points are no longer considered to be related to each other. The variability will flatten out into a “sill”.

No spatial dependencies for wing morph.

Best fit: Modeling all important covariates

model_script = paste0(source_path,"generic models-gaussian glm 4-FF.R")
model_comparisonsAIC(model_script)
##        [,1]      [,2]      [,3]      [,4]     
## AICs   3199.444  3200.118  3200.972  3201.607 
## models 110       98        112       107      
## probs  0.3416081 0.2438092 0.1591071 0.1158448
## 
## m110     glm(formula = R ~ A * B + A * D + B * C + B * D + C * D, family = gaussian, 
##     data = data)
## m98  glm(formula = R ~ A * B + A * D + B * C + C * D, family = gaussian, 
##     data = data)
## m112     glm(formula = R ~ A * B + A * C + A * D + B * C + B * D + C * 
##     D, family = gaussian, data = data)
## m107     glm(formula = R ~ A * B + A * C + A * D + B * C + C * D, family = gaussian, 
##     data = data)
anova(m98, m110, test="Chisq") # adding B*D does not improve fit
anova(m84, m98, test="Chisq") # adding A*B improves fit

anova(m63, m84, test="Chisq") # Adding C*D improves fit
anova(m51, m63, test="Chisq") # Adding B improves fit
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B + A * D + B * C + C * D
## Model 2: R ~ A * B + A * D + B * C + B * D + C * D
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1      3476     508.19                     
## 2      3475     507.80  1  0.38986   0.1024
## Analysis of Deviance Table
## 
## Model 1: R ~ A * D + B * C + C * D
## Model 2: R ~ A * B + A * D + B * C + C * D
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1      3477     509.64                        
## 2      3476     508.19  1   1.4483 0.001647 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: R ~ A * D + C * D + B
## Model 2: R ~ A * D + B * C + C * D
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      3478     511.66                          
## 2      3477     509.64  1   2.0211 0.0002046 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Analysis of Deviance Table
## 
## Model 1: R ~ A * D + C * D
## Model 2: R ~ A * D + C * D + B
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      3479     569.30                          
## 2      3478     511.66  1   57.636 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = glm(wing_morph_b ~ sex_b * pophost_b + sex_b * months_since_start + pophost_b * month_of_year + month_of_year * months_since_start, data=raw_data, family="binomial") # m98 top model
tidy_regression(m, is_color=FALSE)
## glm wing_morph_b ~ sex_b * pophost_b + sex_b * months_since_start + pophost_b * month_of_year + month_of_year * months_since_start binomial raw_data 
## AIC:  3173.032 
## (Intercept)                          coeff:  0.7420895   Pr(>|t|):  5.207138e-05 *
## sex_b                                coeff:  -0.2706994  Pr(>|t|):  0.002618832 *
## pophost_b                            coeff:  1.1107917   Pr(>|t|):  9.999362e-23 *
## months_since_start                   coeff:  0.0106835   Pr(>|t|):  0.0002869606 *
## month_of_year                        coeff:  0.1004066   Pr(>|t|):  8.030013e-05 *
## sex_b:pophost_b                      coeff:  0.09893 Pr(>|t|):  0.04409693 *
## sex_b:months_since_start             coeff:  0.003887    Pr(>|t|):  0.01091935 *
## pophost_b:month_of_year              coeff:  -0.0371429  Pr(>|t|):  0.01294977 *
## months_since_start:month_of_year     coeff:  -0.0014746  Pr(>|t|):  0.001144355 *
  • Males are more likely to be long-winged.

  • K.elegans more likely to be long-winged.

  • (+) effect of getting later in the season (month_of_year), such that long wing morphs become increasingly likely later in the year.

  • (+) effect of months_since_start, such that long wing morphs become increasingly likely across the decade.

  • (+) effect of sex*host where if F and from K.elegans then more likely to be long-winged

  • (+) effect of sex*months_since_start where if F and later in the decade then more likely to be long-winged

  • (-) effect of pophost*month_of_year where if from C.corindum and later in the season, then more likely long wing morphs will be present

  • weak (-) effect of month_of_year*months_since_start where the later in the year and later in the decade, the more likely short wing morphs will be present.

# m = glm(wing_morph_b ~ sex_b*months_since_start + month_of_year*months_since_start + pophost_b*month_of_year, data=raw_data, family="binomial") # m84 was the top model
temp = raw_data %>%
  filter(!is.na(wing_morph_b)) # filter out NA's that glm did automatically

check_spatial_dependencies(m, temp, temp$long, temp$lat, zone = 16, cutoff=14000, is_glm=TRUE)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in Proj4
## definition
## Warning in proj4string(res): CRS object has comment, which is lost in output

Model validation looks fine. Nothing to worry about.

Let’s see what impacts the long-wing morph variance:

SE = function(x){sd(x)/sqrt(length(x))}
wmorph_summaryt<-aggregate(wing_morph_b~sex_b*pophost_b*month_of_year*months_since_start, data=raw_data, FUN=mean)
wmorph_summaryt$sd<-aggregate(wing_morph_b~sex_b*pophost_b*month_of_year*months_since_start, data=raw_data,
                          FUN=sd)$wing_morph_b
wmorph_summaryt$se<-aggregate(wing_morph_b~sex_b*pophost_b*month_of_year*months_since_start, data=raw_data,
                          FUN=SE)$wing_morph_b
data = wmorph_summaryt
data<-data.frame(R=data$sd, 
                 A=data$sex_b, 
                 B=data$pophost_b, 
                 C=(data$month_of_year),
                 D=data$months_since_start)

model_script = paste0(source_path,"generic models-gaussian glm 3-FF.R")
model_comparisonsAIC(model_script)
##        [,1]      [,2]      [,3]      [,4]       [,5]       [,6]      
## AICs   -91.69472 -90.25875 -89.97386 -88.65427  -88.54186  -88.5364  
## models 2         4         6         10         7          8         
## probs  0.3392973 0.1654864 0.1435158 0.07419158 0.07013674 0.06994547
## 
## m2   glm(formula = R ~ B, family = gaussian, data = data)
## m4   glm(formula = R ~ A + B, family = gaussian, data = data)
## m6   glm(formula = R ~ B + C, family = gaussian, data = data)
## m10  glm(formula = R ~ B * C, family = gaussian, data = data)
## m7   glm(formula = R ~ A + B + C, family = gaussian, data = data)
## m8   glm(formula = R ~ A * B, family = gaussian, data = data)
anova(m2, m6, test="Chisq") # Adding C does not improve fit
## Analysis of Deviance Table
## 
## Model 1: R ~ B
## Model 2: R ~ B + C
##   Resid. Df Resid. Dev Df  Deviance Pr(>Chi)
## 1        38    0.20365                      
## 2        37    0.20223  1 0.0014162   0.6107
anova(m2, m4, test="Chisq") # Adding B does not improve fit
## Analysis of Deviance Table
## 
## Model 1: R ~ B
## Model 2: R ~ A + B
##   Resid. Df Resid. Dev Df  Deviance Pr(>Chi)
## 1        38    0.20365                      
## 2        37    0.20080  1 0.0028514   0.4685
anova(m0, m2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: R ~ 1
## Model 2: R ~ B
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1        39    0.61828                          
## 2        38    0.20365  1  0.41464 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = glm(sd ~ pophost_b, data=wmorph_summaryt, family="gaussian")
tidy_regression(m, is_color=FALSE) # -1 = C.corindum
## glm sd ~ pophost_b gaussian wmorph_summaryt 
## AIC:  -91.69472 
## (Intercept)  coeff:  0.3531599   Pr(>|t|):  2.477074e-28 *
## pophost_b    coeff:  -0.101813   Pr(>|t|):  1.063916e-10 *

only a (-) effect of pophost, where if from K.elegans then have less variation vs. if from C.cordinum have more variation.

plot(m$fitted.values, m$residuals)

Wing2Body

Removing Torn Wings

#data_long = remove_torn_wings(data_long)

Winter 2020 vs. other dates

data_long$compare_dates <- -1
data_long$compare_dates[data_long$months_since_start==81] <- 1

compare_dates_model <- glm(wing2body~compare_dates + pophost_b*sex_b, data=data_long)

tidy_regression(compare_dates_model, is_color=FALSE)
## glm wing2body ~ compare_dates + pophost_b * sex_b data_long 
## AIC:  -10712.03 
## (Intercept)      coeff:  0.726583    Pr(>|t|):  0 *
## compare_dates    coeff:  -0.0040607  Pr(>|t|):  2.930608e-15 *
## pophost_b        coeff:  0.0042202   Pr(>|t|):  5.753746e-22 *
## sex_b            coeff:  -0.0018677  Pr(>|t|):  1.213464e-05 *
## pophost_b:sex_b  coeff:  0.0014735   Pr(>|t|):  0.0005211045 *

Yes, bugs from this collection date have detectably smaller wing2body ratios on average than other collection dates, even when controlling for sex and host plant.

Binomial Regressions

What effects wing2body ratio? sex? host plant? month of the year? months since start?

data_long$wing2body_c = (data_long$wing2body-mean(data_long$wing2body, na.rm=TRUE))
data_long$month_of_year_c = (data_long$month_of_year-mean(data_long$month_of_year, na.rm=TRUE))
data_long$months_since_start_c = (data_long$months_since_start-mean(data_long$months_since_start, na.rm=TRUE))
data<-data.frame(R=data_long$wing2body_c, # centered
                 A=data_long$sex_b, # sex
                 B=data_long$pophost_b, # host
                 C=data_long$month_of_year_c, 
                 D=data_long$months_since_start_c) 

model_script = paste0(source_path,"generic models-gaussian glm 3-FF.R")
model_comparisonsAIC(model_script)
##        [,1]      [,2]      [,3]      [,4]     
## AICs   -10694.74 -10694.11 -10692.75 -10692.12
## models 11        14        15        17       
## probs  0.4217301 0.3074314 0.1558672 0.1138381
## 
## m11  glm(formula = R ~ A * B + C, family = gaussian, data = data)
## m14  glm(formula = R ~ A * B + A * C, family = gaussian, data = data)
## m15  glm(formula = R ~ A * B + B * C, family = gaussian, data = data)
## m17  glm(formula = R ~ A * B + A * C + B * C, family = gaussian, data = data)
anova(m15, m17, test="Chisq") # Adding A*C does not improve fit
anova(m11, m15, test="Chisq") # Adding B*C does not improve fit
anova(m11, m14, test="Chisq") # Adding A*C does not improve fit
anova(m8, m11, test="Chisq") # Adding C does improve fit
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B + B * C
## Model 2: R ~ A * B + A * C + B * C
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1      2090    0.74210                       
## 2      2089    0.74161  1 0.00048544   0.2423
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B + C
## Model 2: R ~ A * B + B * C
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1      2091     0.7421                       
## 2      2090     0.7421  1 3.2849e-06   0.9234
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B + C
## Model 2: R ~ A * B + A * C
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1      2091    0.74210                       
## 2      2090    0.74162  1 0.00048411   0.2428
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B
## Model 2: R ~ A * B + C
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      2092    0.75827                          
## 2      2091    0.74210  1 0.016172 1.474e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = glm(wing2body_c ~ sex_b*pophost_b + month_of_year_c, data=data_long, family=gaussian) 
tidy_regression(m, is_color=FALSE) # m11
## glm wing2body_c ~ sex_b * pophost_b + month_of_year_c gaussian data_long 
## AIC:  -10694.74 
## (Intercept)      coeff:  -0.001235   Pr(>|t|):  0.003762096 *
## sex_b            coeff:  -0.0016493  Pr(>|t|):  0.000111303 *
## pophost_b        coeff:  0.0045941   Pr(>|t|):  3.84725e-26 *
## month_of_year_c  coeff:  0.0008385   Pr(>|t|):  1.903239e-11 *
## sex_b:pophost_b  coeff:  0.001668    Pr(>|t|):  9.33909e-05 *
summary(m)
## 
## Call:
## glm(formula = wing2body_c ~ sex_b * pophost_b + month_of_year_c, 
##     family = gaussian, data = data_long)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## -0.069822  -0.011139  -0.000314   0.011127   0.115009  
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -0.0012350  0.0004258  -2.901 0.003762 ** 
## sex_b           -0.0016493  0.0004260  -3.872 0.000111 ***
## pophost_b        0.0045941  0.0004285  10.720  < 2e-16 ***
## month_of_year_c  0.0008385  0.0001242   6.750 1.90e-11 ***
## sex_b:pophost_b  0.0016680  0.0004261   3.915 9.34e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.0003549032)
## 
##     Null deviance: 0.80932  on 2095  degrees of freedom
## Residual deviance: 0.74210  on 2091  degrees of freedom
## AIC: -10695
## 
## Number of Fisher Scoring iterations: 2
nrow(data_long)
## [1] 2096
  • Males have larger wing2body ratios.
  • K.elegans bugs have larger wing2body ratios.
  • Females on K.elegans have longer wings than C.corindum females.
  • wing2body ratios are increasing moderately towards the end of the year
temp = data_long %>% 
  filter(!is.na(wing2body_c))
check_spatial_dependencies(m, temp, temp$long, temp$lat, zone = 16, cutoff=14000, is_glm=TRUE)
## Warning in showSRID(uprojargs, format = "PROJ", multiline = "NO", prefer_proj
## = prefer_proj): Discarded datum Unknown based on WGS84 ellipsoid in Proj4
## definition
## Warning in proj4string(res): CRS object has comment, which is lost in output

There might be some spatial dependencies for long wing freq because seeing clusters of red and clusters of black.

Also the variogram shows distances 0 to 10,000 m where the variability between points starts high and then decreases (the points become more similar) until it levels out because at some point the distance between data points will be great enough that the points are no longer considered to be related to each other. The variability will flatten out into a “sill”.

Strange that it’s decreases over distance. But if you do 20,000 m then see it begins to increase again.

model_script = paste0(source_path,"generic models-gaussian glm 4-FF.R")
model_comparisonsAIC(model_script)
##        [,1]      [,2]       [,3]       [,4]       [,5]      
## AICs   -10698.64 -10697.49  -10697.26  -10697.14  -10696.5  
## models 88        99         92         97         58        
## probs  0.1720245 0.09639838 0.08606771 0.08118068 0.05886596
## 
## m88  glm(formula = R ~ A * B + A * D + B * D + C, family = gaussian, 
##     data = data)
## m99  glm(formula = R ~ A * B + A * D + B * D + C * D, family = gaussian, 
##     data = data)
## m92  glm(formula = R ~ A * B + A * C + A * D + B * D, family = gaussian, 
##     data = data)
## m97  glm(formula = R ~ A * B + A * D + B * C + B * D, family = gaussian, 
##     data = data)
## m58  glm(formula = R ~ A * B + B * D + C, family = gaussian, data = data)
anova(m22, m42, test="Chisq") # adding B*C does not improve fit
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B + C
## Model 2: R ~ A * B + B * C
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1      2091     0.7421                       
## 2      2090     0.7421  1 3.2849e-06   0.9234
anova(m16, m22, test="Chisq") # adding C does improve fit # same top model as before
## Analysis of Deviance Table
## 
## Model 1: R ~ A * B
## Model 2: R ~ A * B + C
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1      2092    0.75827                          
## 2      2091    0.74210  1 0.016172 1.474e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Same best fit model (no months_since_start effect). But can plot the positive effect of month of year on wing2body ratio. It looks cyclical by pophost and more linear by sex:

wing2body_summary<-aggregate(wing2body~pophost*month_of_year, data=data_long, FUN=mean)
plot(wing2body~month_of_year, data=wing2body_summary, pch=19, col=c(1,2)[as.factor(pophost)])

wing2body_summary<-aggregate(wing2body~sex_b*month_of_year, data=data_long, FUN=mean) # black is M; Red is F
plot(wing2body~month_of_year, data=wing2body_summary, pch=19, col=c(1,2)[as.factor(sex_b)]) # black is C. ; Red is K.

These plots are plotted cleanly in the wing_plots.Rmd script.

Let’s plot histograms and analyze the variance. Then model the variance.

SE = function(x){sd(x)/sqrt(length(x))}
w2b_summaryt<-aggregate(wing2body~sex_b*pophost_b*month_of_year*months_since_start, data=data_long, FUN=mean)
w2b_summaryt$sd<-aggregate(wing2body~sex_b*pophost_b*month_of_year*months_since_start, data=data_long,
                          FUN=sd)$wing2body
w2b_summaryt$se<-aggregate(wing2body~sex_b*pophost_b*month_of_year*months_since_start, data=data_long,
                          FUN=SE)$wing2body
data = w2b_summaryt
data<-data.frame(R=data$sd, 
                 A=data$sex_b, 
                 B=data$pophost_b, 
                 C=(data$month_of_year),
                 D=data$months_since_start)

model_script = paste0(source_path,"generic models-gaussian glm 3-FF.R")
model_comparisonsAIC(model_script)
##        [,1]      [,2]      [,3]      [,4]      [,5]       [,6]      
## AICs   -282.1703 -282.1217 -281.8742 -280.1703 -280.1217  -279.8742 
## models 6         10        2         7         13         4         
## probs  0.1912932 0.1866967 0.1649671 0.0703737 0.06868278 0.06068871
## 
## m6   glm(formula = R ~ B + C, family = gaussian, data = data)
## m10  glm(formula = R ~ B * C, family = gaussian, data = data)
## m2   glm(formula = R ~ B, family = gaussian, data = data)
## m7   glm(formula = R ~ A + B + C, family = gaussian, data = data)
## m13  glm(formula = R ~ B * C + A, family = gaussian, data = data)
## m4   glm(formula = R ~ A + B, family = gaussian, data = data)
anova(m2, m6, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: R ~ B
## Model 2: R ~ B + C
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)
## 1        34 0.00070956                       
## 2        33 0.00066572  1 4.3844e-05   0.1404
anova(m0, m2, test="Chisq") # Adding B marginally improves fit
## Analysis of Deviance Table
## 
## Model 1: R ~ 1
## Model 2: R ~ B
##   Resid. Df Resid. Dev Df   Deviance Pr(>Chi)  
## 1        35 0.00081657                         
## 2        34 0.00070956  1 0.00010701  0.02355 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m = glm(sd ~ pophost_b, data=w2b_summaryt, family=gaussian) 
tidy_regression(m, is_color=FALSE)
## glm sd ~ pophost_b gaussian w2b_summaryt 
## AIC:  -281.8742 
## (Intercept)  coeff:  0.0167934   Pr(>|t|):  1.007012e-21 *
## pophost_b    coeff:  0.0017241   Pr(>|t|):  0.03004314 *

Nothing seems to effect the w2b variance.

Plot the histograms between the sexes and conduct var tests.

time_var_tests = function(d, print_test=FALSE) {
  
  months = sort(unique(d$month_of_year))
  month_labs <- c("Feb", "Apr", "May", "Aug", "Sept", "Oct", "Dec")

  table = matrix(nrow=length(months), ncol=9)
  i = 0
  for (m in months) {
    i = i + 1
    data = d[d$month_of_year==m, ]
  
    VAR = var.test(wing2body ~ sex, data = data) # F test to compare the variances of two samples from normal pops
    TTEST= t.test(wing2body ~ sex, data=data) # t.test to find significant difference between the means of two groups
    AOV = aov(wing2body ~ sex, data=data) # used to analyze the differences among means. 
    
    p = TukeyHSD(AOV)$sex[,"p adj"]
    diff = TukeyHSD(AOV)$sex[,"diff"]
    
    if (print_test) {
      print(month_labs[i])
      print("t.test")
      print(TTEST)
      print("anova")
      print(summary(AOV))
      print(TukeyHSD(AOV))
      cat("-------------------------------------------------")
    }
    
    # plot histograms
    h = data %>%
    ggplot( aes(x=wing2body, fill=sex)) +
      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
      scale_fill_manual(values=c("#69b3a2", "#404080")) +
      labs(fill="")  + labs(title=month_labs[i])
    print(h)
    
    # populate matrix
    table[i,1] = month_labs[i]
    table[i,2] = "M-F"
    table[i,3] = round(diff,4)
    table[i,4] = round(p,5)
    table[i,5] = "M=-1 | F=1"
    table[i,6] = round(TTEST$statistic,4)
    table[i,7] = round(TTEST$p.value,5)
    table[i,8] = round(VAR$statistic,2)
    table[i,9] = round(VAR$p.value, 3)
  }
  colnames(table) = c("month","sex", "Tukey-diff", "p-val", "sex", "ttest", "p-val", "F-test", "p-val")
  return(table)
} 
summary_table = time_var_tests(data_long)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary_table
##      month  sex   Tukey-diff p-val     sex          ttest     p-val     F-test
## [1,] "Feb"  "M-F" "0.0069"   "1e-05"   "M=-1 | F=1" "-4.513"  "1e-05"   "1.01"
## [2,] "Apr"  "M-F" "0.0057"   "0.03909" "M=-1 | F=1" "-2.0508" "0.04159" "1.24"
## [3,] "May"  "M-F" "-0.0056"  "0.04002" "M=-1 | F=1" "2.1789"  "0.02992" "1.69"
## [4,] "Aug"  "M-F" "0.0029"   "0.07841" "M=-1 | F=1" "-1.7611" "0.0792"  "1.05"
## [5,] "Sept" "M-F" "0.0024"   "0.35468" "M=-1 | F=1" "-0.9363" "0.35087" "1.23"
## [6,] "Oct"  "M-F" "0.0047"   "0.0083"  "M=-1 | F=1" "-2.6536" "0.00836" "1.04"
## [7,] "Dec"  "M-F" "0.005"    "0.0342"  "M=-1 | F=1" "-2.1903" "0.02964" "0.7" 
##      p-val  
## [1,] "0.952"
## [2,] "0.261"
## [3,] "0"    
## [4,] "0.772"
## [5,] "0.408"
## [6,] "0.8"  
## [7,] "0.082"
y = summary_table[,"F-test"]
xlab = summary_table[, "month"]
x = sort(unique(d$month_of_year))

plot(x,y, ylab="F-test value", xlab="Month of Year", xaxt = "n", main="wing-to-body ~ sex")
axis(1, at=seq(min(x),max(x),2), labels=xlab[-5])
abline(h=1, col=2) # non-linear relationship where at the beginning of the year the variance between sexes is similar but then in spring it deviates a lot. In the summer the variance is similar and then towards the winter the variance deviates.

time_var_tests = function(d, print_test=FALSE) {
  
  months = sort(unique(d$month_of_year))
  month_labs <- c("Feb", "Apr", "May", "Aug", "Sept", "Oct", "Dec")

  table = matrix(nrow=length(months), ncol=9)
  i = 0
  for (m in months) {
    i = i + 1
    data = d[d$month_of_year==m, ]
  
    VAR = var.test(wing2body ~ pophost, data = data) # F test to compare the variances of two samples from normal pops
    TTEST= t.test(wing2body ~ pophost, data=data) # t.test to find significant difference between the means of two groups
    AOV = aov(wing2body ~ pophost, data=data) # used to analyze the differences among means. 
    
    p = TukeyHSD(AOV)$pophost[,"p adj"]
    diff = TukeyHSD(AOV)$pophost[,"diff"]
    
    if (print_test) {
      print(month_labs[i])
      print("t.test")
      print(TTEST)
      print("anova")
      print(summary(AOV))
      print(TukeyHSD(AOV))
      cat("-------------------------------------------------")
    }
    
    # plot histograms
    h = data %>%
    ggplot( aes(x=wing2body, fill=pophost)) +
      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
      scale_fill_manual(values=c("#69b3a2", "#404080")) +
      labs(fill="")  + labs(title=month_labs[i])
    print(h)
    
    # populate matrix
    table[i,1] = month_labs[i]
    table[i,2] = "GRT-BV"
    table[i,3] = round(diff,4)
    table[i,4] = round(p,5)
    table[i,5] = "GRT=1 | BV=-1"
    table[i,6] = round(TTEST$statistic,4)
    table[i,7] = round(TTEST$p.value,5)
    table[i,8] = round(VAR$statistic,2)
    table[i,9] = round(VAR$p.value, 3)
  }
  colnames(table) = c("month","sex", "Tukey-diff", "p-val", "sex", "ttest", "p-val", "F-test", "p-val")
  
  return(table)
} 
summary_table = time_var_tests(data_long)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary_table
##      month  sex      Tukey-diff p-val     sex             ttest     p-val    
## [1,] "Feb"  "GRT-BV" "0.0077"   "0"       "GRT=1 | BV=-1" "-5.3374" "0"      
## [2,] "Apr"  "GRT-BV" "0.003"    "0.36637" "GRT=1 | BV=-1" "-0.9536" "0.34306"
## [3,] "May"  "GRT-BV" "0.0108"   "6e-05"   "GRT=1 | BV=-1" "-4.5295" "1e-05"  
## [4,] "Aug"  "GRT-BV" "0.0065"   "0.0013"  "GRT=1 | BV=-1" "-3.372"  "0.00104"
## [5,] "Sept" "GRT-BV" "0.0082"   "0.0047"  "GRT=1 | BV=-1" "-2.6132" "0.01198"
## [6,] "Oct"  "GRT-BV" "0.0087"   "0"       "GRT=1 | BV=-1" "-5.0098" "0"      
## [7,] "Dec"  "GRT-BV" "0.0107"   "1e-05"   "GRT=1 | BV=-1" "-4.7534" "0"      
##      F-test p-val  
## [1,] "1.12" "0.407"
## [2,] "0.83" "0.457"
## [3,] "0.22" "0"    
## [4,] "0.88" "0.54" 
## [5,] "1.47" "0.156"
## [6,] "1.12" "0.456"
## [7,] "0.8"  "0.298"
y = summary_table[,"F-test"]
xlab = summary_table[, "month"]
x = sort(unique(d$month_of_year))

plot(x,y, ylab="F-test value", xlab="Month of Year", xaxt = "n", main="wing-to-body ~ pophost")
axis(1, at=seq(min(x),max(x),2), labels=xlab[-5])
abline(h=1, col=2) # non-linear relationship where at the beginning of the year the variance between hostplants is similar but then in spring it deviates a lot. In the summer and winter the variance is similar. (GRT/BV var = F-test value where in Sept and Feb only months where GRT is higher and the rest is BV)

temp = data_long %>%
  filter(!is.na(months_since_start), !is.na(sex), !is.na(wing2body))
time_var_tests = function(d, print_test=FALSE) {
  
  months = unique(d$months_since_start)
  #month_labs <- c("Feb", "Apr", "May", "Aug", "Sept", "Oct", "Dec")

  table = matrix(nrow=length(months), ncol=9)
  i = 0
  for (m in months) {
    i = i + 1
    data = d[d$months_since_start==m, ]
  
    VAR = var.test(wing2body ~ sex, data = data) # F test to compare the variances of two samples from normal pops
    TTEST= t.test(wing2body ~ sex, data=data) # t.test to find significant difference between the means of two groups
    AOV = aov(wing2body ~ sex, data=data) # used to analyze the differences among means. 
    
    p = TukeyHSD(AOV)$sex[,"p adj"]
    diff = TukeyHSD(AOV)$sex[,"diff"]
    
    if (print_test) {
      print(m)
      print("t.test")
      print(TTEST)
      print("anova")
      print(summary(AOV))
      print(TukeyHSD(AOV))
      cat("-------------------------------------------------")
    }
    
    # plot histograms
    h = data %>%
    ggplot( aes(x=wing2body, fill=sex)) +
      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
      scale_fill_manual(values=c("#69b3a2", "#404080")) +
      labs(fill="")  + labs(title=m)
    print(h)
    
    # populate matrix
    table[i,1] = m
    table[i,2] = "M-F"
    table[i,3] = round(diff,4)
    table[i,4] = round(p,5)
    table[i,5] = "M=-1 | F=1"
    table[i,6] = round(TTEST$statistic,4)
    table[i,7] = round(TTEST$p.value,5)
    table[i,8] = round(VAR$statistic,2)
    table[i,9] = round(VAR$p.value, 3)
  }
  colnames(table) = c("month","sex", "Tukey-diff", "p-val", "sex", "ttest", "p-val", "F-test", "p-val")
  
  return(table)
} 

summary_table = time_var_tests(temp)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary_table
##       month sex   Tukey-diff p-val     sex          ttest     p-val     F-test
##  [1,] "72"  "M-F" "-0.0056"  "0.04002" "M=-1 | F=1" "2.1789"  "0.02992" "1.69"
##  [2,] "51"  "M-F" "0.0029"   "0.07841" "M=-1 | F=1" "-1.7611" "0.0792"  "1.05"
##  [3,] "64"  "M-F" "0.0024"   "0.35468" "M=-1 | F=1" "-0.9363" "0.35087" "1.23"
##  [4,] "43"  "M-F" "0.005"    "0.0342"  "M=-1 | F=1" "-2.1903" "0.02964" "0.7" 
##  [5,] "81"  "M-F" "0.0069"   "1e-05"   "M=-1 | F=1" "-4.513"  "1e-05"   "1.01"
##  [6,] "77"  "M-F" "0.0047"   "0.0083"  "M=-1 | F=1" "-2.6536" "0.00836" "1.04"
##  [7,] "23"  "M-F" "0.0043"   "0.32845" "M=-1 | F=1" "-0.9809" "0.32901" "1.25"
##  [8,] "0"   "M-F" "-0.0011"  "0.84031" "M=-1 | F=1" "0.1929"  "0.84844" "1.22"
##  [9,] "11"  "M-F" "0.0137"   "0.00601" "M=-1 | F=1" "-2.9604" "0.00471" "1.5" 
##       p-val  
##  [1,] "0"    
##  [2,] "0.772"
##  [3,] "0.408"
##  [4,] "0.082"
##  [5,] "0.952"
##  [6,] "0.8"  
##  [7,] "0.429"
##  [8,] "0.57" 
##  [9,] "0.343"
y = summary_table[,"F-test"]
xlab = summary_table[, "month"]
x = sort(unique(temp$months_since_start))

plot(x,y, ylab="F-test value", xlab="Months Since Start", main="wing-to-body ~ pophost")
#axis(1, at=seq(min(x),max(x),2), labels=xlab[-5])
abline(h=1, col=2)

temp = data_long %>%
  filter(!is.na(months_since_start), !is.na(pophost), !is.na(wing2body))
time_var_tests = function(d, print_test=FALSE) {
  
  months = unique(d$months_since_start)
  #month_labs <- c("Feb", "Apr", "May", "Aug", "Sept", "Oct", "Dec")

  table = matrix(nrow=length(months), ncol=9)
  i = 0
  for (m in months) {
    i = i + 1
    data = d[d$months_since_start==m, ]
  
    VAR = var.test(wing2body ~ pophost, data = data) # F test to compare the variances of two samples from normal pops
    TTEST= t.test(wing2body ~ pophost, data=data) # t.test to find significant difference between the means of two groups
    AOV = aov(wing2body ~ pophost, data=data) # used to analyze the differences among means. 
    
    p = TukeyHSD(AOV)$pophost[,"p adj"]
    diff = TukeyHSD(AOV)$pophost[,"diff"]
    
    if (print_test) {
      print(m)
      print("t.test")
      print(TTEST)
      print("anova")
      print(summary(AOV))
      print(TukeyHSD(AOV))
      cat("-------------------------------------------------")
    }
    
    # plot histograms
    h = data %>%
    ggplot( aes(x=wing2body, fill=pophost)) +
      geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') +
      scale_fill_manual(values=c("#69b3a2", "#404080")) +
      labs(fill="")  + labs(title=m)
    print(h)
    
    # populate matrix
    table[i,1] = m
    table[i,2] = "GRT-BV"
    table[i,3] = round(diff,4)
    table[i,4] = round(p,5)
    table[i,5] = "GRT=1 | BV=-1"
    table[i,6] = round(TTEST$statistic,4)
    table[i,7] = round(TTEST$p.value,5)
    table[i,8] = round(VAR$statistic,2)
    table[i,9] = round(VAR$p.value, 3)
  }
  colnames(table) = c("month","sex", "Tukey-diff", "p-val", "sex", "ttest", "p-val", "F-test", "p-val")
  
  return(table)
} 

summary_table = time_var_tests(temp)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary_table
##       month sex      Tukey-diff p-val     sex             ttest     p-val    
##  [1,] "72"  "GRT-BV" "0.0108"   "6e-05"   "GRT=1 | BV=-1" "-4.5295" "1e-05"  
##  [2,] "51"  "GRT-BV" "0.0065"   "0.0013"  "GRT=1 | BV=-1" "-3.372"  "0.00104"
##  [3,] "64"  "GRT-BV" "0.0082"   "0.0047"  "GRT=1 | BV=-1" "-2.6132" "0.01198"
##  [4,] "43"  "GRT-BV" "0.0107"   "1e-05"   "GRT=1 | BV=-1" "-4.7534" "0"      
##  [5,] "81"  "GRT-BV" "0.0077"   "0"       "GRT=1 | BV=-1" "-5.3374" "0"      
##  [6,] "77"  "GRT-BV" "0.0087"   "0"       "GRT=1 | BV=-1" "-5.0098" "0"      
##  [7,] "23"  "GRT-BV" "0.0131"   "0.02364" "GRT=1 | BV=-1" "-2.447"  "0.02201"
##  [8,] "0"   "GRT-BV" "-0.0025"  "0.62359" "GRT=1 | BV=-1" "0.5289"  "0.59996"
##  [9,] "11"  "GRT-BV" "0"        "0.9978"  "GRT=1 | BV=-1" "-0.0035" "0.99727"
##       F-test p-val  
##  [1,] "0.22" "0"    
##  [2,] "0.88" "0.54" 
##  [3,] "1.47" "0.156"
##  [4,] "0.8"  "0.298"
##  [5,] "1.12" "0.407"
##  [6,] "1.12" "0.456"
##  [7,] "0.83" "0.702"
##  [8,] "0.73" "0.471"
##  [9,] "0.4"  "0.089"
y = summary_table[,"F-test"]
xlab = summary_table[, "month"]
x = sort(unique(temp$months_since_start))

plot(x,y, ylab="F-test value", xlab="Months Since Start", main="wing-to-body ~ pophost")
#axis(1, at=seq(min(x),max(x),2), labels=xlab[-5])
abline(h=1, col=2)