Start by imputing pollen data and creating a new data frame with average pollen consumption on a per-colony basis
### Figure out average pollen consumption by treatment
pollen <- read_csv("pollen1.csv", col_types = cols(round = col_factor(levels = c("1",
"2")), treatment = col_factor(levels = c("1",
"2", "3", "4", "5")), replicate = col_factor(levels = c("1",
"2", "3", "4", "5", "6", "7", "9", "11",
"12")), start_date = col_date(format = "%m/%d/%Y"),
start_time = col_character(), end_date = col_date(format = "%m/%d/%Y"),
end_time = col_character()))
pollen$colony <- as.factor(pollen$colony)
pollen$pid <- as.factor(pollen$pid)
pollen$count <- as.factor(pollen$count)
pollen <- na.omit(pollen)
range(pollen$difference)
## [1] -0.98780 1.56542
# get rid of negative numbers
pollen$difference[pollen$difference < 0] <- NA
pollen <- na.omit(pollen)
range(pollen$difference)
## [1] 0.002715 1.565420
pollen$whole_dif <- as.numeric(pollen$difference)
wd.1 <- lm(difference ~ treatment, data = pollen)
summary(wd.1)
##
## Call:
## lm(formula = difference ~ treatment, data = pollen)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.5268 -0.2476 -0.1363 0.1874 1.0576
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.460679 0.021143 21.789 < 2e-16 ***
## treatment2 0.047174 0.030208 1.562 0.118630
## treatment3 0.100480 0.029931 3.357 0.000812 ***
## treatment4 0.053390 0.029931 1.784 0.074703 .
## treatment5 -0.001879 0.030272 -0.062 0.950524
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3356 on 1231 degrees of freedom
## Multiple R-squared: 0.01281, Adjusted R-squared: 0.009604
## F-statistic: 3.994 on 4 and 1231 DF, p-value: 0.003177
wd.emm <- emmeans(wd.1, "treatment")
summary(wd.emm)
## treatment emmean SE df lower.CL upper.CL
## 1 0.461 0.0211 1231 0.419 0.502
## 2 0.508 0.0216 1231 0.466 0.550
## 3 0.561 0.0212 1231 0.520 0.603
## 4 0.514 0.0212 1231 0.473 0.556
## 5 0.459 0.0217 1231 0.416 0.501
##
## Confidence level used: 0.95
wd.summary <- pollen %>%
group_by(colony) %>%
summarize(whole.mean = mean(difference))
as.data.frame(wd.summary) # this is the data frame I will merge with subsequent data frames to contain average pollen consumption per colony as a new column
## colony whole.mean
## 1 1.11R2 0.2829509
## 2 1.12R2 0.1697964
## 3 1.1R1 0.8049667
## 4 1.1R2 0.5213458
## 5 1.2R1 0.4704294
## 6 1.2R2 0.3374200
## 7 1.3R1 0.3910053
## 8 1.3R2 0.4512891
## 9 1.4R2 0.6063016
## 10 1.5R2 0.7079516
## 11 1.7R2 0.7400381
## 12 1.9R2 0.2240081
## 13 2.11R2 0.4178270
## 14 2.12R2 0.4035568
## 15 2.1R1 0.7282895
## 16 2.1R2 0.6101589
## 17 2.2R1 0.4663045
## 18 2.2R2 0.5109234
## 19 2.3R1 0.4052000
## 20 2.3R2 0.3280036
## 21 2.4R2 0.3881394
## 22 2.5R2 0.7194915
## 23 2.7R2 0.5299685
## 24 2.9R2 0.5857152
## 25 3.11R2 0.4216410
## 26 3.12R2 0.3390993
## 27 3.1R1 0.8014682
## 28 3.1R2 0.3711948
## 29 3.2R1 0.8020500
## 30 3.2R2 0.3461010
## 31 3.3R1 0.5873429
## 32 3.3R2 0.8465806
## 33 3.4R2 0.4120433
## 34 3.5R2 0.8906211
## 35 3.7R2 0.6266680
## 36 3.9R2 0.4409511
## 37 4.11R2 0.3456011
## 38 4.12R2 0.2496295
## 39 4.1R1 0.8837867
## 40 4.1R2 0.7074755
## 41 4.2R1 0.3319238
## 42 4.2R2 0.3871742
## 43 4.3R1 0.3944500
## 44 4.3R2 0.5800074
## 45 4.4R2 0.8356247
## 46 4.5R2 0.2335967
## 47 4.7R2 0.6237400
## 48 4.9R2 0.5322950
## 49 5.11R2 0.4113960
## 50 5.12R2 0.2077741
## 51 5.1R1 0.6799737
## 52 5.1R2 0.3782286
## 53 5.2R1 0.7140056
## 54 5.2R2 0.4912468
## 55 5.3R1 0.2857654
## 56 5.3R2 0.2128778
## 57 5.4R2 0.4563045
## 58 5.5R2 0.7095479
## 59 5.7R2 0.6113189
## 60 5.9R2 0.4188290
drones <- read_csv("drones_rf.csv", col_types = cols(round = col_factor(levels = c("1","2")), treatment = col_factor(levels = c("1","2", "3", "4", "5")), notes = col_skip(), colony_start = col_skip(), colony_count = col_skip(), emerge_date = col_skip()))
drones$order_on_sheet <- as.factor(drones$order_on_sheet)
drones$replicate <- as.factor(drones$replicate)
drones$colony <- as.factor(drones$colony)
drones$id <- as.factor(drones$id)
drones$alive <- as.logical(drones$`alive?`)
drones$relative_fat <- as.double(drones$relative_fat)
drones <- na.omit(drones)
drf.pollen <- merge(wd.summary, drones, by.x = "colony") # this is a new data frame with average pollen consumption data per colony
drf.pollen$dwc <- as.character(drf.pollen$dry_weight)
Look at shape of data and check for outliers
ggplot(drf.pollen) +
geom_boxplot(aes(x = treatment, y = dry_weight, fill = treatment)) +
coord_cartesian(ylim=c(0, 0.1)) +
scale_fill_manual(values = c("pink1", "plum", "peachpuff4", "pink3", "peachpuff2"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5"))+
theme_cowplot() +
ggtitle("Drone Dry Weight (g)")
ggplot(drf.pollen) +
geom_histogram(aes(x = dry_weight, fill = treatment),
position = "identity", binwidth = 0.025 ,col=I("black")) +
scale_fill_manual(values = c("pink1", "plum", "peachpuff4", "pink3", "peachpuff2"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5"))+
theme_cowplot() +
ggtitle("Drone Dry Weight (g)")
range(drf.pollen$dry_weight)
## [1] 0.0146 0.4570
which(drf.pollen$dry_weight %in% c(0.457))
## [1] 68
drf.pollen <- drf.pollen[-67,]
range(drf.pollen$dry_weight)
## [1] 0.0146 0.4570
ggplot(drf.pollen) +
geom_histogram(aes(x = dry_weight, fill = treatment),
position = "identity", binwidth = 0.0025 ,col=I("black")) +
scale_fill_manual(values = c("pink1", "plum", "peachpuff4", "pink3", "peachpuff2"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5"))+
theme_cowplot() +
ggtitle("Drone Dry Weight (g)")
### Look at normality
shapiro.test(drf.pollen$dry_weight) #Normal yay!
##
## Shapiro-Wilk normality test
##
## data: drf.pollen$dry_weight
## W = 0.27272, p-value < 2.2e-16
We see in the output below drone dry weight differed as a function of treatment (ANOVA = 0.0566). Treatment 3 differed significantly from treatments 1 (p = 0.0592). Average drone dry weights were: treatment one = 0.0349g, treatment two = 0.0323g, treatment three = 0.0296g, treatment four = 0.0330g, and treatment five = 0.0339g.
dry.mod1 <- lmer(dry_weight~ treatment + whole.mean + alive + emerge_time + workers_alive + round + (1|colony), data = drf.pollen)
dry.mod2 <- lmer(dry_weight~ treatment + whole.mean + alive + emerge_time + round + (1|colony), data = drf.pollen)
dry.mod3 <- lmer(dry_weight~ treatment + whole.mean + emerge_time + workers_alive + round + (1|colony), data = drf.pollen)
dry.mod4 <- lmer(dry_weight~ treatment + whole.mean + round + (1|colony), data = drf.pollen,)
AIC(dry.mod1, dry.mod2, dry.mod3, dry.mod4 )
## df AIC
## dry.mod1 12 -1891.679
## dry.mod2 11 -1904.592
## dry.mod3 11 -1899.283
## dry.mod4 9 -1928.177
anova( dry.mod1, dry.mod2, dry.mod3, dry.mod4 )
## Data: drf.pollen
## Models:
## dry.mod4: dry_weight ~ treatment + whole.mean + round + (1 | colony)
## dry.mod2: dry_weight ~ treatment + whole.mean + alive + emerge_time + round + (1 | colony)
## dry.mod3: dry_weight ~ treatment + whole.mean + emerge_time + workers_alive + round + (1 | colony)
## dry.mod1: dry_weight ~ treatment + whole.mean + alive + emerge_time + workers_alive + round + (1 | colony)
## npar AIC BIC logLik deviance Chisq Df Pr(>Chisq)
## dry.mod4 9 -1996.4 -1960.1 1007.2 -2014.4
## dry.mod2 11 -1994.4 -1950.1 1008.2 -2016.4 2.0097 2 0.3661
## dry.mod3 11 -1993.6 -1949.2 1007.8 -2015.6 0.0000 0
## dry.mod1 12 -1993.2 -1944.8 1008.6 -2017.2 1.5715 1 0.2100
anova(dry.mod4)
## Analysis of Variance Table
## npar Sum Sq Mean Sq F value
## treatment 4 0.0047803 0.00119506 2.5381
## whole.mean 1 0.0010556 0.00105556 2.2418
## round 1 0.0006005 0.00060051 1.2754
anova(dry.mod1)
## Analysis of Variance Table
## npar Sum Sq Mean Sq F value
## treatment 4 0.0045750 0.00114375 2.4311
## whole.mean 1 0.0010250 0.00102502 2.1787
## alive 1 0.0009649 0.00096488 2.0509
## emerge_time 1 0.0001794 0.00017938 0.3813
## workers_alive 1 0.0003780 0.00037800 0.8034
## round 1 0.0002954 0.00029544 0.6280
#stick with dry.mod4, the other covariates don't add anything
plot(resid(dry.mod4)) +
abline(h=0, col="red", lwd=2)
## integer(0)
qqnorm(resid(dry.mod4));qqline(resid(dry.mod4)) # diagnostic plots look excellent
dry.emm <- emmeans(dry.mod4, "treatment")
pairs(dry.emm)
## contrast estimate SE df t.ratio p.value
## treatment1 - treatment2 0.008196 0.00365 31.3 2.247 0.1893
## treatment1 - treatment3 0.011985 0.00390 32.9 3.069 0.0325
## treatment1 - treatment4 0.007733 0.00363 24.0 2.131 0.2400
## treatment1 - treatment5 0.006103 0.00359 27.8 1.699 0.4510
## treatment2 - treatment3 0.003789 0.00381 38.1 0.995 0.8559
## treatment2 - treatment4 -0.000463 0.00356 27.5 -0.130 0.9999
## treatment2 - treatment5 -0.002093 0.00348 30.9 -0.602 0.9738
## treatment3 - treatment4 -0.004252 0.00375 28.1 -1.134 0.7875
## treatment3 - treatment5 -0.005882 0.00378 34.2 -1.555 0.5354
## treatment4 - treatment5 -0.001630 0.00351 24.4 -0.464 0.9899
##
## Results are averaged over the levels of: round
## Degrees-of-freedom method: kenward-roger
## P value adjustment: tukey method for comparing a family of 5 estimates
summary(dry.emm)
## treatment emmean SE df lower.CL upper.CL
## 1 0.0466 0.00316 46.0 0.0402 0.0530
## 2 0.0384 0.00287 51.8 0.0326 0.0442
## 3 0.0346 0.00317 57.8 0.0283 0.0409
## 4 0.0389 0.00309 38.7 0.0326 0.0451
## 5 0.0405 0.00287 44.4 0.0347 0.0463
##
## Results are averaged over the levels of: round
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
plot(dry.emm, comparisons = TRUE)
Make a summary table to make a bar plot
dry.means <- drf.pollen %>%
group_by(treatment) %>%
summarize(dry.mean = mean(dry_weight),
dry.sd = sd(dry_weight),
dry.n = length(dry_weight)) %>%
mutate(dry.se = dry.sd/sqrt(dry.n))
ggplot(data = dry.means, aes(x = treatment, y = dry.mean, fill = treatment)) +
geom_col(col = "black")+
coord_cartesian(ylim=c(0.03,0.046)) +
scale_fill_manual(values = c("gray0", "cadetblue2", "cadetblue3", "cadetblue4", "deepskyblue4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5")) +
theme_cowplot()+
geom_errorbar(aes(ymin = dry.mean - dry.se,
ymax = dry.mean + dry.se),
position = position_dodge(0.9), width = 0.4) +
labs(y = "Drone Dry Weight(g)",) +
ggtitle("Average Drone Dry Weight(g) per Treatment") +
scale_x_discrete(name = "Treatment",
labels = c("0 PPB", "150 PPB", "1,500 PPB", "15,000 PPB", "150,000 PPB")) +
theme_classic(base_size = 12)
Let’s actually try an aov function with a post hoc tukey analysis
aov_drfp <- aov(dry_weight ~ treatment + whole.mean + alive + (1|round/colony), data = drf.pollen)
summary(aov_drfp)
## Df Sum Sq Mean Sq F value Pr(>F)
## treatment 4 0.00594 0.0014847 3.130 0.0149 *
## whole.mean 1 0.00122 0.0012223 2.576 0.1092
## alive 1 0.00098 0.0009840 2.074 0.1506
## Residuals 410 0.19451 0.0004744
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Anova(aov_drfp)
## Anova Table (Type II tests)
##
## Response: dry_weight
## Sum Sq Df F value Pr(>F)
## treatment 0.006307 4 3.3235 0.01073 *
## whole.mean 0.001027 1 2.1649 0.14196
## alive 0.000984 1 2.0742 0.15057
## 1 | round/colony 0
## Residuals 0.194506 410
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AIC(aov_drfp)
## [1] -1999.153
tukey_drfp <- HSD.test(aov_drfp, trt = "treatment",
console = TRUE)
##
## Study: aov_drfp ~ "treatment"
##
## HSD Test for dry_weight
##
## Mean Square Error: 0.0004744048
##
## treatment, means
##
## dry_weight std r Min Max
## 1 0.04885641 0.047613339 78 0.0146 0.4570
## 2 0.03968605 0.008240319 86 0.0219 0.0659
## 3 0.03691304 0.008369727 69 0.0184 0.0597
## 4 0.04129348 0.008729422 92 0.0200 0.0582
## 5 0.04173696 0.007172271 92 0.0233 0.0634
##
## Alpha: 0.05 ; DF Error: 410
## Critical Value of Studentized Range: 3.874884
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Treatments with the same letter are not significantly different.
##
## dry_weight groups
## 1 0.04885641 a
## 5 0.04173696 ab
## 4 0.04129348 ab
## 2 0.03968605 ab
## 3 0.03691304 b
tidy_tukey <- as.data.frame(tukey_drfp$groups)
tidy_tukey
## dry_weight groups
## 1 0.04885641 a
## 5 0.04173696 ab
## 4 0.04129348 ab
## 2 0.03968605 ab
## 3 0.03691304 b
tukey.test <- TukeyHSD(aov_drfp, which = 'treatment')
tukey.test
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = dry_weight ~ treatment + whole.mean + alive + (1 | round/colony), data = drf.pollen)
##
## $treatment
## diff lwr upr p adj
## 2-1 -0.0091703637 -0.018501695 0.0001609678 0.0567497
## 3-1 -0.0119433668 -0.021806276 -0.0020804574 0.0087138
## 4-1 -0.0075629320 -0.016748404 0.0016225398 0.1615889
## 5-1 -0.0071194537 -0.016304926 0.0020660181 0.2119640
## 3-2 -0.0027730030 -0.012418182 0.0068721762 0.9341045
## 4-2 0.0016074317 -0.007343847 0.0105587106 0.9880827
## 5-2 0.0020509100 -0.006900369 0.0110021888 0.9705501
## 4-3 0.0043804348 -0.005123703 0.0138845721 0.7141609
## 5-3 0.0048239130 -0.004680224 0.0143280504 0.6340072
## 5-4 0.0004434783 -0.008355643 0.0092425996 0.9999179
The AIC here is actually lower than in the lmer model, and the groupings of the treatments makes more sense based on the last bar plot I made. Let’s make a new plot and add in the stats.
dry_box <- drf.pollen %>%
ggplot(aes(x = treatment, y = dry_weight, fill = treatment)) +
geom_boxplot(outlier.shape = NA) +
ggdist::geom_dots(side = "both", color = "black", alpha = 0.5) +
scale_fill_manual(values = c("gray0", "cadetblue2", "cadetblue3", "cadetblue4", "deepskyblue4"),name = "Pristine Level", labels = c("Treatment 1 (control)", "Treatment 2", "Treatment 3", "Treatment 4", "Treatment 5")) +
theme_cowplot()+
theme(legend.position = "none") +
labs(x = "Treatment",
y = "Drone Dry Weight (g)",
title = "Average Drone Dry Weights(g) by Treatment",
caption = "Dry weights of drones after being oven-dried at 60C for 72 hours")
dry_max <- drf.pollen %>%
group_by(treatment) %>%
summarize(max_dry = max(dry_weight))
dry_max
## # A tibble: 5 × 2
## treatment max_dry
## <fct> <dbl>
## 1 1 0.457
## 2 2 0.0659
## 3 3 0.0597
## 4 4 0.0582
## 5 5 0.0634
tidier_tukey <- tidy_tukey %>%
rownames_to_column() %>% # converts rownames to columns
rename(treatment = rowname) # renames the column now called rowname to species
# join
dry_for_plotting <- full_join(tidier_tukey, dry_max,
by = "treatment")
dry_box +
stat_compare_means(method = "anova") +
geom_text(data = dry_for_plotting,
aes(x = treatment,
y = 0.01 + max_dry,
label = groups))
anova <- aov(dry_weight ~ treatment, data = drf.pollen)
tukey <- TukeyHSD(anova)
cld <- multcompLetters4(anova, tukey)
cld
## $treatment
## 1 5 4 2 3
## "a" "ab" "ab" "ab" "b"
dt <- drf.pollen %>%
group_by(treatment) %>%
summarize(w = mean(dry_weight),
sd = sd(dry_weight),
n = length(dry_weight),
se = sd/sqrt(n))
as.data.frame(dt)
## treatment w sd n se
## 1 1 0.04885641 0.047613339 78 0.0053911490
## 2 2 0.03968605 0.008240319 86 0.0008885765
## 3 3 0.03691304 0.008369727 69 0.0010075967
## 4 4 0.04129348 0.008729422 92 0.0009101052
## 5 5 0.04173696 0.007172271 92 0.0007477610
dt$n <- as.double(dt$n)
cld <- as.data.frame.list(cld$treatment)
cld
## Letters monospacedLetters LetterMatrix.a LetterMatrix.b
## 1 a a TRUE FALSE
## 5 ab ab TRUE TRUE
## 4 ab ab TRUE TRUE
## 2 ab ab TRUE TRUE
## 3 b b FALSE TRUE
dt$cld <- cld$Letters
dt
## # A tibble: 5 × 6
## treatment w sd n se cld
## <fct> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 1 0.0489 0.0476 78 0.00539 a
## 2 2 0.0397 0.00824 86 0.000889 ab
## 3 3 0.0369 0.00837 69 0.00101 ab
## 4 4 0.0413 0.00873 92 0.000910 ab
## 5 5 0.0417 0.00717 92 0.000748 b
ggplot(data = dt, aes(x = treatment, y = w, fill = treatment)) +
geom_bar(stat = "identity")+
coord_cartesian(ylim=c(0.03,0.046)) +
scale_fill_manual(values = c("gray0", "cadetblue2", "cadetblue3", "cadetblue4", "deepskyblue4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)",
"Treatment 2",
"Treatment 3", "Treatment 4",
"Treatment 5")) +
theme_cowplot()+
geom_errorbar(aes(ymin = w-se,
ymax = w+se)) +
ggtitle("Average Drone Dry Weight(g) per Treatment") +
labs(y = "Drone Dry Weight(g)") +
scale_x_discrete(name = "Treatment",
labels = c("0 PPB", "150 PPB", "1,500 PPB", "15,000 PPB", "150,000 PPB")) +
theme_classic(base_size = 12) +
geom_text(aes(label = cld, y = w + se), vjust = -0.5)
Here are the actual means for the dry weights of the treatments, which are not the means produced by the emmeans test for either the AOV or LMER models, and I also think that these ones are right and those ones are wrong so now I’m wondering if those models are wrong somehow?
drf.pollen %>%
group_by(treatment) %>%
summarize(w = mean(dry_weight))
## # A tibble: 5 × 2
## treatment w
## <fct> <dbl>
## 1 1 0.0489
## 2 2 0.0397
## 3 3 0.0369
## 4 4 0.0413
## 5 5 0.0417