pollen <- read_csv("pollen1.csv", col_types = cols(round = col_factor(levels = c("1",
"2")), treatment = col_factor(levels = c("1",
"2", "3", "4", "5")), replicate = col_factor(levels = c("1",
"2", "3", "4", "5", "6", "7", "9", "11",
"12")), start_date = col_date(format = "%m/%d/%Y"),
start_time = col_character(), end_date = col_date(format = "%m/%d/%Y"),
end_time = col_character()))
pollen$colony <- as.factor(pollen$colony)
pollen$pid <- as.factor(pollen$pid)
pollen$count <- as.factor(pollen$count)
pollen$whole_dif <- as.double(pollen$whole_dif)
pollen <- subset(pollen, pollen$round != 1)
pollen <- na.omit(pollen)
range(pollen$difference)
## [1] -0.30646 1.56542
# get rid of negative numbers
pollen$difference[pollen$difference < 0] <- NA
pollen <- na.omit(pollen)
range(pollen$difference)
## [1] 0.002715 1.565420
# add queenright original colony column
qro <- read_csv("qro.csv")
qro$colony <- as.factor(qro$colony)
qro$qro <- as.factor(qro$qro)
pollen <- merge(pollen, qro, by.x = "colony")
Let’s look at the shape of the pollen data in a histogram.
shapiro.test(pollen$difference)
##
## Shapiro-Wilk normality test
##
## data: pollen$difference
## W = 0.84265, p-value < 2.2e-16
pollen$boxp <- bcPower(pollen$difference, -3, gamma=1)
shapiro.test(pollen$boxp)
##
## Shapiro-Wilk normality test
##
## data: pollen$boxp
## W = 0.9588, p-value = 2.044e-15
pollen$logp <- log(pollen$difference)
shapiro.test(pollen$logp)
##
## Shapiro-Wilk normality test
##
## data: pollen$logp
## W = 0.94103, p-value < 2.2e-16
ggplot(pollen, aes(x=boxp, fill = treatment)) +
geom_histogram(position = "identity", binwidth = 0.009,col=I("black")) +
scale_fill_manual(values = c("peachpuff3", "darkseagreen", "lightseagreen", "darkolivegreen3", "darkolivegreen4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5")) +
ggtitle("Pollen Consumption (g) - BoxCox Transformed") +
labs(y = "Number of Pollen Balls", x = "Pollen Consumed (g), BoxCox power transformation")
ggplot(pollen, aes(x=boxp, fill = treatment)) +
geom_histogram(position = "identity", binwidth = 0.009,col=I("black")) +
scale_fill_manual(values = c("peachpuff3", "darkseagreen", "lightseagreen", "darkolivegreen3", "darkolivegreen4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5")) +
ggtitle("Pollen Consumption (g) - BoxCox Transformed") +
labs(y = "Number of Pollen Balls", x = "Pollen Consumed (g), BoxCox power transformation") +
facet_wrap(vars(treatment))
ggplot(pollen, aes(x=logp, fill = treatment)) +
geom_histogram(position = "identity", binwidth = 0.05,col=I("black")) +
scale_fill_manual(values = c("peachpuff3", "darkseagreen", "lightseagreen", "darkolivegreen3", "darkolivegreen4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5")) +
ggtitle("Pollen Consumption (g) - Log Transformed") +
labs(y = "Number of Pollen Balls", x = "Pollen Consumed (g), log transformation")
consider outliers
summary(pollen$difference)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.002715 0.234403 0.323385 0.472047 0.632838 1.565420
ggplot(pollen) +
aes(x = treatment, y = difference) +
geom_boxplot() +
theme_minimal()
ggplot(pollen) +
aes(x = "", y = difference) +
geom_boxplot() +
theme_minimal()
boxplot.stats(pollen$difference)$out
## [1] 1.247980 1.372910 1.323870 1.238180 1.423773 1.354040 1.324110 1.332559
## [9] 1.565420 1.408069 1.264060 1.344220 1.473180 1.387130 1.469500 1.434330
## [17] 1.287410 1.474180 1.363000 1.338620 1.323910 1.247490 1.351750 1.557570
## [25] 1.533510 1.286970 1.312720 1.259550 1.294820 1.336980 1.262150 1.349450
## [33] 1.239330 1.251650 1.249190
out <- boxplot.stats(pollen$difference)$out
out_ind <- which(pollen$difference %in% c(out))
out_ind
## [1] 81 122 148 162 163 164 247 264 268 284 326 328 329 330 367 369 370 387 439
## [20] 476 483 514 515 516 517 518 521 613 653 669 673 678 720 875 891
lower_bound <- quantile(pollen$difference, 0.01)
lower_bound
## 1%
## 0.0455171
upper_bound <- quantile(pollen$difference, 0.99)
upper_bound
## 99%
## 1.384428
outlier_ind <- which(pollen$difference < lower_bound | pollen$difference > upper_bound)
outlier_ind
## [1] 135 163 268 284 329 330 367 369 387 486 487 516 517 569 570 629 630 836 838
## [20] 839
newdata <- subset(pollen, difference > 0.0455)
newdata <- subset(newdata, difference < 1.25)
shapiro.test(newdata$difference)
##
## Shapiro-Wilk normality test
##
## data: newdata$difference
## W = 0.83852, p-value < 2.2e-16
hist(newdata$difference)
newdata$boxp <- bcPower(newdata$difference, -3, gamma=1)
shapiro.test(newdata$boxp)
##
## Shapiro-Wilk normality test
##
## data: newdata$boxp
## W = 0.95602, p-value = 1.411e-15
hist(newdata$boxp)
Outlier removal doesn’t really help the shape but honestly the residuals v fitted and qq plot below for the boxcox data isn’t terrible I think overall it’s okay
pmod <- glm(difference ~ treatment + bees_alive + qro + start_date, data = pollen)
summary(pmod)
##
## Call:
## glm(formula = difference ~ treatment + bees_alive + qro + start_date,
## data = pollen)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.59802 -0.20948 -0.09273 0.14819 1.01233
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.249e+02 1.543e+01 -8.096 1.81e-15 ***
## treatment2 5.222e-02 3.269e-02 1.597 0.110519
## treatment3 5.177e-02 3.228e-02 1.604 0.109105
## treatment4 8.921e-02 3.294e-02 2.708 0.006891 **
## treatment5 -2.304e-02 3.302e-02 -0.698 0.485415
## bees_alive 1.477e-01 1.696e-02 8.706 < 2e-16 ***
## qroB3 1.062e-01 3.704e-02 2.868 0.004228 **
## qroB4 3.229e-01 3.676e-02 8.784 < 2e-16 ***
## qroB5 9.085e-02 2.636e-02 3.446 0.000594 ***
## start_date 6.476e-03 8.008e-04 8.088 1.93e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.09920166)
##
## Null deviance: 105.278 on 919 degrees of freedom
## Residual deviance: 90.274 on 910 degrees of freedom
## AIC: 497.04
##
## Number of Fisher Scoring iterations: 2
Anova(pmod)
## Analysis of Deviance Table (Type II tests)
##
## Response: difference
## LR Chisq Df Pr(>Chisq)
## treatment 14.379 4 0.00618 **
## bees_alive 75.793 1 < 2.2e-16 ***
## qro 80.233 3 < 2.2e-16 ***
## start_date 65.411 1 6.079e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(pmod)
pbox <- glm(boxp ~ treatment + bees_alive + qro + start_date, data = pollen)
summary(pbox)
##
## Call:
## glm(formula = boxp ~ treatment + bees_alive + qro + start_date,
## data = pollen)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.201511 -0.038005 -0.006821 0.044469 0.124757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.806e+01 2.841e+00 -9.878 < 2e-16 ***
## treatment2 1.655e-02 6.017e-03 2.750 0.006084 **
## treatment3 1.610e-02 5.941e-03 2.710 0.006865 **
## treatment4 2.167e-02 6.064e-03 3.573 0.000371 ***
## treatment5 -2.725e-03 6.077e-03 -0.448 0.653937
## bees_alive 2.848e-02 3.123e-03 9.121 < 2e-16 ***
## qroB3 2.024e-02 6.818e-03 2.969 0.003065 **
## qroB4 5.463e-02 6.767e-03 8.073 2.17e-15 ***
## qroB5 1.531e-02 4.852e-03 3.155 0.001655 **
## start_date 1.461e-03 1.474e-04 9.910 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.003361313)
##
## Null deviance: 3.6750 on 919 degrees of freedom
## Residual deviance: 3.0588 on 910 degrees of freedom
## AIC: -2617
##
## Number of Fisher Scoring iterations: 2
Anova(pbox)
## Analysis of Deviance Table (Type II tests)
##
## Response: boxp
## LR Chisq Df Pr(>Chisq)
## treatment 25.542 4 3.914e-05 ***
## bees_alive 83.186 1 < 2.2e-16 ***
## qro 68.697 3 8.113e-15 ***
## start_date 98.200 1 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(pbox)
boxemm <- emmeans(pbox, "treatment")
pairs(boxemm)
## contrast estimate SE df t.ratio p.value
## treatment1 - treatment2 -0.016546 0.00602 910 -2.750 0.0478
## treatment1 - treatment3 -0.016098 0.00594 910 -2.710 0.0533
## treatment1 - treatment4 -0.021668 0.00606 910 -3.573 0.0034
## treatment1 - treatment5 0.002725 0.00608 910 0.448 0.9916
## treatment2 - treatment3 0.000448 0.00603 910 0.074 1.0000
## treatment2 - treatment4 -0.005123 0.00625 910 -0.819 0.9247
## treatment2 - treatment5 0.019271 0.00615 910 3.132 0.0154
## treatment3 - treatment4 -0.005570 0.00620 910 -0.899 0.8974
## treatment3 - treatment5 0.018823 0.00607 910 3.103 0.0169
## treatment4 - treatment5 0.024394 0.00634 910 3.849 0.0012
##
## Results are averaged over the levels of: qro
## P value adjustment: tukey method for comparing a family of 5 estimates
unique(pollen$colony)
## [1] 1.11R2 1.12R2 1.1R2 1.2R2 1.3R2 1.4R2 1.5R2 1.7R2 1.9R2 2.11R2
## [11] 2.12R2 2.1R2 2.2R2 2.3R2 2.4R2 2.5R2 2.7R2 2.9R2 3.11R2 3.12R2
## [21] 3.1R2 3.2R2 3.3R2 3.4R2 3.5R2 3.7R2 3.9R2 4.11R2 4.12R2 4.1R2
## [31] 4.2R2 4.3R2 4.4R2 4.5R2 4.7R2 4.9R2 5.11R2 5.12R2 5.1R2 5.2R2
## [41] 5.3R2 5.4R2 5.5R2 5.7R2 5.9R2
## 60 Levels: 1.11R2 1.12R2 1.1R1 1.1R2 1.2R1 1.2R2 1.3R1 1.3R2 1.4R2 ... 5.9R2
Even though the histogram for the boxcox transformation doesn’t look all that great, the W value is greatly improved and the diagnostic plots for the model look pretty good. The q-q plot and residuals v fitted looked relatively well fitting and evenly spread.
polsum <- pollen %>%
group_by(treatment) %>%
summarise(mp = mean(difference),
sdp = sd(difference),
np = length(difference)) %>%
mutate(sep = sdp/sqrt(np))
polsum
## # A tibble: 5 × 5
## treatment mp sdp np sep
## <fct> <dbl> <dbl> <int> <dbl>
## 1 1 0.430 0.336 195 0.0240
## 2 2 0.502 0.348 180 0.0259
## 3 3 0.508 0.345 190 0.0250
## 4 4 0.488 0.342 178 0.0256
## 5 5 0.435 0.316 177 0.0238
tidy_anova <- broom::tidy(pbox)
knitr::kable(tidy_anova)
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | -28.0610062 | 2.8408405 | -9.877713 | 0.0000000 |
| treatment2 | 0.0165456 | 0.0060174 | 2.749634 | 0.0060844 |
| treatment3 | 0.0160980 | 0.0059413 | 2.709508 | 0.0068650 |
| treatment4 | 0.0216683 | 0.0060636 | 3.573491 | 0.0003708 |
| treatment5 | -0.0027254 | 0.0060775 | -0.448447 | 0.6539374 |
| bees_alive | 0.0284798 | 0.0031226 | 9.120646 | 0.0000000 |
| qroB3 | 0.0202435 | 0.0068180 | 2.969139 | 0.0030647 |
| qroB4 | 0.0546267 | 0.0067669 | 8.072624 | 0.0000000 |
| qroB5 | 0.0153108 | 0.0048522 | 3.155436 | 0.0016552 |
| start_date | 0.0014607 | 0.0001474 | 9.909582 | 0.0000000 |
anova_summary <- summary(pbox)
tukey_treatment <- HSD.test(pbox,
trt = "treatment",
console = TRUE) # prints the results to console
##
## Study: pbox ~ "treatment"
##
## HSD Test for boxp
##
## Mean Square Error: 0.003361313
##
## treatment, means
##
## boxp std r Min Max
## 1 0.1899525 0.06548217 195 0.041653345 0.3099233
## 2 0.2100114 0.05728009 180 0.056362833 0.3135908
## 3 0.2112073 0.05879453 190 0.032092405 0.3134084
## 4 0.2050092 0.06367876 178 0.002700324 0.3076306
## 5 0.1924815 0.06789212 177 0.029659134 0.3041338
##
## Alpha: 0.05 ; DF Error: 910
## Critical Value of Studentized Range: 3.865405
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Treatments with the same letter are not significantly different.
##
## boxp groups
## 3 0.2112073 a
## 2 0.2100114 a
## 4 0.2050092 ab
## 5 0.1924815 b
## 1 0.1899525 b
tidy_tukey <- as.data.frame(tukey_treatment$groups)
tidy_tukey
## boxp groups
## 3 0.2112073 a
## 2 0.2100114 a
## 4 0.2050092 ab
## 5 0.1924815 b
## 1 0.1899525 b
tidier_tukey <- tidy_tukey %>%
rownames_to_column() %>% # converts rownames to columns
rename(treatment = rowname)
p_max <- pollen %>%
group_by(treatment) %>%
summarize(maxp = max(mean(difference)))
p_for_plotting <- full_join(tidier_tukey, p_max,
by = "treatment")
A BEAUTIFUL bar graph with accurate P value and group labels I finally did it it took me like two years but I have made a decent bar plot
library(ggpmisc)
ggplot(data = polsum, aes(x = treatment, y = mp, fill = treatment)) +
geom_col(col = "black")+
coord_cartesian(ylim=c(0.4,0.55)) +
scale_fill_manual(values = c("peachpuff3", "darkseagreen", "lightseagreen", "darkolivegreen3", "darkolivegreen4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5")) +
geom_errorbar(aes(ymin = mp - sep,
ymax = mp + sep),
position = position_dodge(0.9), width = 0.4) +
labs(y = "Mean Pollen Consumed (g)",) +
ggtitle("Average Pollen Consumed (g) per Treatment") +
scale_x_discrete(name = "Treatment",
labels = c("0 PPB", "150 PPB", "1,500 PPB", "15,000 PPB", "150,000 PPB")) +
theme_cowplot()+
theme_classic(base_size = 25) +
annotate(geom = "text",
x = 2, y = 0.55,
label = "Anova = 3.914e-05 ***",
size = 10) +
annotate(geom = "text",
x = c(3, 2, 4, 5, 1 ),
y = 0.03 + p_for_plotting$maxp,
label = p_for_plotting$groups,
size = 10)
sum2 <- pollen %>%
group_by(treatment, count) %>%
summarise(mp = mean(difference),
sdp = sd(difference),
np = length(difference)) %>%
mutate(sep = sdp/sqrt(np))
sum2
## # A tibble: 133 × 6
## # Groups: treatment [5]
## treatment count mp sdp np sep
## <fct> <fct> <dbl> <dbl> <int> <dbl>
## 1 1 2 0.272 0.0552 8 0.0195
## 2 1 3 0.240 0.0770 10 0.0244
## 3 1 4 0.227 0.0884 11 0.0266
## 4 1 5 0.137 0.0527 9 0.0176
## 5 1 6 0.170 0.0750 7 0.0284
## 6 1 7 0.384 0.376 9 0.125
## 7 1 8 0.278 0.0989 9 0.0330
## 8 1 9 0.353 0.185 7 0.0698
## 9 1 10 0.363 0.331 5 0.148
## 10 1 11 0.538 0.410 7 0.155
## # … with 123 more rows
ggplot(data = sum2, aes(x = count, y = mp)) +
geom_point(aes(color = treatment)) +
scale_color_manual(values = c("peachpuff3", "darkseagreen", "lightseagreen", "darkolivegreen3", "darkolivegreen4"),
name = "Pristine Level",
labels = c("Treatment 1 (control)", "Treatment 2",
"Treatment 3", "Treatment 4", "Treatment 5")) +
labs(y = "Mean Pollen Consumed (g)", x = "Pollen Ball Number") +
ggtitle("Average Pollen Consumed (g) per Treatment") +
theme_cowplot()+
theme_classic(base_size = 16) +
facet_grid(vars(treatment))