For all code, I will use tidyverse and dplyr…
and the data file, “Overall data” calling the dataframe “bsdata.”
To use the data, I will:
create a variable for seasons
bsdata$season <- as.character(bsdata$Date)
bsdata$season [bsdata$Date %in% 0:60000] <- "spring"
bsdata$season [bsdata$Date %in% 60001:123026] <- "fall"
bsdata$season = factor(bsdata$season)
fix building names; i.e., turn Sage.High into Sage and Polk.ctyd into Polk.
Note: This causes an overrepresentation of samples at Polk and Sage… but I can’t remember why…
bsdata$Building = factor(bsdata$Building)
levels(bsdata$Building)[match("Sage.High",levels(bsdata$Building))] <- "Sage"
levels(bsdata$Building)[match("Polk.ctyd",levels(bsdata$Building))] <- "Polk"
levels(bsdata$Building)
## [1] "ACT" "ALBEE" "BLACKHAWK" "CCE&D" "CFWC"
## [6] "Clow" "DEMPSEY" "FDL" "FLETCHER" "GRUENHAGEN"
## [11] "Halsey" "Harrington" "HORIZON" "LINCOLN" "Polk"
## [16] "PRKGRMP" "RADFORD" "Reeve" "Sage" "SRWC"
## [21] "unknown"
strikes.bldg <-
bsdata %>% filter (Syst.Opp=="systematic") %>%
# filter (sample.confirmed == "yes") %>%
select("Building","Year","season", "strike") %>%
droplevels()
names(strikes.bldg)
## [1] "Building" "Year" "season" "strike"
# "strike" is whether or not that sampling session saw a strike (yes = 1, no = 0)
# To exclude unconfirmed, keep the line
# > filter (sample.confirmed == "yes") %>%
# To include unconfirmed, put a # at the beginning of that line
3-way ANOVA with building, year, and season
(https://www.statology.org/three-way-anova-in-r/)
summary(lm(total.table$proportion ~ total.table$Building + total.table$Year + total.table$season))
##
## Call:
## lm(formula = total.table$proportion ~ total.table$Building +
## total.table$Year + total.table$season)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.20823 -0.04627 -0.01102 0.04404 0.34659
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.20448 0.06309 3.241 0.002333 **
## total.table$BuildingCFWC -0.03109 0.05589 -0.556 0.581039
## total.table$BuildingClow 0.03583 0.05589 0.641 0.524919
## total.table$BuildingHalsey -0.05592 0.05589 -1.000 0.322805
## total.table$BuildingHarrington -0.06152 0.11113 -0.554 0.582772
## total.table$BuildingPolk -0.03799 0.05400 -0.704 0.485538
## total.table$BuildingReeve 0.05638 0.05067 1.113 0.272170
## total.table$BuildingSage 0.01962 0.05067 0.387 0.700518
## total.table$BuildingSRWC -0.03969 0.05589 -0.710 0.481537
## total.table$Year2020 0.02002 0.06946 0.288 0.774540
## total.table$Year2021 0.05337 0.04769 1.119 0.269410
## total.table$Year2022 0.02958 0.04718 0.627 0.534129
## total.table$Year2023 0.02420 0.04671 0.518 0.607059
## total.table$Year2024 0.02654 0.04961 0.535 0.595477
## total.table$seasonspring -0.13825 0.03666 -3.771 0.000503 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09922 on 42 degrees of freedom
## Multiple R-squared: 0.4473, Adjusted R-squared: 0.2631
## F-statistic: 2.428 on 14 and 42 DF, p-value: 0.01344
summary(aov(total.table$proportion ~ total.table$Building))
## Df Sum Sq Mean Sq F value Pr(>F)
## total.table$Building 8 0.1292 0.01615 1.252 0.291
## Residuals 48 0.6189 0.01289
summary(aov(total.table$proportion ~ total.table$Year))
## Df Sum Sq Mean Sq F value Pr(>F)
## total.table$Year 5 0.0713 0.01427 1.075 0.385
## Residuals 51 0.6767 0.01327
t.test(proportion ~ season, data = total.table)
##
## Welch Two Sample t-test
##
## data: proportion by season
## t = 3.1628, df = 12.966, p-value = 0.007507
## alternative hypothesis: true difference in means between group fall and group spring is not equal to 0
## 95 percent confidence interval:
## 0.04823658 0.25633114
## sample estimates:
## mean in group fall mean in group spring
## 0.23911652 0.08683266
This is a difficult thing to measure.
First, we have to only look at Sage and then only at Door 1. (We could also pick a random door at other buildings for comparison but that’s a bit dodgy.). Unfortunately, the only Sage data that have door numbers are 2021 and later. This is when the dots were installed. We can, however, look at strikes in 2021 and beyond.
build.sage <- bsdata %>% filter (Building=="Sage") %>% droplevels()
#levels(build.sage$Building)
build.sage$Door.no = factor(build.sage$Door.no)
build.sage$Year = factor(build.sage$Year)
build.sage = build.sage %>% filter(!is.na(Door.no))
build.sage = build.sage %>% filter(Door.no == 1) %>% droplevels()
levels(build.sage$Door.no)
## [1] "1"
names(build.sage)
## [1] "Obs.no" "Year" "julian.day"
## [4] "Date.full" "Time" "Observer"
## [7] "Building" "Location" "Door.no"
## [10] "Side" "strike" "Carcass"
## [13] "Common.name" "Abbr" "Family"
## [16] "Vegetation" "Notes" "No.photos"
## [19] "link.to.photos" "Syst.Opp" "sample.confirmed"
## [22] "sample.confirmed.by" "spp.confirm" "spp.confirm.by"
## [25] "Data.entry.notes" "season"
build.sage = build.sage %>% select(Year, strike, season)
names(build.sage)
## [1] "Year" "strike" "season"
build.sage.count = build.sage %>% count(strike, Year, season)
#build.sage.count
ggplot(build.sage.count, aes(x=Year, y=n)) +
geom_col(aes(fill = season) ,position = "dodge2")
### 2B. Comparing all buildings pre and post dots
t-test for proportion as function of pre and post; though I haven’t tested any assumptions…
t.test(pre.v.post$proportion ~ pre.v.post$pre.post)
##
## Welch Two Sample t-test
##
## data: pre.v.post$proportion by pre.v.post$pre.post
## t = -2.1031, df = 32.219, p-value = 0.04335
## alternative hypothesis: true difference in means between group pre and group post is not equal to 0
## 95 percent confidence interval:
## -0.107430519 -0.001731521
## sample estimates:
## mean in group pre mean in group post
## 0.07379889 0.12837991
wilcox.test(pre.v.post$proportion ~ pre.v.post$pre.post)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: pre.v.post$proportion by pre.v.post$pre.post
## W = 133, p-value = 0.1046
## alternative hypothesis: true location shift is not equal to 0
The next three graphs are heat maps that might not be helpful but here they are. For each, the color represents proportion of samples that had strikes with the darker colors having a greater proportion.
I created a data frame, “family.data”, with only columns for building, year, strike, family, and season - omitting all non-strike samples.
I removed unknowns and needed from data frame.
## [1] "Bombycillidae" "Cardinalidae" "Columbidae" "Fringillidae"
## [5] "Mimidae" "Paridae" "Parulidae" "Passerelidae"
## [9] "Picidae" "Regulidae" "Sturnidae" "Trochilidae"
## [13] "Turdidae"
ggplot(family.data, aes(x=Family, y=strike.family)) +
geom_col() +
theme(axis.text.x = element_text(angle = 90))
##
## ACT CFWC Clow Halsey Polk Reeve Sage SRWC
## Bombycillidae 0 0 0 1 0 0 0 0
## Cardinalidae 0 0 0 1 0 0 0 0
## Columbidae 0 1 0 0 0 0 0 0
## Fringillidae 0 2 0 0 0 0 0 0
## Mimidae 0 1 0 0 0 0 0 1
## Paridae 0 0 1 0 0 0 0 0
## Parulidae 0 1 1 0 2 1 0 0
## Passerelidae 1 0 0 0 0 1 0 0
## Picidae 0 0 1 0 1 0 0 0
## Regulidae 0 0 2 0 0 2 0 0
## Sturnidae 0 0 0 0 0 1 0 0
## Trochilidae 0 0 0 0 0 2 1 1
## Turdidae 0 0 0 0 2 0 0 0
##
## 2021 2022 2023 2024
## Bombycillidae 0 0 0 1
## Cardinalidae 0 0 0 1
## Columbidae 0 0 1 0
## Fringillidae 0 0 2 0
## Mimidae 0 0 0 2
## Paridae 0 0 1 0
## Parulidae 1 3 0 1
## Passerelidae 0 1 0 1
## Picidae 0 0 1 1
## Regulidae 0 1 3 0
## Sturnidae 0 1 0 0
## Trochilidae 0 1 1 2
## Turdidae 0 0 1 1
##
## fall spring
## Bombycillidae 0 1
## Cardinalidae 0 1
## Columbidae 1 0
## Fringillidae 0 2
## Mimidae 0 2
## Paridae 1 0
## Parulidae 0 5
## Passerelidae 1 1
## Picidae 1 1
## Regulidae 0 4
## Sturnidae 0 1
## Trochilidae 1 3
## Turdidae 1 1
I used Julian day instead of date to make things easier to code.
julian.day.data.spring <- bsdata %>% filter (julian.day != "NA") %>% select("Building","Year","season", "strike", "julian.day") %>% droplevels()
julian.day.data.spring$jd.season = factor(julian.day.data.spring$season)
julian.day.data.spring <- filter(julian.day.data.spring, jd.season == "spring") %>% droplevels()
#levels(julian.day.data.spring$jd.season)
julian.day.data.spring$strike = as.integer(julian.day.data.spring$strike)
## # A tibble: 179 × 4
## # Groups: julian.day, Year [179]
## julian.day Year season JD.strikes
## <int> <int> <fct> <int>
## 1 104 2020 spring 0
## 2 106 2020 spring 0
## 3 106 2024 spring 2
## 4 107 2021 spring 1
## 5 107 2023 spring 2
## 6 107 2024 spring 1
## 7 108 2020 spring 2
## 8 108 2021 spring 0
## 9 108 2023 spring 0
## 10 109 2021 spring 0
## # ℹ 169 more rows
# labs(x="Familes",y="strikes",title="Julian day")
JD.table.spring$julian.year = factor(JD.table.spring$Year)
ggplot(JD.table.spring,(aes(x=julian.day,y=JD.strikes,group=julian.year,color=julian.year)))+
geom_line()
julian.day.data.fall <- bsdata %>% filter (julian.day != "NA") %>% select("Building","Year","season", "strike", "julian.day") %>% droplevels()
julian.day.data.fall$jd.season = factor(julian.day.data.fall$season)
julian.day.data.fall <- filter(julian.day.data.fall, jd.season == "fall") %>% droplevels()
#levels(julian.day.data.fall$jd.season)
julian.day.data.fall$strike = as.integer(julian.day.data.fall$strike)
Table with totals strikes = JD.table Table with samples = JD.total.table
Table with proportions
Filter bsdata so only Sage
Filter strikes.Sage so it is only systematic data.
Filter strikes.Sage so it is only Door.no 1
Filter strikes.Sage so it only has columns for year, strike, and season.
E. Create table based on strikes.Sage with number of strikes at Sage per year, season
This time the strikes are separated out by year; again, the colors represent years and the shapes represent seasons.
From family data
family x season
ggplot(FxS, aes(FxS.family, FxS.season, fill= FxScounts)) +
geom_tile()+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
scale_fill_gradient2(low="lavender", mid = "orchid", high="slateblue", guide="colorbar", midpoint = (max(FxYcounts)/2))+
labs(x="Familes",y="Seasons",title="Number of strikes per season per taxonomic family")