library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the urban wildlife response records from the comma-delimited file.
wildlife <- read_delim(
  file = "./Urban_Wildlife_Response.csv",
  delim = ","
)
## Rows: 6385 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): DT_Initial, DT_Response, Response_Time, Borough, Property, Locatio...
## dbl (3): Response_Duration, Num_of_Animals, Hours_Monitoring
## lgl (4): PEP_Response, Animal_Monitored, Police_Response, ESU_Response
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Parse the raw "month/day/year hour:minute" strings into date-time columns.
# as.POSIXct() is used instead of bare strptime() because strptime() returns
# POSIXlt, a list-based class that is not meant to be stored as a data-frame
# column; POSIXct holds one number per value.
# NOTE(review): the source strings carry no time zone. They are read as UTC
# wall-clock values so parsing cannot fall into a daylight-saving gap and a
# later as.Date() gives the same calendar day on every machine.
wildlife$DT_Rsp_DateFMT <- as.POSIXct(
  wildlife$DT_Response,
  format = "%m/%d/%Y %H:%M",
  tz = "UTC"
)
wildlife$DT_Int_DateFMT <- as.POSIXct(
  wildlife$DT_Initial,
  format = "%m/%d/%Y %H:%M",
  tz = "UTC"
)
# Draw five samples of 3193 rows each from `wildlife`, with replacement.
# The seed is reset before every draw, so each sample can be reproduced
# independently of the others.
set.seed(111)
wildlife_2 <- wildlife[
  sample(nrow(wildlife), size = 3193, replace = TRUE),
]

set.seed(222)
wildlife_3 <- wildlife[
  sample(nrow(wildlife), size = 3193, replace = TRUE),
]

set.seed(333)
wildlife_4 <- wildlife[
  sample(nrow(wildlife), size = 3193, replace = TRUE),
]

set.seed(444)
wildlife_5 <- wildlife[
  sample(nrow(wildlife), size = 3193, replace = TRUE),
]

set.seed(555)
wildlife_6 <- wildlife[
  sample(nrow(wildlife), size = 3193, replace = TRUE),
]
I would call the information in these samples very consistent. While the exact count for each call source varies from sample to sample, the distribution is similar, and the ordering is the same in every sample (i.e. WINORR has the fewest instances, then WBF, then Other, and so on). I would not consider there to be any anomalies in this aspect of the data. Given the large gaps between the counts for each call source, it is not surprising to see the order retained across multiple samples. This indicates that, at least for the call source, these samples are representative.
# Tally rows per call source in this sample, rarest source first.
count(wildlife_2, Call_Source, name = "Instances") |>
  arrange(Instances)
## # A tibble: 8 × 2
## Call_Source Instances
## <chr> <int>
## 1 "WINORR" 1
## 2 "WBF" 57
## 3 "Other" 96
## 4 "Observed by Ranger" 225
## 5 "Conservancies/\"Friends of\" Groups" 337
## 6 "Central" 653
## 7 "Public" 781
## 8 "Employee" 1043
# Tally rows per call source in this sample, rarest source first.
count(wildlife_3, Call_Source, name = "Instances") |>
  arrange(Instances)
## # A tibble: 8 × 2
## Call_Source Instances
## <chr> <int>
## 1 "WINORR" 3
## 2 "WBF" 49
## 3 "Other" 83
## 4 "Observed by Ranger" 212
## 5 "Conservancies/\"Friends of\" Groups" 350
## 6 "Central" 675
## 7 "Public" 850
## 8 "Employee" 971
# Tally rows per call source in this sample, rarest source first.
count(wildlife_4, Call_Source, name = "Instances") |>
  arrange(Instances)
## # A tibble: 8 × 2
## Call_Source Instances
## <chr> <int>
## 1 "WINORR" 3
## 2 "WBF" 49
## 3 "Other" 95
## 4 "Observed by Ranger" 214
## 5 "Conservancies/\"Friends of\" Groups" 329
## 6 "Central" 668
## 7 "Public" 813
## 8 "Employee" 1022
# Tally rows per call source in this sample, rarest source first.
count(wildlife_5, Call_Source, name = "Instances") |>
  arrange(Instances)
## # A tibble: 8 × 2
## Call_Source Instances
## <chr> <int>
## 1 "WINORR" 1
## 2 "WBF" 47
## 3 "Other" 78
## 4 "Observed by Ranger" 229
## 5 "Conservancies/\"Friends of\" Groups" 336
## 6 "Central" 721
## 7 "Public" 809
## 8 "Employee" 972
# Tally rows per call source in this sample, rarest source first.
count(wildlife_6, Call_Source, name = "Instances") |>
  arrange(Instances)
## # A tibble: 8 × 2
## Call_Source Instances
## <chr> <int>
## 1 "WINORR" 1
## 2 "WBF" 52
## 3 "Other" 82
## 4 "Observed by Ranger" 212
## 5 "Conservancies/\"Friends of\" Groups" 312
## 6 "Central" 694
## 7 "Public" 857
## 8 "Employee" 983
Below is a comparison of several summary statistics across the samples. Overall they are very similar; the main callout is that wildlife_2 has a somewhat lower mean, likely due to the difference in maximum values. All of the samples except wildlife_2 have a maximum response time of 75, while wildlife_2 has a maximum response time of 35. Even with this large difference in the maximum values, the means are relatively similar. I would take this to mean that the 75-hour response time is an outlier in the original dataset that happened to be drawn into several of the samples. Compared with the original dataset, it could be said that sample one (wildlife_2) is missing the upper extreme of the response time variable.
# Mean response duration in each of the five samples.
mean(wildlife_2[["Response_Duration"]])
## [1] 1.382148
mean(wildlife_3[["Response_Duration"]])
## [1] 1.440993
mean(wildlife_4[["Response_Duration"]])
## [1] 1.440301
mean(wildlife_5[["Response_Duration"]])
## [1] 1.456646
mean(wildlife_6[["Response_Duration"]])
## [1] 1.430802
# Largest response duration in each sample, then in the full dataset.
max(wildlife_2[["Response_Duration"]])
## [1] 35
max(wildlife_3[["Response_Duration"]])
## [1] 75
max(wildlife_4[["Response_Duration"]])
## [1] 75
max(wildlife_5[["Response_Duration"]])
## [1] 75
max(wildlife_6[["Response_Duration"]])
## [1] 75
max(wildlife[["Response_Duration"]])
## [1] 75
# Smallest response duration in each of the five samples.
min(wildlife_2[["Response_Duration"]])
## [1] 0
min(wildlife_3[["Response_Duration"]])
## [1] 0
min(wildlife_4[["Response_Duration"]])
## [1] 0
min(wildlife_5[["Response_Duration"]])
## [1] 0
min(wildlife_6[["Response_Duration"]])
## [1] 0
# Interquartile range of response duration in each of the five samples.
IQR(wildlife_2[["Response_Duration"]])
## [1] 1.5
IQR(wildlife_3[["Response_Duration"]])
## [1] 1.5
IQR(wildlife_4[["Response_Duration"]])
## [1] 1.5
IQR(wildlife_5[["Response_Duration"]])
## [1] 1.5
IQR(wildlife_6[["Response_Duration"]])
## [1] 1.5
# First quartile (25th percentile) of response duration in each sample.
quantile(wildlife_2[["Response_Duration"]], probs = 0.25)
## 25%
## 0.5
quantile(wildlife_3[["Response_Duration"]], probs = 0.25)
## 25%
## 0.5
quantile(wildlife_4[["Response_Duration"]], probs = 0.25)
## 25%
## 0.5
quantile(wildlife_5[["Response_Duration"]], probs = 0.25)
## 25%
## 0.5
quantile(wildlife_6[["Response_Duration"]], probs = 0.25)
## 25%
## 0.5
# Third quartile (75th percentile) of response duration in each sample.
quantile(wildlife_2[["Response_Duration"]], probs = 0.75)
## 75%
## 2
quantile(wildlife_3[["Response_Duration"]], probs = 0.75)
## 75%
## 2
quantile(wildlife_4[["Response_Duration"]], probs = 0.75)
## 75%
## 2
quantile(wildlife_5[["Response_Duration"]], probs = 0.75)
## 75%
## 2
quantile(wildlife_6[["Response_Duration"]], probs = 0.75)
## 75%
## 2
While the ordering of these lists (from least to most common) does vary a little, that is only because the row counts for the Bronx and Staten Island are similar, as are those for Queens and Brooklyn. Overall, there do not seem to be any outliers in these samples with regard to the Borough variable. While some variation does exist, it is within what I would consider acceptable for the samples to be counted as representative.
# Tally rows per borough in this sample, least common borough first.
count(wildlife_2, Borough, name = "Instances") |>
  arrange(Instances)
## # A tibble: 5 × 2
## Borough Instances
## <chr> <int>
## 1 Staten Island 459
## 2 Bronx 512
## 3 Brooklyn 684
## 4 Queens 689
## 5 Manhattan 849
# Tally rows per borough in this sample, least common borough first.
count(wildlife_3, Borough, name = "Instances") |>
  arrange(Instances)
## # A tibble: 5 × 2
## Borough Instances
## <chr> <int>
## 1 Staten Island 478
## 2 Bronx 489
## 3 Queens 660
## 4 Brooklyn 677
## 5 Manhattan 889
# Tally rows per borough in this sample, least common borough first.
count(wildlife_4, Borough, name = "Instances") |>
  arrange(Instances)
## # A tibble: 5 × 2
## Borough Instances
## <chr> <int>
## 1 Bronx 479
## 2 Staten Island 498
## 3 Queens 647
## 4 Brooklyn 665
## 5 Manhattan 904
# Tally rows per borough in this sample, least common borough first.
count(wildlife_5, Borough, name = "Instances") |>
  arrange(Instances)
## # A tibble: 5 × 2
## Borough Instances
## <chr> <int>
## 1 Bronx 471
## 2 Staten Island 500
## 3 Queens 648
## 4 Brooklyn 678
## 5 Manhattan 896
# Tally rows per borough in this sample, least common borough first.
count(wildlife_6, Borough, name = "Instances") |>
  arrange(Instances)
## # A tibble: 5 × 2
## Borough Instances
## <chr> <int>
## 1 Bronx 500
## 2 Staten Island 503
## 3 Queens 648
## 4 Brooklyn 679
## 5 Manhattan 863
These values are, again, incredibly similar, and there is nothing that I would consider to be particularly unusual given the original dataset. Finding animals in mixed-age groupings is significantly less common than finding animals either alone, or with animals of the same age group, which is something that holds true across the samples.
# Tally rows per age grouping in this sample, rarest grouping first.
count(wildlife_2, Age, name = "Instances") |>
  arrange(Instances)
## # A tibble: 7 × 2
## Age Instances
## <chr> <int>
## 1 Adult, Juvenile, Infant 3
## 2 Juvenile, Infant 4
## 3 Adult, Infant 10
## 4 Adult, Juvenile 25
## 5 Infant 155
## 6 Juvenile 652
## 7 Adult 2344
# Tally rows per age grouping in this sample, rarest grouping first.
count(wildlife_3, Age, name = "Instances") |>
  arrange(Instances)
## # A tibble: 7 × 2
## Age Instances
## <chr> <int>
## 1 Adult, Juvenile, Infant 2
## 2 Juvenile, Infant 2
## 3 Adult, Infant 11
## 4 Adult, Juvenile 37
## 5 Infant 174
## 6 Juvenile 656
## 7 Adult 2311
# Tally rows per age grouping in this sample, rarest grouping first.
count(wildlife_4, Age, name = "Instances") |>
  arrange(Instances)
## # A tibble: 7 × 2
## Age Instances
## <chr> <int>
## 1 Juvenile, Infant 1
## 2 Adult, Juvenile, Infant 7
## 3 Adult, Infant 13
## 4 Adult, Juvenile 31
## 5 Infant 165
## 6 Juvenile 652
## 7 Adult 2324
# Tally rows per age grouping in this sample, rarest grouping first.
count(wildlife_5, Age, name = "Instances") |>
  arrange(Instances)
## # A tibble: 7 × 2
## Age Instances
## <chr> <int>
## 1 Juvenile, Infant 1
## 2 Adult, Juvenile, Infant 2
## 3 Adult, Infant 14
## 4 Adult, Juvenile 27
## 5 Infant 185
## 6 Juvenile 677
## 7 Adult 2287
# Tally rows per age grouping in this sample, rarest grouping first.
count(wildlife_6, Age, name = "Instances") |>
  arrange(Instances)
## # A tibble: 7 × 2
## Age Instances
## <chr> <int>
## 1 Adult, Juvenile, Infant 2
## 2 Juvenile, Infant 2
## 3 Adult, Infant 13
## 4 Adult, Juvenile 30
## 5 Infant 192
## 6 Juvenile 662
## 7 Adult 2292
When we look at the number of response calls over time, we finally start to see some variation in the data that I would consider significant. This is likely due to there being so many more possible values to draw from (almost 7 years' worth of months) versus the 5 or 10 values available for some of the other variables.
With these visualizations, we can see some variance, especially in the earlier time period. In the mid-2018 time frame, sample 2 shows a month with more than 120 calls recorded, which is vastly different from sample 3 with somewhere around 85 (the other 3 samples being somewhere between those two). There is also some variation in the later part of the time frame, though it is much less exaggerated. The last third of the time period seems to cap out at around 85 for samples 1-3, though for samples 4 and 5, the max is closer to 80.
Despite this variation, the overall shape of the data seems to be consistent - with a spike of volume towards the beginning, before dropping and starting a cyclical growth rhythm.
# Count responses per calendar month for one sample.
#
# `sample_df` must contain the date-time column `DT_Rsp_DateFMT`.
# Returns one row per month, in chronological order, with columns:
#   Month     - the response date-time floored to the start of its month
#   Instances - number of rows falling in that month
#   month     - `Month` converted to a Date (used for the plot's x axis)
monthly_response_counts <- function(sample_df) {
  sample_df |>
    # Refer to the column itself rather than `sample_df$...`, so the
    # expression always uses the data that is flowing through the pipe.
    mutate(Month = floor_date(DT_Rsp_DateFMT, "month")) |>
    count(Month, name = "Instances") |>
    mutate(month = as.Date(Month)) |>
    arrange(Month)
}

# Bar chart of monthly response counts for one sample.
#
# `monthly_counts` is the output of monthly_response_counts(); `title` is the
# plot title. Every plot uses the same 0-125 y axis so the five samples can be
# compared directly.
plot_monthly_responses <- function(monthly_counts, title) {
  ggplot(monthly_counts, aes(x = month, y = Instances)) +
    # geom_col() draws bar heights straight from `Instances`.
    geom_col(fill = "mediumseagreen") +
    labs(
      title = title,
      x = "Month and Year",
      y = "Count of Calls"
    ) +
    theme_minimal() +
    ylim(0, 125)
}

# The data frames are numbered 2-6 but the plots are titled "Sample 1"-"Sample 5":
# wildlife_2 is Sample 1, wildlife_3 is Sample 2, and so on.
wl_2 <- monthly_response_counts(wildlife_2)
plot_monthly_responses(wl_2, "Sample 1 Count of Responses Over Time")

wl_3 <- monthly_response_counts(wildlife_3)
plot_monthly_responses(wl_3, "Sample 2 Count of Responses Over Time")

wl_4 <- monthly_response_counts(wildlife_4)
plot_monthly_responses(wl_4, "Sample 3 Count of Responses Over Time")

wl_5 <- monthly_response_counts(wildlife_5)
plot_monthly_responses(wl_5, "Sample 4 Count of Responses Over Time")

wl_6 <- monthly_response_counts(wildlife_6)
plot_monthly_responses(wl_6, "Sample 5 Count of Responses Over Time")