Week_4_Datadive

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the comma-delimited wildlife response file, resolved relative to the
# current working directory.
wildlife <- read_delim(
  file = "./Urban_Wildlife_Response.csv",
  delim = ","
)

## Rows: 6385 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): DT_Initial, DT_Response, Response_Time, Borough, Property, Locatio...
## dbl  (3): Response_Duration, Num_of_Animals, Hours_Monitoring
## lgl  (4): PEP_Response, Animal_Monitored, Police_Response, ESU_Response
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Parse the "month/day/year hour:minute" timestamp strings into date-times.
# strptime() returns POSIXlt, a list-based class that is a poor fit for a
# data-frame column, so each result is converted to POSIXct before being
# stored. The conversion keeps the same instants in the session's local
# time zone; strings that do not match the format become NA.
timestamp_format <- "%m/%d/%Y %H:%M"

wildlife$DT_Rsp_DateFMT <- as.POSIXct(
  strptime(wildlife$DT_Response, format = timestamp_format)
)

wildlife$DT_Int_DateFMT <- as.POSIXct(
  strptime(wildlife$DT_Initial, format = timestamp_format)
)

Creating Samples

# Draw `size` rows from `data` with replacement, seeding the RNG with `seed`
# immediately before the draw so each sample is reproducible on its own.
draw_sample <- function(data, seed, size = 3193) {
  set.seed(seed)
  picked_rows <- sample(nrow(data), size = size, replace = TRUE)
  data[picked_rows, ]
}

wildlife_2 <- draw_sample(wildlife, seed = 111)

wildlife_3 <- draw_sample(wildlife, seed = 222)

wildlife_4 <- draw_sample(wildlife, seed = 333)

wildlife_5 <- draw_sample(wildlife, seed = 444)

wildlife_6 <- draw_sample(wildlife, seed = 555)

Comparing Samples

Comparing Call Source

I would call the information in these samples very consistent - while the exact count for each call source varies from sample to sample, the distribution is pretty similar, and the order is the same in every sample (i.e. WINORR has the fewest instances, then WBF, then Other, and so on). I would not consider there to be any anomalies in this aspect of the data. Given the large gaps between the counts for each call source, it’s not surprising to see the order retained across multiple samples. This indicates that, at least for the call source, these samples are representative.

# Tally rows per Call_Source in wildlife_2, smallest count first.
wildlife_2 |>
  count(Call_Source, name = "Instances") |>
  arrange(Instances)

## # A tibble: 8 × 2
##   Call_Source                           Instances
##   <chr>                                     <int>
## 1 "WINORR"                                      1
## 2 "WBF"                                        57
## 3 "Other"                                      96
## 4 "Observed by Ranger"                        225
## 5 "Conservancies/\"Friends of\" Groups"       337
## 6 "Central"                                   653
## 7 "Public"                                    781
## 8 "Employee"                                 1043

# Tally rows per Call_Source in wildlife_3, smallest count first.
wildlife_3 |>
  count(Call_Source, name = "Instances") |>
  arrange(Instances)

## # A tibble: 8 × 2
##   Call_Source                           Instances
##   <chr>                                     <int>
## 1 "WINORR"                                      3
## 2 "WBF"                                        49
## 3 "Other"                                      83
## 4 "Observed by Ranger"                        212
## 5 "Conservancies/\"Friends of\" Groups"       350
## 6 "Central"                                   675
## 7 "Public"                                    850
## 8 "Employee"                                  971

# Tally rows per Call_Source in wildlife_4, smallest count first.
wildlife_4 |>
  count(Call_Source, name = "Instances") |>
  arrange(Instances)

## # A tibble: 8 × 2
##   Call_Source                           Instances
##   <chr>                                     <int>
## 1 "WINORR"                                      3
## 2 "WBF"                                        49
## 3 "Other"                                      95
## 4 "Observed by Ranger"                        214
## 5 "Conservancies/\"Friends of\" Groups"       329
## 6 "Central"                                   668
## 7 "Public"                                    813
## 8 "Employee"                                 1022

# Tally rows per Call_Source in wildlife_5, smallest count first.
wildlife_5 |>
  count(Call_Source, name = "Instances") |>
  arrange(Instances)

## # A tibble: 8 × 2
##   Call_Source                           Instances
##   <chr>                                     <int>
## 1 "WINORR"                                      1
## 2 "WBF"                                        47
## 3 "Other"                                      78
## 4 "Observed by Ranger"                        229
## 5 "Conservancies/\"Friends of\" Groups"       336
## 6 "Central"                                   721
## 7 "Public"                                    809
## 8 "Employee"                                  972

# Tally rows per Call_Source in wildlife_6, smallest count first.
wildlife_6 |>
  count(Call_Source, name = "Instances") |>
  arrange(Instances)

## # A tibble: 8 × 2
##   Call_Source                           Instances
##   <chr>                                     <int>
## 1 "WINORR"                                      1
## 2 "WBF"                                        52
## 3 "Other"                                      82
## 4 "Observed by Ranger"                        212
## 5 "Conservancies/\"Friends of\" Groups"       312
## 6 "Central"                                   694
## 7 "Public"                                    857
## 8 "Employee"                                  983

Comparing Response Times

Below is a comparison of several summary statistics across the different samples. Overall they are very similar - the big callout is that wildlife_2 has a somewhat lower mean, likely due to the difference in maximum values. All of the samples except wildlife_2 have a maximum response time of 75, while wildlife_2 has a maximum response time of 35. Even with the massive difference in the maximum values, the means are relatively similar. I would take this to mean that the 75-hour response time is an outlier in the original dataset that happened to be drawn into most of the samples. When comparing the samples to the original dataset, it could be said that wildlife_2 is missing an important extreme of the response time variable.

Mean

# Mean of Response_Duration in wildlife_2.
wildlife_2 |>
  pull(Response_Duration) |>
  mean()

## [1] 1.382148

# Mean of Response_Duration in wildlife_3.
wildlife_3 |>
  pull(Response_Duration) |>
  mean()

## [1] 1.440993

# Mean of Response_Duration in wildlife_4.
wildlife_4 |>
  pull(Response_Duration) |>
  mean()

## [1] 1.440301

# Mean of Response_Duration in wildlife_5.
wildlife_5 |>
  pull(Response_Duration) |>
  mean()

## [1] 1.456646

# Mean of Response_Duration in wildlife_6.
wildlife_6 |>
  pull(Response_Duration) |>
  mean()

## [1] 1.430802

Maximum

# Largest Response_Duration value in wildlife_2.
wildlife_2 |>
  pull(Response_Duration) |>
  max()

## [1] 35

# Largest Response_Duration value in wildlife_3.
wildlife_3 |>
  pull(Response_Duration) |>
  max()

## [1] 75

# Largest Response_Duration value in wildlife_4.
wildlife_4 |>
  pull(Response_Duration) |>
  max()

## [1] 75

# Largest Response_Duration value in wildlife_5.
wildlife_5 |>
  pull(Response_Duration) |>
  max()

## [1] 75

# Largest Response_Duration value in wildlife_6.
wildlife_6 |>
  pull(Response_Duration) |>
  max()

## [1] 75

# Largest Response_Duration value in the full dataset, for reference against
# the samples.
wildlife |>
  pull(Response_Duration) |>
  max()

## [1] 75

Minimum

# Smallest Response_Duration value in wildlife_2.
wildlife_2 |>
  pull(Response_Duration) |>
  min()

## [1] 0

# Smallest Response_Duration value in wildlife_3.
wildlife_3 |>
  pull(Response_Duration) |>
  min()

## [1] 0

# Smallest Response_Duration value in wildlife_4.
wildlife_4 |>
  pull(Response_Duration) |>
  min()

## [1] 0

# Smallest Response_Duration value in wildlife_5.
wildlife_5 |>
  pull(Response_Duration) |>
  min()

## [1] 0

# Smallest Response_Duration value in wildlife_6.
wildlife_6 |>
  pull(Response_Duration) |>
  min()

## [1] 0

IQR

# Interquartile range of Response_Duration in wildlife_2.
wildlife_2 |>
  pull(Response_Duration) |>
  IQR()

## [1] 1.5

# Interquartile range of Response_Duration in wildlife_3.
wildlife_3 |>
  pull(Response_Duration) |>
  IQR()

## [1] 1.5

# Interquartile range of Response_Duration in wildlife_4.
wildlife_4 |>
  pull(Response_Duration) |>
  IQR()

## [1] 1.5

# Interquartile range of Response_Duration in wildlife_5.
wildlife_5 |>
  pull(Response_Duration) |>
  IQR()

## [1] 1.5

# Interquartile range of Response_Duration in wildlife_6.
wildlife_6 |>
  pull(Response_Duration) |>
  IQR()

## [1] 1.5

First Quartile

# 25th percentile of Response_Duration in wildlife_2.
wildlife_2 |>
  pull(Response_Duration) |>
  quantile(probs = 0.25)

## 25% 
## 0.5

# 25th percentile of Response_Duration in wildlife_3.
wildlife_3 |>
  pull(Response_Duration) |>
  quantile(probs = 0.25)

## 25% 
## 0.5

# 25th percentile of Response_Duration in wildlife_4.
wildlife_4 |>
  pull(Response_Duration) |>
  quantile(probs = 0.25)

## 25% 
## 0.5

# 25th percentile of Response_Duration in wildlife_5.
wildlife_5 |>
  pull(Response_Duration) |>
  quantile(probs = 0.25)

## 25% 
## 0.5

# 25th percentile of Response_Duration in wildlife_6.
wildlife_6 |>
  pull(Response_Duration) |>
  quantile(probs = 0.25)

## 25% 
## 0.5

Third Quartile

# 75th percentile of Response_Duration in wildlife_2.
wildlife_2 |>
  pull(Response_Duration) |>
  quantile(probs = 0.75)

## 75% 
##   2

# 75th percentile of Response_Duration in wildlife_3.
wildlife_3 |>
  pull(Response_Duration) |>
  quantile(probs = 0.75)

## 75% 
##   2

# 75th percentile of Response_Duration in wildlife_4.
wildlife_4 |>
  pull(Response_Duration) |>
  quantile(probs = 0.75)

## 75% 
##   2

# 75th percentile of Response_Duration in wildlife_5.
wildlife_5 |>
  pull(Response_Duration) |>
  quantile(probs = 0.75)

## 75% 
##   2

# 75th percentile of Response_Duration in wildlife_6.
wildlife_6 |>
  pull(Response_Duration) |>
  quantile(probs = 0.75)

## 75% 
##   2

Comparing Boroughs

While the ordering of these lists (from least to most common) does vary a little, that is only because the row counts for Bronx and Staten Island are similar, as are those for Queens and Brooklyn. Overall, there do not seem to be any outliers in these datasets with regard to the Borough variable. While some variation does exist, it’s within what I would consider acceptable for the samples to count as representative.

# Tally rows per Borough in wildlife_2, smallest count first.
wildlife_2 |>
  count(Borough, name = "Instances") |>
  arrange(Instances)

## # A tibble: 5 × 2
##   Borough       Instances
##   <chr>             <int>
## 1 Staten Island       459
## 2 Bronx               512
## 3 Brooklyn            684
## 4 Queens              689
## 5 Manhattan           849

# Tally rows per Borough in wildlife_3, smallest count first.
wildlife_3 |>
  count(Borough, name = "Instances") |>
  arrange(Instances)

## # A tibble: 5 × 2
##   Borough       Instances
##   <chr>             <int>
## 1 Staten Island       478
## 2 Bronx               489
## 3 Queens              660
## 4 Brooklyn            677
## 5 Manhattan           889

# Tally rows per Borough in wildlife_4, smallest count first.
wildlife_4 |>
  count(Borough, name = "Instances") |>
  arrange(Instances)

## # A tibble: 5 × 2
##   Borough       Instances
##   <chr>             <int>
## 1 Bronx               479
## 2 Staten Island       498
## 3 Queens              647
## 4 Brooklyn            665
## 5 Manhattan           904

# Tally rows per Borough in wildlife_5, smallest count first.
wildlife_5 |>
  count(Borough, name = "Instances") |>
  arrange(Instances)

## # A tibble: 5 × 2
##   Borough       Instances
##   <chr>             <int>
## 1 Bronx               471
## 2 Staten Island       500
## 3 Queens              648
## 4 Brooklyn            678
## 5 Manhattan           896

# Tally rows per Borough in wildlife_6, smallest count first.
wildlife_6 |>
  count(Borough, name = "Instances") |>
  arrange(Instances)

## # A tibble: 5 × 2
##   Borough       Instances
##   <chr>             <int>
## 1 Bronx               500
## 2 Staten Island       503
## 3 Queens              648
## 4 Brooklyn            679
## 5 Manhattan           863

Comparison by Age

These values are, again, incredibly similar, and there is nothing that I would consider to be particularly unusual given the original dataset. Finding animals in mixed-age groupings is significantly less common than finding animals either alone, or with animals of the same age group, which is something that holds true across the samples.

# Tally rows per Age value in wildlife_2, smallest count first.
wildlife_2 |>
  count(Age, name = "Instances") |>
  arrange(Instances)

## # A tibble: 7 × 2
##   Age                     Instances
##   <chr>                       <int>
## 1 Adult, Juvenile, Infant         3
## 2 Juvenile, Infant                4
## 3 Adult, Infant                  10
## 4 Adult, Juvenile                25
## 5 Infant                        155
## 6 Juvenile                      652
## 7 Adult                        2344

# Tally rows per Age value in wildlife_3, smallest count first.
wildlife_3 |>
  count(Age, name = "Instances") |>
  arrange(Instances)

## # A tibble: 7 × 2
##   Age                     Instances
##   <chr>                       <int>
## 1 Adult, Juvenile, Infant         2
## 2 Juvenile, Infant                2
## 3 Adult, Infant                  11
## 4 Adult, Juvenile                37
## 5 Infant                        174
## 6 Juvenile                      656
## 7 Adult                        2311

# Tally rows per Age value in wildlife_4, smallest count first.
wildlife_4 |>
  count(Age, name = "Instances") |>
  arrange(Instances)

## # A tibble: 7 × 2
##   Age                     Instances
##   <chr>                       <int>
## 1 Juvenile, Infant                1
## 2 Adult, Juvenile, Infant         7
## 3 Adult, Infant                  13
## 4 Adult, Juvenile                31
## 5 Infant                        165
## 6 Juvenile                      652
## 7 Adult                        2324

# Tally rows per Age value in wildlife_5, smallest count first.
wildlife_5 |>
  count(Age, name = "Instances") |>
  arrange(Instances)

## # A tibble: 7 × 2
##   Age                     Instances
##   <chr>                       <int>
## 1 Juvenile, Infant                1
## 2 Adult, Juvenile, Infant         2
## 3 Adult, Infant                  14
## 4 Adult, Juvenile                27
## 5 Infant                        185
## 6 Juvenile                      677
## 7 Adult                        2287

# Tally rows per Age value in wildlife_6, smallest count first.
wildlife_6 |>
  count(Age, name = "Instances") |>
  arrange(Instances)

## # A tibble: 7 × 2
##   Age                     Instances
##   <chr>                       <int>
## 1 Adult, Juvenile, Infant         2
## 2 Juvenile, Infant                2
## 3 Adult, Infant                  13
## 4 Adult, Juvenile                30
## 5 Infant                        192
## 6 Juvenile                      662
## 7 Adult                        2292

Number of Response Calls Over Time

When we look at the number of response calls over time, we finally start to see some variation in the data that I would consider significant. This is likely due to there being so many more categories to draw from (one per month across almost 7 years) versus the 5 or 10 categories available for some of the other variables.

With these visualizations, we can see some variance, especially in the earlier time period. In mid-2018, sample 2 shows a month with more than 120 calls recorded, which is vastly different from sample 3 with somewhere around 85 (the other three samples fall somewhere between those two). There is also some variation in the later part of the time frame, though it is much less pronounced. In the last third of the time period, the monthly count tops out at around 85 for samples 1-3, while for samples 4 and 5 the maximum is closer to 80.

Despite this variation, the overall shape of the data seems to be consistent - with a spike of volume towards the beginning, before dropping and starting a cyclical growth rhythm.

# Count the rows of `data` per calendar month of its DT_Rsp_DateFMT column.
#
# Returns one row per month, ordered by month, with columns:
#   Month     - DT_Rsp_DateFMT floored to the first instant of its month
#   Instances - number of rows falling in that month
#   month     - `Month` converted to Date, used as the plotting x-axis
#
# The column is referenced by bare name so it is resolved inside `data`
# itself, instead of reaching back out to a specific global data frame.
count_responses_by_month <- function(data) {
  data |>
    mutate(Month = floor_date(DT_Rsp_DateFMT, "month")) |>
    count(Month, name = "Instances") |>
    mutate(month = as.Date(Month)) |>
    arrange(Month)
}

# Bar chart of monthly counts produced by count_responses_by_month().
#
# `monthly_counts` must have `month` and `Instances` columns; `title` is the
# plot title. The y-axis is pinned to 0-125 on every plot so the samples can
# be compared visually on the same scale.
plot_monthly_counts <- function(monthly_counts, title) {
  ggplot(monthly_counts, aes(x = month, y = Instances)) +
    geom_col(fill = "mediumseagreen") +
    labs(
      title = title,
      x = "Month and Year",
      y = "Count of Calls"
    ) +
    theme_minimal() +
    ylim(0, 125)
}

# Plot titles number the samples 1-5, which correspond to wildlife_2
# through wildlife_6 respectively.
wl_2 <- count_responses_by_month(wildlife_2)
plot_monthly_counts(wl_2, "Sample 1 Count of Responses Over Time")

wl_3 <- count_responses_by_month(wildlife_3)
plot_monthly_counts(wl_3, "Sample 2 Count of Responses Over Time")

wl_4 <- count_responses_by_month(wildlife_4)
plot_monthly_counts(wl_4, "Sample 3 Count of Responses Over Time")

wl_5 <- count_responses_by_month(wildlife_5)
plot_monthly_counts(wl_5, "Sample 4 Count of Responses Over Time")

wl_6 <- count_responses_by_month(wildlife_6)
plot_monthly_counts(wl_6, "Sample 5 Count of Responses Over Time")