Load the Dataset

seattle_pets <- readr::read_csv("https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-26/seattle_pets.csv") %>%
  mutate(license_issue_date = mdy(license_issue_date)) %>%
  rename(animal_name = animals_name)

Analyzing Common Cat and Dog Breeds in Seattle

seattle_pets %>%
  filter(license_issue_date >= "2017-01-01") %>%
  count(species, primary_breed, sort = TRUE) %>%
  # sorted descending by n (count)
  filter(species %in% c("Cat", "Dog")) %>%
  # filtered out 'goat' and 'pig' (filter %>% count(species) only showed 4 types)
  mutate(percent = n / sum(n)) %>%
  # created new column, percentage of breed over total animal count)
  group_by(species) %>%
  top_n(10, percent) %>%
  # shows top 10 'percent' entries for each species (cat and dog); 20 rows total
  ungroup() %>%
  mutate(primary_breed = fct_reorder(primary_breed, percent)) %>%
  # changed the plot to descending order of 'primary_breed' (y axis) by 'percent' (x-axis). Without it, the descending order is organized alphabetically by 'primary_breed' (looks messy)
  ggplot(aes(primary_breed, percent, fill = species)) +
  geom_col(show.legend = FALSE) +
  scale_y_continuous(labels = scales::percent_format()) +
  # this sets values for y axis by percentage using 'percent_format'. 
  # Without this line, y axis would only be in decimal format.
  facet_wrap(~ species, scales = "free_y", ncol = 1) +
  coord_flip() +
  labs(x = "Primary breed",
       y = "% of this species",
       title = "Most common cat and dog breeds",
       subtitle = "Of licensed pets in Seattle 2017-2018")

Discussion

The graph shows a facet plot of the most common cat and dog breeds in Seattle (2017-2018 data). Using the #fct_reorder code, it organized the plot of the category ‘primary_breed’ by descending order of ‘percent’. Using #top_n(10, percent) was helpful in selecting the top entries based on the ‘percent’ variable. Using #scale_y_continuous(labels = scales::percent_format()) scaled the y-axis of the plot to the unit chosen (in this case, Y-axis is “% of this species” so Robinson used the percentage unit for values, making it easier to read than the default decimal values).

Hypergeometric P-Value Histogram

dogs <- seattle_pets %>%
  filter(species == "Dog")

name_counts <- dogs %>%
  group_by(animal_name) %>%
  summarize(name_total = n()) %>%
  # total of each name in species of dog
  filter(name_total >= 100)
  # chooses names that have 100 or more dogs with that name

breed_counts <- dogs %>%
  group_by(primary_breed) %>%
  summarize(breed_total = n()) %>%
  filter(breed_total >= 200)
  # chooses breeds that have 100 or more dogs within them

total_dogs <- nrow(dogs)
# if you wanted to find total count of only cats: 
# seattle_pets %>% 
# filter (species =="Cat") %>% nrow()

name_breed_counts <- dogs %>%
  count(primary_breed, animal_name) %>%
  complete(primary_breed, animal_name, fill = list(n = 0)) %>%
  # creates new tibble where columns are only "primary_breed", "animal_name" and count (n). 
  # The tibble only counts those breed and animal name that have a count of zero (n=0). 
  inner_join(name_counts, by = "animal_name") %>%
  inner_join(breed_counts, by = "primary_breed")
  # adds the two earlier tibbles created "name_counts" and "breed_counts" by matching them to the categories "animal_name" and "primary_breed" respectively.

# One-sided hypergeometric p-value
hypergeom_test <- name_breed_counts %>%
  mutate(percent_of_breed = n / breed_total,
         percent_overall = name_total / total_dogs) %>%
  mutate(overrepresented_ratio = percent_of_breed / percent_overall) %>%
  # creating new variables using mutate, then uses new variables to create another new variable (#overrepresented_ratio). 
  arrange(desc(overrepresented_ratio)) %>%
  mutate(hypergeom_p_value = 1 - phyper(n, name_total, total_dogs - name_total, breed_total),
         holm_p_value = p.adjust(hypergeom_p_value),
         fdr = p.adjust(hypergeom_p_value, method = "fdr"))
# fdr = false discovery rate, or false positives.

hypergeom_test %>%
  filter(fdr < .05)
## # A tibble: 6 × 11
##   primary_breed            animal_name     n name_total breed_total percent_of_breed
##   <chr>                    <chr>       <dbl>      <int>       <int>            <dbl>
## 1 Havanese                 Oscar           8        100         466          0.0172 
## 2 Spaniel, American Cocker Milo            7        120         362          0.0193 
## 3 Boxer                    Rocky           8        113         485          0.0165 
## 4 Pug                      Zoe             8        117         554          0.0144 
## 5 Beagle                   Lucy           19        337         542          0.0351 
## 6 Retriever, Labrador      Scout          34        127        4867          0.00699
## # … with 5 more variables: percent_overall <dbl>, overrepresented_ratio <dbl>,
## #   hypergeom_p_value <dbl>, holm_p_value <dbl>, fdr <dbl>
# control false positives under 5%

ggplot(hypergeom_test, aes(hypergeom_p_value)) +
  geom_histogram(binwidth = .1) +
  labs(x = "One-sided hypergeometric p-values for overrepresented name")

Discussion

This chunk creates a histogram plot to represent the one-sided hypergeometric p-value of this dataset (showing no under-representation of dog breed/name combinations). Robinson creates a new tibble by using #inner_join to match up the “name_counts” and “breed_counts” data (picked out at the beginning of this chunk) with the categories “animal_name” and “primary_breed” respectively. During this part, #complete(primary_breed, animal_name, fill = list(n = 0)) is used to turn implicit missing values into explicit missing values (in this case, the dog species/name combinations that have zero counts e.g. ‘Australian cattle dog’ named ‘Coco’ = 0).

Next, Robinson uses “fdr” (false discovery rate) to control for lower proportions of false positives in the dataset by keeping this rate under 5%. 6 breed/name combos are then listed, indicating that about 5% of them will draw incorrect conclusions (be over-represented):

  • Havanese, Oscar

  • American Cocker Spaniel, Milo

  • Boxer, Rocky

  • Pug, Zoe

  • Beagle, Lucy

  • Labrador Retriever, Scout

Helpful Link: Robinson’s blog on how to interpret p-value histogram

Exploring a Potential Explanation for P-Value Underrepresentation

crossing(name_total = c(100, 200, 300),
         breed_total = seq(200, 1000, 25)) %>%
# used to create all possible combinations of hypothetical values 
  mutate(max_p_value = 1 - phyper(0, name_total, total_dogs - name_total, breed_total)) %>%
# hypergeometric p value of dataset
  ggplot(aes(breed_total, max_p_value, color = factor(name_total))) +
  geom_line() +
  labs(x = "Total # of dogs in breed",
       y = "Maximum one-sided p-value",
       color = "# of dogs with name")

Discussion

This chunk describes a potential explanation as to why the previous histogram does not accurately display under-represented dog names for each dog breed. #Crossing is used to generate all combinations of variables between “name_total” and “breed_total”, given different hypothetical values for each category.

Robinson glosses over what this graph indicates, however it seems to show that p-values drawn from the dataset are relatively high for dog name/breed combinations that have a low count.

Summary of Hypergeometric Test

library(scales)

hypergeom_test %>%
  filter(fdr <= .05) %>%
  arrange(fdr) %>%
  transmute(`Breed` = primary_breed,
            `Name` = animal_name,
            `# of dogs with name` = n,
            `% of breed` = percent(percent_of_breed),
            `% overall` = percent(percent_overall),
            `FDR-adjusted one-sided p-value` = fdr) %>%
  knitr::kable()
Breed Name # of dogs with name % of breed % overall FDR-adjusted one-sided p-value
Beagle Lucy 19 3.506% 0.9579% 0.0005395
Havanese Oscar 8 1.717% 0.2842% 0.0056380
Spaniel, American Cocker Milo 7 1.934% 0.3411% 0.0107690
Boxer Rocky 8 1.649% 0.3212% 0.0107690
Retriever, Labrador Scout 34 0.699% 0.3610% 0.0107690
Pug Zoe 8 1.444% 0.3326% 0.0252227

Discussion

This chunk creates a table to display the count, percent of breed, percent overall and FDR values of the previous 6 dog breed/name combos. #transmute is used to include the new variables previously created by #mutate (“percent_of_breed”, “percent_overall” and “fdr”) and associates their values to each breed/name combination.