Intro

This is a quick visualization of dog names in NYC, based on the Kaggle dataset available here: https://www.kaggle.com/new-york-city/nyc-dog-names

Data Clean-up

Let’s take a quick look at the data.

str(dt)
'data.frame':   81542 obs. of  11 variables:
 $ dog_name          : Factor w/ 13803 levels "A.","A.A.","Aaliyah",..: 1718 8761 12 2453 5665 12860 4783 11482 5562 4522 ...
 $ gender            : Factor w/ 3 levels "F","M","n/a": 2 1 1 1 1 2 1 2 2 2 ...
 $ breed             : Factor w/ 138 levels "Afghan Hound",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ birth             : Factor w/ 267 levels "0","2","3","4",..: 91 125 206 93 231 94 147 233 75 98 ...
 $ dominant_color    : Factor w/ 20 levels "APRICOT","BLACK",..: 6 2 2 20 3 3 10 2 20 15 ...
 $ secondary_color   : Factor w/ 20 levels "APRICOT","BLACK",..: 2 14 19 3 20 20 14 20 14 20 ...
 $ third_color       : Factor w/ 20 levels "APRICOT","BLACK",..: 14 14 14 14 2 2 14 13 14 14 ...
 $ spayed_or_neutered: Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 1 1 2 ...
 $ guard_or_trained  : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ borough           : Factor w/ 5 levels "Bronx","Brooklyn",..: 3 3 3 3 3 1 3 3 4 1 ...
 $ zip_code          : int  10003 10021 10034 10024 10022 10472 10021 10023 11354 10469 ...

We have a bit of cleaning up to do: it seems R isn’t recognizing the “n/a” values as missing. We can identify all the affected columns with a custom function.

is_na_f <- function(x) {
  sum(x == "n/a")
}
apply(dt,2,is_na_f)
          dog_name             gender              breed              birth     dominant_color    secondary_color        third_color spayed_or_neutered 
              4025                 62                  0                  0                771              25528              64921                  0 
  guard_or_trained            borough           zip_code 
                 0                  0                  0 

Let’s manually change them.

dt$dog_name <- as.factor(as.character(ifelse(dt$dog_name == 'n/a',NA,as.character(dt$dog_name))))
dt$gender <- as.factor(as.character(ifelse(dt$gender == 'n/a',NA,as.character(dt$gender))))
dt$dominant_color <- as.factor(as.character(ifelse(dt$dominant_color == 'n/a',NA,as.character(dt$dominant_color))))
dt$secondary_color <- as.factor(as.character(ifelse(dt$secondary_color == 'n/a',NA,as.character(dt$secondary_color))))
dt$third_color <- as.factor(as.character(ifelse(dt$third_color == 'n/a',NA,as.character(dt$third_color))))
apply(apply(dt,2,is.na),2,sum)
          dog_name             gender              breed              birth     dominant_color    secondary_color        third_color spayed_or_neutered 
              4025                 62                  0                  0                771              25528              64921                  0 
  guard_or_trained            borough           zip_code 
                 0                  0                  0 

Data exploration

We can visualize a bit to get to know our data better.

plot_gender <- dt %>%
  filter(is.na(gender) == F) %>%
  ggplot(aes(x=gender)) +
  geom_bar(fill = 'light blue', alpha = 1, width = .3) +
  theme_light() +
  labs(title = "Gender Distribution", x= "Gender", y = "Total Count")
  
plot_color <- dt %>%
  filter(is.na(dominant_color) == F) %>%
  group_by(dominant_color) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  ggplot(aes(x=factor(dominant_color,levels= dominant_color), y = count)) +
  geom_bar(fill = 'light blue', alpha = 1, width = .3, stat = 'identity') +
  theme_light() +
  labs(title = "Color Distribution", x= "Dominant Color", y = "Total Count") +
  coord_flip()
plot_gt <- dt %>%
  ggplot(aes(x=guard_or_trained)) +
  geom_bar(fill = 'light blue', alpha = 1, width = .3) +
  theme_light() +
  labs(title = "Guard/Trained Distribution", x= "Guard/Trained", y = "Total Count")
plot_sn <- dt %>%
  ggplot(aes(x=spayed_or_neutered)) +
  geom_bar(fill = 'light blue', alpha = 1, width = .3) +
  theme_light() +
  labs(title = "Spayed/Neutered Distribution", x= "Spayed/Neutered", y = "Total Count")
plot_borough <- dt %>%
  filter(is.na(borough) == F) %>%
  group_by(borough) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  ggplot(aes(x=factor(borough,levels= borough), y = count)) +
  geom_bar(fill = 'light blue', alpha = 1, width = .3, stat = 'identity') +
  theme_light() +
  labs(title = "Borough Distribution", x= "Borough", y = "Total Count")
grid.arrange(plot_gender, plot_borough, plot_gt, plot_sn)

Let’s dig deeper into the differences in dog names based on gender.

plot_m <- dt %>%
  filter(gender == 'M', is.na(dog_name) == F) %>%
  group_by(dog_name) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  top_n(15) %>%
  ggplot(aes(x=factor(dog_name,levels= dog_name),y=count)) +
  geom_bar(fill = 'green', alpha = .8, width = .4, stat = 'identity') +
  theme_light() + 
  coord_flip() +
  labs(title = "Popular Male Names", x = "Name", y = "Count")
Selecting by count
plot_f <- dt %>%
  filter(gender == 'F', is.na(dog_name) == F) %>%
  group_by(dog_name) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  top_n(15) %>%
  ggplot(aes(x=factor(dog_name,levels= dog_name),y=count)) +
  geom_bar(fill = 'green', alpha = .8, width = .4, stat = 'identity') +
  theme_light() + 
  coord_flip()+
  labs(title = "Popular Female Names", x = "Name", y = "Count")
Selecting by count
grid.arrange(plot_m, plot_f)

There are some clear differences in names based on gender. Which names are popular for both males and females?

dt %>%
  filter(is.na(dog_name) == F) %>%
  group_by(dog_name) %>%
  summarize(count = n(),
            count_female = length(gender[gender == 'F']), 
            count_male = length(gender[gender == 'M']),
            male_ratio = (count_male)/ (count_female + count_male),
            diff = abs(male_ratio - .5)
            ) %>%
  filter(count > 50) %>%
  arrange(diff) %>%
  top_n(-15) %>%
  ggplot(aes(x=count_female, y=count_male, fill = dog_name, label = dog_name)) +
  geom_point(alpha = .7, color = 'gray',size = 3) +
  geom_label_repel( fontface = 'bold', color = 'white', segment.color = 'grey50') + 
  theme_light() + 
  labs(title = "Most Popular Gender-Neutral Dog Names", x='Female Occurences', y='Male Occurences')
Selecting by diff

This is quite interesting; for these names, it seems likely that the choice is directly related to the dog’s appearance or personality.

I can take a guess at which borough usually registers dogs named Brooklyn.

dt %>%
  filter(dog_name == 'Brooklyn') %>%
  ggplot(aes(x=borough, fill=gender)) +
  geom_bar( alpha = .8) + 
  theme_light() +
  labs(title = 'Dogs Named Brooklyn', y = 'Count', x = 'Borough')

Actually not as strong of a distribution as I would expect, but this could be evidence of owners traveling to a different borough when they visit a vet or just moving around in general.

Let’s do the same for “Snow” and “Snowy”… I’d expect their colors to tell the story here.

dt %>%
  filter(dog_name == 'Snow' | dog_name == 'Snowy', is.na(dominant_color) == F) %>%
  ggplot(aes(x=dominant_color, fill=gender)) +
  geom_bar( alpha = .8) + 
  theme_light() +
  labs(title = 'Dogs Named Snow/Snowy', y = 'Count', x = 'Color')

Pretty close. I am interested in the black dog named snowy, perhaps it is spotted.

dt %>%
  filter(dog_name == 'Snow' | dog_name == 'Snowy', dominant_color == 'BLACK') %>%
  select(dog_name, gender, breed, dominant_color, secondary_color, third_color)

The secondary color is almost always white, except for one poor chihuahua. Perhaps Snowy was named in irony.

Let’s take a quick look at the breeds.

dt %>%
  filter(is.na(breed) == F) %>%
  group_by(breed) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  top_n(25) %>%
  ggplot(aes(x=factor(breed,levels= breed),y=count)) +
  geom_bar(fill = 'light blue', alpha = .8, width = .4, stat = 'identity') +
  theme_light() + 
  coord_flip()+
  labs(title = "Popular Breeds", x = "Breed", y = "Count")
Selecting by count

There’s a lot of granularity here; this is only the top 25. It is interesting to see the number of mutts in the population: roughly a quarter.

For fun, let’s take a closer look at the most popular names for Golden Retrievers.

dt %>%
  filter(breed == 'Golden Retriever', is.na(dog_name) == F) %>%
  group_by(dog_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(15) %>%
  inner_join(dt) %>%
  filter(breed == 'Golden Retriever') %>%
  ggplot(aes(x=dog_name, fill= dominant_color)) + 
  geom_bar(alpha = .8) +
  theme_light() +
  coord_flip() +
  labs(title = 'Top Golden Retriever Names', x = 'Name', y = 'Count')
Selecting by count
Joining, by = "dog_name"

The names are pretty generic, but it is interesting that “Rusty” is a popular name for the dominant “Rust” colored retrievers. I would be a bit curious if there is selection bias when filling out the color of the dog when its name is already Rusty!

---
title: "Dog Names in NYC"
output: html_notebook
---

# Intro
This is a quick visualization of dog names in NYC, based on the Kaggle dataset available here:
https://www.kaggle.com/new-york-city/nyc-dog-names

```{r}
library(tidyverse)
library(lubridate)
library(gridExtra)
library(ggrepel)
library(xtable)

# Load the dog registry export from the working directory.
# stringsAsFactors is set explicitly because R >= 4.0.0 reads strings as
# character by default, while the str() output discussed in this notebook
# shows the text columns as factors.
dt <- read.csv("Dogs of NYC - WNYC.csv", stringsAsFactors = TRUE)
```
# Data Clean-up

Let's take a quick look at the data.
```{r}
# Print a compact summary of dt: its dimensions plus each column's type and
# first few values.
str(dt)
```
We have a bit of cleaning up to do: it seems R isn't recognizing the "n/a" values as missing. We can identify all the affected columns with a custom function.
```{r}
# Count how many entries of a vector equal the literal placeholder "n/a".
#
# x: a vector (character, factor, or anything comparable to a string).
# Returns a single integer. Genuine NA entries are skipped (na.rm = TRUE), so
# a column that already contains NA yields a count instead of NA.
is_na_f <- function(x) {
  sum(x == "n/a", na.rm = TRUE)
}

# Run the counter on every column. vapply() iterates over the data frame's
# columns directly (apply() would first coerce the whole frame to a character
# matrix) and enforces exactly one integer result per column.
vapply(dt, is_na_f, integer(1))
```
Let's manually change them.
```{r}
# Columns in which missing values are recorded as the literal string "n/a".
na_cols <- c("dog_name", "gender", "dominant_color", "secondary_color",
             "third_color")

# Replace the "n/a" placeholder with a real NA in each of those columns, then
# rebuild the factor so that "n/a" does not remain as a level.
dt[na_cols] <- lapply(dt[na_cols], function(col) {
  values <- as.character(col)
  # %in% never yields NA, so entries that are already NA are left untouched.
  values[values %in% "n/a"] <- NA_character_
  factor(values)
})

# Number of NA values per column. is.na() on a data frame returns a logical
# matrix, and colSums() totals it column by column.
colSums(is.na(dt))
```
# Data exploration
We can visualize a bit to get to know our data better.
```{r}
# Bar chart of the number of dogs per gender; rows with a missing gender are
# left out.
plot_gender <- dt %>%
  filter(!is.na(gender)) %>%
  ggplot(aes(x = gender)) +
  geom_bar(fill = "light blue", alpha = 1, width = 0.3) +
  theme_light() +
  labs(title = "Gender Distribution", x = "Gender", y = "Total Count")
  
# Horizontal bar chart of the number of dogs per dominant color, ordered by
# count. Rows with a missing dominant color are left out.
# NOTE(review): this object is not passed to the grid.arrange() call in this
# chunk, so it is built here but not displayed by it.
plot_color <- dt %>%
  filter(!is.na(dominant_color)) %>%
  group_by(dominant_color) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  # Re-level in the sorted row order so the axis follows the counts rather
  # than the alphabet.
  ggplot(aes(x = factor(dominant_color, levels = dominant_color), y = count)) +
  geom_bar(fill = "light blue", alpha = 1, width = 0.3, stat = "identity") +
  theme_light() +
  labs(title = "Color Distribution", x = "Dominant Color", y = "Total Count") +
  coord_flip()

# Bar chart of how many dogs fall into each guard_or_trained category.
plot_gt <- ggplot(data = dt, mapping = aes(x = guard_or_trained)) +
  geom_bar(width = 0.3, alpha = 1, fill = "light blue") +
  labs(
    title = "Guard/Trained Distribution",
    x = "Guard/Trained",
    y = "Total Count"
  ) +
  theme_light()

# Bar chart of how many dogs fall into each spayed_or_neutered category.
plot_sn <- ggplot(data = dt, mapping = aes(x = spayed_or_neutered)) +
  geom_bar(width = 0.3, alpha = 1, fill = "light blue") +
  labs(
    title = "Spayed/Neutered Distribution",
    x = "Spayed/Neutered",
    y = "Total Count"
  ) +
  theme_light()

# Bar chart of the number of dogs per borough, ordered by ascending count.
# Rows with a missing borough are left out.
plot_borough <- dt %>%
  filter(!is.na(borough)) %>%
  group_by(borough) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  # Re-level in the sorted row order so the bars follow the counts.
  ggplot(aes(x = factor(borough, levels = borough), y = count)) +
  geom_bar(fill = "light blue", alpha = 1, width = 0.3, stat = "identity") +
  theme_light() +
  labs(title = "Borough Distribution", x = "Borough", y = "Total Count")

# Lay out the gender, borough, guard/trained and spayed/neutered charts in a
# single figure. plot_color is not part of this call.
grid.arrange(plot_gender, plot_borough, plot_gt, plot_sn)
```
Let's dig deeper into the differences in dog names based on gender.
```{r}
# Build a horizontal bar chart of the most common dog names for one gender.
#
# data:        data frame with `gender` and `dog_name` columns.
# gender_code: value of `gender` to keep (e.g. "M" or "F").
# plot_title:  title shown above the chart.
# n_names:     how many top names to keep; top_n() retains every name tied at
#              the cut-off, so a few more bars than n_names can appear.
# Returns a ggplot object.
plot_top_names <- function(data, gender_code, plot_title, n_names = 15) {
  data %>%
    filter(gender == gender_code, !is.na(dog_name)) %>%
    group_by(dog_name) %>%
    summarize(count = n()) %>%
    arrange(count) %>%
    # Name the ranking column explicitly instead of relying on top_n()'s
    # "last column" default, which also prints "Selecting by count".
    top_n(n_names, count) %>%
    # Re-level in the sorted row order so the bars follow the counts.
    ggplot(aes(x = factor(dog_name, levels = dog_name), y = count)) +
    geom_bar(fill = "green", alpha = 0.8, width = 0.4, stat = "identity") +
    theme_light() +
    coord_flip() +
    labs(title = plot_title, x = "Name", y = "Count")
}

plot_m <- plot_top_names(dt, "M", "Popular Male Names")
plot_f <- plot_top_names(dt, "F", "Popular Female Names")

# Show the male and female charts together in one figure.
grid.arrange(plot_m, plot_f)
```
There are some clear differences in names based on gender. Which names are popular for both males and females?
```{r}
# Scatter plot of the 15 names (with more than 50 dogs) whose male share is
# closest to 50%, labelled by name.
dt %>%
  filter(!is.na(dog_name)) %>%
  group_by(dog_name) %>%
  summarize(
    count = n(),
    # sum(..., na.rm = TRUE) skips dogs whose gender is missing. Subsetting
    # with gender[gender == "F"] keeps those NA entries, so they would be
    # counted as both female and male.
    count_female = sum(gender == "F", na.rm = TRUE),
    count_male = sum(gender == "M", na.rm = TRUE),
    male_ratio = count_male / (count_female + count_male),
    # Distance from an even split; 0 means exactly half male, half female.
    diff = abs(male_ratio - 0.5)
  ) %>%
  filter(count > 50) %>%
  arrange(diff) %>%
  # Negative n keeps the smallest values; the ranking column is named
  # explicitly instead of relying on top_n()'s "last column" default.
  top_n(-15, diff) %>%
  ggplot(aes(x = count_female, y = count_male, fill = dog_name, label = dog_name)) +
  geom_point(alpha = 0.7, color = "gray", size = 3) +
  geom_label_repel(fontface = "bold", color = "white", segment.color = "grey50") +
  theme_light() +
  labs(title = "Most Popular Gender-Neutral Dog Names", x = "Female Occurences", y = "Male Occurences")
```
This is quite interesting; for these names, it seems likely that the choice is directly related to the dog's appearance or personality.

I can take a guess at which borough usually registers dogs named Brooklyn.
```{r}
# Stacked bar chart: dogs named "Brooklyn" per borough, split by gender.
ggplot(
  data = filter(dt, dog_name == "Brooklyn"),
  mapping = aes(x = borough, fill = gender)
) +
  geom_bar(alpha = 0.8) +
  labs(title = "Dogs Named Brooklyn", y = "Count", x = "Borough") +
  theme_light()
```
Actually not as strong of a distribution as I would expect, but this could be evidence of owners traveling to a different borough when they visit a vet or just moving around in general.

Let's do the same for "Snow" and "Snowy"... I'd expect their colors to tell the story here.
```{r}
# Stacked bar chart: dogs named "Snow" or "Snowy" per dominant color, split
# by gender. Rows with a missing dominant color are left out.
dt %>%
  filter(dog_name %in% c("Snow", "Snowy"), !is.na(dominant_color)) %>%
  ggplot(aes(x = dominant_color, fill = gender)) +
  geom_bar(alpha = 0.8) +
  theme_light() +
  labs(title = "Dogs Named Snow/Snowy", y = "Count", x = "Color")
```
Pretty close. I am interested in the black dog named snowy, perhaps it is spotted.
```{r}
# List the dogs named "Snow" or "Snowy" whose dominant color is black,
# showing only the name, gender, breed and the three color columns.
snow_names <- c("Snow", "Snowy")
select(
  filter(dt, dog_name %in% snow_names, dominant_color == "BLACK"),
  dog_name, gender, breed, dominant_color, secondary_color, third_color
)
```
The secondary color is almost always white, except for one poor chihuahua. Perhaps Snowy was named in irony.

Let's take a quick look at the breeds.
```{r}
# Horizontal bar chart of the 25 most common breeds, ordered by count.
dt %>%
  filter(!is.na(breed)) %>%
  group_by(breed) %>%
  summarize(count = n()) %>%
  arrange(count) %>%
  # Name the ranking column explicitly instead of relying on top_n()'s
  # "last column" default, which also prints "Selecting by count".
  top_n(25, count) %>%
  # Re-level in the sorted row order so the bars follow the counts.
  ggplot(aes(x = factor(breed, levels = breed), y = count)) +
  geom_bar(fill = "light blue", alpha = 0.8, width = 0.4, stat = "identity") +
  theme_light() +
  coord_flip() +
  labs(title = "Popular Breeds", x = "Breed", y = "Count")
```
There's a lot of granularity here; this is only the top 25. It is interesting to see the number of mutts in the population: roughly a quarter.

For fun, let's take a closer look at the most popular names for Golden Retrievers.
```{r}
# Stacked bar chart of the most common Golden Retriever names, with each bar
# split by the dogs' dominant color.
dt %>%
  filter(breed == "Golden Retriever", !is.na(dog_name)) %>%
  group_by(dog_name) %>%
  summarize(count = n()) %>%
  arrange(desc(count)) %>%
  # Name the ranking column explicitly instead of relying on top_n()'s
  # "last column" default.
  top_n(15, count) %>%
  # Join back to the full data to recover per-dog columns for the top names.
  # The key is spelled out so the join cannot silently pick up other shared
  # columns.
  inner_join(dt, by = "dog_name") %>%
  # The join brings in every dog with one of these names, so restrict to
  # Golden Retrievers again.
  filter(breed == "Golden Retriever") %>%
  ggplot(aes(x = dog_name, fill = dominant_color)) +
  geom_bar(alpha = 0.8) +
  theme_light() +
  coord_flip() +
  labs(title = "Top Golden Retriever Names", x = "Name", y = "Count")
```
The names are pretty generic, but it is interesting that "Rusty" is a popular name for the dominant "Rust" colored retrievers. I would be a bit curious if there is selection bias when filling out the color of the dog when its name is already Rusty!