Part 1: Exploratory Data Analysis

The data we’ll be using for this part of the project were downloaded from Kaggle, and include information about “pet licenses issued by the Seattle Animal Shelter between 2005 and early 2017.” We’ll be exploring these data and generating a few exploratory plots in the first part of the project.

The Data

First, we’ll read the data in from our data/raw_data directory.

## load the raw Seattle pet license records, keeping text columns as character
pets <- read.csv(
  file = "dataviz_project/data/raw_data/seattle_pet_licenses.csv",
  stringsAsFactors = FALSE
)

Explore the Data

## preview the first six records of the pet license data
head(pets, n = 6L)
##   animal_s_name      license_issue_date license_number
## 1          Ozzy 2005-03-29T00:00:00.000         130651
## 2          Jack 2009-12-23T00:00:00.000         898148
## 3        Ginger 2006-01-20T00:00:00.000          29654
## 4        Pepper 2006-02-07T00:00:00.000          75432
## 5          Addy 2006-08-04T00:00:00.000         729899
## 6        Rustie 2007-07-24T00:00:00.000         437433
##                       primary_breed     secondary_breed species zip_code
## 1 Dachshund, Standard Smooth Haired                         Dog    98104
## 2              Schnauzer, Miniature        Terrier, Rat     Dog    98107
## 3                 Retriever, Golden Retriever, Labrador     Dog    98117
## 4                              Manx                 Mix     Cat    98103
## 5                 Retriever, Golden                         Dog    98105
## 6 Dachshund, Standard Smooth Haired                         Dog    98108
## compact overview of the data: one line per column with its type
pets %>% glimpse()
## Observations: 66,042
## Variables: 7
## $ animal_s_name      <chr> "Ozzy", "Jack", "Ginger", "Pepper", "Addy",...
## $ license_issue_date <chr> "2005-03-29T00:00:00.000", "2009-12-23T00:0...
## $ license_number     <int> 130651, 898148, 29654, 75432, 729899, 43743...
## $ primary_breed      <chr> "Dachshund, Standard Smooth Haired", "Schna...
## $ secondary_breed    <chr> "", "Terrier, Rat", "Retriever, Labrador", ...
## $ species            <chr> "Dog", "Dog", "Dog", "Cat", "Dog", "Dog", "...
## $ zip_code           <chr> "98104", "98107", "98117", "98103", "98105"...

Visualize the Data

## bar chart of the number of license records per species,
## with each bar colored by its species
ggplot(data = pets, mapping = aes(x = species, fill = species)) +
  geom_bar() +
  theme_classic()

## Table: the most common cat names (records with a blank name are skipped)
pets %>%
  filter(species == "Cat", animal_s_name != "") %>%
  ## one row per name with its frequency in column `n`
  group_by(animal_s_name) %>%
  tally() %>%
  ## most frequent names first, then keep the top 5 by `n`
  arrange(desc(n)) %>%
  top_n(5) %>%
  knitr::kable(caption = "Top 5 Cat Names in Seattle")
## Selecting by n
Top 5 Cat Names in Seattle
animal_s_name n
Lucy 150
Max 120
Luna 119
Bella 113
Oliver 108
## Table: the most common dog names (records with a blank name are skipped)
pets %>%
  filter(species == "Dog", animal_s_name != "") %>%
  ## one row per name with its frequency in column `n`
  group_by(animal_s_name) %>%
  tally() %>%
  ## most frequent names first, then keep the top 5 by `n`
  arrange(desc(n)) %>%
  top_n(5) %>%
  knitr::kable(caption = "Top 5 Dog Names in Seattle")
## Selecting by n
Top 5 Dog Names in Seattle
animal_s_name n
Lucy 416
Charlie 348
Bella 338
Buddy 256
Daisy 256
## add date and ym columns
## `date`: the license issue timestamp string parsed into a date-time
pets$date <- lubridate::ymd_hms(pets$license_issue_date)
## `ym`: the year-month of each license. `date` is already a parsed
## date-time, so no format string is passed; a format is only meaningful
## when as.yearmon() has to parse character input.
pets$ym <- as.yearmon(pets$date)

## line chart of how many licenses were recorded in each year-month
pets %>%
  ## one row per year-month (`ym`) with its record count in `n`
  group_by(ym) %>%
  tally() %>%
  ggplot(mapping = aes(x = ym, y = n)) +
  ## draw the monthly counts as a single red line
  geom_line(colour = "red") +
  ## year-month aware x axis
  scale_x_yearmon() +
  ## no x-axis title; descriptive y-axis title
  labs(x = "", y = "Number of licenses recorded") +
  theme_classic()

## distribution of license issue dates for each species:
## box plots with the mean issue date of each group marked by a red point
ggplot(pets, aes(x = species, y = date, fill = species)) +
  geom_boxplot(alpha = 0.4) +
  ## NOTE: `fun.y` is deprecated in favor of `fun` from ggplot2 3.3.0 on;
  ## it is kept here so the code still runs on older ggplot2 releases
  stat_summary(fun.y = mean,
               geom = "point",
               shape = 20,
               size = 10,
               color = "red",
               fill = "red") +
  scale_fill_brewer(palette = "Set1") +
  ## the complete theme must come first: adding theme_classic() after
  ## theme() would overwrite the legend setting and bring the legend back
  theme_classic() +
  theme(legend.position = "none")

Part 2: Explanatory Data Analysis

The data used in this part of the project were downloaded from FiveThirtyEight - steak-survey. They were originally used in the article: How Americans Like Their Steak. The goal of this part of the project will be to recreate the data visualization used in this article.

The Data

## read in the data
steak <- read.csv("dataviz_project/data/raw_data/steak-risk-survey.csv", stringsAsFactors = FALSE) %>%
  ## drop the first row, which just includes the word "Response" in each
  ## column; a negative index avoids the descending sequence that 2:n()
  ## produces when there are fewer than two rows
  slice(-1)

Explore the Data

## compact overview of the survey data: one line per column with its type
steak %>% glimpse()
## Observations: 550
## Variables: 15
## $ RespondentID                                                                                                                                                                                                                                                                      <dbl> ...
## $ Consider.the.following.hypothetical.situations...br.In.Lottery.A..you.have.a.50..chance.of.success..with.a.payout.of..100...br.In.Lottery.B..you.have.a.90..chance.of.success..with.a.payout.of..20...br..br.Assuming.you.have..10.to.bet..would.you.play.Lottery.A.or.Lottery.B. <chr> ...
## $ Do.you.ever.smoke.cigarettes.                                                                                                                                                                                                                                                     <chr> ...
## $ Do.you.ever.drink.alcohol.                                                                                                                                                                                                                                                        <chr> ...
## $ Do.you.ever.gamble.                                                                                                                                                                                                                                                               <chr> ...
## $ Have.you.ever.been.skydiving.                                                                                                                                                                                                                                                     <chr> ...
## $ Do.you.ever.drive.above.the.speed.limit.                                                                                                                                                                                                                                          <chr> ...
## $ Have.you.ever.cheated.on.your.significant.other.                                                                                                                                                                                                                                  <chr> ...
## $ Do.you.eat.steak.                                                                                                                                                                                                                                                                 <chr> ...
## $ How.do.you.like.your.steak.prepared.                                                                                                                                                                                                                                              <chr> ...
## $ Gender                                                                                                                                                                                                                                                                            <chr> ...
## $ Age                                                                                                                                                                                                                                                                               <chr> ...
## $ Household.Income                                                                                                                                                                                                                                                                  <chr> ...
## $ Education                                                                                                                                                                                                                                                                         <chr> ...
## $ Location..Census.Region.                                                                                                                                                                                                                                                          <chr> ...

Wrangle the Data

## wrangle the steak data set: count and proportion of respondents
## for each steak preparation preference
pref <- steak %>%
  ## turn the answer into a factor with a fixed level order; any answer that
  ## is not one of these five levels (including a blank answer) becomes NA
  mutate(steak_pref = factor(How.do.you.like.your.steak.prepared.,
                             levels = c("Well",
                                        "Medium Well",
                                        "Medium",
                                        "Medium rare",
                                        "Rare"))) %>%
  ## keep only respondents with one of the five answers; after the factor
  ## conversion the unanswered rows are NA, never "", so test for NA directly
  filter(!is.na(steak_pref)) %>%
  ## number of respondents per preference
  group_by(steak_pref) %>%
  summarise(n = n()) %>%
  ## share of all remaining respondents
  mutate(prop = n / sum(n))

Visualize the Data

## generate the plot: horizontal bar chart of the share of respondents
## for each steak preparation preference
p <- ggplot(pref, aes(x = steak_pref, y = prop)) +
  ## bars whose lengths are the precomputed proportions in `prop`
  geom_col(aes(fill = steak_pref), width = 0.7) +
  ## text labels next to each bar, rounded to the nearest whole percent
  ## (as.integer() would truncate, e.g. 30.6 -> 30 instead of 31)
  geom_text(aes(label = paste0(round(prop * 100), "%")),
            hjust = -0.2,
            size = 5,
            color = "grey40") +
  ## flip coordinates to make the bars horizontal
  coord_flip() +
  ## change the colors of the bars; the bars are colored through the `fill`
  ## aesthetic, so the palette has to be supplied to the fill scale
  scale_fill_manual(values = c("#993300",
                               "#663300",
                               "#CC6600",
                               "#FF9933",
                               "#CC0033")) +
  ## change the scale/labels of the steak-wellness variable (x-axis);
  ## labels are matched to the factor levels by name
  scale_x_discrete(limits = levels(fct_rev(pref$steak_pref)),
                   labels = c("Well" = "Well",
                              "Medium Well" = "Medium-well",
                              "Medium" = "Medium",
                              "Medium rare" = "Medium-rare",
                              "Rare" = "Rare")) +
  ## change the scale/labels of the percent axis (y-axis);
  ## expand is c(lower mult, lower add, upper mult, upper add): no padding
  ## except 0.035 of extra room past the longest bar for its text label
  scale_y_continuous(labels = scales::percent,
                     expand = c(0, 0, 0, 0.035)) +
  ## change the title, subtitle, and caption
  labs(title = "How Do You Like Your Steak Prepared?",
       subtitle = "From a survey of 432 steak-eating Americans",
       caption = "FiveThirtyEight: Surveymonkey") +
  ## change the theme (use ggthemes)
  theme_fivethirtyeight() +
  ## fine tune the theme (must come after the complete theme above)
  theme(axis.text = element_text(size = 12),
        title = element_text(size = 14),
        legend.position = "none",
        plot.caption = element_text(size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_blank())

## display the finished plot
p

Save the Plot

## write the finished plot `p` as an 8 x 4 PNG into the
## dataviz_project/figures/explanatory_figures directory
ggsave(
  filename = "steak_R.png",
  plot = p,
  path = "dataviz_project/figures/explanatory_figures",
  width = 8,
  height = 4
)