The data we’ll be using for this part of the project were downloaded from Kaggle and include information about “pet licenses issued by the Seattle Animal Shelter between 2005 and early 2017.” We’ll explore these data and generate a few exploratory plots in the first part of the project.
First, we’ll read the data in from our data/raw_data directory.
## load the raw licence records, keeping text columns as character vectors
pets <- read.csv(
  file = "dataviz_project/data/raw_data/seattle_pet_licenses.csv",
  stringsAsFactors = FALSE
)
## Include code here to explore the data
## first few rows of the table
head(pets)
## animal_s_name license_issue_date license_number
## 1 Ozzy 2005-03-29T00:00:00.000 130651
## 2 Jack 2009-12-23T00:00:00.000 898148
## 3 Ginger 2006-01-20T00:00:00.000 29654
## 4 Pepper 2006-02-07T00:00:00.000 75432
## 5 Addy 2006-08-04T00:00:00.000 729899
## 6 Rustie 2007-07-24T00:00:00.000 437433
## primary_breed secondary_breed species zip_code
## 1 Dachshund, Standard Smooth Haired Dog 98104
## 2 Schnauzer, Miniature Terrier, Rat Dog 98107
## 3 Retriever, Golden Retriever, Labrador Dog 98117
## 4 Manx Mix Cat 98103
## 5 Retriever, Golden Dog 98105
## 6 Dachshund, Standard Smooth Haired Dog 98108
## compact column-by-column overview of the licence table
pets %>% glimpse()
## Observations: 66,042
## Variables: 7
## $ animal_s_name <chr> "Ozzy", "Jack", "Ginger", "Pepper", "Addy",...
## $ license_issue_date <chr> "2005-03-29T00:00:00.000", "2009-12-23T00:0...
## $ license_number <int> 130651, 898148, 29654, 75432, 729899, 43743...
## $ primary_breed <chr> "Dachshund, Standard Smooth Haired", "Schna...
## $ secondary_breed <chr> "", "Terrier, Rat", "Retriever, Labrador", ...
## $ species <chr> "Dog", "Dog", "Dog", "Cat", "Dog", "Dog", "...
## $ zip_code <chr> "98104", "98107", "98117", "98103", "98105"...
## bar chart of the number of records per species,
## one bar (and one fill colour) per species
ggplot(pets, aes(x = species, fill = species)) +
  geom_bar() +
  theme_classic()
## Table: Most frequent Cat Name
## keep named cats only, tally each name, and show the five most common
pets %>%
  filter(species == "Cat", animal_s_name != "") %>%
  count(animal_s_name) %>%
  arrange(desc(n)) %>%
  top_n(5) %>%
  knitr::kable(caption = "Top 5 Cat Names in Seattle")
## Selecting by n
animal_s_name | n |
---|---|
Lucy | 150 |
Max | 120 |
Luna | 119 |
Bella | 113 |
Oliver | 108 |
## Table: Most frequent Dog Name
## keep named dogs only, tally each name, and show the five most common
pets %>%
  filter(species == "Dog", animal_s_name != "") %>%
  count(animal_s_name) %>%
  arrange(desc(n)) %>%
  top_n(5) %>%
  knitr::kable(caption = "Top 5 Dog Names in Seattle")
## Selecting by n
animal_s_name | n |
---|---|
Lucy | 416 |
Charlie | 348 |
Bella | 338 |
Buddy | 256 |
Daisy | 256 |
## add date and ym columns
## `date`: the ISO-style issue timestamp string parsed into a date-time
pets$date <- lubridate::ymd_hms(pets$license_issue_date)
## `ym`: the year-month of `date`; no format string is passed because `date`
## is already a date-time (a format only applies when converting text)
pets$ym <- as.yearmon(pets$date)
## how the number of licenses recorded has changed over time
pets %>%
  ## one row per year-month (`ym`) with the record count in `n`
  count(ym) %>%
  ggplot(aes(x = ym, y = n)) +
  ## line chart of the monthly counts
  geom_line(colour = "red") +
  scale_x_yearmon() +
  ## blank x-axis title, descriptive y-axis title
  labs(x = "", y = "Number of licenses recorded") +
  theme_classic()
### Explore the data
# Average value of each group using the stat_summary function
## box plot of licence issue dates per species, with each group's mean
## overlaid as a large red point
ggplot(pets, aes(x = species, y = date, fill = species)) +
  geom_boxplot(alpha = 0.4) +
  ## NOTE: `fun.y` is deprecated in ggplot2 >= 3.3.0 in favour of `fun`;
  ## it is kept here so the code still runs on older installations
  stat_summary(fun.y = mean, geom = "point", shape = 20, size = 10,
               color = "red", fill = "red") +
  scale_fill_brewer(palette = "Set1") +
  ## apply the complete theme first: theme_classic() replaces every theme
  ## element, so the legend must be switched off *after* it to stay hidden
  theme_classic() +
  theme(legend.position = "none")
The data used in this part of the project were downloaded from FiveThirtyEight (steak-survey). They were originally used in the article “How Americans Like Their Steak.” The goal of this part of the project is to recreate the data visualization used in that article.
## read in the data
steak <- read.csv(
  "dataviz_project/data/raw_data/steak-risk-survey.csv",
  stringsAsFactors = FALSE
) %>%
  ## remove first row which just includes the word "Response" in each column;
  ## a negative index is used because 2:n() counts backwards (2, 1) when only
  ## one row is present and would then keep the row it is meant to drop
  slice(-1)
## Include code here to explore the data
## compact column-by-column overview of the survey responses
steak %>% glimpse()
## Observations: 550
## Variables: 15
## $ RespondentID <dbl> ...
## $ Consider.the.following.hypothetical.situations...br.In.Lottery.A..you.have.a.50..chance.of.success..with.a.payout.of..100...br.In.Lottery.B..you.have.a.90..chance.of.success..with.a.payout.of..20...br..br.Assuming.you.have..10.to.bet..would.you.play.Lottery.A.or.Lottery.B. <chr> ...
## $ Do.you.ever.smoke.cigarettes. <chr> ...
## $ Do.you.ever.drink.alcohol. <chr> ...
## $ Do.you.ever.gamble. <chr> ...
## $ Have.you.ever.been.skydiving. <chr> ...
## $ Do.you.ever.drive.above.the.speed.limit. <chr> ...
## $ Have.you.ever.cheated.on.your.significant.other. <chr> ...
## $ Do.you.eat.steak. <chr> ...
## $ How.do.you.like.your.steak.prepared. <chr> ...
## $ Gender <chr> ...
## $ Age <chr> ...
## $ Household.Income <chr> ...
## $ Education <chr> ...
## $ Location..Census.Region. <chr> ...
## wrangle the steak data set
## result: one row per doneness level with its count `n` and share `prop`
pref <- steak %>%
  ## ordered doneness factor; any answer not listed in `levels`
  ## (including blank responses) becomes NA
  mutate(steak_pref = factor(How.do.you.like.your.steak.prepared.,
                             levels = c("Well",
                                        "Medium Well",
                                        "Medium",
                                        "Medium rare",
                                        "Rare"))) %>%
  ## drop respondents without a recognised preference; testing for NA
  ## directly, since blanks are already NA after the factor() call above
  filter(!is.na(steak_pref)) %>%
  ## count respondents per doneness level
  count(steak_pref) %>%
  ## share of all remaining respondents
  mutate(prop = n / sum(n))
## generate the plot
p <- ggplot(pref, aes(x = steak_pref, y = prop)) +
  ## bar chart of the pre-computed proportions, one fill colour per level
  geom_col(aes(fill = steak_pref), width = 0.7) +
  ## percentage labels just past the end of each bar; round() is used so a
  ## share such as 30.6% is labelled 31% rather than truncated to 30%
  geom_text(aes(label = paste0(round(prop * 100), "%")),
            hjust = -0.2,
            size = 5,
            color = "grey40") +
  ## flip coordinates to make a horizontal bar chart
  coord_flip() +
  ## change the colors of the bars; the bars map `fill`, so the palette has
  ## to go on the fill scale (a colour scale would leave the defaults)
  scale_fill_manual(values = c("#993300",
                               "#663300",
                               "#CC6600",
                               "#FF9933",
                               "#CC0033")) +
  ## change the scale/labels of the steak-wellness variable (x-axis):
  ## reverse the level order and hyphenate two of the labels; levels that
  ## are not named here keep their own text
  scale_x_discrete(limits = levels(fct_rev(pref$steak_pref)),
                   labels = c("Medium Well" = "Medium-well",
                              "Medium rare" = "Medium-rare")) +
  ## change the scale/labels of the percent axis (y-axis); expand is given as
  ## c(lower mult, lower add, upper mult, upper add): bars start at zero and
  ## 0.035 of extra room is left beyond the longest bar
  scale_y_continuous(labels = scales::percent,
                     expand = c(0, 0, 0, 0.035)) +
  ## change the title, subtitle, and caption
  labs(title = "How Do You Like Your Steak Prepared?",
       subtitle = "From a survey of 432 steak-eating Americans",
       caption = "FiveThirtyEight: Surveymonkey") +
  ## change the theme (use ggthemes)
  theme_fivethirtyeight() +
  ## fine tune the theme (after the complete theme so these settings stick)
  theme(axis.text = element_text(size = 12),
        title = element_text(size = 14),
        legend.position = "none",
        plot.caption = element_text(size = 12),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.text.x = element_blank())
p
## write the finished chart as a PNG into the
## dataviz_project/figures/explanatory_figures directory
ggsave(
  filename = "steak_R.png",
  plot = p,
  path = "dataviz_project/figures/explanatory_figures",
  width = 8,
  height = 4
)