- Overview of ggplot2
- Rewind to tidyverse to discuss group_by() and summarise()
- Build bar plots + big exercise
- Build other plots + exercises
October 19, 2019
tidyverse to discuss group_by() and summarise()
tidyverse package
What we will learn
Requests to learn another plot type?
1 discrete variable (plus other optional discrete and/or continuous variables)
1 continuous variable
2 continuous variables
1 continuous variable + date variable
# Test code: load the tidyverse, then draw a scatter plot of `displ` against
# `hwy` from the `mpg` data. The two statements must sit on separate lines;
# written on a single line without a separator they do not parse.
library(tidyverse)
mpg %>% ggplot() + geom_point(aes(displ, hwy))
tidyverse and re-run the test codeinstall.packages('tidyverse')
# Re-run the test code after installing the tidyverse.
library(tidyverse)
# Scatter plot of `displ` against `hwy` from `mpg`.
# The original line had an extra ")" after aes(displ, hwy), a syntax error.
mpg %>% ggplot(aes(displ, hwy)) + geom_point()
ggplot2 and re-run the test codeinstall.packages('ggplot2')
library(ggplot2)
mpg %>% ggplot(aes(displ, hwy)) + geom_point()
library(tidyverse)
# Read the `crime` and `contr` CSV files from their short links and convert
# each result to a plain data frame.
crime <- as.data.frame(read_csv('https://bit.ly/2mcZLq4'))
contr <- as.data.frame(read_csv('https://bit.ly/2lQySrQ'))
ggplot() function and a geom layer+ggplot() + geom_bar() # create bar and stacked bar plots ggplot() + geom_histogram() # create histograms ggplot() + geom_point() # create scatter plots ggplot() + geom_line() # create line plots
ggplot() functioncontr %>% ggplot() + geom_bar() ggplot(contr) + geom_bar()
aes()
aes() indicates the variables that affect the chart aestheticsaes() can be an argument in ggplot() or the geomaes() functionaes() arguments reference variables in your data framecontr %>% ggplot(aes(primary_general)) + geom_bar() contr %>% ggplot() + geom_bar(aes(primary_general))
aes(x = NULL, y = NULL, color = NULL , fill = NULL, alpha = NULL, label = NULL , shape = NULL, size = NULL, group = NULL , linetype = NULL )
contr %>% ggplot(aes(primary_general)) + geom_bar() contr %>% ggplot() + geom_bar(aes(primary_general))
contr %>% ggplot(aes(primary_general, fill = primary_general)) + geom_bar()
contr %>% ggplot(aes(primary_general, fill = party)) + geom_bar()
Exercise: using the crime data, create a bar plot that shows how many incidences occurred in each sector
- Only include these sector values: c('B', 'E', 'D', 'R', 'O', 'C', 'K')
- Fill the bars by crime_subcategory
- Only include these crime_subcategory values: c('ROBBERY-STREET', 'THEFT-BICYCLE', 'AGGRAVATED ASSAULT', 'TRESPASS')
Hint
# Use to include/exclude values filter()
Exercise: using the crime data, create a bar plot that shows how many incidences occurred in each sector
- Only include these sector values: c('B', 'E', 'D', 'R', 'O', 'C', 'K')
- Fill the bars by crime_subcategory
- Only include these crime_subcategory values: c('ROBBERY-STREET', 'THEFT-BICYCLE', 'AGGRAVATED ASSAULT', 'TRESPASS')
Hint
crime %>% filter() %>% ggplot()
# Exercise solution: stacked bar plot counting rows per sector, with each bar
# split by crime_subcategory. Only the listed sectors and subcategories are
# kept. The aesthetics are supplied in the geom rather than in ggplot().
crime %>%
  filter(
    sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K'),
    crime_subcategory %in% c(
      'ROBBERY-STREET', 'THEFT-BICYCLE', 'AGGRAVATED ASSAULT', 'TRESPASS'
    )
  ) %>%
  ggplot() +
  geom_bar(aes(sector, fill = crime_subcategory))
Axis names
2 options
# Option 1
labs(x = 'X Axis Title', y = 'Y Axis Title')
# Option 2
xlab('X Axis Title')
ylab('Y Axis Title')
contr %>% filter(! primary_general %in% NA) %>% ggplot(aes(primary_general, fill = primary_general)) + geom_bar() + labs(x = 'Election Type', y = 'Donations (#)')
Try adding axis names to the sector plot you made during the exercise
crime %>%
filter(
sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K') &
crime_subcategory %in% c('ROBBERY-STREET', 'THEFT-BICYCLE'
, 'AGGRAVATED ASSAULT', 'TRESPASS')
) %>%
ggplot(aes(sector, fill = crime_subcategory)) + geom_bar() +
labs(x = 'District Sectors', y = 'Incidences by Event\nClearance Group (#)')
Legend names
labs(fill = '') labs(fill = element_blank()) labs(fill = NULL) labs(colour = 'Check out these colors')
# Sector bar plot with a custom legend title set through labs(fill = ...).
crime %>%
  filter(
    sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K') &
      crime_subcategory %in% c('ROBBERY-STREET', 'THEFT-BICYCLE'
                               , 'AGGRAVATED ASSAULT', 'TRESPASS')
  ) %>%
  ggplot(aes(sector, fill = crime_subcategory)) +
  geom_bar() +
  # Axis labels now match the other sector plots in this deck; the original
  # reused 'Election Type' / 'Donations (#)' from the contr example.
  labs(
    x = 'District Sectors'
    , y = 'Incidences by Event\nClearance Group (#)'
    , fill = 'Check out these colors'
  )
Try removing the legend name from your sector plot
Try removing the legend name from your sector plot
# Sector bar plot with the legend title removed.
# The crime_subcategory filter is restored so this is the same sector plot
# built in the exercise; the original kept only the sector filter.
crime %>%
  filter(
    sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K') &
      crime_subcategory %in% c('ROBBERY-STREET', 'THEFT-BICYCLE'
                               , 'AGGRAVATED ASSAULT', 'TRESPASS')
  ) %>%
  ggplot(aes(sector, fill = crime_subcategory)) +
  geom_bar() +
  labs(
    x = 'District Sectors'
    , y = 'Incidences by Event\nClearance Group (#)'
    , fill = element_blank()  # blank legend title
  )
Chart titles and subtitles
# Option 1 labs(title = NULL, subtitle = NULL) # Option 2 ggtitle(title = NULL, subtitle = NULL)
# Bar plot of donation counts per primary_general value, with axis labels,
# a title and a subtitle, and no legend title.
contr %>%
  # Drop rows with a missing primary_general, as the other versions of this
  # plot in the deck do; otherwise an NA bar is drawn.
  filter(! primary_general %in% NA) %>%
  ggplot(aes(primary_general, fill = primary_general)) +
  geom_bar() +
  labs(
    x = 'Election Type'
    , y = 'Donations (#)'
    , fill = element_blank()
    , title = 'Number of Donations by Election Type'
    , subtitle = 'Over 6,000 full election cycle donations'
  )
Try adding a title to the sector plot
Try adding a title to the sector plot
crime %>%
filter(
sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K') &
crime_subcategory %in% c('ROBBERY-STREET', 'THEFT-BICYCLE'
, 'AGGRAVATED ASSAULT', 'TRESPASS')
) %>%
ggplot(aes(sector, fill = crime_subcategory)) + geom_bar() +
labs(x = 'District Sectors', y = 'Incidences by Event\nClearance Group (#)'
, fill = element_blank(), title = 'An Amazing Title'
)
install.packages('scales')
library(scales)
scale_y_continuous(labels = percent) # Add a percentage sign to numbers on axis
scale_y_continuous(labels = dollar) # Add a dollar sign to numbers on axis
scale_y_continuous(labels = comma) # Add a comma to numbers on axis
contr %>%
filter(! primary_general %in% NA) %>%
ggplot(aes(primary_general, fill = primary_general)) +
geom_bar() +
labs(
x = 'Election Type'
, y = 'Donations (#)'
, fill = element_blank()
, title = 'Number of Donations by Election Type'
, subtitle = 'Over 6,000 full election cycle donations'
) +
scale_y_continuous(labels = comma)
ggplot2 themesggthemes themesinstall.packages('ggthemes')
library(ggthemes)
Selected ggplot2 themes
theme_classic() theme_minimal() theme_dark()
Selected ggthemes themes
theme_stata() + scale_colour_stata() # scale_fill_stata()
theme_economist() + scale_colour_economist() # scale_fill_economist()
theme_fivethirtyeight() + scale_color_fivethirtyeight() # scale_fill_fivethirtyeight()
theme_wsj() + scale_colour_wsj() # scale_fill_wsj()
theme_pander() + scale_colour_pander() # scale_fill_pander()
theme_hc(bgcolor = "darkunica") + scale_colour_hc("darkunica") # scale_fill_hc("darkunica")
contr %>%
filter(! primary_general %in% NA) %>%
ggplot(aes(primary_general, fill = primary_general)) +
geom_bar() +
labs(
x = 'Election Type'
, y = 'Donations (#)'
, fill = element_blank()
, title = 'Number of Donations by Election Type'
, subtitle = 'Over 6,000 full election cycle donations'
) +
scale_y_continuous(labels = comma) +
theme_economist() +
scale_fill_economist()
Economist theme
Try adding a theme to the sector plot
theme()theme( plot.title = element_text(size=14, face="bold", vjust=1) , plot.background = element_blank() , panel.grid.major = element_blank() , panel.grid.minor = element_blank() , panel.border = element_blank() , panel.background = element_blank() , axis.ticks = element_blank() , axis.text = element_text(colour="black", size=12) , axis.text.x = element_text(angle=45, hjust=1) , legend.title = element_blank() , legend.position = "none" , legend.text = element_text(size=12) )
- In contr, which office received the most money in campaign donations from 'MAYOR'?
- In contr, which contributor_state saw the largest number of cash donations, excluding 'WA' and NA values?
- In contr, what is the largest average donation amount by contributor_state?
- In crime, how many crimes were reported (reported_year) one and two years after the year in which they occurred (occurred_year)?
- In group_by(), list the variables by which you want to aggregate data
- In summarise(), create a variable and define the variable with an aggregation function
- Use %>% to ‘link’ group_by() and summarise()
- Aggregation functions: n(), n_distinct(), sum(), mean(), max(), min(), etc.
- summarise() function
Example
In contr, which office associated with 'NON PARTISAN' party values received the most money in donations?
contr %>% group_by(office, party) %>% summarise(dollars = sum(amount, na.rm = TRUE))
Example
In contr, which office associated with 'NON PARTISAN' party values received the most money in donations?
contr %>% group_by(office, party) %>% summarise(dollars = sum(amount, na.rm = TRUE)) %>% filter(party %in% 'NON PARTISAN') %>% arrange(desc(dollars)) %>% head() ## # A tibble: 6 x 3 ## # Groups: office [6] ## office party dollars ## <chr> <chr> <dbl> ## 1 CITY COUNCIL MEMBER NON PARTISAN 686644. ## 2 MAYOR NON PARTISAN 331237. ## 3 <NA> NON PARTISAN 208066. ## 4 PORT COMMISSIONER NON PARTISAN 152623. ## 5 COUNTY COUNCIL MEMBER NON PARTISAN 146563. ## 6 COUNTY EXECUTIVE NON PARTISAN 131929.
Example
In contr, which contributor_state saw the largest number of cash donations (see cash_or_in_kind), excluding 'WA' and NA values?
contr %>%
filter(cash_or_in_kind %in% 'Cash' & ! contributor_state %in% c('WA', NA)) %>%
group_by(contributor_state) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(5)
## # A tibble: 5 x 2
## contributor_state n
## <chr> <int>
## 1 CA 423
## 2 OR 213
## 3 NY 128
## 4 DC 121
## 5 TX 116
How would you turn the tabular output data into a chart?
contr %>%
filter(cash_or_in_kind %in% 'Cash' & ! contributor_state %in% c('WA', NA)) %>%
group_by(contributor_state) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(5)
## # A tibble: 5 x 2
## contributor_state n
## <chr> <int>
## 1 CA 423
## 2 OR 213
## 3 NY 128
## 4 DC 121
## 5 TX 116
How would you turn the tabular output data into a chart?
# Chart the five contributor_state values with the most 'Cash' rows,
# leaving out 'WA' and missing states, as horizontal bars.
contr %>%
  filter(
    cash_or_in_kind %in% 'Cash',
    ! contributor_state %in% c('WA', NA)
  ) %>%
  group_by(contributor_state) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  head(n = 5) %>%
  ggplot(aes(contributor_state, n, fill = contributor_state)) +
  geom_bar(stat = "identity") +
  # coord_flip() is a new function: it swaps the x and y axes.
  coord_flip()
How would you turn the tabular output data into a chart?
contr %>%
filter(cash_or_in_kind %in% 'Cash' & ! contributor_state %in% c('WA', NA)) %>%
group_by(contributor_state) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(5) %>%
ggplot(aes(reorder(contributor_state, n), n, fill = contributor_state)) + # reorder() is a new function
geom_bar(stat = "identity") +
coord_flip() # coord_flip() is a new function
How would you turn the tabular output data into a chart?
contr %>%
filter(cash_or_in_kind %in% 'Cash' & ! contributor_state %in% c('WA', NA)) %>%
group_by(contributor_state) %>%
summarise(n = n()) %>%
arrange(desc(n)) %>%
head(5) %>%
ggplot(aes(reorder(contributor_state, n), n, fill = contributor_state)) +
geom_bar(stat = "identity") +
coord_flip()
New argument and functions
geom_bar(stat = 'identity') # IMPORTANT: use stat when working with aggregated data reorder(contributor_state, n) # reorder() to sort rows/columns in your visualization coord_flip() # coord_flip(): x becomes y; y becomes x
Example
In contr, what is the largest average donation amount by contributor_state?
contr %>% filter(! contributor_state %in% NA) %>% group_by(contributor_state) %>% summarise(avg_donation = mean(amount, na.rm = TRUE)) %>% arrange(desc(avg_donation)) %>% head(5) ## # A tibble: 5 x 2 ## contributor_state avg_donation ## <chr> <dbl> ## 1 MS 1000 ## 2 UT 750 ## 3 TN 636. ## 4 MO 595 ## 5 OK 550
How would you turn the tabular output data into a chart?
contr %>% filter(! contributor_state %in% NA) %>% group_by(contributor_state) %>% summarise(avg_donation = mean(amount, na.rm = TRUE)) %>% arrange(desc(avg_donation)) %>% head(5) ## # A tibble: 5 x 2 ## contributor_state avg_donation ## <chr> <dbl> ## 1 MS 1000 ## 2 UT 750 ## 3 TN 636. ## 4 MO 595 ## 5 OK 550
How would you turn the tabular output data into a chart?
contr %>% filter(! contributor_state %in% NA) %>% group_by(contributor_state) %>% summarise(avg_donation = mean(amount, na.rm = TRUE)) %>% ggplot(aes(reorder(contributor_state, -avg_donation), avg_donation)) + geom_bar(stat = 'identity') + theme_classic()
crime dataset, report the mean difference in reported_date and occurred_date values by sector.
reported_date and occurred_date should be called date_difsector only report rows where the number of sector values in the dataset is greater than 100transmute and filter as well as group_by and summarise to solve this problemsector, date_dif, ndate_dif values by sectorHint
crime %>%
transmute(
sector
, date_dif = reported_date - occurred_date
) %>%
crime dataset, report the mean difference in reported_date and occurred_date values by sector.
reported_date and occurred_date should be called date_difsector only report rows where the number of sector values in the dataset is greater than 100transmute and filter as well as group_by and summarise to solve this problemsector, date_dif, ndate_dif values by sectorcrime %>%
transmute(
sector
, date_dif = reported_date - occurred_date
) %>%
group_by(sector) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
crime dataset, report the mean difference in reported_date and occurred_date values by sector.
reported_date and occurred_date should be called date_difsector only report rows where the number of sector values in the dataset is greater than 100transmute and filter as well as group_by and summarise to solve this problemsector, date_dif, ndate_dif values by sectorcrime %>%
transmute(
sector
, date_dif = reported_date - occurred_date
) %>%
group_by(sector) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
filter(n > 100) %>%
# Bar plot of the mean (reported_date - occurred_date) per sector, limited to
# sectors that have more than 100 rows in the data.
crime %>%
  transmute(sector, date_dif = reported_date - occurred_date) %>%
  group_by(sector) %>%
  summarise(date_dif = mean(date_dif, na.rm = TRUE), n = n()) %>%
  filter(n > 100) %>%
  ggplot(aes(sector, date_dif)) +
  # stat = 'identity' plots the pre-aggregated values instead of counting rows.
  geom_bar(stat = 'identity')
sector columns by date_difNA sector valuescrime %>%
transmute(
sector
, date_dif = reported_date - occurred_date
) %>%
group_by(sector) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
filter(n > 100 & ! sector %in% NA) %>%
ggplot(aes(reorder(sector, -date_dif), date_dif)) +
geom_bar(stat = 'identity') +
labs(
y = 'Average delay in reporting (days)'
, x = 'Sectors'
, title = 'Sector W sees the longest delay in events reporting'
) +
theme_classic()
Histograms: 1 continuous variable
bins is the argument you use to indicate the number of columns in your histogramcontr %>% filter(amount > 0 & amount < 1000) %>% ggplot(aes(amount)) + geom_histogram()
Histograms: 1 continuous variable
bins is the argument you use to indicate the number of columns in your histogramcontr %>% filter(amount > 0 & amount < 1000) %>% ggplot(aes(amount)) + geom_histogram(bins = 10)
Scatterplots: 2 continuous variables
contr %>% filter(amount > 0 & amount < 1000000) %>% ggplot(aes(receipt_date, amount)) + geom_point()
Scatterplots: 2 continuous variables
contr %>% ggplot(aes(receipt_date, amount)) + geom_point()
geom_count() # adds a size component to your scatterplot geom_jitter() # adjusts cartesian location of point relative to other points
Line plot: 1 continuous variable + date variable
contr %>% group_by(receipt_date) %>% summarise(amount = sum(amount)) %>% ggplot(aes(x = receipt_date, y = amount)) + geom_line(stat = 'identity')
Line plot: 1 continuous variable + date variable
# Line plot of the running total of positive daily donation sums.
# Written across several lines so the inline comment no longer swallows the
# geom_line() layer, which happens when everything sits on one line.
contr %>%
  group_by(receipt_date) %>%
  summarise(amount = sum(amount)) %>%
  filter(amount > 0) %>%
  # cumsum() is a new function!
  ggplot(aes(x = receipt_date, y = cumsum(amount))) +
  geom_line(stat = 'identity')
contr or crime, create a scatterplot or line plotaes() arguments in addition to x and y
size, shape, color or alphagroup_by() and summarise() if you would likecrime %>% filter(! precinct %in% NA) %>% ggplot(aes(reported_date, occurred_date, color = precinct)) + geom_point() + theme_classic()
crime %>% filter(! precinct %in% NA) %>% ggplot(aes(reported_date, occurred_date, color = precinct)) + geom_point() + theme_classic()
crime datasetmutate() called occurred_time_ampm that tells whether an incident occurred in the AM or PM.occurred_time_ampm in the color or fill arguments in aes()crime %>%
# Label each row 'PM' when occurred_time >= 1200, otherwise 'AM'.
mutate(occurred_time_ampm = ifelse(occurred_time >= 1200, 'PM', 'AM')) %>%
# Drop rows where either plotting variable is missing.
filter(
  ! occurred_time_ampm %in% NA &
  ! crime_subcategory %in% NA
) %>%
# Count rows for every subcategory / AM-PM pair.
group_by(crime_subcategory, occurred_time_ampm) %>%
summarise(n = n()) %>%
ungroup() %>%
# Attach each subcategory's overall total and keep those with at least 100.
group_by(crime_subcategory) %>%
mutate(total = sum(n, na.rm = TRUE)) %>%
filter(total >= 100) %>%
ungroup() %>%
# Order by `total`, the full height of each stacked bar. Ordering by `n`
# used the mean of the AM/PM counts, which misplaces subcategories that
# have only one of the two values.
ggplot(aes(reorder(crime_subcategory, total), n, fill = occurred_time_ampm)) +
geom_bar(stat = 'identity') +
coord_flip() +
theme_wsj() +
# The bars are coloured through `fill`, so the fill scale is required;
# scale_colour_wsj() had no effect on this plot.
scale_fill_wsj() +
labs(title = 'Most incidence occur\nin the AM', fill = 'Time of Day')
contr datasetcontr %>%
# Keep positive donations dated 2008-01-01 through 2020-12-31 whose party is
# not missing, 'OTHER' or 'NONE'.
filter(
  amount > 0,
  receipt_date >= '2008-01-01',
  receipt_date <= '2020-12-31',
  ! party %in% c(NA, 'OTHER', 'NONE')
) %>%
# Sum donations per party and day.
group_by(party, receipt_date) %>%
summarise(amount = sum(amount)) %>%
# summarise() drops only the last grouping level, so the data is still
# grouped by party and cumsum() runs within each party.
mutate(amount = cumsum(amount)) %>%
ggplot(aes(receipt_date, amount, color = party)) +
geom_line(stat = 'identity', size = 1) +
scale_y_continuous(labels = dollar) +
theme_wsj() +
scale_colour_wsj() +
labs(
  colour = element_blank(),
  title = 'Democrats receive nearly 1\nmillion more in donations than\nnext closest party'
)