- Yardstick for plotting data in R
- tidyverse package
- Virtually no limits to plot types
- Add-on packages available
- Plotting resources
January 24, 2017
tidyverse package
What we will learn
Requests to learn another plot type?
1 discrete variable
1 continuous variable
2 continuous variables
1 continuous variable + date variable
# Test code: load tidyverse and draw a scatter plot of the `mpg` data
# (displ on x, hwy on y). The two statements must sit on separate lines;
# written on one line they do not parse.
library(tidyverse)
mpg %>%
  ggplot() +
  geom_point(aes(displ, hwy))
Install tidyverse and re-run the test code
install.packages('tidyverse')
library(tidyverse)
# Test plot: one point per row of `mpg`, with displ on x and hwy on y.
mpg %>%
  ggplot() +
  geom_point(mapping = aes(x = displ, y = hwy))
Install ggplot2 and re-run the test code
install.packages('ggplot2')
# Fallback test when only ggplot2 is installed.
# ggplot2 on its own does not provide the %>% pipe (that comes from
# magrittr/dplyr via tidyverse), so the data is passed to ggplot() directly.
library(ggplot2)
ggplot(mpg) +
  geom_point(aes(displ, hwy))
library(tidyverse)
# Read the two example data sets used in the rest of the material.
# Both are CSV files downloaded from short links.
donor <- read.csv(file = "https://goo.gl/tm9JQ5")
police <- read.csv(file = "https://goo.gl/nNAuDy")
# A plot combines the ggplot() function and a geom layer, joined with `+`.
# Templates only: each still needs data and an aes() mapping to draw.
ggplot() + geom_bar()       # create bar and stacked bar plots
ggplot() + geom_histogram() # create histograms
ggplot() + geom_point()     # create scatter plots
ggplot() + geom_line()      # create line plots
# ggplot() function: two equivalent ways to hand it the data.
# Templates only: no aes() mapping is supplied here.
donor %>% ggplot() + geom_bar()
ggplot(donor) + geom_bar()
aes()
aes() can be an argument in ggplot() or the geom
aes() function
x and y in aes()
aes(x = NULL, y = NULL, color = NULL, fill = NULL, alpha = NULL, label = NULL, shape = NULL, size = NULL, group = NULL, linetype = NULL)
# The aes() mapping may be given to the geom ...
donor %>%
  ggplot() +
  geom_bar(aes(primary_general))
# ... or to ggplot() itself.
donor %>%
  ggplot(aes(primary_general)) +
  geom_bar()
# aes() supplied inside the geom layer
donor %>%
  ggplot() +
  geom_bar(aes(primary_general))
# aes() supplied inside ggplot()
donor %>%
  ggplot(aes(primary_general)) +
  geom_bar()
# Bar chart of primary_general, with each bar filled by its own category.
donor %>%
  ggplot(mapping = aes(x = primary_general, fill = primary_general)) +
  geom_bar()
police data that shows how many incidences occurred in district_sector
district_sector values: c('B', 'E', 'D', 'R', 'O', 'C', 'K')
event_clearance_group
event_clearance_group values: c('TRAFFIC RELATED CALLS', 'FRAUD CALLS', 'BURGLARY', 'BIKE')
Hint
# Use to include/exclude values
filter()
police data that shows how many incidences occurred in district_sector
district_sector values: c('B', 'E', 'D', 'R', 'O', 'C', 'K')
event_clearance_group
event_clearance_group values: c('TRAFFIC RELATED CALLS', 'FRAUD CALLS', 'BURGLARY', 'BIKE')
Hint
# Skeleton for the exercise: filter first, then pass the result to ggplot().
police %>%
  filter() %>%
  ggplot()
# Stacked bar chart: number of rows per district_sector, filled by
# event_clearance_group, restricted to the listed sectors and groups.
# Separate arguments to filter() are combined with AND, like `&`.
police %>%
  filter(
    district_sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K'),
    event_clearance_group %in%
      c('TRAFFIC RELATED CALLS', 'FRAUD CALLS', 'BURGLARY', 'BIKE')
  ) %>%
  ggplot(mapping = aes(x = district_sector, fill = event_clearance_group)) +
  geom_bar()
Axis names
2 options
# Option 1: set both axis titles in a single labs() call
labs(
  x = "X Axis Title",
  y = "Y Axis Title"
)
# Option 2: set each axis title with its own helper
xlab("X Axis Title")
ylab("Y Axis Title")
# Donor bar chart with custom axis titles.
donor %>%
  ggplot(mapping = aes(x = primary_general, fill = primary_general)) +
  geom_bar() +
  labs(
    x = 'Election Type',
    y = 'Donations (#)'
  )
Try adding axis names to the district_sector plot you made during the exercise
# The district_sector plot from the exercise, now with axis titles.
police %>%
  filter(
    district_sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K'),
    event_clearance_group %in%
      c('TRAFFIC RELATED CALLS', 'FRAUD CALLS', 'BURGLARY', 'BIKE')
  ) %>%
  ggplot(mapping = aes(x = district_sector, fill = event_clearance_group)) +
  geom_bar() +
  labs(
    x = 'District Sectors',
    y = 'Incidences by Event\nClearance Group (#)'
  )
Legend names
# Alternative ways to blank out the legend title of the fill aesthetic
labs(fill = '')
labs(fill = element_blank())
labs(fill = NULL)
# Give the legend of the colour aesthetic a custom title
labs(colour = 'Check out these colors')
# District-sector plot with the legend title removed.
# The axis titles describe the police data. The earlier version reused the
# donor plot's 'Election Type' / 'Donations (#)' titles by mistake.
police %>%
  filter(
    district_sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K') &
      event_clearance_group %in% c('TRAFFIC RELATED CALLS', 'FRAUD CALLS', 'BURGLARY', 'BIKE')
  ) %>%
  ggplot(aes(district_sector, fill = event_clearance_group)) +
  geom_bar() +
  labs(
    x = 'District Sectors',
    y = 'Incidences by Event\nClearance Group (#)',
    fill = element_blank()
  )
Try removing the legend name from your district_sector plot
# Exercise answer: the district_sector plot without a legend title.
# Both exercise filters are applied (sector and clearance group), matching
# the plot built in the earlier exercise steps.
police %>%
  filter(
    district_sector %in% c('B', 'E', 'D', 'R', 'O', 'C', 'K') &
      event_clearance_group %in% c('TRAFFIC RELATED CALLS', 'FRAUD CALLS', 'BURGLARY', 'BIKE')
  ) %>%
  ggplot(aes(district_sector, fill = event_clearance_group)) +
  geom_bar() +
  labs(
    x = 'District Sectors',
    y = 'Incidences by Event\nClearance Group (#)',
    fill = element_blank()
  )
Try removing the legend name from your district_sector plot
Chart titles and subtitles
# Option 1
labs(title = NULL, subtitle = NULL)
# Option 2: ggtitle()'s first argument is named `label`, not `title`
ggtitle(label = NULL, subtitle = NULL)
# Donor bar chart with axis titles, no legend title, a title and a subtitle.
donor %>%
  ggplot(mapping = aes(x = primary_general, fill = primary_general)) +
  geom_bar() +
  labs(
    x = 'Election Type',
    y = 'Donations (#)',
    fill = element_blank(),
    title = 'Number of Donations by Election Type',
    subtitle = 'Over 6,000 full election cycle donations'
  )
Try adding a title to the district_sector plot
# Install and attach the scales package, then pass one of its formatters to
# the `labels` argument of scale_y_continuous().
install.packages("scales")
library(scales)
scale_y_continuous(labels = percent) # show axis numbers as percentages
scale_y_continuous(labels = dollar)  # prefix axis numbers with a dollar sign
scale_y_continuous(labels = comma)   # add thousands separators to axis numbers
# Titled donor bar chart with comma-formatted numbers on the y axis.
donor %>%
  ggplot(mapping = aes(x = primary_general, fill = primary_general)) +
  geom_bar() +
  labs(
    x = 'Election Type',
    y = 'Donations (#)',
    fill = element_blank(),
    title = 'Number of Donations by Election Type',
    subtitle = 'Over 6,000 full election cycle donations'
  ) +
  scale_y_continuous(labels = comma)
ggplot2 themes
ggthemes themes
install.packages('ggthemes')
library(ggthemes)
Selected ggplot2 themes
# Built-in ggplot2 themes; add one to a plot with `+`.
theme_classic()
theme_minimal()
theme_dark()
Selected ggthemes themes
# Template fragments to append to a plot with `+`: each ggthemes theme is
# paired with its colour scale. The trailing comment names the fill-scale
# counterpart, for plots that map the `fill` aesthetic instead of `colour`.
theme_stata() + scale_colour_stata() # scale_fill_stata()
theme_economist() + scale_colour_economist() # scale_fill_economist()
theme_fivethirtyeight() + scale_color_fivethirtyeight() # scale_fill_fivethirtyeight()
theme_wsj() + scale_colour_wsj() # scale_fill_wsj()
theme_pander() + scale_colour_pander() # scale_fill_pander()
theme_hc(bgcolor = "darkunica") + scale_colour_hc("darkunica") # scale_fill_hc("darkunica")
# Titled donor bar chart styled with the Economist theme. The bars use the
# fill aesthetic, so the matching fill scale is applied.
donor %>%
  ggplot(mapping = aes(x = primary_general, fill = primary_general)) +
  geom_bar() +
  labs(
    x = 'Election Type',
    y = 'Donations (#)',
    fill = element_blank(),
    title = 'Number of Donations by Election Type',
    subtitle = 'Over 6,000 full election cycle donations'
  ) +
  scale_y_continuous(labels = comma) +
  theme_economist() +
  scale_fill_economist()
Economist theme
Try adding a theme to the district_sector plot
Selected ggplot2 themes
# Built-in ggplot2 themes, one call per line.
theme_classic()
theme_minimal()
theme_dark()
Selected ggthemes themes
# ggthemes theme + colour-scale pairs, meant to be appended to a plot.
# The trailing comment on each line names the equivalent fill scale.
theme_stata() + scale_colour_stata() # scale_fill_stata()
theme_economist() + scale_colour_economist() # scale_fill_economist()
theme_fivethirtyeight() + scale_color_fivethirtyeight() # scale_fill_fivethirtyeight()
theme_wsj() + scale_colour_wsj() # scale_fill_wsj()
theme_pander() + scale_colour_pander() # scale_fill_pander()
theme_hc(bgcolor = "darkunica") + scale_colour_hc("darkunica") # scale_fill_hc("darkunica")
Try adding a theme to the district_sector plot
# theme(): build a reusable custom theme object that can be added to any
# plot with `+ t`.
# NOTE(review): the variable name `t` is the same as base R's t() function;
# kept unchanged so existing uses of `t` keep working.
t <- theme(
  plot.title = element_text(size = 14, face = "bold", vjust = 1),
  plot.background = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  axis.text = element_text(colour = "black", size = 12),
  axis.text.x = element_text(angle = 45, hjust = 1),
  legend.title = element_blank(),
  legend.position = "none",
  legend.text = element_text(size = 12)
)
Histograms: 1 continuous variable
# Histogram of `amount`, keeping only rows where amount is below 1000.
donor %>%
  filter(amount < 1000) %>%
  ggplot(mapping = aes(x = amount)) +
  geom_histogram()
Histograms: 1 continuous variable
# Histogram of `amount` for rows with amount below 1000.
donor %>%
  filter(amount < 1000) %>%
  ggplot(mapping = aes(x = amount)) +
  geom_histogram()
# Alternative geom for the same kind of data:
geom_density() # works like geom_histogram(); continuous curve
Scatterplots: 2 continuous variables
# Scatter plot of receipt_year (x) against election_year (y).
donor %>%
  ggplot(mapping = aes(x = receipt_year, y = election_year)) +
  geom_point()
Scatterplots: 2 continuous variables
# Scatter plot of receipt_year against election_year.
donor %>%
  ggplot(aes(receipt_year, election_year)) +
  geom_point()
# Related geoms that can replace geom_point():
geom_count()  # adds a size component to your scatterplot
geom_jitter() # adjusts cartesian location of point relative to other points
Line plot: 1 continuous variable + date variable
# Line plot of amount over receipt_year.
donor %>%
  ggplot(mapping = aes(x = receipt_year, y = amount)) +
  geom_line()
Using donor or police, create a scatterplot
Use aes() arguments in addition to x and y
# Extra aesthetics to try: size, shape, or alpha
# Longitude goes on x and latitude on y so the points are laid out like a
# map. The earlier version had the two axes swapped.
police %>%
  ggplot(aes(longitude, latitude, color = district_sector, alpha = district_sector)) +
  geom_point() +
  theme_classic()
Similar to COUNTIFS and SUMIFS formulas in Excel and LOD expressions in Tableau
In donor, which type received the most money in donations from 'REPUBLICAN'?
In donor, which type received the most money in donations in the receipt_year 2015?
In donor, what is the largest average donation amount for contributor_state?
In group_by(), list the variables by which you want to aggregate data
In summarise(), create a variable and define the variable with an aggregation function
Use %>% to 'link' group_by() and summarise()
Aggregation functions: n(), n_distinct(), sum(), mean(), max(), min(), etc.
Example
In donor, which type received the most money in donations from 'REPUBLICAN'?
# Total donation amount for every type/party combination.
donor %>%
  group_by(type, party) %>%
  summarise(dollars = sum(amount, na.rm = TRUE))
Example
In donor, which type received the most money in donations from 'REPUBLICAN'?
# Total donation amount per type/party, followed by the printed result.
donor %>%
  group_by(type, party) %>%
  summarise(dollars = sum(amount, na.rm = TRUE))
## # A tibble: 7 x 3
## # Groups:   type [?]
##   type                party        dollars
##   <fctr>              <fctr>         <dbl>
## 1 Candidate           DEMOCRAT      311690
## 2 Candidate           INDEPENDENT      467
## 3 Candidate           NON PARTISAN  206196
## 4 Candidate           NONE            6465
## 5 Candidate           OTHER          25859
## 6 Candidate           REPUBLICAN    313540
## 7 Political Committee <NA>         1563098
How would you turn the tabular output data into a chart?
# Aggregated table that the next step turns into a chart.
donor %>%
  group_by(type, party) %>%
  summarise(dollars = sum(amount, na.rm = TRUE))
## # A tibble: 7 x 3
## # Groups:   type [?]
##   type                party        dollars
##   <fctr>              <fctr>         <dbl>
## 1 Candidate           DEMOCRAT      311690
## 2 Candidate           INDEPENDENT      467
## 3 Candidate           NON PARTISAN  206196
## 4 Candidate           NONE            6465
## 5 Candidate           OTHER          25859
## 6 Candidate           REPUBLICAN    313540
## 7 Political Committee <NA>         1563098
How would you turn the tabular output data into a chart?
# Horizontal stacked bars of total dollars per type, split by party.
# stat = "identity" makes the bars use the already-summed `dollars` values.
donor %>%
  group_by(type, party) %>%
  summarise(dollars = sum(amount, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = type, y = dollars, fill = party)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(labels = dollar) +
  theme_classic()
How would you turn the tabular output data into a chart?
# Stacked bars of total dollars per type, split by party (vertical version).
donor %>%
  group_by(type, party) %>%
  summarise(dollars = sum(amount, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = type, y = dollars, fill = party)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = dollar) +
  theme_classic()
Necessary when data is aggregated
# Use stat = 'identity' when working with aggregated data
geom_bar(stat = 'identity')
geom_line(stat = 'identity')
Example
In donor, which type received the most money in donations in the receipt_year 2015?
# Total amount per receipt_year and type, keeping only receipt_year 2015.
# The printed result follows.
donor %>%
  group_by(receipt_year, type) %>%
  summarise(total_amount = sum(amount, na.rm = TRUE)) %>%
  filter(receipt_year %in% 2015)
## # A tibble: 2 x 3
## # Groups:   receipt_year [1]
##   receipt_year type                total_amount
##          <int> <fctr>                     <dbl>
## 1         2015 Candidate                  55729
## 2         2015 Political Committee        66821
How would you turn the tabular output data into a chart?
## # A tibble: 5 x 3
##   receipt_year type                total_amount
##          <int> <fctr>                     <dbl>
## 1         2013 Political Committee       118804
## 2         2008 Candidate                 139377
## 3         2007 Political Committee       162470
## 4         2017 Political Committee        44280
## 5         2013 Candidate                  46316
How would you turn the tabular output data into a chart?
# One line per type showing the total amount for each receipt_year.
donor %>%
  group_by(receipt_year, type) %>%
  summarise(total_amount = sum(amount, na.rm = TRUE)) %>%
  ggplot(mapping = aes(x = receipt_year, y = total_amount, color = type)) +
  geom_line(stat = 'identity') +
  scale_y_continuous(labels = dollar) +
  theme_classic()
Using donor, determine what the largest average donation amount is for contributor_state
Use mean() in the summarise function
Remove NA values with filter()
donor %>% group_by() %>% summarise() %>% filter()
# Exercise: using donor, determine what the largest average donation amount
# is for contributor_state.
# - mean() in the summarise function
# - NA values with filter()
# Template: dplyr steps are chained with %>%, but ggplot2 layers are added
# with `+`. Piping ggplot() into geom_bar() does not add a layer.
donor %>%
  group_by() %>%
  summarise() %>%
  filter() %>%
  ggplot() +
  geom_bar()
# Exercise: using donor, determine what the largest average donation amount
# is for contributor_state.
# - mean() in the summarise function
# - NA values with filter()
# Template: the geom is attached to ggplot() with `+` (not %>%), and
# stat = 'identity' is used because the data is already aggregated.
donor %>%
  group_by() %>%
  summarise() %>%
  filter() %>%
  ggplot() +
  geom_bar(stat = 'identity')
# Average donation amount per contributor_state, with bars ordered from
# largest to smallest average. Rows with a missing state are dropped with
# is.na(), the idiomatic NA test (previously `%in% NA`).
donor %>%
  group_by(contributor_state) %>%
  summarise(avg_amount = mean(amount, na.rm = TRUE)) %>%
  filter(!is.na(contributor_state)) %>%
  ggplot(aes(reorder(contributor_state, -avg_amount), avg_amount)) +
  geom_bar(stat = 'identity') +
  scale_y_continuous(labels = dollar) +
  theme_classic()
# police dataset; color in aes()
# Horizontal stacked bars: number of rows per event_clearance_group, split by
# event_clearance_ampm, keeping only groups with at least 100 rows in total.
police %>%
  filter(
    !is.na(event_clearance_ampm),
    !is.na(event_clearance_group)
  ) %>%
  group_by(event_clearance_group, event_clearance_ampm) %>%
  summarise(n = n()) %>%
  ungroup() %>%
  group_by(event_clearance_group) %>%
  mutate(total = sum(n, na.rm = TRUE)) %>%
  filter(total >= 100) %>%
  # Order bars by the group total. reorder() averages its second argument per
  # level, so ordering by `n` would misplace a group that has only one
  # event_clearance_ampm row.
  ggplot(aes(reorder(event_clearance_group, total), n, fill = event_clearance_ampm)) +
  geom_bar(stat = 'identity') +
  coord_flip() +
  theme_wsj() +
  # The bars are coloured through the fill aesthetic, so the WSJ palette has
  # to be applied with the fill scale. The colour scale has no effect here.
  scale_fill_wsj() +
  labs(title = 'Most incidence occur\nin the AM', fill = 'Time of Day')
# Density of per-group row counts, one curve per event_clearance_ampm value.
# Missing values are removed with is.na(), the idiomatic NA test
# (previously `%in% NA`).
police %>%
  filter(
    !is.na(event_clearance_ampm),
    !is.na(event_clearance_group)
  ) %>%
  group_by(event_clearance_group, event_clearance_ampm) %>%
  summarise(n = n()) %>%
  ggplot(aes(n, color = event_clearance_ampm)) +
  geom_density(size = 1) +
  theme_wsj() +
  # The curves use the colour aesthetic, so the colour scale is the right one.
  scale_colour_wsj() +
  labs(title = 'Most incidences have\noccurred fewer than 25 times', colour = 'Time of Day')
# donor dataset: cash_or_in_kind
# Bar chart of cash_or_in_kind, filled by primary_general. The plot maps the
# fill aesthetic, so both the WSJ palette and the legend title are set on
# `fill`. The colour scale and `colour` label had no effect on this plot.
donor %>%
  ggplot(aes(cash_or_in_kind, fill = primary_general)) +
  geom_bar() +
  theme_wsj() +
  scale_fill_wsj() +
  labs(title = 'So many more\ncash donations!!!', fill = 'Election Type')