- Selecting variables
- Creating variables
- Updating values
- Subsetting tibbles/data frames
- Aggregation
October 22, 2020
gather() and spread()library(tidyverse)
covid <- read_csv('https://rb.gy/lzlylj')
crime <- read_csv('https://rb.gy/5zuayh')
Using the covid dataset, determine the location-month combination with the highest number of estimated infections per capita. Exclude dates prior to July and 'Global' values in location. To do this:
- Filter rows based on the stated date and location conditions
- Create a variable called infections_per_capita that reports est_infections over population
- Group by location and month and sum infections_per_capita
- Using arrange, sort by infections_per_capita so that rows are in descending (desc()) order
covid %>%
filter(
month >= 7 &
! location %in% 'Global'
)
Using the covid dataset, determine the location-month combination with the highest number of estimated infections per capita. Exclude dates prior to July and 'Global' values in location. To do this:
- Filter rows based on the stated date and location conditions
- Create a variable called infections_per_capita that reports est_infections over population
- Group by location and month and sum infections_per_capita
- Using arrange, sort by infections_per_capita so that rows are in descending (desc()) order
covid %>%
filter(
month >= 7 &
! location %in% 'Global'
) %>%
mutate(infections_per_capita = est_infections / population)
Using the covid dataset, determine the location-month combination with the highest number of estimated infections per capita. Exclude dates prior to July and 'Global' values in location. To do this:
- Filter rows based on the stated date and location conditions
- Create a variable called infections_per_capita that reports est_infections over population
- Group by location and month and sum infections_per_capita
- Using arrange, sort by infections_per_capita so that rows are in descending (desc()) order
covid %>%
filter(
month >= 7 &
! location %in% 'Global'
) %>%
mutate(infections_per_capita = est_infections / population) %>%
group_by(location, month) %>%
summarise(infections_per_capita = sum(infections_per_capita, na.rm = TRUE))
# Rank location-month combinations by their summed per-capita estimated
# infections and show the first rows of the ranking.
covid %>%
  # Keep month 7 onward and drop rows whose location is 'Global'
  filter(month >= 7, !location %in% 'Global') %>%
  mutate(infections_per_capita = est_infections / population) %>%
  group_by(location, month) %>%
  summarise(
    infections_per_capita = sum(infections_per_capita, na.rm = TRUE)
  ) %>%
  # Largest totals first
  arrange(desc(infections_per_capita)) %>%
  head()
## # A tibble: 6 x 3
## # Groups: location [4]
## location month infections_per_capita
## <chr> <dbl> <dbl>
## 1 Manipur 12 0.167
## 2 Meghalaya 12 0.147
## 3 Malta 11 0.135
## 4 Malta 12 0.127
## 5 West Bengal 12 0.107
## 6 Manipur 11 0.106
gather() and spread() are the functions we use to reshape (or pivot) data
- Columns to rows (gather())
- Rows to columns (spread())
gather() and spread() are the functions we use to reshape data
- Columns to rows (gather())
- Rows to columns (spread())
gather()crime %>% select(report_number, occurred_date, reported_date) %>% gather(year_type, year, c(occurred_date, reported_date))
gather()crime %>% select(report_number, occurred_date, reported_date) %>% gather(year_type, year, 2:3)
crime %>% tail(2) %>% select(report_number, occurred_date, reported_date) %>% gather(year_type, year, c(occurred_date, reported_date)) ## # A tibble: 4 x 3 ## report_number year_type year ## <chr> <chr> <date> ## 1 2010-044880 occurred_date 2010-02-06 ## 2 2020-218244 occurred_date 2020-07-20 ## 3 2010-044880 reported_date 2010-02-08 ## 4 2020-218244 reported_date 2020-07-20

spread()covid %>%
filter(! mobility_data_type %in% NA & location %in% c('Peru', 'Ecuador', 'Colombia')) %>%
group_by(mobility_data_type, location) %>%
summarise(mobility_composite = mean(mobility_composite, na.rm = TRUE)) %>%
spread(location, mobility_composite)
# Mean mobility_composite per mobility_data_type, with one column per location.
covid %>%
  filter(
    !is.na(mobility_data_type),
    location %in% c('Peru', 'Ecuador', 'Colombia')
  ) %>%
  group_by(mobility_data_type, location) %>%
  summarise(
    mobility_composite = mean(mobility_composite, na.rm = TRUE)
  ) %>%
  # Turn each location value into its own column
  spread(key = location, value = mobility_composite)
## # A tibble: 2 x 4
## # Groups: mobility_data_type [2]
## mobility_data_type Colombia Ecuador Peru
## <chr> <dbl> <dbl> <dbl>
## 1 observed -38.7 -51.1 -54.6
## 2 projected -12.8 -38.7 -44.0

Exercise #1 - 3 minutes
crime, use gather() to convert precinct, beat, and neighborhood columns into rows
report_numberExercise #2 - 5 minutes
crime, report the number of incidents by offense_parent_group and precinct. To do this…
group_by() and summarise()spread() to turn precinct into columns with counts of incidentscrime, use gather() to convert precinct, beat, and neighborhood columns into rows
report_numbercrime %>% gather(group, value, c(precinct, beat, neighborhood)) %>%
crime, use gather() to convert precinct, beat, and neighborhood columns into rows
report_numbercrime %>% gather(group, value, c(precinct, beat, neighborhood)) %>% select(report_number, group, value) %>% sample_n(8) ## # A tibble: 8 x 3 ## report_number group value ## <chr> <chr> <chr> ## 1 2019-119558 neighborhood <NA> ## 2 2010-214293 neighborhood ALASKA JUNCTION ## 3 2020-236131 precinct South ## 4 2012-903035 neighborhood GREENWOOD ## 5 2012-099379 precinct UNKNOWN ## 6 2015-150623 beat R3 ## 7 2012-397750 precinct North ## 8 2015-240127 precinct West
crime, report the number of incidents by offense_parent_group and precinct. To do this…
group_by() and summarise()spread() to turn precinct into columns with counts of incidentscrime %>%
filter(! precinct %in% c('UNKNOWN', NA)) %>%
group_by(precinct, offense_parent_group) %>%
summarise(n = n())
crime, report the number of incidents by offense_parent_group and precinct. To do this…
group_by() and summarise()spread() to turn precinct into columns with counts of incidentscrime %>%
filter(! precinct %in% c('UNKNOWN', NA)) %>%
group_by(precinct, offense_parent_group) %>%
summarise(n = n()) %>%
spread(precinct, n)
## # A tibble: 32 x 6
## offense_parent_group East North South Southwest West
## <chr> <int> <int> <int> <int> <int>
## 1 ANIMAL CRUELTY 1 3 1 NA 3
## 2 ARSON 26 67 33 19 33
## 3 ASSAULT OFFENSES 2979 4488 2887 1882 4777
## 4 BAD CHECKS 146 321 135 90 213
## 5 BRIBERY 1 2 1 NA NA
## 6 BURGLARY/BREAKING&ENTERING 2054 4898 2135 1396 2618
## 7 COUNTERFEITING/FORGERY 114 261 89 59 165
## 8 CURFEW/LOITERING/VAGRANCY VIOLATIONS 12 8 8 4 91
## 9 DESTRUCTION/DAMAGE/VANDALISM OF PROPERTY 1845 3489 1871 1355 2459
## 10 DRIVING UNDER THE INFLUENCE 405 663 357 256 564
## # … with 22 more rows
tidyverse packageWhat we will learn
Requests to learn another plot type?
1 discrete variable (plus other optional discrete and/or continuous variables)
1 continuous variable
2 continuous variables
1 continuous variable + date variable
library(tidyverse) mpg %>% ggplot() + geom_point(aes(displ, hwy))
tidyverse and re-run the test codeinstall.packages('tidyverse')
library(tidyverse)
# Test plot: scatter plot of hwy against displ from the mpg dataset.
# (The stray closing parenthesis after aes() has been removed so this parses.)
mpg %>%
  ggplot(aes(displ, hwy)) +
  geom_point()
ggplot2 and re-run the test codeinstall.packages('ggplot2')
library(ggplot2)
mpg %>% ggplot(aes(displ, hwy)) + geom_point()
ggplot() function and a geom layer+ggplot() + geom_bar() # create bar and stacked bar plots ggplot() + geom_histogram() # create histograms ggplot() + geom_point() # create scatter plots ggplot() + geom_line() # create line plots
ggplot() functioncovid %>% ggplot() + geom_bar() ggplot(covid) + geom_bar()
aes()
aes() indicates the variables that affect the chart aestheticsaes() can be an argument in ggplot() or the geomaes() functionaes() arguments reference variables in your tibblecovid %>% ggplot(aes(mobility_data_type)) + geom_bar() covid %>% ggplot() + geom_bar(aes(mobility_data_type))
aes(x = NULL, y = NULL, color = NULL , fill = NULL, alpha = NULL, label = NULL , shape = NULL, size = NULL, group = NULL , linetype = NULL )
covid %>% ggplot(aes(x = mobility_data_type)) + geom_bar() covid %>% ggplot() + geom_bar(aes(x = mobility_data_type))
covid %>% ggplot(aes(x = mobility_data_type, fill = mobility_data_type)) + geom_bar()
covid %>% ggplot(aes(x = mobility_data_type, fill = as.character(month))) + geom_bar()
Create a bar plot from the crime data that shows how many incidents occurred in each beat
- Only include these beat values: c('B3', 'E3', 'D2', 'R2', 'O1', 'C3', 'K3')
- Fill the bars by offense
- Only include these offense values: c('Impersonation', 'Driving Under the Influence', 'Pocket-picking', 'Embezzlement')
Hint
# Use to include/exclude values filter()
Create a bar plot from the crime data that shows how many incidents occurred in each beat
- Only include these beat values: c('B3', 'E3', 'D2', 'R2', 'O1', 'C3', 'K3')
- Fill the bars by offense
- Only include these offense values: c('Impersonation', 'Driving Under the Influence', 'Pocket-picking', 'Embezzlement')
Hint
crime %>% filter() %>% ggplot()
# Bar chart of incident counts per beat, with bars filled by offense.
crime %>%
  # Restrict to the selected beats and offenses
  filter(
    beat %in% c('B3', 'E3', 'D2', 'R2', 'O1', 'C3', 'K3'),
    offense %in% c(
      'Impersonation', 'Driving Under the Influence',
      'Pocket-picking', 'Embezzlement'
    )
  ) %>%
  ggplot(aes(x = beat, fill = offense)) +
  geom_bar()
Axis names
2 options
# Option 1
labs(x = 'X Axis Title', y = 'Y Axis Title')
# Option 2
xlab('X Axis Title')
ylab('Y Axis Title')
covid %>% filter(! mobility_data_type %in% NA) %>% ggplot(aes(x = mobility_data_type, fill = mobility_data_type)) + geom_bar() + labs(x = 'Mobility Data Type', y = 'Observations (#)')
Try adding axis names to the beat plot you made during the exercise
# Beat bar chart from the exercise, now with axis names added via labs().
crime %>%
  # Restrict to the selected beats and offenses
  filter(
    beat %in% c('B3', 'E3', 'D2', 'R2', 'O1', 'C3', 'K3'),
    offense %in% c(
      'Impersonation', 'Driving Under the Influence',
      'Pocket-picking', 'Embezzlement'
    )
  ) %>%
  ggplot(aes(x = beat, fill = offense)) +
  geom_bar() +
  labs(
    x = 'Seattle Beats',
    y = 'Incidents by Offense\nClearance Group (#)'
  )
Legend names
labs(fill = '') labs(fill = element_blank()) labs(fill = NULL) labs(colour = 'Check out these colors')
covid %>% filter(! mobility_data_type %in% NA) %>% ggplot(aes(x = mobility_data_type, fill = mobility_data_type)) + geom_bar() + labs(x = 'Mobility Data Type', y = 'Observations (#)', fill = 'Check out these colors')
Try removing the legend name from your beat plot
Try removing the legend name from your beat plot
# Beat bar chart with axis names and the fill legend name removed.
crime %>%
  # Restrict to the selected beats and offenses
  filter(
    beat %in% c('B3', 'E3', 'D2', 'R2', 'O1', 'C3', 'K3'),
    offense %in% c(
      'Impersonation', 'Driving Under the Influence',
      'Pocket-picking', 'Embezzlement'
    )
  ) %>%
  ggplot(aes(x = beat, fill = offense)) +
  geom_bar() +
  labs(
    x = 'Seattle Beats',
    y = 'Incidents by Offense\nClearance Group (#)',
    fill = element_blank()
  )
Chart titles and subtitles
# Two ways to set a chart title and subtitle.
# Option 1
labs(title = NULL, subtitle = NULL)
# Option 2 (ggtitle() names its first argument `label`, not `title`)
ggtitle(label = NULL, subtitle = NULL)
# Bar chart of observation counts per mobility data type, with axis names,
# no legend name, and a title/subtitle.
covid %>%
  # Drop rows with a missing mobility_data_type
  filter(!is.na(mobility_data_type)) %>%
  ggplot(aes(x = mobility_data_type, fill = mobility_data_type)) +
  geom_bar() +
  labs(
    x = 'Mobility Data Type',
    y = 'Observations (#)',
    fill = element_blank(),
    title = '2020 Covid-19 Mobility Types',
    subtitle = 'Nearly two-thirds of mobility data are observed'
  )
Try adding a title to the beat plot
Try adding a title to the beat plot
# Beat bar chart with axis names, no legend name, and a title.
crime %>%
  # Restrict to the selected beats and offenses
  filter(
    beat %in% c('B3', 'E3', 'D2', 'R2', 'O1', 'C3', 'K3'),
    offense %in% c(
      'Impersonation', 'Driving Under the Influence',
      'Pocket-picking', 'Embezzlement'
    )
  ) %>%
  ggplot(aes(x = beat, fill = offense)) +
  geom_bar() +
  labs(
    x = 'Seattle Beats',
    y = 'Incidents by Offense\nClearance Group (#)',
    fill = element_blank(),
    title = 'An Amazing Title'
  )
install.packages('scales')
library(scales)
scale_y_continuous(labels = percent) # Add a percentage sign to numbers on axis
scale_y_continuous(labels = dollar) # Add a dollar sign to numbers on axis
scale_y_continuous(labels = comma) # Add a comma to numbers on axis
# Mobility data type bar chart with comma-formatted numbers on the y axis.
covid %>%
  # Drop rows with a missing mobility_data_type
  filter(!is.na(mobility_data_type)) %>%
  ggplot(aes(x = mobility_data_type, fill = mobility_data_type)) +
  geom_bar() +
  labs(
    x = 'Mobility Data Type',
    y = 'Observations (#)',
    fill = element_blank(),
    title = '2020 Covid-19 Mobility Types',
    subtitle = 'Nearly two-thirds of mobility data are observed'
  ) +
  # comma comes from the scales package loaded earlier
  scale_y_continuous(labels = comma)
ggplot2 themesggthemes themesinstall.packages('ggthemes')
library(ggthemes)
Selected ggplot2 themes
theme_classic() theme_minimal() theme_dark()
Selected ggthemes themes
theme_stata() + scale_colour_stata() # scale_fill_stata()
theme_economist() + scale_colour_economist() # scale_fill_economist()
theme_fivethirtyeight() + scale_color_fivethirtyeight() # scale_fill_fivethirtyeight()
theme_wsj() + scale_colour_wsj() # scale_fill_wsj()
theme_pander() + scale_colour_pander() # scale_fill_pander()
theme_hc(bgcolor = "darkunica") + scale_colour_hc("darkunica") # scale_fill_hc("darkunica")
# Mobility data type bar chart styled with the ggthemes economist theme and
# its matching fill scale.
covid %>%
  # Drop rows with a missing mobility_data_type
  filter(!is.na(mobility_data_type)) %>%
  ggplot(aes(x = mobility_data_type, fill = mobility_data_type)) +
  geom_bar() +
  labs(
    x = 'Mobility Data Type',
    y = 'Observations (#)',
    fill = element_blank(),
    title = '2020 Covid-19 Mobility Types',
    subtitle = 'Nearly two-thirds of mobility data are observed'
  ) +
  scale_y_continuous(labels = comma) +
  theme_economist() +
  scale_fill_economist()
Economist theme
Try adding a theme to the beat plot
theme()theme( plot.title = element_text(size=14, face="bold", vjust=1) , plot.background = element_blank() , panel.grid.major = element_blank() , panel.grid.minor = element_blank() , panel.border = element_blank() , panel.background = element_blank() , axis.ticks = element_blank() , axis.text = element_text(colour="black", size=12) , axis.text.x = element_text(angle=45, hjust=1) , legend.title = element_blank() , legend.position = "none" , legend.text = element_text(size=12) )
We answered the two questions below in the previous class. How would you turn the tabular output data into a chart?
covid, which location reports the largest number of estimated infections in September, excluding 'Global' values?crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidents?covid %>% group_by(month, location) %>% summarise(est_infections = sum(est_infections, na.rm = TRUE)) %>% filter(month %in% 9 & ! location %in% 'Global') %>% arrange(desc(est_infections)) %>% head()
How would you turn the tabular output data into a chart?
# Horizontal bar chart of the first six locations, ordered by summed estimated
# infections in month 9, excluding 'Global'.
# The chain is split across lines so the inline comment after coord_flip() no
# longer comments out ggthemes::theme_hc() and leaves a dangling `+`.
covid %>%
  group_by(month, location) %>%
  summarise(est_infections = sum(est_infections, na.rm = TRUE)) %>%
  filter(month %in% 9 & ! location %in% 'Global') %>%
  arrange(desc(est_infections)) %>%
  head() %>%
  ggplot(aes(
    x = reorder(location, est_infections),
    y = est_infections,
    fill = location
  )) +
  # stat = "identity": the data is already aggregated
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(nrow = 3)) +
  labs(x = 'Location', y = 'Estimated Infections (#)') +
  coord_flip() + # coord_flip() is a new function
  ggthemes::theme_hc()
How would you turn the tabular output data into a chart?
How would you turn the tabular output data into a chart?
# Horizontal bar chart of the first six locations, ordered by summed estimated
# infections in month 9, excluding 'Global'.
# The chain is split across lines so the inline comment after coord_flip() no
# longer comments out ggthemes::theme_hc() and leaves a dangling `+`.
covid %>%
  group_by(month, location) %>%
  summarise(est_infections = sum(est_infections, na.rm = TRUE)) %>%
  filter(month %in% 9 & ! location %in% 'Global') %>%
  arrange(desc(est_infections)) %>%
  head() %>%
  ggplot(aes(
    x = reorder(location, est_infections),
    y = est_infections,
    fill = location
  )) +
  # stat = "identity": the data is already aggregated
  geom_bar(stat = "identity") +
  guides(fill = guide_legend(nrow = 3)) +
  labs(x = 'Location', y = 'Estimated Infections (#)') +
  coord_flip() + # coord_flip() is a new function
  ggthemes::theme_hc()
New argument and functions
geom_bar(stat = 'identity') # IMPORTANT: use stat when working with aggregated data reorder(location, est_infections) # reorder() to sort rows/columns in your visualization coord_flip() # coord_flip(): x becomes y; y becomes x
We answered the two questions below in the previous class. How would you turn the tabular output data into a chart?
covid, which location reports the largest number of estimated infections in September, excluding 'Global' values?crime, of all 'ASSAULT OFFENSES' values in offense_parent_group which offense value has the fewest incidents?crime %>% group_by(offense_parent_group, offense) %>% summarise(incident = n()) %>% filter(offense_parent_group %in% 'ASSAULT OFFENSES' & ! offense %in% NA) ## # A tibble: 3 x 3 ## # Groups: offense_parent_group [1] ## offense_parent_group offense incident ## <chr> <chr> <int> ## 1 ASSAULT OFFENSES Aggravated Assault 3754 ## 2 ASSAULT OFFENSES Intimidation 3963 ## 3 ASSAULT OFFENSES Simple Assault 9527
How would you turn the tabular output data into a chart?
# Bar chart of incident counts per offense within 'ASSAULT OFFENSES',
# with bars ordered from most to fewest incidents.
crime %>%
  group_by(offense_parent_group, offense) %>%
  summarise(incident = n()) %>%
  filter(
    offense_parent_group %in% 'ASSAULT OFFENSES',
    !is.na(offense)
  ) %>%
  ggplot(aes(x = reorder(offense, -incident), y = incident)) +
  geom_bar(stat = 'identity') +
  theme_classic()
crime dataset, report the mean difference in reported_date and occurred_date values by beat.
reported_date and occurred_date should be called date_difbeat only report rows where the number of beat values in the dataset is greater than 3000transmute and filter as well as group_by and summarise to solve this problembeat, date_dif, ndate_dif values by beatHint
crime %>%
transmute(
beat
, date_dif = reported_date - occurred_date
) %>%
crime dataset, report the mean difference in reported_date and occurred_date values by beat.
reported_date and occurred_date should be called date_difbeat only report rows where the number of beat values in the dataset is greater than 3000transmute and filter as well as group_by and summarise to solve this problembeat, date_dif, ndate_dif values by beatcrime %>%
transmute(
beat
, date_dif = reported_date - occurred_date
) %>%
group_by(beat) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
crime dataset, report the mean difference in reported_date and occurred_date values by beat.
reported_date and occurred_date should be called date_difbeat only report rows where the number of beat values in the dataset is greater than 3000transmute and filter as well as group_by and summarise to solve this problembeat, date_dif, ndate_dif values by beatcrime %>%
transmute(
beat
, date_dif = reported_date - occurred_date
) %>%
group_by(beat) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
filter(n > 3000) %>%
# Bar chart of the mean reported-minus-occurred date difference per beat,
# limited to beats with more than 3000 rows.
crime %>%
  transmute(beat, date_dif = reported_date - occurred_date) %>%
  group_by(beat) %>%
  summarise(date_dif = mean(date_dif, na.rm = TRUE), n = n()) %>%
  filter(n > 3000) %>%
  ggplot(aes(x = beat, y = date_dif)) +
  # stat = 'identity': the data is already aggregated
  geom_bar(stat = 'identity')
beat columns by date_difNA beat valuescrime %>%
transmute(
beat
, date_dif = reported_date - occurred_date
) %>%
group_by(beat) %>%
summarise(
date_dif = mean(date_dif, na.rm = TRUE)
, n = n()
) %>%
filter(n > 3000 & ! beat %in% c(NA, 'UNKNOWN')) %>%
ggplot(aes(reorder(beat, -date_dif), date_dif)) +
geom_bar(stat = 'identity') +
labs(
y = 'Average delay in reporting (days)'
, x = 'Beat'
, title = 'Beat U3 sees the longest delay in events reporting'
) +
theme_classic()
Histograms: 1 continuous variable
bins is the argument you use to indicate the number of columns in your histogramcovid %>% ggplot(aes(mobility_composite)) + geom_histogram()
Histograms: 1 continuous variable
bins is the argument you use to indicate the number of columns in your histogramcovid %>% ggplot(aes(mobility_composite)) + geom_histogram(bins = 10)
Scatterplots: 2 continuous variables
covid %>% ggplot(aes(x = deaths_p100k, y = est_infections_p100k)) + geom_point()
Scatterplots: 2 continuous variables
covid %>% ggplot(aes(x = deaths_p100k, y = est_infections_p100k)) + geom_point()
geom_count() # adds a size component to your scatterplot geom_jitter() # adjusts cartesian location of point relative to other points
Line plot: 1 continuous variable + date variable
# Line plot of total_tests summed per date.
covid %>%
  group_by(date) %>%
  summarise(total_tests = sum(total_tests, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = total_tests)) +
  geom_line(stat = 'identity')
Line plot: 1 continuous variable + date variable
# Line plot of the running total of total_tests across dates.
# The chain is split across lines so the inline comment after ggplot() no
# longer comments out geom_line() and leaves a dangling `+`.
covid %>%
  group_by(date) %>%
  summarise(total_tests = sum(total_tests, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = cumsum(total_tests))) + # cumsum() is a new function!
  geom_line(stat = 'identity')
covid or crime, create a scatterplot or line plotaes() arguments in addition to x and y
size, shape, color or alphagroup_by() and summarise() if you would likecrime %>% filter(! precinct %in% NA) %>% ggplot(aes(reported_date, occurred_date, color = precinct)) + geom_point() + theme_classic()
# Scatterplot of occurred_date against reported_date, colored by precinct.
crime %>%
  # Drop rows with a missing precinct
  filter(!is.na(precinct)) %>%
  ggplot(aes(x = reported_date, y = occurred_date, color = precinct)) +
  geom_point() +
  theme_classic()
crime dataset that shows the number of incidents by a character variable (you pick) and a time of day variable that indicates if the incident occurred in the AM or PM (first half or second half of the day)
mutate() called occurred_time_ampm that tells whether an incident occurred in the AM or PM.occurred_time_ampm in the color or fill arguments in aes()crime %>%
mutate(occurred_time_ampm = ifelse(occurred_time >= 1200, 'PM', 'AM')) %>%
filter(
! occurred_time_ampm %in% NA &
! precinct %in% NA
) %>%
group_by(precinct, occurred_time_ampm) %>%
summarise(n = n()) %>%
ungroup %>%
ggplot(aes(reorder(precinct, n), n, fill = occurred_time_ampm)) +
geom_bar(stat = 'identity') +
coord_flip() +
theme_wsj() +
scale_fill_wsj() +
labs(title = 'Most incidents occur\nin the PM', fill = 'Time of Day')