library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(babynames)
# Toy data
cases <- tribble(
~Country, ~"2011", ~"2012", ~"2013",
"FR", 7000, 6900, 7000,
"DE", 5800, 6000, 6200,
"US", 15000, 14000, 13000
)
pollution <- tribble(
~city, ~size, ~amount,
"New York", "large", 23,
"New York", "small", 14,
"London", "large", 22,
"London", "small", 16,
"Beijing", "large", 121,
"Beijing", "small", 121
)
names(who) <- stringr::str_replace(names(who),
"newrel",
"new_rel")
table1
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table2
## # A tibble: 12 × 4
## country year type count
## <chr> <int> <chr> <int>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
table3
## # A tibble: 6 × 3
## country year rate
## * <chr> <int> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
table4a
## # A tibble: 3 × 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
table4b
## # A tibble: 3 × 3
## country `1999` `2000`
## * <chr> <int> <int>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
table5
## # A tibble: 6 × 4
## country century year rate
## * <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
On a sheet of paper, draw how the cases data set would look if it had the same values grouped into three columns: country, year, n
Use gather()
to reorganize table4a
into three columns: country, year, and cases.
table4a %>%
gather(key = "year", value = "n", 2:3)
## # A tibble: 6 × 3
## country year n
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Brazil 1999 37737
## 3 China 1999 212258
## 4 Afghanistan 2000 2666
## 5 Brazil 2000 80488
## 6 China 2000 213766
On a sheet of paper, draw how this data set would look if it had the same values grouped into three columns: city, large, small
Use spread()
to reorganize table2
into four columns: country, year, cases, and population.
table2 %>%
spread(key = type, value = count)
## # A tibble: 6 × 4
## country year cases population
## <chr> <int> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
Gather the 5th through 60th columns of who
into a pair of key:value columns named codes
and n
.
Then select just the county
, year
, codes
and n
variables.
who %>%
gather(key = "codes", value = "n", 5:60) %>%
select(-iso2, -iso3)
## # A tibble: 405,440 × 4
## country year codes n
## <chr> <int> <chr> <int>
## 1 Afghanistan 1980 new_sp_m014 NA
## 2 Afghanistan 1981 new_sp_m014 NA
## 3 Afghanistan 1982 new_sp_m014 NA
## 4 Afghanistan 1983 new_sp_m014 NA
## 5 Afghanistan 1984 new_sp_m014 NA
## 6 Afghanistan 1985 new_sp_m014 NA
## 7 Afghanistan 1986 new_sp_m014 NA
## 8 Afghanistan 1987 new_sp_m014 NA
## 9 Afghanistan 1988 new_sp_m014 NA
## 10 Afghanistan 1989 new_sp_m014 NA
## # … with 405,430 more rows
Separate the sexage
column into sex
and age
columns.
(Hint: Be sure to remove each _
before running the code)
who %>%
gather("codes", "n", 5:60) %>%
select(-iso2, -iso3) %>%
separate(codes, c("new", "type", "sexage"), sep = "_") %>%
select(-new) %>%
separate(sexage, into = c("sex", "age"), sep = 1)
## # A tibble: 405,440 × 6
## country year type sex age n
## <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan 1980 sp m 014 NA
## 2 Afghanistan 1981 sp m 014 NA
## 3 Afghanistan 1982 sp m 014 NA
## 4 Afghanistan 1983 sp m 014 NA
## 5 Afghanistan 1984 sp m 014 NA
## 6 Afghanistan 1985 sp m 014 NA
## 7 Afghanistan 1986 sp m 014 NA
## 8 Afghanistan 1987 sp m 014 NA
## 9 Afghanistan 1988 sp m 014 NA
## 10 Afghanistan 1989 sp m 014 NA
## # … with 405,430 more rows
Extend this code to reshape the data into a data set with three columns:
year
M
F
Calculate the percent of male (or female) children by year. Then plot the percent over time.
babynames %>%
group_by(year, sex) %>%
summarise(n = sum(n))
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
## # A tibble: 276 × 3
## # Groups: year [138]
## year sex n
## <dbl> <chr> <int>
## 1 1880 F 90993
## 2 1880 M 110491
## 3 1881 F 91953
## 4 1881 M 100743
## 5 1882 F 107847
## 6 1882 M 113686
## 7 1883 F 112319
## 8 1883 M 104627
## 9 1884 F 129020
## 10 1884 M 114442
## # … with 266 more rows
Data comes in many formats but R prefers just one: tidy data.
A data set is tidy if and only if:
What is a variable and an observation may depend on your immediate goal.