# Extract the number of TB cases per country per year.
t2_cases <- filter(table2, type == "cases") %>%
rename(cases = count) %>%
arrange(country, year)
#Extract the matching population per country per year.
t2_population <- filter(table2, type == "population") %>%
rename(population = count) %>%
arrange(country, year)
#Create a new data frame with the population and cases columns, and calculate the cases per capita in a new column.
t2_cases_per_cap <- tibble(
year = t2_cases$year,
country = t2_cases$country,
cases = t2_cases$cases,
population = t2_population$population
) %>%
mutate(cases_per_cap = (cases / population) * 10000) %>%
select(country, year, cases_per_cap)
#Store this new variable to table 2
t2_cases_per_cap <- t2_cases_per_cap %>%
mutate(type = "cases_per_cap") %>%
rename(count = cases_per_cap)
bind_rows(table2, t2_cases_per_cap) %>%
arrange(country, year, type, count)
## # A tibble: 18 × 4
## country year type count
## <chr> <int> <chr> <dbl>
## 1 Afghanistan 1999 cases 7.45e+2
## 2 Afghanistan 1999 cases_per_cap 3.73e-1
## 3 Afghanistan 1999 population 2.00e+7
## 4 Afghanistan 2000 cases 2.67e+3
## 5 Afghanistan 2000 cases_per_cap 1.29e+0
## 6 Afghanistan 2000 population 2.06e+7
## 7 Brazil 1999 cases 3.77e+4
## 8 Brazil 1999 cases_per_cap 2.19e+0
## 9 Brazil 1999 population 1.72e+8
## 10 Brazil 2000 cases 8.05e+4
## 11 Brazil 2000 cases_per_cap 4.61e+0
## 12 Brazil 2000 population 1.75e+8
## 13 China 1999 cases 2.12e+5
## 14 China 1999 cases_per_cap 1.67e+0
## 15 China 1999 population 1.27e+9
## 16 China 2000 cases 2.14e+5
## 17 China 2000 cases_per_cap 1.67e+0
## 18 China 2000 population 1.28e+9
#Before creating the plot with change in cases over time, we need to filter table to only include rows representing cases of TB.
table2 %>%
filter(type == "cases") %>%
ggplot(aes(year, count)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country)) +
scale_x_continuous(breaks = unique(table2$year)) +
ylab("cases")
Carefully consider the following example:
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
half = c( 1, 2, 1, 2),
return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
## # A tibble: 4 × 3
## half year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 1 2016 0.92
## 3 2 2015 0.59
## 4 2 2016 0.17
#The code fails because the column names 1999 and 2000 are not non-syntactic variable names.To select the columns 1999 and 2000, the names must be strings.
table4a %>%
pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases")
## Error in `loc_validate()`:
## ! Can't subset columns past the end.
## ℹ Locations 1999 and 2000 don't exist.
## ℹ There are only 3 columns.
table4a %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
## # A tibble: 6 × 3
## country year cases
## <chr> <chr> <int>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
people <- tribble(
~name, ~key, ~value,
#-----------------|--------|------
"Phillip Woods", "age", 45,
"Phillip Woods", "height", 186,
"Phillip Woods", "age", 50,
"Jessica Cordero", "age", 37,
"Jessica Cordero", "height", 156
)
glimpse(people)
## Rows: 5
## Columns: 3
## $ name <chr> "Phillip Woods", "Phillip Woods", "Phillip Woods", "Jessica Cord…
## $ key <chr> "age", "height", "age", "age", "height"
## $ value <dbl> 45, 186, 50, 37, 156
#Widening this data frame using pivot_wider() produces columns that are lists of numeric vectors because the name and key columns do not uniquely identify rows. In particular, there are two rows with values for the age of “Phillip Woods”.
pivot_wider(people, names_from="name", values_from = "value")
## Warning: Values from `value` are not uniquely identified; output will contain list-cols.
## * Use `values_fn = list` to suppress this warning.
## * Use `values_fn = {summary_fun}` to summarise duplicates.
## * Use the following dplyr code to identify duplicates.
## {data} %>%
## dplyr::group_by(key, name) %>%
## dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
## dplyr::filter(n > 1L)
## # A tibble: 2 × 3
## key `Phillip Woods` `Jessica Cordero`
## <chr> <list> <list>
## 1 age <dbl [2]> <dbl [1]>
## 2 height <dbl [1]> <dbl [1]>
#We could solve the problem by adding a row with a distinct observation count for each combination of name and key.
people2 <- people %>%
group_by(name, key) %>%
mutate(obs = row_number())
people2
## # A tibble: 5 × 4
## # Groups: name, key [4]
## name key value obs
## <chr> <chr> <dbl> <int>
## 1 Phillip Woods age 45 1
## 2 Phillip Woods height 186 1
## 3 Phillip Woods age 50 2
## 4 Jessica Cordero age 37 1
## 5 Jessica Cordero height 156 1
#We can make people2 wider because the combination of name and obs will uniquely identify the rows in the wide data frame.
pivot_wider(people2, names_from="name", values_from = "value")
## # A tibble: 3 × 4
## # Groups: key [2]
## key obs `Phillip Woods` `Jessica Cordero`
## <chr> <int> <dbl> <dbl>
## 1 age 1 45 37
## 2 height 1 186 156
## 3 age 2 50 NA
# example with separators
tibble(x = c("X_1", "X_2", "AA_1", "AA_2")) %>%
separate(x, c("variable", "into"), sep = "_")
## # A tibble: 4 × 2
## variable into
## <chr> <chr>
## 1 X 1
## 2 X 2
## 3 AA 1
## 4 AA 2
# example with position
tibble(x = c("X1", "X2", "Y1", "Y2")) %>%
separate(x, c("variable", "into"), sep = c(1))
## # A tibble: 4 × 2
## variable into
## <chr> <chr>
## 1 X 1
## 2 X 2
## 3 Y 1
## 4 Y 2
# example with separators
tibble(x = c("X_1", "X_2", "AA_1", "AA_2")) %>%
extract(x, c("variable", "id"), regex = "([A-Z])_([0-9])")
## # A tibble: 4 × 2
## variable id
## <chr> <chr>
## 1 X 1
## 2 X 2
## 3 A 1
## 4 A 2
# example with position
tibble(x = c("X1", "X2", "Y1", "Y2")) %>%
extract(x, c("variable", "id"), regex = "([A-Z])([0-9])")
## # A tibble: 4 × 2
## variable id
## <chr> <chr>
## 1 X 1
## 2 X 2
## 3 Y 1
## 4 Y 2
# example that separate could not parse
tibble(x = c("X1", "X20", "AA11", "AA2")) %>%
extract(x, c("variable", "id"), regex = "([A-Z]+)([0-9]+)")
## # A tibble: 4 × 2
## variable id
## <chr> <chr>
## 1 X 1
## 2 X 20
## 3 AA 11
## 4 AA 2
tibble(variable = c("X", "X", "Y", "Y"), id = c(1, 2, 1, 2)) %>%
unite(x, variable, id, sep = "_")
## # A tibble: 4 × 1
## x
## <chr>
## 1 X_1
## 2 X_2
## 3 Y_1
## 4 Y_2
#For example, this will fill in the missing values of the long data frame with 0 complete():
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
stocks %>%
pivot_wider(names_from = year, values_from = return,
values_fill = 0)
## # A tibble: 4 × 3
## qtr `2015` `2016`
## <dbl> <dbl> <dbl>
## 1 1 1.88 0
## 2 2 0.59 0.92
## 3 3 0.35 0.17
## 4 4 NA 2.66
#For example, this will fill in the missing values of the long data frame with 0 complete():
stocks %>%
complete(year, qtr, fill=list(return=0))
## # A tibble: 8 × 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 0
## 5 2016 1 0
## 6 2016 2 0.92
## 7 2016 3 0.17
## 8 2016 4 2.66
who1 <- who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
)
who2 <- who1 %>%
mutate(names_from = stringr::str_replace(key, "newrel", "new_rel"))
who3 <- who2 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
## 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
## 904, 905, 906, ...].
who3 %>%
count(new)
## # A tibble: 2 × 2
## new n
## <chr> <int>
## 1 new 73466
## 2 newrel 2580
who4 <- who3 %>%
select(-new, -iso2, -iso3)
who5 <- who4 %>%
separate(sexage, c("sex", "age"), sep = 1)
#check if there are zeros in the data
who1 %>%
filter(cases == 0) %>%
nrow()
## [1] 11080
#check whether all values for a (country, year) are missing or whether it is possible for only some columns to be missing.
pivot_longer(who, c(new_sp_m014:newrel_f65), names_to = "key", values_to = "cases") %>%
group_by(country, year) %>%
mutate(prop_missing = sum(is.na(cases)) / n()) %>%
filter(prop_missing > 0, prop_missing < 1)
## # A tibble: 195,104 × 7
## # Groups: country, year [3,484]
## country iso2 iso3 year key cases prop_missing
## <chr> <chr> <chr> <int> <chr> <int> <dbl>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0 0.75
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10 0.75
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6 0.75
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3 0.75
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5 0.75
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2 0.75
## 7 Afghanistan AF AFG 1997 new_sp_m65 0 0.75
## 8 Afghanistan AF AFG 1997 new_sp_f014 5 0.75
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38 0.75
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36 0.75
## # … with 195,094 more rows
# check for implicit missing values. Implicit missing values are (year, country) combinations that do not appear in the data.
nrow(who)
## [1] 7240
who %>%
complete(country, year) %>%
nrow()
## [1] 7446
#The separate() function emits the warning “too few values”. If we check the rows for keys beginning with "newrel_", we see that sexage is missing, and type = m014.
who3a <- who1 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
## 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
## 904, 905, 906, ...].
filter(who3a, new == "newrel") %>% head()
## # A tibble: 6 × 8
## country iso2 iso3 year new type sexage cases
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan AF AFG 2013 newrel m014 <NA> 1705
## 2 Afghanistan AF AFG 2013 newrel f014 <NA> 1749
## 3 Albania AL ALB 2013 newrel m014 <NA> 14
## 4 Albania AL ALB 2013 newrel m1524 <NA> 60
## 5 Albania AL ALB 2013 newrel m2534 <NA> 61
## 6 Albania AL ALB 2013 newrel m3544 <NA> 32
who5 %>%
group_by(country, year, sex) %>%
filter(year > 1995) %>%
summarise(cases = sum(cases)) %>%
unite(country_sex, country, sex, remove = FALSE) %>%
ggplot(aes(x = year, y = cases, group = country_sex, colour = sex)) +
geom_line()
## `summarise()` has grouped output by 'country', 'year'. You can override using
## the `.groups` argument.