#Case Study: 2014 World Health Organization Global Tuberculosis Report Data contains redundant columns, odd variable codes, and many missing values,let’s tidy:
library(tidyverse)
## -- Attaching packages ----------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts -------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
#build up a complex pipe:
who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
) %>%
mutate(
key = stringr::str_replace(key, "newrel", "new_rel")
) %>%
separate(key, c("new", "var", "sexage")) %>%
select(-new, -iso2, -iso3) %>%
separate(sexage, c("sex", "age"), sep = 1)
## # A tibble: 76,046 x 6
## country year var sex age cases
## <chr> <int> <chr> <chr> <chr> <int>
## 1 Afghanistan 1997 sp m 014 0
## 2 Afghanistan 1997 sp m 1524 10
## 3 Afghanistan 1997 sp m 2534 6
## 4 Afghanistan 1997 sp m 3544 3
## 5 Afghanistan 1997 sp m 4554 5
## 6 Afghanistan 1997 sp m 5564 2
## 7 Afghanistan 1997 sp m 65 0
## 8 Afghanistan 1997 sp f 014 5
## 9 Afghanistan 1997 sp f 1524 38
## 10 Afghanistan 1997 sp f 2534 36
## # ... with 76,036 more rows
who
## # A tibble: 7,240 x 60
## country iso2 iso3 year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
## <chr> <chr> <chr> <int> <int> <int> <int> <int>
## 1 Afghan~ AF AFG 1980 NA NA NA NA
## 2 Afghan~ AF AFG 1981 NA NA NA NA
## 3 Afghan~ AF AFG 1982 NA NA NA NA
## 4 Afghan~ AF AFG 1983 NA NA NA NA
## 5 Afghan~ AF AFG 1984 NA NA NA NA
## 6 Afghan~ AF AFG 1985 NA NA NA NA
## 7 Afghan~ AF AFG 1986 NA NA NA NA
## 8 Afghan~ AF AFG 1987 NA NA NA NA
## 9 Afghan~ AF AFG 1988 NA NA NA NA
## 10 Afghan~ AF AFG 1989 NA NA NA NA
## # ... with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,
## # new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
## # new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
## # new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
## # new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
## # new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
## # new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
## # new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
## # new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
## # new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
## # new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
## # new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
## # new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
## # new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
## # newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
## # newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
## # newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
## # newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
#It looks like country, iso2, and iso3 are three variables that redundantly specify the country.
#year is clearly also a variable.
#We don’t know what all the other columns are yet, but given the structure in the variable names (e.g. new_sp_m014, new_ep_m014, new_ep_f014) these are likely to be values, not variables.
who1 <- who %>%
pivot_longer(
cols = new_sp_m014:newrel_f65,
names_to = "key",
values_to = "cases",
values_drop_na = TRUE
)
#We can get some hint of the structure of the values in the new key column by counting them:
who1 %>%
count(key)
## # A tibble: 56 x 2
## key n
## <chr> <int>
## 1 new_ep_f014 1032
## 2 new_ep_f1524 1021
## 3 new_ep_f2534 1021
## 4 new_ep_f3544 1021
## 5 new_ep_f4554 1017
## 6 new_ep_f5564 1017
## 7 new_ep_f65 1014
## 8 new_ep_m014 1038
## 9 new_ep_m1524 1026
## 10 new_ep_m2534 1020
## # ... with 46 more rows
who1
## # A tibble: 76,046 x 6
## country iso2 iso3 year key cases
## <chr> <chr> <chr> <int> <chr> <int>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2
## 7 Afghanistan AF AFG 1997 new_sp_m65 0
## 8 Afghanistan AF AFG 1997 new_sp_f014 5
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36
## # ... with 76,036 more rows
who2 <- who1 %>%
mutate(names_from = stringr::str_replace(key, "newrel", "new_rel"))
who2
## # A tibble: 76,046 x 7
## country iso2 iso3 year key cases names_from
## <chr> <chr> <chr> <int> <chr> <int> <chr>
## 1 Afghanistan AF AFG 1997 new_sp_m014 0 new_sp_m014
## 2 Afghanistan AF AFG 1997 new_sp_m1524 10 new_sp_m1524
## 3 Afghanistan AF AFG 1997 new_sp_m2534 6 new_sp_m2534
## 4 Afghanistan AF AFG 1997 new_sp_m3544 3 new_sp_m3544
## 5 Afghanistan AF AFG 1997 new_sp_m4554 5 new_sp_m4554
## 6 Afghanistan AF AFG 1997 new_sp_m5564 2 new_sp_m5564
## 7 Afghanistan AF AFG 1997 new_sp_m65 0 new_sp_m65
## 8 Afghanistan AF AFG 1997 new_sp_f014 5 new_sp_f014
## 9 Afghanistan AF AFG 1997 new_sp_f1524 38 new_sp_f1524
## 10 Afghanistan AF AFG 1997 new_sp_f2534 36 new_sp_f2534
## # ... with 76,036 more rows
who3 <- who2 %>%
separate(key, c("new", "type", "sexage"), sep = "_")
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
## 244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
## 904, 905, 906, ...].
who3
## # A tibble: 76,046 x 9
## country iso2 iso3 year new type sexage cases names_from
## <chr> <chr> <chr> <int> <chr> <chr> <chr> <int> <chr>
## 1 Afghanistan AF AFG 1997 new sp m014 0 new_sp_m014
## 2 Afghanistan AF AFG 1997 new sp m1524 10 new_sp_m1524
## 3 Afghanistan AF AFG 1997 new sp m2534 6 new_sp_m2534
## 4 Afghanistan AF AFG 1997 new sp m3544 3 new_sp_m3544
## 5 Afghanistan AF AFG 1997 new sp m4554 5 new_sp_m4554
## 6 Afghanistan AF AFG 1997 new sp m5564 2 new_sp_m5564
## 7 Afghanistan AF AFG 1997 new sp m65 0 new_sp_m65
## 8 Afghanistan AF AFG 1997 new sp f014 5 new_sp_f014
## 9 Afghanistan AF AFG 1997 new sp f1524 38 new_sp_f1524
## 10 Afghanistan AF AFG 1997 new sp f2534 36 new_sp_f2534
## # ... with 76,036 more rows
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
half = c( 1, 2, 1, 2),
return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
## # A tibble: 4 x 3
## half year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 1 2016 0.92
## 3 2 2015 0.59
## 4 2 2016 0.17
onetwothree <- tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].
onetwothree %>%
unite(new, two, three, sep = "", na.rm=FALSE)
## # A tibble: 3 x 2
## one new
## <chr> <chr>
## 1 a bc
## 2 d ef
## 3 h ij
stocksMvals <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
#The return for the fourth quarter of 2015 is explicitly missing, because the cell where its value should be instead contains NA.
#The return for the first quarter of 2016 is implicitly missing, because it simply does not appear in the dataset.
#make the implicit missing value explicit by putting years in the columns:
stocksMvals %>%
pivot_wider(names_from = year, values_from = return)
## # A tibble: 4 x 3
## qtr `2015` `2016`
## <dbl> <dbl> <dbl>
## 1 1 1.88 NA
## 2 2 0.59 0.92
## 3 3 0.35 0.17
## 4 4 NA 2.66
#or turn explicit missing values implicit:
stocksMvals %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(
cols = c(`2015`, `2016`),
names_to = "year",
values_to = "return",
values_drop_na = TRUE
)
## # A tibble: 6 x 3
## qtr year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 2 2016 0.92
## 4 3 2015 0.35
## 5 3 2016 0.17
## 6 4 2016 2.66
stocksMvals %>%
complete(year, qtr)
## # A tibble: 8 x 3
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2015 1 1.88
## 2 2015 2 0.59
## 3 2015 3 0.35
## 4 2015 4 NA
## 5 2016 1 NA
## 6 2016 2 0.92
## 7 2016 3 0.17
## 8 2016 4 2.66
treatment <- tribble(
~ person, ~ treatment, ~response,
"Derrick Whitmore", 1, 7,
NA, 2, 10,
NA, 3, 9,
"Katherine Burke", 1, 4
)
#last observation carried forward
treatment %>%
fill(person)
## # A tibble: 4 x 3
## person treatment response
## <chr> <dbl> <dbl>
## 1 Derrick Whitmore 1 7
## 2 Derrick Whitmore 2 10
## 3 Derrick Whitmore 3 9
## 4 Katherine Burke 1 4