The tidyr package (part of the tidyverse) helps to organize or restructure messy or inconvenient datasets.

The Datasets


The following datasets will be used to demonstrate some of tidyr’s functionality. Each dataset shows the same four variables in a different way: country, year, population, and cases

> library(tidyverse) #load tidyr and datasets
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
> 
> # table1
> kable(table1) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year cases population
Afghanistan 1999 745 19987071
Afghanistan 2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 1999 212258 1272915272
China 2000 213766 1280428583
> # table2
> kable(table2) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year type count
Afghanistan 1999 cases 745
Afghanistan 1999 population 19987071
Afghanistan 2000 cases 2666
Afghanistan 2000 population 20595360
Brazil 1999 cases 37737
Brazil 1999 population 172006362
Brazil 2000 cases 80488
Brazil 2000 population 174504898
China 1999 cases 212258
China 1999 population 1272915272
China 2000 cases 213766
China 2000 population 1280428583
> # table3
> kable(table3) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year rate
Afghanistan 1999 745/19987071
Afghanistan 2000 2666/20595360
Brazil 1999 37737/172006362
Brazil 2000 80488/174504898
China 1999 212258/1272915272
China 2000 213766/1280428583
> # table4a
> kable(table4a) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country 1999 2000
Afghanistan 745 2666
Brazil 37737 80488
China 212258 213766
> # table4b
> kable(table4b) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country 1999 2000
Afghanistan 19987071 20595360
Brazil 172006362 174504898
China 1272915272 1280428583

There are three rules (interrelated) that make a datset tidy:

  1. Each variable must have its own column.
  2. Each observation must have its own row.
  3. Each value must have its own cell.

In this case only table1 is tidy.

The Datasets - Exercises

  1. Compute the rate for table2 and table4a + table4b. You will need to perform four operations:
  1. Extract the number of TB cases per country per year.

  2. Extract the matching population per country per year.

  3. Divide cases by population and multiply by 10,000.

  4. Store back in the appropriate place.

  • Which representation is easiest to work with?

Table 4a and 4b were easier to work with.

Table2

> library(tidyverse)
> 
> (TB_yr <- table2 %>% filter(type=="cases"))
# A tibble: 6 x 4
  country      year type   count
  <chr>       <int> <chr>  <int>
1 Afghanistan  1999 cases    745
2 Afghanistan  2000 cases   2666
3 Brazil       1999 cases  37737
4 Brazil       2000 cases  80488
5 China        1999 cases 212258
6 China        2000 cases 213766
> (Pop_yr <- table2 %>% filter(type=="population"))
# A tibble: 6 x 4
  country      year type            count
  <chr>       <int> <chr>           <int>
1 Afghanistan  1999 population   19987071
2 Afghanistan  2000 population   20595360
3 Brazil       1999 population  172006362
4 Brazil       2000 population  174504898
5 China        1999 population 1272915272
6 China        2000 population 1280428583
> (Rate_yr <- TB_yr %>% inner_join(Pop_yr, by=c("country","year")) %>% 
+   mutate(rate=count.x/count.y*10000, type.z="rate") %>% 
+     select(country,year,type=type.z,count=rate)) 
# A tibble: 6 x 4
  country      year type  count
  <chr>       <int> <chr> <dbl>
1 Afghanistan  1999 rate  0.373
2 Afghanistan  2000 rate  1.29 
3 Brazil       1999 rate  2.19 
4 Brazil       2000 rate  4.61 
5 China        1999 rate  1.67 
6 China        2000 rate  1.67 
> bind_rows(TB_yr,Pop_yr,Rate_yr) %>% 
+   arrange(country,year,type)
# A tibble: 18 x 4
   country      year type         count
   <chr>       <int> <chr>        <dbl>
 1 Afghanistan  1999 cases      7.45e+2
 2 Afghanistan  1999 population 2.00e+7
 3 Afghanistan  1999 rate       3.73e-1
 4 Afghanistan  2000 cases      2.67e+3
 5 Afghanistan  2000 population 2.06e+7
 6 Afghanistan  2000 rate       1.29e+0
 7 Brazil       1999 cases      3.77e+4
 8 Brazil       1999 population 1.72e+8
 9 Brazil       1999 rate       2.19e+0
10 Brazil       2000 cases      8.05e+4
11 Brazil       2000 population 1.75e+8
12 Brazil       2000 rate       4.61e+0
13 China        1999 cases      2.12e+5
14 China        1999 population 1.27e+9
15 China        1999 rate       1.67e+0
16 China        2000 cases      2.14e+5
17 China        2000 population 1.28e+9
18 China        2000 rate       1.67e+0

Table4a + Table4b

> table4a %>% inner_join(table4b,by="country",
+                        suffix=c("_cases","_pop")) %>% 
+   mutate(`1999_rate`=`1999_cases`/`1999_pop`*10000,
+          `2000_rate`=`2000_cases`/`2000_pop`*10000) %>% 
+   select(country,`1999`=`1999_rate`,`2000`=`2000_rate`)
# A tibble: 3 x 3
  country     `1999` `2000`
  <chr>        <dbl>  <dbl>
1 Afghanistan  0.373   1.29
2 Brazil       2.19    4.61
3 China        1.67    1.67
  1. Re-create the plot showing change in cases over time using table2 instead of table1.
> table2 %>%
+   filter(type == "cases") %>%
+   ggplot(aes(year, count)) +
+   geom_line(aes(group = country), color = "grey50") +
+   geom_point(aes(color = country)) +
+   scale_x_continuous(breaks = unique(table2$year)) +
+   ylab("cases")

Pivoting Longer (Gathering)


Sometimes columns would be better represented as values of a variable. In the case of table4a, 1999 and 2000 should be gathered and placed into the variable year.

This can be accomplished with the function gather()

  • key = The gathered column’s name
  • value = The column name for the values spread over those cells.
> #using gather
> table4a %>% 
+   gather(`1999`,`2000`,key="year",value="cases") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year cases
Afghanistan 1999 745
Brazil 1999 37737
China 1999 212258
Afghanistan 2000 2666
Brazil 2000 80488
China 2000 213766

However, the gather() function has be sunsetted and replaced with pivot_longer().

pivot_longer() “lengthens” data, increasing the number of rows and decreasing the number of columns.

Below you can see the same output with the newer function. Note that instead of selecting 1999 and 2000 I just de-selected country instead.

> #using pivot longer
> table4a %>% 
+   pivot_longer(-country,names_to="year",values_to="cases") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year cases
Afghanistan 1999 745
Afghanistan 2000 2666
Brazil 1999 37737
Brazil 2000 80488
China 1999 212258
China 2000 213766

We can do the same with table4b.

> #using pivot longer
> table4b %>% 
+   pivot_longer(-country,names_to="year",values_to="population") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year population
Afghanistan 1999 19987071
Afghanistan 2000 20595360
Brazil 1999 172006362
Brazil 2000 174504898
China 1999 1272915272
China 2000 1280428583

We can then join table4a and table4b with a left join from dplyr. The output matches table1.

> tidy4a <- table4a %>% 
+   pivot_longer(-country,names_to="year",values_to="cases")
> 
> tidy4b <- table4b %>% 
+   pivot_longer(-country,names_to="year",values_to="population")
> 
> left_join(tidy4a,tidy4b) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
Joining, by = c("country", "year")
country year cases population
Afghanistan 1999 745 19987071
Afghanistan 2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 1999 212258 1272915272
China 2000 213766 1280428583

We can now plot case growth over time.

> CombinTbl <- left_join(tidy4a,tidy4b)
> ggplot(CombinTbl,aes(year,cases))+
+   geom_line(aes(group=country),color="grey50")+
+   geom_point(aes(color=country))+theme_light()

Pivoting Wider (Spreading)


The function spread() is the opposite of gather() and the function pivot_wider is the opposite of pivot_longer.

Like gather(), spread() has been sunsetted. Going forward it is best to use pivot_wider.

pivot_wider() “widens” data, increasing the number of columns and decreasing the number of rows.

> #table2
> table2 %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year type count
Afghanistan 1999 cases 745
Afghanistan 1999 population 19987071
Afghanistan 2000 cases 2666
Afghanistan 2000 population 20595360
Brazil 1999 cases 37737
Brazil 1999 population 172006362
Brazil 2000 cases 80488
Brazil 2000 population 174504898
China 1999 cases 212258
China 1999 population 1272915272
China 2000 cases 213766
China 2000 population 1280428583
> #table 2 spread
> table2 %>% spread(key=type, value=count) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year cases population
Afghanistan 1999 745 19987071
Afghanistan 2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 1999 212258 1272915272
China 2000 213766 1280428583
> #table2 pivot wider
> table2 %>% pivot_wider(names_from=type, values_from=count ) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year cases population
Afghanistan 1999 745 19987071
Afghanistan 2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 1999 212258 1272915272
China 2000 213766 1280428583

Pivoting - Exercises

  1. Why are pivot_longer() and pivot_wider() not perfectly symmetrical? Carefully consider the following example:
> stocks <- tibble(
+   year   = c(2015, 2015, 2016, 2016),
+   half  = c(   1,    2,     1,    2),
+   return = c(1.88, 0.59, 0.92, 0.17)
+ )
> stocks %>% 
+   pivot_wider(names_from = year, values_from = return) %>% 
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
  • In the original tibble year was <dbl>. When we went from pivot_wider back to pivot_longer it became <chr>.
  • pivot_wider converted the years into column names (characters).
> (stocks <- tibble(
+   year   = c(2015, 2015, 2016, 2016),
+   half  = c(   1,    2,     1,    2),
+   return = c(1.88, 0.59, 0.92, 0.17)
+ ))
# A tibble: 4 x 3
   year  half return
  <dbl> <dbl>  <dbl>
1  2015     1   1.88
2  2015     2   0.59
3  2016     1   0.92
4  2016     2   0.17
> stocks %>% 
+   pivot_wider(names_from = year, values_from = return) %>% 
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
# A tibble: 4 x 3
   half year  return
  <dbl> <chr>  <dbl>
1     1 2015    1.88
2     1 2016    0.92
3     2 2015    0.59
4     2 2016    0.17

pivot_longer() has a names_ptypes argument, e.g. names_ptypes = list(year = double()). What does it do?

  • names_ptypes and names_transform allow you to confirm or change the type of specific columns.
  • names_ptypes - A list of column name-prototype pairs. A prototype (or ptype for short) is a zero-length vector (like integer() or numeric()) that defines the type, class, and attributes of a vector. Use these arguments to confirm that the created columns are the types that you expect
  • names_transform - A list of column name-function pairs. Use these arguments if you need to change the type of specific columns.
> stocks %>%
+   pivot_wider(names_from = year, values_from = return)%>%
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return",
+                names_ptypes = list(year = double()))
Error: Can't convert <character> to <double>.
> stocks %>%
+   pivot_wider(names_from = year, values_from = return)%>%
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return",
+                names_transform = list(year = as.numeric))
# A tibble: 4 x 3
   half  year return
  <dbl> <dbl>  <dbl>
1     1  2015   1.88
2     1  2016   0.92
3     2  2015   0.59
4     2  2016   0.17
  1. Why does this code fail?
> table4a %>% 
+   pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases")
Error: Can't subset columns that don't exist.
x Locations 1999 and 2000 don't exist.
i There are only 3 columns.
  • The code interprets 1999 and 2000 as column numbers. They need to be surrounded by backticks.
> table4a %>% 
+   pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
# A tibble: 6 x 3
  country     year   cases
  <chr>       <chr>  <int>
1 Afghanistan 1999     745
2 Afghanistan 2000    2666
3 Brazil      1999   37737
4 Brazil      2000   80488
5 China       1999  212258
6 China       2000  213766
  1. What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?
> people <- tribble(
+   ~name, ~key, ~value,
+   #-----------------|--------|------
+   "Phillip Woods",  "age", 45,
+   "Phillip Woods", "height", 186,
+   "Phillip Woods", "age", 50,
+   "Jessica Cordero", "age", 37,
+   "Jessica Cordero", "height", 156
+ )
  • The name and key don’t uniquely identify each value. You can group them and add a row number.
> (people2 <- people %>%
+   group_by(name, key) %>%
+   mutate(obs = row_number()))
# A tibble: 5 x 4
# Groups:   name, key [4]
  name            key    value   obs
  <chr>           <chr>  <dbl> <int>
1 Phillip Woods   age       45     1
2 Phillip Woods   height   186     1
3 Phillip Woods   age       50     2
4 Jessica Cordero age       37     1
5 Jessica Cordero height   156     1
> pivot_wider(people2, names_from="name", values_from = "value")
# A tibble: 3 x 4
# Groups:   key [2]
  key      obs `Phillip Woods` `Jessica Cordero`
  <chr>  <int>           <dbl>             <dbl>
1 age        1              45                37
2 height     1             186               156
3 age        2              50                NA
  1. Tidy the simple tibble below. Do you need to make it wider or longer?
> preg <- tribble(
+   ~pregnant, ~male, ~female,
+   "yes", NA, 10,
+   "no", 20, 12
+ )
  • pivot_longer works well with this data.
> preg %>% pivot_longer(c(male, female), names_to = "sex", 
+              values_to = "count", values_drop_na = TRUE)
# A tibble: 3 x 3
  pregnant sex    count
  <chr>    <chr>  <dbl>
1 yes      female    10
2 no       male      20
3 no       female    12

Separate


separate() pulls apart one column into multiple columns based on a separator character.

> table3 %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year rate
Afghanistan 1999 745/19987071
Afghanistan 2000 2666/20595360
Brazil 1999 37737/172006362
Brazil 2000 80488/174504898
China 1999 212258/1272915272
China 2000 213766/1280428583
> table3 %>% separate(rate, into=c("cases","population")) %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country year cases population
Afghanistan 1999 745 19987071
Afghanistan 2000 2666 20595360
Brazil 1999 37737 172006362
Brazil 2000 80488 174504898
China 1999 212258 1272915272
China 2000 213766 1280428583

By default it selects a non alphanumeric character on which to split the variable. However, you can select it:

> table3 %>% separate(rate, into=c("cases","population"), sep="/")

Also, by default separate() leaves the type of column as is. In this instance both cases and population remain as character. However, by adding convert=TRUE the function will change them to integers, which is more appropriate.

> SepTbl3a <- table3 %>% separate(rate, into=c("cases","population"), sep="/")
> 
> SepTbl3b <- table3 %>% separate(
+   rate, into=c("cases","population"),
+   convert=TRUE
+ )
> 
> str(SepTbl3a)
tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
 $ country   : chr [1:6] "Afghanistan" "Afghanistan" "Brazil" "Brazil" ...
 $ year      : int [1:6] 1999 2000 1999 2000 1999 2000
 $ cases     : chr [1:6] "745" "2666" "37737" "80488" ...
 $ population: chr [1:6] "19987071" "20595360" "172006362" "174504898" ...
> str(SepTbl3b)
tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
 $ country   : chr [1:6] "Afghanistan" "Afghanistan" "Brazil" "Brazil" ...
 $ year      : int [1:6] 1999 2000 1999 2000 1999 2000
 $ cases     : int [1:6] 745 2666 37737 80488 212258 213766
 $ population: int [1:6] 19987071 20595360 172006362 174504898 1272915272 1280428583

You can also separate variables based on integers. Positive values start from the left and negative values start from the right.

> table3 %>% 
+   separate(year, into=c("century","year"), sep=2) %>% 
+   kable %>% 
+    kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country century year rate
Afghanistan 19 99 745/19987071
Afghanistan 20 00 2666/20595360
Brazil 19 99 37737/172006362
Brazil 20 00 80488/174504898
China 19 99 212258/1272915272
China 20 00 213766/1280428583

Unite


unite() is the inverse of separate().

> table5 %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country century year rate
Afghanistan 19 99 745/19987071
Afghanistan 20 00 2666/20595360
Brazil 19 99 37737/172006362
Brazil 20 00 80488/174504898
China 19 99 212258/1272915272
China 20 00 213766/1280428583

By default it will place an underscore _ between values.

> table5 %>% unite(new,century,year) %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country new rate
Afghanistan 19_99 745/19987071
Afghanistan 20_00 2666/20595360
Brazil 19_99 37737/172006362
Brazil 20_00 80488/174504898
China 19_99 212258/1272915272
China 20_00 213766/1280428583

However, you can specify "" for no separator.

> table5 %>% unite(new,century,year, sep="") %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
country new rate
Afghanistan 1999 745/19987071
Afghanistan 2000 2666/20595360
Brazil 1999 37737/172006362
Brazil 2000 80488/174504898
China 1999 212258/1272915272
China 2000 213766/1280428583

Separating and Uniting Exercises

  1. What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.
> tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"))
Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].
# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f    
3 h     i     j    
> tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
+   separate(x, c("one", "two", "three"))
Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     <NA> 
3 f     g     i    
  • extra - instructions for too many pieces
  • fill - instructions for too few pieces
> tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"), extra = "drop")
# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f    
3 h     i     j    
> tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"), extra = "merge")
# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f,g  
3 h     i     j    
> tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
+   separate(x, c("one", "two", "three"), fill = "right")
# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     <NA> 
3 f     g     i    
  1. Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?
  • remove discards the original columns. Setting it to FALSE keeps them.
> tibble(x = c("a,b,c", "d,e,f", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"),
+            remove = FALSE)
# A tibble: 3 x 4
  x     one   two   three
  <chr> <chr> <chr> <chr>
1 a,b,c a     b     c    
2 d,e,f d     e     f    
3 h,i,j h     i     j    
  1. Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?
  • extract() is more flexible because it can handle non-standard patterns.
> table3
# A tibble: 6 x 3
  country      year rate             
* <chr>       <int> <chr>            
1 Afghanistan  1999 745/19987071     
2 Afghanistan  2000 2666/20595360    
3 Brazil       1999 37737/172006362  
4 Brazil       2000 80488/174504898  
5 China        1999 212258/1272915272
6 China        2000 213766/1280428583
> table3 %>% separate(rate, into=c("cases","population"),
+                     convert = TRUE)
# A tibble: 6 x 4
  country      year  cases population
  <chr>       <int>  <int>      <int>
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583
> table3 %>% extract(rate,c("cases","population"),
+                    "(\\d*)[^\\d](\\d*)",
+                    convert=TRUE)
# A tibble: 6 x 4
  country      year  cases population
  <chr>       <int>  <int>      <int>
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583
  • unite() only has one option because you are just merging entire columns.
> table5 %>% unite(year,century,year,sep="")
# A tibble: 6 x 3
  country     year  rate             
  <chr>       <chr> <chr>            
1 Afghanistan 1999  745/19987071     
2 Afghanistan 2000  2666/20595360    
3 Brazil      1999  37737/172006362  
4 Brazil      2000  80488/174504898  
5 China       1999  212258/1272915272
6 China       2000  213766/1280428583

Missing Values


There are two types of missing values

  1. Explicitly - recorded as NA
  2. Implicitly - excluded from the dataset

In the following dataset Q4 2015 is recorded as NA and Q1 2016 is not present.

> stocks <- tibble(
+   year = c(2015,2015,2015,2015,2016,2016,2016),
+   qtr = c(1,2,3,4,2,3,4),
+   return = c(1.88,0.59,0.35,NA,0.92,0.17,2.66)
+ )
> stocks %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
year qtr return
2015 1 1.88
2015 2 0.59
2015 3 0.35
2015 4 NA
2016 2 0.92
2016 3 0.17
2016 4 2.66

We can first remove the implicit missing value by giving each year its own column.

> stocks %>% pivot_wider(names_from = year, values_from=return) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
qtr 2015 2016
1 1.88 NA
2 0.59 0.92
3 0.35 0.17
4 NA 2.66

If we want to eliminate the NA values we could add values_drop_na = TRUE.

> stocks %>% pivot_wider(names_from = year, values_from=return) %>% 
+   pivot_longer(-qtr,names_to="year",values_to="return",
+                values_drop_na = TRUE) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
qtr year return
1 2015 1.88
2 2015 0.59
2 2016 0.92
3 2015 0.35
3 2016 0.17
4 2016 2.66

The function complete() finds all unique combinations from a set of columns and fills in missing values with NA.

> stocks %>% 
+   complete(year,qtr) %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
year qtr return
2015 1 1.88
2015 2 0.59
2015 3 0.35
2015 4 NA
2016 1 NA
2016 2 0.92
2016 3 0.17
2016 4 2.66

Sometimes it makes sense for a value to be carried forward. We can do so with the fill() function.

> treatment <- tribble(
+ ~person, ~treatment, ~response,
+ "Derrick Whitmore", 1, 7,
+ NA, 2, 10,
+ NA, 3, 9,
+ "Katherine Burke", 1, 4
+ )
> 
> treatment %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
person treatment response
Derrick Whitmore 1 7
NA 2 10
NA 3 9
Katherine Burke 1 4
> treatment %>% 
+   fill(person) %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
person treatment response
Derrick Whitmore 1 7
Derrick Whitmore 2 10
Derrick Whitmore 3 9
Katherine Burke 1 4

The function coalesce() will replace missing values.

> nonalcoholic = tibble(
+   name=c("g&t","mimosa","rum and coke","calimocho","polar"), 
+   mixer=c("tonic","orange juice", "coke", "coke","seltzer"), 
+   price=c(3, 2, 2, 2, 5))
> 
> alcoholic = tibble(
+   name=c("g&t","mimosa","rum and coke","calimocho","IPA"), 
+   spirit=c("gin","sparkling wine", "rum", "red wine","beer"),
+   price=c(5, 4, 5, 3, 5))
> 
> full_join(nonalcoholic, alcoholic, by='name') %>% kable() %>% 
+    kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
name mixer price.x spirit price.y
g&t tonic 3 gin 5
mimosa orange juice 2 sparkling wine 4
rum and coke coke 2 rum 5
calimocho coke 2 red wine 3
polar seltzer 5 NA NA
IPA NA NA beer 5

We can first change the NA values in price.x and price.y to 0s and combine the prices for a total.

> drinks <- full_join(nonalcoholic, alcoholic, by='name')%>% 
+   mutate(price.x=coalesce(price.x,0), price.y=coalesce(price.y,0)
+          ,total=price.x+price.y)
> 
> drinks %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
name mixer price.x spirit price.y total
g&t tonic 3 gin 5 8
mimosa orange juice 2 sparkling wine 4 6
rum and coke coke 2 rum 5 7
calimocho coke 2 red wine 3 5
polar seltzer 5 NA 0 5
IPA NA 0 beer 5 5
> drinks %>% select(spirit, mixer, total) %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
spirit mixer total
gin tonic 8
sparkling wine orange juice 6
rum coke 7
red wine coke 5
NA seltzer 5
beer NA 5

We can then create a new column that merges the information in spirit and mixer

> drinks <- drinks %>% select(spirit, mixer, total)
> drinks %>% unite(name, spirit, mixer, sep=' & ') %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
name total
gin & tonic 8
sparkling wine & orange juice 6
rum & coke 7
red wine & coke 5
NA & seltzer 5
beer & NA 5

Unfortunately, it doesn’t look great because there are NAs. You can substitute the NAs by nothing using coalesce to make the output look a little nicer:

> new <-  drinks %>% 
+         mutate(spirit=coalesce(spirit, "nothing"),   mixer=coalesce(mixer, "nothing")) %>%
+         unite(name, spirit, mixer, sep=' & ') %>% 
+         mutate(name=trimws(name))
> new %>% kable() %>% 
+ kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
name total
gin & tonic 8
sparkling wine & orange juice 6
rum & coke 7
red wine & coke 5
nothing & seltzer 5
beer & nothing 5

The function trimws got rid of some empty spaces at the beginning and end of name.

You can make the new variable look even nicer with the function gsub by substituting & nothing and nothing & by (literally) nothing.

> final <-  new %>% mutate(name1 = gsub("& nothing","",name),
+                        name = trimws(gsub("nothing &","",name1))) %>%
+                        select(-name1)
> final %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")
name total
gin & tonic 8
sparkling wine & orange juice 6
rum & coke 7
red wine & coke 5
seltzer 5
beer 5

Missing Values - Exercises

  1. Compare and contrast the fill arguments to pivot_wider() and complete().
  • values_fill replaces implicitly missing values .
  • fill replaces all NA values, implicit and explicit.
> stocks <- tibble(
+   year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
+   qtr    = c(   1,    2,    3,    4,    2,    3,    4),
+   return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
+ )
> stocks %>% 
+   pivot_wider(names_from = year, values_from = return,
+               values_fill = 0)
# A tibble: 4 x 3
    qtr `2015` `2016`
  <dbl>  <dbl>  <dbl>
1     1   1.88   0   
2     2   0.59   0.92
3     3   0.35   0.17
4     4  NA      2.66
> stocks %>% 
+   complete(year, qtr, fill=list(return=0))
# A tibble: 8 x 3
   year   qtr return
  <dbl> <dbl>  <dbl>
1  2015     1   1.88
2  2015     2   0.59
3  2015     3   0.35
4  2015     4   0   
5  2016     1   0   
6  2016     2   0.92
7  2016     3   0.17
8  2016     4   2.66
  1. What does the direction argument to fill() do?.
  • Direction in which to fill missing values. Currently either “down” (the default), “up”, “downup” (i.e. first down and then up) or “updown” (first up and then down).
> (treatment <- tribble(
+    ~person, ~treatment, ~response,
+    "Derrick Whitmore", 1, 7,
+    NA, 2, 10,
+    NA, 3, 9,
+   "Katherine Burke", 1, 4
+    ))
# A tibble: 4 x 3
  person           treatment response
  <chr>                <dbl>    <dbl>
1 Derrick Whitmore         1        7
2 <NA>                     2       10
3 <NA>                     3        9
4 Katherine Burke          1        4
> treatment %>% fill(person, .direction = "down")
# A tibble: 4 x 3
  person           treatment response
  <chr>                <dbl>    <dbl>
1 Derrick Whitmore         1        7
2 Derrick Whitmore         2       10
3 Derrick Whitmore         3        9
4 Katherine Burke          1        4
> treatment %>% fill(person, .direction = "up")
# A tibble: 4 x 3
  person           treatment response
  <chr>                <dbl>    <dbl>
1 Derrick Whitmore         1        7
2 Katherine Burke          2       10
3 Katherine Burke          3        9
4 Katherine Burke          1        4

Case Study


We can apply some of the previous methods to the who dataset, which contains tuberculosis cases broken down by year, country, age, gender, and diagnosis method.

> who
# A tibble: 7,240 x 60
   country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
   <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>
 1 Afghan~ AF    AFG    1980          NA           NA           NA           NA
 2 Afghan~ AF    AFG    1981          NA           NA           NA           NA
 3 Afghan~ AF    AFG    1982          NA           NA           NA           NA
 4 Afghan~ AF    AFG    1983          NA           NA           NA           NA
 5 Afghan~ AF    AFG    1984          NA           NA           NA           NA
 6 Afghan~ AF    AFG    1985          NA           NA           NA           NA
 7 Afghan~ AF    AFG    1986          NA           NA           NA           NA
 8 Afghan~ AF    AFG    1987          NA           NA           NA           NA
 9 Afghan~ AF    AFG    1988          NA           NA           NA           NA
10 Afghan~ AF    AFG    1989          NA           NA           NA           NA
# ... with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,
#   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
#   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
#   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
#   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
#   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
#   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
#   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
#   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
#   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
#   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
#   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
#   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
#   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
#   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
#   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
#   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
#   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>
  • country, iso2, and iso3 all identify the country
  • year is a variable
  • The other columns, new_sp_m014 to newrel_f65, should be variables

We can begin tidying the data by using the pivot_longer() function.

> who1 <- who %>% 
+   pivot_longer(-(country:year),names_to="key",values_to="cases",
+                values_drop_na=TRUE)
> who1
# A tibble: 76,046 x 6
   country     iso2  iso3   year key          cases
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1997 new_sp_m014      0
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
 7 Afghanistan AF    AFG    1997 new_sp_m65       0
 8 Afghanistan AF    AFG    1997 new_sp_f014      5
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38
10 Afghanistan AF    AFG    1997 new_sp_f2534    36
# ... with 76,036 more rows

We can examine it further by evaluating the count.

> who1 %>% count(key)
# A tibble: 56 x 2
   key              n
   <chr>        <int>
 1 new_ep_f014   1032
 2 new_ep_f1524  1021
 3 new_ep_f2534  1021
 4 new_ep_f3544  1021
 5 new_ep_f4554  1017
 6 new_ep_f5564  1017
 7 new_ep_f65    1014
 8 new_ep_m014   1038
 9 new_ep_m1524  1026
10 new_ep_m2534  1020
# ... with 46 more rows

The new key column can be parsed further:

  • The first three letters denote whether it is a new or old case. In this dataset they are all new.
  • The next two letters describe the type of TB. The values are rel, ep, sn, sp
  • The sixth letter represent’s the patients sex, M,F.
  • The remaining letter represent an age group. ex 014 = 0 to 14.

Most follow the format new_ep etc., but instead of new_rel we have newrel. We can change that with str_replace().

> who2 <- who1 %>% 
+   mutate(key=stringr::str_replace(key,"newrel","new_rel"))
> who2
# A tibble: 76,046 x 6
   country     iso2  iso3   year key          cases
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1997 new_sp_m014      0
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
 7 Afghanistan AF    AFG    1997 new_sp_m65       0
 8 Afghanistan AF    AFG    1997 new_sp_f014      5
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38
10 Afghanistan AF    AFG    1997 new_sp_f2534    36
# ... with 76,036 more rows

The we can separate key into more useful columns.

> who3 <- who2 %>% 
+   separate(key, c("new","type","sexage"),sep="_")
> who3
# A tibble: 76,046 x 8
   country     iso2  iso3   year new   type  sexage cases
   <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int>
 1 Afghanistan AF    AFG    1997 new   sp    m014       0
 2 Afghanistan AF    AFG    1997 new   sp    m1524     10
 3 Afghanistan AF    AFG    1997 new   sp    m2534      6
 4 Afghanistan AF    AFG    1997 new   sp    m3544      3
 5 Afghanistan AF    AFG    1997 new   sp    m4554      5
 6 Afghanistan AF    AFG    1997 new   sp    m5564      2
 7 Afghanistan AF    AFG    1997 new   sp    m65        0
 8 Afghanistan AF    AFG    1997 new   sp    f014       5
 9 Afghanistan AF    AFG    1997 new   sp    f1524     38
10 Afghanistan AF    AFG    1997 new   sp    f2534     36
# ... with 76,036 more rows

Since iso2 and iso3 are redundant (we already have country) and since all cases are “new” we can drop those three columns.

> who4 <- who3 %>% 
+   select(-new,-iso2,-iso3)

Finally, we can split sexage into sex and age.

> who5 <- who4 %>% 
+   separate(sexage,c("sex","age"),sep=1)
> who5
# A tibble: 76,046 x 6
   country      year type  sex   age   cases
   <chr>       <int> <chr> <chr> <chr> <int>
 1 Afghanistan  1997 sp    m     014       0
 2 Afghanistan  1997 sp    m     1524     10
 3 Afghanistan  1997 sp    m     2534      6
 4 Afghanistan  1997 sp    m     3544      3
 5 Afghanistan  1997 sp    m     4554      5
 6 Afghanistan  1997 sp    m     5564      2
 7 Afghanistan  1997 sp    m     65        0
 8 Afghanistan  1997 sp    f     014       5
 9 Afghanistan  1997 sp    f     1524     38
10 Afghanistan  1997 sp    f     2534     36
# ... with 76,036 more rows

The resulting dataset is now tidy.

Case Study - Exercises

> (who1 <- who %>% 
+   pivot_longer(
+     cols = new_sp_m014:newrel_f65, 
+     names_to = "key", 
+     values_to = "cases", 
+     values_drop_na = TRUE
+   ))
# A tibble: 76,046 x 6
   country     iso2  iso3   year key          cases
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1997 new_sp_m014      0
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
 7 Afghanistan AF    AFG    1997 new_sp_m65       0
 8 Afghanistan AF    AFG    1997 new_sp_f014      5
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38
10 Afghanistan AF    AFG    1997 new_sp_f2534    36
# ... with 76,036 more rows
> (who2 <- who1 %>% 
+   mutate(names_from = stringr::str_replace(key, "newrel", "new_rel")))
# A tibble: 76,046 x 7
   country     iso2  iso3   year key          cases names_from  
   <chr>       <chr> <chr> <int> <chr>        <int> <chr>       
 1 Afghanistan AF    AFG    1997 new_sp_m014      0 new_sp_m014 
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10 new_sp_m1524
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6 new_sp_m2534
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3 new_sp_m3544
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5 new_sp_m4554
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2 new_sp_m5564
 7 Afghanistan AF    AFG    1997 new_sp_m65       0 new_sp_m65  
 8 Afghanistan AF    AFG    1997 new_sp_f014      5 new_sp_f014 
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38 new_sp_f1524
10 Afghanistan AF    AFG    1997 new_sp_f2534    36 new_sp_f2534
# ... with 76,036 more rows
> (who3 <- who2 %>% 
+   separate(key, c("new", "type", "sexage"), sep = "_"))
# A tibble: 76,046 x 9
   country     iso2  iso3   year new   type  sexage cases names_from  
   <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int> <chr>       
 1 Afghanistan AF    AFG    1997 new   sp    m014       0 new_sp_m014 
 2 Afghanistan AF    AFG    1997 new   sp    m1524     10 new_sp_m1524
 3 Afghanistan AF    AFG    1997 new   sp    m2534      6 new_sp_m2534
 4 Afghanistan AF    AFG    1997 new   sp    m3544      3 new_sp_m3544
 5 Afghanistan AF    AFG    1997 new   sp    m4554      5 new_sp_m4554
 6 Afghanistan AF    AFG    1997 new   sp    m5564      2 new_sp_m5564
 7 Afghanistan AF    AFG    1997 new   sp    m65        0 new_sp_m65  
 8 Afghanistan AF    AFG    1997 new   sp    f014       5 new_sp_f014 
 9 Afghanistan AF    AFG    1997 new   sp    f1524     38 new_sp_f1524
10 Afghanistan AF    AFG    1997 new   sp    f2534     36 new_sp_f2534
# ... with 76,036 more rows
> who3 %>%
+   count(new)
# A tibble: 2 x 2
  new        n
  <chr>  <int>
1 new    73466
2 newrel  2580
> (who4 <- who3 %>%
+   select(-new, -iso2, -iso3))
# A tibble: 76,046 x 6
   country      year type  sexage cases names_from  
   <chr>       <int> <chr> <chr>  <int> <chr>       
 1 Afghanistan  1997 sp    m014       0 new_sp_m014 
 2 Afghanistan  1997 sp    m1524     10 new_sp_m1524
 3 Afghanistan  1997 sp    m2534      6 new_sp_m2534
 4 Afghanistan  1997 sp    m3544      3 new_sp_m3544
 5 Afghanistan  1997 sp    m4554      5 new_sp_m4554
 6 Afghanistan  1997 sp    m5564      2 new_sp_m5564
 7 Afghanistan  1997 sp    m65        0 new_sp_m65  
 8 Afghanistan  1997 sp    f014       5 new_sp_f014 
 9 Afghanistan  1997 sp    f1524     38 new_sp_f1524
10 Afghanistan  1997 sp    f2534     36 new_sp_f2534
# ... with 76,036 more rows
> (who5 <- who4 %>%
+   separate(sexage, c("sex", "age"), sep = 1))
# A tibble: 76,046 x 7
   country      year type  sex   age   cases names_from  
   <chr>       <int> <chr> <chr> <chr> <int> <chr>       
 1 Afghanistan  1997 sp    m     014       0 new_sp_m014 
 2 Afghanistan  1997 sp    m     1524     10 new_sp_m1524
 3 Afghanistan  1997 sp    m     2534      6 new_sp_m2534
 4 Afghanistan  1997 sp    m     3544      3 new_sp_m3544
 5 Afghanistan  1997 sp    m     4554      5 new_sp_m4554
 6 Afghanistan  1997 sp    m     5564      2 new_sp_m5564
 7 Afghanistan  1997 sp    m     65        0 new_sp_m65  
 8 Afghanistan  1997 sp    f     014       5 new_sp_f014 
 9 Afghanistan  1997 sp    f     1524     38 new_sp_f1524
10 Afghanistan  1997 sp    f     2534     36 new_sp_f2534
# ... with 76,036 more rows
  1. In this case study, I set values_drop_na = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What’s the difference between an NA and zero?
  • An NA is an explicitly missing value and a 0 is a recorded value of no TB cases.
  • There are some implicit missing values, where a county,year combination is not present.
> #Cases recorded as 0
> who1 %>%
+   filter(cases == 0) %>%
+   nrow()
[1] 11080
> #Implicit missing values
> nrow(who)
[1] 7240
> who %>% 
+   complete(country, year) %>%
+   nrow()
[1] 7446
  1. What happens if you neglect the mutate() step? (mutate(key = str_replace(key, "newrel", "new_rel"))
  • It’s expecting 3 columns but will only have 2, so the data is wrong and some values are also missing.
> who3a <- who1 %>%
+   separate(key, c("new", "type", "sexage"), sep = "_")
Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
904, 905, 906, ...].
> filter(who3a, new == "newrel") %>% head()
# A tibble: 6 x 8
  country     iso2  iso3   year new    type  sexage cases
  <chr>       <chr> <chr> <int> <chr>  <chr> <chr>  <int>
1 Afghanistan AF    AFG    2013 newrel m014  <NA>    1705
2 Afghanistan AF    AFG    2013 newrel f014  <NA>    1749
3 Albania     AL    ALB    2013 newrel m014  <NA>      14
4 Albania     AL    ALB    2013 newrel m1524 <NA>      60
5 Albania     AL    ALB    2013 newrel m2534 <NA>      61
6 Albania     AL    ALB    2013 newrel m3544 <NA>      32
  1. I claimed that iso2 and iso3 were redundant with country. Confirm this claim.
  • This is a correct assumption
> who3 %>% select(country, iso2, iso3) %>%
+   distinct() %>%
+   group_by(country) %>%
+   filter(n() > 1)
# A tibble: 0 x 3
# Groups:   country [0]
# ... with 3 variables: country <chr>, iso2 <chr>, iso3 <chr>
  1. For each country, year, and sex compute the total number of cases of TB. Make an informative visualization of the data.
> who5 %>%
+   group_by(country, year, sex) %>% 
+   summarize(cases = sum(cases)) %>% 
+   na.omit() %>% 
+   unite(country_sex, country, sex, remove = FALSE) %>% 
+   filter(year > 1995) %>% 
+   ggplot(aes(x = year, y = cases, group = country_sex, color = sex)) +
+   geom_line()