The tidyr package (part of the tidyverse) helps to organize or restructure messy or inconvenient datasets.

The Datasets

The following datasets will be used to demonstrate some of tidyr’s functionality. Each dataset shows the same four variables in a different way: country, year, population, and cases

> library(tidyverse) #load tidyr and datasets
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
> 
> # table1
> kable(table1) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	cases	population
Afghanistan	1999	745	19987071
Afghanistan	2000	2666	20595360
Brazil	1999	37737	172006362
Brazil	2000	80488	174504898
China	1999	212258	1272915272
China	2000	213766	1280428583

> # table2
> kable(table2) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	type	count
Afghanistan	1999	cases	745
Afghanistan	1999	population	19987071
Afghanistan	2000	cases	2666
Afghanistan	2000	population	20595360
Brazil	1999	cases	37737
Brazil	1999	population	172006362
Brazil	2000	cases	80488
Brazil	2000	population	174504898
China	1999	cases	212258
China	1999	population	1272915272
China	2000	cases	213766
China	2000	population	1280428583

> # table3
> kable(table3) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	rate
Afghanistan	1999	745/19987071
Afghanistan	2000	2666/20595360
Brazil	1999	37737/172006362
Brazil	2000	80488/174504898
China	1999	212258/1272915272
China	2000	213766/1280428583

> # table4a
> kable(table4a) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	1999	2000
Afghanistan	745	2666
Brazil	37737	80488
China	212258	213766

> # table4b
> kable(table4b) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	1999	2000
Afghanistan	19987071	20595360
Brazil	172006362	174504898
China	1272915272	1280428583

There are three rules (interrelated) that make a datset tidy:

Each variable must have its own column.
Each observation must have its own row.
Each value must have its own cell.

In this case only table1 is tidy.

The Datasets - Exercises

Compute the rate for table2 and table4a + table4b. You will need to perform four operations:

Extract the number of TB cases per country per year.
Extract the matching population per country per year.
Divide cases by population and multiply by 10,000.
Store back in the appropriate place.

Which representation is easiest to work with?

Table 4a and 4b were easier to work with.

Table2

> library(tidyverse)
> 
> (TB_yr <- table2 %>% filter(type=="cases"))

# A tibble: 6 x 4
  country      year type   count
  <chr>       <int> <chr>  <int>
1 Afghanistan  1999 cases    745
2 Afghanistan  2000 cases   2666
3 Brazil       1999 cases  37737
4 Brazil       2000 cases  80488
5 China        1999 cases 212258
6 China        2000 cases 213766

> (Pop_yr <- table2 %>% filter(type=="population"))

# A tibble: 6 x 4
  country      year type            count
  <chr>       <int> <chr>           <int>
1 Afghanistan  1999 population   19987071
2 Afghanistan  2000 population   20595360
3 Brazil       1999 population  172006362
4 Brazil       2000 population  174504898
5 China        1999 population 1272915272
6 China        2000 population 1280428583

> (Rate_yr <- TB_yr %>% inner_join(Pop_yr, by=c("country","year")) %>% 
+   mutate(rate=count.x/count.y*10000, type.z="rate") %>% 
+     select(country,year,type=type.z,count=rate))

# A tibble: 6 x 4
  country      year type  count
  <chr>       <int> <chr> <dbl>
1 Afghanistan  1999 rate  0.373
2 Afghanistan  2000 rate  1.29 
3 Brazil       1999 rate  2.19 
4 Brazil       2000 rate  4.61 
5 China        1999 rate  1.67 
6 China        2000 rate  1.67

> bind_rows(TB_yr,Pop_yr,Rate_yr) %>% 
+   arrange(country,year,type)

# A tibble: 18 x 4
   country      year type         count
   <chr>       <int> <chr>        <dbl>
 1 Afghanistan  1999 cases      7.45e+2
 2 Afghanistan  1999 population 2.00e+7
 3 Afghanistan  1999 rate       3.73e-1
 4 Afghanistan  2000 cases      2.67e+3
 5 Afghanistan  2000 population 2.06e+7
 6 Afghanistan  2000 rate       1.29e+0
 7 Brazil       1999 cases      3.77e+4
 8 Brazil       1999 population 1.72e+8
 9 Brazil       1999 rate       2.19e+0
10 Brazil       2000 cases      8.05e+4
11 Brazil       2000 population 1.75e+8
12 Brazil       2000 rate       4.61e+0
13 China        1999 cases      2.12e+5
14 China        1999 population 1.27e+9
15 China        1999 rate       1.67e+0
16 China        2000 cases      2.14e+5
17 China        2000 population 1.28e+9
18 China        2000 rate       1.67e+0

Table4a + Table4b

> table4a %>% inner_join(table4b,by="country",
+                        suffix=c("_cases","_pop")) %>% 
+   mutate(`1999_rate`=`1999_cases`/`1999_pop`*10000,
+          `2000_rate`=`2000_cases`/`2000_pop`*10000) %>% 
+   select(country,`1999`=`1999_rate`,`2000`=`2000_rate`)

# A tibble: 3 x 3
  country     `1999` `2000`
  <chr>        <dbl>  <dbl>
1 Afghanistan  0.373   1.29
2 Brazil       2.19    4.61
3 China        1.67    1.67

Re-create the plot showing change in cases over time using table2 instead of table1.

> table2 %>%
+   filter(type == "cases") %>%
+   ggplot(aes(year, count)) +
+   geom_line(aes(group = country), color = "grey50") +
+   geom_point(aes(color = country)) +
+   scale_x_continuous(breaks = unique(table2$year)) +
+   ylab("cases")

Pivoting Longer (Gathering)

Sometimes columns would be better represented as values of a variable. In the case of table4a, 1999 and 2000 should be gathered and placed into the variable year.

This can be accomplished with the function gather()

key = The gathered column’s name
value = The column name for the values spread over those cells.

> #using gather
> table4a %>% 
+   gather(`1999`,`2000`,key="year",value="cases") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	cases
Afghanistan	1999	745
Brazil	1999	37737
China	1999	212258
Afghanistan	2000	2666
Brazil	2000	80488
China	2000	213766

However, the gather() function has be sunsetted and replaced with pivot_longer().

pivot_longer() “lengthens” data, increasing the number of rows and decreasing the number of columns.

Below you can see the same output with the newer function. Note that instead of selecting 1999 and 2000 I just de-selected country instead.

> #using pivot longer
> table4a %>% 
+   pivot_longer(-country,names_to="year",values_to="cases") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	cases
Afghanistan	1999	745
Afghanistan	2000	2666
Brazil	1999	37737
Brazil	2000	80488
China	1999	212258
China	2000	213766

We can do the same with table4b.

> #using pivot longer
> table4b %>% 
+   pivot_longer(-country,names_to="year",values_to="population") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	population
Afghanistan	1999	19987071
Afghanistan	2000	20595360
Brazil	1999	172006362
Brazil	2000	174504898
China	1999	1272915272
China	2000	1280428583

We can then join table4a and table4b with a left join from dplyr. The output matches table1.

> tidy4a <- table4a %>% 
+   pivot_longer(-country,names_to="year",values_to="cases")
> 
> tidy4b <- table4b %>% 
+   pivot_longer(-country,names_to="year",values_to="population")
> 
> left_join(tidy4a,tidy4b) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

Joining, by = c("country", "year")

country	year	cases	population
Afghanistan	1999	745	19987071
Afghanistan	2000	2666	20595360
Brazil	1999	37737	172006362
Brazil	2000	80488	174504898
China	1999	212258	1272915272
China	2000	213766	1280428583

We can now plot case growth over time.

> CombinTbl <- left_join(tidy4a,tidy4b)
> ggplot(CombinTbl,aes(year,cases))+
+   geom_line(aes(group=country),color="grey50")+
+   geom_point(aes(color=country))+theme_light()

Pivoting Wider (Spreading)

The function spread() is the opposite of gather() and the function pivot_wider is the opposite of pivot_longer.

Like gather(), spread() has been sunsetted. Going forward it is best to use pivot_wider.

pivot_wider() “widens” data, increasing the number of columns and decreasing the number of rows.

> #table2
> table2 %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	type	count
Afghanistan	1999	cases	745
Afghanistan	1999	population	19987071
Afghanistan	2000	cases	2666
Afghanistan	2000	population	20595360
Brazil	1999	cases	37737
Brazil	1999	population	172006362
Brazil	2000	cases	80488
Brazil	2000	population	174504898
China	1999	cases	212258
China	1999	population	1272915272
China	2000	cases	213766
China	2000	population	1280428583

> #table 2 spread
> table2 %>% spread(key=type, value=count) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	cases	population
Afghanistan	1999	745	19987071
Afghanistan	2000	2666	20595360
Brazil	1999	37737	172006362
Brazil	2000	80488	174504898
China	1999	212258	1272915272
China	2000	213766	1280428583

> #table2 pivot wider
> table2 %>% pivot_wider(names_from=type, values_from=count ) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	cases	population
Afghanistan	1999	745	19987071
Afghanistan	2000	2666	20595360
Brazil	1999	37737	172006362
Brazil	2000	80488	174504898
China	1999	212258	1272915272
China	2000	213766	1280428583

Pivoting - Exercises

Why are pivot_longer() and pivot_wider() not perfectly symmetrical? Carefully consider the following example:

> stocks <- tibble(
+   year   = c(2015, 2015, 2016, 2016),
+   half  = c(   1,    2,     1,    2),
+   return = c(1.88, 0.59, 0.92, 0.17)
+ )
> stocks %>% 
+   pivot_wider(names_from = year, values_from = return) %>% 
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")

In the original tibble year was <dbl>. When we went from pivot_wider back to pivot_longer it became <chr>.
pivot_wider converted the years into column names (characters).

> (stocks <- tibble(
+   year   = c(2015, 2015, 2016, 2016),
+   half  = c(   1,    2,     1,    2),
+   return = c(1.88, 0.59, 0.92, 0.17)
+ ))

# A tibble: 4 x 3
   year  half return
  <dbl> <dbl>  <dbl>
1  2015     1   1.88
2  2015     2   0.59
3  2016     1   0.92
4  2016     2   0.17

> stocks %>% 
+   pivot_wider(names_from = year, values_from = return) %>% 
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")

# A tibble: 4 x 3
   half year  return
  <dbl> <chr>  <dbl>
1     1 2015    1.88
2     1 2016    0.92
3     2 2015    0.59
4     2 2016    0.17

pivot_longer() has a names_ptypes argument, e.g. names_ptypes = list(year = double()). What does it do?

names_ptypes and names_transform allow you to confirm or change the type of specific columns.
names_ptypes - A list of column name-prototype pairs. A prototype (or ptype for short) is a zero-length vector (like integer() or numeric()) that defines the type, class, and attributes of a vector. Use these arguments to confirm that the created columns are the types that you expect
names_transform - A list of column name-function pairs. Use these arguments if you need to change the type of specific columns.

> stocks %>%
+   pivot_wider(names_from = year, values_from = return)%>%
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return",
+                names_ptypes = list(year = double()))

Error: Can't convert <character> to <double>.

> stocks %>%
+   pivot_wider(names_from = year, values_from = return)%>%
+   pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return",
+                names_transform = list(year = as.numeric))

# A tibble: 4 x 3
   half  year return
  <dbl> <dbl>  <dbl>
1     1  2015   1.88
2     1  2016   0.92
3     2  2015   0.59
4     2  2016   0.17

Why does this code fail?

> table4a %>% 
+   pivot_longer(c(1999, 2000), names_to = "year", values_to = "cases")

Error: Can't subset columns that don't exist.
[31mx[39m Locations 1999 and 2000 don't exist.
[34mi[39m There are only 3 columns.

The code interprets 1999 and 2000 as column numbers. They need to be surrounded by backticks.

> table4a %>% 
+   pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")

# A tibble: 6 x 3
  country     year   cases
  <chr>       <chr>  <int>
1 Afghanistan 1999     745
2 Afghanistan 2000    2666
3 Brazil      1999   37737
4 Brazil      2000   80488
5 China       1999  212258
6 China       2000  213766

What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?

> people <- tribble(
+   ~name, ~key, ~value,
+   #-----------------|--------|------
+   "Phillip Woods",  "age", 45,
+   "Phillip Woods", "height", 186,
+   "Phillip Woods", "age", 50,
+   "Jessica Cordero", "age", 37,
+   "Jessica Cordero", "height", 156
+ )

The name and key don’t uniquely identify each value. You can group them and add a row number.

> (people2 <- people %>%
+   group_by(name, key) %>%
+   mutate(obs = row_number()))

# A tibble: 5 x 4
# Groups:   name, key [4]
  name            key    value   obs
  <chr>           <chr>  <dbl> <int>
1 Phillip Woods   age       45     1
2 Phillip Woods   height   186     1
3 Phillip Woods   age       50     2
4 Jessica Cordero age       37     1
5 Jessica Cordero height   156     1

> pivot_wider(people2, names_from="name", values_from = "value")

# A tibble: 3 x 4
# Groups:   key [2]
  key      obs `Phillip Woods` `Jessica Cordero`
  <chr>  <int>           <dbl>             <dbl>
1 age        1              45                37
2 height     1             186               156
3 age        2              50                NA

Tidy the simple tibble below. Do you need to make it wider or longer?

> preg <- tribble(
+   ~pregnant, ~male, ~female,
+   "yes", NA, 10,
+   "no", 20, 12
+ )

pivot_longer works well with this data.

> preg %>% pivot_longer(c(male, female), names_to = "sex", 
+              values_to = "count", values_drop_na = TRUE)

# A tibble: 3 x 3
  pregnant sex    count
  <chr>    <chr>  <dbl>
1 yes      female    10
2 no       male      20
3 no       female    12

Separate

separate() pulls apart one column into multiple columns based on a separator character.

> table3 %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	rate
Afghanistan	1999	745/19987071
Afghanistan	2000	2666/20595360
Brazil	1999	37737/172006362
Brazil	2000	80488/174504898
China	1999	212258/1272915272
China	2000	213766/1280428583

> table3 %>% separate(rate, into=c("cases","population")) %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	year	cases	population
Afghanistan	1999	745	19987071
Afghanistan	2000	2666	20595360
Brazil	1999	37737	172006362
Brazil	2000	80488	174504898
China	1999	212258	1272915272
China	2000	213766	1280428583

By default it selects a non alphanumeric character on which to split the variable. However, you can select it:

> table3 %>% separate(rate, into=c("cases","population"), sep="/")

Also, by default separate() leaves the type of column as is. In this instance both cases and population remain as character. However, by adding convert=TRUE the function will change them to integers, which is more appropriate.

> SepTbl3a <- table3 %>% separate(rate, into=c("cases","population"), sep="/")
> 
> SepTbl3b <- table3 %>% separate(
+   rate, into=c("cases","population"),
+   convert=TRUE
+ )
> 
> str(SepTbl3a)

tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
 $ country   : chr [1:6] "Afghanistan" "Afghanistan" "Brazil" "Brazil" ...
 $ year      : int [1:6] 1999 2000 1999 2000 1999 2000
 $ cases     : chr [1:6] "745" "2666" "37737" "80488" ...
 $ population: chr [1:6] "19987071" "20595360" "172006362" "174504898" ...

> str(SepTbl3b)

tibble [6 x 4] (S3: tbl_df/tbl/data.frame)
 $ country   : chr [1:6] "Afghanistan" "Afghanistan" "Brazil" "Brazil" ...
 $ year      : int [1:6] 1999 2000 1999 2000 1999 2000
 $ cases     : int [1:6] 745 2666 37737 80488 212258 213766
 $ population: int [1:6] 19987071 20595360 172006362 174504898 1272915272 1280428583

You can also separate variables based on integers. Positive values start from the left and negative values start from the right.

> table3 %>% 
+   separate(year, into=c("century","year"), sep=2) %>% 
+   kable %>% 
+    kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	century	year	rate
Afghanistan	19	99	745/19987071
Afghanistan	20	00	2666/20595360
Brazil	19	99	37737/172006362
Brazil	20	00	80488/174504898
China	19	99	212258/1272915272
China	20	00	213766/1280428583

Unite

unite() is the inverse of separate().

> table5 %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	century	year	rate
Afghanistan	19	99	745/19987071
Afghanistan	20	00	2666/20595360
Brazil	19	99	37737/172006362
Brazil	20	00	80488/174504898
China	19	99	212258/1272915272
China	20	00	213766/1280428583

By default it will place an underscore _ between values.

> table5 %>% unite(new,century,year) %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	new	rate
Afghanistan	19_99	745/19987071
Afghanistan	20_00	2666/20595360
Brazil	19_99	37737/172006362
Brazil	20_00	80488/174504898
China	19_99	212258/1272915272
China	20_00	213766/1280428583

However, you can specify "" for no separator.

> table5 %>% unite(new,century,year, sep="") %>% 
+   kable %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

country	new	rate
Afghanistan	1999	745/19987071
Afghanistan	2000	2666/20595360
Brazil	1999	37737/172006362
Brazil	2000	80488/174504898
China	1999	212258/1272915272
China	2000	213766/1280428583

Separating and Uniting Exercises

What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.

> tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"))

Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].

# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f    
3 h     i     j

> tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
+   separate(x, c("one", "two", "three"))

Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].

# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     <NA> 
3 f     g     i

extra - instructions for too many pieces
fill - instructions for too few pieces

> tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"), extra = "drop")

# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f    
3 h     i     j

> tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"), extra = "merge")

# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     f,g  
3 h     i     j

> tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
+   separate(x, c("one", "two", "three"), fill = "right")

# A tibble: 3 x 3
  one   two   three
  <chr> <chr> <chr>
1 a     b     c    
2 d     e     <NA> 
3 f     g     i

Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE?

remove discards the original columns. Setting it to FALSE keeps them.

> tibble(x = c("a,b,c", "d,e,f", "h,i,j")) %>%
+   separate(x, c("one", "two", "three"),
+            remove = FALSE)

# A tibble: 3 x 4
  x     one   two   three
  <chr> <chr> <chr> <chr>
1 a,b,c a     b     c    
2 d,e,f d     e     f    
3 h,i,j h     i     j

Compare and contrast separate() and extract(). Why are there three variations of separation (by position, by separator, and with groups), but only one unite?

extract() is more flexible because it can handle non-standard patterns.

> table3

# A tibble: 6 x 3
  country      year rate             
* <chr>       <int> <chr>            
1 Afghanistan  1999 745/19987071     
2 Afghanistan  2000 2666/20595360    
3 Brazil       1999 37737/172006362  
4 Brazil       2000 80488/174504898  
5 China        1999 212258/1272915272
6 China        2000 213766/1280428583

> table3 %>% separate(rate, into=c("cases","population"),
+                     convert = TRUE)

# A tibble: 6 x 4
  country      year  cases population
  <chr>       <int>  <int>      <int>
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583

> table3 %>% extract(rate,c("cases","population"),
+                    "(\\d*)[^\\d](\\d*)",
+                    convert=TRUE)

# A tibble: 6 x 4
  country      year  cases population
  <chr>       <int>  <int>      <int>
1 Afghanistan  1999    745   19987071
2 Afghanistan  2000   2666   20595360
3 Brazil       1999  37737  172006362
4 Brazil       2000  80488  174504898
5 China        1999 212258 1272915272
6 China        2000 213766 1280428583

unite() only has one option because you are just merging entire columns.

> table5 %>% unite(year,century,year,sep="")

# A tibble: 6 x 3
  country     year  rate             
  <chr>       <chr> <chr>            
1 Afghanistan 1999  745/19987071     
2 Afghanistan 2000  2666/20595360    
3 Brazil      1999  37737/172006362  
4 Brazil      2000  80488/174504898  
5 China       1999  212258/1272915272
6 China       2000  213766/1280428583

Missing Values

There are two types of missing values

Explicitly - recorded as NA
Implicitly - excluded from the dataset

In the following dataset Q4 2015 is recorded as NA and Q1 2016 is not present.

> stocks <- tibble(
+   year = c(2015,2015,2015,2015,2016,2016,2016),
+   qtr = c(1,2,3,4,2,3,4),
+   return = c(1.88,0.59,0.35,NA,0.92,0.17,2.66)
+ )
> stocks %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

year	qtr	return
2015	1	1.88
2015	2	0.59
2015	3	0.35
2015	4	NA
2016	2	0.92
2016	3	0.17
2016	4	2.66

We can first remove the implicit missing value by giving each year its own column.

> stocks %>% pivot_wider(names_from = year, values_from=return) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

qtr	2015	2016
1	1.88	NA
2	0.59	0.92
3	0.35	0.17
4	NA	2.66

If we want to eliminate the NA values we could add values_drop_na = TRUE.

> stocks %>% pivot_wider(names_from = year, values_from=return) %>% 
+   pivot_longer(-qtr,names_to="year",values_to="return",
+                values_drop_na = TRUE) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

qtr	year	return
1	2015	1.88
2	2015	0.59
2	2016	0.92
3	2015	0.35
3	2016	0.17
4	2016	2.66

The function complete() finds all unique combinations from a set of columns and fills in missing values with NA.

> stocks %>% 
+   complete(year,qtr) %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

year	qtr	return
2015	1	1.88
2015	2	0.59
2015	3	0.35
2015	4	NA
2016	1	NA
2016	2	0.92
2016	3	0.17
2016	4	2.66

Sometimes it makes sense for a value to be carried forward. We can do so with the fill() function.

> treatment <- tribble(
+ ~person, ~treatment, ~response,
+ "Derrick Whitmore", 1, 7,
+ NA, 2, 10,
+ NA, 3, 9,
+ "Katherine Burke", 1, 4
+ )
> 
> treatment %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

person	treatment	response
Derrick Whitmore	1	7
NA	2	10
NA	3	9
Katherine Burke	1	4

> treatment %>% 
+   fill(person) %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

person	treatment	response
Derrick Whitmore	1	7
Derrick Whitmore	2	10
Derrick Whitmore	3	9
Katherine Burke	1	4

The function coalesce() will replace missing values.

> nonalcoholic = tibble(
+   name=c("g&t","mimosa","rum and coke","calimocho","polar"), 
+   mixer=c("tonic","orange juice", "coke", "coke","seltzer"), 
+   price=c(3, 2, 2, 2, 5))
> 
> alcoholic = tibble(
+   name=c("g&t","mimosa","rum and coke","calimocho","IPA"), 
+   spirit=c("gin","sparkling wine", "rum", "red wine","beer"),
+   price=c(5, 4, 5, 3, 5))
> 
> full_join(nonalcoholic, alcoholic, by='name') %>% kable() %>% 
+    kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

name	mixer	price.x	spirit	price.y
g&t	tonic	3	gin	5
mimosa	orange juice	2	sparkling wine	4
rum and coke	coke	2	rum	5
calimocho	coke	2	red wine	3
polar	seltzer	5	NA	NA
IPA	NA	NA	beer	5

We can first change the NA values in price.x and price.y to 0s and combine the prices for a total.

> drinks <- full_join(nonalcoholic, alcoholic, by='name')%>% 
+   mutate(price.x=coalesce(price.x,0), price.y=coalesce(price.y,0)
+          ,total=price.x+price.y)
> 
> drinks %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

name	mixer	price.x	spirit	price.y	total
g&t	tonic	3	gin	5	8
mimosa	orange juice	2	sparkling wine	4	6
rum and coke	coke	2	rum	5	7
calimocho	coke	2	red wine	3	5
polar	seltzer	5	NA	0	5
IPA	NA	0	beer	5	5

> drinks %>% select(spirit, mixer, total) %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

spirit	mixer	total
gin	tonic	8
sparkling wine	orange juice	6
rum	coke	7
red wine	coke	5
NA	seltzer	5
beer	NA	5

We can then create a new column that merges the information in spirit and mixer

> drinks <- drinks %>% select(spirit, mixer, total)
> drinks %>% unite(name, spirit, mixer, sep=' & ') %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

name	total
gin & tonic	8
sparkling wine & orange juice	6
rum & coke	7
red wine & coke	5
NA & seltzer	5
beer & NA	5

Unfortunately, it doesn’t look great because there are NAs. You can substitute the NAs by nothing using coalesce to make the output look a little nicer:

> new <-  drinks %>% 
+         mutate(spirit=coalesce(spirit, "nothing"),   mixer=coalesce(mixer, "nothing")) %>%
+         unite(name, spirit, mixer, sep=' & ') %>% 
+         mutate(name=trimws(name))
> new %>% kable() %>% 
+ kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

name	total
gin & tonic	8
sparkling wine & orange juice	6
rum & coke	7
red wine & coke	5
nothing & seltzer	5
beer & nothing	5

The function trimws got rid of some empty spaces at the beginning and end of name.

You can make the new variable look even nicer with the function gsub by substituting & nothing and nothing & by (literally) nothing.

> final <-  new %>% mutate(name1 = gsub("& nothing","",name),
+                        name = trimws(gsub("nothing &","",name1))) %>%
+                        select(-name1)
> final %>% kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 12,position="left")%>%
+                 row_spec(0,background="lightpink")

name	total
gin & tonic	8
sparkling wine & orange juice	6
rum & coke	7
red wine & coke	5
seltzer	5
beer	5

Missing Values - Exercises

Compare and contrast the fill arguments to pivot_wider() and complete().

values_fill replaces implicitly missing values .
fill replaces all NA values, implicit and explicit.

> stocks <- tibble(
+   year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
+   qtr    = c(   1,    2,    3,    4,    2,    3,    4),
+   return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
+ )
> stocks %>% 
+   pivot_wider(names_from = year, values_from = return,
+               values_fill = 0)

# A tibble: 4 x 3
    qtr `2015` `2016`
  <dbl>  <dbl>  <dbl>
1     1   1.88   0   
2     2   0.59   0.92
3     3   0.35   0.17
4     4  NA      2.66

> stocks %>% 
+   complete(year, qtr, fill=list(return=0))

# A tibble: 8 x 3
   year   qtr return
  <dbl> <dbl>  <dbl>
1  2015     1   1.88
2  2015     2   0.59
3  2015     3   0.35
4  2015     4   0   
5  2016     1   0   
6  2016     2   0.92
7  2016     3   0.17
8  2016     4   2.66

What does the direction argument to fill() do?.

Direction in which to fill missing values. Currently either “down” (the default), “up”, “downup” (i.e. first down and then up) or “updown” (first up and then down).

> (treatment <- tribble(
+    ~person, ~treatment, ~response,
+    "Derrick Whitmore", 1, 7,
+    NA, 2, 10,
+    NA, 3, 9,
+   "Katherine Burke", 1, 4
+    ))

# A tibble: 4 x 3
  person           treatment response
  <chr>                <dbl>    <dbl>
1 Derrick Whitmore         1        7
2 <NA>                     2       10
3 <NA>                     3        9
4 Katherine Burke          1        4

> treatment %>% fill(person, .direction = "down")

# A tibble: 4 x 3
  person           treatment response
  <chr>                <dbl>    <dbl>
1 Derrick Whitmore         1        7
2 Derrick Whitmore         2       10
3 Derrick Whitmore         3        9
4 Katherine Burke          1        4

> treatment %>% fill(person, .direction = "up")

# A tibble: 4 x 3
  person           treatment response
  <chr>                <dbl>    <dbl>
1 Derrick Whitmore         1        7
2 Katherine Burke          2       10
3 Katherine Burke          3        9
4 Katherine Burke          1        4

Case Study

We can apply some of the previous methods to the who dataset, which contains tuberculosis cases broken down by year, country, age, gender, and diagnosis method.

> who

# A tibble: 7,240 x 60
   country iso2  iso3   year new_sp_m014 new_sp_m1524 new_sp_m2534 new_sp_m3544
   <chr>   <chr> <chr> <int>       <int>        <int>        <int>        <int>
 1 Afghan~ AF    AFG    1980          NA           NA           NA           NA
 2 Afghan~ AF    AFG    1981          NA           NA           NA           NA
 3 Afghan~ AF    AFG    1982          NA           NA           NA           NA
 4 Afghan~ AF    AFG    1983          NA           NA           NA           NA
 5 Afghan~ AF    AFG    1984          NA           NA           NA           NA
 6 Afghan~ AF    AFG    1985          NA           NA           NA           NA
 7 Afghan~ AF    AFG    1986          NA           NA           NA           NA
 8 Afghan~ AF    AFG    1987          NA           NA           NA           NA
 9 Afghan~ AF    AFG    1988          NA           NA           NA           NA
10 Afghan~ AF    AFG    1989          NA           NA           NA           NA
# ... with 7,230 more rows, and 52 more variables: new_sp_m4554 <int>,
#   new_sp_m5564 <int>, new_sp_m65 <int>, new_sp_f014 <int>,
#   new_sp_f1524 <int>, new_sp_f2534 <int>, new_sp_f3544 <int>,
#   new_sp_f4554 <int>, new_sp_f5564 <int>, new_sp_f65 <int>,
#   new_sn_m014 <int>, new_sn_m1524 <int>, new_sn_m2534 <int>,
#   new_sn_m3544 <int>, new_sn_m4554 <int>, new_sn_m5564 <int>,
#   new_sn_m65 <int>, new_sn_f014 <int>, new_sn_f1524 <int>,
#   new_sn_f2534 <int>, new_sn_f3544 <int>, new_sn_f4554 <int>,
#   new_sn_f5564 <int>, new_sn_f65 <int>, new_ep_m014 <int>,
#   new_ep_m1524 <int>, new_ep_m2534 <int>, new_ep_m3544 <int>,
#   new_ep_m4554 <int>, new_ep_m5564 <int>, new_ep_m65 <int>,
#   new_ep_f014 <int>, new_ep_f1524 <int>, new_ep_f2534 <int>,
#   new_ep_f3544 <int>, new_ep_f4554 <int>, new_ep_f5564 <int>,
#   new_ep_f65 <int>, newrel_m014 <int>, newrel_m1524 <int>,
#   newrel_m2534 <int>, newrel_m3544 <int>, newrel_m4554 <int>,
#   newrel_m5564 <int>, newrel_m65 <int>, newrel_f014 <int>,
#   newrel_f1524 <int>, newrel_f2534 <int>, newrel_f3544 <int>,
#   newrel_f4554 <int>, newrel_f5564 <int>, newrel_f65 <int>

country, iso2, and iso3 all identify the country
year is a variable
The other columns, new_sp_m014 to newrel_f65, should be variables

We can begin tidying the data by using the pivot_longer() function.

> who1 <- who %>% 
+   pivot_longer(-(country:year),names_to="key",values_to="cases",
+                values_drop_na=TRUE)
> who1

# A tibble: 76,046 x 6
   country     iso2  iso3   year key          cases
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1997 new_sp_m014      0
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
 7 Afghanistan AF    AFG    1997 new_sp_m65       0
 8 Afghanistan AF    AFG    1997 new_sp_f014      5
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38
10 Afghanistan AF    AFG    1997 new_sp_f2534    36
# ... with 76,036 more rows

We can examine it further by evaluating the count.

> who1 %>% count(key)

# A tibble: 56 x 2
   key              n
   <chr>        <int>
 1 new_ep_f014   1032
 2 new_ep_f1524  1021
 3 new_ep_f2534  1021
 4 new_ep_f3544  1021
 5 new_ep_f4554  1017
 6 new_ep_f5564  1017
 7 new_ep_f65    1014
 8 new_ep_m014   1038
 9 new_ep_m1524  1026
10 new_ep_m2534  1020
# ... with 46 more rows

The new key column can be parsed further:

The first three letters denote whether it is a new or old case. In this dataset they are all new.
The next two letters describe the type of TB. The values are rel, ep, sn, sp
The sixth letter represent’s the patients sex, M,F.
The remaining letter represent an age group. ex 014 = 0 to 14.

Most follow the format new_ep etc., but instead of new_rel we have newrel. We can change that with str_replace().

> who2 <- who1 %>% 
+   mutate(key=stringr::str_replace(key,"newrel","new_rel"))
> who2

# A tibble: 76,046 x 6
   country     iso2  iso3   year key          cases
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1997 new_sp_m014      0
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
 7 Afghanistan AF    AFG    1997 new_sp_m65       0
 8 Afghanistan AF    AFG    1997 new_sp_f014      5
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38
10 Afghanistan AF    AFG    1997 new_sp_f2534    36
# ... with 76,036 more rows

The we can separate key into more useful columns.

> who3 <- who2 %>% 
+   separate(key, c("new","type","sexage"),sep="_")
> who3

# A tibble: 76,046 x 8
   country     iso2  iso3   year new   type  sexage cases
   <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int>
 1 Afghanistan AF    AFG    1997 new   sp    m014       0
 2 Afghanistan AF    AFG    1997 new   sp    m1524     10
 3 Afghanistan AF    AFG    1997 new   sp    m2534      6
 4 Afghanistan AF    AFG    1997 new   sp    m3544      3
 5 Afghanistan AF    AFG    1997 new   sp    m4554      5
 6 Afghanistan AF    AFG    1997 new   sp    m5564      2
 7 Afghanistan AF    AFG    1997 new   sp    m65        0
 8 Afghanistan AF    AFG    1997 new   sp    f014       5
 9 Afghanistan AF    AFG    1997 new   sp    f1524     38
10 Afghanistan AF    AFG    1997 new   sp    f2534     36
# ... with 76,036 more rows

Since iso2 and iso3 are redundant (we already have country) and since all cases are “new” we can drop those three columns.

> who4 <- who3 %>% 
+   select(-new,-iso2,-iso3)

Finally, we can split sexage into sex and age.

> who5 <- who4 %>% 
+   separate(sexage,c("sex","age"),sep=1)
> who5

# A tibble: 76,046 x 6
   country      year type  sex   age   cases
   <chr>       <int> <chr> <chr> <chr> <int>
 1 Afghanistan  1997 sp    m     014       0
 2 Afghanistan  1997 sp    m     1524     10
 3 Afghanistan  1997 sp    m     2534      6
 4 Afghanistan  1997 sp    m     3544      3
 5 Afghanistan  1997 sp    m     4554      5
 6 Afghanistan  1997 sp    m     5564      2
 7 Afghanistan  1997 sp    m     65        0
 8 Afghanistan  1997 sp    f     014       5
 9 Afghanistan  1997 sp    f     1524     38
10 Afghanistan  1997 sp    f     2534     36
# ... with 76,036 more rows

The resulting dataset is now tidy.

Case Study - Exercises

> (who1 <- who %>% 
+   pivot_longer(
+     cols = new_sp_m014:newrel_f65, 
+     names_to = "key", 
+     values_to = "cases", 
+     values_drop_na = TRUE
+   ))

# A tibble: 76,046 x 6
   country     iso2  iso3   year key          cases
   <chr>       <chr> <chr> <int> <chr>        <int>
 1 Afghanistan AF    AFG    1997 new_sp_m014      0
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2
 7 Afghanistan AF    AFG    1997 new_sp_m65       0
 8 Afghanistan AF    AFG    1997 new_sp_f014      5
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38
10 Afghanistan AF    AFG    1997 new_sp_f2534    36
# ... with 76,036 more rows

> (who2 <- who1 %>% 
+   mutate(names_from = stringr::str_replace(key, "newrel", "new_rel")))

# A tibble: 76,046 x 7
   country     iso2  iso3   year key          cases names_from  
   <chr>       <chr> <chr> <int> <chr>        <int> <chr>       
 1 Afghanistan AF    AFG    1997 new_sp_m014      0 new_sp_m014 
 2 Afghanistan AF    AFG    1997 new_sp_m1524    10 new_sp_m1524
 3 Afghanistan AF    AFG    1997 new_sp_m2534     6 new_sp_m2534
 4 Afghanistan AF    AFG    1997 new_sp_m3544     3 new_sp_m3544
 5 Afghanistan AF    AFG    1997 new_sp_m4554     5 new_sp_m4554
 6 Afghanistan AF    AFG    1997 new_sp_m5564     2 new_sp_m5564
 7 Afghanistan AF    AFG    1997 new_sp_m65       0 new_sp_m65  
 8 Afghanistan AF    AFG    1997 new_sp_f014      5 new_sp_f014 
 9 Afghanistan AF    AFG    1997 new_sp_f1524    38 new_sp_f1524
10 Afghanistan AF    AFG    1997 new_sp_f2534    36 new_sp_f2534
# ... with 76,036 more rows

> (who3 <- who2 %>% 
+   separate(key, c("new", "type", "sexage"), sep = "_"))

# A tibble: 76,046 x 9
   country     iso2  iso3   year new   type  sexage cases names_from  
   <chr>       <chr> <chr> <int> <chr> <chr> <chr>  <int> <chr>       
 1 Afghanistan AF    AFG    1997 new   sp    m014       0 new_sp_m014 
 2 Afghanistan AF    AFG    1997 new   sp    m1524     10 new_sp_m1524
 3 Afghanistan AF    AFG    1997 new   sp    m2534      6 new_sp_m2534
 4 Afghanistan AF    AFG    1997 new   sp    m3544      3 new_sp_m3544
 5 Afghanistan AF    AFG    1997 new   sp    m4554      5 new_sp_m4554
 6 Afghanistan AF    AFG    1997 new   sp    m5564      2 new_sp_m5564
 7 Afghanistan AF    AFG    1997 new   sp    m65        0 new_sp_m65  
 8 Afghanistan AF    AFG    1997 new   sp    f014       5 new_sp_f014 
 9 Afghanistan AF    AFG    1997 new   sp    f1524     38 new_sp_f1524
10 Afghanistan AF    AFG    1997 new   sp    f2534     36 new_sp_f2534
# ... with 76,036 more rows

> who3 %>%
+   count(new)

# A tibble: 2 x 2
  new        n
  <chr>  <int>
1 new    73466
2 newrel  2580

> (who4 <- who3 %>%
+   select(-new, -iso2, -iso3))

# A tibble: 76,046 x 6
   country      year type  sexage cases names_from  
   <chr>       <int> <chr> <chr>  <int> <chr>       
 1 Afghanistan  1997 sp    m014       0 new_sp_m014 
 2 Afghanistan  1997 sp    m1524     10 new_sp_m1524
 3 Afghanistan  1997 sp    m2534      6 new_sp_m2534
 4 Afghanistan  1997 sp    m3544      3 new_sp_m3544
 5 Afghanistan  1997 sp    m4554      5 new_sp_m4554
 6 Afghanistan  1997 sp    m5564      2 new_sp_m5564
 7 Afghanistan  1997 sp    m65        0 new_sp_m65  
 8 Afghanistan  1997 sp    f014       5 new_sp_f014 
 9 Afghanistan  1997 sp    f1524     38 new_sp_f1524
10 Afghanistan  1997 sp    f2534     36 new_sp_f2534
# ... with 76,036 more rows

> (who5 <- who4 %>%
+   separate(sexage, c("sex", "age"), sep = 1))

# A tibble: 76,046 x 7
   country      year type  sex   age   cases names_from  
   <chr>       <int> <chr> <chr> <chr> <int> <chr>       
 1 Afghanistan  1997 sp    m     014       0 new_sp_m014 
 2 Afghanistan  1997 sp    m     1524     10 new_sp_m1524
 3 Afghanistan  1997 sp    m     2534      6 new_sp_m2534
 4 Afghanistan  1997 sp    m     3544      3 new_sp_m3544
 5 Afghanistan  1997 sp    m     4554      5 new_sp_m4554
 6 Afghanistan  1997 sp    m     5564      2 new_sp_m5564
 7 Afghanistan  1997 sp    m     65        0 new_sp_m65  
 8 Afghanistan  1997 sp    f     014       5 new_sp_f014 
 9 Afghanistan  1997 sp    f     1524     38 new_sp_f1524
10 Afghanistan  1997 sp    f     2534     36 new_sp_f2534
# ... with 76,036 more rows

In this case study, I set values_drop_na = TRUE just to make it easier to check that we had the correct values. Is this reasonable? Think about how missing values are represented in this dataset. Are there implicit missing values? What’s the difference between an NA and zero?

An NA is an explicitly missing value and a 0 is a recorded value of no TB cases.
There are some implicit missing values, where a county,year combination is not present.

> #Cases recorded as 0
> who1 %>%
+   filter(cases == 0) %>%
+   nrow()

[1] 11080

> #Implicit missing values
> nrow(who)

[1] 7240

> who %>% 
+   complete(country, year) %>%
+   nrow()

[1] 7446

What happens if you neglect the mutate() step? (mutate(key = str_replace(key, "newrel", "new_rel"))

It’s expecting 3 columns but will only have 2, so the data is wrong and some values are also missing.

> who3a <- who1 %>%
+   separate(key, c("new", "type", "sexage"), sep = "_")

Warning: Expected 3 pieces. Missing pieces filled with `NA` in 2580 rows [243,
244, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 903,
904, 905, 906, ...].

> filter(who3a, new == "newrel") %>% head()

# A tibble: 6 x 8
  country     iso2  iso3   year new    type  sexage cases
  <chr>       <chr> <chr> <int> <chr>  <chr> <chr>  <int>
1 Afghanistan AF    AFG    2013 newrel m014  <NA>    1705
2 Afghanistan AF    AFG    2013 newrel f014  <NA>    1749
3 Albania     AL    ALB    2013 newrel m014  <NA>      14
4 Albania     AL    ALB    2013 newrel m1524 <NA>      60
5 Albania     AL    ALB    2013 newrel m2534 <NA>      61
6 Albania     AL    ALB    2013 newrel m3544 <NA>      32

I claimed that iso2 and iso3 were redundant with country. Confirm this claim.

This is a correct assumption

> who3 %>% select(country, iso2, iso3) %>%
+   distinct() %>%
+   group_by(country) %>%
+   filter(n() > 1)

# A tibble: 0 x 3
# Groups:   country [0]
# ... with 3 variables: country <chr>, iso2 <chr>, iso3 <chr>

For each country, year, and sex compute the total number of cases of TB. Make an informative visualization of the data.

> who5 %>%
+   group_by(country, year, sex) %>% 
+   summarize(cases = sum(cases)) %>% 
+   na.omit() %>% 
+   unite(country_sex, country, sex, remove = FALSE) %>% 
+   filter(year > 1995) %>% 
+   ggplot(aes(x = year, y = cases, group = country_sex, color = sex)) +
+   geom_line()

Tidying Data with tidyr

R Markdown

Paul Jozefek

2020-07-31