##Tibbles

library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Creating Tibbles

as_tibble(iris)
## # A tibble: 150 × 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ℹ 140 more rows
#> # A tibble: 150 × 5
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#>          <dbl>       <dbl>        <dbl>       <dbl> <fct>  
#> 1          5.1         3.5          1.4         0.2 setosa 
#> 2          4.9         3            1.4         0.2 setosa 
#> 3          4.7         3.2          1.3         0.2 setosa 
#> 4          4.6         3.1          1.5         0.2 setosa 
#> 5          5           3.6          1.4         0.2 setosa 
#> 6          5.4         3.9          1.7         0.4 setosa 
#> # ℹ 144 more rows
tibble(
  x = 1:5, 
  y = 1, 
  z = x ^ 2 + y
)
## # A tibble: 5 × 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     1     2
## 2     2     1     5
## 3     3     1    10
## 4     4     1    17
## 5     5     1    26
#> # A tibble: 5 × 3
#>       x     y     z
#>   <int> <dbl> <dbl>
#> 1     1     1     2
#> 2     2     1     5
#> 3     3     1    10
#> 4     4     1    17
#> 5     5     1    26
tb <- tibble(
  `:)` = "smile", 
  ` ` = "space",
  `2000` = "number"
)
tb
## # A tibble: 1 × 3
##   `:)`  ` `   `2000`
##   <chr> <chr> <chr> 
## 1 smile space number
#> # A tibble: 1 × 3
#>   `:)`  ` `   `2000`
#>   <chr> <chr> <chr> 
#> 1 smile space number
tribble(
  ~x, ~y, ~z,
  #--|--|----
  "a", 2, 3.6,
  "b", 1, 8.5
)
## # A tibble: 2 × 3
##   x         y     z
##   <chr> <dbl> <dbl>
## 1 a         2   3.6
## 2 b         1   8.5
#> # A tibble: 2 × 3
#>   x         y     z
#>   <chr> <dbl> <dbl>
#> 1 a         2   3.6
#> 2 b         1   8.5

Tibbles vs. data.frame

tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3),
  e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 × 5
##    a                   b              c      d e    
##    <dttm>              <date>     <int>  <dbl> <chr>
##  1 2024-11-15 00:49:36 2024-12-09     1 0.0469 m    
##  2 2024-11-15 08:07:52 2024-12-08     2 0.624  m    
##  3 2024-11-14 10:37:45 2024-11-30     3 0.519  u    
##  4 2024-11-14 19:24:37 2024-11-25     4 0.945  v    
##  5 2024-11-14 11:09:54 2024-12-08     5 0.843  j    
##  6 2024-11-14 11:21:09 2024-11-15     6 0.933  w    
##  7 2024-11-14 10:56:23 2024-12-03     7 0.108  l    
##  8 2024-11-15 05:38:29 2024-12-02     8 0.448  v    
##  9 2024-11-14 15:52:56 2024-11-23     9 0.245  u    
## 10 2024-11-14 18:05:10 2024-12-09    10 0.720  v    
## # ℹ 990 more rows
#> # A tibble: 1,000 × 5
#>   a                   b              c     d e    
#>   <dttm>              <date>     <int> <dbl> <chr>
#> 1 2023-07-24 17:04:35 2023-07-31     1 0.368 n    
#> 2 2023-07-25 11:09:45 2023-08-05     2 0.612 l    
#> 3 2023-07-25 05:33:24 2023-08-15     3 0.415 p    
#> 4 2023-07-24 18:54:41 2023-08-14     4 0.212 m    
#> 5 2023-07-24 15:18:58 2023-08-11     5 0.733 i    
#> 6 2023-07-25 02:19:55 2023-08-07     6 0.460 n    
#> # ℹ 994 more rows
nycflights13::flights %>% 
  print(n = 10, width = Inf)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
##    arr_delay carrier flight tailnum origin dest  air_time distance  hour minute
##        <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>    <dbl> <dbl>  <dbl>
##  1        11 UA        1545 N14228  EWR    IAH        227     1400     5     15
##  2        20 UA        1714 N24211  LGA    IAH        227     1416     5     29
##  3        33 AA        1141 N619AA  JFK    MIA        160     1089     5     40
##  4       -18 B6         725 N804JB  JFK    BQN        183     1576     5     45
##  5       -25 DL         461 N668DN  LGA    ATL        116      762     6      0
##  6        12 UA        1696 N39463  EWR    ORD        150      719     5     58
##  7        19 B6         507 N516JB  EWR    FLL        158     1065     6      0
##  8       -14 EV        5708 N829AS  LGA    IAD         53      229     6      0
##  9        -8 B6          79 N593JB  JFK    MCO        140      944     6      0
## 10         8 AA         301 N3ALAA  LGA    ORD        138      733     6      0
##    time_hour          
##    <dttm>             
##  1 2013-01-01 05:00:00
##  2 2013-01-01 05:00:00
##  3 2013-01-01 05:00:00
##  4 2013-01-01 05:00:00
##  5 2013-01-01 06:00:00
##  6 2013-01-01 05:00:00
##  7 2013-01-01 06:00:00
##  8 2013-01-01 06:00:00
##  9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ℹ 336,766 more rows
nycflights13::flights %>% 
  View()
df <- tibble(
  x = runif(5),
  y = rnorm(5)
)

# Extract by name
df$x
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
df[["x"]]
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161

# Extract by position
df[[1]]
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
df %>% .$x
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
df %>% .[["x"]]
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161

Interacting with older code

class(as.data.frame(tb))
## [1] "data.frame"
#> [1] "data.frame"

Exercises

  1. How can you tell if an object is a tibble? You can tell if an object is tibble by putting the object through a function to find out if it is tibble. 2.Compare and contrast the following operations on a data.frame and equivalent tibble. What is different? Why might the default data frame behaviours cause you frustration?
df <- data.frame(abc = 1, xyz = "a")
df$x
## [1] "a"
df[, "xyz"]
## [1] "a"
df[, c("abc", "xyz")]
##   abc xyz
## 1   1   a

What is different are the columns that you thought you were using. It will cause frustration when if you try to use one column it won’t return a data frame but if you use more than one you will get a data frame.

  1. If you have the name of a variable stored in an object, e.g. var <- “mpg”, how can you extract the reference variable from a tibble? You can use the double bracket, like df[[var]]. You cannot use the dollar sign, because df$var would look for a column named var.

  2. Practice referring to non-syntactic names in the following data frame by:

Extracting the variable called 1. Plotting a scatterplot of 1 vs 2. Creating a new column called 3 which is 2 divided by 1. Renaming the columns to one, two and three.

annoying <- tibble(
  `1` = 1:10,
  `2` = `1` * 2 + rnorm(length(`1`))
)
annoying[["1"]]
##  [1]  1  2  3  4  5  6  7  8  9 10
#>  [1]  1  2  3  4  5  6  7  8  9 10
ggplot(annoying, aes(x = `1`, y = `2`)) +
  geom_point()

mutate(annoying, `3` = `2` / `1`)
## # A tibble: 10 × 3
##      `1`   `2`   `3`
##    <int> <dbl> <dbl>
##  1     1  2.40  2.40
##  2     2  3.34  1.67
##  3     3  6.69  2.23
##  4     4  7.87  1.97
##  5     5 10.3   2.05
##  6     6  9.74  1.62
##  7     7 14.0   2.01
##  8     8 15.5   1.94
##  9     9 16.2   1.80
## 10    10 19.5   1.95
#> # A tibble: 10 x 3
#>     `1`    `2`   `3`
#>   <int>  <dbl> <dbl>
#> 1     1  0.600 0.600
#> 2     2  4.26  2.13 
#> 3     3  3.56  1.19 
#> 4     4  7.99  2.00 
#> 5     5 10.6   2.12 
#> 6     6 13.1   2.19 
#> # … with 4 more rows
  1. What does tibble::enframe() do? When might you use it? The function tibble::enframe() converts named vectors to a data frame with names and values
enframe(c(a = 1, b = 2, c = 3))
## # A tibble: 3 × 2
##   name  value
##   <chr> <dbl>
## 1 a         1
## 2 b         2
## 3 c         3
#> # A tibble: 3 x 2
#>   name  value
#>   <chr> <dbl>
#> 1 a         1
#> 2 b         2
#> 3 c         3
  1. What option controls how many additional column names are printed at the footer of a tibble? The help page for the print() method of tibble objects is discussed in ?print.tbl. The n_extra argument determines the number of extra columns to print information for.

Chapter 11

##Prerequisites

library(tidyverse)
read_csv("a,b,c
1,2,3
4,5,6")
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
#> Rows: 2 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): a, b, c
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 2 × 3
#>       a     b     c
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
#> Rows: 1 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): x, y, z
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 1 × 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
#> Rows: 1 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): x, y, z
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 1 × 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): X1, X2, X3
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
#> Rows: 2 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): X1, X2, X3
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 2 × 3
#>      X1    X2    X3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
#> Rows: 2 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): x, y, z
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 2 × 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
read_csv("a,b,c\n1,2,.", na = ".")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## lgl (1): c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
##       a     b c    
##   <dbl> <dbl> <lgl>
## 1     1     2 NA
#> Rows: 1 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (2): a, b
#> lgl (1): c
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 1 × 3
#>       a     b c    
#>   <dbl> <dbl> <lgl>
#> 1     1     2 NA

Exercises

  1. What function would you use to read a file where fields were separated with “|”? Use the read_delim() function with the argument delim=“|”.

  2. Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common?

col_names and col_types are used to specify the column names and how to parse the columns locale is important for determining things like the encoding and whether “.” or “,” is used as a decimal mark. na and quoted_na control which strings are treated as missing values when parsing vectors trim_ws trims whitespace before and after cells before parsing n_max sets how many rows to read guess_max sets how many rows to use when guessing the column type progress determines whether a progress bar is shown

  1. What are the most important arguments to read_fwf()? The most important argument to read_fwf() which reads “fixed-width formats”, is col_positions which tells the function where data columns begin and end.

  2. Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like ” or ’. By default, read_csv() assumes that the quoting character will be “. What argument to read_csv() do you need to specify to read the following text into a data frame?

"x,y\n1,'a,b'"
## [1] "x,y\n1,'a,b'"

You need to read these codes

x <- "x,y\n1,'a,b'"
read_delim(x, ",", quote = "'")
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): y
## dbl (1): x
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b
#> # A tibble: 1 x 2
#>       x y    
#>   <dbl> <chr>
#> 1     1 a,b
read_csv(x, quote = "'")
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): y
## dbl (1): x
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b
#> # A tibble: 1 x 2
#>       x y    
#>   <dbl> <chr>
#> 1     1 a,b
  1. Identify what is wrong with each of the following inline CSV files. What happens when you run the code?
read_csv("a,b\n1,2,3\n4,5,6")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): a
## num (1): b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
##       a     b
##   <dbl> <dbl>
## 1     1    23
## 2     4    56
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## num (1): c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2    NA
## 2     1     2    34
read_csv("a,b\n\"1")
## Rows: 0 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 0 × 2
## # ℹ 2 variables: a <chr>, b <chr>
read_csv("a,b\n1,2\na,b")
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
##   a     b    
##   <chr> <chr>
## 1 1     2    
## 2 a     b
read_csv("a;b\n1;3")
## Rows: 1 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): a;b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 1
##   `a;b`
##   <chr>
## 1 1;3
read_csv("a,b\n1,2,3\n4,5,6")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): a
## num (1): b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
##       a     b
##   <dbl> <dbl>
## 1     1    23
## 2     4    56
#> Warning: 2 parsing failures.
#> row col  expected    actual         file
#>   1  -- 2 columns 3 columns literal data
#>   2  -- 2 columns 3 columns literal data
#> # A tibble: 2 x 2
#>       a     b
#>   <dbl> <dbl>
#> 1     1     2
#> 2     4     5
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## num (1): c
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2    NA
## 2     1     2    34
#> Warning: 2 parsing failures.
#> row col  expected    actual         file
#>   1  -- 3 columns 2 columns literal data
#>   2  -- 3 columns 4 columns literal data
#> # A tibble: 2 x 3
#>       a     b     c
#>   <dbl> <dbl> <dbl>
#> 1     1     2    NA
#> 2     1     2     3
read_csv("a,b\n\"1")
## Rows: 0 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 0 × 2
## # ℹ 2 variables: a <chr>, b <chr>
#> Warning: 2 parsing failures.
#> row col                     expected    actual         file
#>   1  a  closing quote at end of file           literal data
#>   1  -- 2 columns                    1 columns literal data
#> # A tibble: 1 x 2
#>       a b    
#>   <dbl> <chr>
#> 1     1 <NA>
read_csv("a,b\n1,2\na,b")
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
##   a     b    
##   <chr> <chr>
## 1 1     2    
## 2 a     b
#> # A tibble: 2 x 2
#>   a     b    
#>   <chr> <chr>
#> 1 1     2    
#> 2 a     b
read_csv("a;b\n1;3")
## Rows: 1 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): a;b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 1
##   `a;b`
##   <chr>
## 1 1;3
#> # A tibble: 1 x 1
#>   `a;b`
#>   <chr>
#> 1 1;3
read_csv2("a;b\n1;3")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (2): a, b
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
##       a     b
##   <dbl> <dbl>
## 1     1     3
#> Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
#> # A tibble: 1 x 2
#>       a     b
#>   <dbl> <dbl>
#> 1     1     3

Parsing a vector

str(parse_logical(c("TRUE", "FALSE", "NA")))
##  logi [1:3] TRUE FALSE NA
#>  logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
##  int [1:3] 1 2 3
#>  int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
##  Date[1:2], format: "2010-01-01" "1979-10-14"
#>  Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1", "231", ".", "456"), na = ".")
## [1]   1 231  NA 456
#> [1]   1 231  NA 456
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col               expected actual
##   3  -- no trailing characters abc   
##   4  -- no trailing characters 123.45
#> Warning: 2 parsing failures.
#> row col               expected actual
#>   3  -- no trailing characters abc   
#>   4  -- no trailing characters 123.45
x
## [1] 123 345  NA  NA
## attr(,"problems")
## # A tibble: 2 × 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA no trailing characters abc   
## 2     4    NA no trailing characters 123.45
#> [1] 123 345  NA  NA
#> attr(,"problems")
#> # A tibble: 2 × 4
#>     row   col expected               actual
#>   <int> <int> <chr>                  <chr> 
#> 1     3    NA no trailing characters abc   
#> 2     4    NA no trailing characters 123.45
problems(x)
## # A tibble: 2 × 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA no trailing characters abc   
## 2     4    NA no trailing characters 123.45
#> # A tibble: 2 × 4
#>     row   col expected               actual
#>   <int> <int> <chr>                  <chr> 
#> 1     3    NA no trailing characters abc   
#> 2     4    NA no trailing characters 123.45

Numbers

parse_double("1.23")
## [1] 1.23
#> [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
#> [1] 1.23
parse_number("$100")
## [1] 100
#> [1] 100
parse_number("20%")
## [1] 20
#> [1] 20
parse_number("It cost $123.45")
## [1] 123.45
#> [1] 123.45
# Used in America
parse_number("$123,456,789")
## [1] 123456789
#> [1] 123456789

# Used in many parts of Europe
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
#> [1] 123456789

# Used in Switzerland
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
## [1] 123456789
#> [1] 123456789

Strings

charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
#> [1] 48 61 64 6c 65 79
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"

x1
## [1] "El Ni\xf1o was particularly bad this year"
#> [1] "El Ni\xf1o was particularly bad this year"
x2
## [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
#> [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niño was particularly bad this year"
#> [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
## [1] "こんにちは"
#> [1] "こんにちは"
guess_encoding(charToRaw(x1))
## # A tibble: 2 × 2
##   encoding   confidence
##   <chr>           <dbl>
## 1 ISO-8859-1       0.46
## 2 ISO-8859-9       0.23
#> # A tibble: 2 × 2
#>   encoding   confidence
#>   <chr>           <dbl>
#> 1 ISO-8859-1       0.46
#> 2 ISO-8859-9       0.23
guess_encoding(charToRaw(x2))
## # A tibble: 1 × 2
##   encoding confidence
##   <chr>         <dbl>
## 1 KOI8-R         0.42
#> # A tibble: 1 × 2
#>   encoding confidence
#>   <chr>         <dbl>
#> 1 KOI8-R         0.42

Factors

fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col           expected   actual
##   3  -- value in level set bananana
## [1] apple  banana <NA>  
## attr(,"problems")
## # A tibble: 1 × 4
##     row   col expected           actual  
##   <int> <int> <chr>              <chr>   
## 1     3    NA value in level set bananana
## Levels: apple banana
#> Warning: 1 parsing failure.
#> row col           expected   actual
#>   3  -- value in level set bananana
#> [1] apple  banana <NA>  
#> attr(,"problems")
#> # A tibble: 1 × 4
#>     row   col expected           actual  
#>   <int> <int> <chr>              <chr>   
#> 1     3    NA value in level set bananana
#> Levels: apple banana

Dates, date-times, and times

parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
#> [1] "2010-10-01 20:10:00 UTC"
# If time is omitted, it will be set to midnight
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
#> [1] "2010-10-10 UTC"
parse_date("2010-10-01")
## [1] "2010-10-01"
#> [1] "2010-10-01"
library(hms)
## 
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
## 
##     hms
#> 
#> Attaching package: 'hms'
#> The following object is masked from 'package:lubridate':
#> 
#>     hms
parse_time("01:10 am")
## 01:10:00
#> 01:10:00
parse_time("20:10:01")
## 20:10:01
#> 20:10:01
parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
#> [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
## [1] "2015-02-01"
#> [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
## [1] "2001-02-15"
#> [1] "2001-02-15"
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
## [1] "2015-01-01"
#> [1] "2015-01-01"

Exercises

  1. What are the most important arguments to locale()? The locale object has arguments to set the following:

date and time formats: date_names, date_format, and time_format time zone: tz numbers: decimal_mark, grouping_mark encoding: encoding

  1. What happens if you try and set decimal_mark and grouping_mark to the same character? What happens to the default value of grouping_mark when you set decimal_mark to “,”? What happens to the default value of decimal_mark when you set the grouping_mark to “.”?

If the decimal and grouping marks are set to the same character, locale throws an error If the decimal_mark is set to the comma “,”, then the grouping mark is set to the period “.” If the grouping mark is set to a period, then the decimal mark is set to a comma

  1. I didn’t discuss the date_format and time_format options to locale(). What do they do? Construct an example that shows when they might be useful.
locale()
## <locale>
## Numbers:  123,456.78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
#>         (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
#>         June (Jun), July (Jul), August (Aug), September (Sep), October
#>         (Oct), November (Nov), December (Dec)
#> AM/PM:  AM/PM
locale_custom <- locale(date_format = "Day %d Mon %M Year %y",
                 time_format = "Sec %S Min %M Hour %H")
date_custom <- c("Day 01 Mon 02 Year 03", "Day 03 Mon 01 Year 01")
parse_date(date_custom)
## Warning: 2 parsing failures.
## row col   expected                actual
##   1  -- date like  Day 01 Mon 02 Year 03
##   2  -- date like  Day 03 Mon 01 Year 01
## [1] NA NA
#> Warning: 2 parsing failures.
#> row col   expected                actual
#>   1  -- date like  Day 01 Mon 02 Year 03
#>   2  -- date like  Day 03 Mon 01 Year 01
#> [1] NA NA
parse_date(date_custom, locale = locale_custom)
## [1] "2003-01-01" "2001-01-03"
#> [1] "2003-01-01" "2001-01-03"
time_custom <- c("Sec 01 Min 02 Hour 03", "Sec 03 Min 02 Hour 01")
parse_time(time_custom)
## Warning: 2 parsing failures.
## row col   expected                actual
##   1  -- time like  Sec 01 Min 02 Hour 03
##   2  -- time like  Sec 03 Min 02 Hour 01
## NA
## NA
#> Warning: 2 parsing failures.
#> row col   expected                actual
#>   1  -- time like  Sec 01 Min 02 Hour 03
#>   2  -- time like  Sec 03 Min 02 Hour 01
#> NA
#> NA
parse_time(time_custom, locale = locale_custom)
## 03:02:01
## 01:02:03
#> 03:02:01
#> 01:02:03

They provide default date and time formats. The readr vignette discusses using these to parse dates: since dates can include languages specific weekday and month names, and different conventions for specifying AM/PM Both the date format and time format are used for guessing column types. Thus if you were often parsing data that had non-standard formats for the date and time, you could specify custom values for date_format and time_format.

  1. If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly.
parse_date("02/01/2006")
## Warning: 1 parsing failure.
## row col   expected     actual
##   1  -- date like  02/01/2006
## [1] NA
#> Warning: 1 parsing failure.
#> row col   expected     actual
#>   1  -- date like  02/01/2006
#> [1] NA
  1. What’s the difference between read_csv() and read_csv2()? The delimiter. The function read_csv() uses a comma, while read_csv2() uses a semi-colon (;). Using a semi-colon is useful when commas are used as the decimal point (as in Europe).

Tidy data

table1
## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <dbl>  <dbl>      <dbl>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
#> # A tibble: 6 × 4
#>   country      year  cases population
#>   <chr>       <dbl>  <dbl>      <dbl>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583
table2
## # A tibble: 12 × 4
##    country      year type            count
##    <chr>       <dbl> <chr>           <dbl>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
#> # A tibble: 12 × 4
#>   country      year type           count
#>   <chr>       <dbl> <chr>          <dbl>
#> 1 Afghanistan  1999 cases            745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000 cases           2666
#> 4 Afghanistan  2000 population  20595360
#> 5 Brazil       1999 cases          37737
#> 6 Brazil       1999 population 172006362
#> # ℹ 6 more rows
table3
## # A tibble: 6 × 3
##   country      year rate             
##   <chr>       <dbl> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
#> # A tibble: 6 × 3
#>   country      year rate             
#>   <chr>       <dbl> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583

# Spread across two tibbles
table4a  # cases
## # A tibble: 3 × 3
##   country     `1999` `2000`
##   <chr>        <dbl>  <dbl>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
#> # A tibble: 3 × 3
#>   country     `1999` `2000`
#>   <chr>        <dbl>  <dbl>
#> 1 Afghanistan    745   2666
#> 2 Brazil       37737  80488
#> 3 China       212258 213766
table4b  # population
## # A tibble: 3 × 3
##   country         `1999`     `2000`
##   <chr>            <dbl>      <dbl>
## 1 Afghanistan   19987071   20595360
## 2 Brazil       172006362  174504898
## 3 China       1272915272 1280428583
#> # A tibble: 3 × 3
#>   country         `1999`     `2000`
#>   <chr>            <dbl>      <dbl>
#> 1 Afghanistan   19987071   20595360
#> 2 Brazil       172006362  174504898
#> 3 China       1272915272 1280428583
# Compute rate per 10,000
table1 %>% 
  mutate(rate = cases / population * 10000)
## # A tibble: 6 × 5
##   country      year  cases population  rate
##   <chr>       <dbl>  <dbl>      <dbl> <dbl>
## 1 Afghanistan  1999    745   19987071 0.373
## 2 Afghanistan  2000   2666   20595360 1.29 
## 3 Brazil       1999  37737  172006362 2.19 
## 4 Brazil       2000  80488  174504898 4.61 
## 5 China        1999 212258 1272915272 1.67 
## 6 China        2000 213766 1280428583 1.67
#> # A tibble: 6 × 5
#>   country      year  cases population  rate
#>   <chr>       <dbl>  <dbl>      <dbl> <dbl>
#> 1 Afghanistan  1999    745   19987071 0.373
#> 2 Afghanistan  2000   2666   20595360 1.29 
#> 3 Brazil       1999  37737  172006362 2.19 
#> 4 Brazil       2000  80488  174504898 4.61 
#> 5 China        1999 212258 1272915272 1.67 
#> 6 China        2000 213766 1280428583 1.67

# Compute cases per year
table1 %>% 
  count(year, wt = cases)
## # A tibble: 2 × 2
##    year      n
##   <dbl>  <dbl>
## 1  1999 250740
## 2  2000 296920
#> # A tibble: 2 × 2
#>    year      n
#>   <dbl>  <dbl>
#> 1  1999 250740
#> 2  2000 296920

# Visualise changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) + 
  geom_line(aes(group = country), colour = "grey50") + 
  geom_point(aes(colour = country))

Exercise

  1. Using prose, describe how the variables and observations are organized in each of the sample tables. In table table1, each row represents a (country, year) combination. The columns cases and population contain the values for those variables.
table1
## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <dbl>  <dbl>      <dbl>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

In table2, each row represents a (country, year, variable) combination. The column count contains the values of variables cases and population in separate rows.

table2
## # A tibble: 12 × 4
##    country      year type            count
##    <chr>       <dbl> <chr>           <dbl>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
#> # A tibble: 12 x 4
#>   country      year type           count
#>   <chr>       <int> <chr>          <int>
#> 1 Afghanistan  1999 cases            745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000 cases           2666
#> 4 Afghanistan  2000 population  20595360
#> 5 Brazil       1999 cases          37737
#> 6 Brazil       1999 population 172006362
#> # … with 6 more rows
  1. Compute the rate for table2, and table4a + table4b. You will need to perform four operations:

Extract the number of TB cases per country per year. Extract the matching population per country per year. Divide cases by population, and multiply by 10000. Store back in the appropriate place. Which representation is easiest to work with? Which is hardest? Why?

Neither table is particularly easy to work with. Since table2 has separate rows for cases and population we needed to generate a table with columns for cases and population where we could calculate cases per capita. table4a and table4b split the cases and population variables into different tables which made it easy to divide cases by population. However, we had to repeat this calculation for each row.

The ideal format of a data frame to answer this question is one with columns country, year, cases, and population. Then problem could be answered with a single mutate() call.

  1. Recreate the plot showing change in cases over time using table2 instead of table1. What do you need to do first? Before creating the plot with change in cases over time, we need to filter table to only include rows representing cases of TB.
table2 %>%
  filter(type == "cases") %>%
  ggplot(aes(year, count)) +
  geom_line(aes(group = country), colour = "grey50") +
  geom_point(aes(colour = country)) +
  scale_x_continuous(breaks = unique(table2$year)) +
  ylab("cases")

Pivoting

table4a
## # A tibble: 3 × 3
##   country     `1999` `2000`
##   <chr>        <dbl>  <dbl>
## 1 Afghanistan    745   2666
## 2 Brazil       37737  80488
## 3 China       212258 213766
#> # A tibble: 3 × 3
#>   country     `1999` `2000`
#>   <chr>        <dbl>  <dbl>
#> 1 Afghanistan    745   2666
#> 2 Brazil       37737  80488
#> 3 China       212258 213766
table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
## # A tibble: 6 × 3
##   country     year   cases
##   <chr>       <chr>  <dbl>
## 1 Afghanistan 1999     745
## 2 Afghanistan 2000    2666
## 3 Brazil      1999   37737
## 4 Brazil      2000   80488
## 5 China       1999  212258
## 6 China       2000  213766
#> # A tibble: 6 × 3
#>   country     year   cases
#>   <chr>       <chr>  <dbl>
#> 1 Afghanistan 1999     745
#> 2 Afghanistan 2000    2666
#> 3 Brazil      1999   37737
#> 4 Brazil      2000   80488
#> 5 China       1999  212258
#> 6 China       2000  213766
table4b %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
## # A tibble: 6 × 3
##   country     year  population
##   <chr>       <chr>      <dbl>
## 1 Afghanistan 1999    19987071
## 2 Afghanistan 2000    20595360
## 3 Brazil      1999   172006362
## 4 Brazil      2000   174504898
## 5 China       1999  1272915272
## 6 China       2000  1280428583
#> # A tibble: 6 × 3
#>   country     year  population
#>   <chr>       <chr>      <dbl>
#> 1 Afghanistan 1999    19987071
#> 2 Afghanistan 2000    20595360
#> 3 Brazil      1999   172006362
#> 4 Brazil      2000   174504898
#> 5 China       1999  1272915272
#> 6 China       2000  1280428583
tidy4a <- table4a %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>% 
  pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)
## Joining with `by = join_by(country, year)`
## # A tibble: 6 × 4
##   country     year   cases population
##   <chr>       <chr>  <dbl>      <dbl>
## 1 Afghanistan 1999     745   19987071
## 2 Afghanistan 2000    2666   20595360
## 3 Brazil      1999   37737  172006362
## 4 Brazil      2000   80488  174504898
## 5 China       1999  212258 1272915272
## 6 China       2000  213766 1280428583
#> Joining with `by = join_by(country, year)`
#> # A tibble: 6 × 4
#>   country     year   cases population
#>   <chr>       <chr>  <dbl>      <dbl>
#> 1 Afghanistan 1999     745   19987071
#> 2 Afghanistan 2000    2666   20595360
#> 3 Brazil      1999   37737  172006362
#> 4 Brazil      2000   80488  174504898
#> 5 China       1999  212258 1272915272
#> 6 China       2000  213766 1280428583
table2
## # A tibble: 12 × 4
##    country      year type            count
##    <chr>       <dbl> <chr>           <dbl>
##  1 Afghanistan  1999 cases             745
##  2 Afghanistan  1999 population   19987071
##  3 Afghanistan  2000 cases            2666
##  4 Afghanistan  2000 population   20595360
##  5 Brazil       1999 cases           37737
##  6 Brazil       1999 population  172006362
##  7 Brazil       2000 cases           80488
##  8 Brazil       2000 population  174504898
##  9 China        1999 cases          212258
## 10 China        1999 population 1272915272
## 11 China        2000 cases          213766
## 12 China        2000 population 1280428583
#> # A tibble: 12 × 4
#>   country      year type           count
#>   <chr>       <dbl> <chr>          <dbl>
#> 1 Afghanistan  1999 cases            745
#> 2 Afghanistan  1999 population  19987071
#> 3 Afghanistan  2000 cases           2666
#> 4 Afghanistan  2000 population  20595360
#> 5 Brazil       1999 cases          37737
#> 6 Brazil       1999 population 172006362
#> # ℹ 6 more rows
table2 %>%
    pivot_wider(names_from = type, values_from = count)
## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <dbl>  <dbl>      <dbl>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
#> # A tibble: 6 × 4
#>   country      year  cases population
#>   <chr>       <dbl>  <dbl>      <dbl>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

Exercises

  1. Why are pivot_longer() and pivot_wider() not perfectly symmetrical? Carefully consider the following example:
stocks <- tibble(
  year   = c(2015, 2015, 2016, 2016),
  half  = c(   1,    2,     1,    2),
  return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>% 
  pivot_wider(names_from = year, values_from = return) %>% 
  pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
## # A tibble: 4 × 3
##    half year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     1 2016    0.92
## 3     2 2015    0.59
## 4     2 2016    0.17

The functions pivot_longer() and pivot_wider() are not perfectly symmetrical because column type information is lost when a data frame is converted from wide to long. The function pivot_longer() stacks multiple columns which may have had multiple data types into a single column with a single data type. This transformation throws away the individual data types of the original columns. The function pivot_wider() creates column names from values in column. These column names will always be treated as character values by pivot_longer() so if the original variable used to create the column names did not have a character data type, then the round-trip will not reproduce the same dataset.

  1. Why does this code fail?

table4a %>% pivot_longer(c(1999, 2000), names_to = “year”, values_to = “cases”) #> Error in pivot_longer(): #> ! Can’t subset columns past the end. #> ℹ Locations 1999 and 2000 don’t exist. #> ℹ There are only 3 columns.

The code fails because the column names 1999 and 2000 are not non-syntactic variable names.

  1. What would happen if you widen this table? Why? How could you add a new column to uniquely identify each value?
people <- tribble(
  ~name,             ~names,  ~values,
  #-----------------|--------|------
  "Phillip Woods",   "age",       45,
  "Phillip Woods",   "height",   186,
  "Phillip Woods",   "age",       50,
  "Jessica Cordero", "age",       37,
  "Jessica Cordero", "height",   156
)

Widening this data frame using pivot_wider() produces columns that are lists of numeric vectors because the name and key columns do not uniquely identify rows.

  1. Tidy the simple tibble below. Do you need to make it wider or longer? What are the variables?
preg <- tribble(
  ~pregnant, ~male, ~female,
  "yes",     NA,    10,
  "no",      20,    12
)

To tidy the preg table use pivot_longer() to create a long table. The variables in this data are:

sex (“female”, “male”) pregnant (“yes”, “no”) count, which is a non-negative integer representing the number of observations.

Separating and uniting

table3
## # A tibble: 6 × 3
##   country      year rate             
##   <chr>       <dbl> <chr>            
## 1 Afghanistan  1999 745/19987071     
## 2 Afghanistan  2000 2666/20595360    
## 3 Brazil       1999 37737/172006362  
## 4 Brazil       2000 80488/174504898  
## 5 China        1999 212258/1272915272
## 6 China        2000 213766/1280428583
#> # A tibble: 6 × 3
#>   country      year rate             
#>   <chr>       <dbl> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583
table3 %>% 
  separate(rate, into = c("cases", "population"))
## # A tibble: 6 × 4
##   country      year cases  population
##   <chr>       <dbl> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
#> # A tibble: 6 × 4
#>   country      year cases  population
#>   <chr>       <dbl> <chr>  <chr>     
#> 1 Afghanistan  1999 745    19987071  
#> 2 Afghanistan  2000 2666   20595360  
#> 3 Brazil       1999 37737  172006362 
#> 4 Brazil       2000 80488  174504898 
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583
table3 %>% 
  separate(rate, into = c("cases", "population"), sep = "/")
## # A tibble: 6 × 4
##   country      year cases  population
##   <chr>       <dbl> <chr>  <chr>     
## 1 Afghanistan  1999 745    19987071  
## 2 Afghanistan  2000 2666   20595360  
## 3 Brazil       1999 37737  172006362 
## 4 Brazil       2000 80488  174504898 
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
table3 %>% 
  separate(rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 × 4
##   country      year  cases population
##   <chr>       <dbl>  <int>      <int>
## 1 Afghanistan  1999    745   19987071
## 2 Afghanistan  2000   2666   20595360
## 3 Brazil       1999  37737  172006362
## 4 Brazil       2000  80488  174504898
## 5 China        1999 212258 1272915272
## 6 China        2000 213766 1280428583
#> # A tibble: 6 × 4
#>   country      year  cases population
#>   <chr>       <dbl>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583
table3 %>% 
  separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 6 × 4
##   country     century year  rate             
##   <chr>       <chr>   <chr> <chr>            
## 1 Afghanistan 19      99    745/19987071     
## 2 Afghanistan 20      00    2666/20595360    
## 3 Brazil      19      99    37737/172006362  
## 4 Brazil      20      00    80488/174504898  
## 5 China       19      99    212258/1272915272
## 6 China       20      00    213766/1280428583
#> # A tibble: 6 × 4
#>   country     century year  rate             
#>   <chr>       <chr>   <chr> <chr>            
#> 1 Afghanistan 19      99    745/19987071     
#> 2 Afghanistan 20      00    2666/20595360    
#> 3 Brazil      19      99    37737/172006362  
#> 4 Brazil      20      00    80488/174504898  
#> 5 China       19      99    212258/1272915272
#> 6 China       20      00    213766/1280428583
table5 %>% 
  unite(new, century, year)
## # A tibble: 6 × 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 19_99 745/19987071     
## 2 Afghanistan 20_00 2666/20595360    
## 3 Brazil      19_99 37737/172006362  
## 4 Brazil      20_00 80488/174504898  
## 5 China       19_99 212258/1272915272
## 6 China       20_00 213766/1280428583
#> # A tibble: 6 × 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 19_99 745/19987071     
#> 2 Afghanistan 20_00 2666/20595360    
#> 3 Brazil      19_99 37737/172006362  
#> 4 Brazil      20_00 80488/174504898  
#> 5 China       19_99 212258/1272915272
#> 6 China       20_00 213766/1280428583
table5 %>% 
  unite(new, century, year, sep = "")
## # A tibble: 6 × 3
##   country     new   rate             
##   <chr>       <chr> <chr>            
## 1 Afghanistan 1999  745/19987071     
## 2 Afghanistan 2000  2666/20595360    
## 3 Brazil      1999  37737/172006362  
## 4 Brazil      2000  80488/174504898  
## 5 China       1999  212258/1272915272
## 6 China       2000  213766/1280428583
#> # A tibble: 6 × 3
#>   country     new   rate             
#>   <chr>       <chr> <chr>            
#> 1 Afghanistan 1999  745/19987071     
#> 2 Afghanistan 2000  2666/20595360    
#> 3 Brazil      1999  37737/172006362  
#> 4 Brazil      2000  80488/174504898  
#> 5 China       1999  212258/1272915272
#> 6 China       2000  213766/1280428583

Exercises

  1. What do the extra and fill arguments do in separate()? Experiment with the various options for the following two toy datasets.
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>% 
  separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].
## # A tibble: 3 × 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     f    
## 3 h     i     j
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>% 
  separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
## # A tibble: 3 × 3
##   one   two   three
##   <chr> <chr> <chr>
## 1 a     b     c    
## 2 d     e     <NA> 
## 3 f     g     i

The extra argument tells separate() what to do if there are too many pieces, and the fill argument tells it what to do if there aren’t enough. By default, separate() drops extra values with a warning.

  1. Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE? The remove argument discards input columns in the result data frame. You would set it to FALSE if you want to create a new variable, but keep the old one.

  2. Compare and contrast separate() and extract(), Why are there three variations of separation (by position, by separator, and with groups), but only one unite? The function separate(), splits a column into multiple columns by separator, if the sep argument is a character vector, or by character positions, if sep is numeric.

Missing Values

stocks <- tibble(
  year   = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
  qtr    = c(   1,    2,    3,    4,    2,    3,    4),
  return = c(1.88, 0.59, 0.35,   NA, 0.92, 0.17, 2.66)
)
stocks %>% 
  pivot_wider(names_from = year, values_from = return)
## # A tibble: 4 × 3
##     qtr `2015` `2016`
##   <dbl>  <dbl>  <dbl>
## 1     1   1.88  NA   
## 2     2   0.59   0.92
## 3     3   0.35   0.17
## 4     4  NA      2.66
#> # A tibble: 4 × 3
#>     qtr `2015` `2016`
#>   <dbl>  <dbl>  <dbl>
#> 1     1   1.88  NA   
#> 2     2   0.59   0.92
#> 3     3   0.35   0.17
#> 4     4  NA      2.66
stocks %>% 
  pivot_wider(names_from = year, values_from = return) %>% 
  pivot_longer(
    cols = c(`2015`, `2016`), 
    names_to = "year", 
    values_to = "return", 
    values_drop_na = TRUE
  )
## # A tibble: 6 × 3
##     qtr year  return
##   <dbl> <chr>  <dbl>
## 1     1 2015    1.88
## 2     2 2015    0.59
## 3     2 2016    0.92
## 4     3 2015    0.35
## 5     3 2016    0.17
## 6     4 2016    2.66
#> # A tibble: 6 × 3
#>     qtr year  return
#>   <dbl> <chr>  <dbl>
#> 1     1 2015    1.88
#> 2     2 2015    0.59
#> 3     2 2016    0.92
#> 4     3 2015    0.35
#> 5     3 2016    0.17
#> 6     4 2016    2.66