##Tibbles
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
as_tibble(iris)
## # A tibble: 150 × 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ℹ 140 more rows
#> # A tibble: 150 × 5
#> Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> <dbl> <dbl> <dbl> <dbl> <fct>
#> 1 5.1 3.5 1.4 0.2 setosa
#> 2 4.9 3 1.4 0.2 setosa
#> 3 4.7 3.2 1.3 0.2 setosa
#> 4 4.6 3.1 1.5 0.2 setosa
#> 5 5 3.6 1.4 0.2 setosa
#> 6 5.4 3.9 1.7 0.4 setosa
#> # ℹ 144 more rows
tibble(
x = 1:5,
y = 1,
z = x ^ 2 + y
)
## # A tibble: 5 × 3
## x y z
## <int> <dbl> <dbl>
## 1 1 1 2
## 2 2 1 5
## 3 3 1 10
## 4 4 1 17
## 5 5 1 26
#> # A tibble: 5 × 3
#> x y z
#> <int> <dbl> <dbl>
#> 1 1 1 2
#> 2 2 1 5
#> 3 3 1 10
#> 4 4 1 17
#> 5 5 1 26
tb <- tibble(
`:)` = "smile",
` ` = "space",
`2000` = "number"
)
tb
## # A tibble: 1 × 3
## `:)` ` ` `2000`
## <chr> <chr> <chr>
## 1 smile space number
#> # A tibble: 1 × 3
#> `:)` ` ` `2000`
#> <chr> <chr> <chr>
#> 1 smile space number
tribble(
~x, ~y, ~z,
#--|--|----
"a", 2, 3.6,
"b", 1, 8.5
)
## # A tibble: 2 × 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2 3.6
## 2 b 1 8.5
#> # A tibble: 2 × 3
#> x y z
#> <chr> <dbl> <dbl>
#> 1 a 2 3.6
#> 2 b 1 8.5
tibble(
a = lubridate::now() + runif(1e3) * 86400,
b = lubridate::today() + runif(1e3) * 30,
c = 1:1e3,
d = runif(1e3),
e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 × 5
## a b c d e
## <dttm> <date> <int> <dbl> <chr>
## 1 2024-11-15 00:49:36 2024-12-09 1 0.0469 m
## 2 2024-11-15 08:07:52 2024-12-08 2 0.624 m
## 3 2024-11-14 10:37:45 2024-11-30 3 0.519 u
## 4 2024-11-14 19:24:37 2024-11-25 4 0.945 v
## 5 2024-11-14 11:09:54 2024-12-08 5 0.843 j
## 6 2024-11-14 11:21:09 2024-11-15 6 0.933 w
## 7 2024-11-14 10:56:23 2024-12-03 7 0.108 l
## 8 2024-11-15 05:38:29 2024-12-02 8 0.448 v
## 9 2024-11-14 15:52:56 2024-11-23 9 0.245 u
## 10 2024-11-14 18:05:10 2024-12-09 10 0.720 v
## # ℹ 990 more rows
#> # A tibble: 1,000 × 5
#> a b c d e
#> <dttm> <date> <int> <dbl> <chr>
#> 1 2023-07-24 17:04:35 2023-07-31 1 0.368 n
#> 2 2023-07-25 11:09:45 2023-08-05 2 0.612 l
#> 3 2023-07-25 05:33:24 2023-08-15 3 0.415 p
#> 4 2023-07-24 18:54:41 2023-08-14 4 0.212 m
#> 5 2023-07-24 15:18:58 2023-08-11 5 0.733 i
#> 6 2023-07-25 02:19:55 2023-08-07 6 0.460 n
#> # ℹ 994 more rows
nycflights13::flights %>%
print(n = 10, width = Inf)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## arr_delay carrier flight tailnum origin dest air_time distance hour minute
## <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 11 UA 1545 N14228 EWR IAH 227 1400 5 15
## 2 20 UA 1714 N24211 LGA IAH 227 1416 5 29
## 3 33 AA 1141 N619AA JFK MIA 160 1089 5 40
## 4 -18 B6 725 N804JB JFK BQN 183 1576 5 45
## 5 -25 DL 461 N668DN LGA ATL 116 762 6 0
## 6 12 UA 1696 N39463 EWR ORD 150 719 5 58
## 7 19 B6 507 N516JB EWR FLL 158 1065 6 0
## 8 -14 EV 5708 N829AS LGA IAD 53 229 6 0
## 9 -8 B6 79 N593JB JFK MCO 140 944 6 0
## 10 8 AA 301 N3ALAA LGA ORD 138 733 6 0
## time_hour
## <dttm>
## 1 2013-01-01 05:00:00
## 2 2013-01-01 05:00:00
## 3 2013-01-01 05:00:00
## 4 2013-01-01 05:00:00
## 5 2013-01-01 06:00:00
## 6 2013-01-01 05:00:00
## 7 2013-01-01 06:00:00
## 8 2013-01-01 06:00:00
## 9 2013-01-01 06:00:00
## 10 2013-01-01 06:00:00
## # ℹ 336,766 more rows
nycflights13::flights %>%
View()
df <- tibble(
x = runif(5),
y = rnorm(5)
)
# Extract by name
df$x
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
df[["x"]]
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
# Extract by position
df[[1]]
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
df %>% .$x
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
df %>% .[["x"]]
## [1] 0.91830724 0.22257532 0.08603712 0.40116595 0.40650682
#> [1] 0.73296674 0.23436542 0.66035540 0.03285612 0.46049161
class(as.data.frame(tb))
## [1] "data.frame"
#> [1] "data.frame"
df <- data.frame(abc = 1, xyz = "a")
df$x
## [1] "a"
df[, "xyz"]
## [1] "a"
df[, c("abc", "xyz")]
## abc xyz
## 1 1 a
What is different are the columns that you thought you were using. It will cause frustration when if you try to use one column it won’t return a data frame but if you use more than one you will get a data frame.
If you have the name of a variable stored in an object, e.g. var <- “mpg”, how can you extract the reference variable from a tibble? You can use the double bracket, like df[[var]]. You cannot use the dollar sign, because df$var would look for a column named var.
Practice referring to non-syntactic names in the following data frame by:
Extracting the variable called 1. Plotting a scatterplot of 1 vs 2. Creating a new column called 3 which is 2 divided by 1. Renaming the columns to one, two and three.
annoying <- tibble(
`1` = 1:10,
`2` = `1` * 2 + rnorm(length(`1`))
)
annoying[["1"]]
## [1] 1 2 3 4 5 6 7 8 9 10
#> [1] 1 2 3 4 5 6 7 8 9 10
ggplot(annoying, aes(x = `1`, y = `2`)) +
geom_point()
mutate(annoying, `3` = `2` / `1`)
## # A tibble: 10 × 3
## `1` `2` `3`
## <int> <dbl> <dbl>
## 1 1 2.40 2.40
## 2 2 3.34 1.67
## 3 3 6.69 2.23
## 4 4 7.87 1.97
## 5 5 10.3 2.05
## 6 6 9.74 1.62
## 7 7 14.0 2.01
## 8 8 15.5 1.94
## 9 9 16.2 1.80
## 10 10 19.5 1.95
#> # A tibble: 10 x 3
#> `1` `2` `3`
#> <int> <dbl> <dbl>
#> 1 1 0.600 0.600
#> 2 2 4.26 2.13
#> 3 3 3.56 1.19
#> 4 4 7.99 2.00
#> 5 5 10.6 2.12
#> 6 6 13.1 2.19
#> # … with 4 more rows
enframe(c(a = 1, b = 2, c = 3))
## # A tibble: 3 × 2
## name value
## <chr> <dbl>
## 1 a 1
## 2 b 2
## 3 c 3
#> # A tibble: 3 x 2
#> name value
#> <chr> <dbl>
#> 1 a 1
#> 2 b 2
#> 3 c 3
##Prerequisites
library(tidyverse)
read_csv("a,b,c
1,2,3
4,5,6")
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): a, b, c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
#> Rows: 2 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): a, b, c
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 2 × 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
#> Rows: 1 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): x, y, z
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 1 × 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
#> Rows: 1 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): x, y, z
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 1 × 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): X1, X2, X3
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## X1 X2 X3
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
#> Rows: 2 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): X1, X2, X3
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 2 × 3
#> X1 X2 X3
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): x, y, z
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## x y z
## <dbl> <dbl> <dbl>
## 1 1 2 3
## 2 4 5 6
#> Rows: 2 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (3): x, y, z
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 2 × 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
read_csv("a,b,c\n1,2,.", na = ".")
## Rows: 1 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## lgl (1): c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 3
## a b c
## <dbl> <dbl> <lgl>
## 1 1 2 NA
#> Rows: 1 Columns: 3
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl (2): a, b
#> lgl (1): c
#>
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#> # A tibble: 1 × 3
#> a b c
#> <dbl> <dbl> <lgl>
#> 1 1 2 NA
What function would you use to read a file where fields were separated with “|”? Use the read_delim() function with the argument delim=“|”.
Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common?
col_names and col_types are used to specify the column names and how to parse the columns locale is important for determining things like the encoding and whether “.” or “,” is used as a decimal mark. na and quoted_na control which strings are treated as missing values when parsing vectors trim_ws trims whitespace before and after cells before parsing n_max sets how many rows to read guess_max sets how many rows to use when guessing the column type progress determines whether a progress bar is shown
What are the most important arguments to read_fwf()? The most important argument to read_fwf() which reads “fixed-width formats”, is col_positions which tells the function where data columns begin and end.
Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like ” or ’. By default, read_csv() assumes that the quoting character will be “. What argument to read_csv() do you need to specify to read the following text into a data frame?
"x,y\n1,'a,b'"
## [1] "x,y\n1,'a,b'"
You need to read these codes
x <- "x,y\n1,'a,b'"
read_delim(x, ",", quote = "'")
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): y
## dbl (1): x
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
## x y
## <dbl> <chr>
## 1 1 a,b
#> # A tibble: 1 x 2
#> x y
#> <dbl> <chr>
#> 1 1 a,b
read_csv(x, quote = "'")
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): y
## dbl (1): x
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
## x y
## <dbl> <chr>
## 1 1 a,b
#> # A tibble: 1 x 2
#> x y
#> <dbl> <chr>
#> 1 1 a,b
read_csv("a,b\n1,2,3\n4,5,6")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): a
## num (1): b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
## a b
## <dbl> <dbl>
## 1 1 23
## 2 4 56
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## num (1): c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 NA
## 2 1 2 34
read_csv("a,b\n\"1")
## Rows: 0 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 0 × 2
## # ℹ 2 variables: a <chr>, b <chr>
read_csv("a,b\n1,2\na,b")
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
## a b
## <chr> <chr>
## 1 1 2
## 2 a b
read_csv("a;b\n1;3")
## Rows: 1 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): a;b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 1
## `a;b`
## <chr>
## 1 1;3
read_csv("a,b\n1,2,3\n4,5,6")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): a
## num (1): b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
## a b
## <dbl> <dbl>
## 1 1 23
## 2 4 56
#> Warning: 2 parsing failures.
#> row col expected actual file
#> 1 -- 2 columns 3 columns literal data
#> 2 -- 2 columns 3 columns literal data
#> # A tibble: 2 x 2
#> a b
#> <dbl> <dbl>
#> 1 1 2
#> 2 4 5
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): a, b
## num (1): c
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 NA
## 2 1 2 34
#> Warning: 2 parsing failures.
#> row col expected actual file
#> 1 -- 3 columns 2 columns literal data
#> 2 -- 3 columns 4 columns literal data
#> # A tibble: 2 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 2 NA
#> 2 1 2 3
read_csv("a,b\n\"1")
## Rows: 0 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 0 × 2
## # ℹ 2 variables: a <chr>, b <chr>
#> Warning: 2 parsing failures.
#> row col expected actual file
#> 1 a closing quote at end of file literal data
#> 1 -- 2 columns 1 columns literal data
#> # A tibble: 1 x 2
#> a b
#> <dbl> <chr>
#> 1 1 <NA>
read_csv("a,b\n1,2\na,b")
## Rows: 2 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): a, b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 × 2
## a b
## <chr> <chr>
## 1 1 2
## 2 a b
#> # A tibble: 2 x 2
#> a b
#> <chr> <chr>
#> 1 1 2
#> 2 a b
read_csv("a;b\n1;3")
## Rows: 1 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): a;b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 1
## `a;b`
## <chr>
## 1 1;3
#> # A tibble: 1 x 1
#> `a;b`
#> <chr>
#> 1 1;3
read_csv2("a;b\n1;3")
## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.
## Rows: 1 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (2): a, b
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 × 2
## a b
## <dbl> <dbl>
## 1 1 3
#> Using ',' as decimal and '.' as grouping mark. Use read_delim() for more control.
#> # A tibble: 1 x 2
#> a b
#> <dbl> <dbl>
#> 1 1 3
str(parse_logical(c("TRUE", "FALSE", "NA")))
## logi [1:3] TRUE FALSE NA
#> logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
## int [1:3] 1 2 3
#> int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
## Date[1:2], format: "2010-01-01" "1979-10-14"
#> Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1", "231", ".", "456"), na = ".")
## [1] 1 231 NA 456
#> [1] 1 231 NA 456
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col expected actual
## 3 -- no trailing characters abc
## 4 -- no trailing characters 123.45
#> Warning: 2 parsing failures.
#> row col expected actual
#> 3 -- no trailing characters abc
#> 4 -- no trailing characters 123.45
x
## [1] 123 345 NA NA
## attr(,"problems")
## # A tibble: 2 × 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA no trailing characters abc
## 2 4 NA no trailing characters 123.45
#> [1] 123 345 NA NA
#> attr(,"problems")
#> # A tibble: 2 × 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA no trailing characters abc
#> 2 4 NA no trailing characters 123.45
problems(x)
## # A tibble: 2 × 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA no trailing characters abc
## 2 4 NA no trailing characters 123.45
#> # A tibble: 2 × 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA no trailing characters abc
#> 2 4 NA no trailing characters 123.45
parse_double("1.23")
## [1] 1.23
#> [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
#> [1] 1.23
parse_number("$100")
## [1] 100
#> [1] 100
parse_number("20%")
## [1] 20
#> [1] 20
parse_number("It cost $123.45")
## [1] 123.45
#> [1] 123.45
# Used in America
parse_number("$123,456,789")
## [1] 123456789
#> [1] 123456789
# Used in many parts of Europe
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
#> [1] 123456789
# Used in Switzerland
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
## [1] 123456789
#> [1] 123456789
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
#> [1] 48 61 64 6c 65 79
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
x1
## [1] "El Ni\xf1o was particularly bad this year"
#> [1] "El Ni\xf1o was particularly bad this year"
x2
## [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
#> [1] "\x82\xb1\x82\xf1\x82ɂ\xbf\x82\xcd"
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niño was particularly bad this year"
#> [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
## [1] "こんにちは"
#> [1] "こんにちは"
guess_encoding(charToRaw(x1))
## # A tibble: 2 × 2
## encoding confidence
## <chr> <dbl>
## 1 ISO-8859-1 0.46
## 2 ISO-8859-9 0.23
#> # A tibble: 2 × 2
#> encoding confidence
#> <chr> <dbl>
#> 1 ISO-8859-1 0.46
#> 2 ISO-8859-9 0.23
guess_encoding(charToRaw(x2))
## # A tibble: 1 × 2
## encoding confidence
## <chr> <dbl>
## 1 KOI8-R 0.42
#> # A tibble: 1 × 2
#> encoding confidence
#> <chr> <dbl>
#> 1 KOI8-R 0.42
fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set bananana
## [1] apple banana <NA>
## attr(,"problems")
## # A tibble: 1 × 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA value in level set bananana
## Levels: apple banana
#> Warning: 1 parsing failure.
#> row col expected actual
#> 3 -- value in level set bananana
#> [1] apple banana <NA>
#> attr(,"problems")
#> # A tibble: 1 × 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA value in level set bananana
#> Levels: apple banana
parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
#> [1] "2010-10-01 20:10:00 UTC"
# If time is omitted, it will be set to midnight
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
#> [1] "2010-10-10 UTC"
parse_date("2010-10-01")
## [1] "2010-10-01"
#> [1] "2010-10-01"
library(hms)
##
## Attaching package: 'hms'
## The following object is masked from 'package:lubridate':
##
## hms
#>
#> Attaching package: 'hms'
#> The following object is masked from 'package:lubridate':
#>
#> hms
parse_time("01:10 am")
## 01:10:00
#> 01:10:00
parse_time("20:10:01")
## 20:10:01
#> 20:10:01
parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
#> [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
## [1] "2015-02-01"
#> [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
## [1] "2001-02-15"
#> [1] "2001-02-15"
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
## [1] "2015-01-01"
#> [1] "2015-01-01"
date and time formats: date_names, date_format, and time_format time zone: tz numbers: decimal_mark, grouping_mark encoding: encoding
If the decimal and grouping marks are set to the same character, locale throws an error If the decimal_mark is set to the comma “,”, then the grouping mark is set to the period “.” If the grouping mark is set to a period, then the decimal mark is set to a comma
locale()
## <locale>
## Numbers: 123,456.78
## Formats: %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
## (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
## June (Jun), July (Jul), August (Aug), September (Sep), October
## (Oct), November (Nov), December (Dec)
## AM/PM: AM/PM
#> <locale>
#> Numbers: 123,456.78
#> Formats: %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
#> (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
#> June (Jun), July (Jul), August (Aug), September (Sep), October
#> (Oct), November (Nov), December (Dec)
#> AM/PM: AM/PM
locale_custom <- locale(date_format = "Day %d Mon %M Year %y",
time_format = "Sec %S Min %M Hour %H")
date_custom <- c("Day 01 Mon 02 Year 03", "Day 03 Mon 01 Year 01")
parse_date(date_custom)
## Warning: 2 parsing failures.
## row col expected actual
## 1 -- date like Day 01 Mon 02 Year 03
## 2 -- date like Day 03 Mon 01 Year 01
## [1] NA NA
#> Warning: 2 parsing failures.
#> row col expected actual
#> 1 -- date like Day 01 Mon 02 Year 03
#> 2 -- date like Day 03 Mon 01 Year 01
#> [1] NA NA
parse_date(date_custom, locale = locale_custom)
## [1] "2003-01-01" "2001-01-03"
#> [1] "2003-01-01" "2001-01-03"
time_custom <- c("Sec 01 Min 02 Hour 03", "Sec 03 Min 02 Hour 01")
parse_time(time_custom)
## Warning: 2 parsing failures.
## row col expected actual
## 1 -- time like Sec 01 Min 02 Hour 03
## 2 -- time like Sec 03 Min 02 Hour 01
## NA
## NA
#> Warning: 2 parsing failures.
#> row col expected actual
#> 1 -- time like Sec 01 Min 02 Hour 03
#> 2 -- time like Sec 03 Min 02 Hour 01
#> NA
#> NA
parse_time(time_custom, locale = locale_custom)
## 03:02:01
## 01:02:03
#> 03:02:01
#> 01:02:03
They provide default date and time formats. The readr vignette discusses using these to parse dates: since dates can include languages specific weekday and month names, and different conventions for specifying AM/PM Both the date format and time format are used for guessing column types. Thus if you were often parsing data that had non-standard formats for the date and time, you could specify custom values for date_format and time_format.
parse_date("02/01/2006")
## Warning: 1 parsing failure.
## row col expected actual
## 1 -- date like 02/01/2006
## [1] NA
#> Warning: 1 parsing failure.
#> row col expected actual
#> 1 -- date like 02/01/2006
#> [1] NA
table1
## # A tibble: 6 × 4
## country year cases population
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
table2
## # A tibble: 12 × 4
## country year type count
## <chr> <dbl> <chr> <dbl>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
#> # A tibble: 12 × 4
#> country year type count
#> <chr> <dbl> <chr> <dbl>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> # ℹ 6 more rows
table3
## # A tibble: 6 × 3
## country year rate
## <chr> <dbl> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
#> # A tibble: 6 × 3
#> country year rate
#> <chr> <dbl> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
# Spread across two tibbles
table4a # cases
## # A tibble: 3 × 3
## country `1999` `2000`
## <chr> <dbl> <dbl>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
#> # A tibble: 3 × 3
#> country `1999` `2000`
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 745 2666
#> 2 Brazil 37737 80488
#> 3 China 212258 213766
table4b # population
## # A tibble: 3 × 3
## country `1999` `2000`
## <chr> <dbl> <dbl>
## 1 Afghanistan 19987071 20595360
## 2 Brazil 172006362 174504898
## 3 China 1272915272 1280428583
#> # A tibble: 3 × 3
#> country `1999` `2000`
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 19987071 20595360
#> 2 Brazil 172006362 174504898
#> 3 China 1272915272 1280428583
# Compute rate per 10,000
table1 %>%
mutate(rate = cases / population * 10000)
## # A tibble: 6 × 5
## country year cases population rate
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 1999 745 19987071 0.373
## 2 Afghanistan 2000 2666 20595360 1.29
## 3 Brazil 1999 37737 172006362 2.19
## 4 Brazil 2000 80488 174504898 4.61
## 5 China 1999 212258 1272915272 1.67
## 6 China 2000 213766 1280428583 1.67
#> # A tibble: 6 × 5
#> country year cases population rate
#> <chr> <dbl> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071 0.373
#> 2 Afghanistan 2000 2666 20595360 1.29
#> 3 Brazil 1999 37737 172006362 2.19
#> 4 Brazil 2000 80488 174504898 4.61
#> 5 China 1999 212258 1272915272 1.67
#> 6 China 2000 213766 1280428583 1.67
# Compute cases per year
table1 %>%
count(year, wt = cases)
## # A tibble: 2 × 2
## year n
## <dbl> <dbl>
## 1 1999 250740
## 2 2000 296920
#> # A tibble: 2 × 2
#> year n
#> <dbl> <dbl>
#> 1 1999 250740
#> 2 2000 296920
# Visualise changes over time
library(ggplot2)
ggplot(table1, aes(year, cases)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country))
table1
## # A tibble: 6 × 4
## country year cases population
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <int> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
In table2, each row represents a (country, year, variable) combination. The column count contains the values of variables cases and population in separate rows.
table2
## # A tibble: 12 × 4
## country year type count
## <chr> <dbl> <chr> <dbl>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
#> # A tibble: 12 x 4
#> country year type count
#> <chr> <int> <chr> <int>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> # … with 6 more rows
Extract the number of TB cases per country per year. Extract the matching population per country per year. Divide cases by population, and multiply by 10000. Store back in the appropriate place. Which representation is easiest to work with? Which is hardest? Why?
Neither table is particularly easy to work with. Since table2 has separate rows for cases and population we needed to generate a table with columns for cases and population where we could calculate cases per capita. table4a and table4b split the cases and population variables into different tables which made it easy to divide cases by population. However, we had to repeat this calculation for each row.
The ideal format of a data frame to answer this question is one with columns country, year, cases, and population. Then problem could be answered with a single mutate() call.
table2 %>%
filter(type == "cases") %>%
ggplot(aes(year, count)) +
geom_line(aes(group = country), colour = "grey50") +
geom_point(aes(colour = country)) +
scale_x_continuous(breaks = unique(table2$year)) +
ylab("cases")
table4a
## # A tibble: 3 × 3
## country `1999` `2000`
## <chr> <dbl> <dbl>
## 1 Afghanistan 745 2666
## 2 Brazil 37737 80488
## 3 China 212258 213766
#> # A tibble: 3 × 3
#> country `1999` `2000`
#> <chr> <dbl> <dbl>
#> 1 Afghanistan 745 2666
#> 2 Brazil 37737 80488
#> 3 China 212258 213766
table4a %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
## # A tibble: 6 × 3
## country year cases
## <chr> <chr> <dbl>
## 1 Afghanistan 1999 745
## 2 Afghanistan 2000 2666
## 3 Brazil 1999 37737
## 4 Brazil 2000 80488
## 5 China 1999 212258
## 6 China 2000 213766
#> # A tibble: 6 × 3
#> country year cases
#> <chr> <chr> <dbl>
#> 1 Afghanistan 1999 745
#> 2 Afghanistan 2000 2666
#> 3 Brazil 1999 37737
#> 4 Brazil 2000 80488
#> 5 China 1999 212258
#> 6 China 2000 213766
table4b %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
## # A tibble: 6 × 3
## country year population
## <chr> <chr> <dbl>
## 1 Afghanistan 1999 19987071
## 2 Afghanistan 2000 20595360
## 3 Brazil 1999 172006362
## 4 Brazil 2000 174504898
## 5 China 1999 1272915272
## 6 China 2000 1280428583
#> # A tibble: 6 × 3
#> country year population
#> <chr> <chr> <dbl>
#> 1 Afghanistan 1999 19987071
#> 2 Afghanistan 2000 20595360
#> 3 Brazil 1999 172006362
#> 4 Brazil 2000 174504898
#> 5 China 1999 1272915272
#> 6 China 2000 1280428583
tidy4a <- table4a %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "cases")
tidy4b <- table4b %>%
pivot_longer(c(`1999`, `2000`), names_to = "year", values_to = "population")
left_join(tidy4a, tidy4b)
## Joining with `by = join_by(country, year)`
## # A tibble: 6 × 4
## country year cases population
## <chr> <chr> <dbl> <dbl>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
#> Joining with `by = join_by(country, year)`
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <chr> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
table2
## # A tibble: 12 × 4
## country year type count
## <chr> <dbl> <chr> <dbl>
## 1 Afghanistan 1999 cases 745
## 2 Afghanistan 1999 population 19987071
## 3 Afghanistan 2000 cases 2666
## 4 Afghanistan 2000 population 20595360
## 5 Brazil 1999 cases 37737
## 6 Brazil 1999 population 172006362
## 7 Brazil 2000 cases 80488
## 8 Brazil 2000 population 174504898
## 9 China 1999 cases 212258
## 10 China 1999 population 1272915272
## 11 China 2000 cases 213766
## 12 China 2000 population 1280428583
#> # A tibble: 12 × 4
#> country year type count
#> <chr> <dbl> <chr> <dbl>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> # ℹ 6 more rows
table2 %>%
pivot_wider(names_from = type, values_from = count)
## # A tibble: 6 × 4
## country year cases population
## <chr> <dbl> <dbl> <dbl>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <dbl> <dbl> <dbl>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
stocks <- tibble(
year = c(2015, 2015, 2016, 2016),
half = c( 1, 2, 1, 2),
return = c(1.88, 0.59, 0.92, 0.17)
)
stocks %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(`2015`:`2016`, names_to = "year", values_to = "return")
## # A tibble: 4 × 3
## half year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 1 2016 0.92
## 3 2 2015 0.59
## 4 2 2016 0.17
The functions pivot_longer() and pivot_wider() are not perfectly symmetrical because column type information is lost when a data frame is converted from wide to long. The function pivot_longer() stacks multiple columns which may have had multiple data types into a single column with a single data type. This transformation throws away the individual data types of the original columns. The function pivot_wider() creates column names from values in column. These column names will always be treated as character values by pivot_longer() so if the original variable used to create the column names did not have a character data type, then the round-trip will not reproduce the same dataset.
table4a %>% pivot_longer(c(1999, 2000), names_to = “year”,
values_to = “cases”) #> Error in pivot_longer()
: #> !
Can’t subset columns past the end. #> ℹ Locations 1999 and 2000 don’t
exist. #> ℹ There are only 3 columns.
The code fails because the column names 1999 and 2000 are not non-syntactic variable names.
people <- tribble(
~name, ~names, ~values,
#-----------------|--------|------
"Phillip Woods", "age", 45,
"Phillip Woods", "height", 186,
"Phillip Woods", "age", 50,
"Jessica Cordero", "age", 37,
"Jessica Cordero", "height", 156
)
Widening this data frame using pivot_wider() produces columns that are lists of numeric vectors because the name and key columns do not uniquely identify rows.
preg <- tribble(
~pregnant, ~male, ~female,
"yes", NA, 10,
"no", 20, 12
)
To tidy the preg table use pivot_longer() to create a long table. The variables in this data are:
sex (“female”, “male”) pregnant (“yes”, “no”) count, which is a non-negative integer representing the number of observations.
table3
## # A tibble: 6 × 3
## country year rate
## <chr> <dbl> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
#> # A tibble: 6 × 3
#> country year rate
#> <chr> <dbl> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
table3 %>%
separate(rate, into = c("cases", "population"))
## # A tibble: 6 × 4
## country year cases population
## <chr> <dbl> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <dbl> <chr> <chr>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
table3 %>%
separate(rate, into = c("cases", "population"), sep = "/")
## # A tibble: 6 × 4
## country year cases population
## <chr> <dbl> <chr> <chr>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
table3 %>%
separate(rate, into = c("cases", "population"), convert = TRUE)
## # A tibble: 6 × 4
## country year cases population
## <chr> <dbl> <int> <int>
## 1 Afghanistan 1999 745 19987071
## 2 Afghanistan 2000 2666 20595360
## 3 Brazil 1999 37737 172006362
## 4 Brazil 2000 80488 174504898
## 5 China 1999 212258 1272915272
## 6 China 2000 213766 1280428583
#> # A tibble: 6 × 4
#> country year cases population
#> <chr> <dbl> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583
table3 %>%
separate(year, into = c("century", "year"), sep = 2)
## # A tibble: 6 × 4
## country century year rate
## <chr> <chr> <chr> <chr>
## 1 Afghanistan 19 99 745/19987071
## 2 Afghanistan 20 00 2666/20595360
## 3 Brazil 19 99 37737/172006362
## 4 Brazil 20 00 80488/174504898
## 5 China 19 99 212258/1272915272
## 6 China 20 00 213766/1280428583
#> # A tibble: 6 × 4
#> country century year rate
#> <chr> <chr> <chr> <chr>
#> 1 Afghanistan 19 99 745/19987071
#> 2 Afghanistan 20 00 2666/20595360
#> 3 Brazil 19 99 37737/172006362
#> 4 Brazil 20 00 80488/174504898
#> 5 China 19 99 212258/1272915272
#> 6 China 20 00 213766/1280428583
table5 %>%
unite(new, century, year)
## # A tibble: 6 × 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 19_99 745/19987071
## 2 Afghanistan 20_00 2666/20595360
## 3 Brazil 19_99 37737/172006362
## 4 Brazil 20_00 80488/174504898
## 5 China 19_99 212258/1272915272
## 6 China 20_00 213766/1280428583
#> # A tibble: 6 × 3
#> country new rate
#> <chr> <chr> <chr>
#> 1 Afghanistan 19_99 745/19987071
#> 2 Afghanistan 20_00 2666/20595360
#> 3 Brazil 19_99 37737/172006362
#> 4 Brazil 20_00 80488/174504898
#> 5 China 19_99 212258/1272915272
#> 6 China 20_00 213766/1280428583
table5 %>%
unite(new, century, year, sep = "")
## # A tibble: 6 × 3
## country new rate
## <chr> <chr> <chr>
## 1 Afghanistan 1999 745/19987071
## 2 Afghanistan 2000 2666/20595360
## 3 Brazil 1999 37737/172006362
## 4 Brazil 2000 80488/174504898
## 5 China 1999 212258/1272915272
## 6 China 2000 213766/1280428583
#> # A tibble: 6 × 3
#> country new rate
#> <chr> <chr> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
tibble(x = c("a,b,c", "d,e,f,g", "h,i,j")) %>%
separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Additional pieces discarded in 1 rows [2].
## # A tibble: 3 × 3
## one two three
## <chr> <chr> <chr>
## 1 a b c
## 2 d e f
## 3 h i j
tibble(x = c("a,b,c", "d,e", "f,g,i")) %>%
separate(x, c("one", "two", "three"))
## Warning: Expected 3 pieces. Missing pieces filled with `NA` in 1 rows [2].
## # A tibble: 3 × 3
## one two three
## <chr> <chr> <chr>
## 1 a b c
## 2 d e <NA>
## 3 f g i
The extra argument tells separate() what to do if there are too many pieces, and the fill argument tells it what to do if there aren’t enough. By default, separate() drops extra values with a warning.
Both unite() and separate() have a remove argument. What does it do? Why would you set it to FALSE? The remove argument discards input columns in the result data frame. You would set it to FALSE if you want to create a new variable, but keep the old one.
Compare and contrast separate() and extract(), Why are there three variations of separation (by position, by separator, and with groups), but only one unite? The function separate(), splits a column into multiple columns by separator, if the sep argument is a character vector, or by character positions, if sep is numeric.
stocks <- tibble(
year = c(2015, 2015, 2015, 2015, 2016, 2016, 2016),
qtr = c( 1, 2, 3, 4, 2, 3, 4),
return = c(1.88, 0.59, 0.35, NA, 0.92, 0.17, 2.66)
)
stocks %>%
pivot_wider(names_from = year, values_from = return)
## # A tibble: 4 × 3
## qtr `2015` `2016`
## <dbl> <dbl> <dbl>
## 1 1 1.88 NA
## 2 2 0.59 0.92
## 3 3 0.35 0.17
## 4 4 NA 2.66
#> # A tibble: 4 × 3
#> qtr `2015` `2016`
#> <dbl> <dbl> <dbl>
#> 1 1 1.88 NA
#> 2 2 0.59 0.92
#> 3 3 0.35 0.17
#> 4 4 NA 2.66
stocks %>%
pivot_wider(names_from = year, values_from = return) %>%
pivot_longer(
cols = c(`2015`, `2016`),
names_to = "year",
values_to = "return",
values_drop_na = TRUE
)
## # A tibble: 6 × 3
## qtr year return
## <dbl> <chr> <dbl>
## 1 1 2015 1.88
## 2 2 2015 0.59
## 3 2 2016 0.92
## 4 3 2015 0.35
## 5 3 2016 0.17
## 6 4 2016 2.66
#> # A tibble: 6 × 3
#> qtr year return
#> <dbl> <chr> <dbl>
#> 1 1 2015 1.88
#> 2 2 2015 0.59
#> 3 2 2016 0.92
#> 4 3 2015 0.35
#> 5 3 2016 0.17
#> 6 4 2016 2.66