Librerías

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.1
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Introducción

Filosofía

# Plain function call: first four rows of iris.
head(iris, n = 4)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
# Same call through the pipe; the `.` placeholder marks explicitly where the
# left-hand side (iris) is inserted.
iris %>% head(. , n = 4)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
# Without a placeholder, %>% passes the left-hand side as the first argument.
iris %>% head(n = 4)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
# The placeholder may sit in any position: here 4 is piped into the second
# argument of head(), i.e. the number of rows.
4 %>% head(iris, .)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

Ejemplos

wide

# Wide layout: one row per student, one column per graded component.
# Each rbind() row holds one student's scores, in the same order as `names`;
# the column names are taken from the names of the first score vector.
notas <- data.frame(
  names = c("Beimar", "Johanna", "Adriana"),
  rbind(
    c(Talleres = 50, Quices = 25, Parciales = 10),
    c(Talleres = 50, Quices = 30, Parciales = 12),
    c(Talleres = 10, Quices = 35, Parciales = 45)
  )
)

notas
##     names Talleres Quices Parciales
## 1  Beimar       50     25        10
## 2 Johanna       50     30        12
## 3 Adriana       10     35        45

long

# Long layout: one row per (student, rubric) pair, with the score in `note`.
# The three students repeat once per rubric, and each rubric label repeats
# once per student, so rep() generates both identifier columns.
NOTAS <- data.frame(
  names = rep(c("Beimar", "Johanna", "Adriana"), times = 3),
  rubric = rep(c("Talleres", "Quices", "Parciales"), each = 3),
  note = c(50, 50, 10, 25, 30, 35, 10, 12, 45)
)

NOTAS
##     names    rubric note
## 1  Beimar  Talleres   50
## 2 Johanna  Talleres   50
## 3 Adriana  Talleres   10
## 4  Beimar    Quices   25
## 5 Johanna    Quices   30
## 6 Adriana    Quices   35
## 7  Beimar Parciales   10
## 8 Johanna Parciales   12
## 9 Adriana Parciales   45

gather() y spread()

# Wide -> long: every column except `names` is stacked into key/value pairs.
# `rubric` receives the former column name and `note` the corresponding value.
NOTAs <- notas %>%
  gather(key = "rubric", value = "note", -names)

NOTAs
##     names    rubric note
## 1  Beimar  Talleres   50
## 2 Johanna  Talleres   50
## 3 Adriana  Talleres   10
## 4  Beimar    Quices   25
## 5 Johanna    Quices   30
## 6 Adriana    Quices   35
## 7  Beimar Parciales   10
## 8 Johanna Parciales   12
## 9 Adriana Parciales   45
# Long -> wide: one column per distinct value of `rubric`, filled from `note`.
# This overwrites `notas`; as the printed result shows, the rows and the new
# columns come back in alphabetical order.
notas <- NOTAS %>%
  spread(key = rubric, value = note)

notas
##     names Parciales Quices Talleres
## 1 Adriana        45     35       10
## 2  Beimar        10     25       50
## 3 Johanna        12     30       50

separate() y unite()

# Wide gradebook whose identifier column packs first name and surname into a
# single string joined by "_". Each rbind() row holds one student's scores,
# in the same order as `name_surname`.
notas <- data.frame(
  name_surname = c("Beimar_Rodríguez", "Johanna_Vanegas", "Adriana_Guerrero"),
  rbind(
    c(Talleres = 50, Quices = 25, Parciales = 10),
    c(Talleres = 50, Quices = 30, Parciales = 12),
    c(Talleres = 10, Quices = 35, Parciales = 45)
  )
)

notas
##       name_surname Talleres Quices Parciales
## 1 Beimar_Rodríguez       50     25        10
## 2  Johanna_Vanegas       50     30        12
## 3 Adriana_Guerrero       10     35        45
# Split `name_surname` at the underscore into two columns, `name` and
# `surname`; the original combined column is replaced by the two new ones.
Notas <- notas %>%
  separate(col = name_surname, into = c("name", "surname"), sep = "_")

Notas
##      name   surname Talleres Quices Parciales
## 1  Beimar Rodríguez       50     25        10
## 2 Johanna   Vanegas       50     30        12
## 3 Adriana  Guerrero       10     35        45
# Inverse of the split: paste `name` and `surname` back together with "_"
# into a single `name_surname` column, replacing the two inputs.
notas <- Notas %>%
  unite(col = "name_surname", name, surname, sep = "_")

notas
##       name_surname Talleres Quices Parciales
## 1 Beimar_Rodríguez       50     25        10
## 2  Johanna_Vanegas       50     30        12
## 3 Adriana_Guerrero       10     35        45

summarise()

# Per-rubric descriptive statistics of `note`. The summaries are left unnamed,
# so each output column is labelled with the expression that produced it.
NOTAS %>%
  group_by(rubric) %>%
  summarise(
    mean(note),
    median(note),
    sd(note),
    IQR(note)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3 x 5
##   rubric    `mean(note)` `median(note)` `sd(note)` `IQR(note)`
##   <chr>            <dbl>          <dbl>      <dbl>       <dbl>
## 1 Parciales         22.3             12       19.7        17.5
## 2 Quices            30               30        5           5  
## 3 Talleres          36.7             50       23.1        20

summarise_at()

# Apply several summary functions to the columns selected with vars().
# The functions are supplied as a named list (funs() is deprecated as of
# dplyr 0.8.0 and raises a warning); with a single selected column, the list
# names become the output column names.
NOTAS %>%
  group_by(rubric) %>%
  summarise_at(
    vars(note),
    list(mean = mean, median = median, sd = sd, IQR = IQR)
  )
## # A tibble: 3 x 5
##   rubric     mean median    sd   IQR
##   <chr>     <dbl>  <dbl> <dbl> <dbl>
## 1 Parciales  22.3     12  19.7  17.5
## 2 Quices     30       30   5     5  
## 3 Talleres   36.7     50  23.1  20

summarise_if()

# Apply several summary functions to every non-grouping column that satisfies
# the predicate (here: is.numeric, which selects `note`).
# The functions are supplied as a named list rather than the deprecated
# funs(); with a single selected column, the list names become the output
# column names.
NOTAS %>%
  group_by(rubric) %>%
  summarise_if(
    is.numeric,
    list(mean = mean, median = median, sd = sd, IQR = IQR)
  )
## # A tibble: 3 x 5
##   rubric     mean median    sd   IQR
##   <chr>     <dbl>  <dbl> <dbl> <dbl>
## 1 Parciales  22.3     12  19.7  17.5
## 2 Quices     30       30   5     5  
## 3 Talleres   36.7     50  23.1  20

Conjunto de datos

Miembros de la banda

# Preview of the band membership table: one row per musician (`name`) with
# the `band` they belong to. The table has only three rows, so all are shown.
band_members %>% head()
## # A tibble: 3 x 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles

Instrumentos de la banda

# Preview of the instrument table: one row per musician (`name`) with the
# instrument they play (`plays`).
band_instruments %>% head()
## # A tibble: 3 x 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar
# Same rows, but the musician column is called `artist` instead of `name`.
band_instruments2 %>% head()
## # A tibble: 3 x 2
##   artist plays 
##   <chr>  <chr> 
## 1 John   guitar
## 2 Paul   bass  
## 3 Keith  guitar

Joins Types

inner_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

Inner Joins

# Inner join with an explicit key: only names present in both tables are
# kept (John and Paul), with columns from both sides.
inner_join(band_members, band_instruments, by = "name")
## # A tibble: 2 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass
# Pipe form without `by`: the join falls back to the column the two tables
# share and reports the key it chose in the "Joining, by" message.
band_members %>% inner_join(band_instruments)
## Joining, by = "name"
## # A tibble: 2 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

Outer Joins

Left Join

left_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Left join: every row of band_members is kept; Mick has no match in
# band_instruments, so his `plays` value is NA.
left_join(band_members, band_instruments, by = "name")
## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass
# Pipe form without `by`: same result, plus a message naming the key used.
band_members %>% left_join(band_instruments)
## Joining, by = "name"
## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

Right Join

right_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Right join: every row of band_instruments is kept; Keith has no match in
# band_members, so his `band` value is NA.
right_join(band_members, band_instruments, by = "name")
## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar
# Pipe form without `by`: same result, plus a message naming the key used.
band_members %>% right_join(band_instruments)
## Joining, by = "name"
## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

Full Join

full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Full join: rows from both tables are kept; unmatched cells are NA
# (Mick has no instrument, Keith has no band).
full_join(band_members, band_instruments, by = "name")
## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar
# Pipe form without `by`: same result, plus a message naming the key used.
band_members %>% full_join(band_instruments)
## Joining, by = "name"
## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Additional Information

Join on Variables with Different Names

full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# The key columns have different names in the two tables, so `by` is a named
# vector: `name` in band_members is matched against `artist` in
# band_instruments2. The result keeps the left-hand name (`name`).
full_join(band_members, band_instruments2, by = c("name" = "artist"))
## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Additional Arguments

`copy` - if the datasets are from different sources and `copy = TRUE`, then `y` will be copied across to the data source where `x` is located.

`suffix` - if a variable name occurs in both datasets and is not used as part of the join, a suffix is added to ensure variable names are unique. By default ".x" and ".y" are added to the variable names, but other suffixes can be specified.

Piping

x %>% full_join(y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# The left-hand side of %>% becomes `x` (the first argument of the join).
# `by` is omitted, so the shared column is used as the key and reported in
# the "Joining, by" message.
band_members %>% full_join(band_instruments)
## Joining, by = "name"
## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Filtering

semi_join(x, y, by = NULL, copy = FALSE, ...)

# Semi join: keeps the rows of band_members that have a match in
# band_instruments (John and Paul). Only band_members' columns are returned;
# nothing from band_instruments is added.
band_members %>% semi_join(band_instruments)
## Joining, by = "name"
## # A tibble: 2 x 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles

anti_join(x, y, by = NULL, copy = FALSE, ...)

# Anti join: keeps the rows of band_members that have NO match in
# band_instruments (only Mick). Only band_members' columns are returned.
band_members %>% anti_join(band_instruments)
## Joining, by = "name"
## # A tibble: 1 x 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones

nest_join(x, y, by = NULL, copy = FALSE, keep = FALSE, name = NULL, ...)

# Nest join: every row of band_members is kept and gains a list-column, here
# named `band_instruments`, holding a tibble of the matching rows from the
# other table. Mick's entry is an empty tibble (0 rows) because he has no
# match.
band_members %>% nest_join(band_instruments)
## Joining, by = "name"
## # A tibble: 3 x 3
##   name  band    band_instruments
##   <chr> <chr>   <list>          
## 1 Mick  Stones  <tibble [0 × 1]>
## 2 John  Beatles <tibble [1 × 1]>
## 3 Paul  Beatles <tibble [1 × 1]>