Joining data

Librerías

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.1
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Introducción

Filosofía

# Regular function call: the data frame is passed as the first argument of head().
head(iris, n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

# Same call through the pipe: the dot marks where the left-hand side (iris) is placed.
iris %>% head(. , n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

# The dot can be omitted: by default the pipe passes iris as the first argument.
iris %>% head(n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

# The dot can also feed a later argument: here 4 is used as the second
# argument of head(), i.e. the number of rows to show.
4 %>% head(iris, .)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

Ejemplos

wide

# Wide layout: one row per student and one column per rubric.
notas <- data.frame(
  names     = c("Beimar", "Johanna", "Adriana"),
  Talleres  = c(50, 50, 10),
  Quices    = c(25, 30, 35),
  Parciales = c(10, 12, 45)
)

notas

##     names Talleres Quices Parciales
## 1  Beimar       50     25        10
## 2 Johanna       50     30        12
## 3 Adriana       10     35        45

long

# Long layout: one row per (student, rubric) pair with the grade in `note`.
# The three students cycle within each rubric, and each rubric is repeated
# once per student.
NOTAS <- data.frame(
  names  = rep(c("Beimar", "Johanna", "Adriana"), times = 3),
  rubric = rep(c("Talleres", "Quices", "Parciales"), each = 3),
  note   = c(50, 50, 10, 25, 30, 35, 10, 12, 45)
)

NOTAS

##     names    rubric note
## 1  Beimar  Talleres   50
## 2 Johanna  Talleres   50
## 3 Adriana  Talleres   10
## 4  Beimar    Quices   25
## 5 Johanna    Quices   30
## 6 Adriana    Quices   35
## 7  Beimar Parciales   10
## 8 Johanna Parciales   12
## 9 Adriana Parciales   45

gather() y spread()

# Wide -> long: stack columns 2 to 4 (Talleres, Quices, Parciales) into the
# key column `rubric` and the value column `note`.
# NOTE(review): gather() is superseded by tidyr::pivot_longer().
NOTAs <- notas %>% gather(rubric, note, 2:4)

NOTAs

##     names    rubric note
## 1  Beimar  Talleres   50
## 2 Johanna  Talleres   50
## 3 Adriana  Talleres   10
## 4  Beimar    Quices   25
## 5 Johanna    Quices   30
## 6 Adriana    Quices   35
## 7  Beimar Parciales   10
## 8 Johanna Parciales   12
## 9 Adriana Parciales   45

# Long -> wide: one column per value of `rubric`, filled with the matching `note`.
# NOTE(review): spread() is superseded by tidyr::pivot_wider().
notas <- NOTAS %>% spread(rubric, note)

notas

##     names Parciales Quices Talleres
## 1 Adriana        45     35       10
## 2  Beimar        10     25       50
## 3 Johanna        12     30       50

separate() y unite()

# Wide table whose first column packs name and surname together,
# joined by an underscore.
notas <- data.frame(
  name_surname = c("Beimar_Rodríguez", "Johanna_Vanegas", "Adriana_Guerrero"),
  Talleres     = c(50, 50, 10),
  Quices       = c(25, 30, 35),
  Parciales    = c(10, 12, 45)
)

notas

##       name_surname Talleres Quices Parciales
## 1 Beimar_Rodríguez       50     25        10
## 2  Johanna_Vanegas       50     30        12
## 3 Adriana_Guerrero       10     35        45

# Split `name_surname` at the underscore into two columns, `name` and `surname`.
Notas <- notas %>% separate(name_surname, c("name", "surname"), sep = "_")

Notas

##      name   surname Talleres Quices Parciales
## 1  Beimar Rodríguez       50     25        10
## 2 Johanna   Vanegas       50     30        12
## 3 Adriana  Guerrero       10     35        45

# Inverse operation: paste `name` and `surname` back into a single
# `name_surname` column, joined by an underscore.
notas <- Notas %>% unite(name_surname, name:surname, sep = "_")

notas

##       name_surname Talleres Quices Parciales
## 1 Beimar_Rodríguez       50     25        10
## 2  Johanna_Vanegas       50     30        12
## 3 Adriana_Guerrero       10     35        45

summarise()

# Per-rubric summary of `note`: mean, median, standard deviation and
# interquartile range. The summaries are left unnamed on purpose, so each
# output column is named after its expression.
NOTAS %>%
  group_by(rubric) %>%
  summarise(
    mean(note),
    median(note),
    sd(note),
    IQR(note)
  )

## `summarise()` ungrouping output (override with `.groups` argument)

## # A tibble: 3 x 5
##   rubric    `mean(note)` `median(note)` `sd(note)` `IQR(note)`
##   <chr>            <dbl>          <dbl>      <dbl>       <dbl>
## 1 Parciales         22.3             12       19.7        17.5
## 2 Quices            30               30        5           5  
## 3 Talleres          36.7             50       23.1        20

summarise_at()

# Summarise the `note` column within each rubric with several statistics.
# The functions are given as a named list because `funs()` is deprecated as
# of dplyr 0.8.0. Only one column is selected, so the list names
# (mean, median, sd, IQR) become the output column names.
NOTAS %>%
  group_by(rubric) %>%
  summarise_at(
    vars(note),
    list(mean = mean, median = median, sd = sd, IQR = IQR)
  )

## # A tibble: 3 x 5
##   rubric     mean median    sd   IQR
##   <chr>     <dbl>  <dbl> <dbl> <dbl>
## 1 Parciales  22.3     12  19.7  17.5
## 2 Quices     30       30   5     5  
## 3 Talleres   36.7     50  23.1  20

summarise_if()

# Summarise every numeric column within each rubric with several statistics.
# The functions are given as a named list because `funs()` is deprecated as
# of dplyr 0.8.0. `note` is the only numeric column, so the list names
# (mean, median, sd, IQR) become the output column names.
NOTAS %>%
  group_by(rubric) %>%
  summarise_if(
    is.numeric,
    list(mean = mean, median = median, sd = sd, IQR = IQR)
  )

## # A tibble: 3 x 5
##   rubric     mean median    sd   IQR
##   <chr>     <dbl>  <dbl> <dbl> <dbl>
## 1 Parciales  22.3     12  19.7  17.5
## 2 Quices     30       30   5     5  
## 3 Talleres   36.7     50  23.1  20

Conjunto de datos

Miembros de la banda

# Preview band_members: one row per musician (`name`) with the `band` they belong to.
head(band_members)

## # A tibble: 3 x 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles

Instrumentos de la banda

# Preview band_instruments: one row per musician (`name`) with the instrument they play (`plays`).
head(band_instruments)

## # A tibble: 3 x 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar

# Preview band_instruments2: same content, but the musician column is called `artist` instead of `name`.
head(band_instruments2)

## # A tibble: 3 x 2
##   artist plays 
##   <chr>  <chr> 
## 1 John   guitar
## 2 Paul   bass  
## 3 Keith  guitar

Join Types

inner_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

Inner Joins

# Inner join on `name`: keeps only the musicians present in both tables (John and Paul).
inner_join(band_members, band_instruments, by = "name")

## # A tibble: 2 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

# Same join written with the pipe. No `by` is given, so the join uses the
# shared column `name` and reports it in a message.
band_members %>% inner_join(band_instruments)

## Joining, by = "name"

## # A tibble: 2 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

Outer Joins

Left Join

left_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Left join: keeps every row of band_members; `plays` is NA where there is no match (Mick).
left_join(band_members, band_instruments, by = "name")

## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

# Same left join written with the pipe. No `by` is given, so the join uses
# the shared column `name` and reports it in a message.
band_members %>% left_join(band_instruments)

## Joining, by = "name"

## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

Right Join

right_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Right join: keeps every row of band_instruments; `band` is NA where there is no match (Keith).
right_join(band_members, band_instruments, by = "name")

## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

# Same right join written with the pipe. No `by` is given, so the join uses
# the shared column `name` and reports it in a message.
band_members %>% right_join(band_instruments)

## Joining, by = "name"

## # A tibble: 3 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

Full Join

full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Full join: keeps the rows of both tables, filling the unmatched side with NA
# (Mick has no instrument, Keith has no band).
full_join(band_members, band_instruments, by = "name")

## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

# Same full join written with the pipe. No `by` is given, so the join uses
# the shared column `name` and reports it in a message.
band_members %>% full_join(band_instruments)

## Joining, by = "name"

## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Additional Information

Join on Variables with Different Names

full_join(x, y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# The key column has a different name in each table: match `name` from
# band_members against `artist` from band_instruments2. The result keeps
# the column name of the first table (`name`).
full_join(band_members, band_instruments2, by = c("name" = "artist"))

## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Additional Arguments

copy - if the datasets are from different sources and copy = TRUE, then y will be copied across to the data source where x is located.

suffix - if a variable name occurs in both datasets and is not used as part of the join, a suffix is added to ensure variable names are unique. By default ".x" and ".y" are added to the variable names, but other suffixes can be specified.

Piping

x %>% full_join(y, by = NULL, copy = FALSE, suffix = c(".x", ".y"), ...)

# Piped form: band_members is passed as `x`, the first argument of full_join().
# No `by` is given, so the shared column `name` is used and reported in a message.
band_members %>% full_join(band_instruments)

## Joining, by = "name"

## # A tibble: 4 x 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

Filtering

semi_join(x, y, by = NULL, copy = FALSE, ...)

# Semi join: keeps the rows of band_members that have a match in
# band_instruments, without adding any of its columns.
band_members %>% semi_join(band_instruments)

## Joining, by = "name"

## # A tibble: 2 x 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles

anti_join(x, y, by = NULL, copy = FALSE, ...)

# Anti join: keeps the rows of band_members that have no match in band_instruments (Mick).
band_members %>% anti_join(band_instruments)

## Joining, by = "name"

## # A tibble: 1 x 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones

nest_join(x, y, by = NULL, copy = FALSE, keep = FALSE, name = NULL, ...)

# Nest join: keeps every row of band_members and adds a list column, named
# after the second table, holding the matching rows of band_instruments
# (an empty tibble where there is no match).
band_members %>% nest_join(band_instruments)

## Joining, by = "name"

## # A tibble: 3 x 3
##   name  band    band_instruments
##   <chr> <chr>   <list>          
## 1 Mick  Stones  <tibble [0 × 1]>
## 2 John  Beatles <tibble [1 × 1]>
## 3 Paul  Beatles <tibble [1 × 1]>

Joining data

M Sc. Mario Gregorio Saavedra Rodríguez

23/10/2019

Librerías

Introducción

Filosofía

Ejemplos

wide

long

gather() y spread()

separate() y unite()

summarise()

summarise_at()

summarise_if()

Conjunto de datos

Miembros de la banda

Instrumentos de la banda

Join Types

Inner Joins

Outer Joins

Left Join

Right Join

Full Join

Additional Information

Join on Variables with Different Names

Additional Arguments

Piping

Filtering