1 Librerías

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

2 Introducción

2.1 Filosofía

g(x) %>% f(y) = f(g(x),y)

head(iris, n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

iris %>% head(. , n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

iris %>% head(n = 4)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

4 %>% head(iris, .)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa

3 Ejemplos

3.1 wide

notas <- data.frame(
    nombre = c("Beimar", "Johanna", "Adriana"),
    Talleres = c(50, 50, 10),
    Quices = c(25, 30, 35),
    Parciales = c(10, 12, 45)
  )

notas

##    nombre Talleres Quices Parciales
## 1  Beimar       50     25        10
## 2 Johanna       50     30        12
## 3 Adriana       10     35        45

3.2 long

NOTAS <- data.frame(
    nombre = c("Beimar", "Johanna", "Adriana", "Beimar", "Johanna", "Adriana", "Beimar", "Johanna", "Adriana"),
    rubrica = c("Talleres", "Talleres", "Talleres",  "Quices", "Quices", "Quices",  "Parciales", "Parciales", "Parciales"),
    nota = c(50, 50, 10, 25, 30, 35, 10, 12, 45)
  )

NOTAS

##    nombre   rubrica nota
## 1  Beimar  Talleres   50
## 2 Johanna  Talleres   50
## 3 Adriana  Talleres   10
## 4  Beimar    Quices   25
## 5 Johanna    Quices   30
## 6 Adriana    Quices   35
## 7  Beimar Parciales   10
## 8 Johanna Parciales   12
## 9 Adriana Parciales   45

4 Formatos wide y long

4.1 gather() y spread()

NOTAS <- notas %>% gather(rubrica, nota, 2:3)

NOTAS

##    nombre Parciales  rubrica nota
## 1  Beimar        10 Talleres   50
## 2 Johanna        12 Talleres   50
## 3 Adriana        45 Talleres   10
## 4  Beimar        10   Quices   25
## 5 Johanna        12   Quices   30
## 6 Adriana        45   Quices   35

NOTAS <- notas %>% gather(rubrica, nota, 3:4)

NOTAS

##    nombre Talleres   rubrica nota
## 1  Beimar       50    Quices   25
## 2 Johanna       50    Quices   30
## 3 Adriana       10    Quices   35
## 4  Beimar       50 Parciales   10
## 5 Johanna       50 Parciales   12
## 6 Adriana       10 Parciales   45

NOTAS <- notas %>% gather(rubrica, nota, 2:4)

NOTAS

##    nombre   rubrica nota
## 1  Beimar  Talleres   50
## 2 Johanna  Talleres   50
## 3 Adriana  Talleres   10
## 4  Beimar    Quices   25
## 5 Johanna    Quices   30
## 6 Adriana    Quices   35
## 7  Beimar Parciales   10
## 8 Johanna Parciales   12
## 9 Adriana Parciales   45

melt(notas, id.vars = c("nombre"))

##    nombre  variable value
## 1  Beimar  Talleres    50
## 2 Johanna  Talleres    50
## 3 Adriana  Talleres    10
## 4  Beimar    Quices    25
## 5 Johanna    Quices    30
## 6 Adriana    Quices    35
## 7  Beimar Parciales    10
## 8 Johanna Parciales    12
## 9 Adriana Parciales    45

notas %>% melt(id.vars = c("nombre"))

##    nombre  variable value
## 1  Beimar  Talleres    50
## 2 Johanna  Talleres    50
## 3 Adriana  Talleres    10
## 4  Beimar    Quices    25
## 5 Johanna    Quices    30
## 6 Adriana    Quices    35
## 7  Beimar Parciales    10
## 8 Johanna Parciales    12
## 9 Adriana Parciales    45

notas <- NOTAS %>% spread(rubrica, nota)

notas

##    nombre Parciales Quices Talleres
## 1 Adriana        45     35       10
## 2  Beimar        10     25       50
## 3 Johanna        12     30       50

dcast(NOTAS, formula = nombre ~ rubrica, value.var = c("nota"))

##    nombre Parciales Quices Talleres
## 1 Adriana        45     35       10
## 2  Beimar        10     25       50
## 3 Johanna        12     30       50

NOTAS %>% dcast(formula = nombre ~ rubrica, value.var = c("nota"))

##    nombre Parciales Quices Talleres
## 1 Adriana        45     35       10
## 2  Beimar        10     25       50
## 3 Johanna        12     30       50

NOTAS %>% dcast(formula = rubrica ~ nombre, value.var = c("nota"))

##     rubrica Adriana Beimar Johanna
## 1 Parciales      45     10      12
## 2    Quices      35     25      30
## 3  Talleres      10     50      50

NOTAS %>% dcast(formula = nombre ~ ., value.var = c("nota"), fun.aggregate = mean, na.rm = TRUE) %>%
  `colnames<-`(c("nombre", "promedio"))

##    nombre promedio
## 1 Adriana 30.00000
## 2  Beimar 28.33333
## 3 Johanna 30.66667

NOTAS %>% dcast(formula = nombre ~ ., value.var = c("nota"), fun.aggregate = mean, na.rm = TRUE) %>%
  `names<-`(c("nombre", "promedio"))

##    nombre promedio
## 1 Adriana 30.00000
## 2  Beimar 28.33333
## 3 Johanna 30.66667

4.2 separate() y unite()

notas <- data.frame(
    nombreyapellido = c("Beimar Rodríguez", "Johanna Vanegas", "Adriana Guerrero"),
    Talleres = c(50, 50, 10),
    Quices = c(25, 30, 35),
    Parciales = c(10, 12, 45)
  )

notas

##    nombreyapellido Talleres Quices Parciales
## 1 Beimar Rodríguez       50     25        10
## 2  Johanna Vanegas       50     30        12
## 3 Adriana Guerrero       10     35        45

Notas <- notas %>% separate(nombreyapellido, c("nombre", "apellido"), sep = " ")

Notas

##    nombre  apellido Talleres Quices Parciales
## 1  Beimar Rodríguez       50     25        10
## 2 Johanna   Vanegas       50     30        12
## 3 Adriana  Guerrero       10     35        45

notas <- Notas %>% unite(nombreyapellido, nombre:apellido, sep = " ")

notas

##    nombreyapellido Talleres Quices Parciales
## 1 Beimar Rodríguez       50     25        10
## 2  Johanna Vanegas       50     30        12
## 3 Adriana Guerrero       10     35        45

notas <- Notas %>% 
  unite(nombreyapellido, nombre:apellido, sep = " ") %>% 
  unite(talleresparcialesyquices, c(Talleres, Parciales, Quices), sep = " ")

notas

##    nombreyapellido talleresparcialesyquices
## 1 Beimar Rodríguez                 50 10 25
## 2  Johanna Vanegas                 50 12 30
## 3 Adriana Guerrero                 10 45 35

4.3 summarise()

NOTAS %>%
  group_by(rubrica) %>%
  summarise(
    mean(nota), median(nota), sd(nota), IQR(nota)
    )

## # A tibble: 3 × 5
##   rubrica   `mean(nota)` `median(nota)` `sd(nota)` `IQR(nota)`
##   <chr>            <dbl>          <dbl>      <dbl>       <dbl>
## 1 Parciales         22.3             12       19.7        17.5
## 2 Quices            30               30        5           5  
## 3 Talleres          36.7             50       23.1        20

NOTAS %>%
  group_by(nombre) %>%
  summarise(
    mean(nota), median(nota), sd(nota), IQR(nota)
    )

## # A tibble: 3 × 5
##   nombre  `mean(nota)` `median(nota)` `sd(nota)` `IQR(nota)`
##   <chr>          <dbl>          <dbl>      <dbl>       <dbl>
## 1 Adriana         30               35       18.0        17.5
## 2 Beimar          28.3             25       20.2        20  
## 3 Johanna         30.7             30       19.0        19

4.4 summarise_at()

NOTAS %>%
  group_by(rubrica) %>%
  summarise_at(
    vars(nota),
    funs(mean, median, sd, IQR)
    )

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

## # A tibble: 3 × 5
##   rubrica    mean median    sd   IQR
##   <chr>     <dbl>  <dbl> <dbl> <dbl>
## 1 Parciales  22.3     12  19.7  17.5
## 2 Quices     30       30   5     5  
## 3 Talleres   36.7     50  23.1  20

NOTAS %>%
  group_by(nombre) %>%
  summarise_at(
    vars(nota),
    funs(mean, median, sd, IQR)
    )

## # A tibble: 3 × 5
##   nombre   mean median    sd   IQR
##   <chr>   <dbl>  <dbl> <dbl> <dbl>
## 1 Adriana  30       35  18.0  17.5
## 2 Beimar   28.3     25  20.2  20  
## 3 Johanna  30.7     30  19.0  19

4.5 summarise_if()

NOTAS %>%
  group_by(rubrica) %>%
    summarise_if(
      is.numeric,
      funs(mean, median, sd, IQR)
      )

## # A tibble: 3 × 5
##   rubrica    mean median    sd   IQR
##   <chr>     <dbl>  <dbl> <dbl> <dbl>
## 1 Parciales  22.3     12  19.7  17.5
## 2 Quices     30       30   5     5  
## 3 Talleres   36.7     50  23.1  20

NOTAS %>%
  group_by(nombre) %>%
    summarise_if(
      is.numeric,
      funs(mean, median, sd, IQR)
      )

## # A tibble: 3 × 5
##   nombre   mean median    sd   IQR
##   <chr>   <dbl>  <dbl> <dbl> <dbl>
## 1 Adriana  30       35  18.0  17.5
## 2 Beimar   28.3     25  20.2  20  
## 3 Johanna  30.7     30  19.0  19

NOTAS %>%
  group_by(nombre) %>%
    summarise_if(
      is.numeric,
      funs(mean, median, sd, IQR)
      )

## # A tibble: 3 × 5
##   nombre   mean median    sd   IQR
##   <chr>   <dbl>  <dbl> <dbl> <dbl>
## 1 Adriana  30       35  18.0  17.5
## 2 Beimar   28.3     25  20.2  20  
## 3 Johanna  30.7     30  19.0  19

5 Conjunto de datos

5.1 Miembros de la banda

head(band_members)

## # A tibble: 3 × 2
##   name  band   
##   <chr> <chr>  
## 1 Mick  Stones 
## 2 John  Beatles
## 3 Paul  Beatles

5.2 Instrumentos de la banda

band_instruments y band_instruments2 contienen los mismos datos con diferentes nombres de la primera columna; band_instruments usa name, que coincide con el nombre de la llave primaría de band_members; y band_instruments2 usa en su lugar como nombre de la misma columna artist.

head(band_instruments)

## # A tibble: 3 × 2
##   name  plays 
##   <chr> <chr> 
## 1 John  guitar
## 2 Paul  bass  
## 3 Keith guitar

head(band_instruments2)

## # A tibble: 3 × 2
##   artist plays 
##   <chr>  <chr> 
## 1 John   guitar
## 2 Paul   bass  
## 3 Keith  guitar

6 Joins Types

inner_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

6.1 Inner Joins

inner_join(band_members, band_instruments, by = "name")

## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

band_members %>% inner_join(band_instruments)

## Joining, by = "name"

## # A tibble: 2 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass

merge(x = band_members, y = band_instruments, by = "name")

##   name    band  plays
## 1 John Beatles guitar
## 2 Paul Beatles   bass

7 Cross Joins

merge(x = band_members, y = band_instruments, by = NULL)

##   name.x    band name.y  plays
## 1   Mick  Stones   John guitar
## 2   John Beatles   John guitar
## 3   Paul Beatles   John guitar
## 4   Mick  Stones   Paul   bass
## 5   John Beatles   Paul   bass
## 6   Paul Beatles   Paul   bass
## 7   Mick  Stones  Keith guitar
## 8   John Beatles  Keith guitar
## 9   Paul Beatles  Keith guitar

8 Outer Joins

8.1 Left Join

left_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

left_join(band_members, band_instruments, by = "name")

## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

band_members %>% left_join(band_instruments)

## Joining, by = "name"

## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass

merge(x = band_members, y = band_instruments, by = "name", all.x = TRUE)

##   name    band  plays
## 1 John Beatles guitar
## 2 Mick  Stones   <NA>
## 3 Paul Beatles   bass

8.2 Right Join

right_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

right_join(band_members, band_instruments, by = "name")

## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

band_members %>% right_join(band_instruments, by = "name")

## # A tibble: 3 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 John  Beatles guitar
## 2 Paul  Beatles bass  
## 3 Keith <NA>    guitar

merge(x = band_members, y = band_instruments, by = "name", all.y = TRUE)

##    name    band  plays
## 1  John Beatles guitar
## 2 Keith    <NA> guitar
## 3  Paul Beatles   bass

8.3 Full Join

full_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

full_join(band_members, band_instruments, by = "name")

## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

band_members %>% full_join(band_instruments, by = "name")

## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

merge(x = band_members, y = band_instruments, by = "name", all = TRUE)

##    name    band  plays
## 1  John Beatles guitar
## 2 Keith    <NA> guitar
## 3  Mick  Stones   <NA>
## 4  Paul Beatles   bass

9 Additional Information

9.1 Join on Variables with Different Names

full_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

full_join(band_members, band_instruments2, by = c("name" = "artist"))

## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

full_join(band_members, band_instruments2, by = c("name" = "artist"), keep = TRUE)

## # A tibble: 4 × 4
##   name  band    artist plays 
##   <chr> <chr>   <chr>  <chr> 
## 1 Mick  Stones  <NA>   <NA>  
## 2 John  Beatles John   guitar
## 3 Paul  Beatles Paul   bass  
## 4 <NA>  <NA>    Keith  guitar

10 Additional Arguments

copy - if the datasets are from different sources and copy = TRUE then y will be copied across to the datasource where x is located.

suffix - if a variable name occurs in both datasets, and is not used as part of the join, a suffix is added to ensure variable names are unique. By default “.x” and “.y” are added to the variable names but other suffixes can be specified.

10.1 Piping

x %>% full_join(y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

band_members %>% full_join(band_instruments)

## Joining, by = "name"

## # A tibble: 4 × 3
##   name  band    plays 
##   <chr> <chr>   <chr> 
## 1 Mick  Stones  <NA>  
## 2 John  Beatles guitar
## 3 Paul  Beatles bass  
## 4 Keith <NA>    guitar

10.2 Filtering

semi_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

band_members %>% semi_join(band_instruments)

## Joining, by = "name"

## # A tibble: 2 × 2
##   name  band   
##   <chr> <chr>  
## 1 John  Beatles
## 2 Paul  Beatles

anti_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

band_members %>% anti_join(band_instruments)

## Joining, by = "name"

## # A tibble: 1 × 2
##   name  band  
##   <chr> <chr> 
## 1 Mick  Stones

nest_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)

band_members %>% nest_join(band_instruments)

## Joining, by = "name"

## # A tibble: 3 × 3
##   name  band    band_instruments
##   <chr> <chr>   <list>          
## 1 Mick  Stones  <tibble [0 × 1]>
## 2 John  Beatles <tibble [1 × 1]>
## 3 Paul  Beatles <tibble [1 × 1]>

(band_members %>% nest_join(band_instruments))$band_instruments

## Joining, by = "name"

## [[1]]
## # A tibble: 0 × 1
## # … with 1 variable: plays <chr>
## 
## [[2]]
## # A tibble: 1 × 1
##   plays 
##   <chr> 
## 1 guitar
## 
## [[3]]
## # A tibble: 1 × 1
##   plays
##   <chr>
## 1 bass

11 Examples (arilines, airports, planes & weather)

11.1 nycflights13

library(nycflights13)

11.2 Databases

airlines

## # A tibble: 16 × 2
##    carrier name                       
##    <chr>   <chr>                      
##  1 9E      Endeavor Air Inc.          
##  2 AA      American Airlines Inc.     
##  3 AS      Alaska Airlines Inc.       
##  4 B6      JetBlue Airways            
##  5 DL      Delta Air Lines Inc.       
##  6 EV      ExpressJet Airlines Inc.   
##  7 F9      Frontier Airlines Inc.     
##  8 FL      AirTran Airways Corporation
##  9 HA      Hawaiian Airlines Inc.     
## 10 MQ      Envoy Air                  
## 11 OO      SkyWest Airlines Inc.      
## 12 UA      United Air Lines Inc.      
## 13 US      US Airways Inc.            
## 14 VX      Virgin America             
## 15 WN      Southwest Airlines Co.     
## 16 YV      Mesa Airlines Inc.

carrier: Abreviación de dos letras
name: Nombre completo de la aerolínea

str(airlines)

## tibble [16 × 2] (S3: tbl_df/tbl/data.frame)
##  $ carrier: chr [1:16] "9E" "AA" "AS" "B6" ...
##  $ name   : chr [1:16] "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...

airports

## # A tibble: 1,458 × 8
##    faa   name                             lat    lon   alt    tz dst   tzone    
##    <chr> <chr>                          <dbl>  <dbl> <dbl> <dbl> <chr> <chr>    
##  1 04G   Lansdowne Airport               41.1  -80.6  1044    -5 A     America/…
##  2 06A   Moton Field Municipal Airport   32.5  -85.7   264    -6 A     America/…
##  3 06C   Schaumburg Regional             42.0  -88.1   801    -6 A     America/…
##  4 06N   Randall Airport                 41.4  -74.4   523    -5 A     America/…
##  5 09J   Jekyll Island Airport           31.1  -81.4    11    -5 A     America/…
##  6 0A9   Elizabethton Municipal Airport  36.4  -82.2  1593    -5 A     America/…
##  7 0G6   Williams County Airport         41.5  -84.5   730    -5 A     America/…
##  8 0G7   Finger Lakes Regional Airport   42.9  -76.8   492    -5 A     America/…
##  9 0P2   Shoestring Aviation Airfield    39.8  -76.6  1000    -5 U     America/…
## 10 0S9   Jefferson County Intl           48.1 -123.    108    -8 A     America/…
## # … with 1,448 more rows

str(airports)

## tibble [1,458 × 8] (S3: tbl_df/tbl/data.frame)
##  $ faa  : chr [1:1458] "04G" "06A" "06C" "06N" ...
##  $ name : chr [1:1458] "Lansdowne Airport" "Moton Field Municipal Airport" "Schaumburg Regional" "Randall Airport" ...
##  $ lat  : num [1:1458] 41.1 32.5 42 41.4 31.1 ...
##  $ lon  : num [1:1458] -80.6 -85.7 -88.1 -74.4 -81.4 ...
##  $ alt  : num [1:1458] 1044 264 801 523 11 ...
##  $ tz   : num [1:1458] -5 -6 -6 -5 -5 -5 -5 -5 -5 -8 ...
##  $ dst  : chr [1:1458] "A" "A" "A" "A" ...
##  $ tzone: chr [1:1458] "America/New_York" "America/Chicago" "America/Chicago" "America/New_York" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   name = col_character(),
##   ..   city = col_character(),
##   ..   country = col_character(),
##   ..   faa = col_character(),
##   ..   icao = col_character(),
##   ..   lat = col_double(),
##   ..   lon = col_double(),
##   ..   alt = col_double(),
##   ..   tz = col_double(),
##   ..   dst = col_character(),
##   ..   tzone = col_character()
##   .. )

planes

## # A tibble: 3,322 × 9
##    tailnum  year type          manufacturer   model  engines seats speed engine 
##    <chr>   <int> <chr>         <chr>          <chr>    <int> <int> <int> <chr>  
##  1 N10156   2004 Fixed wing m… EMBRAER        EMB-1…       2    55    NA Turbo-…
##  2 N102UW   1998 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  3 N103US   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  4 N104UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  5 N10575   2002 Fixed wing m… EMBRAER        EMB-1…       2    55    NA Turbo-…
##  6 N105UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  7 N107US   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  8 N108UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
##  9 N109UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
## 10 N110UW   1999 Fixed wing m… AIRBUS INDUST… A320-…       2   182    NA Turbo-…
## # … with 3,312 more rows

str(planes)

## tibble [3,322 × 9] (S3: tbl_df/tbl/data.frame)
##  $ tailnum     : chr [1:3322] "N10156" "N102UW" "N103US" "N104UW" ...
##  $ year        : int [1:3322] 2004 1998 1999 1999 2002 1999 1999 1999 1999 1999 ...
##  $ type        : chr [1:3322] "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" ...
##  $ manufacturer: chr [1:3322] "EMBRAER" "AIRBUS INDUSTRIE" "AIRBUS INDUSTRIE" "AIRBUS INDUSTRIE" ...
##  $ model       : chr [1:3322] "EMB-145XR" "A320-214" "A320-214" "A320-214" ...
##  $ engines     : int [1:3322] 2 2 2 2 2 2 2 2 2 2 ...
##  $ seats       : int [1:3322] 55 182 182 182 55 182 182 182 182 182 ...
##  $ speed       : int [1:3322] NA NA NA NA NA NA NA NA NA NA ...
##  $ engine      : chr [1:3322] "Turbo-fan" "Turbo-fan" "Turbo-fan" "Turbo-fan" ...

weather

## # A tibble: 26,115 × 15
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
##  8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
##  9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
## 10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## #   pressure <dbl>, visib <dbl>, time_hour <dttm>

str(weather)

## tibble [26,115 × 15] (S3: tbl_df/tbl/data.frame)
##  $ origin    : chr [1:26115] "EWR" "EWR" "EWR" "EWR" ...
##  $ year      : int [1:26115] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month     : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day       : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
##  $ hour      : int [1:26115] 1 2 3 4 5 6 7 8 9 10 ...
##  $ temp      : num [1:26115] 39 39 39 39.9 39 ...
##  $ dewp      : num [1:26115] 26.1 27 28 28 28 ...
##  $ humid     : num [1:26115] 59.4 61.6 64.4 62.2 64.4 ...
##  $ wind_dir  : num [1:26115] 270 250 240 250 260 240 240 250 260 260 ...
##  $ wind_speed: num [1:26115] 10.36 8.06 11.51 12.66 12.66 ...
##  $ wind_gust : num [1:26115] NA NA NA NA NA NA NA NA NA NA ...
##  $ precip    : num [1:26115] 0 0 0 0 0 0 0 0 0 0 ...
##  $ pressure  : num [1:26115] 1012 1012 1012 1012 1012 ...
##  $ visib     : num [1:26115] 10 10 10 10 10 10 10 10 10 10 ...
##  $ time_hour : POSIXct[1:26115], format: "2013-01-01 01:00:00" "2013-01-01 02:00:00" ...

11.3 Keys

flights se conecta a planes a través de una sola variable, tailnum.
flights se conecta a arilines a través de la variable carrier.
flights se conecta a airports de dos formas: a través de las variables de origin y dest.
flights se conecta al weather a través de origin (la ubicación) y el year, month, day y hour (la hora).

11.3.1 planes

planes %>% 
  count(tailnum) %>% 
  filter(n > 1)

## # A tibble: 0 × 2
## # … with 2 variables: tailnum <chr>, n <int>

11.3.2 weather

weather %>% 
  count(year, month, day, hour, origin) %>% 
  filter(n > 1)

## # A tibble: 3 × 6
##    year month   day  hour origin     n
##   <int> <int> <int> <int> <chr>  <int>
## 1  2013    11     3     1 EWR        2
## 2  2013    11     3     1 JFK        2
## 3  2013    11     3     1 LGA        2

11.3.3 flights

flights %>% 
  count(year, month, day, flight) %>% 
  filter(n > 1)

## # A tibble: 29,768 × 5
##     year month   day flight     n
##    <int> <int> <int>  <int> <int>
##  1  2013     1     1      1     2
##  2  2013     1     1      3     2
##  3  2013     1     1      4     2
##  4  2013     1     1     11     3
##  5  2013     1     1     15     2
##  6  2013     1     1     21     2
##  7  2013     1     1     27     4
##  8  2013     1     1     31     2
##  9  2013     1     1     32     2
## 10  2013     1     1     35     2
## # … with 29,758 more rows

flights %>% 
  count(year, month, day, tailnum) %>% 
  filter(n > 1)

## # A tibble: 64,928 × 5
##     year month   day tailnum     n
##    <int> <int> <int> <chr>   <int>
##  1  2013     1     1 N0EGMQ      2
##  2  2013     1     1 N11189      2
##  3  2013     1     1 N11536      2
##  4  2013     1     1 N11544      3
##  5  2013     1     1 N11551      2
##  6  2013     1     1 N12540      2
##  7  2013     1     1 N12567      2
##  8  2013     1     1 N13123      2
##  9  2013     1     1 N13538      3
## 10  2013     1     1 N13566      3
## # … with 64,918 more rows

11.4 Mutating Join

flights2 <- flights %>% 
  select(year:day, hour, origin, dest, tailnum, carrier)
flights2

## # A tibble: 336,776 × 8
##     year month   day  hour origin dest  tailnum carrier
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
##  1  2013     1     1     5 EWR    IAH   N14228  UA     
##  2  2013     1     1     5 LGA    IAH   N24211  UA     
##  3  2013     1     1     5 JFK    MIA   N619AA  AA     
##  4  2013     1     1     5 JFK    BQN   N804JB  B6     
##  5  2013     1     1     6 LGA    ATL   N668DN  DL     
##  6  2013     1     1     5 EWR    ORD   N39463  UA     
##  7  2013     1     1     6 EWR    FLL   N516JB  B6     
##  8  2013     1     1     6 LGA    IAD   N829AS  EV     
##  9  2013     1     1     6 JFK    MCO   N593JB  B6     
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA     
## # … with 336,766 more rows

flights2 %>%
  select(-origin, -dest) %>% 
  left_join(airlines, by = "carrier")

## # A tibble: 336,776 × 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # … with 336,766 more rows

flights2 %>%
  select(-origin, -dest) %>% 
  mutate(name = airlines$name[match(carrier, airlines$carrier)])

## # A tibble: 336,776 × 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # … with 336,766 more rows

flights2 %>%
  select(-origin, -dest) %>% 
  right_join(airlines, by = "carrier")

## # A tibble: 336,776 × 7
##     year month   day  hour tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 N3ALAA  AA      American Airlines Inc.  
## # … with 336,766 more rows

airlines %>%
  select(-name) %>% 
  mutate(name = flights2$carrier[match(carrier, flights2$carrier)])

## # A tibble: 16 × 2
##    carrier name 
##    <chr>   <chr>
##  1 9E      9E   
##  2 AA      AA   
##  3 AS      AS   
##  4 B6      B6   
##  5 DL      DL   
##  6 EV      EV   
##  7 F9      F9   
##  8 FL      FL   
##  9 HA      HA   
## 10 MQ      MQ   
## 11 OO      OO   
## 12 UA      UA   
## 13 US      US   
## 14 VX      VX   
## 15 WN      WN   
## 16 YV      YV

11.4.1 left_join & right_join

flights2 %>% 
  left_join(weather)

## Joining, by = c("year", "month", "day", "hour", "origin")

## # A tibble: 336,776 × 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
##  2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
##  3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
##  4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
##  5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
##  6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
##  7  2013     1     1     6 EWR    FLL   N516JB  B6       37.9  28.0  67.2
##  8  2013     1     1     6 LGA    IAD   N829AS  EV       39.9  25.0  54.8
##  9  2013     1     1     6 JFK    MCO   N593JB  B6       37.9  27.0  64.3
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA       39.9  25.0  54.8
## # … with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>

weather %>% 
  left_join(flights2)

## Joining, by = c("origin", "year", "month", "day", "hour")

## # A tibble: 341,957 × 18
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <dbl> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
##  2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
##  3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
##  4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
##  5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  6 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
##  7 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  8 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
##  9 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
## 10 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
## # … with 341,947 more rows, and 8 more variables: wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, dest <chr>,
## #   tailnum <chr>, carrier <chr>

flights2 %>% 
  right_join(weather)

## Joining, by = c("year", "month", "day", "hour", "origin")

## # A tibble: 341,957 × 18
##     year month   day  hour origin dest  tailnum carrier  temp  dewp humid
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA       39.0  28.0  64.4
##  2  2013     1     1     5 LGA    IAH   N24211  UA       39.9  25.0  54.8
##  3  2013     1     1     5 JFK    MIA   N619AA  AA       39.0  27.0  61.6
##  4  2013     1     1     5 JFK    BQN   N804JB  B6       39.0  27.0  61.6
##  5  2013     1     1     6 LGA    ATL   N668DN  DL       39.9  25.0  54.8
##  6  2013     1     1     5 EWR    ORD   N39463  UA       39.0  28.0  64.4
##  7  2013     1     1     6 EWR    FLL   N516JB  B6       37.9  28.0  67.2
##  8  2013     1     1     6 LGA    IAD   N829AS  EV       39.9  25.0  54.8
##  9  2013     1     1     6 JFK    MCO   N593JB  B6       37.9  27.0  64.3
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA       39.9  25.0  54.8
## # … with 341,947 more rows, and 7 more variables: wind_dir <dbl>,
## #   wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## #   visib <dbl>, time_hour <dttm>

weather %>% 
  right_join(flights2)

## Joining, by = c("origin", "year", "month", "day", "hour")

## # A tibble: 336,776 × 18
##    origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
##    <chr>  <int> <int> <int> <dbl> <dbl> <dbl> <dbl>    <dbl>      <dbl>
##  1 EWR     2013     1     1     5  39.0  28.0  64.4      260       12.7
##  2 EWR     2013     1     1     5  39.0  28.0  64.4      260       12.7
##  3 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
##  4 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
##  5 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
##  6 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
##  7 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
##  8 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
##  9 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
## 10 EWR     2013     1     1     6  37.9  28.0  67.2      240       11.5
## # … with 336,766 more rows, and 8 more variables: wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, dest <chr>,
## #   tailnum <chr>, carrier <chr>

flights2 %>% 
  left_join(planes, by = "tailnum")

## # A tibble: 336,776 × 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type            
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>           
##  1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed wing mult…
##  2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed wing mult…
##  3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed wing mult…
##  4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed wing mult…
##  5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed wing mult…
##  6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixed wing mult…
##  7   2013     1     1     6 EWR    FLL   N516JB  B6        2000 Fixed wing mult…
##  8   2013     1     1     6 LGA    IAD   N829AS  EV        1998 Fixed wing mult…
##  9   2013     1     1     6 JFK    MCO   N593JB  B6        2004 Fixed wing mult…
## 10   2013     1     1     6 LGA    ORD   N3ALAA  AA          NA <NA>            
## # … with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>

flights2 %>% 
  right_join(planes, by = "tailnum")

## # A tibble: 284,170 × 16
##    year.x month   day  hour origin dest  tailnum carrier year.y type            
##     <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>    <int> <chr>           
##  1   2013     1     1     5 EWR    IAH   N14228  UA        1999 Fixed wing mult…
##  2   2013     1     1     5 LGA    IAH   N24211  UA        1998 Fixed wing mult…
##  3   2013     1     1     5 JFK    MIA   N619AA  AA        1990 Fixed wing mult…
##  4   2013     1     1     5 JFK    BQN   N804JB  B6        2012 Fixed wing mult…
##  5   2013     1     1     6 LGA    ATL   N668DN  DL        1991 Fixed wing mult…
##  6   2013     1     1     5 EWR    ORD   N39463  UA        2012 Fixed wing mult…
##  7   2013     1     1     6 EWR    FLL   N516JB  B6        2000 Fixed wing mult…
##  8   2013     1     1     6 LGA    IAD   N829AS  EV        1998 Fixed wing mult…
##  9   2013     1     1     6 JFK    MCO   N593JB  B6        2004 Fixed wing mult…
## 10   2013     1     1     6 JFK    PBI   N793JB  B6        2011 Fixed wing mult…
## # … with 284,160 more rows, and 6 more variables: manufacturer <chr>,
## #   model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>

flights2 %>% 
  left_join(airports, c("dest" = "faa"))

## # A tibble: 336,776 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Georg…  30.0 -95.3    97
##  2  2013     1     1     5 LGA    IAH   N24211  UA      Georg…  30.0 -95.3    97
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami…  25.8 -80.3     8
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      <NA>    NA    NA      NA
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Harts…  33.6 -84.4  1026
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Chica…  42.0 -87.9   668
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Fort …  26.1 -80.2     9
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      Washi…  38.9 -77.5   313
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      Orlan…  28.4 -81.3    96
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      Chica…  42.0 -87.9   668
## # … with 336,766 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## #   tzone <chr>

flights2 %>% 
  right_join(airports, c("dest" = "faa"))

## # A tibble: 330,531 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Georg…  30.0 -95.3    97
##  2  2013     1     1     5 LGA    IAH   N24211  UA      Georg…  30.0 -95.3    97
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Miami…  25.8 -80.3     8
##  4  2013     1     1     6 LGA    ATL   N668DN  DL      Harts…  33.6 -84.4  1026
##  5  2013     1     1     5 EWR    ORD   N39463  UA      Chica…  42.0 -87.9   668
##  6  2013     1     1     6 EWR    FLL   N516JB  B6      Fort …  26.1 -80.2     9
##  7  2013     1     1     6 LGA    IAD   N829AS  EV      Washi…  38.9 -77.5   313
##  8  2013     1     1     6 JFK    MCO   N593JB  B6      Orlan…  28.4 -81.3    96
##  9  2013     1     1     6 LGA    ORD   N3ALAA  AA      Chica…  42.0 -87.9   668
## 10  2013     1     1     6 JFK    PBI   N793JB  B6      Palm …  26.7 -80.1    19
## # … with 330,521 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## #   tzone <chr>

flights2 %>% 
  left_join(airports, c("origin" = "faa"))

## # A tibble: 336,776 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Newar…  40.7 -74.2    18
##  2  2013     1     1     5 LGA    IAH   N24211  UA      La Gu…  40.8 -73.9    22
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      John …  40.6 -73.8    13
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      John …  40.6 -73.8    13
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      La Gu…  40.8 -73.9    22
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Newar…  40.7 -74.2    18
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Newar…  40.7 -74.2    18
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      La Gu…  40.8 -73.9    22
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      John …  40.6 -73.8    13
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      La Gu…  40.8 -73.9    22
## # … with 336,766 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## #   tzone <chr>

flights2 %>% 
  right_join(airports, c("origin" = "faa"))

## # A tibble: 338,231 × 15
##     year month   day  hour origin dest  tailnum carrier name     lat   lon   alt
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>  <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Newar…  40.7 -74.2    18
##  2  2013     1     1     5 LGA    IAH   N24211  UA      La Gu…  40.8 -73.9    22
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      John …  40.6 -73.8    13
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      John …  40.6 -73.8    13
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      La Gu…  40.8 -73.9    22
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Newar…  40.7 -74.2    18
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      Newar…  40.7 -74.2    18
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      La Gu…  40.8 -73.9    22
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      John …  40.6 -73.8    13
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      La Gu…  40.8 -73.9    22
## # … with 338,221 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## #   tzone <chr>

11.4.2 semi_join

airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat)) +
    borders("state") +
    geom_point(color = "orange") +
    coord_quickmap()

airports %>%
  semi_join(flights, c("faa" = "dest")) %>%
  ggplot(aes(lon, lat, shape = dst)) +
    borders("state") +
    geom_point(color = "orange") +
    coord_quickmap()

11.4.3 anti_join

flights %>% 
  anti_join(planes, "tailnum")

## # A tibble: 52,606 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      558            600        -2      753            745
##  2  2013     1     1      559            600        -1      941            910
##  3  2013     1     1      600            600         0      837            825
##  4  2013     1     1      602            605        -3      821            805
##  5  2013     1     1      608            600         8      807            735
##  6  2013     1     1      611            600        11      945            931
##  7  2013     1     1      623            610        13      920            915
##  8  2013     1     1      624            630        -6      840            830
##  9  2013     1     1      628            630        -2     1137           1140
## 10  2013     1     1      629            630        -1      824            810
## # … with 52,596 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

planes %>% 
  anti_join(flights, "tailnum")

## # A tibble: 0 × 9
## # … with 9 variables: tailnum <chr>, year <int>, type <chr>,
## #   manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
## #   engine <chr>

11.4.4 semi_join & anti_join

airports %>% 
  rename(dest = faa) %>% 
  semi_join(flights) %>%
  ggplot(aes(lon, lat, size = alt)) +
    borders("state") +
    geom_point(color = "orange") +
    coord_quickmap()

## Joining, by = "dest"

flights %>% 
  anti_join(airports %>%
              rename(dest = faa))

## Joining, by = "dest"

## # A tibble: 7,602 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      544            545        -1     1004           1022
##  2  2013     1     1      615            615         0     1039           1100
##  3  2013     1     1      628            630        -2     1137           1140
##  4  2013     1     1      701            700         1     1123           1154
##  5  2013     1     1      711            715        -4     1151           1206
##  6  2013     1     1      820            820         0     1254           1310
##  7  2013     1     1      820            820         0     1249           1329
##  8  2013     1     1      840            845        -5     1311           1350
##  9  2013     1     1      909            810        59     1331           1315
## 10  2013     1     1      913            918        -5     1346           1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

airports %>% 
  rename(origin = faa) %>% 
  semi_join(flights)

## Joining, by = "origin"

## # A tibble: 3 × 8
##   origin name                  lat   lon   alt    tz dst   tzone           
##   <chr>  <chr>               <dbl> <dbl> <dbl> <dbl> <chr> <chr>           
## 1 EWR    Newark Liberty Intl  40.7 -74.2    18    -5 A     America/New_York
## 2 JFK    John F Kennedy Intl  40.6 -73.8    13    -5 A     America/New_York
## 3 LGA    La Guardia           40.8 -73.9    22    -5 A     America/New_York

11.4.5 Filtering Joins

flights %>% filter(dep_time >= 600, dep_time <= 605)

## # A tibble: 2,460 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      600            600         0      851            858
##  2  2013     1     1      600            600         0      837            825
##  3  2013     1     1      601            600         1      844            850
##  4  2013     1     1      602            610        -8      812            820
##  5  2013     1     1      602            605        -3      821            805
##  6  2013     1     2      600            600         0      814            749
##  7  2013     1     2      600            605        -5      751            818
##  8  2013     1     2      600            600         0      819            815
##  9  2013     1     2      600            600         0      846            846
## 10  2013     1     2      600            600         0      737            725
## # … with 2,450 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  head(10)
top_dest

## # A tibble: 10 × 2
##    dest      n
##    <chr> <int>
##  1 ORD   17283
##  2 ATL   17215
##  3 LAX   16174
##  4 BOS   15508
##  5 MCO   14082
##  6 CLT   14064
##  7 SFO   13331
##  8 FLL   12055
##  9 MIA   11728
## 10 DCA    9705

top_dest <- flights %>%
  count(dest, sort = TRUE) %>%
  tail(10)
top_dest

## # A tibble: 10 × 2
##    dest      n
##    <chr> <int>
##  1 BZN      36
##  2 JAC      25
##  3 PSP      19
##  4 EYW      17
##  5 HDN      15
##  6 MTJ      15
##  7 SBN      10
##  8 ANC       8
##  9 LEX       1
## 10 LGA       1

flights %>%
    select(carrier, dep_delay) %>%
    filter(dep_delay > 60) %>%
    head()

## # A tibble: 6 × 2
##   carrier dep_delay
##   <chr>       <dbl>
## 1 MQ            101
## 2 AA             71
## 3 MQ            853
## 4 UA            144
## 5 UA            134
## 6 EV             96

flights %>%
    select(carrier, dep_delay) %>%
    filter(dep_delay < 60) %>%
    tail()

## # A tibble: 6 × 2
##   carrier dep_delay
##   <chr>       <dbl>
## 1 B6             -8
## 2 B6             -5
## 3 B6            -10
## 4 B6             -5
## 5 B6             12
## 6 B6            -10

flights %>% 
  filter(dest %in% top_dest$dest)

## # A tibble: 147 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848            851        -3     1155           1136
##  2  2013     1     2      900            859         1     1146           1144
##  3  2013     1     5      829            830        -1     1047           1111
##  4  2013     1     5      850            825        25     1132           1123
##  5  2013     1     5      859            901        -2     1144           1201
##  6  2013     1     5     1048           1035        13     1443           1358
##  7  2013     1     5     1101           1055         6     1342           1400
##  8  2013     1    12      825            825         0     1115           1123
##  9  2013     1    12      827            830        -3     1112           1111
## 10  2013     1    12      859            901        -2     1146           1201
## # … with 137 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

flights %>% 
  semi_join(top_dest)

## Joining, by = "dest"

## # A tibble: 147 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      848            851        -3     1155           1136
##  2  2013     1     2      900            859         1     1146           1144
##  3  2013     1     5      829            830        -1     1047           1111
##  4  2013     1     5      850            825        25     1132           1123
##  5  2013     1     5      859            901        -2     1144           1201
##  6  2013     1     5     1048           1035        13     1443           1358
##  7  2013     1     5     1101           1055         6     1342           1400
##  8  2013     1    12      825            825         0     1115           1123
##  9  2013     1    12      827            830        -3     1112           1111
## 10  2013     1    12      859            901        -2     1146           1201
## # … with 137 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

flights %>%
  anti_join(planes, by = "tailnum") %>%
  count(tailnum, sort = TRUE)

## # A tibble: 722 × 2
##    tailnum     n
##    <chr>   <int>
##  1 <NA>     2512
##  2 N725MQ    575
##  3 N722MQ    513
##  4 N723MQ    507
##  5 N713MQ    483
##  6 N735MQ    396
##  7 N0EGMQ    371
##  8 N534MQ    364
##  9 N542MQ    363
## 10 N531MQ    349
## # … with 712 more rows

11.4.6 group_by & summarize

flights %>% 
  group_by(month) %>% 
  summarise(dist = sum(distance))

## # A tibble: 12 × 2
##    month     dist
##    <int>    <dbl>
##  1     1 27188805
##  2     2 24975509
##  3     3 29179636
##  4     4 29427294
##  5     5 29974128
##  6     6 29856388
##  7     7 31149199
##  8     8 31149334
##  9     9 28711426
## 10    10 30012086
## 11    11 28639718
## 12    12 29954084

11.4.7 group_by & summarise

flights %>%
    group_by(dest) %>%
    summarise(flight_count = n(), plane_count = n_distinct(tailnum)) %>%
    head()

## # A tibble: 6 × 3
##   dest  flight_count plane_count
##   <chr>        <int>       <int>
## 1 ABQ            254         108
## 2 ACK            265          58
## 3 ALB            439         172
## 4 ANC              8           6
## 5 ATL          17215        1180
## 6 AUS           2439         993

11.4.8 group_by, select, filter & arrange

flights %>%
    group_by(carrier) %>%
    select(month, day, dep_delay) %>%
    filter(min_rank(desc(dep_delay)) <= 2 & dep_delay!="NA") %>%
    arrange(carrier, desc(dep_delay)) %>%
    head()

## Adding missing grouping variables: `carrier`

## # A tibble: 6 × 4
## # Groups:   carrier [3]
##   carrier month   day dep_delay
##   <chr>   <int> <int>     <dbl>
## 1 9E          2    16       747
## 2 9E          7    24       430
## 3 AA          9    20      1014
## 4 AA         12     5       896
## 5 AS          5    23       225
## 6 AS          1    20       222

11.4.9 inner_join

flights_latlon <- flights %>%
  inner_join(select(airports, origin = faa, origin_lat = lat, origin_lon = lon),
    by = "origin"
  ) %>%
  inner_join(select(airports, dest = faa, dest_lat = lat, dest_lon = lon),
    by = "dest"
  )

11.4.10 filter & ggplot

flights_latlon %>%
  filter(origin == "JFK") %>%
  ggplot(aes(
    x = origin_lon, xend = dest_lon,
    y = origin_lat, yend = dest_lat
  )) +
  borders("state") +
  geom_segment(arrow = arrow(length = unit(0.1, "cm")), color = "orange") +
  coord_quickmap() +
  labs(y = "Latitude", x = "Longitude")

flights_latlon %>%
  filter(dest %in% c("MIA", "FLL", "SFO", "CLT", "MCO", "BOS", "LAX", "ATL", "ORD")) %>%
  ggplot(aes(
    x = origin_lon, xend = dest_lon,
    y = origin_lat, yend = dest_lat
  )) +
  borders("state") +
  geom_segment(arrow = arrow(length = unit(0.1, "cm")), color = "orange") +
  coord_quickmap() +
  labs(y = "Latitude", x = "Longitude")

11.4.11 slice & ggplot

flights_latlon %>%
  slice(1:100) %>%
  ggplot(aes(
    x = origin_lon, xend = dest_lon,
    y = origin_lat, yend = dest_lat
  )) +
  borders("state") +
  geom_segment(arrow = arrow(length = unit(0.1, "cm")), color = "orange") +
  coord_quickmap() +
  labs(y = "Latitude", x = "Longitude")

11.4.12 filter & ggplot - scaterplots

alaska_flights <- flights %>%   
     filter(carrier == "AS")

 ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point()

## Warning: Removed 5 rows containing missing values (geom_point).

11.4.13 filter & ggplot - scaterplots

11.4.13.1 Changing the transparency

ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) + 
  geom_point(alpha =0.3 )

## Warning: Removed 5 rows containing missing values (geom_point).

11.4.13.2 Jittering the points

ggplot(data = alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) + 
  geom_jitter(width = 30, height = 30)

## Warning: Removed 5 rows containing missing values (geom_point).

11.4.14 Line graphs

early_january_weather <- weather %>% 
  filter(origin == "EWR" & month == 1 & day <= 15)
  
ggplot(early_january_weather, aes(x = time_hour, y=temp)) +
     geom_line()

11.5 Histograms

ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(bins = 40, color = "white", fill = "orange")

## Warning: Removed 1 rows containing non-finite values (stat_bin).

ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 10, color = "orange", fill = "yellow")

## Warning: Removed 1 rows containing non-finite values (stat_bin).

11.6 Faceting

 ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 5, color = "orange") +
  facet_wrap(~ month)

## Warning: Removed 1 rows containing non-finite values (stat_bin).

ggplot(data = weather, mapping = aes(x = temp)) +
  geom_histogram(binwidth = 5, color = "orange") +
  facet_wrap(~ month, nrow = 4)

## Warning: Removed 1 rows containing non-finite values (stat_bin).

11.7 Bosplots

ggplot(data = weather, mapping = aes(x = month, y = temp)) +
  geom_boxplot(fill = "orange")

## Warning: Continuous x aesthetic -- did you forget aes(group=...)?

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) +
  geom_boxplot(fill = "orange")

## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

11.8 Barplots

ggplot(data = flights, mapping = aes(x = carrier)) +
  geom_bar(fill="orange", colour = "yellow")

flights_counted <- flights %>%
  group_by(carrier) %>%
  summarize(number=n())
flights_counted

## # A tibble: 16 × 2
##    carrier number
##    <chr>    <int>
##  1 9E       18460
##  2 AA       32729
##  3 AS         714
##  4 B6       54635
##  5 DL       48110
##  6 EV       54173
##  7 F9         685
##  8 FL        3260
##  9 HA         342
## 10 MQ       26397
## 11 OO          32
## 12 UA       58665
## 13 US       20536
## 14 VX        5162
## 15 WN       12275
## 16 YV         601

ggplot(flights_counted, aes(x=carrier, y=number)) +
 geom_col(fill="red", colour = "orange")

ggplot(data = flights, mapping = aes(x = carrier, fill = origin)) +
  geom_bar()

ggplot(data = flights, mapping = aes(x = carrier, color = origin)) +
  geom_bar()

ggplot(data = flights, mapping = aes(x = carrier, fill = origin)) +
  geom_bar(position = "dodge")

ggplot(flights, aes(x=carrier)) +
  geom_bar(colour="orange") +
  facet_wrap(~ origin, ncol=1)

12 echarts4r

library(echarts4r)

12.1 Horizontal barchar

top_destinations <- flights %>% 
  count(dest) %>% 
  top_n(15, n) %>% 
  arrange(n)

top_destinations %>%
  e_charts(x = dest) %>%
  e_bar(n, legend = FALSE, name = "Flights") %>% 
  e_labels(position = "right") %>% 
  e_tooltip() %>% 
  e_title("Flights by destination", "Top 15 destinations") %>% 
  e_flip_coords() %>% 
  e_y_axis(splitLine = list(show = FALSE)) %>% 
  e_x_axis(show = FALSE) %>% 
  e_toolbox_feature(
    feature = "saveAsImage",
    title = "Save as image"
  )

12.2 Stacked barchar

flights_daytime <- flights %>% 
  transmute(origin, daytime = case_when(
    hour >= 22 & hour < 6 ~ "Night",
    hour >= 6 & hour < 12 ~ "Morning",
    hour >= 12 & hour < 18 ~ "Afternoon",
    TRUE ~ "Evening"
  )) %>% 
  count(origin, daytime) %>% 
  group_by(daytime)

flights_daytime %>% 
  e_charts(origin, stack = "grp") %>% 
  e_bar(n) %>% 
  e_tooltip(
    trigger = "axis",
    axisPointer = list(
      type = "shadow"
    )
  ) %>% 
  e_title(
    text = "Outgoing flights by time of day",
    subtext = "There are no night flights"
  ) %>% 
  e_y_axis(
    splitArea = list(show = FALSE),
    splitLine = list(show = FALSE)
  )

12.3 Scatter plot

set.seed(123)
flights_sm <- flights %>% 
  filter(complete.cases(.)) %>% 
  sample_n(1000)

flights_sm %>% 
  e_charts(x = dep_delay) %>% 
  e_scatter(arr_delay, name = "Flight") %>% 
  e_lm(arr_delay ~ dep_delay, name = "Linear model") %>% 
  e_axis_labels(x = "Departure delay", y = "Arrival delay") %>%
  e_title(
    text = "Arrival delay vs. departure delay",
    subtext = "The later you start, the later you finish"
  ) %>% 
  e_x_axis(
    nameLocation = "center", 
    splitArea = list(show = FALSE),
    axisLabel = list(margin = 3),
    axisPointer = list(
      show = TRUE, 
      lineStyle = list(
        color = "#999999",
        width = 0.75,
        type = "dotted"
      )
    )
  ) %>% 
  e_y_axis(
    nameLocation = "center", 
    splitArea = list(show = FALSE),
    axisLabel = list(margin = 0),
    axisPointer = list(
      show = TRUE, 
      lineStyle = list(
        color = "#999999",
        width = 0.75,
        type = "dotted"
      )
    )
  )

n_bins <- 100
flights %>% 
  filter(complete.cases(.)) %>% 
  mutate(
    arr_delay = cut(arr_delay, n_bins),
    dep_delay = cut(dep_delay, n_bins)
  ) %>% 
  count(arr_delay, dep_delay) %>% 
  e_charts(dep_delay) %>% 
  e_heatmap(arr_delay, n) %>% 
  e_visual_map(n) %>% 
  e_title("Arrival delay vs. departure delay") %>% 
  e_axis_labels("Departure delay", "Arrival delay")

12.4 Pie chart

pie <- count(flights, origin) %>% 
  e_charts(x = origin) %>% 
  e_pie(n, legend = FALSE, name = "Flights") %>% 
  e_tooltip() %>% 
  e_title("Flights by origin", "This is really hard with ggplot2")
pie

12.5 Time series

flights_ts <- flights %>% 
  transmute(week = as.Date(cut(time_hour, "week")), dep_delay, origin) %>% 
  group_by(origin, week) %>% # works with echarts
  summarise(dep_delay = sum(dep_delay, na.rm = TRUE))

## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.

ts_base <- flights_ts %>% 
  e_charts(x = week) %>% 
  e_datazoom(
    type = "slider", 
    toolbox = FALSE,
    bottom = -5
  ) %>% 
  e_tooltip() %>% 
  e_title("Departure delays by airport") %>% 
  e_x_axis(week, axisPointer = list(show = TRUE))

ts_base %>% e_line(dep_delay)

flights_ts <- flights %>% 
  transmute(week = as.Date(cut(time_hour, "week")), dep_delay, origin) %>% 
  group_by(origin, week) %>% # works with echarts
  summarise(dep_delay = sum(dep_delay, na.rm = TRUE))

## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.

ts_base <- flights_ts %>% 
  e_charts(x = week) %>% 
  e_datazoom(
    type = "slider", 
    toolbox = FALSE,
    bottom = -5
  ) %>% 
  e_tooltip() %>% 
  e_title("Departure delays by airport") %>% 
  e_x_axis(week, axisPointer = list(show = TRUE))

ts_base %>% e_line(dep_delay)

12.6 Stacked area

area <- ts_base %>% e_area(dep_delay, stack = "grp")
area

12.7 Timeline

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

flights_ts %>% 
  filter(origin == "JFK") %>% 
  group_by(month = month(week, label = TRUE)) %>% 
  e_charts(x = week, timeline = TRUE) %>% 
  e_bar(
    dep_delay, 
    name = "Departure Delay", 
    symbol = "none",
    legend = FALSE
  )

12.8 Correlation matrix

flights %>%
  select(dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay, air_time, distance,hour) %>%
  na.omit() %>%
  cor() %>% 
  e_charts() %>% 
  e_correlations(order = "hclust") %>% 
  e_tooltip()

12.9 Boxplot

df <- data.frame(
  x = c(
    rnorm(100),
    runif(100, -5, 10),
    rnorm(100, 10, 3)
  ),
  grp = c(
    rep(LETTERS[1], 100),
    rep(LETTERS[2], 100),
    rep(LETTERS[3], 100)
  )
)

df %>% 
  group_by(grp) %>% 
  e_charts() %>% 
  e_boxplot(x)

Joining data

M Sc. Mario Gregorio Saavedra Rodríguez

23/10/2019