library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
g(x) %>% f(y) = f(g(x),y)
head(iris, n = 4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
iris %>% head(. , n = 4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
iris %>% head(n = 4)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
4 %>% head(iris, .)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
notas <- data.frame(
nombre = c("Beimar", "Johanna", "Adriana"),
Talleres = c(50, 50, 10),
Quices = c(25, 30, 35),
Parciales = c(10, 12, 45)
)
notas
## nombre Talleres Quices Parciales
## 1 Beimar 50 25 10
## 2 Johanna 50 30 12
## 3 Adriana 10 35 45
NOTAS <- data.frame(
nombre = c("Beimar", "Johanna", "Adriana", "Beimar", "Johanna", "Adriana", "Beimar", "Johanna", "Adriana"),
rubrica = c("Talleres", "Talleres", "Talleres", "Quices", "Quices", "Quices", "Parciales", "Parciales", "Parciales"),
nota = c(50, 50, 10, 25, 30, 35, 10, 12, 45)
)
NOTAS
## nombre rubrica nota
## 1 Beimar Talleres 50
## 2 Johanna Talleres 50
## 3 Adriana Talleres 10
## 4 Beimar Quices 25
## 5 Johanna Quices 30
## 6 Adriana Quices 35
## 7 Beimar Parciales 10
## 8 Johanna Parciales 12
## 9 Adriana Parciales 45
NOTAS <- notas %>% gather(rubrica, nota, 2:3)
NOTAS
## nombre Parciales rubrica nota
## 1 Beimar 10 Talleres 50
## 2 Johanna 12 Talleres 50
## 3 Adriana 45 Talleres 10
## 4 Beimar 10 Quices 25
## 5 Johanna 12 Quices 30
## 6 Adriana 45 Quices 35
NOTAS <- notas %>% gather(rubrica, nota, 3:4)
NOTAS
## nombre Talleres rubrica nota
## 1 Beimar 50 Quices 25
## 2 Johanna 50 Quices 30
## 3 Adriana 10 Quices 35
## 4 Beimar 50 Parciales 10
## 5 Johanna 50 Parciales 12
## 6 Adriana 10 Parciales 45
NOTAS <- notas %>% gather(rubrica, nota, 2:4)
NOTAS
## nombre rubrica nota
## 1 Beimar Talleres 50
## 2 Johanna Talleres 50
## 3 Adriana Talleres 10
## 4 Beimar Quices 25
## 5 Johanna Quices 30
## 6 Adriana Quices 35
## 7 Beimar Parciales 10
## 8 Johanna Parciales 12
## 9 Adriana Parciales 45
melt(notas, id.vars = c("nombre"))
## nombre variable value
## 1 Beimar Talleres 50
## 2 Johanna Talleres 50
## 3 Adriana Talleres 10
## 4 Beimar Quices 25
## 5 Johanna Quices 30
## 6 Adriana Quices 35
## 7 Beimar Parciales 10
## 8 Johanna Parciales 12
## 9 Adriana Parciales 45
notas %>% melt(id.vars = c("nombre"))
## nombre variable value
## 1 Beimar Talleres 50
## 2 Johanna Talleres 50
## 3 Adriana Talleres 10
## 4 Beimar Quices 25
## 5 Johanna Quices 30
## 6 Adriana Quices 35
## 7 Beimar Parciales 10
## 8 Johanna Parciales 12
## 9 Adriana Parciales 45
notas <- NOTAS %>% spread(rubrica, nota)
notas
## nombre Parciales Quices Talleres
## 1 Adriana 45 35 10
## 2 Beimar 10 25 50
## 3 Johanna 12 30 50
dcast(NOTAS, formula = nombre ~ rubrica, value.var = c("nota"))
## nombre Parciales Quices Talleres
## 1 Adriana 45 35 10
## 2 Beimar 10 25 50
## 3 Johanna 12 30 50
NOTAS %>% dcast(formula = nombre ~ rubrica, value.var = c("nota"))
## nombre Parciales Quices Talleres
## 1 Adriana 45 35 10
## 2 Beimar 10 25 50
## 3 Johanna 12 30 50
NOTAS %>% dcast(formula = rubrica ~ nombre, value.var = c("nota"))
## rubrica Adriana Beimar Johanna
## 1 Parciales 45 10 12
## 2 Quices 35 25 30
## 3 Talleres 10 50 50
NOTAS %>% dcast(formula = nombre ~ ., value.var = c("nota"), fun.aggregate = mean, na.rm = TRUE) %>%
`colnames<-`(c("nombre", "promedio"))
## nombre promedio
## 1 Adriana 30.00000
## 2 Beimar 28.33333
## 3 Johanna 30.66667
NOTAS %>% dcast(formula = nombre ~ ., value.var = c("nota"), fun.aggregate = mean, na.rm = TRUE) %>%
`names<-`(c("nombre", "promedio"))
## nombre promedio
## 1 Adriana 30.00000
## 2 Beimar 28.33333
## 3 Johanna 30.66667
notas <- data.frame(
nombreyapellido = c("Beimar Rodríguez", "Johanna Vanegas", "Adriana Guerrero"),
Talleres = c(50, 50, 10),
Quices = c(25, 30, 35),
Parciales = c(10, 12, 45)
)
notas
## nombreyapellido Talleres Quices Parciales
## 1 Beimar Rodríguez 50 25 10
## 2 Johanna Vanegas 50 30 12
## 3 Adriana Guerrero 10 35 45
Notas <- notas %>% separate(nombreyapellido, c("nombre", "apellido"), sep = " ")
Notas
## nombre apellido Talleres Quices Parciales
## 1 Beimar Rodríguez 50 25 10
## 2 Johanna Vanegas 50 30 12
## 3 Adriana Guerrero 10 35 45
notas <- Notas %>% unite(nombreyapellido, nombre:apellido, sep = " ")
notas
## nombreyapellido Talleres Quices Parciales
## 1 Beimar Rodríguez 50 25 10
## 2 Johanna Vanegas 50 30 12
## 3 Adriana Guerrero 10 35 45
notas <- Notas %>%
unite(nombreyapellido, nombre:apellido, sep = " ") %>%
unite(talleresparcialesyquices, c(Talleres, Parciales, Quices), sep = " ")
notas
## nombreyapellido talleresparcialesyquices
## 1 Beimar Rodríguez 50 10 25
## 2 Johanna Vanegas 50 12 30
## 3 Adriana Guerrero 10 45 35
NOTAS %>%
group_by(rubrica) %>%
summarise(
mean(nota), median(nota), sd(nota), IQR(nota)
)
## # A tibble: 3 × 5
## rubrica `mean(nota)` `median(nota)` `sd(nota)` `IQR(nota)`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Parciales 22.3 12 19.7 17.5
## 2 Quices 30 30 5 5
## 3 Talleres 36.7 50 23.1 20
NOTAS %>%
group_by(nombre) %>%
summarise(
mean(nota), median(nota), sd(nota), IQR(nota)
)
## # A tibble: 3 × 5
## nombre `mean(nota)` `median(nota)` `sd(nota)` `IQR(nota)`
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adriana 30 35 18.0 17.5
## 2 Beimar 28.3 25 20.2 20
## 3 Johanna 30.7 30 19.0 19
NOTAS %>%
group_by(rubrica) %>%
summarise_at(
vars(nota),
funs(mean, median, sd, IQR)
)
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## # A tibble: 3 × 5
## rubrica mean median sd IQR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Parciales 22.3 12 19.7 17.5
## 2 Quices 30 30 5 5
## 3 Talleres 36.7 50 23.1 20
NOTAS %>%
group_by(nombre) %>%
summarise_at(
vars(nota),
funs(mean, median, sd, IQR)
)
## # A tibble: 3 × 5
## nombre mean median sd IQR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adriana 30 35 18.0 17.5
## 2 Beimar 28.3 25 20.2 20
## 3 Johanna 30.7 30 19.0 19
NOTAS %>%
group_by(rubrica) %>%
summarise_if(
is.numeric,
funs(mean, median, sd, IQR)
)
## # A tibble: 3 × 5
## rubrica mean median sd IQR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Parciales 22.3 12 19.7 17.5
## 2 Quices 30 30 5 5
## 3 Talleres 36.7 50 23.1 20
NOTAS %>%
group_by(nombre) %>%
summarise_if(
is.numeric,
funs(mean, median, sd, IQR)
)
## # A tibble: 3 × 5
## nombre mean median sd IQR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adriana 30 35 18.0 17.5
## 2 Beimar 28.3 25 20.2 20
## 3 Johanna 30.7 30 19.0 19
NOTAS %>%
group_by(nombre) %>%
summarise_if(
is.numeric,
funs(mean, median, sd, IQR)
)
## # A tibble: 3 × 5
## nombre mean median sd IQR
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Adriana 30 35 18.0 17.5
## 2 Beimar 28.3 25 20.2 20
## 3 Johanna 30.7 30 19.0 19
head(band_members)
## # A tibble: 3 × 2
## name band
## <chr> <chr>
## 1 Mick Stones
## 2 John Beatles
## 3 Paul Beatles
band_instruments y band_instruments2 contienen los mismos datos con diferentes nombres de la primera columna; band_instruments usa name, que coincide con el nombre de la llave primaría de band_members; y band_instruments2 usa en su lugar como nombre de la misma columna artist.
head(band_instruments)
## # A tibble: 3 × 2
## name plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
head(band_instruments2)
## # A tibble: 3 × 2
## artist plays
## <chr> <chr>
## 1 John guitar
## 2 Paul bass
## 3 Keith guitar
inner_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
inner_join(band_members, band_instruments, by = "name")
## # A tibble: 2 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
band_members %>% inner_join(band_instruments)
## Joining, by = "name"
## # A tibble: 2 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
merge(x = band_members, y = band_instruments, by = "name")
## name band plays
## 1 John Beatles guitar
## 2 Paul Beatles bass
merge(x = band_members, y = band_instruments, by = NULL)
## name.x band name.y plays
## 1 Mick Stones John guitar
## 2 John Beatles John guitar
## 3 Paul Beatles John guitar
## 4 Mick Stones Paul bass
## 5 John Beatles Paul bass
## 6 Paul Beatles Paul bass
## 7 Mick Stones Keith guitar
## 8 John Beatles Keith guitar
## 9 Paul Beatles Keith guitar
left_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
left_join(band_members, band_instruments, by = "name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
band_members %>% left_join(band_instruments)
## Joining, by = "name"
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
merge(x = band_members, y = band_instruments, by = "name", all.x = TRUE)
## name band plays
## 1 John Beatles guitar
## 2 Mick Stones <NA>
## 3 Paul Beatles bass
right_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
right_join(band_members, band_instruments, by = "name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
## 3 Keith <NA> guitar
band_members %>% right_join(band_instruments, by = "name")
## # A tibble: 3 × 3
## name band plays
## <chr> <chr> <chr>
## 1 John Beatles guitar
## 2 Paul Beatles bass
## 3 Keith <NA> guitar
merge(x = band_members, y = band_instruments, by = "name", all.y = TRUE)
## name band plays
## 1 John Beatles guitar
## 2 Keith <NA> guitar
## 3 Paul Beatles bass
full_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
full_join(band_members, band_instruments, by = "name")
## # A tibble: 4 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
band_members %>% full_join(band_instruments, by = "name")
## # A tibble: 4 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
merge(x = band_members, y = band_instruments, by = "name", all = TRUE)
## name band plays
## 1 John Beatles guitar
## 2 Keith <NA> guitar
## 3 Mick Stones <NA>
## 4 Paul Beatles bass
full_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
full_join(band_members, band_instruments2, by = c("name" = "artist"))
## # A tibble: 4 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
full_join(band_members, band_instruments2, by = c("name" = "artist"), keep = TRUE)
## # A tibble: 4 × 4
## name band artist plays
## <chr> <chr> <chr> <chr>
## 1 Mick Stones <NA> <NA>
## 2 John Beatles John guitar
## 3 Paul Beatles Paul bass
## 4 <NA> <NA> Keith guitar
copy - if the datasets are from different sources and copy = TRUE then y will be copied across to the datasource where x is located.
suffix - if a variable name occurs in both datasets, and is not used as part of the join, a suffix is added to ensure variable names are unique. By default “.x” and “.y” are added to the variable names but other suffixes can be specified.
x %>% full_join(y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
band_members %>% full_join(band_instruments)
## Joining, by = "name"
## # A tibble: 4 × 3
## name band plays
## <chr> <chr> <chr>
## 1 Mick Stones <NA>
## 2 John Beatles guitar
## 3 Paul Beatles bass
## 4 Keith <NA> guitar
semi_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
band_members %>% semi_join(band_instruments)
## Joining, by = "name"
## # A tibble: 2 × 2
## name band
## <chr> <chr>
## 1 John Beatles
## 2 Paul Beatles
anti_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
band_members %>% anti_join(band_instruments)
## Joining, by = "name"
## # A tibble: 1 × 2
## name band
## <chr> <chr>
## 1 Mick Stones
nest_join(x, y, by = NULL, copy = FALSE, suffix = c(“.x”, “.y”), …)
band_members %>% nest_join(band_instruments)
## Joining, by = "name"
## # A tibble: 3 × 3
## name band band_instruments
## <chr> <chr> <list>
## 1 Mick Stones <tibble [0 × 1]>
## 2 John Beatles <tibble [1 × 1]>
## 3 Paul Beatles <tibble [1 × 1]>
(band_members %>% nest_join(band_instruments))$band_instruments
## Joining, by = "name"
## [[1]]
## # A tibble: 0 × 1
## # … with 1 variable: plays <chr>
##
## [[2]]
## # A tibble: 1 × 1
## plays
## <chr>
## 1 guitar
##
## [[3]]
## # A tibble: 1 × 1
## plays
## <chr>
## 1 bass
library(nycflights13)
airlines
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E Endeavor Air Inc.
## 2 AA American Airlines Inc.
## 3 AS Alaska Airlines Inc.
## 4 B6 JetBlue Airways
## 5 DL Delta Air Lines Inc.
## 6 EV ExpressJet Airlines Inc.
## 7 F9 Frontier Airlines Inc.
## 8 FL AirTran Airways Corporation
## 9 HA Hawaiian Airlines Inc.
## 10 MQ Envoy Air
## 11 OO SkyWest Airlines Inc.
## 12 UA United Air Lines Inc.
## 13 US US Airways Inc.
## 14 VX Virgin America
## 15 WN Southwest Airlines Co.
## 16 YV Mesa Airlines Inc.
carrier: Abreviación de dos letras
name: Nombre completo de la aerolínea
str(airlines)
## tibble [16 × 2] (S3: tbl_df/tbl/data.frame)
## $ carrier: chr [1:16] "9E" "AA" "AS" "B6" ...
## $ name : chr [1:16] "Endeavor Air Inc." "American Airlines Inc." "Alaska Airlines Inc." "JetBlue Airways" ...
airports
## # A tibble: 1,458 × 8
## faa name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/…
## 2 06A Moton Field Municipal Airport 32.5 -85.7 264 -6 A America/…
## 3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/…
## 4 06N Randall Airport 41.4 -74.4 523 -5 A America/…
## 5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/…
## 6 0A9 Elizabethton Municipal Airport 36.4 -82.2 1593 -5 A America/…
## 7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/…
## 8 0G7 Finger Lakes Regional Airport 42.9 -76.8 492 -5 A America/…
## 9 0P2 Shoestring Aviation Airfield 39.8 -76.6 1000 -5 U America/…
## 10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/…
## # … with 1,448 more rows
str(airports)
## tibble [1,458 × 8] (S3: tbl_df/tbl/data.frame)
## $ faa : chr [1:1458] "04G" "06A" "06C" "06N" ...
## $ name : chr [1:1458] "Lansdowne Airport" "Moton Field Municipal Airport" "Schaumburg Regional" "Randall Airport" ...
## $ lat : num [1:1458] 41.1 32.5 42 41.4 31.1 ...
## $ lon : num [1:1458] -80.6 -85.7 -88.1 -74.4 -81.4 ...
## $ alt : num [1:1458] 1044 264 801 523 11 ...
## $ tz : num [1:1458] -5 -6 -6 -5 -5 -5 -5 -5 -5 -8 ...
## $ dst : chr [1:1458] "A" "A" "A" "A" ...
## $ tzone: chr [1:1458] "America/New_York" "America/Chicago" "America/Chicago" "America/New_York" ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. name = col_character(),
## .. city = col_character(),
## .. country = col_character(),
## .. faa = col_character(),
## .. icao = col_character(),
## .. lat = col_double(),
## .. lon = col_double(),
## .. alt = col_double(),
## .. tz = col_double(),
## .. dst = col_character(),
## .. tzone = col_character()
## .. )
planes
## # A tibble: 3,322 × 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 2 N102UW 1998 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 3 N103US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 4 N104UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 5 N10575 2002 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 6 N105UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 7 N107US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 8 N108UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 9 N109UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 10 N110UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## # … with 3,312 more rows
str(planes)
## tibble [3,322 × 9] (S3: tbl_df/tbl/data.frame)
## $ tailnum : chr [1:3322] "N10156" "N102UW" "N103US" "N104UW" ...
## $ year : int [1:3322] 2004 1998 1999 1999 2002 1999 1999 1999 1999 1999 ...
## $ type : chr [1:3322] "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" "Fixed wing multi engine" ...
## $ manufacturer: chr [1:3322] "EMBRAER" "AIRBUS INDUSTRIE" "AIRBUS INDUSTRIE" "AIRBUS INDUSTRIE" ...
## $ model : chr [1:3322] "EMB-145XR" "A320-214" "A320-214" "A320-214" ...
## $ engines : int [1:3322] 2 2 2 2 2 2 2 2 2 2 ...
## $ seats : int [1:3322] 55 182 182 182 55 182 182 182 182 182 ...
## $ speed : int [1:3322] NA NA NA NA NA NA NA NA NA NA ...
## $ engine : chr [1:3322] "Turbo-fan" "Turbo-fan" "Turbo-fan" "Turbo-fan" ...
weather
## # A tibble: 26,115 × 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>
str(weather)
## tibble [26,115 × 15] (S3: tbl_df/tbl/data.frame)
## $ origin : chr [1:26115] "EWR" "EWR" "EWR" "EWR" ...
## $ year : int [1:26115] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:26115] 1 1 1 1 1 1 1 1 1 1 ...
## $ hour : int [1:26115] 1 2 3 4 5 6 7 8 9 10 ...
## $ temp : num [1:26115] 39 39 39 39.9 39 ...
## $ dewp : num [1:26115] 26.1 27 28 28 28 ...
## $ humid : num [1:26115] 59.4 61.6 64.4 62.2 64.4 ...
## $ wind_dir : num [1:26115] 270 250 240 250 260 240 240 250 260 260 ...
## $ wind_speed: num [1:26115] 10.36 8.06 11.51 12.66 12.66 ...
## $ wind_gust : num [1:26115] NA NA NA NA NA NA NA NA NA NA ...
## $ precip : num [1:26115] 0 0 0 0 0 0 0 0 0 0 ...
## $ pressure : num [1:26115] 1012 1012 1012 1012 1012 ...
## $ visib : num [1:26115] 10 10 10 10 10 10 10 10 10 10 ...
## $ time_hour : POSIXct[1:26115], format: "2013-01-01 01:00:00" "2013-01-01 02:00:00" ...
flights se conecta a planes a través de una sola variable, tailnum.
flights se conecta a arilines a través de la variable carrier.
flights se conecta a airports de dos formas: a través de las variables de origin y dest.
flights se conecta al weather a través de origin (la ubicación) y el year, month, day y hour (la hora).
planes %>%
count(tailnum) %>%
filter(n > 1)
## # A tibble: 0 × 2
## # … with 2 variables: tailnum <chr>, n <int>
weather %>%
count(year, month, day, hour, origin) %>%
filter(n > 1)
## # A tibble: 3 × 6
## year month day hour origin n
## <int> <int> <int> <int> <chr> <int>
## 1 2013 11 3 1 EWR 2
## 2 2013 11 3 1 JFK 2
## 3 2013 11 3 1 LGA 2
flights %>%
count(year, month, day, flight) %>%
filter(n > 1)
## # A tibble: 29,768 × 5
## year month day flight n
## <int> <int> <int> <int> <int>
## 1 2013 1 1 1 2
## 2 2013 1 1 3 2
## 3 2013 1 1 4 2
## 4 2013 1 1 11 3
## 5 2013 1 1 15 2
## 6 2013 1 1 21 2
## 7 2013 1 1 27 4
## 8 2013 1 1 31 2
## 9 2013 1 1 32 2
## 10 2013 1 1 35 2
## # … with 29,758 more rows
flights %>%
count(year, month, day, tailnum) %>%
filter(n > 1)
## # A tibble: 64,928 × 5
## year month day tailnum n
## <int> <int> <int> <chr> <int>
## 1 2013 1 1 N0EGMQ 2
## 2 2013 1 1 N11189 2
## 3 2013 1 1 N11536 2
## 4 2013 1 1 N11544 3
## 5 2013 1 1 N11551 2
## 6 2013 1 1 N12540 2
## 7 2013 1 1 N12567 2
## 8 2013 1 1 N13123 2
## 9 2013 1 1 N13538 3
## 10 2013 1 1 N13566 3
## # … with 64,918 more rows
flights2 <- flights %>%
select(year:day, hour, origin, dest, tailnum, carrier)
flights2
## # A tibble: 336,776 × 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
flights2 %>%
select(-origin, -dest) %>%
left_join(airlines, by = "carrier")
## # A tibble: 336,776 × 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
flights2 %>%
select(-origin, -dest) %>%
mutate(name = airlines$name[match(carrier, airlines$carrier)])
## # A tibble: 336,776 × 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
flights2 %>%
select(-origin, -dest) %>%
right_join(airlines, by = "carrier")
## # A tibble: 336,776 × 7
## year month day hour tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr>
## 1 2013 1 1 5 N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 N619AA AA American Airlines Inc.
## 4 2013 1 1 5 N804JB B6 JetBlue Airways
## 5 2013 1 1 6 N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 N39463 UA United Air Lines Inc.
## 7 2013 1 1 6 N516JB B6 JetBlue Airways
## 8 2013 1 1 6 N829AS EV ExpressJet Airlines Inc.
## 9 2013 1 1 6 N593JB B6 JetBlue Airways
## 10 2013 1 1 6 N3ALAA AA American Airlines Inc.
## # … with 336,766 more rows
airlines %>%
select(-name) %>%
mutate(name = flights2$carrier[match(carrier, flights2$carrier)])
## # A tibble: 16 × 2
## carrier name
## <chr> <chr>
## 1 9E 9E
## 2 AA AA
## 3 AS AS
## 4 B6 B6
## 5 DL DL
## 6 EV EV
## 7 F9 F9
## 8 FL FL
## 9 HA HA
## 10 MQ MQ
## 11 OO OO
## 12 UA UA
## 13 US US
## 14 VX VX
## 15 WN WN
## 16 YV YV
flights2 %>%
left_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 336,776 × 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8
## # … with 336,766 more rows, and 7 more variables: wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
weather %>%
left_join(flights2)
## Joining, by = c("origin", "year", "month", "day", "hour")
## # A tibble: 341,957 × 18
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 7 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 8 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 9 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 10 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## # … with 341,947 more rows, and 8 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, dest <chr>,
## # tailnum <chr>, carrier <chr>
flights2 %>%
right_join(weather)
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 341,957 × 18
## year month day hour origin dest tailnum carrier temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA 39.9 25.0 54.8
## # … with 341,947 more rows, and 7 more variables: wind_dir <dbl>,
## # wind_speed <dbl>, wind_gust <dbl>, precip <dbl>, pressure <dbl>,
## # visib <dbl>, time_hour <dttm>
weather %>%
right_join(flights2)
## Joining, by = c("origin", "year", "month", "day", "hour")
## # A tibble: 336,776 × 18
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 2 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 3 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 4 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 5 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 8 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 9 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 10 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## # … with 336,766 more rows, and 8 more variables: wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, dest <chr>,
## # tailnum <chr>, carrier <chr>
flights2 %>%
left_join(planes, by = "tailnum")
## # A tibble: 336,776 × 16
## year.x month day hour origin dest tailnum carrier year.y type
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixed wing mult…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixed wing mult…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixed wing mult…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixed wing mult…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixed wing mult…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixed wing mult…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixed wing mult…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixed wing mult…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixed wing mult…
## 10 2013 1 1 6 LGA ORD N3ALAA AA NA <NA>
## # … with 336,766 more rows, and 6 more variables: manufacturer <chr>,
## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
flights2 %>%
right_join(planes, by = "tailnum")
## # A tibble: 284,170 × 16
## year.x month day hour origin dest tailnum carrier year.y type
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <int> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA 1999 Fixed wing mult…
## 2 2013 1 1 5 LGA IAH N24211 UA 1998 Fixed wing mult…
## 3 2013 1 1 5 JFK MIA N619AA AA 1990 Fixed wing mult…
## 4 2013 1 1 5 JFK BQN N804JB B6 2012 Fixed wing mult…
## 5 2013 1 1 6 LGA ATL N668DN DL 1991 Fixed wing mult…
## 6 2013 1 1 5 EWR ORD N39463 UA 2012 Fixed wing mult…
## 7 2013 1 1 6 EWR FLL N516JB B6 2000 Fixed wing mult…
## 8 2013 1 1 6 LGA IAD N829AS EV 1998 Fixed wing mult…
## 9 2013 1 1 6 JFK MCO N593JB B6 2004 Fixed wing mult…
## 10 2013 1 1 6 JFK PBI N793JB B6 2011 Fixed wing mult…
## # … with 284,160 more rows, and 6 more variables: manufacturer <chr>,
## # model <chr>, engines <int>, seats <int>, speed <int>, engine <chr>
flights2 %>%
left_join(airports, c("dest" = "faa"))
## # A tibble: 336,776 × 15
## year month day hour origin dest tailnum carrier name lat lon alt
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Georg… 30.0 -95.3 97
## 2 2013 1 1 5 LGA IAH N24211 UA Georg… 30.0 -95.3 97
## 3 2013 1 1 5 JFK MIA N619AA AA Miami… 25.8 -80.3 8
## 4 2013 1 1 5 JFK BQN N804JB B6 <NA> NA NA NA
## 5 2013 1 1 6 LGA ATL N668DN DL Harts… 33.6 -84.4 1026
## 6 2013 1 1 5 EWR ORD N39463 UA Chica… 42.0 -87.9 668
## 7 2013 1 1 6 EWR FLL N516JB B6 Fort … 26.1 -80.2 9
## 8 2013 1 1 6 LGA IAD N829AS EV Washi… 38.9 -77.5 313
## 9 2013 1 1 6 JFK MCO N593JB B6 Orlan… 28.4 -81.3 96
## 10 2013 1 1 6 LGA ORD N3ALAA AA Chica… 42.0 -87.9 668
## # … with 336,766 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## # tzone <chr>
flights2 %>%
right_join(airports, c("dest" = "faa"))
## # A tibble: 330,531 × 15
## year month day hour origin dest tailnum carrier name lat lon alt
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Georg… 30.0 -95.3 97
## 2 2013 1 1 5 LGA IAH N24211 UA Georg… 30.0 -95.3 97
## 3 2013 1 1 5 JFK MIA N619AA AA Miami… 25.8 -80.3 8
## 4 2013 1 1 6 LGA ATL N668DN DL Harts… 33.6 -84.4 1026
## 5 2013 1 1 5 EWR ORD N39463 UA Chica… 42.0 -87.9 668
## 6 2013 1 1 6 EWR FLL N516JB B6 Fort … 26.1 -80.2 9
## 7 2013 1 1 6 LGA IAD N829AS EV Washi… 38.9 -77.5 313
## 8 2013 1 1 6 JFK MCO N593JB B6 Orlan… 28.4 -81.3 96
## 9 2013 1 1 6 LGA ORD N3ALAA AA Chica… 42.0 -87.9 668
## 10 2013 1 1 6 JFK PBI N793JB B6 Palm … 26.7 -80.1 19
## # … with 330,521 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## # tzone <chr>
flights2 %>%
left_join(airports, c("origin" = "faa"))
## # A tibble: 336,776 × 15
## year month day hour origin dest tailnum carrier name lat lon alt
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Newar… 40.7 -74.2 18
## 2 2013 1 1 5 LGA IAH N24211 UA La Gu… 40.8 -73.9 22
## 3 2013 1 1 5 JFK MIA N619AA AA John … 40.6 -73.8 13
## 4 2013 1 1 5 JFK BQN N804JB B6 John … 40.6 -73.8 13
## 5 2013 1 1 6 LGA ATL N668DN DL La Gu… 40.8 -73.9 22
## 6 2013 1 1 5 EWR ORD N39463 UA Newar… 40.7 -74.2 18
## 7 2013 1 1 6 EWR FLL N516JB B6 Newar… 40.7 -74.2 18
## 8 2013 1 1 6 LGA IAD N829AS EV La Gu… 40.8 -73.9 22
## 9 2013 1 1 6 JFK MCO N593JB B6 John … 40.6 -73.8 13
## 10 2013 1 1 6 LGA ORD N3ALAA AA La Gu… 40.8 -73.9 22
## # … with 336,766 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## # tzone <chr>
flights2 %>%
right_join(airports, c("origin" = "faa"))
## # A tibble: 338,231 × 15
## year month day hour origin dest tailnum carrier name lat lon alt
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Newar… 40.7 -74.2 18
## 2 2013 1 1 5 LGA IAH N24211 UA La Gu… 40.8 -73.9 22
## 3 2013 1 1 5 JFK MIA N619AA AA John … 40.6 -73.8 13
## 4 2013 1 1 5 JFK BQN N804JB B6 John … 40.6 -73.8 13
## 5 2013 1 1 6 LGA ATL N668DN DL La Gu… 40.8 -73.9 22
## 6 2013 1 1 5 EWR ORD N39463 UA Newar… 40.7 -74.2 18
## 7 2013 1 1 6 EWR FLL N516JB B6 Newar… 40.7 -74.2 18
## 8 2013 1 1 6 LGA IAD N829AS EV La Gu… 40.8 -73.9 22
## 9 2013 1 1 6 JFK MCO N593JB B6 John … 40.6 -73.8 13
## 10 2013 1 1 6 LGA ORD N3ALAA AA La Gu… 40.8 -73.9 22
## # … with 338,221 more rows, and 3 more variables: tz <dbl>, dst <chr>,
## # tzone <chr>
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point(color = "orange") +
coord_quickmap()
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat, shape = dst)) +
borders("state") +
geom_point(color = "orange") +
coord_quickmap()
flights %>%
anti_join(planes, "tailnum")
## # A tibble: 52,606 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 558 600 -2 753 745
## 2 2013 1 1 559 600 -1 941 910
## 3 2013 1 1 600 600 0 837 825
## 4 2013 1 1 602 605 -3 821 805
## 5 2013 1 1 608 600 8 807 735
## 6 2013 1 1 611 600 11 945 931
## 7 2013 1 1 623 610 13 920 915
## 8 2013 1 1 624 630 -6 840 830
## 9 2013 1 1 628 630 -2 1137 1140
## 10 2013 1 1 629 630 -1 824 810
## # … with 52,596 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
planes %>%
anti_join(flights, "tailnum")
## # A tibble: 0 × 9
## # … with 9 variables: tailnum <chr>, year <int>, type <chr>,
## # manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
## # engine <chr>
airports %>%
rename(dest = faa) %>%
semi_join(flights) %>%
ggplot(aes(lon, lat, size = alt)) +
borders("state") +
geom_point(color = "orange") +
coord_quickmap()
## Joining, by = "dest"
flights %>%
anti_join(airports %>%
rename(dest = faa))
## Joining, by = "dest"
## # A tibble: 7,602 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 544 545 -1 1004 1022
## 2 2013 1 1 615 615 0 1039 1100
## 3 2013 1 1 628 630 -2 1137 1140
## 4 2013 1 1 701 700 1 1123 1154
## 5 2013 1 1 711 715 -4 1151 1206
## 6 2013 1 1 820 820 0 1254 1310
## 7 2013 1 1 820 820 0 1249 1329
## 8 2013 1 1 840 845 -5 1311 1350
## 9 2013 1 1 909 810 59 1331 1315
## 10 2013 1 1 913 918 -5 1346 1416
## # … with 7,592 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
airports %>%
rename(origin = faa) %>%
semi_join(flights)
## Joining, by = "origin"
## # A tibble: 3 × 8
## origin name lat lon alt tz dst tzone
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 EWR Newark Liberty Intl 40.7 -74.2 18 -5 A America/New_York
## 2 JFK John F Kennedy Intl 40.6 -73.8 13 -5 A America/New_York
## 3 LGA La Guardia 40.8 -73.9 22 -5 A America/New_York
flights %>% filter(dep_time >= 600, dep_time <= 605)
## # A tibble: 2,460 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 600 600 0 851 858
## 2 2013 1 1 600 600 0 837 825
## 3 2013 1 1 601 600 1 844 850
## 4 2013 1 1 602 610 -8 812 820
## 5 2013 1 1 602 605 -3 821 805
## 6 2013 1 2 600 600 0 814 749
## 7 2013 1 2 600 605 -5 751 818
## 8 2013 1 2 600 600 0 819 815
## 9 2013 1 2 600 600 0 846 846
## 10 2013 1 2 600 600 0 737 725
## # … with 2,450 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
head(10)
top_dest
## # A tibble: 10 × 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
top_dest <- flights %>%
count(dest, sort = TRUE) %>%
tail(10)
top_dest
## # A tibble: 10 × 2
## dest n
## <chr> <int>
## 1 BZN 36
## 2 JAC 25
## 3 PSP 19
## 4 EYW 17
## 5 HDN 15
## 6 MTJ 15
## 7 SBN 10
## 8 ANC 8
## 9 LEX 1
## 10 LGA 1
flights %>%
select(carrier, dep_delay) %>%
filter(dep_delay > 60) %>%
head()
## # A tibble: 6 × 2
## carrier dep_delay
## <chr> <dbl>
## 1 MQ 101
## 2 AA 71
## 3 MQ 853
## 4 UA 144
## 5 UA 134
## 6 EV 96
flights %>%
select(carrier, dep_delay) %>%
filter(dep_delay < 60) %>%
tail()
## # A tibble: 6 × 2
## carrier dep_delay
## <chr> <dbl>
## 1 B6 -8
## 2 B6 -5
## 3 B6 -10
## 4 B6 -5
## 5 B6 12
## 6 B6 -10
flights %>%
filter(dest %in% top_dest$dest)
## # A tibble: 147 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 848 851 -3 1155 1136
## 2 2013 1 2 900 859 1 1146 1144
## 3 2013 1 5 829 830 -1 1047 1111
## 4 2013 1 5 850 825 25 1132 1123
## 5 2013 1 5 859 901 -2 1144 1201
## 6 2013 1 5 1048 1035 13 1443 1358
## 7 2013 1 5 1101 1055 6 1342 1400
## 8 2013 1 12 825 825 0 1115 1123
## 9 2013 1 12 827 830 -3 1112 1111
## 10 2013 1 12 859 901 -2 1146 1201
## # … with 137 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
semi_join(top_dest)
## Joining, by = "dest"
## # A tibble: 147 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 848 851 -3 1155 1136
## 2 2013 1 2 900 859 1 1146 1144
## 3 2013 1 5 829 830 -1 1047 1111
## 4 2013 1 5 850 825 25 1132 1123
## 5 2013 1 5 859 901 -2 1144 1201
## 6 2013 1 5 1048 1035 13 1443 1358
## 7 2013 1 5 1101 1055 6 1342 1400
## 8 2013 1 12 825 825 0 1115 1123
## 9 2013 1 12 827 830 -3 1112 1111
## 10 2013 1 12 859 901 -2 1146 1201
## # … with 137 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
flights %>%
anti_join(planes, by = "tailnum") %>%
count(tailnum, sort = TRUE)
## # A tibble: 722 × 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N713MQ 483
## 6 N735MQ 396
## 7 N0EGMQ 371
## 8 N534MQ 364
## 9 N542MQ 363
## 10 N531MQ 349
## # … with 712 more rows
flights %>%
group_by(month) %>%
summarise(dist = sum(distance))
## # A tibble: 12 × 2
## month dist
## <int> <dbl>
## 1 1 27188805
## 2 2 24975509
## 3 3 29179636
## 4 4 29427294
## 5 5 29974128
## 6 6 29856388
## 7 7 31149199
## 8 8 31149334
## 9 9 28711426
## 10 10 30012086
## 11 11 28639718
## 12 12 29954084
flights %>%
group_by(dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(tailnum)) %>%
head()
## # A tibble: 6 × 3
## dest flight_count plane_count
## <chr> <int> <int>
## 1 ABQ 254 108
## 2 ACK 265 58
## 3 ALB 439 172
## 4 ANC 8 6
## 5 ATL 17215 1180
## 6 AUS 2439 993
flights %>%
group_by(carrier) %>%
select(month, day, dep_delay) %>%
filter(min_rank(desc(dep_delay)) <= 2 & dep_delay!="NA") %>%
arrange(carrier, desc(dep_delay)) %>%
head()
## Adding missing grouping variables: `carrier`
## # A tibble: 6 × 4
## # Groups: carrier [3]
## carrier month day dep_delay
## <chr> <int> <int> <dbl>
## 1 9E 2 16 747
## 2 9E 7 24 430
## 3 AA 9 20 1014
## 4 AA 12 5 896
## 5 AS 5 23 225
## 6 AS 1 20 222
flights_latlon <- flights %>%
inner_join(select(airports, origin = faa, origin_lat = lat, origin_lon = lon),
by = "origin"
) %>%
inner_join(select(airports, dest = faa, dest_lat = lat, dest_lon = lon),
by = "dest"
)
flights_latlon %>%
filter(origin == "JFK") %>%
ggplot(aes(
x = origin_lon, xend = dest_lon,
y = origin_lat, yend = dest_lat
)) +
borders("state") +
geom_segment(arrow = arrow(length = unit(0.1, "cm")), color = "orange") +
coord_quickmap() +
labs(y = "Latitude", x = "Longitude")
flights_latlon %>%
filter(dest %in% c("MIA", "FLL", "SFO", "CLT", "MCO", "BOS", "LAX", "ATL", "ORD")) %>%
ggplot(aes(
x = origin_lon, xend = dest_lon,
y = origin_lat, yend = dest_lat
)) +
borders("state") +
geom_segment(arrow = arrow(length = unit(0.1, "cm")), color = "orange") +
coord_quickmap() +
labs(y = "Latitude", x = "Longitude")
flights_latlon %>%
slice(1:100) %>%
ggplot(aes(
x = origin_lon, xend = dest_lon,
y = origin_lat, yend = dest_lat
)) +
borders("state") +
geom_segment(arrow = arrow(length = unit(0.1, "cm")), color = "orange") +
coord_quickmap() +
labs(y = "Latitude", x = "Longitude")
alaska_flights <- flights %>%
filter(carrier == "AS")
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +
geom_point()
## Warning: Removed 5 rows containing missing values (geom_point).
ggplot(alaska_flights, aes(x = dep_delay, y = arr_delay)) +
geom_point(alpha =0.3 )
## Warning: Removed 5 rows containing missing values (geom_point).
ggplot(data = alaska_flights, mapping = aes(x = dep_delay, y = arr_delay)) +
geom_jitter(width = 30, height = 30)
## Warning: Removed 5 rows containing missing values (geom_point).
early_january_weather <- weather %>%
filter(origin == "EWR" & month == 1 & day <= 15)
ggplot(early_january_weather, aes(x = time_hour, y=temp)) +
geom_line()
ggplot(data = weather, mapping = aes(x = temp)) +
geom_histogram(bins = 40, color = "white", fill = "orange")
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(data = weather, mapping = aes(x = temp)) +
geom_histogram(binwidth = 10, color = "orange", fill = "yellow")
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(data = weather, mapping = aes(x = temp)) +
geom_histogram(binwidth = 5, color = "orange") +
facet_wrap(~ month)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(data = weather, mapping = aes(x = temp)) +
geom_histogram(binwidth = 5, color = "orange") +
facet_wrap(~ month, nrow = 4)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
ggplot(data = weather, mapping = aes(x = month, y = temp)) +
geom_boxplot(fill = "orange")
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
ggplot(data = weather, mapping = aes(x = factor(month), y = temp)) +
geom_boxplot(fill = "orange")
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
ggplot(data = flights, mapping = aes(x = carrier)) +
geom_bar(fill="orange", colour = "yellow")
flights_counted <- flights %>%
group_by(carrier) %>%
summarize(number=n())
flights_counted
## # A tibble: 16 × 2
## carrier number
## <chr> <int>
## 1 9E 18460
## 2 AA 32729
## 3 AS 714
## 4 B6 54635
## 5 DL 48110
## 6 EV 54173
## 7 F9 685
## 8 FL 3260
## 9 HA 342
## 10 MQ 26397
## 11 OO 32
## 12 UA 58665
## 13 US 20536
## 14 VX 5162
## 15 WN 12275
## 16 YV 601
ggplot(flights_counted, aes(x=carrier, y=number)) +
geom_col(fill="red", colour = "orange")
ggplot(data = flights, mapping = aes(x = carrier, fill = origin)) +
geom_bar()
ggplot(data = flights, mapping = aes(x = carrier, color = origin)) +
geom_bar()
ggplot(data = flights, mapping = aes(x = carrier, fill = origin)) +
geom_bar(position = "dodge")
ggplot(flights, aes(x=carrier)) +
geom_bar(colour="orange") +
facet_wrap(~ origin, ncol=1)
library(echarts4r)
top_destinations <- flights %>%
count(dest) %>%
top_n(15, n) %>%
arrange(n)
top_destinations %>%
e_charts(x = dest) %>%
e_bar(n, legend = FALSE, name = "Flights") %>%
e_labels(position = "right") %>%
e_tooltip() %>%
e_title("Flights by destination", "Top 15 destinations") %>%
e_flip_coords() %>%
e_y_axis(splitLine = list(show = FALSE)) %>%
e_x_axis(show = FALSE) %>%
e_toolbox_feature(
feature = "saveAsImage",
title = "Save as image"
)
flights_daytime <- flights %>%
transmute(origin, daytime = case_when(
hour >= 22 & hour < 6 ~ "Night",
hour >= 6 & hour < 12 ~ "Morning",
hour >= 12 & hour < 18 ~ "Afternoon",
TRUE ~ "Evening"
)) %>%
count(origin, daytime) %>%
group_by(daytime)
flights_daytime %>%
e_charts(origin, stack = "grp") %>%
e_bar(n) %>%
e_tooltip(
trigger = "axis",
axisPointer = list(
type = "shadow"
)
) %>%
e_title(
text = "Outgoing flights by time of day",
subtext = "There are no night flights"
) %>%
e_y_axis(
splitArea = list(show = FALSE),
splitLine = list(show = FALSE)
)
set.seed(123)
flights_sm <- flights %>%
filter(complete.cases(.)) %>%
sample_n(1000)
flights_sm %>%
e_charts(x = dep_delay) %>%
e_scatter(arr_delay, name = "Flight") %>%
e_lm(arr_delay ~ dep_delay, name = "Linear model") %>%
e_axis_labels(x = "Departure delay", y = "Arrival delay") %>%
e_title(
text = "Arrival delay vs. departure delay",
subtext = "The later you start, the later you finish"
) %>%
e_x_axis(
nameLocation = "center",
splitArea = list(show = FALSE),
axisLabel = list(margin = 3),
axisPointer = list(
show = TRUE,
lineStyle = list(
color = "#999999",
width = 0.75,
type = "dotted"
)
)
) %>%
e_y_axis(
nameLocation = "center",
splitArea = list(show = FALSE),
axisLabel = list(margin = 0),
axisPointer = list(
show = TRUE,
lineStyle = list(
color = "#999999",
width = 0.75,
type = "dotted"
)
)
)
n_bins <- 100
flights %>%
filter(complete.cases(.)) %>%
mutate(
arr_delay = cut(arr_delay, n_bins),
dep_delay = cut(dep_delay, n_bins)
) %>%
count(arr_delay, dep_delay) %>%
e_charts(dep_delay) %>%
e_heatmap(arr_delay, n) %>%
e_visual_map(n) %>%
e_title("Arrival delay vs. departure delay") %>%
e_axis_labels("Departure delay", "Arrival delay")
pie <- count(flights, origin) %>%
e_charts(x = origin) %>%
e_pie(n, legend = FALSE, name = "Flights") %>%
e_tooltip() %>%
e_title("Flights by origin", "This is really hard with ggplot2")
pie
flights_ts <- flights %>%
transmute(week = as.Date(cut(time_hour, "week")), dep_delay, origin) %>%
group_by(origin, week) %>% # works with echarts
summarise(dep_delay = sum(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.
ts_base <- flights_ts %>%
e_charts(x = week) %>%
e_datazoom(
type = "slider",
toolbox = FALSE,
bottom = -5
) %>%
e_tooltip() %>%
e_title("Departure delays by airport") %>%
e_x_axis(week, axisPointer = list(show = TRUE))
ts_base %>% e_line(dep_delay)
flights_ts <- flights %>%
transmute(week = as.Date(cut(time_hour, "week")), dep_delay, origin) %>%
group_by(origin, week) %>% # works with echarts
summarise(dep_delay = sum(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'origin'. You can override using the `.groups` argument.
ts_base <- flights_ts %>%
e_charts(x = week) %>%
e_datazoom(
type = "slider",
toolbox = FALSE,
bottom = -5
) %>%
e_tooltip() %>%
e_title("Departure delays by airport") %>%
e_x_axis(week, axisPointer = list(show = TRUE))
ts_base %>% e_line(dep_delay)
area <- ts_base %>% e_area(dep_delay, stack = "grp")
area
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
flights_ts %>%
filter(origin == "JFK") %>%
group_by(month = month(week, label = TRUE)) %>%
e_charts(x = week, timeline = TRUE) %>%
e_bar(
dep_delay,
name = "Departure Delay",
symbol = "none",
legend = FALSE
)
flights %>%
select(dep_time, sched_dep_time, dep_delay, arr_time, sched_arr_time, arr_delay, air_time, distance,hour) %>%
na.omit() %>%
cor() %>%
e_charts() %>%
e_correlations(order = "hclust") %>%
e_tooltip()
df <- data.frame(
x = c(
rnorm(100),
runif(100, -5, 10),
rnorm(100, 10, 3)
),
grp = c(
rep(LETTERS[1], 100),
rep(LETTERS[2], 100),
rep(LETTERS[3], 100)
)
)
df %>%
group_by(grp) %>%
e_charts() %>%
e_boxplot(x)