library(nycflights13)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# tarea 5.2.4
#Find all flights that
glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"...
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94...
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013...
#1.1. Had an arrival delay of two or more hours
flights %>%
filter(arr_delay >= 120) %>%
mutate(hora=arr_delay /60 ) %>%
select(flight, carrier, origin, dest, hora) %>%
arrange(desc(hora))
## # A tibble: 10,200 x 5
## flight carrier origin dest hora
## <int> <chr> <chr> <chr> <dbl>
## 1 51 HA JFK HNL 21.2
## 2 3535 MQ JFK CMH 18.8
## 3 3695 MQ EWR ORD 18.5
## 4 177 AA JFK SFO 16.8
## 5 3075 MQ JFK CVG 16.5
## 6 2391 DL JFK TPA 15.5
## 7 2119 DL LGA MSP 15.2
## 8 2047 DL LGA ATL 14.9
## 9 172 AA EWR MIA 14.6
## 10 3744 MQ EWR ORD 14.6
## # ... with 10,190 more rows
#1.2. Flew to Houston (IAH or HOU)
flights %>%
filter(dest == 'IAH' | dest =='HOU') %>%
group_by(dest) %>%
summarize(cnt = n())
## # A tibble: 2 x 2
## dest cnt
## <chr> <int>
## 1 HOU 2115
## 2 IAH 7198
#1.3. Were operated by United, American, or Delta
flights %>%
filter(carrier == 'UA' | carrier == 'AA' | carrier == 'DL')%>%
group_by(carrier) %>%
summarize(cnt = n())
#1.4. Departed in summer (July, August, and September)
flights %>%
filter(month == c(7,8,9))%>%
group_by(month) %>%
summarize(cnt = n())
## Warning in month == c(7, 8, 9): longer object length is not a multiple of
## shorter object length
## # A tibble: 3 x 2
## month cnt
## <int> <int>
## 1 7 9808
## 2 8 9775
## 3 9 9191
#1.5. Arrived more than two hours late, but didn’t leave late
flights %>%
filter(arr_delay >= 120) %>%
filter(dep_delay < 5) %>%
mutate(hora_retraso=arr_delay /60 ) %>%
mutate(hora_salida=dep_delay /60 ) %>%
select(flight, carrier, origin, dest, hora_retraso,hora_salida) %>%
arrange(desc(hora_retraso))
## # A tibble: 35 x 6
## flight carrier origin dest hora_retraso hora_salida
## <int> <chr> <chr> <chr> <dbl> <dbl>
## 1 399 VX JFK LAX 3.23 -0.0333
## 2 707 AA LGA DFW 2.98 -0.0333
## 3 2183 US LGA DCA 2.45 -0.0167
## 4 1619 DL LGA MSP 2.42 -0.15
## 5 2083 AA EWR DFW 2.38 -0.0833
## 6 2402 B6 JFK BUF 2.37 -0.0333
## 7 341 AA LGA ORD 2.35 0.05
## 8 389 UA JFK SFO 2.33 -0.05
## 9 1057 DL LGA MIA 2.33 0
## 10 706 DL JFK AUS 2.3 -0.05
## # ... with 25 more rows
#1.6. Were delayed by at least an hour, but made up over 30 minutes in flight
flights %>%
filter(arr_delay >= 60) %>%
filter(air_time < 30) %>%
mutate(retraso=arr_delay /60 ) %>%
mutate(hora_vuelo=air_time ) %>%
select(flight, carrier, origin, dest, retraso,hora_vuelo) %>%
arrange(desc(hora_vuelo))
## # A tibble: 103 x 6
## flight carrier origin dest retraso hora_vuelo
## <int> <chr> <chr> <chr> <dbl> <dbl>
## 1 3609 9E JFK PHL 1.97 29
## 2 4271 EV EWR ALB 4.47 29
## 3 4233 EV EWR ALB 1.02 29
## 4 522 UA EWR BDL 2.27 29
## 5 3832 EV EWR PVD 1.37 29
## 6 3638 9E JFK PHL 2.22 29
## 7 4309 EV EWR ALB 2.78 29
## 8 4091 EV EWR PVD 3.27 29
## 9 4088 EV EWR ALB 1.33 29
## 10 4404 EV EWR PVD 1.03 29
## # ... with 93 more rows
#1.7. Departed between midnight and 6am (inclusive)
flights %>%
filter(hour >= 0 & hour<=6) %>%
group_by(hour) %>%
summarize(cnt = n())
## # A tibble: 3 x 2
## hour cnt
## <dbl> <int>
## 1 1 1
## 2 5 1953
## 3 6 25951
#2.Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
flights %>%
filter(between(hour, 0 ,6) ) %>%
group_by(hour) %>%
summarize(cnt = n())
## # A tibble: 3 x 2
## hour cnt
## <dbl> <int>
## 1 1 1
## 2 5 1953
## 3 6 25951
#3. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?
flights %>%
filter(is.na(dep_time) ) %>%
summarize(cnt = n())
## # A tibble: 1 x 1
## cnt
## <int>
## 1 8255
# tarea 5.3.1
#1. How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).
flights %>%
filter(!is.na(dep_time) ) %>%
arrange(desc(dep_time))
## # A tibble: 328,521 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 10 30 2400 2359 1 327
## 2 2013 11 27 2400 2359 1 515
## 3 2013 12 5 2400 2359 1 427
## 4 2013 12 9 2400 2359 1 432
## 5 2013 12 9 2400 2250 70 59
## 6 2013 12 13 2400 2359 1 432
## 7 2013 12 19 2400 2359 1 434
## 8 2013 12 29 2400 1700 420 302
## 9 2013 2 7 2400 2359 1 432
## 10 2013 2 7 2400 2359 1 443
## # ... with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
#2. Sort flights to find the most delayed flights.
flights %>%
filter(!is.na(dep_delay) ) %>%
filter(!is.na(arr_delay) ) %>%
mutate(retrasotoal = dep_delay + arr_delay) %>%
select(flight, carrier, origin, dest, retrasotoal) %>%
arrange(desc(retrasotoal))
## # A tibble: 327,346 x 5
## flight carrier origin dest retrasotoal
## <int> <chr> <chr> <chr> <dbl>
## 1 51 HA JFK HNL 2573
## 2 3535 MQ JFK CMH 2264
## 3 3695 MQ EWR ORD 2235
## 4 177 AA JFK SFO 2021
## 5 3075 MQ JFK CVG 1994
## 6 2391 DL JFK TPA 1891
## 7 2119 DL LGA MSP 1826
## 8 2047 DL LGA ATL 1793
## 9 172 AA EWR MIA 1774
## 10 3744 MQ EWR ORD 1753
## # ... with 327,336 more rows
#2 Find the flights that left earliest.
flights %>%
filter(!is.na(dep_delay) ) %>%
mutate(retrasotoal = dep_delay) %>%
select(flight, carrier, origin, dest, retrasotoal) %>%
arrange(retrasotoal)
## # A tibble: 328,521 x 5
## flight carrier origin dest retrasotoal
## <int> <chr> <chr> <chr> <dbl>
## 1 97 B6 JFK DEN -43
## 2 1715 DL LGA MSY -33
## 3 5713 EV LGA IAD -32
## 4 1435 DL LGA TPA -30
## 5 837 F9 LGA DEN -27
## 6 3478 MQ LGA DTW -26
## 7 4361 EV EWR TYS -25
## 8 4573 MQ LGA DTW -25
## 9 3318 9E JFK BUF -24
## 10 375 B6 LGA FLL -24
## # ... with 328,511 more rows
#3. Sort flights to find the fastest flights.
flights %>%
filter(!is.na(distance) & !is.na(air_time) ) %>%
mutate(vueloenhoras = air_time/60) %>%
mutate(velocidad = distance/vueloenhoras) %>%
select(flight, carrier, origin, dest, velocidad) %>%
arrange(desc(velocidad))
## # A tibble: 327,346 x 5
## flight carrier origin dest velocidad
## <int> <chr> <chr> <chr> <dbl>
## 1 1499 DL LGA ATL 703.
## 2 4667 EV EWR MSP 650.
## 3 4292 EV EWR GSP 648
## 4 3805 EV EWR BNA 641.
## 5 1902 DL LGA PBI 591.
## 6 315 DL JFK SJU 564
## 7 707 B6 JFK SJU 557.
## 8 936 AA JFK STT 556.
## 9 347 DL JFK SJU 554.
## 10 1503 B6 JFK SJU 554.
## # ... with 327,336 more rows
#4. Which flights travelled the longest?
flights %>%
filter(!is.na(distance) ) %>%
select(flight, carrier, origin, dest, distance) %>%
arrange(desc(distance))
## # A tibble: 336,776 x 5
## flight carrier origin dest distance
## <int> <chr> <chr> <chr> <dbl>
## 1 51 HA JFK HNL 4983
## 2 51 HA JFK HNL 4983
## 3 51 HA JFK HNL 4983
## 4 51 HA JFK HNL 4983
## 5 51 HA JFK HNL 4983
## 6 51 HA JFK HNL 4983
## 7 51 HA JFK HNL 4983
## 8 51 HA JFK HNL 4983
## 9 51 HA JFK HNL 4983
## 10 51 HA JFK HNL 4983
## # ... with 336,766 more rows
#4 Which travelled the shortest?
flights %>%
filter(!is.na(distance) ) %>%
select(flight, carrier, origin, dest, distance) %>%
arrange(distance)
## # A tibble: 336,776 x 5
## flight carrier origin dest distance
## <int> <chr> <chr> <chr> <dbl>
## 1 1632 US EWR LGA 17
## 2 3833 EV EWR PHL 80
## 3 4193 EV EWR PHL 80
## 4 4502 EV EWR PHL 80
## 5 4645 EV EWR PHL 80
## 6 4193 EV EWR PHL 80
## 7 4619 EV EWR PHL 80
## 8 4619 EV EWR PHL 80
## 9 4619 EV EWR PHL 80
## 10 4619 EV EWR PHL 80
## # ... with 336,766 more rows
# tarea 5.4.1
#1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
flights %>%
select(dep_time, dep_delay, arr_time, arr_delay )
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
#2. What happens if you include the name of a variable multiple times in a select() call?
flights %>%
select(dep_time, dep_time, dep_delay, arr_time, arr_delay )
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # ... with 336,766 more rows
## No paso nada, solo incluyó la primera
#5.5.2 Exercises
#1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
flights %>%
transmute(hora_salida = dep_time %/% 100 ,
hora_salida_min = dep_time %% 100,
hora_salida_agenda = sched_dep_time %/% 100,
hora_salida_min_agenda = sched_dep_time %% 100)
## # A tibble: 336,776 x 4
## hora_salida hora_salida_min hora_salida_agenda hora_salida_min_agenda
## <dbl> <dbl> <dbl> <dbl>
## 1 5 17 5 15
## 2 5 33 5 29
## 3 5 42 5 40
## 4 5 44 5 45
## 5 5 54 6 0
## 6 5 54 5 58
## 7 5 55 6 0
## 8 5 57 6 0
## 9 5 57 6 0
## 10 5 58 6 0
## # ... with 336,766 more rows