library(nycflights13)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# tarea 5.2.4
#Find all flights that

glimpse(flights)
## Observations: 336,776
## Variables: 19
## $ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,...
## $ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 60...
## $ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2...
## $ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 7...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 7...
## $ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -...
## $ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV",...
## $ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79...
## $ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN...
## $ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR"...
## $ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL"...
## $ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138...
## $ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 94...
## $ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5,...
## $ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013...
#1.1. Had an arrival delay of two or more hours
flights %>%
filter(arr_delay >= 120) %>%
mutate(hora=arr_delay /60 ) %>%
select(flight, carrier, origin, dest, hora) %>%
arrange(desc(hora))
## # A tibble: 10,200 x 5
##    flight carrier origin dest   hora
##     <int> <chr>   <chr>  <chr> <dbl>
##  1     51 HA      JFK    HNL    21.2
##  2   3535 MQ      JFK    CMH    18.8
##  3   3695 MQ      EWR    ORD    18.5
##  4    177 AA      JFK    SFO    16.8
##  5   3075 MQ      JFK    CVG    16.5
##  6   2391 DL      JFK    TPA    15.5
##  7   2119 DL      LGA    MSP    15.2
##  8   2047 DL      LGA    ATL    14.9
##  9    172 AA      EWR    MIA    14.6
## 10   3744 MQ      EWR    ORD    14.6
## # ... with 10,190 more rows
#1.2. Flew to Houston (IAH or HOU)
flights %>%
filter(dest == 'IAH'  | dest =='HOU') %>%
group_by(dest) %>%
summarize(cnt = n())
## # A tibble: 2 x 2
##   dest    cnt
##   <chr> <int>
## 1 HOU    2115
## 2 IAH    7198
#1.3. Were operated by United, American, or Delta
flights %>%
  filter(carrier == 'UA' | carrier == 'AA' | carrier == 'DL')%>%
  group_by(carrier) %>%
  summarize(cnt = n())
#1.4. Departed in summer (July, August, and September)

flights %>%
  filter(month == c(7,8,9))%>%
  group_by(month) %>%
  summarize(cnt = n())
## Warning in month == c(7, 8, 9): longer object length is not a multiple of
## shorter object length
## # A tibble: 3 x 2
##   month   cnt
##   <int> <int>
## 1     7  9808
## 2     8  9775
## 3     9  9191
#1.5. Arrived more than two hours late, but didn’t leave late
flights %>%
  filter(arr_delay >= 120) %>%
  filter(dep_delay < 5) %>%
  mutate(hora_retraso=arr_delay /60  ) %>%
  mutate(hora_salida=dep_delay /60  ) %>%
  select(flight, carrier, origin, dest, hora_retraso,hora_salida) %>%
  arrange(desc(hora_retraso))
## # A tibble: 35 x 6
##    flight carrier origin dest  hora_retraso hora_salida
##     <int> <chr>   <chr>  <chr>        <dbl>       <dbl>
##  1    399 VX      JFK    LAX           3.23     -0.0333
##  2    707 AA      LGA    DFW           2.98     -0.0333
##  3   2183 US      LGA    DCA           2.45     -0.0167
##  4   1619 DL      LGA    MSP           2.42     -0.15  
##  5   2083 AA      EWR    DFW           2.38     -0.0833
##  6   2402 B6      JFK    BUF           2.37     -0.0333
##  7    341 AA      LGA    ORD           2.35      0.05  
##  8    389 UA      JFK    SFO           2.33     -0.05  
##  9   1057 DL      LGA    MIA           2.33      0     
## 10    706 DL      JFK    AUS           2.3      -0.05  
## # ... with 25 more rows
#1.6. Were delayed by at least an hour, but made up over 30 minutes in flight
flights %>%
  filter(arr_delay >= 60) %>%
  filter(air_time < 30) %>%
  mutate(retraso=arr_delay /60  ) %>%
  mutate(hora_vuelo=air_time   ) %>%
  select(flight, carrier, origin, dest, retraso,hora_vuelo) %>%
  arrange(desc(hora_vuelo))
## # A tibble: 103 x 6
##    flight carrier origin dest  retraso hora_vuelo
##     <int> <chr>   <chr>  <chr>   <dbl>      <dbl>
##  1   3609 9E      JFK    PHL      1.97         29
##  2   4271 EV      EWR    ALB      4.47         29
##  3   4233 EV      EWR    ALB      1.02         29
##  4    522 UA      EWR    BDL      2.27         29
##  5   3832 EV      EWR    PVD      1.37         29
##  6   3638 9E      JFK    PHL      2.22         29
##  7   4309 EV      EWR    ALB      2.78         29
##  8   4091 EV      EWR    PVD      3.27         29
##  9   4088 EV      EWR    ALB      1.33         29
## 10   4404 EV      EWR    PVD      1.03         29
## # ... with 93 more rows
#1.7. Departed between midnight and 6am (inclusive)
flights %>%
  filter(hour >= 0 & hour<=6) %>%
   group_by(hour) %>%
     summarize(cnt = n())
## # A tibble: 3 x 2
##    hour   cnt
##   <dbl> <int>
## 1     1     1
## 2     5  1953
## 3     6 25951
#2.Another useful dplyr filtering helper is between(). What does it do? Can you use it to simplify the code needed to answer the previous challenges?
flights %>%
  filter(between(hour, 0 ,6) ) %>%
  group_by(hour) %>%
  summarize(cnt = n())
## # A tibble: 3 x 2
##    hour   cnt
##   <dbl> <int>
## 1     1     1
## 2     5  1953
## 3     6 25951
#3. How many flights have a missing dep_time? What other variables are missing? What might these rows represent?

flights %>%
  filter(is.na(dep_time)  ) %>%
  summarize(cnt = n())
## # A tibble: 1 x 1
##     cnt
##   <int>
## 1  8255
# tarea 5.3.1
#1. How could you use arrange() to sort all missing values to the start? (Hint: use is.na()).
flights %>%
  filter(!is.na(dep_time)  ) %>%
  arrange(desc(dep_time))
## # A tibble: 328,521 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013    10    30     2400           2359         1      327
##  2  2013    11    27     2400           2359         1      515
##  3  2013    12     5     2400           2359         1      427
##  4  2013    12     9     2400           2359         1      432
##  5  2013    12     9     2400           2250        70       59
##  6  2013    12    13     2400           2359         1      432
##  7  2013    12    19     2400           2359         1      434
##  8  2013    12    29     2400           1700       420      302
##  9  2013     2     7     2400           2359         1      432
## 10  2013     2     7     2400           2359         1      443
## # ... with 328,511 more rows, and 12 more variables: sched_arr_time <int>,
## #   arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## #   origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## #   minute <dbl>, time_hour <dttm>
#2. Sort flights to find the most delayed flights. 
flights %>%
  filter(!is.na(dep_delay)  ) %>%
  filter(!is.na(arr_delay)  ) %>%
  mutate(retrasotoal = dep_delay + arr_delay) %>%
  select(flight, carrier, origin, dest, retrasotoal) %>%
  arrange(desc(retrasotoal))
## # A tibble: 327,346 x 5
##    flight carrier origin dest  retrasotoal
##     <int> <chr>   <chr>  <chr>       <dbl>
##  1     51 HA      JFK    HNL          2573
##  2   3535 MQ      JFK    CMH          2264
##  3   3695 MQ      EWR    ORD          2235
##  4    177 AA      JFK    SFO          2021
##  5   3075 MQ      JFK    CVG          1994
##  6   2391 DL      JFK    TPA          1891
##  7   2119 DL      LGA    MSP          1826
##  8   2047 DL      LGA    ATL          1793
##  9    172 AA      EWR    MIA          1774
## 10   3744 MQ      EWR    ORD          1753
## # ... with 327,336 more rows
#2 Find the flights that left earliest.
flights %>%
  filter(!is.na(dep_delay)  ) %>%
  mutate(retrasotoal = dep_delay) %>%
  select(flight, carrier, origin, dest, retrasotoal) %>%
  arrange(retrasotoal)
## # A tibble: 328,521 x 5
##    flight carrier origin dest  retrasotoal
##     <int> <chr>   <chr>  <chr>       <dbl>
##  1     97 B6      JFK    DEN           -43
##  2   1715 DL      LGA    MSY           -33
##  3   5713 EV      LGA    IAD           -32
##  4   1435 DL      LGA    TPA           -30
##  5    837 F9      LGA    DEN           -27
##  6   3478 MQ      LGA    DTW           -26
##  7   4361 EV      EWR    TYS           -25
##  8   4573 MQ      LGA    DTW           -25
##  9   3318 9E      JFK    BUF           -24
## 10    375 B6      LGA    FLL           -24
## # ... with 328,511 more rows
#3. Sort flights to find the fastest flights.
flights %>%
  filter(!is.na(distance) & !is.na(air_time)  ) %>%
  mutate(vueloenhoras = air_time/60) %>%
  mutate(velocidad = distance/vueloenhoras) %>%
  select(flight, carrier, origin, dest, velocidad) %>%
  arrange(desc(velocidad))
## # A tibble: 327,346 x 5
##    flight carrier origin dest  velocidad
##     <int> <chr>   <chr>  <chr>     <dbl>
##  1   1499 DL      LGA    ATL        703.
##  2   4667 EV      EWR    MSP        650.
##  3   4292 EV      EWR    GSP        648 
##  4   3805 EV      EWR    BNA        641.
##  5   1902 DL      LGA    PBI        591.
##  6    315 DL      JFK    SJU        564 
##  7    707 B6      JFK    SJU        557.
##  8    936 AA      JFK    STT        556.
##  9    347 DL      JFK    SJU        554.
## 10   1503 B6      JFK    SJU        554.
## # ... with 327,336 more rows
#4. Which flights travelled the longest? 
flights %>%
  filter(!is.na(distance)  ) %>%
  select(flight, carrier, origin, dest, distance) %>%
  arrange(desc(distance))
## # A tibble: 336,776 x 5
##    flight carrier origin dest  distance
##     <int> <chr>   <chr>  <chr>    <dbl>
##  1     51 HA      JFK    HNL       4983
##  2     51 HA      JFK    HNL       4983
##  3     51 HA      JFK    HNL       4983
##  4     51 HA      JFK    HNL       4983
##  5     51 HA      JFK    HNL       4983
##  6     51 HA      JFK    HNL       4983
##  7     51 HA      JFK    HNL       4983
##  8     51 HA      JFK    HNL       4983
##  9     51 HA      JFK    HNL       4983
## 10     51 HA      JFK    HNL       4983
## # ... with 336,766 more rows
#4 Which travelled the shortest?
flights %>%
  filter(!is.na(distance)  ) %>%
  select(flight, carrier, origin, dest, distance) %>%
  arrange(distance)
## # A tibble: 336,776 x 5
##    flight carrier origin dest  distance
##     <int> <chr>   <chr>  <chr>    <dbl>
##  1   1632 US      EWR    LGA         17
##  2   3833 EV      EWR    PHL         80
##  3   4193 EV      EWR    PHL         80
##  4   4502 EV      EWR    PHL         80
##  5   4645 EV      EWR    PHL         80
##  6   4193 EV      EWR    PHL         80
##  7   4619 EV      EWR    PHL         80
##  8   4619 EV      EWR    PHL         80
##  9   4619 EV      EWR    PHL         80
## 10   4619 EV      EWR    PHL         80
## # ... with 336,766 more rows
# tarea 5.4.1
#1. Brainstorm as many ways as possible to select dep_time, dep_delay, arr_time, and arr_delay from flights.
flights %>%
  select(dep_time, dep_delay, arr_time,  arr_delay ) 
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
#2. What happens if you include the name of a variable multiple times in a select() call?
flights %>%
  select(dep_time, dep_time, dep_delay, arr_time,  arr_delay ) 
## # A tibble: 336,776 x 4
##    dep_time dep_delay arr_time arr_delay
##       <int>     <dbl>    <int>     <dbl>
##  1      517         2      830        11
##  2      533         4      850        20
##  3      542         2      923        33
##  4      544        -1     1004       -18
##  5      554        -6      812       -25
##  6      554        -4      740        12
##  7      555        -5      913        19
##  8      557        -3      709       -14
##  9      557        -3      838        -8
## 10      558        -2      753         8
## # ... with 336,766 more rows
## No paso nada, solo incluyó la primera
#5.5.2 Exercises
#1. Currently dep_time and sched_dep_time are convenient to look at, but hard to compute with because they’re not really continuous numbers. Convert them to a more convenient representation of number of minutes since midnight.
flights %>%
  transmute(hora_salida = dep_time %/% 100 ,
            hora_salida_min = dep_time %% 100,
            hora_salida_agenda = sched_dep_time %/% 100, 
            hora_salida_min_agenda = sched_dep_time %% 100)
## # A tibble: 336,776 x 4
##    hora_salida hora_salida_min hora_salida_agenda hora_salida_min_agenda
##          <dbl>           <dbl>              <dbl>                  <dbl>
##  1           5              17                  5                     15
##  2           5              33                  5                     29
##  3           5              42                  5                     40
##  4           5              44                  5                     45
##  5           5              54                  6                      0
##  6           5              54                  5                     58
##  7           5              55                  6                      0
##  8           5              57                  6                      0
##  9           5              57                  6                      0
## 10           5              58                  6                      0
## # ... with 336,766 more rows