flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 832 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
filter(flights, month == 1 & day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 832 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
filter(flights, month == 1 | day == 1)
## # A tibble: 37,198 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 37,188 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
filter(flights, month %in% c(11, 12))
## # A tibble: 55,403 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 11 1 5 2359 6 352 345 7 B6
## 2 2013 11 1 35 2250 105 123 2356 87 B6
## 3 2013 11 1 455 500 -5 641 651 -10 US
## 4 2013 11 1 539 545 -6 856 827 29 UA
## 5 2013 11 1 542 545 -3 831 855 -24 AA
## 6 2013 11 1 549 600 -11 912 923 -11 UA
## 7 2013 11 1 550 600 -10 705 659 6 US
## 8 2013 11 1 554 600 -6 659 701 -2 US
## 9 2013 11 1 554 600 -6 826 827 -1 DL
## 10 2013 11 1 554 600 -6 749 751 -2 DL
## # … with 55,393 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 12 31 13 2359 14 439 437 2 B6
## 2 2013 12 31 18 2359 19 449 444 5 DL
## 3 2013 12 31 26 2245 101 129 2353 96 B6
## 4 2013 12 31 459 500 -1 655 651 4 US
## 5 2013 12 31 514 515 -1 814 812 2 UA
## 6 2013 12 31 549 551 -2 925 900 25 UA
## 7 2013 12 31 550 600 -10 725 745 -20 AA
## 8 2013 12 31 552 600 -8 811 826 -15 EV
## 9 2013 12 31 553 600 -7 741 754 -13 DL
## 10 2013 12 31 554 550 4 1024 1027 -3 B6
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
select(flights, year:dep_time)
## # A tibble: 336,776 × 4
## year month day dep_time
## <int> <int> <int> <int>
## 1 2013 1 1 517
## 2 2013 1 1 533
## 3 2013 1 1 542
## 4 2013 1 1 544
## 5 2013 1 1 554
## 6 2013 1 1 554
## 7 2013 1 1 555
## 8 2013 1 1 557
## 9 2013 1 1 557
## 10 2013 1 1 558
## # … with 336,766 more rows
select(flights, year, month, day, dep_time)
## # A tibble: 336,776 × 4
## year month day dep_time
## <int> <int> <int> <int>
## 1 2013 1 1 517
## 2 2013 1 1 533
## 3 2013 1 1 542
## 4 2013 1 1 544
## 5 2013 1 1 554
## 6 2013 1 1 554
## 7 2013 1 1 555
## 8 2013 1 1 557
## 9 2013 1 1 557
## 10 2013 1 1 558
## # … with 336,766 more rows
select(flights, year, month, day, dep_time, dep_delay)
## # A tibble: 336,776 × 5
## year month day dep_time dep_delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 2
## 2 2013 1 1 533 4
## 3 2013 1 1 542 2
## 4 2013 1 1 544 -1
## 5 2013 1 1 554 -6
## 6 2013 1 1 554 -4
## 7 2013 1 1 555 -5
## 8 2013 1 1 557 -3
## 9 2013 1 1 557 -3
## 10 2013 1 1 558 -2
## # … with 336,766 more rows
select(flights, year, month, day, starts_with("dep"))
## # A tibble: 336,776 × 5
## year month day dep_time dep_delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 2
## 2 2013 1 1 533 4
## 3 2013 1 1 542 2
## 4 2013 1 1 544 -1
## 5 2013 1 1 554 -6
## 6 2013 1 1 554 -4
## 7 2013 1 1 555 -5
## 8 2013 1 1 557 -3
## 9 2013 1 1 557 -3
## 10 2013 1 1 558 -2
## # … with 336,766 more rows
select(flights, year, month, day, contains("time"))
## # A tibble: 336,776 × 9
## year month day dep_t…¹ sched…² arr_t…³ sched…⁴ air_t…⁵ time_hour
## <int> <int> <int> <int> <int> <int> <int> <dbl> <dttm>
## 1 2013 1 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 2013 1 1 533 529 850 830 227 2013-01-01 05:00:00
## 3 2013 1 1 542 540 923 850 160 2013-01-01 05:00:00
## 4 2013 1 1 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 2013 1 1 554 600 812 837 116 2013-01-01 06:00:00
## 6 2013 1 1 554 558 740 728 150 2013-01-01 05:00:00
## 7 2013 1 1 555 600 913 854 158 2013-01-01 06:00:00
## 8 2013 1 1 557 600 709 723 53 2013-01-01 06:00:00
## 9 2013 1 1 557 600 838 846 140 2013-01-01 06:00:00
## 10 2013 1 1 558 600 753 745 138 2013-01-01 06:00:00
## # … with 336,766 more rows, and abbreviated variable names ¹dep_time,
## # ²sched_dep_time, ³arr_time, ⁴sched_arr_time, ⁵air_time
select(flights, year, month, day, ends_with("time"))
## # A tibble: 336,776 × 8
## year month day dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 830 819 227
## 2 2013 1 1 533 529 850 830 227
## 3 2013 1 1 542 540 923 850 160
## 4 2013 1 1 544 545 1004 1022 183
## 5 2013 1 1 554 600 812 837 116
## 6 2013 1 1 554 558 740 728 150
## 7 2013 1 1 555 600 913 854 158
## 8 2013 1 1 557 600 709 723 53
## 9 2013 1 1 557 600 838 846 140
## 10 2013 1 1 558 600 753 745 138
## # … with 336,766 more rows
select(flights, year, month, day, contains("time"), everything())
## # A tibble: 336,776 × 19
## year month day dep_t…¹ sched…² arr_t…³ sched…⁴ air_t…⁵ time_hour
## <int> <int> <int> <int> <int> <int> <int> <dbl> <dttm>
## 1 2013 1 1 517 515 830 819 227 2013-01-01 05:00:00
## 2 2013 1 1 533 529 850 830 227 2013-01-01 05:00:00
## 3 2013 1 1 542 540 923 850 160 2013-01-01 05:00:00
## 4 2013 1 1 544 545 1004 1022 183 2013-01-01 05:00:00
## 5 2013 1 1 554 600 812 837 116 2013-01-01 06:00:00
## 6 2013 1 1 554 558 740 728 150 2013-01-01 05:00:00
## 7 2013 1 1 555 600 913 854 158 2013-01-01 06:00:00
## 8 2013 1 1 557 600 709 723 53 2013-01-01 06:00:00
## 9 2013 1 1 557 600 838 846 140 2013-01-01 06:00:00
## 10 2013 1 1 558 600 753 745 138 2013-01-01 06:00:00
## # … with 336,766 more rows, 10 more variables: dep_delay <dbl>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>, origin <chr>,
## # dest <chr>, distance <dbl>, hour <dbl>, minute <dbl>, and abbreviated
## # variable names ¹dep_time, ²sched_dep_time, ³arr_time, ⁴sched_arr_time,
## # ⁵air_time
mutate(flights,
gain = dep_delay - arr_delay) %>%
# select year, month, day, gain
select(year:day, gain)
## # A tibble: 336,776 × 4
## year month day gain
## <int> <int> <int> <dbl>
## 1 2013 1 1 -9
## 2 2013 1 1 -16
## 3 2013 1 1 -31
## 4 2013 1 1 17
## 5 2013 1 1 19
## 6 2013 1 1 -16
## 7 2013 1 1 -24
## 8 2013 1 1 11
## 9 2013 1 1 5
## 10 2013 1 1 -10
## # … with 336,766 more rows
mutate(flights,
gain = dep_delay - arr_delay) %>%
# select year, month, day, gain
select(gain)
## # A tibble: 336,776 × 1
## gain
## <dbl>
## 1 -9
## 2 -16
## 3 -31
## 4 17
## 5 19
## 6 -16
## 7 -24
## 8 11
## 9 5
## 10 -10
## # … with 336,766 more rows
# Alternative using transmute function
transmute(flights,
gain = dep_delay - arr_delay)
## # A tibble: 336,776 × 1
## gain
## <dbl>
## 1 -9
## 2 -16
## 3 -31
## 4 17
## 5 19
## 6 -16
## 7 -24
## 8 11
## 9 5
## 10 -10
## # … with 336,766 more rows
# lag()
select(flights,
dep_time) %>%
mutate(dep_time_lag1 = lag(dep_time))
## # A tibble: 336,776 × 2
## dep_time dep_time_lag1
## <int> <int>
## 1 517 NA
## 2 533 517
## 3 542 533
## 4 544 542
## 5 554 544
## 6 554 554
## 7 555 554
## 8 557 555
## 9 557 557
## 10 558 557
## # … with 336,766 more rows
# CumSum()
select(flights, minute) %>%
mutate(minute_cumsum = cumsum(minute))
## # A tibble: 336,776 × 2
## minute minute_cumsum
## <dbl> <dbl>
## 1 15 15
## 2 29 44
## 3 40 84
## 4 45 129
## 5 0 129
## 6 58 187
## 7 0 187
## 8 0 187
## 9 0 187
## 10 0 187
## # … with 336,766 more rows
Collapsing data into a single row
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
summarise(flights, delay = mean(dep_delay, na.rm = TRUE))
## # A tibble: 1 × 1
## delay
## <dbl>
## 1 12.6
Summarize by group
(flights) %>%
#group by airlines
group_by(carrier) %>%
#calculate average delay time
summarize(delay = mean(dep_delay, na.rm = TRUE)) %>%
# Sort it
arrange(delay)
## # A tibble: 16 × 2
## carrier delay
## <chr> <dbl>
## 1 US 3.78
## 2 HA 4.90
## 3 AS 5.80
## 4 AA 8.59
## 5 DL 9.26
## 6 MQ 10.6
## 7 UA 12.1
## 8 OO 12.6
## 9 VX 12.9
## 10 B6 13.0
## 11 9E 16.7
## 12 WN 17.7
## 13 FL 18.7
## 14 YV 19.0
## 15 EV 20.0
## 16 F9 20.2
Delays increase with distance up to ~750 miles and then decrease
flights %>%
group_by(dest) %>%
summarize(count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)) %>%
# Plot
ggplot(mapping = aes(x = dist, y = delay)) + geom_point(aes(size = count), alpha = .3) + geom_smooth(se = FALSE)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_point).
Missing Values
flights %>%
#remove missing values
filter(!is.na(dep_delay))
## # A tibble: 328,521 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 328,511 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
counts
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_de…¹ dep_d…² arr_t…³ sched…⁴ arr_d…⁵ carrier
## <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr>
## 1 2013 1 1 517 515 2 830 819 11 UA
## 2 2013 1 1 533 529 4 850 830 20 UA
## 3 2013 1 1 542 540 2 923 850 33 AA
## 4 2013 1 1 544 545 -1 1004 1022 -18 B6
## 5 2013 1 1 554 600 -6 812 837 -25 DL
## 6 2013 1 1 554 558 -4 740 728 12 UA
## 7 2013 1 1 555 600 -5 913 854 19 B6
## 8 2013 1 1 557 600 -3 709 723 -14 EV
## 9 2013 1 1 557 600 -3 838 846 -8 B6
## 10 2013 1 1 558 600 -2 753 745 8 AA
## # … with 336,766 more rows, 9 more variables: flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>, and abbreviated variable names
## # ¹sched_dep_time, ²dep_delay, ³arr_time, ⁴sched_arr_time, ⁵arr_delay
Useful Summary functions
grouping multiple variations
flights %>%
group_by(year, month, day) %>%
summarise %>%
ungroup()
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 3
## year month day
## <int> <int> <int>
## 1 2013 1 1
## 2 2013 1 2
## 3 2013 1 3
## 4 2013 1 4
## 5 2013 1 5
## 6 2013 1 6
## 7 2013 1 7
## 8 2013 1 8
## 9 2013 1 9
## 10 2013 1 10
## # … with 355 more rows
ungrouping