660 / 3 + 15
## [1] 235
x <- 660 / 3 + 15
x
## [1] 235
TAB
seq(from = 1, to = 10, by = 1)
## [1] 1 2 3 4 5 6 7 8 9 10
+
seq(from = 1, to = 10)
## [1] 1 2 3 4 5 6 7 8 9 10
y <- seq(from = 1, to = 10)
y
## [1] 1 2 3 4 5 6 7 8 9 10
library(tidyverse)
library(nycflights13)
Remember that variable names (those created by <- and those created by mutate()) should use only lowercase letters, numbers, and . Use to separate words within a name.
# Strive for:
short_flights <- flights |> filter(air_time < 60)
##Spaces Put spaces on either side of mathematical operators apart from ^ (i.e. +, -, ==, <, …), and around the assignment operator (<-).
# Strive for
z <- (1 + 2)^2 / 3
Don’t put spaces inside or outside parentheses for regular function calls. Always put a space after a comma, just like in standard English.
# Strive for
mean (1, na.rm = TRUE)
## [1] 1
It’s OK to add extra spaces if it improves alignment. For example, if you’re creating multiple variables in mutate(), you might want to add spaces so that all the = line up.1 This makes it easier to skim the code.
flights |>
mutate(
speed = distance / air_time,
dep_hour = dep_time %/% 100,
dep_minute = dep_time %% 100
)
## # A tibble: 336,776 × 22
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 14 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, speed <dbl>, dep_hour <dbl>,
## # dep_minute <dbl>
##Pipes |> should always have a space before it and should typically be the last thing on a line. This makes it easier to add new steps, rearrange existing steps, modify elements within a step, and get a 10,000 ft view by skimming the verbs on the left-hand side.
# Strive for
flights |>
filter(!is.na(arr_delay), !is.na(tailnum)) |>
count(dest)
## # A tibble: 104 × 2
## dest n
## <chr> <int>
## 1 ABQ 254
## 2 ACK 264
## 3 ALB 418
## 4 ANC 8
## 5 ATL 16837
## 6 AUS 2411
## 7 AVL 261
## 8 BDL 412
## 9 BGR 358
## 10 BHM 269
## # ℹ 94 more rows
If the function you’re piping into has named arguments (like mutate() or summarize()), put each argument on a new line. If the function doesn’t have named arguments (like select() or filter()), keep everything on one line unless it doesn’t fit, in which case you should put each argument on its own line.
# Strive for
flights |>
group_by(tailnum) |>
summarize(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
## # A tibble: 4,044 × 3
## tailnum delay n
## <chr> <dbl> <int>
## 1 D942DN 31.5 4
## 2 N0EGMQ 9.98 371
## 3 N10156 12.7 153
## 4 N102UW 2.94 48
## 5 N103US -6.93 46
## 6 N104UW 1.80 47
## 7 N10575 20.7 289
## 8 N105UW -0.267 45
## 9 N107US -5.73 41
## 10 N108UW -1.25 60
## # ℹ 4,034 more rows
After the first step of the pipeline, indent each line by two spaces. RStudio will automatically put the spaces in for you after a line break following a |> . If you’re putting each argument on its own line, indent by an extra two spaces. Make sure ) is on its own line, and un-indented to match the horizontal position of the function name.
# Strive for
flights |>
group_by(tailnum) |>
summarize(
delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
## # A tibble: 4,044 × 3
## tailnum delay n
## <chr> <dbl> <int>
## 1 D942DN 31.5 4
## 2 N0EGMQ 9.98 371
## 3 N10156 12.7 153
## 4 N102UW 2.94 48
## 5 N103US -6.93 46
## 6 N104UW 1.80 47
## 7 N10575 20.7 289
## 8 N105UW -0.267 45
## 9 N107US -5.73 41
## 10 N108UW -1.25 60
## # ℹ 4,034 more rows
It’s OK to shirk some of these rules if your pipeline fits easily on one line. But in our collective experience, it’s common for short snippets to grow longer, so you’ll usually save time in the long run by starting with all the vertical space you need.
##ggplot2 The same basic rules that apply to the pipe also apply to ggplot2; just treat + the same way as |>.
flights |>
group_by(month) |>
summarize(
delay = mean(arr_delay, na.rm = TRUE)
) |>
ggplot(aes(x = month, y = delay)) +
geom_point() +
geom_line()
Again, if you can’t fit all of the arguments to a function on to a
single line, put each argument on its own line:
flights |>
group_by(dest) |>
summarize(
distance = mean(distance),
speed = mean(distance / air_time, na.rm = TRUE)
) |>
ggplot(aes(x = distance, y = speed)) +
geom_smooth(
method = "loess",
span = 0.5,
se = FALSE,
color = "white",
linewidth = 4
) +
geom_point()
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
##Sectioning comments As your scripts get longer, you can use sectioning comments to break up your file into manageable pieces:
# Load data --------------------------------------
# Plot data --------------------------------------
##Exercise restyle pipelines
flights|>filter(dest=="IAH")|>group_by(year,month,day)|>summarize(n=n(),
delay=mean(arr_delay,na.rm=TRUE))|>filter(n>10)
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day n delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 20 17.8
## 2 2013 1 2 20 7
## 3 2013 1 3 19 18.3
## 4 2013 1 4 20 -3.2
## 5 2013 1 5 13 20.2
## 6 2013 1 6 18 9.28
## 7 2013 1 7 19 -7.74
## 8 2013 1 8 19 7.79
## 9 2013 1 9 19 18.1
## 10 2013 1 10 19 6.68
## # ℹ 355 more rows
flights|>filter(carrier=="UA",dest%in%c("IAH","HOU"),sched_dep_time>
0900,sched_arr_time<2000)|>group_by(flight)|>summarize(delay=mean(
arr_delay,na.rm=TRUE),cancelled=sum(is.na(arr_delay)),n=n())|>filter(n>10)
## # A tibble: 74 × 4
## flight delay cancelled n
## <int> <dbl> <int> <int>
## 1 53 12.5 2 18
## 2 112 14.1 0 14
## 3 205 -1.71 0 14
## 4 235 -5.36 0 14
## 5 255 -9.47 0 15
## 6 268 38.6 1 15
## 7 292 6.57 0 21
## 8 318 10.7 1 20
## 9 337 20.1 2 21
## 10 370 17.5 0 11
## # ℹ 64 more rows
flights |>
filter(dest == "IAH" ) |>
group_by(year,month,day) |>
summarize(
n = n(),
delay = mean (arr_delay, na.rm = TRUE)) |>
filter(n > 10)
## # A tibble: 365 × 5
## # Groups: year, month [12]
## year month day n delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 20 17.8
## 2 2013 1 2 20 7
## 3 2013 1 3 19 18.3
## 4 2013 1 4 20 -3.2
## 5 2013 1 5 13 20.2
## 6 2013 1 6 18 9.28
## 7 2013 1 7 19 -7.74
## 8 2013 1 8 19 7.79
## 9 2013 1 9 19 18.1
## 10 2013 1 10 19 6.68
## # ℹ 355 more rows
flights |>
filter (carrier == "UA", dest%in%c("IAH","HOU"),
sched_dep_time > 0900, sched_arr_time<2000) |>
group_by (flight) |>
summarize (delay = mean( arr_delay,na.rm = TRUE), cancelled = sum (is.na (arr_delay)), n = n()) |>
filter (n>10)
## # A tibble: 74 × 4
## flight delay cancelled n
## <int> <dbl> <int> <int>
## 1 53 12.5 2 18
## 2 112 14.1 0 14
## 3 205 -1.71 0 14
## 4 235 -5.36 0 14
## 5 255 -9.47 0 15
## 6 268 38.6 1 15
## 7 292 6.57 0 21
## 8 318 10.7 1 20
## 9 337 20.1 2 21
## 10 370 17.5 0 11
## # ℹ 64 more rows
#Import data
flights
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 1, day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 1 | day == 1)
## # A tibble: 37,198 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 37,188 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, month == 1 & day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 832 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
nov_dec <- filter(flights, month %in% c(11, 12))
#Exercise
filter(flights, dep_delay > 2)
## # A tibble: 114,149 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 533 529 4 850 830
## 2 2013 1 1 608 600 8 807 735
## 3 2013 1 1 611 600 11 945 931
## 4 2013 1 1 613 610 3 925 921
## 5 2013 1 1 623 610 13 920 915
## 6 2013 1 1 632 608 24 740 728
## 7 2013 1 1 644 636 8 931 940
## 8 2013 1 1 709 700 9 852 832
## 9 2013 1 1 732 729 3 1041 1039
## 10 2013 1 1 732 645 47 1011 941
## # ℹ 114,139 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filter(flights, dest %in% c("IAH","HOU"))
## # A tibble: 9,313 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 623 627 -4 933 932
## 4 2013 1 1 728 732 -4 1041 1038
## 5 2013 1 1 739 739 0 1104 1038
## 6 2013 1 1 908 908 0 1228 1219
## 7 2013 1 1 1028 1026 2 1350 1339
## 8 2013 1 1 1044 1045 -1 1352 1351
## 9 2013 1 1 1114 900 134 1447 1222
## 10 2013 1 1 1205 1200 5 1503 1505
## # ℹ 9,303 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
filtered_flights <- filter(
flights,
carrier %in% c("UA", "AA", "DL"),
month %in% c(7, 8, 9),
arr_delay > 120,
dep_delay <= 0,
dep_delay >= 60, air_time - (arr_delay - dep_delay) >= 30,
dep_time >= 0 & dep_time <= 600
)
arrange(flights, desc(month), desc(day))
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 12 31 13 2359 14 439 437
## 2 2013 12 31 18 2359 19 449 444
## 3 2013 12 31 26 2245 101 129 2353
## 4 2013 12 31 459 500 -1 655 651
## 5 2013 12 31 514 515 -1 814 812
## 6 2013 12 31 549 551 -2 925 900
## 7 2013 12 31 550 600 -10 725 745
## 8 2013 12 31 552 600 -8 811 826
## 9 2013 12 31 553 600 -7 741 754
## 10 2013 12 31 554 550 4 1024 1027
## # ℹ 336,766 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights, year:dep_time)
## # A tibble: 336,776 × 4
## year month day dep_time
## <int> <int> <int> <int>
## 1 2013 1 1 517
## 2 2013 1 1 533
## 3 2013 1 1 542
## 4 2013 1 1 544
## 5 2013 1 1 554
## 6 2013 1 1 554
## 7 2013 1 1 555
## 8 2013 1 1 557
## 9 2013 1 1 557
## 10 2013 1 1 558
## # ℹ 336,766 more rows
select(flights, year, month, day, dep_time)
## # A tibble: 336,776 × 4
## year month day dep_time
## <int> <int> <int> <int>
## 1 2013 1 1 517
## 2 2013 1 1 533
## 3 2013 1 1 542
## 4 2013 1 1 544
## 5 2013 1 1 554
## 6 2013 1 1 554
## 7 2013 1 1 555
## 8 2013 1 1 557
## 9 2013 1 1 557
## 10 2013 1 1 558
## # ℹ 336,766 more rows
select(flights, year, month, day, dep_time, dep_delay)
## # A tibble: 336,776 × 5
## year month day dep_time dep_delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 2
## 2 2013 1 1 533 4
## 3 2013 1 1 542 2
## 4 2013 1 1 544 -1
## 5 2013 1 1 554 -6
## 6 2013 1 1 554 -4
## 7 2013 1 1 555 -5
## 8 2013 1 1 557 -3
## 9 2013 1 1 557 -3
## 10 2013 1 1 558 -2
## # ℹ 336,766 more rows
select(flights, year, month, day, starts_with("dep"))
## # A tibble: 336,776 × 5
## year month day dep_time dep_delay
## <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 2
## 2 2013 1 1 533 4
## 3 2013 1 1 542 2
## 4 2013 1 1 544 -1
## 5 2013 1 1 554 -6
## 6 2013 1 1 554 -4
## 7 2013 1 1 555 -5
## 8 2013 1 1 557 -3
## 9 2013 1 1 557 -3
## 10 2013 1 1 558 -2
## # ℹ 336,766 more rows
select(flights, year, month, day, contains("time"))
## # A tibble: 336,776 × 9
## year month day dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 830 819 227
## 2 2013 1 1 533 529 850 830 227
## 3 2013 1 1 542 540 923 850 160
## 4 2013 1 1 544 545 1004 1022 183
## 5 2013 1 1 554 600 812 837 116
## 6 2013 1 1 554 558 740 728 150
## 7 2013 1 1 555 600 913 854 158
## 8 2013 1 1 557 600 709 723 53
## 9 2013 1 1 557 600 838 846 140
## 10 2013 1 1 558 600 753 745 138
## # ℹ 336,766 more rows
## # ℹ 1 more variable: time_hour <dttm>
select(flights, year, month, day, ends_with("time"))
## # A tibble: 336,776 × 8
## year month day dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 830 819 227
## 2 2013 1 1 533 529 850 830 227
## 3 2013 1 1 542 540 923 850 160
## 4 2013 1 1 544 545 1004 1022 183
## 5 2013 1 1 554 600 812 837 116
## 6 2013 1 1 554 558 740 728 150
## 7 2013 1 1 555 600 913 854 158
## 8 2013 1 1 557 600 709 723 53
## 9 2013 1 1 557 600 838 846 140
## 10 2013 1 1 558 600 753 745 138
## # ℹ 336,766 more rows
select(flights, year, month, day, contains("time"))
## # A tibble: 336,776 × 9
## year month day dep_time sched_dep_time arr_time sched_arr_time air_time
## <int> <int> <int> <int> <int> <int> <int> <dbl>
## 1 2013 1 1 517 515 830 819 227
## 2 2013 1 1 533 529 850 830 227
## 3 2013 1 1 542 540 923 850 160
## 4 2013 1 1 544 545 1004 1022 183
## 5 2013 1 1 554 600 812 837 116
## 6 2013 1 1 554 558 740 728 150
## 7 2013 1 1 555 600 913 854 158
## 8 2013 1 1 557 600 709 723 53
## 9 2013 1 1 557 600 838 846 140
## 10 2013 1 1 558 600 753 745 138
## # ℹ 336,766 more rows
## # ℹ 1 more variable: time_hour <dttm>
mutate(flights,
gain = dep_delay - arr_delay) %>%
select(year:day, gain)
## # A tibble: 336,776 × 4
## year month day gain
## <int> <int> <int> <dbl>
## 1 2013 1 1 -9
## 2 2013 1 1 -16
## 3 2013 1 1 -31
## 4 2013 1 1 17
## 5 2013 1 1 19
## 6 2013 1 1 -16
## 7 2013 1 1 -24
## 8 2013 1 1 11
## 9 2013 1 1 5
## 10 2013 1 1 -10
## # ℹ 336,766 more rows
mutate(flights,
gain = dep_delay - arr_delay) %>%
select(gain)
## # A tibble: 336,776 × 1
## gain
## <dbl>
## 1 -9
## 2 -16
## 3 -31
## 4 17
## 5 19
## 6 -16
## 7 -24
## 8 11
## 9 5
## 10 -10
## # ℹ 336,766 more rows
transmute(flights,
gain = dep_delay - arr_delay)
## # A tibble: 336,776 × 1
## gain
## <dbl>
## 1 -9
## 2 -16
## 3 -31
## 4 17
## 5 19
## 6 -16
## 7 -24
## 8 11
## 9 5
## 10 -10
## # ℹ 336,766 more rows
select(flights, dep_time) %>%
mutate(dep_time_lag1 = lag(dep_time))
## # A tibble: 336,776 × 2
## dep_time dep_time_lag1
## <int> <int>
## 1 517 NA
## 2 533 517
## 3 542 533
## 4 544 542
## 5 554 544
## 6 554 554
## 7 555 554
## 8 557 555
## 9 557 557
## 10 558 557
## # ℹ 336,766 more rows
select(flights, minute) %>%
mutate(minute_cumsum = cumsum(minute))
## # A tibble: 336,776 × 2
## minute minute_cumsum
## <dbl> <dbl>
## 1 15 15
## 2 29 44
## 3 40 84
## 4 45 129
## 5 0 129
## 6 58 187
## 7 0 187
## 8 0 187
## 9 0 187
## 10 0 187
## # ℹ 336,766 more rows
#Useful Creation functions
(x <- 1:10)
## [1] 1 2 3 4 5 6 7 8 9 10
#> [1] 1 2 3 4 5 6 7 8 9 10
lag(x)
## [1] NA 1 2 3 4 5 6 7 8 9
#> [1] NA 1 2 3 4 5 6 7 8 9
lead(x)
## [1] 2 3 4 5 6 7 8 9 10 NA
#> [1] 2 3 4 5 6 7 8 9 10 NA
transmute(flights,
dep_time,
hour = dep_time %/% 100,
minute = dep_time %% 100
)
## # A tibble: 336,776 × 3
## dep_time hour minute
## <int> <dbl> <dbl>
## 1 517 5 17
## 2 533 5 33
## 3 542 5 42
## 4 544 5 44
## 5 554 5 54
## 6 554 5 54
## 7 555 5 55
## 8 557 5 57
## 9 557 5 57
## 10 558 5 58
## # ℹ 336,766 more rows
#> # A tibble: 336,776 × 3
#> dep_time hour minute
#> <int> <dbl> <dbl>
#> 1 517 5 17
#> 2 533 5 33
#> 3 542 5 42
#> 4 544 5 44
#> 5 554 5 54
#> 6 554 5 54
#> # ℹ 336,770 more rows
y <- c(1, 2, 2, NA, 3, 4)
min_rank(y)
## [1] 1 2 2 NA 4 5
#> [1] 1 2 2 NA 4 5
min_rank(desc(y))
## [1] 5 3 3 NA 2 1
#> [1] 5 3 3 NA 2 1
#Exercise
flights <- flights %>%
mutate(
dep_time_mins = (dep_time %/% 100) * 60 + (dep_time %% 100),
sched_dep_time_mins = (sched_dep_time %/% 100) * 60 + (sched_dep_time %% 100)
)
flights %>%
filter(!is.na(dep_delay)) %>%
mutate(rank = min_rank(desc(dep_delay))) %>%
filter(rank <= 10) %>%
arrange(desc(dep_delay))
## # A tibble: 10 × 22
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 6 27 959 1900 899 1236 2226
## 9 2013 7 22 2257 759 898 121 1026
## 10 2013 12 5 756 1700 896 1058 2020
## # ℹ 14 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, dep_time_mins <dbl>,
## # sched_dep_time_mins <dbl>, rank <int>
by_dest <- group_by(flights, dest)
delay <- summarise(by_dest,
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count > 20, dest != "HNL")
# It looks like delays increase with distance up to ~750 miles
# and then decrease. Maybe as flights get longer there's more
# ability to make up delays in the air?
ggplot(data = delay, mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 1/3) +
geom_smooth(se = FALSE)
#> `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
Another way to tackle
delays <- flights %>%
group_by(dest) %>%
summarise(
count = n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)
) %>%
filter(count > 20, dest != "HNL")
flights %>%
group_by(dest) %>%
summarize(count =n(),
dist = mean(distance, na.rm = TRUE),
delay = mean(arr_delay, na.rm = TRUE)) %>%
ggplot(mapping = aes(x = dist, y = delay)) +
geom_point(aes(size = count), alpha = 0.3) +
geom_smooth(se = FALSE)
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
Missing values
flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
## # A tibble: 327,346 × 21
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 327,336 more rows
## # ℹ 13 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, dep_time_mins <dbl>,
## # sched_dep_time_mins <dbl>
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(delay = mean(arr_delay))
ggplot(data = delays, mapping = aes(x = delay)) +
geom_freqpoly(binwidth = 10)
not_cancelled <- flights %>%
filter(!is.na(dep_delay), !is.na(arr_delay))
delays <- not_cancelled %>%
group_by(tailnum) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE),
n = n()
)
ggplot(data = delays, mapping = aes(x = n, y = delay)) +
geom_point(alpha = 1/10)
flights %>%
group_by(year, month, day) %>%
summarise(count = n())
## # A tibble: 365 × 4
## # Groups: year, month [12]
## year month day count
## <int> <int> <int> <int>
## 1 2013 1 1 842
## 2 2013 1 2 943
## 3 2013 1 3 914
## 4 2013 1 4 915
## 5 2013 1 5 720
## 6 2013 1 6 832
## 7 2013 1 7 933
## 8 2013 1 8 899
## 9 2013 1 9 902
## 10 2013 1 10 932
## # ℹ 355 more rows
Ungrouping
daily <- flights %>%
group_by(year, month, day) %>%
summarise(flights = n(), .groups = "drop")
daily %>%
ungroup() %>%
summarise(flights = n())
## # A tibble: 1 × 1
## flights
## <int>
## 1 365