Notes:
#install.packages('hflights')
# load packages
suppressMessages(library(dplyr))
library(hflights)
# explore data
data(hflights)
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
Notes:
# convert to local data frame
flights <- tbl_df(hflights)
# printing Local_Dataframe only shows 10 rows and as many columns as can fit on your screen
flights
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
Notes:
# you can specify that you want to see more rows
print(flights, n=20)
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## 11 2011 1 11 2 1429 1539 AA
## 12 2011 1 12 3 1419 1515 AA
## 13 2011 1 13 4 1358 1501 AA
## 14 2011 1 14 5 1357 1504 AA
## 15 2011 1 15 6 1359 1459 AA
## 16 2011 1 16 7 1359 1509 AA
## 17 2011 1 17 1 1530 1634 AA
## 18 2011 1 18 2 1408 1508 AA
## 19 2011 1 19 3 1356 1503 AA
## 20 2011 1 20 4 1507 1622 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# convert to a normal data frame to see all of the columns
data.frame(head(flights))
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 7 13 0 0
## 2 6 9 0 0
## 3 5 17 0 0
## 4 9 22 0 0
## 5 9 9 0 0
## 6 6 13 0 0
Notes:
# base R approach to view all flights on January 1
flights[flights$Month==1 & flights$DayofMonth==1, ]
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 1 6 728 840 AA
## 3 2011 1 1 6 1631 1736 AA
## 4 2011 1 1 6 1756 2112 AA
## 5 2011 1 1 6 1012 1347 AA
## 6 2011 1 1 6 1211 1325 AA
## 7 2011 1 1 6 557 906 AA
## 8 2011 1 1 6 1824 2106 AS
## 9 2011 1 1 6 654 1124 B6
## 10 2011 1 1 6 1639 2110 B6
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# dplyr approach
# note: you can use comma or ampersand to represent AND condition
# Note - all dplyr cmds - nothing is modified in place.
# Note - row names are included when you use base R cmds, byt bplyr discards row names
#
filter(flights, Month==1, DayofMonth==1)
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 1 6 728 840 AA
## 3 2011 1 1 6 1631 1736 AA
## 4 2011 1 1 6 1756 2112 AA
## 5 2011 1 1 6 1012 1347 AA
## 6 2011 1 1 6 1211 1325 AA
## 7 2011 1 1 6 557 906 AA
## 8 2011 1 1 6 1824 2106 AS
## 9 2011 1 1 6 654 1124 B6
## 10 2011 1 1 6 1639 2110 B6
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
filter(flights, Month==1 & DayofMonth==1)
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 1 6 728 840 AA
## 3 2011 1 1 6 1631 1736 AA
## 4 2011 1 1 6 1756 2112 AA
## 5 2011 1 1 6 1012 1347 AA
## 6 2011 1 1 6 1211 1325 AA
## 7 2011 1 1 6 557 906 AA
## 8 2011 1 1 6 1824 2106 AS
## 9 2011 1 1 6 654 1124 B6
## 10 2011 1 1 6 1639 2110 B6
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# use pipe for OR condition
filter(flights, UniqueCarrier=="AA" | UniqueCarrier=="UA")
## Source: local data frame [5,316 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# you can also use the infix %in% operator
filter(flights, UniqueCarrier %in% c("AA", "UA"))
## Source: local data frame [5,316 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
Notes:
# base R approach to select DepTime, ArrTime, and FlightNum columns
# Note - used character vector which is hard to read.
flights[, c("DepTime", "ArrTime", "FlightNum")]
## Source: local data frame [227,496 x 3]
##
## DepTime ArrTime FlightNum
## (int) (int) (int)
## 1 1400 1500 428
## 2 1401 1501 428
## 3 1352 1502 428
## 4 1403 1513 428
## 5 1405 1507 428
## 6 1359 1503 428
## 7 1359 1509 428
## 8 1355 1454 428
## 9 1443 1554 428
## 10 1443 1553 428
## .. ... ... ...
# dplyr approach
select(flights, DepTime, ArrTime, FlightNum)
## Source: local data frame [227,496 x 3]
##
## DepTime ArrTime FlightNum
## (int) (int) (int)
## 1 1400 1500 428
## 2 1401 1501 428
## 3 1352 1502 428
## 4 1403 1513 428
## 5 1405 1507 428
## 6 1359 1503 428
## 7 1359 1509 428
## 8 1355 1454 428
## 9 1443 1554 428
## 10 1443 1553 428
## .. ... ... ...
# use colon to select multiple contiguous columns, and use `contains` to match columns by name
# note: `starts_with`, `ends_with`, and `matches` (for regular expressions) can also be used to match columns by name
select(flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
## Source: local data frame [227,496 x 7]
##
## Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## (int) (int) (int) (int) (int) (int) (int)
## 1 2011 1 1 7 13 -10 0
## 2 2011 1 2 6 9 -9 1
## 3 2011 1 3 5 17 -8 -8
## 4 2011 1 4 9 22 3 3
## 5 2011 1 5 9 9 -3 5
## 6 2011 1 6 6 13 -7 -1
## 7 2011 1 7 12 15 -1 -1
## 8 2011 1 8 7 12 -16 -5
## 9 2011 1 9 8 22 44 43
## 10 2011 1 10 6 19 43 43
## .. ... ... ... ... ... ... ...
Notes:
# nesting method to select UniqueCarrier and DepDelay columns and filter for delays over 60 minutes
filter(select(flights, UniqueCarrier, DepDelay), DepDelay > 60)
## Source: local data frame [10,242 x 2]
##
## UniqueCarrier DepDelay
## (chr) (int)
## 1 AA 90
## 2 AA 67
## 3 AA 74
## 4 AA 125
## 5 AA 82
## 6 AA 99
## 7 AA 70
## 8 AA 61
## 9 AA 74
## 10 AS 73
## .. ... ...
# chaining method
# Note: the operator %>% says 'take what you just did and put it as the first argument' to the
# next command, thus the chaining. So you can write commands in a natural order.
flights %>%
select(UniqueCarrier, DepDelay) %>%
filter(DepDelay > 60)
## Source: local data frame [10,242 x 2]
##
## UniqueCarrier DepDelay
## (chr) (int)
## 1 AA 90
## 2 AA 67
## 3 AA 74
## 4 AA 125
## 5 AA 82
## 6 AA 99
## 7 AA 70
## 8 AA 61
## 9 AA 74
## 10 AS 73
## .. ... ...
# Note: You can use this operator in R commands outside of dplyr
# base R cmd
# create two vectors and calculate Euclidian distance between them
x1 <- 1:5; x2 <- 2:6
sqrt(sum((x1-x2)^2))
## [1] 2.236068
# Note: You can use this operator in R commands outside of dplyr
# chaining method - mroe intuitive
(x1-x2)^2 %>% sum() %>% sqrt()
## [1] 2.236068
Notes:
# base R approach to select UniqueCarrier and DepDelay columns and sort by DepDelay
flights[order(flights$DepDelay), c("UniqueCarrier", "DepDelay")]
## Source: local data frame [227,496 x 2]
##
## UniqueCarrier DepDelay
## (chr) (int)
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
## 7 XE -17
## 8 CO -17
## 9 XE -17
## 10 MQ -17
## .. ... ...
# dplyr approach
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay)
## Source: local data frame [227,496 x 2]
##
## UniqueCarrier DepDelay
## (chr) (int)
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
## 7 XE -17
## 8 CO -17
## 9 XE -17
## 10 MQ -17
## .. ... ...
# use `desc` for descending
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay))
## Source: local data frame [227,496 x 2]
##
## UniqueCarrier DepDelay
## (chr) (int)
## 1 CO 981
## 2 AA 970
## 3 MQ 931
## 4 UA 869
## 5 MQ 814
## 6 MQ 803
## 7 CO 780
## 8 CO 758
## 9 DL 730
## 10 MQ 691
## .. ... ...
Notes:
# base R approach to create a new variable Speed (in mph)
# You can see your results by selecting out those columns
#
flights$Speed <- flights$Distance / flights$AirTime*60
flights[, c("Distance", "AirTime", "Speed")]
## Source: local data frame [227,496 x 3]
##
## Distance AirTime Speed
## (int) (int) (dbl)
## 1 224 40 336.0000
## 2 224 45 298.6667
## 3 224 48 280.0000
## 4 224 39 344.6154
## 5 224 44 305.4545
## 6 224 45 298.6667
## 7 224 43 312.5581
## 8 224 40 336.0000
## 9 224 41 327.8049
## 10 224 45 298.6667
## .. ... ... ...
# dplyr approach (prints the new variable but does not store it)
# Mutate - articulate new variable name "speed", and when referring to existing column
# names, no $ needed.
# Note: the select is not explicitly needed
#
flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance/AirTime*60)
## Source: local data frame [227,496 x 3]
##
## Distance AirTime Speed
## (int) (int) (dbl)
## 1 224 40 336.0000
## 2 224 45 298.6667
## 3 224 48 280.0000
## 4 224 39 344.6154
## 5 224 44 305.4545
## 6 224 45 298.6667
## 7 224 43 312.5581
## 8 224 40 336.0000
## 9 224 41 327.8049
## 10 224 45 298.6667
## .. ... ... ...
# store the new variable
flights <- flights %>% mutate(Speed = Distance/AirTime*60)
Note: mutate_each is also available
Summarise is when you want to group data together by some variable and then aggregate it with some function.
Helper function n_distinct(vector) counts the number of unique items in that vector
Notes:
# base R approaches to calculate the average arrival delay to each destination
head(with(flights, tapply(ArrDelay, Dest, mean, na.rm=TRUE)))
## ABQ AEX AGS AMA ANC ASE
## 7.226259 5.839437 4.000000 6.840095 26.080645 6.794643
head(aggregate(ArrDelay ~ Dest, flights, mean))
## Dest ArrDelay
## 1 ABQ 7.226259
## 2 AEX 5.839437
## 3 AGS 4.000000
## 4 AMA 6.840095
## 5 ANC 26.080645
## 6 ASE 6.794643
# dplyr approach: create a table grouped by Dest, and then summarise each group by taking the mean of ArrDelay
# Summarise is to group data together by some variable and then aggregate it with some function
# Below: Group_By destination, then for EACH group, SUMMARISE it by calculating the MEAN arrival delay
# and assign it to the variable "avg_delay" Because there are NA's, they must be delt with.
flights %>%
group_by(Dest) %>%
summarise(avg_delay = mean(ArrDelay, na.rm=TRUE))
## Source: local data frame [116 x 2]
##
## Dest avg_delay
## (chr) (dbl)
## 1 ABQ 7.226259
## 2 AEX 5.839437
## 3 AGS 4.000000
## 4 AMA 6.840095
## 5 ANC 26.080645
## 6 ASE 6.794643
## 7 ATL 8.233251
## 8 AUS 7.448718
## 9 AVL 9.973988
## 10 BFL -13.198807
## .. ... ...
# SUMMARISE_EACH (like MUTATE_EACH)
# summarise_each allows you to apply the "same"" summary function to "multiple"" columns at once
# for each carrier (aka 'group_by'), calculate the percentage of flights cancelled or diverted
# The FUNS keyword takes an argument of a function, and then you list the columns you want to run it on.
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted)
## Source: local data frame [15 x 3]
##
## UniqueCarrier Cancelled Diverted
## (chr) (dbl) (dbl)
## 1 AA 0.018495684 0.001849568
## 2 AS 0.000000000 0.002739726
## 3 B6 0.025899281 0.005755396
## 4 CO 0.006782614 0.002627370
## 5 DL 0.015903067 0.003029156
## 6 EV 0.034482759 0.003176044
## 7 F9 0.007159905 0.000000000
## 8 FL 0.009817672 0.003272557
## 9 MQ 0.029044750 0.001936317
## 10 OO 0.013946828 0.003486707
## 11 UA 0.016409266 0.002413127
## 12 US 0.011268986 0.001469868
## 13 WN 0.015504047 0.002293629
## 14 XE 0.015495599 0.003449550
## 15 YV 0.012658228 0.000000000
# for each carrier, calculate the minimum and maximum arrival and departure delays
# Note: The DOT is a place holder for the data that you are passing in.
# NOTE: Match Keyword is on ANY column with "DELAY" in it.
# NOTE: And it actually creates these useful variable names.
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(min(., na.rm=TRUE), max(., na.rm=TRUE)),
matches("Delay"))
## Source: local data frame [15 x 5]
##
## UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
## (chr) (int) (int) (int) (int)
## 1 AA -39 -15 978 970
## 2 AS -43 -15 183 172
## 3 B6 -44 -14 335 310
## 4 CO -55 -18 957 981
## 5 DL -32 -17 701 730
## 6 EV -40 -18 469 479
## 7 F9 -24 -15 277 275
## 8 FL -30 -14 500 507
## 9 MQ -38 -23 918 931
## 10 OO -57 -33 380 360
## 11 UA -47 -11 861 869
## 12 US -42 -17 433 425
## 13 WN -44 -10 499 548
## 14 XE -70 -19 634 628
## 15 YV -32 -11 72 54
# Helper function: n() counts the number of rows in a group
# Helper function: n_distinct(vector) counts the number of "unique"" items in that vector
# for each day of the year, count the total number of flights and sort in descending order
# Note: You can "group_by" as many variables as you like
# Note: All the n() function does is count the number of rows in the group.
# After summarising, then we arrange in decending order.
#
flights %>%
group_by(Month, DayofMonth) %>%
summarise(flight_count = n()) %>%
arrange(desc(flight_count))
## Source: local data frame [365 x 3]
## Groups: Month [12]
##
## Month DayofMonth flight_count
## (int) (int) (int)
## 1 1 3 702
## 2 1 2 678
## 3 1 20 663
## 4 1 27 663
## 5 1 13 662
## 6 1 7 661
## 7 1 14 661
## 8 1 21 661
## 9 1 28 661
## 10 1 6 660
## .. ... ... ...
# TALLY FUNCTION
# REWRITE the Previous code more simply with the `tally` function
# Except that it doesn't give you the Column name for the N function - only diff.
#
flights %>%
group_by(Month, DayofMonth) %>%
tally(sort = TRUE)
## Source: local data frame [365 x 3]
## Groups: Month [12]
##
## Month DayofMonth n
## (int) (int) (int)
## 1 1 3 702
## 2 1 2 678
## 3 1 20 663
## 4 1 27 663
## 5 1 13 662
## 6 1 7 661
## 7 1 14 661
## 8 1 21 661
## 9 1 28 661
## 10 1 6 660
## .. ... ... ...
# N_DISTINCT FUNCTION
# for each destination, count the total number of flights and the number of 'distinct' planes that flew there
#
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum))
## Source: local data frame [116 x 3]
##
## Dest flight_count plane_count
## (chr) (int) (int)
## 1 ABQ 2812 716
## 2 AEX 724 215
## 3 AGS 1 1
## 4 AMA 1297 158
## 5 ANC 125 38
## 6 ASE 125 60
## 7 ATL 7886 983
## 8 AUS 5022 1015
## 9 AVL 350 142
## 10 BFL 504 70
## .. ... ... ...
# Grouping can sometimes be useful without summarising
# Note: you can use other base R function with the dplyr operator "%>%"
#
# for each destination, show the number of "cancelled" and "not cancelled" flights
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table() %>%
head()
## Cancelled
## Dest 0 1
## ABQ 2787 25
## AEX 712 12
## AGS 1 0
## AMA 1265 32
## ANC 125 0
## ASE 120 5
Aggregation functions (like mean) takes n inputs and returns 1 (one) value
Notes:
# for each carrier, calculate which two days of the year they had their longest departure delays
# note: smallest (not largest) value is ranked as 1, so you have to use `desc` to rank by largest value
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
filter(min_rank(desc(DepDelay)) <= 2) %>%
arrange(UniqueCarrier, desc(DepDelay))
## Source: local data frame [30 x 4]
## Groups: UniqueCarrier [15]
##
## UniqueCarrier Month DayofMonth DepDelay
## (chr) (int) (int) (int)
## 1 AA 12 12 970
## 2 AA 11 19 677
## 3 AS 2 28 172
## 4 AS 7 6 138
## 5 B6 10 29 310
## 6 B6 8 19 283
## 7 CO 8 1 981
## 8 CO 1 20 780
## 9 DL 10 25 730
## 10 DL 4 5 497
## .. ... ... ... ...
# Window Function: TOP_N - give me the top N of a group.
#
# rewrite more simply with the `top_n` function
#
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
top_n(2) %>%
arrange(UniqueCarrier, desc(DepDelay))
## Selecting by DepDelay
## Source: local data frame [30 x 4]
## Groups: UniqueCarrier [15]
##
## UniqueCarrier Month DayofMonth DepDelay
## (chr) (int) (int) (int)
## 1 AA 12 12 970
## 2 AA 11 19 677
## 3 AS 2 28 172
## 4 AS 7 6 138
## 5 B6 10 29 310
## 6 B6 8 19 283
## 7 CO 8 1 981
## 8 CO 1 20 780
## 9 DL 10 25 730
## 10 DL 4 5 497
## .. ... ... ... ...
# Window Function: LAG (look at earlier value), LEAD (look at the next value)
# for each month, calculate the number of flights and the "change" from the previous month
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count - lag(flight_count))
## Source: local data frame [12 x 3]
##
## Month flight_count change
## (int) (int) (int)
## 1 1 18910 NA
## 2 2 17128 -1782
## 3 3 19470 2342
## 4 4 18593 -877
## 5 5 19172 579
## 6 6 19600 428
## 7 7 20548 948
## 8 8 20176 -372
## 9 9 18065 -2111
## 10 10 18696 631
## 11 11 18021 -675
## 12 12 19117 1096
# REWRITE - more simply with the `tally` function
# Note: "n" referrs to "flight_count" in this code.
flights %>%
group_by(Month) %>%
tally() %>%
mutate(change = n - lag(n))
## Source: local data frame [12 x 3]
##
## Month n change
## (int) (int) (int)
## 1 1 18910 NA
## 2 2 17128 -1782
## 3 3 19470 2342
## 4 4 18593 -877
## 5 5 19172 579
## 6 6 19600 428
## 7 7 20548 948
## 8 8 20176 -372
## 9 9 18065 -2111
## 10 10 18696 631
## 11 11 18021 -675
## 12 12 19117 1096
Notes:
# SAMPLE - sample random rows from the dataset
# randomly sample a fixed number of rows, without replacement
# Extract out 5 ramdom rows from the dataset.
flights %>% sample_n(5)
## Source: local data frame [5 x 22]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## (int) (int) (int) (int) (int) (int) (chr) (int)
## 1 2011 9 20 2 758 859 WN 2177
## 2 2011 4 24 7 1027 1142 XE 2482
## 3 2011 6 15 3 1438 1819 XE 3078
## 4 2011 1 18 2 920 1111 XE 2885
## 5 2011 9 30 5 845 1003 CO 1430
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), Speed (dbl)
# SAMPLE_FREC -s sample a faction (percentage) of the random rows from the dataset
# NOTE: "Replace=True" means to sample with replacement
# randomly sample a fraction of rows, with replacement
flights %>% sample_frac(0.25, replace=TRUE)
## Source: local data frame [56,874 x 22]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 4 3 7 1754 1932 CO
## 2 2011 11 5 6 1312 1605 CO
## 3 2011 4 4 1 2109 2316 XE
## 4 2011 5 10 2 1635 1953 XE
## 5 2011 11 26 6 1154 1341 AA
## 6 2011 10 29 6 1854 2147 XE
## 7 2011 4 25 1 717 1008 XE
## 8 2011 4 2 6 1431 1600 XE
## 9 2011 3 17 4 1048 1201 XE
## 10 2011 4 30 6 1918 2045 XE
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), Speed (dbl)
# STR - looking at the structure of a dataset
# BASE R: - ugly.
# base R approach to view the structure of an object
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 227496 obs. of 22 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Speed : num 336 299 280 345 305 ...
# GLIMPSE - better formatting, etc.
# dplyr approach: better formatting, and adapts to your screen width
glimpse(flights)
## Observations: 227,496
## Variables: 22
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Speed (dbl) 336.0000, 298.6667, 280.0000, 344.6154, 305....