dplyr is an R package used for Data Manipulation and Exploration
Let’s get started!
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
data("hflights")
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
# Converting to local data frame
flights <- tbl_df(hflights)
flights
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# I can specify number of rows I want to see
print(flights, n = 30)
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## 11 2011 1 11 2 1429 1539 AA 428
## 12 2011 1 12 3 1419 1515 AA 428
## 13 2011 1 13 4 1358 1501 AA 428
## 14 2011 1 14 5 1357 1504 AA 428
## 15 2011 1 15 6 1359 1459 AA 428
## 16 2011 1 16 7 1359 1509 AA 428
## 17 2011 1 17 1 1530 1634 AA 428
## 18 2011 1 18 2 1408 1508 AA 428
## 19 2011 1 19 3 1356 1503 AA 428
## 20 2011 1 20 4 1507 1622 AA 428
## 21 2011 1 21 5 1357 1459 AA 428
## 22 2011 1 22 6 1355 1456 AA 428
## 23 2011 1 23 7 1356 1501 AA 428
## 24 2011 1 24 1 1356 1513 AA 428
## 25 2011 1 25 2 1352 1452 AA 428
## 26 2011 1 26 3 1353 1455 AA 428
## 27 2011 1 27 4 1356 1458 AA 428
## 28 2011 1 28 5 1359 1505 AA 428
## 29 2011 1 29 6 1355 1455 AA 428
## 30 2011 1 30 7 1359 1456 AA 428
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# Convert to a normal data frame to see all of the columns
data.frame(head(flights))
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin Dest Distance
## 1 N576AA 60 40 -10 0 IAH DFW 224
## 2 N557AA 60 45 -9 1 IAH DFW 224
## 3 N541AA 70 48 -8 -8 IAH DFW 224
## 4 N403AA 70 39 3 3 IAH DFW 224
## 5 N492AA 62 44 -3 5 IAH DFW 224
## 6 N262AA 64 45 -7 -1 IAH DFW 224
## TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 1 7 13 0 0
## 2 6 9 0 0
## 3 5 17 0 0
## 4 9 22 0 0
## 5 9 9 0 0
## 6 6 13 0 0
# Base R approach
flights[flights$Month == 1 & flights$DayofMonth == 1,]
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## 7 2011 1 1 6 557 906 AA 1994
## 8 2011 1 1 6 1824 2106 AS 731
## 9 2011 1 1 6 654 1124 B6 620
## 10 2011 1 1 6 1639 2110 B6 622
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# dplyr approach
# AND with ampersand
filter(.data = flights, ... = Month == 1 & DayofMonth == 1)
## Source: local data frame [552 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 1 6 728 840 AA 460
## 3 2011 1 1 6 1631 1736 AA 1121
## 4 2011 1 1 6 1756 2112 AA 1294
## 5 2011 1 1 6 1012 1347 AA 1700
## 6 2011 1 1 6 1211 1325 AA 1820
## 7 2011 1 1 6 557 906 AA 1994
## 8 2011 1 1 6 1824 2106 AS 731
## 9 2011 1 1 6 654 1124 B6 620
## 10 2011 1 1 6 1639 2110 B6 622
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# OR with pipe sumbol
filter(.data = flights, ... = UniqueCarrier == "AA" | UniqueCarrier == "UA")
## Source: local data frame [5,316 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# Note : dplyr discards rownames
# You can even use the %in% operator
filter(.data = flights, ... = UniqueCarrier %in% c("AA", "UA"))
## Source: local data frame [5,316 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 1 1 6 1400 1500 AA 428
## 2 2011 1 2 7 1401 1501 AA 428
## 3 2011 1 3 1 1352 1502 AA 428
## 4 2011 1 4 2 1403 1513 AA 428
## 5 2011 1 5 3 1405 1507 AA 428
## 6 2011 1 6 4 1359 1503 AA 428
## 7 2011 1 7 5 1359 1509 AA 428
## 8 2011 1 8 6 1355 1454 AA 428
## 9 2011 1 9 7 1443 1554 AA 428
## 10 2011 1 10 1 1443 1553 AA 428
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
# Base R apporach
flights[, c("ArrTime", "DepTime")]
## Source: local data frame [227,496 x 2]
##
## ArrTime DepTime
## 1 1500 1400
## 2 1501 1401
## 3 1502 1352
## 4 1513 1403
## 5 1507 1405
## 6 1503 1359
## 7 1509 1359
## 8 1454 1355
## 9 1554 1443
## 10 1553 1443
## .. ... ...
# dplyr approach
select(.data = flights, ArrTime, DepTime)
## Source: local data frame [227,496 x 2]
##
## ArrTime DepTime
## 1 1500 1400
## 2 1501 1401
## 3 1502 1352
## 4 1513 1403
## 5 1507 1405
## 6 1503 1359
## 7 1509 1359
## 8 1454 1355
## 9 1554 1443
## 10 1553 1443
## .. ... ...
select(.data = flights, Year:DayofMonth, contains("Taxi"), contains("Delay"))
## Source: local data frame [227,496 x 7]
##
## Year Month DayofMonth TaxiIn TaxiOut ArrDelay DepDelay
## 1 2011 1 1 7 13 -10 0
## 2 2011 1 2 6 9 -9 1
## 3 2011 1 3 5 17 -8 -8
## 4 2011 1 4 9 22 3 3
## 5 2011 1 5 9 9 -3 5
## 6 2011 1 6 6 13 -7 -1
## 7 2011 1 7 12 15 -1 -1
## 8 2011 1 8 7 12 -16 -5
## 9 2011 1 9 8 22 44 43
## 10 2011 1 10 6 19 43 43
## .. ... ... ... ... ... ... ...
# nesting method
filter(select(flights, ArrDelay, DepDelay), DepDelay > 3)
## Source: local data frame [82,841 x 2]
##
## ArrDelay DepDelay
## 1 -3 5
## 2 44 43
## 3 43 43
## 4 29 29
## 5 5 19
## 6 84 90
## 7 -2 8
## 8 72 67
## 9 43 41
## 10 5 8
## .. ... ...
# chaining method
flights %>%
select(ArrDelay, DepDelay) %>%
filter(DepDelay > 3)
## Source: local data frame [82,841 x 2]
##
## ArrDelay DepDelay
## 1 -3 5
## 2 44 43
## 3 43 43
## 4 29 29
## 5 5 19
## 6 84 90
## 7 -2 8
## 8 72 67
## 9 43 41
## 10 5 8
## .. ... ...
# create two vectors and calculate Euclidean distance between them
x1 <- 1:5
x2 <- 3:7
(x1 - x2) ^ 2 %>% sum() %>% sqrt()
## [1] 4.472136
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(DepDelay)
## Source: local data frame [227,496 x 2]
##
## UniqueCarrier DepDelay
## 1 OO -33
## 2 MQ -23
## 3 XE -19
## 4 XE -19
## 5 CO -18
## 6 EV -18
## 7 XE -17
## 8 CO -17
## 9 XE -17
## 10 MQ -17
## .. ... ...
flights %>%
select(UniqueCarrier, DepDelay) %>%
arrange(desc(DepDelay))
## Source: local data frame [227,496 x 2]
##
## UniqueCarrier DepDelay
## 1 CO 981
## 2 AA 970
## 3 MQ 931
## 4 UA 869
## 5 MQ 814
## 6 MQ 803
## 7 CO 780
## 8 CO 758
## 9 DL 730
## 10 MQ 691
## .. ... ...
# Create new variables which are functions of existing variables
flights %>%
select(Distance, AirTime) %>%
mutate(Speed = Distance / AirTime * 60)
## Source: local data frame [227,496 x 3]
##
## Distance AirTime Speed
## 1 224 40 336.0000
## 2 224 45 298.6667
## 3 224 48 280.0000
## 4 224 39 344.6154
## 5 224 44 305.4545
## 6 224 45 298.6667
## 7 224 43 312.5581
## 8 224 40 336.0000
## 9 224 41 327.8049
## 10 224 45 298.6667
## .. ... ... ...
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
# To store the new variable
flights <- flights %>%
mutate(Speed = Distance / AirTime * 60)
str(flights)
## Classes 'tbl_df', 'tbl' and 'data.frame': 227496 obs. of 22 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Speed : num 336 299 280 345 305 ...
head(data.frame(flights %>% select(Dest, TailNum) %>% group_by(Dest)))
## Dest TailNum
## 1 DFW N576AA
## 2 DFW N557AA
## 3 DFW N541AA
## 4 DFW N403AA
## 5 DFW N492AA
## 6 DFW N262AA
flights %>% group_by(Dest) %>% summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))
## Source: local data frame [116 x 2]
##
## Dest avg_delay
## 1 ABQ 7.226259
## 2 AEX 5.839437
## 3 AGS 4.000000
## 4 AMA 6.840095
## 5 ANC 26.080645
## 6 ASE 6.794643
## 7 ATL 8.233251
## 8 AUS 7.448718
## 9 AVL 9.973988
## 10 BFL -13.198807
## .. ... ...
Allows you to apply the same function to multiple columns at once.
Similar to summarize_each, we have mutate_each serving the same purpose as mutate but can be applied on multiple columns.
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(mean), Cancelled, Diverted)
## Source: local data frame [15 x 3]
##
## UniqueCarrier Cancelled Diverted
## 1 AA 0.018495684 0.001849568
## 2 AS 0.000000000 0.002739726
## 3 B6 0.025899281 0.005755396
## 4 CO 0.006782614 0.002627370
## 5 DL 0.015903067 0.003029156
## 6 EV 0.034482759 0.003176044
## 7 F9 0.007159905 0.000000000
## 8 FL 0.009817672 0.003272557
## 9 MQ 0.029044750 0.001936317
## 10 OO 0.013946828 0.003486707
## 11 UA 0.016409266 0.002413127
## 12 US 0.011268986 0.001469868
## 13 WN 0.015504047 0.002293629
## 14 XE 0.015495599 0.003449550
## 15 YV 0.012658228 0.000000000
flights %>%
group_by(UniqueCarrier) %>%
summarise_each(funs(min(., na.rm = TRUE), max(., na.rm = TRUE)), matches("Delay"))
## Source: local data frame [15 x 5]
##
## UniqueCarrier ArrDelay_min DepDelay_min ArrDelay_max DepDelay_max
## 1 AA -39 -15 978 970
## 2 AS -43 -15 183 172
## 3 B6 -44 -14 335 310
## 4 CO -55 -18 957 981
## 5 DL -32 -17 701 730
## 6 EV -40 -18 469 479
## 7 F9 -24 -15 277 275
## 8 FL -30 -14 500 507
## 9 MQ -38 -23 918 931
## 10 OO -57 -33 380 360
## 11 UA -47 -11 861 869
## 12 US -42 -17 433 425
## 13 WN -44 -10 499 548
## 14 XE -70 -19 634 628
## 15 YV -32 -11 72 54
# n() - counts the number of rows in a group
# n_distinct(vector) - counts the number of unique items in a vector
flights %>%
group_by(Month, DayofMonth) %>%
summarize(flight_count = n()) %>%
arrange(desc(flight_count))
## Source: local data frame [365 x 3]
## Groups: Month
##
## Month DayofMonth flight_count
## 1 1 3 702
## 2 1 2 678
## 3 1 20 663
## 4 1 27 663
## 5 1 13 662
## 6 1 7 661
## 7 1 14 661
## 8 1 21 661
## 9 1 28 661
## 10 1 6 660
## .. ... ... ...
# Doing same thing with tally() function
flights %>%
group_by(Month, DayofMonth) %>%
tally(sort = TRUE)
## Source: local data frame [365 x 3]
## Groups: Month
##
## Month DayofMonth n
## 1 1 3 702
## 2 1 2 678
## 3 1 20 663
## 4 1 27 663
## 5 1 13 662
## 6 1 7 661
## 7 1 14 661
## 8 1 21 661
## 9 1 28 661
## 10 1 6 660
## .. ... ... ...
flights %>%
group_by(Dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(TailNum))
## Source: local data frame [116 x 3]
##
## Dest flight_count plane_count
## 1 ABQ 2812 716
## 2 AEX 724 215
## 3 AGS 1 1
## 4 AMA 1297 158
## 5 ANC 125 38
## 6 ASE 125 60
## 7 ATL 7886 983
## 8 AUS 5022 1015
## 9 AVL 350 142
## 10 BFL 504 70
## .. ... ... ...
Grouping can sometimes be useful without summarizing
flights %>%
group_by(Dest) %>%
select(Cancelled) %>%
table() %>%
head()
## Cancelled
## Dest 0 1
## ABQ 2787 25
## AEX 712 12
## AGS 1 0
## AMA 1265 32
## ANC 125 0
## ASE 120 5
An aggregation function - takes n inputs, return 1 value
A window function - takes n inputs, returns n values eg ranking and ordering functions, offset functions, and cumulative aggregates
Ranking and Ordering Fuctions, like min_rank
Offset Functions, like lead and lag
Cumulative Aggregates, like cummean
# for each carrier, calculate which two days of the year they had their longest departure delays
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
filter(min_rank(desc(DepDelay)) <= 2) %>%
arrange(UniqueCarrier, desc(DepDelay))
## Source: local data frame [30 x 4]
## Groups: UniqueCarrier
##
## UniqueCarrier Month DayofMonth DepDelay
## 1 AA 12 12 970
## 2 AA 11 19 677
## 3 AS 2 28 172
## 4 AS 7 6 138
## 5 B6 10 29 310
## 6 B6 8 19 283
## 7 CO 8 1 981
## 8 CO 1 20 780
## 9 DL 10 25 730
## 10 DL 4 5 497
## .. ... ... ... ...
# using top_n() function to simplify the above command
flights %>%
group_by(UniqueCarrier) %>%
select(Month, DayofMonth, DepDelay) %>%
top_n(n = 2) %>%
arrange(UniqueCarrier, desc(DepDelay))
## Selecting by DepDelay
## Source: local data frame [30 x 4]
## Groups: UniqueCarrier
##
## UniqueCarrier Month DayofMonth DepDelay
## 1 AA 12 12 970
## 2 AA 11 19 677
## 3 AS 2 28 172
## 4 AS 7 6 138
## 5 B6 10 29 310
## 6 B6 8 19 283
## 7 CO 8 1 981
## 8 CO 1 20 780
## 9 DL 10 25 730
## 10 DL 4 5 497
## .. ... ... ... ...
# for each month calculate the number of flights and the change from the previous month
flights %>%
group_by(Month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count - lag(flight_count))
## Source: local data frame [12 x 3]
##
## Month flight_count change
## 1 1 18910 NA
## 2 2 17128 -1782
## 3 3 19470 2342
## 4 4 18593 -877
## 5 5 19172 579
## 6 6 19600 428
## 7 7 20548 948
## 8 8 20176 -372
## 9 9 18065 -2111
## 10 10 18696 631
## 11 11 18021 -675
## 12 12 19117 1096
# randomly sample a fixed number of rows without replacement
flights %>% sample_n(size = 10)
## Source: local data frame [10 x 22]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 7 16 6 1543 1653 XE 2069
## 2 2011 1 11 2 1325 1637 XE 2980
## 3 2011 2 20 7 1428 1757 CO 1844
## 4 2011 8 20 6 1023 1206 UA 434
## 5 2011 8 7 7 2014 2154 WN 164
## 6 2011 10 28 5 2009 2320 WN 287
## 7 2011 12 24 6 906 1216 WN 2215
## 8 2011 8 4 4 843 1327 CO 1529
## 9 2011 5 29 7 1648 1735 WN 40
## 10 2011 11 5 6 1830 2117 FL 296
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), Speed (dbl)
# randomly sample a fraction of rows with replacement
flights %>% sample_frac(size = 0.6, replace = TRUE)
## Source: local data frame [136,498 x 22]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier FlightNum
## 1 2011 2 22 2 1036 1123 WN 576
## 2 2011 2 14 1 1038 1337 XE 2029
## 3 2011 7 5 2 1759 2241 CO 1540
## 4 2011 12 29 4 706 1045 XE 4407
## 5 2011 12 21 3 1401 1730 UA 312
## 6 2011 7 18 1 2106 2207 XE 2654
## 7 2011 3 17 4 1833 2045 OO 4455
## 8 2011 12 16 5 1948 2041 WN 47
## 9 2011 9 4 7 1603 1810 XE 2944
## 10 2011 5 5 4 719 804 OO 1184
## .. ... ... ... ... ... ... ... ...
## Variables not shown: TailNum (chr), ActualElapsedTime (int), AirTime
## (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest (chr),
## Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), Speed (dbl)
# dplyr approach, better formatting and adapts to your screen width
glimpse(flights) # dplyr approach to str
## Observations: 227496
## Variables:
## $ Year (int) 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth (int) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek (int) 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime (int) 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime (int) 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier (chr) "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum (int) 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum (chr) "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime (int) 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime (int) 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay (int) -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay (int) 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin (chr) "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest (chr) "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance (int) 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn (int) 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut (int) 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode (chr) "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Speed (dbl) 336.0000, 298.6667, 280.0000, 344.6154, 305....