library(nycflights13)
## Warning: package 'nycflights13' was built under R version 3.1.1
suppressMessages(library(dplyr))
Package nycflights13 has data about all flights that departed NYC in 2013 - 5 datasets
flights: Flights dataairlines: Airline namesairports: Airport metadataplanes : Plane metadataweather: Hourly weather dataPrints nicely and prevents an accidental display of the whole dataset
tblflights <- tbl_df(flights)
head(tblflights,3) # Can also use print(flights,3) instead
## Source: local data frame [3 x 16]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 542 2 923 33 AA N619AA
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
filter: for subsetting variablesselect: for subsetting rowsarrange: for re-ordering rowsmutate: for adding new columnssummarise or summarize: for reducing each group to a smaller number of summary statistics# base R: tblflights[tblflights$carrier=="AA" & tblflights$origin=="LGA", ]
filter(tblflights, carrier=="AA" & origin=="LGA")
## Source: local data frame [15,459 x 16]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 558 -2 753 8 AA N3ALAA
## 2 2013 1 1 559 -1 941 31 AA N3DUAA
## 3 2013 1 1 623 13 920 5 AA N3EMAA
## 4 2013 1 1 629 -1 824 14 AA N3CYAA
## 5 2013 1 1 635 0 1028 48 AA N3GKAA
## 6 2013 1 1 656 -4 854 4 AA N4WNAA
## 7 2013 1 1 659 -1 1008 -7 AA N3EKAA
## 8 2013 1 1 724 -6 1111 31 AA N541AA
## 9 2013 1 1 739 -6 918 -12 AA N4WPAA
## 10 2013 1 1 753 -2 1056 -14 AA N3HMAA
## .. ... ... ... ... ... ... ... ... ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
# same as filter(tblflights, carrier=="AA", origin=="LGA")
filter(tblflights, carrier %in% c("AA", "UA"))
## Source: local data frame [91,394 x 16]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## 2 2013 1 1 533 4 850 20 UA N24211
## 3 2013 1 1 542 2 923 33 AA N619AA
## 4 2013 1 1 554 -4 740 12 UA N39463
## 5 2013 1 1 558 -2 753 8 AA N3ALAA
## 6 2013 1 1 558 -2 924 7 UA N29129
## 7 2013 1 1 558 -2 923 -14 UA N53441
## 8 2013 1 1 559 -1 941 31 AA N3DUAA
## 9 2013 1 1 559 -1 854 -8 UA N76515
## 10 2013 1 1 606 -4 858 -12 AA N633AA
## .. ... ... ... ... ... ... ... ... ...
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
# dplyr approach
print(select(tblflights, dep_time, arr_time, flight),n=6)
## Source: local data frame [336,776 x 3]
##
## dep_time arr_time flight
## 1 517 830 1545
## 2 533 850 1714
## 3 542 923 1141
## 4 544 1004 725
## 5 554 812 461
## 6 554 740 1696
## .. ... ... ...
# nesting method to select carrier and dep_delay columns and filter for delays over 60 minutes
head(filter(select(tblflights, carrier, dep_delay), dep_delay > 60))
## Source: local data frame [6 x 2]
##
## carrier dep_delay
## 1 MQ 101
## 2 AA 71
## 3 MQ 853
## 4 UA 144
## 5 UA 134
## 6 EV 96
# base R approach to select carrier and dep_delay columns and sort by dep_delay
head(tblflights[order(tblflights$dep_delay), c("carrier", "dep_delay")])
## Source: local data frame [6 x 2]
##
## carrier dep_delay
## 89674 B6 -43
## 113634 DL -33
## 64502 EV -32
## 9620 DL -30
## 24916 F9 -27
## 287743 MQ -26
# base R approach to create a new variable - sum of squares of delays (arr and dep)
tblflights$delaysquare <- tblflights$dep_delay^2 + tblflights$arr_delay^2
head(tblflights[, c("dep_delay", "arr_delay", "delaysquare")])
## Source: local data frame [6 x 3]
##
## dep_delay arr_delay delaysquare
## 1 2 11 125
## 2 4 20 416
## 3 2 33 1093
## 4 -1 -18 325
## 5 -6 -25 661
## 6 -4 12 160
group_by creates the groups that will be operated onsummarise uses the provided aggregation function to summarise each group# base R approaches to calculate the mean arrival delays at different airports
aggregate(arr_delay ~ origin, tblflights, mean)
## origin arr_delay
## 1 EWR 9.107
## 2 JFK 5.551
## 3 LGA 5.783
# or with(tblflights, tapply(arr_delay, origin, mean, na.rm=TRUE))
# for each carrier, calculate the mean arrival and departure delays at the different origin airports
tblflights %>%
group_by(origin) %>%
summarise_each(funs(mean(.,na.rm=TRUE)), arr_delay, dep_delay) %>%
head()
## Source: local data frame [3 x 3]
##
## origin arr_delay dep_delay
## 1 EWR 9.107 15.11
## 2 JFK 5.551 12.11
## 3 LGA 5.783 10.35
tblflights %>%
select(matches("_delay")) %>%
head(3)
## Source: local data frame [3 x 2]
##
## dep_delay arr_delay
## 1 2 11
## 2 4 20
## 3 2 33
tblflights %>%
select(matches("_delay")) %>%
mutate_each(funs(half=./2)) %>%
head(3)
## Source: local data frame [3 x 2]
##
## dep_delay arr_delay
## 1 1 5.5
## 2 2 10.0
## 3 1 16.5
# for each destination, count the total number of flights and the number of distinct planes that flew there
tblflights %>%
group_by(dest) %>%
summarise(flight_count = n(), plane_count = n_distinct(tailnum)) %>%
head()
## Source: local data frame [6 x 3]
##
## dest flight_count plane_count
## 1 ABQ 254 108
## 2 ACK 265 58
## 3 ALB 439 172
## 4 ANC 8 6
## 5 ATL 17215 1180
## 6 AUS 2439 993
mean) takes n inputs and returns 1 valuemin_rank), offset functions (lead and lag), and cumulative aggregates (like cummean).# for each carrier, calculate which two days of the year they had their longest departure delays--- rewrite previous with the `top_n` function
tblflights %>%
group_by(carrier) %>%
select(month, day, dep_delay) %>%
filter(dep_delay!="NA") %>%
top_n(2) %>%
arrange(carrier, desc(dep_delay)) %>% head()
# for each month, calculate the number of flights and the change from the previous month
tblflights %>%
group_by(month) %>%
summarise(flight_count = n()) %>%
mutate(change = flight_count - lag(flight_count))
# rewrite previous with the `tally` function
tblflights %>%
group_by(month) %>%
tally() %>%
mutate(change = n - lag(n))
model=tblflights %>% group_by(origin) %>% do(lm=lm(dep_delay~arr_delay+carrier,data=.))
##
|================================ | 67% ~2 s remaining
Completed after 4 s
model %>% summarise(rsq=summary(lm)$r.squared)
## Source: local data frame [3 x 1]
##
## rsq
## 1 0.8636
## 2 0.8182
## 3 0.8417
tblflights %>% head(1)
## Source: local data frame [1 x 17]
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 1 517 2 830 11 UA N14228
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl), delaysquare (dbl)
airlines %>% head(1)
## Source: local data frame [1 x 2]
##
## carrier name
## 1 9E Endeavor Air Inc.
airports %>% head(1)
## Source: local data frame [1 x 7]
##
## faa name lat lon alt tz dst
## 1 04G Lansdowne Airport 41.13 -80.62 1044 -5 A
faatblflights=tblflights %>% select(origin) %>% mutate(faa=origin)
faatblflights %>% left_join(airports) %>% head(2)
## Joining by: "faa"
## Source: local data frame [2 x 8]
##
## origin faa name lat lon alt tz dst
## 1 EWR EWR Newark Liberty Intl 40.69 -74.17 18 -5 A
## 2 LGA LGA La Guardia 40.78 -73.87 22 -5 A
airports %>% left_join(faatblflights) %>% head(2)
## Joining by: "faa"
## Source: local data frame [2 x 8]
##
## faa name lat lon alt tz dst origin
## 1 04G Lansdowne Airport 41.13 -80.62 1044 -5 A NA
## 2 06A Moton Field Municipal Airport 32.46 -85.68 264 -5 A NA
faatblflights %>% anti_join(airports)
## Joining by: "faa"
## Source: local data frame [0 x 2]
airports %>% anti_join(faatblflights)
## Joining by: "faa"
## Source: local data frame [1,394 x 7]
##
## faa name lat lon alt tz dst
## 1 PVC Provincetown Muni 42.07 -70.22 9 -5 A
## 2 GDV Dawson Community Airport 47.14 -104.81 749 -7 A
## 3 OWB Owensboro Daviess County Airport 37.74 -87.17 406 -6 A
## 4 ESD Orcas Island Airport 48.71 -122.91 31 -8 A
## 5 PBI Palm Beach Intl 26.68 -80.10 19 -5 A
## 6 NIB Nikolai Airport 63.01 -154.38 427 -9 A
## 7 DLL Baraboo Wisconsin Dells Airport 43.52 -89.77 979 -6 A
## 8 MKK Molokai 21.15 -157.10 454 -10 N
## 9 CFD Coulter Fld 30.72 -96.33 367 -6 A
## 10 BJC Rocky Mountain Metropolitan Airport 39.91 -105.12 5670 -7 A
## .. ... ... ... ... ... ... ...