# dplyr is an R package, a collection of functions and data sets that enhance the R language.
# install.packages("dplyr")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# package hflights stores data set of airline flight data containing flights that departed from Houston
# Load the hflights package
# install.packages("hflights")
library(hflights)
class(hflights)
## [1] "data.frame"
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
# derive a tbl from a data.frame structure
# Convert the hflights data.frame into a hflights tbl
hflights <- tbl_df(hflights)
# Display the new hflights in your console window.
# Notice the easy-to-read layout.
hflights
## # A tibble: 227,496 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## * <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# inherits the original class of its input
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
# save the UniqueCarrier column of hflights as an object named carriers, using base R syntax only.
carriers <- hflights$UniqueCarrier
unique(hflights$UniqueCarrier) # uses a confusing code system.
## [1] "AA" "AS" "B6" "CO" "DL" "OO" "UA" "US" "WN" "EV" "F9" "FL" "MQ" "XE"
## [15] "YV"
# "lookup table"" in the form of a named vector
# When you subset the lookup table with a character string
# R will return the values of the lookup table
# that correspond to the names in the character string
two <- c("AA", "AS")
lut <- c("AA" = "American",
"AS" = "Alaska",
"B6" = "JetBlue")
two <- lut[two]
two
## AA AS
## "American" "Alaska"
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
# Add a new Carrier column to hflights
# by combining lut
# with the UniqueCarrier column of hflights
# INCORRECT: hflights$Carrier <- c(lut,carriers)
hflights$Carrier <- lut[hflights$UniqueCarrier]
# how to check this was added correctly?
# Carrier variable does not appear when you print hflights
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier <chr> "American", "American", "American", "America...
# lists reasons why a flight was cancelled using a non-informative alphabetical code
unique(hflights$CancellationCode)
## [1] "" "A" "B" "C" "D"
# The lookup table: converts the alphabetical codes into more meaningful strings
lut <- c("A" = "carrier", "B" = "weather", "C" = "FFA", "D" = "security", "E" = "not cancelled")
# Use lut to change the labels of the CancellationCode column of hflights.
# Store the recoded vector in a new column Code.
# Add the Code column
hflights$Code <- lut[hflights$CancellationCode]
# check your results
glimpse(hflights)
## Observations: 227,496
## Variables: 23
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Carrier <chr> "American", "American", "American", "America...
## $ Code <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
select(), which returns a subset of the columns,
# selects the variables cyl and disp from the data frame df: e.g. mtcars
# select(df, var1, var2)
select(mtcars, cyl, disp)
## cyl disp
## Mazda RX4 6 160.0
## Mazda RX4 Wag 6 160.0
## Datsun 710 4 108.0
## Hornet 4 Drive 6 258.0
## Hornet Sportabout 8 360.0
## Valiant 6 225.0
## Duster 360 8 360.0
## Merc 240D 4 146.7
## Merc 230 4 140.8
## Merc 280 6 167.6
## Merc 280C 6 167.6
## Merc 450SE 8 275.8
## Merc 450SL 8 275.8
## Merc 450SLC 8 275.8
## Cadillac Fleetwood 8 472.0
## Lincoln Continental 8 460.0
## Chrysler Imperial 8 440.0
## Fiat 128 4 78.7
## Honda Civic 4 75.7
## Toyota Corolla 4 71.1
## Toyota Corona 4 120.1
## Dodge Challenger 8 318.0
## AMC Javelin 8 304.0
## Camaro Z28 8 350.0
## Pontiac Firebird 8 400.0
## Fiat X1-9 4 79.0
## Porsche 914-2 4 120.3
## Lotus Europa 4 95.1
## Ford Pantera L 8 351.0
## Ferrari Dino 6 145.0
## Maserati Bora 8 301.0
## Volvo 142E 4 121.0
# use : to select a range of variables
# and - to exclude some variables
# selects the four first variables except for the second one
select(mtcars, 1:4, -2)
## mpg disp hp
## Mazda RX4 21.0 160.0 110
## Mazda RX4 Wag 21.0 160.0 110
## Datsun 710 22.8 108.0 93
## Hornet 4 Drive 21.4 258.0 110
## Hornet Sportabout 18.7 360.0 175
## Valiant 18.1 225.0 105
## Duster 360 14.3 360.0 245
## Merc 240D 24.4 146.7 62
## Merc 230 22.8 140.8 95
## Merc 280 19.2 167.6 123
## Merc 280C 17.8 167.6 123
## Merc 450SE 16.4 275.8 180
## Merc 450SL 17.3 275.8 180
## Merc 450SLC 15.2 275.8 180
## Cadillac Fleetwood 10.4 472.0 205
## Lincoln Continental 10.4 460.0 215
## Chrysler Imperial 14.7 440.0 230
## Fiat 128 32.4 78.7 66
## Honda Civic 30.4 75.7 52
## Toyota Corolla 33.9 71.1 65
## Toyota Corona 21.5 120.1 97
## Dodge Challenger 15.5 318.0 150
## AMC Javelin 15.2 304.0 150
## Camaro Z28 13.3 350.0 245
## Pontiac Firebird 19.2 400.0 175
## Fiat X1-9 27.3 79.0 66
## Porsche 914-2 26.0 120.3 91
## Lotus Europa 30.4 95.1 113
## Ford Pantera L 15.8 351.0 264
## Ferrari Dino 19.7 145.0 175
## Maserati Bora 15.0 301.0 335
## Volvo 142E 21.4 121.0 109
# Use select() to print out a tbl
# that contains only the columns
# ActualElapsedTime, AirTime, ArrDelay and DepDelay
# of hflights.
# Print out a tbl with the four columns of hflights related to delay
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## # A tibble: 227,496 x 4
## ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int>
## 1 60 40 -10 0
## 2 60 45 -9 1
## 3 70 48 -8 -8
## 4 70 39 3 3
## 5 62 44 -3 5
## 6 64 45 -7 -1
## 7 70 43 -1 -1
## 8 59 40 -16 -5
## 9 71 41 44 43
## 10 70 45 43 43
## # ... with 227,486 more rows
# Print out a tbl
# with the columns Origin up to and including Cancelled
# of hflights.
# Print out the columns Origin up to Cancelled of hflights
select(hflights, Origin:Cancelled)
## # A tibble: 227,496 x 6
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## * <chr> <chr> <int> <int> <int> <int>
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## # ... with 227,486 more rows
# Find the most concise way to select:
# columns Year up to and including DayOfWeek,
# columns ArrDelay up to and including Diverted.
# You can examine the order of the variables in hflights
# with names(hflights) in the console.
# Answer to last question: be concise!
names(hflights)
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
## [22] "Carrier" "Code"
select(hflights, Year:DayOfWeek, ArrDelay:Diverted)
## # A tibble: 227,496 x 14
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## * <int> <int> <int> <int> <int> <int> <chr> <chr>
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## # ... with 227,486 more rows, and 6 more variables: Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
NB1: When you refer to columns directly inside select(), you don’t use quotes.
NB2: If you use the helper functions, you do use quotes.
# Print out a tbl containing just ArrDelay and DepDelay
select(hflights, ArrDelay, DepDelay)
## # A tibble: 227,496 x 2
## ArrDelay DepDelay
## * <int> <int>
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## # ... with 227,486 more rows
select(hflights, ends_with("Delay"))
## # A tibble: 227,496 x 2
## ArrDelay DepDelay
## * <int> <int>
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## # ... with 227,486 more rows
# Use a combination of helper functions and variable names
# to print out only
# UniqueCarrier, FlightNum, TailNum, Cancelled, and CancellationCode
# columns of hflights.
select(hflights, UniqueCarrier:TailNum, starts_with("Cancell"))
## # A tibble: 227,496 x 5
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## * <chr> <int> <chr> <int> <chr>
## 1 AA 428 N576AA 0 ""
## 2 AA 428 N557AA 0 ""
## 3 AA 428 N541AA 0 ""
## 4 AA 428 N403AA 0 ""
## 5 AA 428 N492AA 0 ""
## 6 AA 428 N262AA 0 ""
## 7 AA 428 N493AA 0 ""
## 8 AA 428 N477AA 0 ""
## 9 AA 428 N476AA 0 ""
## 10 AA 428 N504AA 0 ""
## # ... with 227,486 more rows
select(hflights, UniqueCarrier, ends_with("Num"), starts_with("Cancel"))
## # A tibble: 227,496 x 5
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## * <chr> <int> <chr> <int> <chr>
## 1 AA 428 N576AA 0 ""
## 2 AA 428 N557AA 0 ""
## 3 AA 428 N541AA 0 ""
## 4 AA 428 N403AA 0 ""
## 5 AA 428 N492AA 0 ""
## 6 AA 428 N262AA 0 ""
## 7 AA 428 N493AA 0 ""
## 8 AA 428 N477AA 0 ""
## 9 AA 428 N476AA 0 ""
## 10 AA 428 N504AA 0 ""
## # ... with 227,486 more rows
# Find the most concise way to return
# the following columns
# with select
# and its helper functions:
# DepTime, ArrTime, ActualElapsedTime, AirTime, ArrDelay, DepDelay
# using only helper functions.
select(hflights, contains("Time"), ends_with("Delay"))
## # A tibble: 227,496 x 6
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int> <int> <int>
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## # ... with 227,486 more rows
select(hflights, contains("Tim"), contains("Del"))
## # A tibble: 227,496 x 6
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## * <int> <int> <int> <int> <int> <int>
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## # ... with 227,486 more rows
To see the added value of the dplyr package, it is useful to compare its syntax with base R
# Finish select call so that ex1d matches ex1r
ex1r <- hflights[c("TaxiIn", "TaxiOut", "Distance")]
ex1d <- select(hflights, starts_with("Taxi"), Distance)
ex1d <- select(hflights, contains("Taxi"), Distance)
# Finish select call so that ex2d matches ex2r
ex2r <- hflights[c("Year", "Month", "DayOfWeek", "DepTime", "ArrTime")]
ex2d <- select(hflights, Year:Month, DayOfWeek:ArrTime)
ex2d <- select(hflights, Year:ArrTime, -DayofMonth)
# Finish select call so that ex3d matches ex3r
ex3r <- hflights[c("TailNum", "TaxiIn", "TaxiOut")]
ex3d <- select(hflights, starts_with("T"))
mutate(), used to add columns from existing data,
# Create a new data frame, g1,
# which is the data frame hflights
# with an additional column:
# ActualGroundTime, the difference between
# ActualElapsedTime and AirTime.
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
# Extend g1 further,
# by adding an additional column GroundTime.
# sum of the TaxiIn and TaxiOut columns
# Store the resulting data frame in g2.
# Add the new variable GroundTime to g1. Save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)
# Check in the console that the GroundTime and ActualGroundTime columns are equal.
sum(g2$GroundTime, na.rm = TRUE)
## [1] 4755621
sum(g2$ActualGroundTime, na.rm = TRUE)
## [1] 4741967
sum(na.omit(g2$GroundTime))
## [1] 4755621
sum(na.omit(g2$ActualGroundTime))
## [1] 4741967
sum(as.numeric(na.omit(g2$GroundTime - g2$ActualGroundTime)))
## [1] 0
# Add the new variable AverageSpeed to g2. Save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)
# Print out g3
g3
## # A tibble: 227,496 x 26
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 19 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>, ActualGroundTime <int>,
## # GroundTime <int>, AverageSpeed <dbl>
mutate(my_df, x = a + b, y = x + c)
# Add a second variable loss_ratio to the dataset: m1
m1 <- mutate(hflights, loss = ArrDelay - DepDelay)
m1 <- mutate(hflights, loss = ArrDelay - DepDelay, loss_ratio = loss / DepDelay)
m2 <- mutate(hflights, TotalTaxi = TaxiIn + TaxiOut, ActualGroundTime = ActualElapsedTime - AirTime, Diff = TotalTaxi - ActualGroundTime)
# Diff column should be zero for all observations!
head(as.numeric(na.omit(m2$Diff)),20)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
head(m2$Diff[!is.na(m2$Diff)],20)
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
filter(), that is able to return a subset of the rows,
e.g. keep observations for which a is positive filter(df, a > 0)
# All flights that traveled 3000 miles or more
filter(hflights, Distance >= 3000)
## # A tibble: 527 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 31 1 924 1413 CO
## 2 2011 1 30 7 925 1410 CO
## 3 2011 1 29 6 1045 1445 CO
## 4 2011 1 28 5 1516 1916 CO
## 5 2011 1 27 4 950 1344 CO
## 6 2011 1 26 3 944 1350 CO
## 7 2011 1 25 2 924 1337 CO
## 8 2011 1 24 1 1144 1605 CO
## 9 2011 1 23 7 926 1335 CO
## 10 2011 1 22 6 942 1340 CO
## # ... with 517 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c("JetBlue", "Southwest", "Delta"))
## # A tibble: 0 x 23
## # ... with 23 variables: Year <int>, Month <int>, DayofMonth <int>,
## # DayOfWeek <int>, DepTime <int>, ArrTime <int>, UniqueCarrier <chr>,
## # FlightNum <int>, TailNum <chr>, ActualElapsedTime <int>,
## # AirTime <int>, ArrDelay <int>, DepDelay <int>, Origin <chr>,
## # Dest <chr>, Distance <int>, TaxiIn <int>, TaxiOut <int>,
## # Cancelled <int>, CancellationCode <chr>, Diverted <int>,
## # Carrier <chr>, Code <chr>
# All flights where taxiing took longer than flying
# actual flight
# Avoid the use of mutate()
# do the math directly in the logical expression of filter().
filter(hflights, TaxiIn + TaxiOut > AirTime)
## # A tibble: 1,389 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 24 1 731 904 AA
## 2 2011 1 30 7 1959 2132 AA
## 3 2011 1 24 1 1621 1749 AA
## 4 2011 1 10 1 941 1113 AA
## 5 2011 1 31 1 1301 1356 CO
## 6 2011 1 31 1 2113 2215 CO
## 7 2011 1 31 1 1434 1539 CO
## 8 2011 1 31 1 900 1006 CO
## 9 2011 1 30 7 1304 1408 CO
## 10 2011 1 30 7 2004 2128 CO
## # ... with 1,379 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
Instead of using the & operator, you can also pass several logical tests to filter(), separated by commas.
The following two calls are completely equivalent:
filter(df, a > 0 & b > 0)
filter(df, a > 0, b > 0)
To keep the observations in df for which the variable x is not NA:
filter(df, !is.na(x))
# Use R's logical and boolean operators
# to select just the rows
# where a flight
# left before 5:00 am (500)
# or arrived after 10:00 pm (2200)
# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime < 500 | ArrTime > 2200)
## # A tibble: 27,799 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 4 2 2100 2207 AA
## 2 2011 1 14 5 2119 2229 AA
## 3 2011 1 10 1 1934 2235 AA
## 4 2011 1 26 3 1905 2211 AA
## 5 2011 1 30 7 1856 2209 AA
## 6 2011 1 9 7 1938 2228 AS
## 7 2011 1 31 1 1919 2231 CO
## 8 2011 1 31 1 2116 2344 CO
## 9 2011 1 31 1 1850 2211 CO
## 10 2011 1 31 1 2102 2216 CO
## # ... with 27,789 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay > 0, ArrDelay < 0)
## # A tibble: 27,712 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 2 7 1401 1501 AA
## 2 2011 1 5 3 1405 1507 AA
## 3 2011 1 18 2 1408 1508 AA
## 4 2011 1 18 2 721 827 AA
## 5 2011 1 12 3 2015 2113 AA
## 6 2011 1 13 4 2020 2116 AA
## 7 2011 1 26 3 2009 2103 AA
## 8 2011 1 1 6 1631 1736 AA
## 9 2011 1 10 1 1639 1740 AA
## 10 2011 1 12 3 1631 1739 AA
## # ... with 27,702 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# All flights that were cancelled after being delayed
filter(hflights, DepDelay > 0, Cancelled == 1)
## # A tibble: 40 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 26 3 1926 NA CO
## 2 2011 1 11 2 1100 NA US
## 3 2011 1 19 3 1811 NA XE
## 4 2011 1 7 5 2028 NA XE
## 5 2011 2 4 5 1638 NA AA
## 6 2011 2 8 2 1057 NA CO
## 7 2011 2 2 3 802 NA XE
## 8 2011 2 9 3 904 NA XE
## 9 2011 2 1 2 1508 NA OO
## 10 2011 3 31 4 1016 NA CO
## # ... with 30 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# Select the flights that had JFK as their destination: c1
# flights that had JFK as their destination
hflights$Dest[hflights$Dest == "JFK"]
## [1] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [12] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [23] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [34] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [45] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [56] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [67] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [78] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [89] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [100] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [111] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [122] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [133] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [144] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [155] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [166] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [177] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [188] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [199] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [210] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [221] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [232] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [243] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [254] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [265] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [276] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [287] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [298] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [309] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [320] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [331] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [342] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [353] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [364] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [375] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [386] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [397] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [408] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [419] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [430] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [441] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [452] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [463] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [474] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [485] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [496] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [507] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [518] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [529] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [540] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [551] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [562] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [573] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [584] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [595] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [606] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [617] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [628] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [639] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [650] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [661] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [672] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [683] "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK" "JFK"
## [694] "JFK" "JFK"
length(hflights$Dest[hflights$Dest == "JFK"])
## [1] 695
filter(hflights, Dest == "JFK")
## # A tibble: 695 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 654 1124 B6
## 2 2011 1 1 6 1639 2110 B6
## 3 2011 1 2 7 703 1113 B6
## 4 2011 1 2 7 1604 2040 B6
## 5 2011 1 3 1 659 1100 B6
## 6 2011 1 3 1 1801 2200 B6
## 7 2011 1 4 2 654 1103 B6
## 8 2011 1 4 2 1608 2034 B6
## 9 2011 1 5 3 700 1103 B6
## 10 2011 1 5 3 1544 1954 B6
## # ... with 685 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
c1 <- filter(hflights, Dest == "JFK")
# Combine the Year, Month and DayofMonth variables to create a Date column: c2
c2 <- mutate(c1, Date = paste(Year, "-", Month, "-", DayofMonth))
head(c2$Date) # INCORRECT
## [1] "2011 - 1 - 1" "2011 - 1 - 1" "2011 - 1 - 2" "2011 - 1 - 2"
## [5] "2011 - 1 - 3" "2011 - 1 - 3"
# using the sep attribute of paste()
c2 <- mutate(c1, Date = paste(Year, Month, DayofMonth, sep = "-"))
head(c2$Date) # CORRECT
## [1] "2011-1-1" "2011-1-1" "2011-1-2" "2011-1-2" "2011-1-3" "2011-1-3"
# Finally, select some columns to provide an overview:
# Date, DepTime, ArrTime and TailNum, in this order.
# Do not assign the resulting data frame to a variable;
# just print it to the console.
# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)
## # A tibble: 695 x 4
## Date DepTime ArrTime TailNum
## <chr> <int> <int> <chr>
## 1 2011-1-1 654 1124 N324JB
## 2 2011-1-1 1639 2110 N324JB
## 3 2011-1-2 703 1113 N324JB
## 4 2011-1-2 1604 2040 N324JB
## 5 2011-1-3 659 1100 N229JB
## 6 2011-1-3 1801 2200 N206JB
## 7 2011-1-4 654 1103 N267JB
## 8 2011-1-4 1608 2034 N267JB
## 9 2011-1-5 700 1103 N708JB
## 10 2011-1-5 1544 1954 N644JB
## # ... with 685 more rows
# How many weekend flights flew a distance of more than 1000 miles but had a total taxiing time below 15 minutes?
names(hflights)
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
## [22] "Carrier" "Code"
head(hflights$DayOfWeek)
## [1] 6 7 1 2 3 4
filter(hflights, DayOfWeek > 5, Distance > 1000, TaxiIn + TaxiOut < 15)
## # A tibble: 1,739 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 23 7 1535 1933 B6
## 2 2011 1 30 7 851 1230 CO
## 3 2011 1 30 7 2234 2 CO
## 4 2011 1 29 6 1220 1353 CO
## 5 2011 1 23 7 847 1213 CO
## 6 2011 1 23 7 1224 1345 CO
## 7 2011 1 23 7 931 1045 CO
## 8 2011 1 22 6 942 1340 CO
## 9 2011 1 16 7 848 1136 CO
## 10 2011 1 16 7 1030 1406 CO
## # ... with 1,729 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
arrange(), that reorders the rows according to single or multiple variables,
# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
# Arrange dtc by departure delays
# i.e. shortest departure delay is at the top of the data set.
arrange(dtc, DepDelay)
## # A tibble: 68 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 7 23 6 605 NA F9
## 2 2011 1 17 1 916 NA XE
## 3 2011 12 1 4 541 NA US
## 4 2011 10 12 3 2022 NA MQ
## 5 2011 7 29 5 1424 NA CO
## 6 2011 9 29 4 1639 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 5 9 1 715 NA OO
## 9 2011 1 20 4 1413 NA UA
## 10 2011 1 17 1 831 NA WN
## # ... with 58 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# verify shortest departure delay is at the top
dtc$DepDelay # informs of order in "dtc"
## [1] 26 0 -4 135 -4 6 -9 73 8 187 2 4 28 -3 -5 156 -3
## [18] 42 548 3 131 109 5 1 87 -2 -3 -3 -5 -1 -3 110 -1 -3
## [35] 64 -6 24 271 -10 -3 110 153 -1 173 -1 37 3 0 83 -2 220
## [52] 27 64 -6 -2 1 -8 75 21 129 80 -3 64 21 103 27 0 -9
select(arrange(dtc, DepDelay),DepDelay)
## # A tibble: 68 x 1
## DepDelay
## <int>
## 1 -10
## 2 -9
## 3 -9
## 4 -8
## 5 -6
## 6 -6
## 7 -5
## 8 -5
## 9 -4
## 10 -4
## # ... with 58 more rows
# Arrange dtc so that
# flights that were cancelled
# for the same reason
# appear next to each other.
# Arrange dtc so that cancellation reasons are grouped
names(dtc)
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
## [22] "Carrier" "Code"
arrange(dtc, CancellationCode)
## # A tibble: 68 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 20 4 1413 NA UA
## 2 2011 1 7 5 2028 NA XE
## 3 2011 2 4 5 1638 NA AA
## 4 2011 2 8 2 1057 NA CO
## 5 2011 2 1 2 1508 NA OO
## 6 2011 2 21 1 2257 NA OO
## 7 2011 2 9 3 555 NA MQ
## 8 2011 3 18 5 727 NA UA
## 9 2011 4 4 1 1632 NA DL
## 10 2011 4 8 5 1608 NA WN
## # ... with 58 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# verify
select(arrange(dtc, CancellationCode),CancellationCode)
## # A tibble: 68 x 1
## CancellationCode
## <chr>
## 1 A
## 2 A
## 3 A
## 4 A
## 5 A
## 6 A
## 7 A
## 8 A
## 9 A
## 10 A
## # ... with 58 more rows
# Arrange dtc so that flights
# by the same carrier appear next to each other
# Within each carrier,
# flights that have smaller departure delays
# appear before flights that have higher departure delays
# Do this in a one-liner.
# Arrange dtc according to carrier and departure delays
names(dtc)
## [1] "Year" "Month" "DayofMonth"
## [4] "DayOfWeek" "DepTime" "ArrTime"
## [7] "UniqueCarrier" "FlightNum" "TailNum"
## [10] "ActualElapsedTime" "AirTime" "ArrDelay"
## [13] "DepDelay" "Origin" "Dest"
## [16] "Distance" "TaxiIn" "TaxiOut"
## [19] "Cancelled" "CancellationCode" "Diverted"
## [22] "Carrier" "Code"
arrange(dtc, UniqueCarrier, DepDelay)
## # A tibble: 68 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 8 18 4 1808 NA AA
## 2 2011 2 4 5 1638 NA AA
## 3 2011 7 29 5 1424 NA CO
## 4 2011 1 26 3 1703 NA CO
## 5 2011 8 11 4 1320 NA CO
## 6 2011 7 25 1 1654 NA CO
## 7 2011 1 26 3 1926 NA CO
## 8 2011 3 31 4 1016 NA CO
## 9 2011 2 8 2 1057 NA CO
## 10 2011 4 4 1 1632 NA DL
## # ... with 58 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
var1
) - rows from smallest to largestvar1
)) - rows from largest to smallest
# Arrange hflights so that
# flights by the same carrier
# appear next to each other
# and within each carrier,
# flights that have larger departure delays
# appear before flights that have
# smaller departure delays.
# before
# arrange(dtc, UniqueCarrier, DepDelay)
# now reverse
arrange(hflights, UniqueCarrier, desc(DepDelay))
## # A tibble: 227,496 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 12 12 1 650 808 AA
## 2 2011 11 19 6 1752 1910 AA
## 3 2011 12 22 4 1728 1848 AA
## 4 2011 10 23 7 2305 2 AA
## 5 2011 9 27 2 1206 1300 AA
## 6 2011 3 17 4 1647 1747 AA
## 7 2011 6 21 2 955 1315 AA
## 8 2011 5 20 5 2359 130 AA
## 9 2011 4 19 2 2023 2142 AA
## 10 2011 5 12 4 2133 53 AA
## # ... with 227,486 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
# Arrange flights by total delay (normal order).
arrange(hflights, DepDelay + ArrDelay)
## # A tibble: 227,496 x 23
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 7 3 7 1914 2039 XE
## 2 2011 8 31 3 934 1039 OO
## 3 2011 8 21 7 935 1039 OO
## 4 2011 8 28 7 2059 2206 OO
## 5 2011 8 29 1 935 1041 OO
## 6 2011 12 25 7 741 926 OO
## 7 2011 1 30 7 620 812 OO
## 8 2011 8 3 3 1741 1810 XE
## 9 2011 8 4 4 930 1041 OO
## 10 2011 8 18 4 939 1043 OO
## # ... with 227,486 more rows, and 16 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Carrier <chr>, Code <chr>
summarise(), which reduces each group to a single row by calculating aggregate measures.
# Print out a summary with variables min_dist and max_dist
summarise(hflights, min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 x 2
## min_dist max_dist
## <dbl> <dbl>
## 1 79 3904
# Print out a summary of hflights
# with a single variable,
# max_div:
# the longest Distance for diverted flights.
# You will need one of the four other verbs to do this!
# i.e. Print out a summary with variable max_div
unique(hflights$Diverted)
## [1] 0 1
summarise(filter(hflights, Diverted == 1), max_div = max(Distance))
## # A tibble: 1 x 1
## max_div
## <dbl>
## 1 3904
# Remove rows that have NAs in the arrival delay column
# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
# 1. earliest: the minimum arrival delay,
# 2. average: the average arrival delay,
# 3. latest: the longest arrival delay,
# 4. sd: the standard deviation for arrival delays.
# test:
summarise(temp1, earliest = min(temp1$ArrDelay))
## # A tibble: 1 x 1
## earliest
## <int>
## 1 -70
# alternatively:
summarise(temp1, earliest = min(ArrDelay))
## # A tibble: 1 x 1
## earliest
## <dbl>
## 1 -70
# all four
summarise(temp1, earliest = min(ArrDelay),
average = mean(ArrDelay),
latest = max(ArrDelay),
sd = sd(ArrDelay)
)
## # A tibble: 1 x 4
## earliest average latest sd
## <dbl> <dbl> <dbl> <dbl>
## 1 -70 7.09 978 30.7
# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn), !is.na(TaxiOut))
# Print the maximum taxiing difference of temp2 with summarise()
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## # A tibble: 1 x 1
## max_taxi_diff
## <int>
## 1 160
# Print out a summary of hflights with the following variables:
# n_obs: the total number of observations,
# n_carrier: the total number of carriers,
# n_dest: the total number of destinations,
# Generate summarizing statistics for hflights
summarise(hflights,
n_obs = n(),
n_carrier = n_distinct(UniqueCarrier),
n_dest = n_distinct(Dest))
## # A tibble: 1 x 3
## n_obs n_carrier n_dest
## <int> <int> <int>
## 1 227496 15 116
# All American Airline flights
aa <- filter(hflights, UniqueCarrier == "American")
# Print out a summary of aa with the following variables:
# n_flights: the total number of flights (each observation is a flight),
# n_canc: the total number of cancelled flights,
# avg_delay: the average arrival delay of flights whose delay is not NA (na.rm = TRUE).
# Generate summarizing statistics for aa
summarise(aa,
n_flights = n(),
n_canc = sum(Cancelled == 1),
avg_delay = mean(ArrDelay, na.rm = TRUE))
## # A tibble: 1 x 3
## n_flights n_canc avg_delay
## <int> <int> <dbl>
## 1 0 0 NaN
saving intermediate results to temporary variables or nesting function calls is cumbersome and error-prone.
The %>%
operator allows you to extract the first argument of a function from the arguments list and put it in front of it
# the following two commands are completely equivalent:
mean(c(1, 2, 3, NA), na.rm = TRUE)
## [1] 2
c(1, 2, 3, NA) %>% mean(na.rm = TRUE)
## [1] 2
Syntax:
# Write the 'piped' version of the English sentences.
# Take the hflights data set and then ...
hflights %>%
# Add a variable named diff that is the result of subtracting TaxiIn from TaxiOut, and then ...
mutate( diff = TaxiOut - TaxiIn ) %>%
# Pick all of the rows whose diff value does not equal NA, and then ...
# filter( diff != NA ) %>% # INCORRECT
filter( !is.na(diff) ) %>% # CORRECT
# Summarise the data set with a value named avg that is the mean diff value
summarise(avg = mean(diff))
## # A tibble: 1 x 1
## avg
## <dbl>
## 1 8.99
hflights %>%
# mutate() the hflights dataset and add two variables:
# RealTime:
# the actual elapsed time plus 100 minutes (for the overhead that flying involves) and
# mph:
# calculated as Distance / RealTime * 60, then
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
# filter() to keep observations that have an mph that is not NA and that is below 70, finally
filter( !is.na(mph), mph < 70) %>%
# summarise() the result by creating four summary variables:
# n_less, the number of observations,
# n_dest, the number of destinations,
# min_dist, the minimum distance and
# max_dist, the maximum distance.
summarise(
n_less = n(),
n_dest = n_distinct(Dest),
min_dist = min(Distance),
max_dist = max(Distance)
)
## # A tibble: 1 x 4
## n_less n_dest min_dist max_dist
## <int> <int> <dbl> <dbl>
## 1 6726 13 79 305
from previous exercise, some flights might be less efficient than driving in terms of speed
But is speed all that matters?
Flying imposes burdens on a traveler that driving does not:
airplane tickets are very expensive Air travelers also need to limit what they bring on their trip arrange for a pick up or a drop off
DEFINE:
preferable flights = flights that are at least 50% faster than driving,
i.e. that travel 105 mph or greater in real time
ASSUME:
cancelled or diverted flights are less preferable than driving.
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
# filter()
# keep observations that have an mph under 105
# or for which Cancelled equals 1
# or for which Diverted equals 1.
filter( mph < 105 | Cancelled == 1 | Diverted == 1) %>%
# summarise() the result by creating four summary variables:
# n_non, the number of observations,
# n_dest, the number of destinations,
# min_dist, the minimum distance and
# max_dist, the maximum distance.
summarise(
n_non = n(),
n_dest = n_distinct(Dest),
min_dist = min(Distance),
max_dist = max(Distance)
)
## # A tibble: 1 x 4
## n_non n_dest min_dist max_dist
## <int> <int> <dbl> <dbl>
## 1 42400 113 79 3904
The results show that almost 19% of flights appear less desirable than simply driving to the destination, which is rather surprising!
hflights %>%
filter(!is.na(DepTime), !is.na(ArrTime), DepTime> ArrTime) %>%
summarise(num = n()) # 265 flights
## # A tibble: 1 x 1
## num
## <int>
## 1 2718
# Make an ordered per-carrier summary of hflights
hflights %>%
group_by(UniqueCarrier) %>%
summarise(p_canc = mean(Cancelled == 1) * 100,
# percentage of cancelled flights
avg_delay = mean(ArrDelay, na.rm = TRUE)) %>%
# average arrival delay of flights
# whose delay does not equal NA
arrange(avg_delay, p_canc)
## # A tibble: 15 x 3
## UniqueCarrier p_canc avg_delay
## <chr> <dbl> <dbl>
## 1 US 1.13 -0.631
## 2 AA 1.85 0.892
## 3 FL 0.982 1.85
## 4 AS 0 3.19
## 5 YV 1.27 4.01
## 6 DL 1.59 6.08
## 7 CO 0.678 6.10
## 8 MQ 2.90 7.15
## 9 EV 3.45 7.26
## 10 WN 1.55 7.59
## 11 F9 0.716 7.67
## 12 XE 1.55 8.19
## 13 OO 1.39 8.69
## 14 B6 2.59 9.86
## 15 UA 1.64 10.5
# SkyWest had the longest average arrival delay
# Here, we didn't use mutate()
# yet: when you combine it with group_by() you can do great things!
# Ordered overview of average arrival delays per carrier
hflights %>%
filter( !is.na(ArrDelay), ArrDelay > 0 ) %>%
group_by(UniqueCarrier) %>%
summarise(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
## # A tibble: 15 x 3
## UniqueCarrier avg rank
## <chr> <dbl> <dbl>
## 1 YV 18.7 1
## 2 F9 18.7 2
## 3 US 20.7 3
## 4 CO 22.1 4
## 5 AS 22.9 5
## 6 OO 24.1 6
## 7 XE 24.2 7
## 8 WN 25.3 8
## 9 FL 27.9 9
## 10 AA 28.5 10
## 11 DL 32.1 11
## 12 UA 32.5 12
## 13 MQ 38.8 13
## 14 EV 40.2 14
## 15 B6 45.5 15
# How many airplanes only flew to one destination?
hflights %>%
group_by(TailNum) %>%
summarise(ndest = n_distinct(Dest)) %>%
filter(ndest == 1) %>%
summarise(nplanes = n())
## # A tibble: 1 x 1
## nplanes
## <int>
## 1 1526
# Find the most visited destination for each carrier
hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarise(n = n()
# how often a carrier visited a particular destination
) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1)
## # A tibble: 15 x 4
## # Groups: UniqueCarrier [15]
## UniqueCarrier Dest n rank
## <chr> <chr> <int> <dbl>
## 1 AA DFW 2105 1
## 2 AS SEA 365 1
## 3 B6 JFK 695 1
## 4 CO EWR 3924 1
## 5 DL ATL 2396 1
## 6 EV DTW 851 1
## 7 F9 DEN 837 1
## 8 FL ATL 2029 1
## 9 MQ DFW 2424 1
## 10 OO COS 1335 1
## 11 UA SFO 643 1
## 12 US CLT 2212 1
## 13 WN DAL 8243 1
## 14 XE CRP 3175 1
## 15 YV CLT 71 1
# from section one
# hflights <- tbl_df(hflights)
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
hflights2 <- as.data.table(hflights)
class(hflights2)
## [1] "data.table" "data.frame"
# Even though hflights2 is a different data structure,
# you can use the same dplyr functions
# to manipulate hflights2
# as you used to manipulate hflights.
# total number of unique carriers
hflights2 %>%
summarise(n_carrier = n_distinct(UniqueCarrier))
## n_carrier
## 1 15
# OR
summarise(hflights2, n_carrier = n_distinct(UniqueCarrier))
## n_carrier
## 1 15
library(RMySQL)
## Loading required package: DBI
library(dbplyr)
##
## Attaching package: 'dbplyr'
## The following objects are masked from 'package:dplyr':
##
## ident, sql
# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# data about flights that departed from New York City in 2013
# data is similar to the data in hflights
# but it does not contain information about cancellations or diversions
# Reference a MySQL table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")
# Behind the scenes, dplyr will convert the commands to the database's native language (in this case, SQL), and return the results
# glimpse at nycflights
glimpse(nycflights)
## Observations: ??
## Variables: 17
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay <int> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay <int> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time <int> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance <int> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute <int> 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...
# Ordered, grouped summary of nycflights
nycflights %>%
group_by(carrier) %>%
summarise(n_flights = n(),
avg_delay = mean(arr_delay)
) %>%
arrange(avg_delay)
## Warning: Missing values are always removed in SQL.
## Use `AVG(x, na.rm = TRUE)` to silence this warning
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
## # Source: lazy query [?? x 3]
## # Database: mysql 5.6.34-log
## # [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## # Ordered by: avg_delay
## carrier n_flights avg_delay
## <chr> <dbl> <dbl>
## 1 AS 714 -9.86
## 2 HA 342 -6.92
## 3 AA 32729 0.356
## 4 DL 48110 1.63
## 5 VX 5162 1.75
## 6 US 20536 2.06
## 7 UA 58665 3.50
## 8 9E 18460 6.91
## 9 B6 54635 9.36
## 10 WN 12275 9.47
## # ... with more rows