mutate
, select
, filter
, summarise
, arrange
, and the joins# Load the dplyr package
library(dplyr)
# Load the hflights package
library(hflights)
# Call both head() and summary() on hflights
head(hflights)
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## 5424 2011 1 1 6 1400 1500 AA
## 5425 2011 1 2 7 1401 1501 AA
## 5426 2011 1 3 1 1352 1502 AA
## 5427 2011 1 4 2 1403 1513 AA
## 5428 2011 1 5 3 1405 1507 AA
## 5429 2011 1 6 4 1359 1503 AA
## FlightNum TailNum ActualElapsedTime AirTime ArrDelay DepDelay Origin
## 5424 428 N576AA 60 40 -10 0 IAH
## 5425 428 N557AA 60 45 -9 1 IAH
## 5426 428 N541AA 70 48 -8 -8 IAH
## 5427 428 N403AA 70 39 3 3 IAH
## 5428 428 N492AA 62 44 -3 5 IAH
## 5429 428 N262AA 64 45 -7 -1 IAH
## Dest Distance TaxiIn TaxiOut Cancelled CancellationCode Diverted
## 5424 DFW 224 7 13 0 0
## 5425 DFW 224 6 9 0 0
## 5426 DFW 224 5 17 0 0
## 5427 DFW 224 9 22 0 0
## 5428 DFW 224 9 9 0 0
## 5429 DFW 224 6 13 0 0
summary(hflights)
## Year Month DayofMonth DayOfWeek
## Min. :2011 Min. : 1.000 Min. : 1.00 Min. :1.000
## 1st Qu.:2011 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.:2.000
## Median :2011 Median : 7.000 Median :16.00 Median :4.000
## Mean :2011 Mean : 6.514 Mean :15.74 Mean :3.948
## 3rd Qu.:2011 3rd Qu.: 9.000 3rd Qu.:23.00 3rd Qu.:6.000
## Max. :2011 Max. :12.000 Max. :31.00 Max. :7.000
##
## DepTime ArrTime UniqueCarrier FlightNum
## Min. : 1 Min. : 1 Length:227496 Min. : 1
## 1st Qu.:1021 1st Qu.:1215 Class :character 1st Qu.: 855
## Median :1416 Median :1617 Mode :character Median :1696
## Mean :1396 Mean :1578 Mean :1962
## 3rd Qu.:1801 3rd Qu.:1953 3rd Qu.:2755
## Max. :2400 Max. :2400 Max. :7290
## NA's :2905 NA's :3066
## TailNum ActualElapsedTime AirTime ArrDelay
## Length:227496 Min. : 34.0 Min. : 11.0 Min. :-70.000
## Class :character 1st Qu.: 77.0 1st Qu.: 58.0 1st Qu.: -8.000
## Mode :character Median :128.0 Median :107.0 Median : 0.000
## Mean :129.3 Mean :108.1 Mean : 7.094
## 3rd Qu.:165.0 3rd Qu.:141.0 3rd Qu.: 11.000
## Max. :575.0 Max. :549.0 Max. :978.000
## NA's :3622 NA's :3622 NA's :3622
## DepDelay Origin Dest Distance
## Min. :-33.000 Length:227496 Length:227496 Min. : 79.0
## 1st Qu.: -3.000 Class :character Class :character 1st Qu.: 376.0
## Median : 0.000 Mode :character Mode :character Median : 809.0
## Mean : 9.445 Mean : 787.8
## 3rd Qu.: 9.000 3rd Qu.:1042.0
## Max. :981.000 Max. :3904.0
## NA's :2905
## TaxiIn TaxiOut Cancelled CancellationCode
## Min. : 1.000 Min. : 1.00 Min. :0.00000 Length:227496
## 1st Qu.: 4.000 1st Qu.: 10.00 1st Qu.:0.00000 Class :character
## Median : 5.000 Median : 14.00 Median :0.00000 Mode :character
## Mean : 6.099 Mean : 15.09 Mean :0.01307
## 3rd Qu.: 7.000 3rd Qu.: 18.00 3rd Qu.:0.00000
## Max. :165.000 Max. :163.00 Max. :1.00000
## NA's :3066 NA's :2947
## Diverted
## Min. :0.000000
## 1st Qu.:0.000000
## Median :0.000000
## Mean :0.002853
## 3rd Qu.:0.000000
## Max. :1.000000
##
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
str
and head
# Convert the hflights data.frame into a hflights tbl
hflights <- tbl_df(hflights)
# Display the hflights tbl
hflights
## # A tibble: 227,496 x 21
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## * <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## # ... with 227,486 more rows, and 14 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
glimpse(hflights)
## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AA", "A...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# Create the object carriers, containing only the UniqueCarrier variable of hflights
carriers <- hflights$UniqueCarrier
str(carriers)
## chr [1:227496] "AA" "AA" "AA" "AA" "AA" "AA" "AA" "AA" "AA" "AA" "AA" ...
# Both the dplyr and hflights packages are loaded into workspace
lut <- c("AA" = "American", "AS" = "Alaska", "B6" = "JetBlue", "CO" = "Continental",
"DL" = "Delta", "OO" = "SkyWest", "UA" = "United", "US" = "US_Airways",
"WN" = "Southwest", "EV" = "Atlantic_Southeast", "F9" = "Frontier",
"FL" = "AirTran", "MQ" = "American_Eagle", "XE" = "ExpressJet", "YV" = "Mesa")
# Use lut to translate the UniqueCarrier column of hflights
hflights$UniqueCarrier <- lut[hflights$UniqueCarrier]
# Inspect the resulting raw values of your variables
glimpse(hflights)
## Observations: 227,496
## Variables: 21
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "", "", "", "", "", "", "", "", "", "", "", ...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## The "E" in my dataset is "" for some reason so I have changed it here
table(hflights$CancellationCode)
##
## A B C D
## 224523 1202 1652 118 1
hflights <- hflights %>%
mutate(
CancellationCode = ifelse(CancellationCode == "", "E", CancellationCode)
)
table(hflights$CancellationCode)
##
## A B C D E
## 1202 1652 118 1 224523
# Build the lookup table: lut
lut <- c("A" = "carrier",
"B" = "weather",
"C" = "FFA",
"D" = "security",
"E" = "not cancelled")
# Use the lookup table to create a vector of code labels. Assign the vector to the CancellationCode column of hflights
hflights$Code <- lut[hflights$CancellationCode]
# Inspect the resulting raw values of your variables
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
select()
which returns a subset of the columns,filter()
that is able to return a subset of the rows,arrange()
that reorders the rows according to single or multiple variables,mutate()
used to add columns from existing data,summarise()
which reduces each group to a single row by calculating aggregate measures.select
and mutate
manipulate variablesfilter
and arrange
manipulate observationssummarize
manipulates groups
of observations# Print out a tbl with the four columns of hflights related to delay
select(hflights, ActualElapsedTime, AirTime, ArrDelay, DepDelay)
## # A tibble: 227,496 x 4
## ActualElapsedTime AirTime ArrDelay DepDelay
## <int> <int> <int> <int>
## 1 60 40 -10 0
## 2 60 45 -9 1
## 3 70 48 -8 -8
## 4 70 39 3 3
## 5 62 44 -3 5
## 6 64 45 -7 -1
## 7 70 43 -1 -1
## 8 59 40 -16 -5
## 9 71 41 44 43
## 10 70 45 43 43
## # ... with 227,486 more rows
# Print out hflights, nothing has changed!
hflights
## # A tibble: 227,496 x 22
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## <int> <int> <int> <int> <int> <int> <chr>
## 1 2011 1 1 6 1400 1500 American
## 2 2011 1 2 7 1401 1501 American
## 3 2011 1 3 1 1352 1502 American
## 4 2011 1 4 2 1403 1513 American
## 5 2011 1 5 3 1405 1507 American
## 6 2011 1 6 4 1359 1503 American
## 7 2011 1 7 5 1359 1509 American
## 8 2011 1 8 6 1355 1454 American
## 9 2011 1 9 7 1443 1554 American
## 10 2011 1 10 1 1443 1553 American
## # ... with 227,486 more rows, and 15 more variables: FlightNum <int>,
## # TailNum <chr>, ActualElapsedTime <int>, AirTime <int>, ArrDelay <int>,
## # DepDelay <int>, Origin <chr>, Dest <chr>, Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>, Code <chr>
# Print out the columns Origin up to Cancelled of hflights
select(hflights, Origin:Cancelled)
## # A tibble: 227,496 x 6
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## <chr> <chr> <int> <int> <int> <int>
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## # ... with 227,486 more rows
# Answer to last question: be concise!
select(hflights, 1:4, 12:21)
## # A tibble: 227,496 x 14
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## <int> <int> <int> <int> <int> <int> <chr> <chr>
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## # ... with 227,486 more rows, and 6 more variables: Distance <int>,
## # TaxiIn <int>, TaxiOut <int>, Cancelled <int>, CancellationCode <chr>,
## # Diverted <int>
# Print out a tbl containing just ArrDelay and DepDelay
select(hflights, ends_with(c('Delay')))
## # A tibble: 227,496 x 2
## ArrDelay DepDelay
## <int> <int>
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## # ... with 227,486 more rows
# Print out a tbl as described in the second instruction, using both helper functions and variable names
select(hflights, UniqueCarrier, ends_with(c('Num')), starts_with(c('Cancel')))
## # A tibble: 227,496 x 5
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## <chr> <int> <chr> <int> <chr>
## 1 American 428 N576AA 0 E
## 2 American 428 N557AA 0 E
## 3 American 428 N541AA 0 E
## 4 American 428 N403AA 0 E
## 5 American 428 N492AA 0 E
## 6 American 428 N262AA 0 E
## 7 American 428 N493AA 0 E
## 8 American 428 N477AA 0 E
## 9 American 428 N476AA 0 E
## 10 American 428 N504AA 0 E
## # ... with 227,486 more rows
# Print out a tbl as described in the third instruction, using only helper functions.
select(hflights, contains(c('Time')), contains(c('Delay')))
## # A tibble: 227,496 x 6
## DepTime ArrTime ActualElapsedTime AirTime ArrDelay DepDelay
## <int> <int> <int> <int> <int> <int>
## 1 1400 1500 60 40 -10 0
## 2 1401 1501 60 45 -9 1
## 3 1352 1502 70 48 -8 -8
## 4 1403 1513 70 39 3 3
## 5 1405 1507 62 44 -3 5
## 6 1359 1503 64 45 -7 -1
## 7 1359 1509 70 43 -1 -1
## 8 1355 1454 59 40 -16 -5
## 9 1443 1554 71 41 44 43
## 10 1443 1553 70 45 43 43
## # ... with 227,486 more rows
ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")]
ex1d <- select(hflights, contains(c('Taxi')), Distance)
ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")]
ex2d <- select(hflights, Year:ArrTime, -3)
ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")]
ex3d <- select(hflights, TailNum, contains(c('Taxi')))
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
# Add the new variable GroundTime to a g1. Save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)
# Add the new variable AverageSpeed to g2. Save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)
# Print out g3
glimpse(g3)
## Observations: 227,496
## Variables: 25
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
## $ ActualGroundTime <int> 20, 15, 22, 31, 18, 19, 27, 19, 30, 25, 28, ...
## $ GroundTime <int> 20, 15, 22, 31, 18, 19, 27, 19, 30, 25, 28, ...
## $ AverageSpeed <dbl> 336.0000, 298.6667, 280.0000, 344.6154, 305....
# hflights and dplyr are ready, are you?
# Add a second variable loss_percent to the dataset: m1
m1 <- mutate(hflights,
loss = ArrDelay - DepDelay,
loss_percent = (ArrDelay - DepDelay)/DepDelay * 100
)
glimpse(m1)
## Observations: 227,496
## Variables: 24
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
## $ loss <int> -10, -10, 0, 0, -8, -6, 0, -11, 1, 0, 0, -14...
## $ loss_percent <dbl> -Inf, -1000.000000, 0.000000, 0.000000, -160...
# Copy and adapt the previous command to reduce redendancy: m2
m2 <- mutate(hflights,
loss = ArrDelay - DepDelay,
loss_percent = loss/DepDelay * 100
)
glimpse(m2)
## Observations: 227,496
## Variables: 24
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
## $ loss <int> -10, -10, 0, 0, -8, -6, 0, -11, 1, 0, 0, -14...
## $ loss_percent <dbl> -Inf, -1000.000000, 0.000000, 0.000000, -160...
# Add the three variables as described in the third instruction: m3
m3 <- mutate(hflights,
TotalTaxi = TaxiIn + TaxiOut,
ActualGroundTime = ActualElapsedTime - AirTime,
Diff = TotalTaxi - ActualGroundTime
)
glimpse(m3)
## Observations: 227,496
## Variables: 25
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
## $ TotalTaxi <int> 20, 15, 22, 31, 18, 19, 27, 19, 30, 25, 28, ...
## $ ActualGroundTime <int> 20, 15, 22, 31, 18, 19, 27, 19, 30, 25, 28, ...
## $ Diff <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
# All flights that traveled 3000 miles or more
filter(hflights, Distance >= 3000) %>% glimpse()
## Observations: 527
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, ...
## $ DayOfWeek <int> 1, 7, 6, 5, 4, 3, 2, 1, 7, 6, 5, 4, 3, 2, 1,...
## $ DepTime <int> 924, 925, 1045, 1516, 950, 944, 924, 1144, 9...
## $ ArrTime <int> 1413, 1410, 1445, 1916, 1344, 1350, 1337, 16...
## $ UniqueCarrier <chr> "Continental", "Continental", "Continental",...
## $ FlightNum <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ TailNum <chr> "N69063", "N76064", "N69063", "N77066", "N76...
## $ ActualElapsedTime <int> 529, 525, 480, 480, 474, 486, 493, 501, 489,...
## $ AirTime <int> 492, 493, 459, 463, 455, 471, 473, 464, 466,...
## $ ArrDelay <int> 23, 20, 55, 326, -6, 0, -13, 135, -15, -10, ...
## $ DepDelay <int> -1, 0, 80, 351, 25, 19, -1, 139, 1, 17, 3, 1...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "HNL", "HNL", "HNL", "HNL", "HNL", "HNL", "H...
## $ Distance <int> 3904, 3904, 3904, 3904, 3904, 3904, 3904, 39...
## $ TaxiIn <int> 6, 13, 4, 7, 4, 5, 5, 7, 6, 3, 6, 4, 6, 4, 5...
## $ TaxiOut <int> 31, 19, 17, 10, 15, 10, 15, 30, 17, 10, 19, ...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# All flights flown by one of JetBlue, Southwest, or Delta
filter(hflights, UniqueCarrier %in% c('JetBlue', 'Southwest', 'Delta')) %>% glimpse()
## Observations: 48,679
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 8, 9,...
## $ DayOfWeek <int> 6, 6, 7, 7, 1, 1, 2, 2, 3, 3, 4, 5, 5, 6, 7,...
## $ DepTime <int> 654, 1639, 703, 1604, 659, 1801, 654, 1608, ...
## $ ArrTime <int> 1124, 2110, 1113, 2040, 1100, 2200, 1103, 20...
## $ UniqueCarrier <chr> "JetBlue", "JetBlue", "JetBlue", "JetBlue", ...
## $ FlightNum <int> 620, 622, 620, 622, 620, 622, 620, 622, 620,...
## $ TailNum <chr> "N324JB", "N324JB", "N324JB", "N324JB", "N22...
## $ ActualElapsedTime <int> 210, 211, 190, 216, 181, 179, 189, 206, 183,...
## $ AirTime <int> 181, 188, 172, 176, 166, 165, 168, 175, 167,...
## $ ArrDelay <int> 5, 61, -6, 31, -19, 111, -16, 25, -14, -6, -...
## $ DepDelay <int> -6, 54, 3, 19, -1, 136, -6, 23, 0, 9, -3, -6...
## $ Origin <chr> "HOU", "HOU", "HOU", "HOU", "HOU", "HOU", "H...
## $ Dest <chr> "JFK", "JFK", "JFK", "JFK", "JFK", "JFK", "J...
## $ Distance <int> 1428, 1428, 1428, 1428, 1428, 1428, 1428, 14...
## $ TaxiIn <int> 6, 12, 6, 9, 3, 5, 9, 8, 4, 14, 7, 6, 9, 9, ...
## $ TaxiOut <int> 23, 11, 12, 31, 12, 9, 12, 23, 12, 10, 9, 25...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# All flights where taxiing took longer than flying
filter(hflights, (TaxiIn + TaxiOut) > AirTime) %>% glimpse()
## Observations: 1,389
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 24, 30, 24, 10, 31, 31, 31, 31, 30, 30, 30, ...
## $ DayOfWeek <int> 1, 7, 1, 1, 1, 1, 1, 1, 7, 7, 7, 7, 7, 7, 7,...
## $ DepTime <int> 731, 1959, 1621, 941, 1301, 2113, 1434, 900,...
## $ ArrTime <int> 904, 2132, 1749, 1113, 1356, 2215, 1539, 100...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 460, 533, 1121, 1436, 241, 1533, 1541, 1583,...
## $ TailNum <chr> "N545AA", "N455AA", "N484AA", "N591AA", "N14...
## $ ActualElapsedTime <int> 93, 93, 88, 92, 55, 62, 65, 66, 64, 84, 80, ...
## $ AirTime <int> 42, 43, 43, 45, 27, 30, 30, 32, 31, 40, 37, ...
## $ ArrDelay <int> 29, 12, 4, 48, -2, 20, 15, 10, 10, 54, 16, 1...
## $ DepDelay <int> 11, -6, -9, 31, -4, 13, 4, 0, -1, 39, 2, -4,...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "AUS", "AUS", "A...
## $ Distance <int> 224, 224, 224, 224, 140, 140, 140, 140, 140,...
## $ TaxiIn <int> 14, 10, 10, 27, 5, 7, 5, 5, 6, 10, 6, 4, 6, ...
## $ TaxiOut <int> 37, 40, 35, 20, 23, 25, 30, 29, 27, 34, 37, ...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# All flights that departed before 5am or arrived after 10pm
filter(hflights, DepTime < 500 | ArrTime > 2200) %>% glimpse()
## Observations: 27,799
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 4, 14, 10, 26, 30, 9, 31, 31, 31, 31, 31, 31...
## $ DayOfWeek <int> 2, 5, 1, 3, 7, 7, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DepTime <int> 2100, 2119, 1934, 1905, 1856, 1938, 1919, 21...
## $ ArrTime <int> 2207, 2229, 2235, 2211, 2209, 2228, 2231, 23...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 533, 533, 1294, 1294, 1294, 731, 190, 209, 2...
## $ TailNum <chr> "N4XGAA", "N549AA", "N3BXAA", "N3BXAA", "N3C...
## $ ActualElapsedTime <int> 67, 70, 121, 126, 133, 290, 132, 268, 141, 1...
## $ AirTime <int> 42, 45, 107, 111, 108, 253, 107, 256, 121, 1...
## $ ArrDelay <int> 47, 69, 80, 56, 54, 78, -12, -15, -18, -10, ...
## $ DepDelay <int> 55, 74, 99, 70, 61, 73, -1, -7, 0, 8, -1, 5,...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "MIA", "MIA", "MIA", "SEA", "M...
## $ Distance <int> 224, 224, 964, 964, 964, 1874, 964, 1825, 10...
## $ TaxiIn <int> 3, 5, 3, 5, 7, 5, 5, 4, 5, 6, 4, 18, 4, 7, 9...
## $ TaxiOut <int> 22, 20, 11, 10, 18, 32, 20, 8, 15, 9, 18, 22...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# All flights that departed late but arrived ahead of schedule
filter(hflights, DepDelay > 0 & ArrDelay < 0) %>% glimpse()
## Observations: 27,712
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 2, 5, 18, 18, 12, 13, 26, 1, 10, 12, 15, 17,...
## $ DayOfWeek <int> 7, 3, 2, 2, 3, 4, 3, 6, 1, 3, 6, 1, 4, 7, 6,...
## $ DepTime <int> 1401, 1405, 1408, 721, 2015, 2020, 2009, 163...
## $ ArrTime <int> 1501, 1507, 1508, 827, 2113, 2116, 2103, 173...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 460, 533, 533, 533, 1121, 112...
## $ TailNum <chr> "N557AA", "N492AA", "N507AA", "N558AA", "N55...
## $ ActualElapsedTime <int> 60, 62, 60, 66, 58, 56, 54, 65, 61, 68, 64, ...
## $ AirTime <int> 45, 44, 42, 46, 39, 44, 39, 37, 41, 44, 48, ...
## $ ArrDelay <int> -9, -3, -2, -8, -7, -4, -17, -9, -5, -6, -9,...
## $ DepDelay <int> 1, 5, 8, 1, 10, 15, 4, 1, 9, 1, 2, 2, 4, 5, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 6, 9, 7, 7, 9, 4, 9, 16, 8, 5, 5, 10, 10, 9,...
## $ TaxiOut <int> 9, 9, 11, 13, 10, 8, 6, 12, 12, 19, 11, 11, ...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# All cancelled weekend flights
filter(hflights, Cancelled == 1 & DayOfWeek %in% c(6,7)) %>% glimpse()
## Observations: 585
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 9, 29, 9, 9, 9, 2, 29, 9, 1, 9, 9, 9, 9, 8, ...
## $ DayOfWeek <int> 7, 6, 7, 7, 7, 7, 6, 7, 6, 7, 7, 7, 7, 6, 7,...
## $ DepTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ArrTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ UniqueCarrier <chr> "American", "Continental", "Continental", "D...
## $ FlightNum <int> 1820, 408, 755, 8, 6726, 1629, 1590, 5229, 2...
## $ TailNum <chr> "N4XCAA", "", "", "N933DL", "N779SK", "N749S...
## $ ActualElapsedTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ AirTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ArrDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ DepDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "HOU", "I...
## $ Dest <chr> "DFW", "EWR", "ATL", "ATL", "ASE", "DAL", "A...
## $ Distance <int> 224, 1400, 689, 689, 914, 239, 689, 469, 696...
## $ TaxiIn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ TaxiOut <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ Cancelled <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CancellationCode <chr> "B", "A", "B", "B", "B", "A", "A", "A", "A",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "weather", "carrier", "weather", "weather", ...
# All flights that were cancelled after being delayed
filter(hflights, DepDelay > 0 & Cancelled == 1) %>% glimpse()
## Observations: 40
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4,...
## $ DayofMonth <int> 26, 11, 19, 7, 4, 8, 2, 9, 1, 31, 4, 8, 21, ...
## $ DayOfWeek <int> 3, 2, 3, 5, 5, 2, 3, 3, 2, 4, 1, 5, 4, 1, 1,...
## $ DepTime <int> 1926, 1100, 1811, 2028, 1638, 1057, 802, 904...
## $ ArrTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ UniqueCarrier <chr> "Continental", "US_Airways", "ExpressJet", "...
## $ FlightNum <int> 310, 944, 2376, 3050, 1121, 408, 2189, 2605,...
## $ TailNum <chr> "N77865", "N452UW", "N15932", "N15912", "N53...
## $ ActualElapsedTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ AirTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ArrDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ DepDelay <int> 26, 135, 6, 73, 8, 187, 2, 4, 28, 156, 42, 5...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "EWR", "CLT", "ICT", "JAX", "DFW", "EWR", "D...
## $ Distance <int> 1400, 913, 542, 817, 224, 1400, 217, 217, 68...
## $ TaxiIn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ TaxiOut <int> NA, NA, NA, 19, 19, NA, NA, NA, 19, NA, NA, ...
## $ Cancelled <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CancellationCode <chr> "B", "B", "B", "A", "A", "A", "B", "B", "A",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "weather", "weather", "weather", "carrier", ...
# Select the flights that had JFK as their destination: c1
c1 <- filter(hflights, Dest == 'JFK')
# Combine the Year, Month and DayofMonth variables to create a Date column: c2
c2 <- mutate(c1, Date = paste(Year,Month,DayofMonth, sep="-"))
# Print out a selection of columns of c2
select(c2, Date, DepTime, ArrTime, TailNum)
## # A tibble: 695 x 4
## Date DepTime ArrTime TailNum
## <chr> <int> <int> <chr>
## 1 2011-1-1 654 1124 N324JB
## 2 2011-1-1 1639 2110 N324JB
## 3 2011-1-2 703 1113 N324JB
## 4 2011-1-2 1604 2040 N324JB
## 5 2011-1-3 659 1100 N229JB
## 6 2011-1-3 1801 2200 N206JB
## 7 2011-1-4 654 1103 N267JB
## 8 2011-1-4 1608 2034 N267JB
## 9 2011-1-5 700 1103 N708JB
## 10 2011-1-5 1544 1954 N644JB
## # ... with 685 more rows
hflights %>%
filter(
Distance > 1000,
DayOfWeek > 5,
TaxiIn + TaxiOut < 15
) %>%
glimpse()
## Observations: 1,739
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 23, 30, 30, 29, 23, 23, 23, 22, 16, 16, 16, ...
## $ DayOfWeek <int> 7, 7, 7, 6, 7, 7, 7, 6, 7, 7, 7, 7, 7, 7, 7,...
## $ DepTime <int> 1535, 851, 2234, 1220, 847, 1224, 931, 942, ...
## $ ArrTime <int> 1933, 1230, 2, 1353, 1213, 1345, 1045, 1340,...
## $ UniqueCarrier <chr> "JetBlue", "Continental", "Continental", "Co...
## $ FlightNum <int> 624, 1058, 1717, 1620, 1058, 1629, 1723, 1, ...
## $ TailNum <chr> "N599JB", "N39726", "N38417", "N87512", "N16...
## $ ActualElapsedTime <int> 178, 159, 208, 153, 146, 201, 194, 478, 288,...
## $ AirTime <int> 164, 145, 195, 139, 134, 188, 181, 465, 275,...
## $ ArrDelay <int> -27, -13, 89, 19, -30, -27, -28, -10, 12, -1...
## $ DepDelay <int> 0, -2, 94, 45, -6, -1, -5, 17, -2, -5, 3, -1...
## $ Origin <chr> "HOU", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "JFK", "DCA", "SAN", "PHX", "DCA", "SNA", "O...
## $ Distance <int> 1428, 1208, 1303, 1009, 1208, 1347, 1334, 39...
## $ TaxiIn <int> 6, 3, 3, 5, 4, 4, 3, 3, 5, 3, 3, 4, 3, 6, 4,...
## $ TaxiOut <int> 8, 11, 10, 9, 8, 9, 10, 10, 8, 10, 9, 10, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# Definition of dtc
dtc <- filter(hflights, Cancelled == 1, !is.na(DepDelay))
# Arrange dtc by departure delays
arrange(dtc, DepDelay) %>% glimpse()
## Observations: 68
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 7, 1, 12, 10, 7, 9, 2, 5, 1, 1, 2, 3, 4, 4, ...
## $ DayofMonth <int> 23, 17, 1, 12, 29, 29, 9, 9, 20, 17, 21, 18,...
## $ DayOfWeek <int> 6, 1, 4, 3, 5, 4, 3, 1, 4, 1, 1, 5, 6, 7, 1,...
## $ DepTime <int> 605, 916, 541, 2022, 1424, 1639, 555, 715, 1...
## $ ArrTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ UniqueCarrier <chr> "Frontier", "ExpressJet", "US_Airways", "Ame...
## $ FlightNum <int> 225, 3068, 282, 3724, 1079, 2062, 3265, 1177...
## $ TailNum <chr> "N912FR", "N13936", "N840AW", "N539MQ", "N14...
## $ ActualElapsedTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ AirTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ArrDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ DepDelay <int> -10, -9, -9, -8, -6, -6, -5, -5, -4, -4, -3,...
## $ Origin <chr> "HOU", "IAH", "IAH", "IAH", "IAH", "IAH", "H...
## $ Dest <chr> "DEN", "HRL", "PHX", "LAX", "ORD", "ATL", "D...
## $ Distance <int> 883, 295, 1009, 1379, 925, 689, 247, 1076, 1...
## $ TaxiIn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ TaxiOut <int> 10, NA, NA, NA, 13, NA, 11, 17, NA, 8, NA, N...
## $ Cancelled <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CancellationCode <chr> "A", "B", "A", "A", "A", "B", "A", "A", "A",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "carrier", "weather", "carrier", "carrier", ...
# Arrange dtc so that cancellation reasons are grouped
arrange(dtc, CancellationCode) %>% glimpse()
## Observations: 68
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 2, 2, 2, 2, 2, 3, 4, 4, 4, 4, 4, 4, 4,...
## $ DayofMonth <int> 20, 7, 4, 8, 1, 21, 9, 18, 4, 8, 21, 4, 11, ...
## $ DayOfWeek <int> 4, 5, 5, 2, 2, 1, 3, 5, 1, 5, 4, 1, 1, 4, 6,...
## $ DepTime <int> 1413, 2028, 1638, 1057, 1508, 2257, 555, 727...
## $ ArrTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ UniqueCarrier <chr> "United", "ExpressJet", "American", "Contine...
## $ FlightNum <int> 552, 3050, 1121, 408, 5812, 1111, 3265, 109,...
## $ TailNum <chr> "N509UA", "N15912", "N537AA", "N11641", "N95...
## $ ActualElapsedTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ AirTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ArrDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ DepDelay <int> -4, 73, 8, 187, 28, -3, -5, -3, 42, 548, 3, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "H...
## $ Dest <chr> "IAD", "JAX", "DFW", "EWR", "ATL", "AUS", "D...
## $ Distance <int> 1190, 817, 224, 1400, 689, 140, 247, 862, 68...
## $ TaxiIn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ TaxiOut <int> NA, 19, 19, NA, 19, NA, 11, NA, NA, NA, 5, N...
## $ Cancelled <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CancellationCode <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "carrier", "carrier", "carrier", "carrier", ...
# Arrange dtc according to carrier and departure delays
arrange(dtc, UniqueCarrier, DepDelay) %>% glimpse()
## Observations: 68
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 6, 8, 2, 10, 2, 7, 4, 4, 5, 9, 5, 9, 7, 1, 8...
## $ DayofMonth <int> 11, 18, 4, 12, 9, 17, 30, 10, 23, 29, 16, 26...
## $ DayOfWeek <int> 6, 4, 5, 3, 3, 7, 6, 7, 1, 4, 1, 1, 5, 3, 4,...
## $ DepTime <int> 1649, 1808, 1638, 2022, 555, 1917, 612, 1147...
## $ ArrTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ UniqueCarrier <chr> "AirTran", "American", "American", "American...
## $ FlightNum <int> 1595, 1294, 1121, 3724, 3265, 3717, 5386, 54...
## $ TailNum <chr> "N946AT", "N3FLAA", "N537AA", "N539MQ", "N61...
## $ ActualElapsedTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ AirTime <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ ArrDelay <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ DepDelay <int> 64, 3, 8, -8, -5, -3, -3, -3, -3, -2, -1, 22...
## $ Origin <chr> "HOU", "IAH", "IAH", "IAH", "HOU", "IAH", "I...
## $ Dest <chr> "BKG", "MIA", "DFW", "LAX", "DFW", "ORD", "M...
## $ Distance <int> 490, 964, 224, 1379, 247, 925, 469, 469, 696...
## $ TaxiIn <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ TaxiOut <int> 25, NA, 19, NA, 11, NA, NA, NA, NA, NA, NA, ...
## $ Cancelled <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ CancellationCode <chr> "A", "A", "A", "A", "A", "A", "A", "A", "A",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "carrier", "carrier", "carrier", "carrier", ...
# Arrange according to carrier and decreasing departure delays
arrange(hflights, UniqueCarrier, desc(DepDelay)) %>% glimpse()
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 2, 3, 2, 11, 5, 5, 4, 6, 5, 7, 7, 6, 6, 7, 5...
## $ DayofMonth <int> 19, 14, 16, 13, 26, 26, 28, 5, 7, 25, 28, 24...
## $ DayOfWeek <int> 6, 1, 3, 7, 4, 4, 4, 7, 6, 1, 4, 5, 6, 1, 4,...
## $ DepTime <int> 1902, 2024, 2349, 2312, 2353, 1922, 1045, 22...
## $ ArrTime <int> 2143, 2309, 227, 213, 305, 2229, 1328, 52, 1...
## $ UniqueCarrier <chr> "AirTran", "AirTran", "AirTran", "AirTran", ...
## $ FlightNum <int> 298, 286, 292, 292, 296, 288, 290, 292, 290,...
## $ TailNum <chr> "N974AT", "N899AT", "N934AT", "N951AT", "N95...
## $ ActualElapsedTime <int> 101, 105, 98, 121, 132, 127, 103, 105, 107, ...
## $ AirTime <int> 89, 89, 85, 99, 115, 104, 88, 91, 94, 105, 1...
## $ ArrDelay <int> 500, 483, 367, 353, 292, 290, 258, 259, 216,...
## $ DepDelay <int> 507, 493, 380, 347, 275, 274, 270, 269, 224,...
## $ Origin <chr> "HOU", "HOU", "HOU", "HOU", "HOU", "HOU", "H...
## $ Dest <chr> "ATL", "ATL", "ATL", "ATL", "ATL", "ATL", "A...
## $ Distance <int> 696, 696, 696, 696, 696, 696, 696, 696, 696,...
## $ TaxiIn <int> 5, 7, 4, 14, 11, 11, 7, 4, 7, 9, 6, 10, 5, 1...
## $ TaxiOut <int> 7, 9, 9, 8, 6, 12, 8, 10, 6, 13, 7, 9, 9, 8,...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# Arrange flights by total delay (normal order).
arrange(hflights, (DepDelay + ArrDelay)) %>% glimpse()
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 7, 8, 8, 8, 8, 12, 1, 8, 8, 8, 8, 9, 12, 9, ...
## $ DayofMonth <int> 3, 31, 21, 28, 29, 25, 30, 3, 4, 18, 26, 11,...
## $ DayOfWeek <int> 7, 3, 7, 7, 1, 7, 7, 3, 4, 4, 5, 7, 6, 2, 6,...
## $ DepTime <int> 1914, 934, 935, 2059, 935, 741, 620, 1741, 9...
## $ ArrTime <int> 2039, 1039, 1039, 2206, 1041, 926, 812, 1810...
## $ UniqueCarrier <chr> "ExpressJet", "SkyWest", "SkyWest", "SkyWest...
## $ FlightNum <int> 2804, 2040, 2001, 2003, 2040, 4591, 4461, 26...
## $ TailNum <chr> "N12157", "N783SK", "N767SK", "N783SK", "N76...
## $ ActualElapsedTime <int> 85, 185, 184, 187, 186, 165, 172, 89, 191, 1...
## $ AirTime <int> 66, 172, 171, 171, 169, 147, 156, 73, 177, 1...
## $ ArrDelay <int> -70, -56, -56, -54, -54, -57, -49, -40, -49,...
## $ DepDelay <int> -1, -11, -10, -11, -10, -4, -10, -19, -10, -...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "MEM", "BFL", "BFL", "BFL", "BFL", "SLC", "S...
## $ Distance <int> 468, 1428, 1428, 1428, 1428, 1195, 1195, 501...
## $ TaxiIn <int> 4, 3, 3, 5, 4, 4, 5, 5, 4, 4, 5, 6, 3, 4, 5,...
## $ TaxiOut <int> 15, 10, 10, 11, 13, 14, 11, 11, 10, 8, 10, 1...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# Keep flights leaving to DFW before 8am and arrange according to decreasing AirTime
hflights %>% filter(Dest == 'DFW', DepTime < 800) %>% arrange(desc(AirTime)) %>% glimpse()
## Observations: 799
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 11, 8, 10, 5, 4, 4, 6, 9, 3, 12, 4, 4, 5, 6,...
## $ DayofMonth <int> 22, 25, 12, 2, 4, 4, 21, 1, 14, 5, 25, 11, 1...
## $ DayOfWeek <int> 2, 4, 3, 1, 1, 1, 2, 4, 1, 1, 1, 1, 3, 2, 1,...
## $ DepTime <int> 635, 602, 559, 716, 741, 627, 726, 715, 729,...
## $ ArrTime <int> 825, 758, 738, 854, 949, 742, 848, 844, 917,...
## $ UniqueCarrier <chr> "American", "American_Eagle", "American_Eagl...
## $ FlightNum <int> 1903, 3265, 3265, 2237, 1225, 3265, 2259, 19...
## $ TailNum <chr> "N477AA", "N633MQ", "N632MQ", "N552AA", "N4X...
## $ ActualElapsedTime <int> 110, 116, 99, 98, 128, 75, 82, 89, 108, 83, ...
## $ AirTime <int> 81, 74, 71, 70, 63, 62, 62, 62, 61, 61, 59, ...
## $ ArrDelay <int> 40, 53, 33, 29, 89, 37, 9, 9, 33, 2, 20, 1, ...
## $ DepDelay <int> 0, 2, -1, 1, 31, 27, -4, -5, -1, -1, -1, -7,...
## $ Origin <chr> "IAH", "HOU", "HOU", "IAH", "IAH", "HOU", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 247, 247, 224, 224, 247, 224, 224, 224,...
## $ TaxiIn <int> 11, 21, 8, 11, 6, 3, 5, 16, 11, 3, 7, 5, 7, ...
## $ TaxiOut <int> 18, 21, 20, 17, 59, 10, 15, 11, 36, 19, 25, ...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
glimpse(hflights)
## Observations: 227,496
## Variables: 22
## $ Year <int> 2011, 2011, 2011, 2011, 2011, 2011, 2011, 20...
## $ Month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ DayofMonth <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1...
## $ DayOfWeek <int> 6, 7, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6,...
## $ DepTime <int> 1400, 1401, 1352, 1403, 1405, 1359, 1359, 13...
## $ ArrTime <int> 1500, 1501, 1502, 1513, 1507, 1503, 1509, 14...
## $ UniqueCarrier <chr> "American", "American", "American", "America...
## $ FlightNum <int> 428, 428, 428, 428, 428, 428, 428, 428, 428,...
## $ TailNum <chr> "N576AA", "N557AA", "N541AA", "N403AA", "N49...
## $ ActualElapsedTime <int> 60, 60, 70, 70, 62, 64, 70, 59, 71, 70, 70, ...
## $ AirTime <int> 40, 45, 48, 39, 44, 45, 43, 40, 41, 45, 42, ...
## $ ArrDelay <int> -10, -9, -8, 3, -3, -7, -1, -16, 44, 43, 29,...
## $ DepDelay <int> 0, 1, -8, 3, 5, -1, -1, -5, 43, 43, 29, 19, ...
## $ Origin <chr> "IAH", "IAH", "IAH", "IAH", "IAH", "IAH", "I...
## $ Dest <chr> "DFW", "DFW", "DFW", "DFW", "DFW", "DFW", "D...
## $ Distance <int> 224, 224, 224, 224, 224, 224, 224, 224, 224,...
## $ TaxiIn <int> 7, 6, 5, 9, 9, 6, 12, 7, 8, 6, 8, 4, 6, 5, 6...
## $ TaxiOut <int> 13, 9, 17, 22, 9, 13, 15, 12, 22, 19, 20, 11...
## $ Cancelled <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ CancellationCode <chr> "E", "E", "E", "E", "E", "E", "E", "E", "E",...
## $ Diverted <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ Code <chr> "not cancelled", "not cancelled", "not cance...
# Print out a summary with variables min_dist and max_dist
summarize(hflights, min_dist = min(Distance), max_dist = max(Distance))
## # A tibble: 1 x 2
## min_dist max_dist
## <dbl> <dbl>
## 1 79 3904
# Print out a summary with variable max_div
hflights %>% filter(Diverted == 1) %>% summarize(max_div = max(Distance))
## # A tibble: 1 x 1
## max_div
## <dbl>
## 1 3904
Aggregate functions defined in R:
min(x)
- minimum value of vector xmax(x)
- maximum value of vector xmean(x)
- mean value of vector xmedian(x)
- median value of vector xquantile(x, p)
- pth quantile of vector xsd(x)
- standard deviation of vector xvar(x)
- variance of vector xIQR(x)
- Inter Quartile Range (IQR) of vector xdiff(range(x))
- total range of vector x# Remove rows that have NA ArrDelay: temp1
temp1 <- filter(hflights, !is.na(ArrDelay))
# Generate summary about ArrDelay column of temp1
summarize(temp1,
earliest = min(ArrDelay),
average = mean(ArrDelay),
latest = max(ArrDelay),
sd = sd(ArrDelay)
)
## # A tibble: 1 x 4
## earliest average latest sd
## <dbl> <dbl> <dbl> <dbl>
## 1 -70 7.094334 978 30.70852
# Keep rows that have no NA TaxiIn and no NA TaxiOut: temp2
temp2 <- filter(hflights, !is.na(TaxiIn) & !is.na(TaxiOut))
# Print the maximum taxiing difference of temp2 with summarise()
summarise(temp2, max_taxi_diff = max(abs(TaxiIn - TaxiOut)))
## # A tibble: 1 x 1
## max_taxi_diff
## <int>
## 1 160
dplyr has some of its own aggregate functions:
first(x)
- The first element of vector xlast(x)
- The last element of vector xnth(x, n)
- The nth element of vector xn()
- The number of rows in the data.frame or group of observations that summarise() describesn_distinct(x)
- The number of unique values in vector x# Generate summarizing statistics for hflights
summarise(hflights,
n_obs = n(),
n_carrier = n_distinct(UniqueCarrier),
n_dest = n_distinct(Dest),
dest100 = nth(Dest, 100)
)
## # A tibble: 1 x 4
## n_obs n_carrier n_dest dest100
## <int> <int> <int> <chr>
## 1 227496 15 116 DFW
# Filter hflights to keep all American Airline flights: aa
aa <- filter(hflights, UniqueCarrier == 'American')
# Generate summarizing statistics for aa
summarise(aa,
n_flights = n(),
n_canc = sum(Cancelled),
p_canc = n_canc/n_flights * 100,
avg_delay = mean(ArrDelay, na.rm=T)
)
## # A tibble: 1 x 4
## n_flights n_canc p_canc avg_delay
## <int> <int> <dbl> <dbl>
## 1 3244 60 1.849568 0.8917558
%>%
is probably my favorite thing in R.
to pass it into the next function in any location!# Write the 'piped' version of the English sentences.
hflights %>%
mutate(diff = TaxiOut - TaxiIn) %>%
filter(!is.na(diff)) %>%
summarize(avg = mean(diff))
## # A tibble: 1 x 1
## avg
## <dbl>
## 1 8.992064
# Build data frame with 4 columns of hflights and 2 self-defined columns: d
d <- hflights %>%
select(Dest, UniqueCarrier, Distance, ActualElapsedTime) %>%
mutate(
RealTime = ActualElapsedTime + 100,
mph = Distance/RealTime*60
)
# Filter and summarise d according to the instructions
d %>%
filter(
!is.na(mph),
mph < 70
) %>%
summarize(
n_less = n(),
n_dest = n_distinct(Dest),
min_dist = min(Distance),
max_dist = max(Distance)
)
## # A tibble: 1 x 4
## n_less n_dest min_dist max_dist
## <int> <int> <dbl> <dbl>
## 1 6726 13 79 305
# Solve the exercise using a combination of dplyr verbs and %>%
hflights %>%
mutate(RealTime = ActualElapsedTime + 100, mph = Distance / RealTime * 60) %>%
filter(mph < 105 | Cancelled == 1 | Diverted == 1) %>%
summarise(n_non = n(),
p_non = n_non / nrow(hflights) * 100,
n_dest = n_distinct(Dest),
min_dist = min (Distance),
max_dist = max(Distance))
## # A tibble: 1 x 5
## n_non p_non n_dest min_dist max_dist
## <int> <dbl> <int> <dbl> <dbl>
## 1 42400 18.63769 113 79 3904
hflights %>%
filter(
!is.na(ArrTime),
!is.na(DepTime),
ArrTime < DepTime
) %>%
summarise(n = n())
## # A tibble: 1 x 1
## n
## <int>
## 1 2718
group_by
with summarize
is very powerful
mutate
and arrange
to create powerful window functions# Make an ordered per-carrier summary of hflights
hflights %>%
group_by(UniqueCarrier) %>%
summarise(
n_flights = n(),
n_canc = sum(Cancelled),
p_canc = n_canc/n_flights * 100,
avg_delay = mean(ArrDelay, na.rm=T)
) %>%
arrange(avg_delay, p_canc)
## # A tibble: 15 x 5
## UniqueCarrier n_flights n_canc p_canc avg_delay
## <chr> <int> <int> <dbl> <dbl>
## 1 US_Airways 4082 46 1.1268986 -0.6307692
## 2 American 3244 60 1.8495684 0.8917558
## 3 AirTran 2139 21 0.9817672 1.8536239
## 4 Alaska 365 0 0.0000000 3.1923077
## 5 Mesa 79 1 1.2658228 4.0128205
## 6 Delta 2641 42 1.5903067 6.0841374
## 7 Continental 70032 475 0.6782614 6.0986983
## 8 American_Eagle 4648 135 2.9044750 7.1529751
## 9 Atlantic_Southeast 2204 76 3.4482759 7.2569543
## 10 Southwest 45343 703 1.5504047 7.5871430
## 11 Frontier 838 6 0.7159905 7.6682692
## 12 ExpressJet 73053 1132 1.5495599 8.1865242
## 13 SkyWest 16061 224 1.3946828 8.6934922
## 14 JetBlue 695 18 2.5899281 9.8588410
## 15 United 2072 34 1.6409266 10.4628628
# Make an ordered per-day summary of hflights
hflights %>%
group_by(DayOfWeek) %>%
summarize(avg_taxi = mean(TaxiIn + TaxiOut, na.rm=T)) %>%
arrange(desc(avg_taxi))
## # A tibble: 7 x 2
## DayOfWeek avg_taxi
## <int> <dbl>
## 1 1 21.77027
## 2 2 21.43505
## 3 4 21.26076
## 4 3 21.19055
## 5 5 21.15805
## 6 7 20.93726
## 7 6 20.43061
# Solution to first instruction
hflights %>%
filter(!is.na(ArrDelay)) %>%
group_by(UniqueCarrier) %>%
summarize(p_delay = sum(ArrDelay > 0)/n()) %>%
mutate(rank = rank(p_delay)) %>%
arrange(rank)
## # A tibble: 15 x 3
## UniqueCarrier p_delay rank
## <chr> <dbl> <dbl>
## 1 American 0.3030208 1
## 2 AirTran 0.3112269 2
## 3 US_Airways 0.3267990 3
## 4 Atlantic_Southeast 0.3677511 4
## 5 American_Eagle 0.3696714 5
## 6 Delta 0.3871092 6
## 7 JetBlue 0.3952452 7
## 8 Alaska 0.4368132 8
## 9 Southwest 0.4644557 9
## 10 Mesa 0.4743590 10
## 11 Continental 0.4907385 11
## 12 ExpressJet 0.4943420 12
## 13 United 0.4963109 13
## 14 SkyWest 0.5350105 14
## 15 Frontier 0.5564904 15
# Solution to second instruction
hflights %>%
filter(
!is.na(ArrDelay),
ArrDelay > 0
) %>%
group_by(UniqueCarrier) %>%
summarize(avg = mean(ArrDelay)) %>%
mutate(rank = rank(avg)) %>%
arrange(rank)
## # A tibble: 15 x 3
## UniqueCarrier avg rank
## <chr> <dbl> <dbl>
## 1 Mesa 18.67568 1
## 2 Frontier 18.68683 2
## 3 US_Airways 20.70235 3
## 4 Continental 22.13374 4
## 5 Alaska 22.91195 5
## 6 SkyWest 24.14663 6
## 7 ExpressJet 24.19337 7
## 8 Southwest 25.27750 8
## 9 AirTran 27.85693 9
## 10 American 28.49740 10
## 11 Delta 32.12463 11
## 12 United 32.48067 12
## 13 American_Eagle 38.75135 13
## 14 Atlantic_Southeast 40.24231 14
## 15 JetBlue 45.47744 15
# Which plane (by tail number) flew out of Houston the most times? How many times? adv1
adv1 <- hflights %>%
group_by(TailNum) %>%
summarize(n = n()) %>%
filter(n == max(n))
adv1
## # A tibble: 1 x 2
## TailNum n
## <chr> <int>
## 1 N14945 971
# How many airplanes only flew to one destination from Houston? adv2
adv2 <- hflights %>%
group_by(TailNum) %>%
summarize(n_dest = n_distinct(Dest)) %>%
filter(n_dest == 1) %>%
summarize(nplanes = n())
adv2
## # A tibble: 1 x 1
## nplanes
## <int>
## 1 1526
# Find the most visited destination for each carrier: adv3
adv3 <- hflights %>%
group_by(UniqueCarrier, Dest) %>%
summarize(n = n()) %>%
group_by(UniqueCarrier) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1) %>%
arrange(UniqueCarrier, rank)
adv3
## # A tibble: 15 x 4
## # Groups: UniqueCarrier [15]
## UniqueCarrier Dest n rank
## <chr> <chr> <int> <dbl>
## 1 AirTran ATL 2029 1
## 2 Alaska SEA 365 1
## 3 American DFW 2105 1
## 4 American_Eagle DFW 2424 1
## 5 Atlantic_Southeast DTW 851 1
## 6 Continental EWR 3924 1
## 7 Delta ATL 2396 1
## 8 ExpressJet CRP 3175 1
## 9 Frontier DEN 837 1
## 10 JetBlue JFK 695 1
## 11 Mesa CLT 71 1
## 12 SkyWest COS 1335 1
## 13 Southwest DAL 8243 1
## 14 United SFO 643 1
## 15 US_Airways CLT 2212 1
# Find the carrier that travels to each destination the most: adv4
adv4 <- hflights %>%
group_by(Dest, UniqueCarrier) %>%
summarize(n = n()) %>%
group_by(Dest) %>%
mutate(rank = rank(desc(n))) %>%
filter(rank == 1)
adv4
## # A tibble: 116 x 4
## # Groups: Dest [116]
## Dest UniqueCarrier n rank
## <chr> <chr> <int> <dbl>
## 1 ABQ Southwest 1019 1
## 2 AEX ExpressJet 724 1
## 3 AGS Continental 1 1
## 4 AMA ExpressJet 1297 1
## 5 ANC Continental 125 1
## 6 ASE SkyWest 125 1
## 7 ATL Delta 2396 1
## 8 AUS Continental 2645 1
## 9 AVL ExpressJet 350 1
## 10 BFL SkyWest 504 1
## # ... with 106 more rows
library(data.table)
# Convert hflights to a data.table
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
hflights2 <- as.data.table(hflights)
class(hflights2)
## [1] "data.table" "data.frame"
# Use summarise to calculate n_carrier
s2 <- hflights2 %>%
summarize(n_carrier = n_distinct(UniqueCarrier))
s2
## n_carrier
## 1 15
library(RMySQL)
library(dbplyr)
# Set up a connection to the mysql database
my_db <- src_mysql(dbname = "dplyr",
host = "courses.csrrinzqubik.us-east-1.rds.amazonaws.com",
port = 3306,
user = "student",
password = "datacamp")
# Reference a table within that source: nycflights
nycflights <- tbl(my_db, "dplyr")
# glimpse at nycflights
glimpse(nycflights)
## Observations: 25
## Variables: 17
## $ id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 1...
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ dep_delay <int> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ arr_delay <int> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time <int> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance <int> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
## $ hour <int> 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6...
## $ minute <int> 17, 33, 42, 44, 54, 54, 55, 57, 57, 58, 58, 58, 58, ...
# Ordered, grouped summary of nycflights
nycflights %>%
group_by(carrier) %>%
summarise(n_flights = n(), avg_delay = mean(arr_delay)) %>%
arrange(avg_delay)
## # Source: lazy query [?? x 3]
## # Database: mysql 5.6.34-log
## # [student@courses.csrrinzqubik.us-east-1.rds.amazonaws.com:/dplyr]
## # Ordered by: avg_delay
## carrier n_flights avg_delay
## <chr> <dbl> <dbl>
## 1 AS 714 -9.8613
## 2 HA 342 -6.9152
## 3 AA 32729 0.3556
## 4 DL 48110 1.6289
## 5 VX 5162 1.7487
## 6 US 20536 2.0565
## 7 UA 58665 3.5045
## 8 9E 18460 6.9135
## 9 B6 54635 9.3565
## 10 WN 12275 9.4675
## # ... with more rows