Data loading
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.2.2
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(hflights)
## Warning: package 'hflights' was built under R version 3.2.2
str(hflights)
## 'data.frame': 227496 obs. of 21 variables:
## $ Year : int 2011 2011 2011 2011 2011 2011 2011 2011 2011 2011 ...
## $ Month : int 1 1 1 1 1 1 1 1 1 1 ...
## $ DayofMonth : int 1 2 3 4 5 6 7 8 9 10 ...
## $ DayOfWeek : int 6 7 1 2 3 4 5 6 7 1 ...
## $ DepTime : int 1400 1401 1352 1403 1405 1359 1359 1355 1443 1443 ...
## $ ArrTime : int 1500 1501 1502 1513 1507 1503 1509 1454 1554 1553 ...
## $ UniqueCarrier : chr "AA" "AA" "AA" "AA" ...
## $ FlightNum : int 428 428 428 428 428 428 428 428 428 428 ...
## $ TailNum : chr "N576AA" "N557AA" "N541AA" "N403AA" ...
## $ ActualElapsedTime: int 60 60 70 70 62 64 70 59 71 70 ...
## $ AirTime : int 40 45 48 39 44 45 43 40 41 45 ...
## $ ArrDelay : int -10 -9 -8 3 -3 -7 -1 -16 44 43 ...
## $ DepDelay : int 0 1 -8 3 5 -1 -1 -5 43 43 ...
## $ Origin : chr "IAH" "IAH" "IAH" "IAH" ...
## $ Dest : chr "DFW" "DFW" "DFW" "DFW" ...
## $ Distance : int 224 224 224 224 224 224 224 224 224 224 ...
## $ TaxiIn : int 7 6 5 9 9 6 12 7 8 6 ...
## $ TaxiOut : int 13 9 17 22 9 13 15 12 22 19 ...
## $ Cancelled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CancellationCode : chr "" "" "" "" ...
## $ Diverted : int 0 0 0 0 0 0 0 0 0 0 ...
hflights <- tbl_df(hflights)
class(hflights)
## [1] "tbl_df" "tbl" "data.frame"
hflights
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
- select(), which returns a subset of the columns,
- filter(), that is able to return a subset of the rows,
- arrange(), that reorders the rows according to single or multiple variables,
- mutate(), used to add columns from existing data,
- summarise(), which reduces each group to a single row by calculating aggregate measures.
print hflights, nothing has changed!
hflights
## Source: local data frame [227,496 x 21]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int)
Return a copy of hflights containing the columns Origin up to Cancelled
select(hflights,Origin, Dest, Distance, TaxiIn, TaxiOut, Cancelled)
## Source: local data frame [227,496 x 6]
##
## Origin Dest Distance TaxiIn TaxiOut Cancelled
## (chr) (chr) (int) (int) (int) (int)
## 1 IAH DFW 224 7 13 0
## 2 IAH DFW 224 6 9 0
## 3 IAH DFW 224 5 17 0
## 4 IAH DFW 224 9 22 0
## 5 IAH DFW 224 9 9 0
## 6 IAH DFW 224 6 13 0
## 7 IAH DFW 224 12 15 0
## 8 IAH DFW 224 7 12 0
## 9 IAH DFW 224 8 22 0
## 10 IAH DFW 224 6 19 0
## .. ... ... ... ... ... ...
Find the most concise way to select: columns Year up to and including DayOfWeek, columns ArrDelay up to and including Diverted. (The - operator allows you to select everything except a column or a range of columns.)
select(hflights, -DepTime, -ArrTime, -UniqueCarrier, -FlightNum, -TailNum, -ActualElapsedTime, -AirTime)
## Source: local data frame [227,496 x 14]
##
## Year Month DayofMonth DayOfWeek ArrDelay DepDelay Origin Dest
## (int) (int) (int) (int) (int) (int) (chr) (chr)
## 1 2011 1 1 6 -10 0 IAH DFW
## 2 2011 1 2 7 -9 1 IAH DFW
## 3 2011 1 3 1 -8 -8 IAH DFW
## 4 2011 1 4 2 3 3 IAH DFW
## 5 2011 1 5 3 -3 5 IAH DFW
## 6 2011 1 6 4 -7 -1 IAH DFW
## 7 2011 1 7 5 -1 -1 IAH DFW
## 8 2011 1 8 6 -16 -5 IAH DFW
## 9 2011 1 9 7 44 43 IAH DFW
## 10 2011 1 10 1 43 43 IAH DFW
## .. ... ... ... ... ... ... ... ...
## Variables not shown: Distance (int), TaxiIn (int), TaxiOut (int),
## Cancelled (int), CancellationCode (chr), Diverted (int)
dplyr provides 6 helper functions, each of which only works when used inside select().
- starts_with(“X”): every name that starts with “X”,
- ends_with(“X”): every name that ends with “X”,
- contains(“X”): every name that contains “X”,
- matches(“X”): every name that matches “X”, which can be a regular expression,
- num_range(“x”, 1:5): the variables named x01, x02, x03, x04 and x05,
- one_of(x): every name that appears in x, which should be a character vector.
Use select and a helper function to return a tbl copy of hflights that contains just ArrDelay and DepDelay.
select(hflights,contains("ArrDelay"), contains("DepDelay"))
## Source: local data frame [227,496 x 2]
##
## ArrDelay DepDelay
## (int) (int)
## 1 -10 0
## 2 -9 1
## 3 -8 -8
## 4 3 3
## 5 -3 5
## 6 -7 -1
## 7 -1 -1
## 8 -16 -5
## 9 44 43
## 10 43 43
## .. ... ...
Use a combination of helper functions and variable names to return the UniqueCarrier, FlightNum, TailNum, Cancelled, and CancellationCode columns of hflights.
select(hflights,matches("UniqueCarrier"),ends_with("Num"), contains("Cancell"))
## Source: local data frame [227,496 x 5]
##
## UniqueCarrier FlightNum TailNum Cancelled CancellationCode
## (chr) (int) (chr) (int) (chr)
## 1 AA 428 N576AA 0
## 2 AA 428 N557AA 0
## 3 AA 428 N541AA 0
## 4 AA 428 N403AA 0
## 5 AA 428 N492AA 0
## 6 AA 428 N262AA 0
## 7 AA 428 N493AA 0
## 8 AA 428 N477AA 0
## 9 AA 428 N476AA 0
## 10 AA 428 N504AA 0
## .. ... ... ... ... ...
Find the most concise way to return the following columns with select and its helper functions: DepTime, ArrTime, ActualElapsedTime, AirTime, ArrDelay, DepDelay. Use only helper functions!
select(hflights,starts_with("A"),starts_with("De"),-Dest)
## Source: local data frame [227,496 x 6]
##
## ArrTime ActualElapsedTime AirTime ArrDelay DepTime DepDelay
## (int) (int) (int) (int) (int) (int)
## 1 1500 60 40 -10 1400 0
## 2 1501 60 45 -9 1401 1
## 3 1502 70 48 -8 1352 -8
## 4 1513 70 39 3 1403 3
## 5 1507 62 44 -3 1405 5
## 6 1503 64 45 -7 1359 -1
## 7 1509 70 43 -1 1359 -1
## 8 1454 59 40 -16 1355 -5
## 9 1554 71 41 44 1443 43
## 10 1553 70 45 43 1443 43
## .. ... ... ... ... ... ...
basic R vs dplyr
# Exercise 1
ex1r <- hflights[c("TaxiIn","TaxiOut","Distance")]
ex1d <- select(hflights,TaxiIn,TaxiOut,Distance)
# Exercise 2
ex2r <- hflights[c("Year","Month","DayOfWeek","DepTime","ArrTime")]
ex2d <- select(hflights,Year:ArrTime, -DayofMonth)
# Exercise 3
ex3r <- hflights[c("TailNum","TaxiIn","TaxiOut")]
ex3d <- select(hflights, starts_with("Ta"))
Mutating is creating
# Add the new variable ActualGroundTime to a copy of hflights and save the result as g1.
g1 <- mutate(hflights, ActualGroundTime = ActualElapsedTime - AirTime)
# Add the new variable GroundTime to a g1. Save the result as g2.
g2 <- mutate(g1, GroundTime = TaxiIn + TaxiOut)
# Add the new variable AverageSpeed to g2. Save the result as g3.
g3 <- mutate(g2, AverageSpeed = Distance / AirTime * 60)
# Print out g3
g3
## Source: local data frame [227,496 x 24]
##
## Year Month DayofMonth DayOfWeek DepTime ArrTime UniqueCarrier
## (int) (int) (int) (int) (int) (int) (chr)
## 1 2011 1 1 6 1400 1500 AA
## 2 2011 1 2 7 1401 1501 AA
## 3 2011 1 3 1 1352 1502 AA
## 4 2011 1 4 2 1403 1513 AA
## 5 2011 1 5 3 1405 1507 AA
## 6 2011 1 6 4 1359 1503 AA
## 7 2011 1 7 5 1359 1509 AA
## 8 2011 1 8 6 1355 1454 AA
## 9 2011 1 9 7 1443 1554 AA
## 10 2011 1 10 1 1443 1553 AA
## .. ... ... ... ... ... ... ...
## Variables not shown: FlightNum (int), TailNum (chr), ActualElapsedTime
## (int), AirTime (int), ArrDelay (int), DepDelay (int), Origin (chr), Dest
## (chr), Distance (int), TaxiIn (int), TaxiOut (int), Cancelled (int),
## CancellationCode (chr), Diverted (int), ActualGroundTime (int),
## GroundTime (int), AverageSpeed (dbl)