# Note: to display the results, remove the "message=FALSE" option from the chunks below.
library(dplyr)
library(hflights)
# Load the hflights data set and preview its first five rows
data(hflights)
head(hflights, n = 5)
# We convert the data with as_tibble() (formerly tbl_df()), a wrapper that displays data more conveniently and prints information on the variables that do not fit on screen
# Convert the raw data frame to a tibble and preview the first four rows
flights <- hflights %>% as_tibble()
head(flights, n = 4)
# Specify the number of rows you want to see:
# Two ways to show only the first five rows: the `n` argument of print(),
# or head()
print(x = flights, n = 5)
head(x = flights, n = 5)
# Base R: rows where Month is 1 and DayOfWeek is 7, first four columns only
flights[with(flights, Month == 1 & DayOfWeek == 7), 1:4]
# dplyr: among the first five rows, keep those with Month 1 and DayofMonth 2
# (conditions separated by commas are combined with AND)
flights %>%
  slice(1:5) %>%
  filter(Month == 1, DayofMonth == 2)
# Matching against a set of values with %in%: among the first five rows,
# keep those whose ActualElapsedTime is 60 or 70
flights %>%
  slice(1:5) %>%
  filter(ActualElapsedTime %in% c(60, 70))
# Rows for tail number N576AA with destination DFW
flights %>%
  filter(TailNum %in% "N576AA", Dest %in% "DFW")
# Base R: first five rows of three named columns
flights[1:5, c("FlightNum", "AirTime", "TaxiOut")]
# dplyr: select() picks columns by name
flights %>% select(FlightNum, AirTime, TaxiOut)
# A range of columns (AirTime through Dest) plus every column whose name
# contains "taxi" or "cancel"
flights %>% select(AirTime:Dest, contains("taxi"), contains("cancel"))
flights %>% select(contains("taxi"), contains("cancel"))
# starts_with(), ends_with() and matches() also select columns by name
# slice() picks rows by position
flights %>% slice(1:5)
# Nested form: keep three columns, then retain the rows whose TaxiIn
# exceeds 10. The calls have to be read from the inside out.
filter(select(flights, UniqueCarrier, TailNum, TaxiIn), TaxiIn > 10)
# Chained form of the same query: each step receives the result of the
# previous one, so the operations read top to bottom. The filter condition
# is the same as in the nested call (TaxiIn > 10) so that both forms
# return identical rows.
flights %>%
  select(UniqueCarrier, TailNum, TaxiIn) %>%
  filter(TaxiIn > 10)
# Chaining increases readability and can be used to replace nested calls in R commands
# Two integer vectors and the Euclidean distance between them
x <- 1:5
y <- 2:6
# nested form: square the differences, sum them, take the square root
sqrt(sum((x - y)^2))
# the same computation written as a chain, one step per line
((x - y)^2) %>%
  sum() %>%
  sqrt()
# Base R: keep UniqueCarrier and DepDelay, with rows ordered by DepDelay
flights[order(flights[["DepDelay"]]), c("UniqueCarrier", "DepDelay")]
# dplyr: arrange() sorts in ascending order by default
flights %>%
  arrange(DepDelay) %>%
  select(UniqueCarrier, DepDelay)
# wrap the column in desc() to sort in descending order
flights %>%
  arrange(desc(DepDelay)) %>%
  select(UniqueCarrier, DepDelay)
# Create new variables that are functions of existing variables.
# Base R: add a Speed column (Distance / AirTime * 60) and show it next to
# its inputs
flights[["Speed"]] <- with(flights, Distance / AirTime * 60)
flights[, c("Distance", "AirTime", "Speed")]
# dplyr: mutate() computes the new column; here the result is only printed,
# `flights` itself is left unchanged
flights %>%
  select(Distance, AirTime) %>%
  mutate(speed_2 = Distance / AirTime * 60)
# Assign the result back to keep the new column
flights <- mutate(flights, speed_2 = Distance / AirTime * 60)
head(flights, n = 5)
# Note: mutate() does not need a preceding select()
# Average arrival delay for each destination.
# Base R, formula interface of aggregate()
head(aggregate(ArrDelay ~ Dest, data = flights, FUN = mean))
# Base R, tapply() over the Dest groups, dropping missing values
head(tapply(flights$ArrDelay, flights$Dest, mean, na.rm = TRUE))
# dplyr: group the rows by destination, then collapse each group to the
# mean of its ArrDelay values
flights %>%
  group_by(Dest) %>%
  summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))
# Apply the same summary function to several columns at once with across(),
# the current replacement for the deprecated summarise_each() / funs() pair.
# For each UniqueCarrier, the mean of Cancelled and of Diverted, i.e. the
# proportion of flights cancelled or diverted (assuming both columns are
# 0/1 indicators).
flights %>%
  group_by(UniqueCarrier) %>%
  summarise(across(c(Cancelled, Diverted), mean))
# For each carrier, the minimum and maximum of every column whose name
# matches "Delay" (the arrival and departure delays). Both functions take
# the column explicitly and drop missing values; the result columns are
# named <column>_min and <column>_max.
flights %>%
  group_by(UniqueCarrier) %>%
  summarise(
    across(
      matches("Delay"),
      list(
        min = function(x) min(x, na.rm = TRUE),
        max = function(x) max(x, na.rm = TRUE)
      )
    )
  )
# n() returns the number of rows in the current group;
# n_distinct(vector) returns the number of unique values in that vector.
# 1 - number of flights for each (Month, DayofMonth) pair, busiest days first
flights %>%
  group_by(Month, DayofMonth) %>%
  summarise(flight_count = n()) %>%
  arrange(desc(flight_count))
# 2 - the same count with the shorter tally() helper, which can sort for us
flights %>%
  group_by(Month, DayofMonth) %>%
  tally(sort = TRUE)
# 3 - for each destination, the total number of flights and the number of
#     distinct planes (TailNum values) that flew there
flights %>%
  group_by(Dest) %>%
  summarise(
    flight_count = n(),
    plane_count = n_distinct(TailNum)
  )
# Cross-tabulate destination against the Cancelled flag and show the first
# rows of that table. select() on grouped data keeps the grouping column
# (Dest), so table() receives both Dest and Cancelled.
flights %>%
  group_by(Dest) %>%
  select(Cancelled) %>%
  table() %>%
  head()
# For each carrier, find the two days of the year with the longest
# departure delay.
# min_rank() gives rank 1 to the smallest value, so the column is wrapped in
# desc() to rank the largest delay first.
flights %>%
  group_by(UniqueCarrier) %>%
  select(Month, DayofMonth, DepDelay) %>%
  filter(min_rank(desc(DepDelay)) <= 2) %>%
  arrange(UniqueCarrier, desc(DepDelay))
# The same query with top_n(): with no weighting column given, it ranks by
# the last column of the table, here DepDelay
flights %>%
  group_by(UniqueCarrier) %>%
  select(Month, DayofMonth, DepDelay) %>%
  top_n(n = 2) %>%
  arrange(UniqueCarrier, desc(DepDelay))
# Number of flights per month and the change from the previous month.
# lag() shifts a column down by one position, so each row sees the value of
# the row before it (the first row gets NA).
flights %>%
  group_by(Month) %>%
  summarise(flight_count = n()) %>%
  mutate(change = flight_count - dplyr::lag(flight_count))
# Shorter version: tally() does the counting and names the count column `n`
flights %>%
  group_by(Month) %>%
  tally() %>%
  mutate(change = n - dplyr::lag(n))
# Draw a fixed number of rows at random, without replacement
sample_n(flights, size = 5)
# Draw a fraction of the rows (25%) at random, this time with replacement
sample_frac(flights, size = 0.25, replace = TRUE)
# Inspect the structure of the table: base R's str() ...
utils::str(flights)
# ... and the dplyr counterpart, glimpse()
dplyr::glimpse(flights)