Exploratory data analysis with dplyr

Note: to display the results, you need to remove the “message=FALSE” option from the chunks below.

1 - Load Libraries

library(dplyr)
library(hflights)

2 - View dataset

# Load the hflights dataset into the workspace and preview its first five rows
data(list = "hflights")
head(hflights, n = 5)

3 - Convert to local Dataframe

We use as_tibble() (the successor to tbl_df()), which wraps the data frame so that it displays more conveniently and prints information about the variables that are not shown.

# Wrap the data frame in a tibble for compact printing, then preview four rows
flights <- as_tibble(hflights)
head(flights, n = 4)

4 - Inspect the data

specify the number of rows you want to see

# Print the tibble, limiting the display to its first five rows
print(flights, n = 5)

# head() returns the first five rows as a new tibble
head(flights, n = 5)

5 - Filtering

# Base R approach: logical row index (Month is 1 and DayOfWeek is 7),
# keeping the first four columns
flights[with(flights, Month == 1 & DayOfWeek == 7), 1:4]

# dplyr approach: conditions joined with `&` (equivalent to separating them
# with commas); slice() restricts the input to the first five rows
filter(slice(flights, 1:5), Month == 1 & DayofMonth == 2)

# The `|` operator expresses an OR condition
filter(slice(flights, 1:5), ActualElapsedTime == 70 | ActualElapsedTime == 60)

# The %in% operator tests membership in a set of values
filter(flights, TailNum %in% "N576AA", Dest %in% "DFW")

6 - Selecting

# Base R approach: index rows by position and columns by name
flights[1:5, c("FlightNum", "AirTime", "TaxiOut")]

# dplyr approach: list the bare column names
select(flights, FlightNum, AirTime, TaxiOut)
# A range of adjacent columns (AirTime:Dest) combined with name-based helpers
select(flights, AirTime:Dest, contains("taxi"), contains("cancel"))
select(flights, contains("taxi"), contains("cancel"))

# Other helpers that match columns by name: starts_with(), ends_with(), matches()

7 - Slicing

# Keep rows 1 to 5, selected by position
slice(flights, seq_len(5))

8 - Chaining and pipelining

# Nested form: keep three columns, then the rows where TaxiIn is greater than 10
filter(select(flights, UniqueCarrier, TailNum, TaxiIn), TaxiIn > 10)

# Alternative way: chaining. The same select-then-filter steps, read top to
# bottom; the condition matches the nested call so both return the same rows.
flights %>%
  select(UniqueCarrier, TailNum, TaxiIn) %>%
  filter(TaxiIn > 10)

Chaining improves readability and can be used to replace nested function calls in R.

# Create two vectors and compute the Euclidean distance between them
x <- 1:5
y <- 2:6
sqrt(sum((x - y)^2))

# The same computation written as a chain. `^` binds tighter than `%>%`, so
# the squared differences are formed first and then piped into sum() and sqrt();
# the outer parentheses only make that grouping explicit.
((x - y)^2) %>%
  sum() %>%
  sqrt()

9 - Arrange: reordering rows

# Base R approach: order the rows by DepDelay and keep the UniqueCarrier and
# DepDelay columns
flights[order(flights[["DepDelay"]]), c("UniqueCarrier", "DepDelay")]

# dplyr approach: arrange() sorts in ascending order by default
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(DepDelay)

# Descending order: wrap the sort column in desc()
flights %>%
  select(UniqueCarrier, DepDelay) %>%
  arrange(desc(DepDelay))

10 - Mutate (adding new variables)

# Create new variables that are functions of existing variables

# Base R approach: add a Speed column computed as Distance / AirTime * 60
flights[["Speed"]] <- flights[["Distance"]] / flights[["AirTime"]] * 60
flights[, c("Distance", "AirTime", "Speed")]


# dplyr approach: the new column is printed but not stored, because the
# result is not assigned
flights %>%
  select(Distance, AirTime) %>%
  mutate(speed_2 = Distance / AirTime * 60)

# Store the new column by assigning the result back to flights
flights <- mutate(flights, speed_2 = Distance / AirTime * 60)
head(flights, n = 5)

# mutate() can be used without select()

11 - Summarise and group_by

# Average arrival delay for each destination

# Base R approach: aggregate() with a formula, and tapply() with na.rm = TRUE
head(aggregate(ArrDelay ~ Dest, data = flights, FUN = mean))
head(tapply(flights$ArrDelay, flights$Dest, mean, na.rm = TRUE))

# dplyr approach: group the rows by destination, then collapse each group to
# the mean of its ArrDelay values
flights %>%
  group_by(Dest) %>%
  summarise(avg_delay = mean(ArrDelay, na.rm = TRUE))

12 - Summarise_each

# Apply the same summary function to several columns at once.
# across() is used here in place of the deprecated summarise_each()/funs().

# For each UniqueCarrier (group_by = "for each"), the share of flights that
# were cancelled or diverted, taken as the mean of each column
# (assumes Cancelled and Diverted are 0/1 indicators)
flights %>%
  group_by(UniqueCarrier) %>%
  summarise(across(c(Cancelled, Diverted), mean))

# For each carrier, the minimum and maximum arrival and departure delays.
# Both lambdas receive the column as `.x`; output columns are named
# <column>_min and <column>_max.
flights %>%
  group_by(UniqueCarrier) %>%
  summarise(across(
    matches("Delay"),
    list(
      min = ~ min(.x, na.rm = TRUE),
      max = ~ max(.x, na.rm = TRUE)
    )
  ))

13 - Helper functions

# n() counts the number of rows in a group
# n_distinct(vector) counts the number of unique items in that vector

# 1 - For each day of the year, count the flights and sort the counts in
#     descending order
flights %>%
  group_by(Month, DayofMonth) %>%
  summarise(flight_count = n()) %>%
  arrange(desc(flight_count))

# 2 - The shorter tally() does the counting and sorting in one step; the
#     count column is named n (note: the output shows 365 days in a year)
flights %>%
  group_by(Month, DayofMonth) %>%
  tally(sort = TRUE)



# 3 - For each destination, the total number of flights and the number of
#     distinct planes (TailNum values) that flew there
flights %>%
  group_by(Dest) %>%
  summarise(flight_count = n(), plane_count = n_distinct(TailNum))

14 - More on Grouping

# For each destination, show the number of cancelled and not-cancelled flights.
# Dest is selected explicitly rather than relying on select() re-adding the
# grouping column; table() then cross-tabulates the two columns and head()
# keeps the first rows of that table.
flights %>%
  group_by(Dest) %>%
  select(Dest, Cancelled) %>%
  table() %>%
  head()

15 - Window functions

# For each carrier, find the two days of the year with the longest departure
# delays.
# Note: min_rank() gives rank 1 to the smallest value, so DepDelay is wrapped
# in desc() to rank the largest delay first.
# The grouping column is selected explicitly rather than relying on select()
# re-adding it.
flights %>%
  group_by(UniqueCarrier) %>%
  select(UniqueCarrier, Month, DayofMonth, DepDelay) %>%
  filter(min_rank(desc(DepDelay)) <= 2) %>%
  arrange(UniqueCarrier, desc(DepDelay))

# The same result with top_n(); the ordering column is named explicitly so it
# does not depend on DepDelay happening to be the last column.
flights %>%
  group_by(UniqueCarrier) %>%
  select(UniqueCarrier, Month, DayofMonth, DepDelay) %>%
  top_n(2, DepDelay) %>%
  arrange(UniqueCarrier, desc(DepDelay))


# For each month, the total number of flights and the change versus the
# previous month; lag() returns the value from the preceding row
flights %>%
  group_by(Month) %>%
  summarise(flight_count = n()) %>%
  mutate(change = flight_count - lag(flight_count))

# Shorter version with tally(), whose count column is named n
flights %>%
  group_by(Month) %>%
  tally() %>%
  mutate(change = n - lag(n))

16 - More useful functions

# Random sample of a fixed number of rows, without replacement
sample_n(flights, 5)

# Random sample of a fraction of the rows, this time with replacement
sample_frac(flights, 0.25, replace = TRUE)

# Look at the structure of an object (base R approach)
str(flights)

# Look at the structure of an object (dplyr approach)
glimpse(flights)