Data ergonomics with R/data.table

25 September 2015

Why data.table?

How it works

setDT(df)

set data.frame to data.table

df[i, j, by] :

get rows
get or process variables
group

…Easy, huh?

Get data

336K flights departing from NYC in 2013

library(data.table)
data(flights, package = "nycflights13")
df = setDT(flights)  # set to data.table

dim(df)

## [1] 336776     16

class(df)

## [1] "data.table" "data.frame"

Select columns

df[, .(tailnum, flight, origin)]

##         tailnum flight origin
##      1:  N14228   1545    EWR
##     ---                      
## 336776:  N839MQ   3531    LGA

Select rows

get flights to Los Angeles

# simple select
df[dest == "LAX"]

# setting a key also automatically sorts the data.frame
setkey(df, dest)
df["LAX",]

##        tailnum flight dest origin
##     1:  N29129    194  LAX    JFK
##    ---                           
## 16174:  N335AA    185  LAX    JFK

Group

average airtime to LAX by carrier

df[dest == "LAX",
   .(avg.airtime = mean(air_time, na.rm = T)),
   keyby = carrier]

##    carrier avg.airtime
## 1:      AA    327.8175
## 2:      B6    328.2816
## 3:      DL    329.8717
## 4:      UA    325.8660
## 5:      VX    331.8281

Add or remove variable

Add field:

df[, speed := (distance * 1.852) / (air_time / 60)]

Remove field:

df[, flight := NULL]

That's it - you're good to go

Update by reference

df[is.na(speed),]

##       carrier tailnum speed
##    1:      B6  N373JB    NA
##   ---                      
## 9430:      EV  N14179    NA

df[is.na(speed), speed := 0]

Add and remove more new fields

library(lubridate) #easy handling of dates

df[, `:=` (
  date = ymd(paste(year, month, day, sep = "-")),
  airtime.hrs = air_time / 60,
  dep.delay.hrs = dep_delay / 60,
  arr.delay.hrs = arr_delay / 60
)]

##               date airtime.hrs dep.delay.hrs arr.delay.hrs
##      1: 2013-10-01    3.833333    -0.1000000    -0.5833333
##     ---                                                   
## 336776: 2013-09-30    2.433333    -0.1333333    -0.5666667

Steps & Chains

Who went supersonic?

df[, speed := {
  dist.km = distance * 1.852
  airtime.hr = air_time / 60
  speed = dist.km / airtime.hr
}][speed > 1234, .(carrier, air_time, distance, speed)]

##    carrier air_time distance    speed
## 1:      DL       65      762 1302.668

Joins

Which airplane did they use?

data(planes, package = "nycflights13")
setDT(planes)

setkey(planes, tailnum)
setkey(df, tailnum)
dx = df[planes]

dx[speed > 1234, .(carrier, speed, manufacturer, model, engine)]

##    carrier    speed manufacturer   model    engine
## 1:      DL 1302.668       BOEING 757-232 Turbo-fan

Queries

Groups

dx[, .GRP, by = .(carrier, model)]

##    carrier     model GRP
## 1:      EV EMB-145XR   1
## 2:      US  A320-214   2
## 3:      EV EMB-145LR   3

dx[, tail(.SD, 1), keyby = .(carrier, model)]

##    carrier       model dest distance
## 1:      9E CL-600-2B19  TYS      647
## 2:      9E CL-600-2D24  SRQ     1047
## 3:      AA         150  STL      888

Even more queries

dx[dest %like% "LA"]  # finds LAS and LAX

##    dest distance   model carrier
## 1:  LAS     2227 737-824      UA
## 2:  LAS     2227 737-824      UA
## 3:  LAS     2227 737-824      UA

dx[distance %between% c(1000, 1200)]

##    dest distance     model carrier
## 1:  DSM     1017 EMB-145XR      EV
## 2:  DSM     1017 EMB-145XR      EV
## 3:  DSM     1017 EMB-145XR      EV

Visualize

Visualize large datasets

Strategies:

Use data.table to condense dataset, then standard plot
Use library hexbin [and ggplot2] to bin values
Use library bigvis to bin, summarize and smooth

Plot counts to destination

step 1: count, merge with airport data

library(ggmap)

data(airports, package = "nycflights13")
airports = setDT(airports)

dest.count = df[, .N, by = dest]

airport.count = merge(airports, dest.count, 
                      by.x = "faa", by.y = "dest")

Plot counts to destination

step 2: get map and plot counts on map

usa = get_map("USA", zoom = 4, 
              maptype = "roadmap", 
              color = "bw")

p = ggmap(usa) + 
    geom_point(aes(lon, lat, size = N), 
               color = "royalblue", 
               data = airport.count)

Airport popularity by destination

The problem

Solution 1: Use hexbin

library(hexbin)

bin = with(df, hexbin(distance, speed))
plot(bin, style = "lattice", legend = F)

Solution 2: Use hexbin via ggplot2

library(hexbin)

p1 = ggplot(df, aes(distance, speed)) +
  stat_binhex(colour = "white", bins = 60) +
  scale_fill_gradientn(colours = c("lightgrey","red"), 
                       name = "Frequency") +
  ggtitle("Distance-Speed density")

Solution 3: Use bigvis

library(bigvis) # not on CRAN, download from GitHub

# Relation between distance and speed?
dist.mean = condense(bin(df[, distance]), 
                     z = df[, speed], summary = "mean")

Why data.table?

How it works

Get data

336K flights departing from NYC in 2013

Select columns

Select rows

get flights to Los Angeles

Group

average airtime to LAX by carrier

Add or remove variable

That's it - you're good to go

Update by reference

Add and remove more new fields

Steps & Chains

Who went supersonic?

Joins

Which airplane did they use?

Queries

Groups

Even more queries

Visualize

Visualize large datasets

Plot counts to destination

step 1: count, merge with airport data

Plot counts to destination

step 2: get map and plot counts on map

Airport popularity by destination

The problem

Solution 1: Use hexbin

Solution 2: Use hexbin via ggplot2

Solution 3: Use bigvis

Questions?

check out documentation