25 September 2015

Why data.table?

How it works

setDT(df)

  • set data.frame to data.table

df[i, j, by] :

  • get rows

  • get or process variables

  • group

…Easy, huh?

Get data

336K flights departing from NYC in 2013

library(data.table)
data(flights, package = "nycflights13")
df = setDT(flights)  # set to data.table
dim(df)
## [1] 336776     16
class(df)
## [1] "data.table" "data.frame"

Select columns

df[, .(tailnum, flight, origin)]
##         tailnum flight origin
##      1:  N14228   1545    EWR
##     ---                      
## 336776:  N839MQ   3531    LGA

Select rows

get flights to Los Angeles

# simple select
df[dest == "LAX"]

# setting a key also automatically sorts the data.frame
setkey(df, dest)
df["LAX",]
##        tailnum flight dest origin
##     1:  N29129    194  LAX    JFK
##    ---                           
## 16174:  N335AA    185  LAX    JFK

Group

average airtime to LAX by carrier

df[dest == "LAX",
   .(avg.airtime = mean(air_time, na.rm = T)),
   keyby = carrier]
##    carrier avg.airtime
## 1:      AA    327.8175
## 2:      B6    328.2816
## 3:      DL    329.8717
## 4:      UA    325.8660
## 5:      VX    331.8281

Add or remove variable

Add field:

df[, speed := (distance * 1.852) / (air_time / 60)]

Remove field:

df[, flight := NULL]

That's it - you're good to go

Update by reference

df[is.na(speed),]
##       carrier tailnum speed
##    1:      B6  N373JB    NA
##   ---                      
## 9430:      EV  N14179    NA
df[is.na(speed), speed := 0]

Add and remove more new fields

library(lubridate) #easy handling of dates

df[, `:=` (
  date = ymd(paste(year, month, day, sep = "-")),
  airtime.hrs = air_time / 60,
  dep.delay.hrs = dep_delay / 60,
  arr.delay.hrs = arr_delay / 60
)]
##               date airtime.hrs dep.delay.hrs arr.delay.hrs
##      1: 2013-10-01    3.833333    -0.1000000    -0.5833333
##     ---                                                   
## 336776: 2013-09-30    2.433333    -0.1333333    -0.5666667

Steps & Chains

Who went supersonic?

df[, speed := {
  dist.km = distance * 1.852
  airtime.hr = air_time / 60
  speed = dist.km / airtime.hr
}][speed > 1234, .(carrier, air_time, distance, speed)]
##    carrier air_time distance    speed
## 1:      DL       65      762 1302.668

Joins

Which airplane did they use?

data(planes, package = "nycflights13")
setDT(planes)

setkey(planes, tailnum)
setkey(df, tailnum)
dx = df[planes]

dx[speed > 1234, .(carrier, speed, manufacturer, model, engine)]
##    carrier    speed manufacturer   model    engine
## 1:      DL 1302.668       BOEING 757-232 Turbo-fan

Queries

Groups

dx[, .GRP, by = .(carrier, model)] 
##    carrier     model GRP
## 1:      EV EMB-145XR   1
## 2:      US  A320-214   2
## 3:      EV EMB-145LR   3
dx[, tail(.SD, 1), keyby = .(carrier, model)]
##    carrier       model dest distance
## 1:      9E CL-600-2B19  TYS      647
## 2:      9E CL-600-2D24  SRQ     1047
## 3:      AA         150  STL      888

Even more queries

dx[dest %like% "LA"]  # finds LAS and LAX
##    dest distance   model carrier
## 1:  LAS     2227 737-824      UA
## 2:  LAS     2227 737-824      UA
## 3:  LAS     2227 737-824      UA
dx[distance %between% c(1000, 1200)] 
##    dest distance     model carrier
## 1:  DSM     1017 EMB-145XR      EV
## 2:  DSM     1017 EMB-145XR      EV
## 3:  DSM     1017 EMB-145XR      EV

Visualize

Visualize large datasets

Strategies:

  • Use data.table to condense dataset, then standard plot

  • Use library hexbin [and ggplot2] to bin values

  • Use library bigvis to bin, summarize and smooth

Plot counts to destination

step 1: count, merge with airport data

library(ggmap)

data(airports, package = "nycflights13")
airports = setDT(airports)

dest.count = df[, .N, by = dest]

airport.count = merge(airports, dest.count, 
                      by.x = "faa", by.y = "dest")

Plot counts to destination

step 2: get map and plot counts on map

usa = get_map("USA", zoom = 4, 
              maptype = "roadmap", 
              color = "bw")

p = ggmap(usa) + 
    geom_point(aes(lon, lat, size = N), 
               color = "royalblue", 
               data = airport.count)

Airport popularity by destination

The problem

Solution 1: Use hexbin

library(hexbin)

bin = with(df, hexbin(distance, speed))
plot(bin, style = "lattice", legend = F)

Solution 2: Use hexbin via ggplot2

library(hexbin)

p1 = ggplot(df, aes(distance, speed)) +
  stat_binhex(colour = "white", bins = 60) +
  scale_fill_gradientn(colours = c("lightgrey","red"), 
                       name = "Frequency") +
  ggtitle("Distance-Speed density")

Solution 3: Use bigvis

library(bigvis) # not on CRAN, download from GitHub

# Relation between distance and speed?
dist.mean = condense(bin(df[, distance]), 
                     z = df[, speed], summary = "mean")

Questions?

check out documentation