October 16, 2015

Fast analysis of large datasets

Get data from 7 million USA flights

library(data.table)

df = fread("2008.csv") 
## 
Read 0.0% of 7009728 rows
Read 10.3% of 7009728 rows
Read 26.0% of 7009728 rows
Read 41.8% of 7009728 rows
Read 57.5% of 7009728 rows
Read 73.3% of 7009728 rows
Read 89.2% of 7009728 rows
Read 7009728 rows and 29 (of 29) columns from 0.642 GB file in 00:00:10

How it works

# df[rows, columns, by]

df[Origin == "JFK",]  # select flights from JFK

df[, .(Origin, Distance)] # select columns

df[, .N, by = UniqueCarrier] # count flights per carrier

Get date field and day of week

df[, date := {
  date = paste(Year, Month, DayofMonth, sep = "-")
  date = ymd(date)
}]

# get weekday and day of the year
df[, `:=` (
  week.day = wday(date, label = T, abbr = T),
  year.day = yday(date)
)]

Calculate airplane speeds

df[, `:=` (
  distance = Distance * 1.852,
  hr = AirTime / 60
)]

# calculate speed and condense dataframe
dfx = df[, speed := distance / hr]
dfx = dfx[ , .(date, distance, hr, speed, TailNum, 
               UniqueCarrier, Origin, Dest)]

dfx[is.na(speed), speed := 0]
dfx[is.infinite(speed), speed := 0]

Visualizing data

Supersonic flights!

Corrected version: subsonic only

La Guardia airport, flights per day

Flights from JFK

Destinations, JFK airport

data(airports, package = "nycflights13")
setDT(airports)

dest.counts = df[Origin == "JFK", .N, by = Dest]

dest.counts = merge(dest.counts, 
                    airports, 
                    by.x = "Dest", by.y = "faa", 
                    all.x = T)

Destinations from JFK airport

Destinations from JFK airport II