25 September 2015
setDT(df)
df[i, j, by] :
get rows
get or process variables
group
…Easy, huh?
library(data.table) data(flights, package = "nycflights13") df = setDT(flights) # set to data.table
dim(df)
## [1] 336776 16
class(df)
## [1] "data.table" "data.frame"
df[, .(tailnum, flight, origin)]
## tailnum flight origin ## 1: N14228 1545 EWR ## --- ## 336776: N839MQ 3531 LGA
# simple select df[dest == "LAX"] # setting a key also automatically sorts the data.frame setkey(df, dest) df["LAX",]
## tailnum flight dest origin ## 1: N29129 194 LAX JFK ## --- ## 16174: N335AA 185 LAX JFK
df[dest == "LAX", .(avg.airtime = mean(air_time, na.rm = T)), keyby = carrier]
## carrier avg.airtime ## 1: AA 327.8175 ## 2: B6 328.2816 ## 3: DL 329.8717 ## 4: UA 325.8660 ## 5: VX 331.8281
Add field:
df[, speed := (distance * 1.852) / (air_time / 60)]
Remove field:
df[, flight := NULL]
df[is.na(speed),]
## carrier tailnum speed ## 1: B6 N373JB NA ## --- ## 9430: EV N14179 NA
df[is.na(speed), speed := 0]
library(lubridate) #easy handling of dates df[, `:=` ( date = ymd(paste(year, month, day, sep = "-")), airtime.hrs = air_time / 60, dep.delay.hrs = dep_delay / 60, arr.delay.hrs = arr_delay / 60 )]
## date airtime.hrs dep.delay.hrs arr.delay.hrs ## 1: 2013-10-01 3.833333 -0.1000000 -0.5833333 ## --- ## 336776: 2013-09-30 2.433333 -0.1333333 -0.5666667
df[, speed := {
dist.km = distance * 1.852
airtime.hr = air_time / 60
speed = dist.km / airtime.hr
}][speed > 1234, .(carrier, air_time, distance, speed)]
## carrier air_time distance speed ## 1: DL 65 762 1302.668
data(planes, package = "nycflights13") setDT(planes) setkey(planes, tailnum) setkey(df, tailnum) dx = df[planes] dx[speed > 1234, .(carrier, speed, manufacturer, model, engine)]
## carrier speed manufacturer model engine ## 1: DL 1302.668 BOEING 757-232 Turbo-fan
dx[, .GRP, by = .(carrier, model)]
## carrier model GRP ## 1: EV EMB-145XR 1 ## 2: US A320-214 2 ## 3: EV EMB-145LR 3
dx[, tail(.SD, 1), keyby = .(carrier, model)]
## carrier model dest distance ## 1: 9E CL-600-2B19 TYS 647 ## 2: 9E CL-600-2D24 SRQ 1047 ## 3: AA 150 STL 888
dx[dest %like% "LA"] # finds LAS and LAX
## dest distance model carrier ## 1: LAS 2227 737-824 UA ## 2: LAS 2227 737-824 UA ## 3: LAS 2227 737-824 UA
dx[distance %between% c(1000, 1200)]
## dest distance model carrier ## 1: DSM 1017 EMB-145XR EV ## 2: DSM 1017 EMB-145XR EV ## 3: DSM 1017 EMB-145XR EV
Strategies:
Use data.table to condense dataset, then standard plot
Use library hexbin [and ggplot2] to bin values
Use library bigvis to bin, summarize and smooth
library(ggmap)
data(airports, package = "nycflights13")
airports = setDT(airports)
dest.count = df[, .N, by = dest]
airport.count = merge(airports, dest.count,
by.x = "faa", by.y = "dest")
usa = get_map("USA", zoom = 4,
maptype = "roadmap",
color = "bw")
p = ggmap(usa) +
geom_point(aes(lon, lat, size = N),
color = "royalblue",
data = airport.count)
library(hexbin) bin = with(df, hexbin(distance, speed)) plot(bin, style = "lattice", legend = F)
library(hexbin)
p1 = ggplot(df, aes(distance, speed)) +
stat_binhex(colour = "white", bins = 60) +
scale_fill_gradientn(colours = c("lightgrey","red"),
name = "Frequency") +
ggtitle("Distance-Speed density")
library(bigvis) # not on CRAN, download from GitHub
# Relation between distance and speed?
dist.mean = condense(bin(df[, distance]),
z = df[, speed], summary = "mean")