Jesse Yang
2017-03-19
dplyr transforms regular data wrangling verbs into SQL queriesSELECT statements)collect()dplyr verbs your already familiar with for database accessThe nycflights13 datasets
library(nycflights13)
flights %>% head(5)
# A tibble: 5 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
# ... with 12 more variables: sched_arr_time <int>, arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
library(DBI)
library(RSQLite)
conn <- dbConnect(SQLite(), ":memory:")
conn <- dbConnect(SQLite(), "nycflights13.sqlite")
conn
<SQLiteConnection>
Path: nycflights13.sqlite
Extensions: TRUE
db <- src_memdb()
db
src: sqlite 3.11.1 [:memory:]
tbls:
db <- src_sqlite("nycflights13.sqlite", create = TRUE)
db
src: sqlite 3.11.1 [/Users/jesse/Dropbox (Personal)/DA5020/nycflights13.sqlite]
tbls:
Use copy_to to copy a local data frame to a remote source.
flights_sqlite <-
copy_to(
db, nycflights13::flights,
name = "flights",
temporary = FALSE,
indexes = list(
c("year", "month", "day"),
"carrier",
"tailnum"
))
Underlying function is actually copy_to.src_sql(...) (search help document for more arguments).
flights_sqlite <- tbl(db, "flights")
flights_sqlite
Source: table<flights> [?? x 19]
Database: sqlite 3.11.1 [/Users/jesse/Dropbox (Personal)/DA5020/nycflights13.sqlite]
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dbl>
nycflights13::flights
# A tibble: 336,776 × 19
year month day dep_time sched_dep_time dep_delay arr_time
<int> <int> <int> <int> <int> <dbl> <int>
1 2013 1 1 517 515 2 830
2 2013 1 1 533 529 4 850
3 2013 1 1 542 540 2 923
4 2013 1 1 544 545 -1 1004
5 2013 1 1 554 600 -6 812
6 2013 1 1 554 558 -4 740
7 2013 1 1 555 600 -5 913
8 2013 1 1 557 600 -3 709
9 2013 1 1 557 600 -3 838
10 2013 1 1 558 600 -2 753
# ... with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
# arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
# origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
c1 <- flights %>%
filter(dep_delay > 0) %>%
group_by(carrier) %>%
summarize(avg.delay = mean(dep_delay)) %>%
arrange(desc(avg.delay)) %>%
head(5)
c1
# A tibble: 5 × 2
carrier avg.delay
<chr> <dbl>
1 OO 58.00000
2 YV 52.95279
3 EV 50.32979
4 9E 48.92001
5 F9 45.13783
c1 <- flights_sqlite %>%
filter(dep_delay > 0) %>%
group_by(carrier) %>%
summarize(avg.delay = mean(dep_delay)) %>%
arrange(desc(avg.delay)) %>%
head(5)
c1
Source: lazy query [?? x 2]
Database: sqlite 3.11.1 [/Users/jesse/Dropbox (Personal)/DA5020/nycflights13.sqlite]
Ordered by: desc(avg.delay)
carrier avg.delay
<chr> <dbl>
1 OO 58.00000
2 YV 52.95279
3 EV 50.32979
4 9E 48.92001
5 F9 45.13783
show_query(c1)
<SQL>
SELECT `carrier`, AVG(`dep_delay`) AS `avg.delay`
FROM `flights`
WHERE (`dep_delay` > 0.0)
GROUP BY `carrier`
ORDER BY `avg.delay` DESC
LIMIT 5
with two-table verbs
inner_join - must match bothleft_join - keep xright_join - keep yfull_join - keep bothdf1 <- data_frame(x = c(1, 1, 3, 4), y = 1:4)
df2 <- data_frame(x = c(1, 1, 2), z = c("a", "b", "a"))
left_join(df1, df2)
# A tibble: 6 × 3
x y z
<dbl> <int> <chr>
1 1 1 a
2 1 1 b
3 1 2 a
4 1 2 b
5 3 3 <NA>
6 4 4 <NA>
Normally you would prefer left_join.
right_join(df1, df2)
# A tibble: 5 × 3
x y z
<dbl> <int> <chr>
1 1 1 a
2 1 2 a
3 1 1 b
4 1 2 b
5 2 NA a
df1 <- data_frame(x = c(1, 1, 3, 4), y = 1:4)
df2 <- data_frame(x = c(1, 1, 2), z = c("a", "b", "a"))
inner_join(df1, df2)
# A tibble: 4 × 3
x y z
<dbl> <int> <chr>
1 1 1 a
2 1 1 b
3 1 2 a
4 1 2 b
Use inner_join if you don't like NA's.
full_join(df1, df2)
# A tibble: 7 × 3
x y z
<dbl> <int> <chr>
1 1 1 a
2 1 1 b
3 1 2 a
4 1 2 b
5 3 3 <NA>
6 4 4 <NA>
7 2 NA a
df1 <- data_frame(x = c(1, 1, 3, 4), y = 1:4)
df2 <- data_frame(x = c(1, 1, 2), z = c("a", "b", "a"))
semi_join(df1, df2)
# A tibble: 2 × 2
x y
<dbl> <int>
1 1 1
2 1 2
anti_join(df1, df2)
# A tibble: 2 × 2
x y
<dbl> <int>
1 4 4
2 3 3
df1 <- data_frame(x = 1:4, y = c("a", "b", "a", "b"))
df2 <- data_frame(x = c(1, 1, 2), y = c("a", "b", "a"))
union(df1, df2)
# A tibble: 6 × 2
x y
<dbl> <chr>
1 2 a
2 1 b
3 4 b
4 3 a
5 2 b
6 1 a
intersect(df1, df2)
# A tibble: 1 × 2
x y
<dbl> <chr>
1 1 a
setdiff(df1, df2)
# A tibble: 3 × 2
x y
<dbl> <chr>
1 2 b
2 3 a
3 4 b
airlines_sqlite <- copy_to(db, airlines)
# top airlines in terms of number of flights
top_airlines <- flights_sqlite %>%
group_by(carrier) %>%
summarize(n = n()) %>%
arrange(desc(n))
top_airlines
Source: lazy query [?? x 2]
Database: sqlite 3.11.1 [/Users/jesse/Dropbox (Personal)/DA5020/nycflights13.sqlite]
Ordered by: desc(n)
carrier n
<chr> <int>
1 UA 58665
2 B6 54635
3 EV 54173
4 DL 48110
5 AA 32729
6 MQ 26397
7 US 20536
8 9E 18460
9 WN 12275
10 VX 5162
# ... with more rows
top_airlines <- top_airlines %>% left_join(airlines_sqlite, by = "carrier")
top_airlines
Source: lazy query [?? x 3]
Database: sqlite 3.11.1 [/Users/jesse/Dropbox (Personal)/DA5020/nycflights13.sqlite]
Ordered by: desc(n)
carrier n name
<chr> <int> <chr>
1 UA 58665 United Air Lines Inc.
2 B6 54635 JetBlue Airways
3 EV 54173 ExpressJet Airlines Inc.
4 DL 48110 Delta Air Lines Inc.
5 AA 32729 American Airlines Inc.
6 MQ 26397 Envoy Air
7 US 20536 US Airways Inc.
8 9E 18460 Endeavor Air Inc.
9 WN 12275 Southwest Airlines Co.
10 VX 5162 Virgin America
# ... with more rows
show_query(top_airlines)
<SQL>
SELECT `_LEFT`.`carrier` AS `carrier`, `_LEFT`.`n` AS `n`, `_RIGHT`.`name` AS `name`
FROM (SELECT `carrier`, COUNT() AS `n`
FROM `flights`
GROUP BY `carrier`
ORDER BY `n` DESC) AS `_LEFT`
LEFT JOIN `airlines` AS `_RIGHT`
ON (`_LEFT`.`carrier` = `_RIGHT`.`carrier`)
airlines <- data.frame(id = airlines$carrier, name = airlines$name)
airlines
id name
1 9E Endeavor Air Inc.
2 AA American Airlines Inc.
3 AS Alaska Airlines Inc.
4 B6 JetBlue Airways
5 DL Delta Air Lines Inc.
6 EV ExpressJet Airlines Inc.
7 F9 Frontier Airlines Inc.
8 FL AirTran Airways Corporation
9 HA Hawaiian Airlines Inc.
10 MQ Envoy Air
11 OO SkyWest Airlines Inc.
12 UA United Air Lines Inc.
13 US US Airways Inc.
14 VX Virgin America
15 WN Southwest Airlines Co.
16 YV Mesa Airlines Inc.
flights <- flights %>% select(carrier, flight, tailnum, origin, dest)
flights
# A tibble: 336,776 × 5
carrier flight tailnum origin dest
<chr> <int> <chr> <chr> <chr>
1 UA 1545 N14228 EWR IAH
2 UA 1714 N24211 LGA IAH
3 AA 1141 N619AA JFK MIA
4 B6 725 N804JB JFK BQN
5 DL 461 N668DN LGA ATL
6 UA 1696 N39463 EWR ORD
7 B6 507 N516JB EWR FLL
8 EV 5708 N829AS LGA IAD
9 B6 79 N593JB JFK MCO
10 AA 301 N3ALAA LGA ORD
# ... with 336,766 more rows
flights %>%
left_join(airlines, by = c("carrier" = "id"))
# A tibble: 336,776 × 6
carrier flight tailnum origin dest name
<chr> <int> <chr> <chr> <chr> <fctr>
1 UA 1545 N14228 EWR IAH United Air Lines Inc.
2 UA 1714 N24211 LGA IAH United Air Lines Inc.
3 AA 1141 N619AA JFK MIA American Airlines Inc.
4 B6 725 N804JB JFK BQN JetBlue Airways
5 DL 461 N668DN LGA ATL Delta Air Lines Inc.
6 UA 1696 N39463 EWR ORD United Air Lines Inc.
7 B6 507 N516JB EWR FLL JetBlue Airways
8 EV 5708 N829AS LGA IAD ExpressJet Airlines Inc.
9 B6 79 N593JB JFK MCO JetBlue Airways
10 AA 301 N3ALAA LGA ORD American Airlines Inc.
# ... with 336,766 more rows
Full results are not available unless you collect()
top_airlines %>% head(5)
Source: lazy query [?? x 3]
Database: sqlite 3.11.1 [/Users/jesse/Dropbox (Personal)/DA5020/nycflights13.sqlite]
Ordered by: desc(n)
carrier n name
<chr> <int> <chr>
1 UA 58665 United Air Lines Inc.
2 B6 54635 JetBlue Airways
3 EV 54173 ExpressJet Airlines Inc.
4 DL 48110 Delta Air Lines Inc.
5 AA 32729 American Airlines Inc.
top_airlines %>% nrow()
[1] NA
top_airlines %>% collect()
# A tibble: 16 × 3
carrier n name
<chr> <int> <chr>
1 UA 58665 United Air Lines Inc.
2 B6 54635 JetBlue Airways
3 EV 54173 ExpressJet Airlines Inc.
4 DL 48110 Delta Air Lines Inc.
5 AA 32729 American Airlines Inc.
6 MQ 26397 Envoy Air
7 US 20536 US Airways Inc.
8 9E 18460 Endeavor Air Inc.
9 WN 12275 Southwest Airlines Co.
10 VX 5162 Virgin America
11 FL 3260 AirTran Airways Corporation
12 AS 714 Alaska Airlines Inc.
13 F9 685 Frontier Airlines Inc.
14 YV 601 Mesa Airlines Inc.
15 HA 342 Hawaiian Airlines Inc.
16 OO 32 SkyWest Airlines Inc.
top_airlines %>% collect() %>% nrow()
[1] 16