library(tidyverse)
library(DBI)
library(RPostgres)
library(vroom)
library(palmerpenguins)
library(nycflights23)R to SQL transition
Introduction
Hello, i’m Vucius. I’m currently do an internship risk modeling in VP bank, Vietnam
R is a wonderful langue for data manipulation but i have to do a transition to SQL since it is universal langue for data. Here is a quick guide for who are familiar to data.frame and tidyverse but have to do a transition to SQL like me
In this guide i will use PostgreSQL because my SSMS have some issues and i dont know how to fix that yet
Syntax & Data type in SQL
Syntax
| R Syntax | SQL Equivalent | Explanation |
| == | = | Equality check. |
| != | != | Inequality check. |
| < | < | Less than. |
| <= | <= | Less than or equal to. |
| > | > | Greater than. |
| >= | >= | Greater than or equal to. |
| & | AND | Logical AND. Both conditions must be true. |
| ` | ` | OR |
| ! | NOT | Logical NOT. Inverts the truth body_mass_g. |
| is.na(x) | x IS NULL | Checks if x is NULL. |
| !is.na(x) | x IS NOT NULL | Checks if x is NOT NULL. |
| x %in% c(1, 2, 3) | x IN (1, 2, 3) | Checks if x is in a set of body_mass_gs. |
| `x == “A” | x == “B”` | x IN (‘A’, ‘B’) |
| x && y (short-circuiting AND) | x AND y | Logical AND without short-circuiting in SQL (no equivalent). |
| `x | y` | (short-circuiting OR) |
| 10^2 | POWER(10.0,2.0) | power |
| x %like% “pattern” (in data.table) | x LIKE ‘pattern%’ or x LIKE ‘%pattern’ | Similar to LIKE with wildcards %. |
| ifelse(test, yes, no) | CASE WHEN test THEN yes ELSE no END | Conditional logic in SQL. |
| substring(x, 1, 5) | SUBSTRING(x, 1, 5) | Extracts a substring. |
| str_detect(x, “pattern”) | x LIKE ‘%pattern%’ | Checks if a string matches a pattern. |
Data type
| SQL Data Type | R Equivalent | Description | SQL Example | R Example |
|---|---|---|---|---|
INT / INTEGER |
integer |
Whole numbers | CREATE TABLE my_table (id INT); |
x <- 1:10 |
FLOAT / REAL |
numeric |
Floating-point | CREATE TABLE my_table (score FLOAT); |
x <- c(1.5, 2.3, 3.7) |
DECIMAL / NUMERIC |
numeric |
Exact number with precision and scale | CREATE TABLE my_table (amount DECIMAL(10, 2)); |
x <- c(123.45, 678.90) |
CHAR(n) |
character |
Fixed-length string | CREATE TABLE my_table (code CHAR(5)); |
x <- c("ABCDE", "FGHIJ") |
VARCHAR(n) |
character |
Variable-length string | CREATE TABLE my_table (name VARCHAR(50)); |
x <- c("Alice", "Bob", "Charlie") |
TEXT |
character |
Large text data | CREATE TABLE my_table (description TEXT); |
x <- c("This is a long text.") |
DATE |
Date |
Date | CREATE TABLE my_table (birthdate DATE); |
x <- as.Date(c("2000-01-01", "2001-02-02")) |
TIME |
hms (from hms package) |
Time | CREATE TABLE my_table (start_time TIME); |
x <- hms::as_hms(c("08:00:00", "12:30:00")) |
DATETIME |
POSIXct |
Date and time | CREATE TABLE my_table (event_time DATETIME); |
x <- as.POSIXct(c("2024-01-01 08:00:00")) |
BOOLEAN |
logical |
TRUE or FALSE | CREATE TABLE my_table (is_active BOOLEAN); |
x <- c(TRUE, FALSE, TRUE) |
BLOB |
raw |
Binary data | CREATE TABLE my_table (image BLOB); |
x <- list(as.raw(c(0x89, 0x50))) |
JSON |
list / jsonlite::fromJSON |
JSON data | CREATE TABLE my_table (data JSON); |
x <- jsonlite::fromJSON('{"a": 1, "b": 2}') |
UUID |
character |
Universally Unique Identifier | CREATE TABLE my_table (id UUID); |
x <- UUIDgenerate::UUIDgenerate() |
ENUM |
factor |
Enumeration of body_mass_gs | CREATE TABLE my_table (size ENUM('small', 'medium', 'large')); |
x <- factor(c("small", "medium", "large")) |
SERIAL |
integer with sequence |
Auto-incrementing integer | CREATE TABLE my_table (id SERIAL); |
x <- 1:10 (auto-sequencing) |
GEOMETRY |
sf object (from sf package) |
Spatial data | CREATE TABLE my_table (location GEOMETRY); |
x <- sf::st_point(c(0, 0)) |
Library & Data
Setup SQL
con_tidy_tuesday_data <- dbConnect(RPostgres::Postgres(),
dbname = "tidy_tuesday_data", # projet chung khoan
host = "localhost", # san trong may
port = 5432, # cong mac dinh cua progressql
user = "postgres", # ten cua host
password = "1368") # pass dat luc dau
data("penguins")
dbWriteTable(con_tidy_tuesday_data, 'penguins', penguins, row.names = FALSE, overwrite = TRUE)
penguins_db <- tbl(con_tidy_tuesday_data, "penguins")penguins %>%
filter(
species == "Adelie",
sex == "male",
bill_length_mm >= 39 & bill_depth_mm >= 15
)# A tibble: 55 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.3 20.6 190 3650
3 Adelie Torgersen 39.2 19.6 195 4675
4 Adelie Torgersen 42.5 20.7 197 4500
5 Adelie Torgersen 46 21.5 194 4200
6 Adelie Biscoe 40.6 18.6 183 3550
7 Adelie Biscoe 40.5 18.9 180 3950
8 Adelie Dream 40.9 18.9 184 3900
9 Adelie Dream 39.2 21.1 196 4150
10 Adelie Dream 39.8 19.1 184 4650
# ℹ 45 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
filter(
species == "Adelie",
sex == "male",
bill_length_mm >= 39 & bill_depth_mm >= 15
) %>%
show_query()<SQL>
SELECT "penguins".*
FROM "penguins"
WHERE
("species" = 'Adelie') AND
("sex" = 'male') AND
("bill_length_mm" >= 39.0 AND "bill_depth_mm" >= 15.0)
Basic dplyr functions
Select
Selecting a Range of Consecutive Variables (: in Tidyverse)
penguins %>%
select(bill_length_mm:body_mass_g)# A tibble: 344 × 4
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<dbl> <dbl> <int> <int>
1 39.1 18.7 181 3750
2 39.5 17.4 186 3800
3 40.3 18 195 3250
4 NA NA NA NA
5 36.7 19.3 193 3450
6 39.3 20.6 190 3650
7 38.9 17.8 181 3625
8 39.2 19.6 195 4675
9 34.1 18.1 193 3475
10 42 20.2 190 4250
# ℹ 334 more rows
penguins_db %>%
select(bill_length_mm:body_mass_g) %>%
show_query()<SQL>
SELECT "bill_length_mm", "bill_depth_mm", "flipper_length_mm", "body_mass_g"
FROM "penguins"
SQL Equivalent: SQL does not have a direct operator to select a range of columns.
Taking the Complement of a Set of Variables (! in Tidyverse)
penguins %>%
select(-year)# A tibble: 344 × 7
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 1 more variable: sex <fct>
penguins_db %>%
select(-year) %>%
show_query()<SQL>
SELECT
"species",
"island",
"bill_length_mm",
"bill_depth_mm",
"flipper_length_mm",
"body_mass_g",
"sex"
FROM "penguins"
SQL Equivalent: You must explicitly select all columns except the one you want to exclude:
Selecting All Variables (everything())
penguins %>%
select(everything())# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
select(everything()) %>%
show_query()<SQL>
SELECT *
FROM "penguins"
Select Last Variable (last_col())
penguins %>%
select(last_col())# A tibble: 344 × 1
year
<int>
1 2007
2 2007
3 2007
4 2007
5 2007
6 2007
7 2007
8 2007
9 2007
10 2007
# ℹ 334 more rows
penguins_db %>%
select(last_col()) %>%
show_query()<SQL>
SELECT "year"
FROM "penguins"
SQL Equivalent: You must know the last column name:
Selecting Variables by Matching Patterns in Their Names
Starts with (starts_with()/ends_with()/contains()/matches()/num_range()):
penguins %>%
select(starts_with("bill"))# A tibble: 344 × 2
bill_length_mm bill_depth_mm
<dbl> <dbl>
1 39.1 18.7
2 39.5 17.4
3 40.3 18
4 NA NA
5 36.7 19.3
6 39.3 20.6
7 38.9 17.8
8 39.2 19.6
9 34.1 18.1
10 42 20.2
# ℹ 334 more rows
penguins_db %>%
select(starts_with("bill")) %>%
show_query()<SQL>
SELECT "bill_length_mm", "bill_depth_mm"
FROM "penguins"
SQL Equivalent: Not directly supported in SQL; would need to explicitly list columns if known.
Selecting Variables Using a Predicate Function (where())
penguins %>%
select(where(is.numeric))# A tibble: 344 × 5
bill_length_mm bill_depth_mm flipper_length_mm body_mass_g year
<dbl> <dbl> <int> <int> <int>
1 39.1 18.7 181 3750 2007
2 39.5 17.4 186 3800 2007
3 40.3 18 195 3250 2007
4 NA NA NA NA 2007
5 36.7 19.3 193 3450 2007
6 39.3 20.6 190 3650 2007
7 38.9 17.8 181 3625 2007
8 39.2 19.6 195 4675 2007
9 34.1 18.1 193 3475 2007
10 42 20.2 190 4250 2007
# ℹ 334 more rows
#penguins_db %>%
# select(where(is.numeric)) %>%
# show_query()SQL Equivalent: SQL doesn’t directly support this; you would need to manually select numeric columns:
data type in SQL: PostgreSQL, MySQL, SQL sever
Filter (Where)
Comparison Operators (==, >, >=, etc.)
Equal to (
==): Use=in SQL.Greater than (
>): Use>.Greater than or equal to (
>=): Use>=.
penguins %>%
filter(body_mass_g >= 16)# A tibble: 342 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
7 Adelie Torgersen 39.2 19.6 195 4675
8 Adelie Torgersen 34.1 18.1 193 3475
9 Adelie Torgersen 42 20.2 190 4250
10 Adelie Torgersen 37.8 17.1 186 3300
# ℹ 332 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
filter(body_mass_g >= 16) %>%
show_query()<SQL>
SELECT "penguins".*
FROM "penguins"
WHERE ("body_mass_g" >= 16.0)
Logical Operators (&, |, !, xor())
AND (
&): UseANDin SQL.OR (
|): UseOR.NOT (
!): UseNOT.Exclusive OR (
xor()): SQL does not have a direct equivalent to R’sxor()function, but you can achieve similar behavior using conditional logic.
penguins %>%
filter(body_mass_g >= 16 & year >= 2007)# A tibble: 342 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
7 Adelie Torgersen 39.2 19.6 195 4675
8 Adelie Torgersen 34.1 18.1 193 3475
9 Adelie Torgersen 42 20.2 190 4250
10 Adelie Torgersen 37.8 17.1 186 3300
# ℹ 332 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
filter(body_mass_g >= 16 & year >= 2007) %>%
show_query()<SQL>
SELECT "penguins".*
FROM "penguins"
WHERE ("body_mass_g" >= 16.0 AND "year" >= 2007.0)
Handling Missing body_mass_gs (is.na())
penguins %>%
filter(is.na(body_mass_g))# A tibble: 2 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen NA NA NA NA
2 Gentoo Biscoe NA NA NA NA
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
filter(is.na(body_mass_g)) %>%
show_query()<SQL>
SELECT "penguins".*
FROM "penguins"
WHERE (("body_mass_g" IS NULL))
In SQL, missing values are represented by NULL, and you check for them using IS NULL or IS NOT NULL.
Range Filtering (between())
penguins %>%
filter( between(body_mass_g,3000,5000))# A tibble: 272 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen 36.7 19.3 193 3450
5 Adelie Torgersen 39.3 20.6 190 3650
6 Adelie Torgersen 38.9 17.8 181 3625
7 Adelie Torgersen 39.2 19.6 195 4675
8 Adelie Torgersen 34.1 18.1 193 3475
9 Adelie Torgersen 42 20.2 190 4250
10 Adelie Torgersen 37.8 17.1 186 3300
# ℹ 262 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
filter( between(body_mass_g,3000,5000)) %>%
show_query()<SQL>
SELECT "penguins".*
FROM "penguins"
WHERE ("body_mass_g" BETWEEN 3000.0 AND 5000.0)
Arrange(ORDER BY)
1. Simple Sort by multi Column (Ascending/Descending)
penguins %>%
arrange(desc(body_mass_g), year)# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Gentoo Biscoe 49.2 15.2 221 6300
2 Gentoo Biscoe 59.6 17 230 6050
3 Gentoo Biscoe 51.1 16.3 220 6000
4 Gentoo Biscoe 48.8 16.2 222 6000
5 Gentoo Biscoe 45.2 16.4 223 5950
6 Gentoo Biscoe 49.8 15.9 229 5950
7 Gentoo Biscoe 48.4 14.6 213 5850
8 Gentoo Biscoe 49.3 15.7 217 5850
9 Gentoo Biscoe 55.1 16 230 5850
10 Gentoo Biscoe 49.5 16.2 229 5800
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
arrange(desc(body_mass_g), year) %>%
show_query()<SQL>
SELECT "penguins".*
FROM "penguins"
ORDER BY "body_mass_g" DESC, "year"
Sort with NA Handling
By default, SQL may place NULL values either at the beginning or end of the sorted results, depending on the SQL dialect. Some databases allow you to control this behavior with options like NULLS FIRST or NULLS LAST.
Slice (Limit)
penguins %>%
slice(1:5)# A tibble: 5 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
# ℹ 2 more variables: sex <fct>, year <int>
# SELECT *
# FROM penguins
# LIMIT 5;
penguins %>%
slice(6:10)# A tibble: 5 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.3 20.6 190 3650
2 Adelie Torgersen 38.9 17.8 181 3625
3 Adelie Torgersen 39.2 19.6 195 4675
4 Adelie Torgersen 34.1 18.1 193 3475
5 Adelie Torgersen 42 20.2 190 4250
# ℹ 2 more variables: sex <fct>, year <int>
# SELECT *
# FROM penguins
# LIMIT 5 OFFSET 5;
penguins %>%
slice(c(2,4,6))# A tibble: 3 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.5 17.4 186 3800
2 Adelie Torgersen NA NA NA NA
3 Adelie Torgersen 39.3 20.6 190 3650
# ℹ 2 more variables: sex <fct>, year <int>
# WITH NumberedRows AS (
# SELECT *,
# ROW_NUMBER() OVER (ORDER BY some_column) AS row_num
# FROM penguins
# )
# SELECT *
# FROM NumberedRows
# WHERE row_num IN (2, 4, 6);LIMIT: Restricts the number of rows returned by a query.OFFSET: Skips a specified number of rows before starting to return rows.ROW_NUMBER(): Assigns a unique row number to each row based on the specified ordering. Useful for more complex slicing operations.
Rename, Mutate, pivot_longer/pivot_wider
Rename & Mutate
penguins %>%
rename(new_body_mass_g = body_mass_g)# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm new_body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torge… 39.1 18.7 181 3750
2 Adelie Torge… 39.5 17.4 186 3800
3 Adelie Torge… 40.3 18 195 3250
4 Adelie Torge… NA NA NA NA
5 Adelie Torge… 36.7 19.3 193 3450
6 Adelie Torge… 39.3 20.6 190 3650
7 Adelie Torge… 38.9 17.8 181 3625
8 Adelie Torge… 39.2 19.6 195 4675
9 Adelie Torge… 34.1 18.1 193 3475
10 Adelie Torge… 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
rename(new_body_mass_g = body_mass_g) %>%
show_query()<SQL>
SELECT
"species",
"island",
"bill_length_mm",
"bill_depth_mm",
"flipper_length_mm",
"body_mass_g" AS "new_body_mass_g",
"sex",
"year"
FROM "penguins"
# Creating a New Column
penguins %>%
mutate(new_body_mass_g = body_mass_g*2)# A tibble: 344 × 9
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, new_body_mass_g <dbl>
penguins_db %>%
mutate(new_body_mass_g = body_mass_g*2) %>%
show_query()<SQL>
SELECT "penguins".*, "body_mass_g" * 2.0 AS "new_body_mass_g"
FROM "penguins"
# Modifying an Existing Column
penguins %>%
mutate(body_mass_g = body_mass_g*3)# A tibble: 344 × 8
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <dbl>
1 Adelie Torgersen 39.1 18.7 181 11250
2 Adelie Torgersen 39.5 17.4 186 11400
3 Adelie Torgersen 40.3 18 195 9750
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 10350
6 Adelie Torgersen 39.3 20.6 190 10950
7 Adelie Torgersen 38.9 17.8 181 10875
8 Adelie Torgersen 39.2 19.6 195 14025
9 Adelie Torgersen 34.1 18.1 193 10425
10 Adelie Torgersen 42 20.2 190 12750
# ℹ 334 more rows
# ℹ 2 more variables: sex <fct>, year <int>
penguins_db %>%
mutate(body_mass_g = body_mass_g*3) %>%
show_query()<SQL>
SELECT
"species",
"island",
"bill_length_mm",
"bill_depth_mm",
"flipper_length_mm",
"body_mass_g" * 3.0 AS "body_mass_g",
"sex",
"year"
FROM "penguins"
# Multiple Mutations
penguins %>%
mutate(new_body_mass_g_1 = body_mass_g*3,
new_body_mass_g_2 = new_body_mass_g_1 + 10)# A tibble: 344 × 10
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 4 more variables: sex <fct>, year <int>, new_body_mass_g_1 <dbl>,
# new_body_mass_g_2 <dbl>
penguins_db %>%
mutate(new_body_mass_g_1 = body_mass_g*3,
new_body_mass_g_2 = new_body_mass_g_1 + 10) %>%
show_query()<SQL>
SELECT "q01".*, "new_body_mass_g_1" + 10.0 AS "new_body_mass_g_2"
FROM (
SELECT "penguins".*, "body_mass_g" * 3.0 AS "new_body_mass_g_1"
FROM "penguins"
) AS "q01"
in SQL, if you need to generate a new column that base on the value of others new column, you will need to create a new table
Pivot_longer/Pivot_wider
penguins %>%
mutate(id = row_number()) %>%
select(id,everything()) %>%
pivot_longer(cols = bill_length_mm:body_mass_g, # combine dbl and int
names_to = 'category',
values_to = "value") -> penguins_proceeded
dbWriteTable(con_tidy_tuesday_data, 'penguins_proceeded', penguins_proceeded, row.names = FALSE, overwrite = TRUE)
penguins_proceeded_db <- tbl(con_tidy_tuesday_data, "penguins_proceeded")
penguins_db %>%
mutate(id = row_number()) %>%
select(id,everything()) %>%
pivot_longer(cols = bill_length_mm:body_mass_g, # combine dbl and int
names_to = 'category',
values_to = "value") %>%
show_query()<SQL>
SELECT
"id",
"species",
"island",
"sex",
"year",
'bill_length_mm' AS "category",
"bill_length_mm" AS "value"
FROM (
SELECT ROW_NUMBER() OVER () AS "id", "penguins".*
FROM "penguins"
) AS "q01"
UNION ALL
SELECT
"id",
"species",
"island",
"sex",
"year",
'bill_depth_mm' AS "category",
"bill_depth_mm" AS "value"
FROM (
SELECT ROW_NUMBER() OVER () AS "id", "penguins".*
FROM "penguins"
) AS "q01"
UNION ALL
SELECT
"id",
"species",
"island",
"sex",
"year",
'flipper_length_mm' AS "category",
"flipper_length_mm" AS "value"
FROM (
SELECT ROW_NUMBER() OVER () AS "id", "penguins".*
FROM "penguins"
) AS "q01"
UNION ALL
SELECT
"id",
"species",
"island",
"sex",
"year",
'body_mass_g' AS "category",
"body_mass_g" AS "value"
FROM (
SELECT ROW_NUMBER() OVER () AS "id", "penguins".*
FROM "penguins"
) AS "q01"
penguins_proceeded %>%
filter(category %in% c("bill_length_mm","bill_depth_mm")) %>%
pivot_wider(names_from = category,
values_from = value)# A tibble: 344 × 7
id species island sex year bill_length_mm bill_depth_mm
<int> <fct> <fct> <fct> <int> <dbl> <dbl>
1 1 Adelie Torgersen male 2007 39.1 18.7
2 2 Adelie Torgersen female 2007 39.5 17.4
3 3 Adelie Torgersen female 2007 40.3 18
4 4 Adelie Torgersen <NA> 2007 NA NA
5 5 Adelie Torgersen female 2007 36.7 19.3
6 6 Adelie Torgersen male 2007 39.3 20.6
7 7 Adelie Torgersen female 2007 38.9 17.8
8 8 Adelie Torgersen male 2007 39.2 19.6
9 9 Adelie Torgersen <NA> 2007 34.1 18.1
10 10 Adelie Torgersen <NA> 2007 42 20.2
# ℹ 334 more rows
penguins_proceeded_db %>%
filter(category %in% c("bill_length_mm","bill_depth_mm")) %>%
pivot_wider(names_from = category,
values_from = value) %>%
show_query()<SQL>
SELECT
"id",
"species",
"island",
"sex",
"year",
MAX(CASE WHEN ("category" = 'bill_depth_mm') THEN "value" END) AS "bill_depth_mm",
MAX(CASE WHEN ("category" = 'bill_length_mm') THEN "value" END) AS "bill_length_mm"
FROM (
SELECT "penguins_proceeded".*
FROM "penguins_proceeded"
WHERE ("category" IN ('bill_length_mm', 'bill_depth_mm'))
) AS "q01"
GROUP BY "id", "species", "island", "sex", "year"
penguins %>%
mutate(
mass_type = case_when(
body_mass_g < 2000 & body_mass_g > 1000 ~ 1,
body_mass_g < 3000 & body_mass_g >= 2000 ~ 2,
body_mass_g < 4000 & body_mass_g >= 3000 ~ 3,
body_mass_g < 5000 & body_mass_g >= 4000 ~ 4,
body_mass_g >= 5000 ~ 5,
TRUE ~ NA_real_
)
)# A tibble: 344 × 9
species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
<fct> <fct> <dbl> <dbl> <int> <int>
1 Adelie Torgersen 39.1 18.7 181 3750
2 Adelie Torgersen 39.5 17.4 186 3800
3 Adelie Torgersen 40.3 18 195 3250
4 Adelie Torgersen NA NA NA NA
5 Adelie Torgersen 36.7 19.3 193 3450
6 Adelie Torgersen 39.3 20.6 190 3650
7 Adelie Torgersen 38.9 17.8 181 3625
8 Adelie Torgersen 39.2 19.6 195 4675
9 Adelie Torgersen 34.1 18.1 193 3475
10 Adelie Torgersen 42 20.2 190 4250
# ℹ 334 more rows
# ℹ 3 more variables: sex <fct>, year <int>, mass_type <dbl>
penguins_db %>%
mutate(
mass_type = case_when(
body_mass_g < 2000 & body_mass_g > 1000 ~ 1,
body_mass_g < 3000 & body_mass_g >= 2000 ~ 2,
body_mass_g < 4000 & body_mass_g >= 3000 ~ 3,
body_mass_g < 5000 & body_mass_g >= 4000 ~ 4,
body_mass_g >= 5000 ~ 5,
TRUE ~ NA_real_ # Handles cases where body_mass_g is NA or does not fit any criteria
)
) %>%
show_query()<SQL>
SELECT
"penguins".*,
CASE
WHEN ("body_mass_g" < 2000.0 AND "body_mass_g" > 1000.0) THEN 1.0
WHEN ("body_mass_g" < 3000.0 AND "body_mass_g" >= 2000.0) THEN 2.0
WHEN ("body_mass_g" < 4000.0 AND "body_mass_g" >= 3000.0) THEN 3.0
WHEN ("body_mass_g" < 5000.0 AND "body_mass_g" >= 4000.0) THEN 4.0
WHEN ("body_mass_g" >= 5000.0) THEN 5.0
ELSE NULL
END AS "mass_type"
FROM "penguins"
Group by & summary
penguins %>%
na.omit() %>%
group_by(species) %>%
summarize(
count = n(),
mean_mass = mean(body_mass_g, na.rm = TRUE),
median_mass = median(body_mass_g, na.rm = TRUE),
sd_mass = sd(body_mass_g, na.rm = TRUE),
.groups = 'drop' # Drop grouping structure after summarizing
)# A tibble: 3 × 5
species count mean_mass median_mass sd_mass
<fct> <int> <dbl> <dbl> <dbl>
1 Adelie 146 3706. 3700 459.
2 Chinstrap 68 3733. 3700 384.
3 Gentoo 119 5092. 5050 501.
#simple saummary table
penguins_db %>%
na.omit() %>%
group_by() %>%
summarize(
count = n(),
mean_mass = mean(body_mass_g, na.rm = TRUE),
median_mass = median(body_mass_g, na.rm = TRUE),
sd_mass = sd(body_mass_g, na.rm = TRUE),
.groups = 'drop' # Drop grouping structure after summarizing
) %>%
show_query()<SQL>
SELECT
COUNT(*) AS "count",
AVG("body_mass_g") AS "mean_mass",
PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY "body_mass_g") AS "median_mass",
STDDEV_SAMP("body_mass_g") AS "sd_mass"
FROM "penguins"
Join
For this part, i use datasets from packages `nycflights23`
airlines# A tibble: 14 × 2
carrier name
<chr> <chr>
1 9E Endeavor Air Inc.
2 AA American Airlines Inc.
3 AS Alaska Airlines Inc.
4 B6 JetBlue Airways
5 DL Delta Air Lines Inc.
6 F9 Frontier Airlines Inc.
7 G4 Allegiant Air
8 HA Hawaiian Airlines Inc.
9 MQ Envoy Air
10 NK Spirit Air Lines
11 OO SkyWest Airlines Inc.
12 UA United Air Lines Inc.
13 WN Southwest Airlines Co.
14 YX Republic Airline
airports# A tibble: 1,251 × 8
faa name lat lon alt tz dst tzone
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 AAF Apalachicola Regional Airport 29.7 -85.0 20 -5 A Amer…
2 AAP Andrau Airpark 29.7 -95.6 79 -6 A Amer…
3 ABE Lehigh Valley International Airpo… 40.7 -75.4 393 -5 A Amer…
4 ABI Abilene Regional Airport 32.4 -99.7 1791 -6 A Amer…
5 ABL Ambler Airport 67.1 -158. 334 -9 A Amer…
6 ABQ Albuquerque International Sunport 35.0 -107. 5355 -7 A Amer…
7 ABR Aberdeen Regional Airport 45.4 -98.4 1302 -6 A Amer…
8 ABY Southwest Georgia Regional Airport 31.5 -84.2 197 -5 A Amer…
9 ACK Nantucket Memorial Airport 41.3 -70.1 47 -5 A Amer…
10 ACT Waco Regional Airport 31.6 -97.2 516 -6 A Amer…
# ℹ 1,241 more rows
flights# A tibble: 435,352 × 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
7 2023 1 1 520 510 10 948 949
8 2023 1 1 524 530 -6 645 710
9 2023 1 1 537 520 17 926 818
10 2023 1 1 547 545 2 845 852
# ℹ 435,342 more rows
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>
planes# A tibble: 4,840 × 9
tailnum year type manufacturer model engines seats speed engine
<chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
1 N101DQ 2020 Fixed wing multi… AIRBUS A321… 2 199 0 Turbo…
2 N101DU 2018 Fixed wing multi… C SERIES AI… BD-5… 2 133 0 Turbo…
3 N101HQ 2007 Fixed wing multi… EMBRAER-EMP… ERJ … 2 80 0 Turbo…
4 N101NN 2013 Fixed wing multi… AIRBUS INDU… A321… 2 379 0 Turbo…
5 N102DN 2020 Fixed wing multi… AIRBUS A321… 2 199 0 Turbo…
6 N102DU NA Fixed wing multi… C SERIES AI… BD-5… 2 133 0 Turbo…
7 N102HQ 2007 Fixed wing multi… EMBRAER-EMP… ERJ … 2 80 0 Turbo…
8 N102NN 2013 Fixed wing multi… AIRBUS A321… 2 379 0 Turbo…
9 N102UW 1998 Fixed wing multi… AIRBUS INDU… A320… 2 182 0 Turbo…
10 N103DU NA Fixed wing multi… C SERIES AI… BD-5… 2 133 0 Turbo…
# ℹ 4,830 more rows
weather# A tibble: 26,204 × 15
origin year month day hour temp dewp humid wind_dir wind_speed
<chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 JFK 2023 1 1 0 NA NA NA 0 0
2 JFK 2023 1 1 1 NA NA NA 190 4.60
3 JFK 2023 1 1 2 NA NA NA 190 5.75
4 JFK 2023 1 1 3 NA NA NA 250 5.75
5 JFK 2023 1 1 4 NA NA NA 170 8.06
6 JFK 2023 1 1 5 NA NA NA 0 0
7 JFK 2023 1 1 6 NA NA NA 250 9.21
8 JFK 2023 1 1 7 NA NA NA 230 9.21
9 JFK 2023 1 1 8 NA NA NA 260 11.5
10 JFK 2023 1 1 9 NA NA NA 250 12.7
# ℹ 26,194 more rows
# ℹ 5 more variables: wind_gust <dbl>, precip <dbl>, pressure <dbl>,
# visib <dbl>, time_hour <dttm>
datasets <- list(airlines, airports, flights, planes, weather)
table_names <- c("airlines", "airports", "flights", "planes", "weather")
# Write each dataset to the SQL database and create a dbplyr reference
walk2(datasets, table_names, ~ {
dbWriteTable(con_tidy_tuesday_data, .y, .x, row.names = FALSE, overwrite = TRUE)
assign(paste0(.y, "_db"), tbl(con_tidy_tuesday_data, .y), envir = .GlobalEnv)
})These relationships are summarized visually in this figure
dplyr provides six join functions: left_join(), inner_join(), right_join(), full_join(), semi_join(), and anti_join()
and join in SQL
Comparison of Joins in SQL and R
| Aspect | SQL Join | R Join (dplyr) |
|---|---|---|
| Function Syntax | INNER JOIN, LEFT JOIN, etc. |
inner_join(), left_join(), etc. |
| Common Key | Explicit ON clause (ON table1.column = table2.column) |
Use by = "column" to specify common key |
| Inner Join | SELECT * FROM table1 INNER JOIN table2 ON ... |
inner_join(table1, table2, by = "column") |
| Left Join | LEFT JOIN |
left_join() |
| Right Join | RIGHT JOIN |
right_join() |
| Full Join | FULL JOIN |
full_join() |
| Multiple Join Keys | ON table1.col1 = table2.col1 AND table1.col2 = table2.col2 |
by = c("col1" = "col2", "col1" = "col2") |
| Output Format | Data returned as a table | Data returned as a data.frame or tibble |
| Null Handling | NULL values returned when there is no match | NA values returned when there is no match |
Left/right join
flights %>%
left_join(planes, join_by(tailnum))# A tibble: 435,352 × 27
year.x month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
7 2023 1 1 520 510 10 948 949
8 2023 1 1 524 530 -6 645 710
9 2023 1 1 537 520 17 926 818
10 2023 1 1 547 545 2 845 852
# ℹ 435,342 more rows
# ℹ 19 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, year.y <int>, type <chr>,
# manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
# engine <chr>
flights_db %>%
left_join(planes_db, join_by(tailnum)) %>%
show_query()<SQL>
SELECT
"flights"."year" AS "year.x",
"month",
"day",
"dep_time",
"sched_dep_time",
"dep_delay",
"arr_time",
"sched_arr_time",
"arr_delay",
"carrier",
"flight",
"flights"."tailnum" AS "tailnum",
"origin",
"dest",
"air_time",
"distance",
"hour",
"minute",
"time_hour",
"planes"."year" AS "year.y",
"type",
"manufacturer",
"model",
"engines",
"seats",
"speed",
"engine"
FROM "flights"
LEFT JOIN "planes"
ON ("flights"."tailnum" = "planes"."tailnum")
flights %>%
right_join(planes, join_by(tailnum))# A tibble: 424,068 × 27
year.x month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2023 1 1 1 2038 203 328 3
2 2023 1 1 18 2300 78 228 135
3 2023 1 1 31 2344 47 500 426
4 2023 1 1 33 2140 173 238 2352
5 2023 1 1 36 2048 228 223 2252
6 2023 1 1 503 500 3 808 815
7 2023 1 1 520 510 10 948 949
8 2023 1 1 524 530 -6 645 710
9 2023 1 1 537 520 17 926 818
10 2023 1 1 547 545 2 845 852
# ℹ 424,058 more rows
# ℹ 19 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
# tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
# hour <dbl>, minute <dbl>, time_hour <dttm>, year.y <int>, type <chr>,
# manufacturer <chr>, model <chr>, engines <int>, seats <int>, speed <int>,
# engine <chr>
flights_db %>%
right_join(planes_db, join_by(tailnum)) %>%
show_query()<SQL>
SELECT
"flights"."year" AS "year.x",
"month",
"day",
"dep_time",
"sched_dep_time",
"dep_delay",
"arr_time",
"sched_arr_time",
"arr_delay",
"carrier",
"flight",
"planes"."tailnum" AS "tailnum",
"origin",
"dest",
"air_time",
"distance",
"hour",
"minute",
"time_hour",
"planes"."year" AS "year.y",
"type",
"manufacturer",
"model",
"engines",
"seats",
"speed",
"engine"
FROM "flights"
RIGHT JOIN "planes"
ON ("flights"."tailnum" = "planes"."tailnum")