In this tutorial i will show you it’s easy to learn R’s dplyr together with SQL. I show some of the most similar ways of handling data in R and SQL
learning to code in more than one software at once is no easy pizzy but it’s much easy if the two softwares have some form of resemblence in them . In this tutorial i will show you how to connect to a database using Rmarkdown and Rstudio as well as query this database using SQL (Within an rmarkdown document)
library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)Next up we load our data for the tutorial , I have used a dataset
from a project i did from datacamp to be
certified as data scientist associate . I then sample only
a few rows and create two dataframes which i will turn into tables later
on.
## read in the dataset
df <- readr::read_csv("recipe_site_traffic_2212.csv")
## sample 100 observations and select first 3 variables
set.seed(1123)
data1<- df |>
sample_n(size=25) |>
select(1,2,3,6)
## sample 100 observations and select subsequent 3 variables(including the first(ID))
##
set.seed(1123)
data2<- df |>
sample_n(size=25) |>
select(1,2,4,5,6)con<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(con,data1)
copy_to(con,data2)SELECT name FROM people;*
rather than typing all the columnsdbGetQuery(con,'SELECT * FROM data2')dbGetQuery(con,'SELECT recipe,sugar FROM data2')data2 |> select(everything())data2 |>
select(recipe,sugar)WHERE in SQLWHERE is a filtering clause
In SQL, the WHERE keyword allows you to filter based on both text and numeric values in a table. There are a few different comparison operators you can use:
= equal
<> not equal
< less than
> greater than
<= less than or equal to
>= greater than or equal to
You can build up your WHERE queries by combining multiple conditions with the AND keyword.
dbGetQuery(con,"SELECT *
FROM data2
WHERE category='Potato'")data2 |>
filter(category=="Potato")IN and WHERE in SQLdbGetQuery(con,"SELECT *
FROM data2
WHERE category IN ('Chicken','Chicken Breast')")%in%data2 |>
filter(category %in% c("Chicken","Chicken Breast"))COUNT and AS alias in SQLAS to rename default nameCOUNT(*) tells you how many rows are in a tabledbGetQuery(con,"SELECT COUNT(*) AS n FROM data2")data2 |> count()dbGetQuery(con,"SELECT COUNT(category) AS n_potato
FROM data2
WHERE category IN ('Chicken Breast','Chicken')")data2 |>
filter(category %in% c("Chicken","Chicken Breast")) |>
summarise(n_potato=n())dbExecutedata2 |>
janitor::tabyl(category)Breakfast to
Breakfast mealREPLACEdbExecute(con,"UPDATE data2
SET category= REPLACE(category,'Breakfast','Breakfast meal')
WHERE category='Breakfast'")
#> [1] 4dbGetQuery(con,"SELECT category, COUNT(category) as n_per_category
FROM data2
GROUP BY category")Breakfast has updated to
Breakfast mealstringr::str_replace to replace elements in a
string variabledata2 |>
mutate(category=stringr::str_replace(category,"Breakfast","Breakfast meal")) |>
group_by(category) |>
summarise(n_per_category=n())+n SQL, NULL represents a missing or unknown value. You can check for
NULL values using the expression IS NULL + Use IS NULL AND
IS NOT NULL
dbGetQuery(con,"SELECT *
FROM data2
WHERE Protein IS NULL")dbGetQuery(con,"SELECT *
FROM data2
WHERE Protein IS NOT NULL")is.na() and !is.nadata2 |>
filter(is.na(protein))data2 |>
filter(!is.na(protein))distinct items in SQLdbGetQuery(con,"SELECT COUNT(DISTINCT category) AS unique_categories
FROM data2")distinct in Rdata2 |>
distinct(category) |>
summarise(unique_categories=n())dbGetQuery(con,"SELECT *
FROM data2 WHERE sugar > 1 AND sugar < 5
AND category='Breakfast meal'")BETWEEN and
ANDdbGetQuery(con,"SELECT *
FROM data2
WHERE sugar BETWEEN 1 AND 5
AND category='Breakfast meal'")data2 |>
filter(between(sugar,1,5) & category=="Breakfast")LIKEThe % wildcard will match zero, one, or many characters in text
The _ wildcard will match a single character
dbGetQuery(con,"SELECT *
FROM data2
WHERE category LIKE 'Chicken%'")str_likedata2 |>
filter(str_like(category,"Chicken%"))sqldf::sqldf("SELECT AVG(sugar) AS avg_sugar,
MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2;") data2 |>
summarise(avg_sugar=mean(sugar,na.rm=T),
max_sugar= max(sugar,na.rm=T),
min_sugar= min(sugar,na.rm=T))sqldf::sqldf("SELECT category,AVG(sugar) AS avg_sugar,
MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2
GROUP BY category;") data2 |>
group_by(category) |>
summarise(avg_sugar=mean(sugar,na.rm=T),
max_sugar= max(sugar,na.rm=T),
min_sugar= min(sugar,na.rm=T))sqldf::sqldf("SELECT category,(sugar-protein) AS diff_sugar_protein
FROM data2")mutate()data2 |>
mutate(diff_sugar_protein=sugar-protein) |>
select(category,diff_sugar_protein)sqldf::sqldf("SELECT recipe,calories
FROM data2
WHERE calories >
(SELECT AVG(calories) FROM data1)")data2 |>
select(recipe,sugar,calories) |>
filter(calories>(data1 |>
summarise(mean_calories=mean(calories,na.rm=TRUE)) |>
pull(mean_calories)))x and yxyx
or yINNER JOIN AloneWhen talking about inner joins, we are only going to keep an observation if it is found in all of the tables we’re combining.
sqldf::sqldf("SELECT data2.category,data2.recipe,data2.calories,
carbohydrate,protein,sugar
FROM data2
INNER JOIN data1 ON data2.recipe=data1.recipe")dplyr::inner_join(data1,data2,by=join_by(recipe)) |>
select(recipe,carbohydrate,sugar,protein,ends_with(".x"))# read in example csv files (join_df1 and join_df2)
join_df1 <- read.csv("join_df1.csv") |>
rename(A=`ï..A`)
join_df2 <- read.csv("join_df2.csv")|>
rename(A=`ï..A`)
join<-dbConnect(RSQLite::SQLite(), ":memory:")
copy_to(join,join_df1)
copy_to(join,join_df2)dbGetQuery(join,"SELECT * FROM join_df1")dbGetQuery(join,"SELECT * FROM join_df2")For a left join, all rows in the first table specified will be included in the output. Any row in the second table that is not in the first table will not be included.
sqldf::sqldf("SELECT *
FROM join_df1
LEFT JOIN join_df2
USING(A)")Right Join is similar to what we just discussed; however, in the output from a right join, all rows in the final table specified are included in the output. NAs will be included for any observations found in the last specified table but not in the other tables.
sqldf::sqldf("SELECT *
FROM join_df1
RIGHT JOIN join_df2
USING(A)")dplyr::left_join(join_df1, join_df2,by = join_by(A))dplyr::right_join(join_df1, join_df2,by = join_by(A))Finally, a full join will take every observation from every table and include it in the output.
sqldf::sqldf("SELECT * FROM join_df1
FULL JOIN join_df2
USING(A)")dplyr::full_join(join_df1, join_df2,by = join_by(A))