In this tutorial i will show you it’s easy to learn R’s dplyr together with SQL. I show some of the most similar ways of handling data in R and SQL
learning to code in more than one software at once is no easy pizzy but it’s much easy if the two softwares have some form of resemblence in them . In this tutorial i will show you how to connect to a database using Rmarkdown and Rstudio as well as query this database using SQL (Within an rmarkdown document)
library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
Next up we load our data for the tutorial , I have used a dataset
from a project i did from datacamp to be
certified as data scientist associate
. I then sample only
a few rows and create two dataframes which i will turn into tables later
on.
## read in the dataset
<- readr::read_csv("recipe_site_traffic_2212.csv")
df
## sample 100 observations and select first 3 variables
set.seed(1123)
<- df |>
data1sample_n(size=25) |>
select(1,2,3,6)
## sample 100 observations and select subsequent 3 variables(including the first(ID))
##
set.seed(1123)
<- df |>
data2sample_n(size=25) |>
select(1,2,4,5,6)
<-dbConnect(RSQLite::SQLite(), ":memory:")
concopy_to(con,data1)
copy_to(con,data2)
SELECT name FROM people;
*
rather than typing all the columnsdbGetQuery(con,'SELECT * FROM data2')
dbGetQuery(con,'SELECT recipe,sugar FROM data2')
|> select(everything()) data2
|>
data2 select(recipe,sugar)
WHERE
in SQLWHERE is a filtering clause
In SQL, the WHERE keyword allows you to filter based on both text and numeric values in a table. There are a few different comparison operators you can use:
= equal
<> not equal
< less than
> greater than
<= less than or equal to
>= greater than or equal to
You can build up your WHERE queries by combining multiple conditions with the AND keyword.
dbGetQuery(con,"SELECT *
FROM data2
WHERE category='Potato'")
|>
data2 filter(category=="Potato")
IN
and WHERE
in SQLdbGetQuery(con,"SELECT *
FROM data2
WHERE category IN ('Chicken','Chicken Breast')")
%in%
|>
data2 filter(category %in% c("Chicken","Chicken Breast"))
COUNT
and AS
alias in SQLAS
to rename default nameCOUNT(*)
tells you how many rows are in a tabledbGetQuery(con,"SELECT COUNT(*) AS n FROM data2")
|> count() data2
dbGetQuery(con,"SELECT COUNT(category) AS n_potato
FROM data2
WHERE category IN ('Chicken Breast','Chicken')")
|>
data2 filter(category %in% c("Chicken","Chicken Breast")) |>
summarise(n_potato=n())
dbExecute
|>
data2 ::tabyl(category) janitor
Breakfast
to
Breakfast meal
REPLACE
dbExecute(con,"UPDATE data2
SET category= REPLACE(category,'Breakfast','Breakfast meal')
WHERE category='Breakfast'")
#> [1] 4
dbGetQuery(con,"SELECT category, COUNT(category) as n_per_category
FROM data2
GROUP BY category")
Breakfast
has updated to
Breakfast meal
stringr::str_replace
to replace elements in a
string variable|>
data2 mutate(category=stringr::str_replace(category,"Breakfast","Breakfast meal")) |>
group_by(category) |>
summarise(n_per_category=n())
+n SQL, NULL represents a missing or unknown value. You can check for
NULL values using the expression IS NULL + Use IS NULL
AND
IS NOT NULL
dbGetQuery(con,"SELECT *
FROM data2
WHERE Protein IS NULL")
dbGetQuery(con,"SELECT *
FROM data2
WHERE Protein IS NOT NULL")
is.na()
and !is.na
|>
data2 filter(is.na(protein))
|>
data2 filter(!is.na(protein))
distinct
items in SQLdbGetQuery(con,"SELECT COUNT(DISTINCT category) AS unique_categories
FROM data2")
distinct
in R|>
data2 distinct(category) |>
summarise(unique_categories=n())
dbGetQuery(con,"SELECT *
FROM data2 WHERE sugar > 1 AND sugar < 5
AND category='Breakfast meal'")
BETWEEN
and
AND
dbGetQuery(con,"SELECT *
FROM data2
WHERE sugar BETWEEN 1 AND 5
AND category='Breakfast meal'")
|>
data2 filter(between(sugar,1,5) & category=="Breakfast")
LIKE
The % wildcard will match zero, one, or many characters in text
The _ wildcard will match a single character
dbGetQuery(con,"SELECT *
FROM data2
WHERE category LIKE 'Chicken%'")
str_like
|>
data2 filter(str_like(category,"Chicken%"))
::sqldf("SELECT AVG(sugar) AS avg_sugar,
sqldf MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2;")
|>
data2 summarise(avg_sugar=mean(sugar,na.rm=T),
max_sugar= max(sugar,na.rm=T),
min_sugar= min(sugar,na.rm=T))
::sqldf("SELECT category,AVG(sugar) AS avg_sugar,
sqldf MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2
GROUP BY category;")
|>
data2 group_by(category) |>
summarise(avg_sugar=mean(sugar,na.rm=T),
max_sugar= max(sugar,na.rm=T),
min_sugar= min(sugar,na.rm=T))
::sqldf("SELECT category,(sugar-protein) AS diff_sugar_protein
sqldf FROM data2")
mutate()
|>
data2 mutate(diff_sugar_protein=sugar-protein) |>
select(category,diff_sugar_protein)
::sqldf("SELECT recipe,calories
sqldf FROM data2
WHERE calories >
(SELECT AVG(calories) FROM data1)")
|>
data2 select(recipe,sugar,calories) |>
filter(calories>(data1 |>
summarise(mean_calories=mean(calories,na.rm=TRUE)) |>
pull(mean_calories)))
x
and y
x
y
x
or y
INNER JOIN
AloneWhen talking about inner joins, we are only going to keep an observation if it is found in all of the tables we’re combining.
::sqldf("SELECT data2.category,data2.recipe,data2.calories,
sqldf carbohydrate,protein,sugar
FROM data2
INNER JOIN data1 ON data2.recipe=data1.recipe")
::inner_join(data1,data2,by=join_by(recipe)) |>
dplyrselect(recipe,carbohydrate,sugar,protein,ends_with(".x"))
# read in example csv files (join_df1 and join_df2)
<- read.csv("join_df1.csv") |>
join_df1 rename(A=`ï..A`)
<- read.csv("join_df2.csv")|>
join_df2 rename(A=`ï..A`)
<-dbConnect(RSQLite::SQLite(), ":memory:")
joincopy_to(join,join_df1)
copy_to(join,join_df2)
dbGetQuery(join,"SELECT * FROM join_df1")
dbGetQuery(join,"SELECT * FROM join_df2")
For a left join, all rows in the first table specified will be included in the output. Any row in the second table that is not in the first table will not be included.
::sqldf("SELECT *
sqldf FROM join_df1
LEFT JOIN join_df2
USING(A)")
Right Join is similar to what we just discussed; however, in the output from a right join, all rows in the final table specified are included in the output. NAs will be included for any observations found in the last specified table but not in the other tables.
::sqldf("SELECT *
sqldf FROM join_df1
RIGHT JOIN join_df2
USING(A)")
::left_join(join_df1, join_df2,by = join_by(A)) dplyr
::right_join(join_df1, join_df2,by = join_by(A)) dplyr
Finally, a full join will take every observation from every table and include it in the output.
::sqldf("SELECT * FROM join_df1
sqldf FULL JOIN join_df2
USING(A)")
::full_join(join_df1, join_df2,by = join_by(A)) dplyr