library(tidyverse)
library(odbc)
library(DBI)
library(RSQLite)
## read in the dataset
<- readr::read_csv("recipe_site_traffic_2212.csv")
df
## sample 100 observations and select first 3 variables
set.seed(1123)
<- df |>
data1sample_n(size=25) |>
select(1,2,3,6)
## sample 100 observations and select subsequent 3 variables(including the first(ID))
##
set.seed(1123)
<- df |>
data2sample_n(size=25) |>
select(1,2,4,5,6)
<-dbConnect(RSQLite::SQLite(), ":memory:")
concopy_to(con,data1)
copy_to(con,data2)
WHERE
in SQLWHERE is a filtering clause
In SQL, the WHERE keyword allows you to filter based on both text and numeric values in a table. There are a few different comparison operators you can use:
= equal
<> not equal
< less than
> greater than
<= less than or equal to
>= greater than or equal to
You can build up your WHERE queries by combining multiple conditions with the AND keyword.
SELECT *
FROM data2
WHERE category='Potato'
recipe | calories | sugar | protein | category |
---|---|---|---|---|
624 | 236.62 | 0.81 | 9.07 | Potato |
IN
and WHERE
in SQLSELECT *
FROM data2
WHERE category IN ('Chicken','Chicken Breast')
recipe | calories | sugar | protein | category |
---|---|---|---|---|
609 | NA | NA | NA | Chicken Breast |
671 | 481.88 | 1.34 | 51.90 | Chicken |
683 | 147.24 | 0.94 | 54.00 | Chicken |
419 | 1830.28 | 1.83 | 44.74 | Chicken Breast |
117 | NA | NA | NA | Chicken Breast |
COUNT
and AS
alias in SQLAS
to rename default nameCOUNT(*)
tells you how many rows are in a tableSELECT COUNT(*) AS n
FROM data2
n |
---|
25 |
SELECT COUNT(category) AS n_potato
FROM data2
WHERE category IN ('Chicken Breast','Chicken')
n_potato |
---|
5 |
dbExecute
category <chr> | n <int> | percent <dbl> | ||
---|---|---|---|---|
Beverages | 1 | 0.04 | ||
Breakfast | 4 | 0.16 | ||
Chicken | 2 | 0.08 | ||
Chicken Breast | 3 | 0.12 | ||
Dessert | 3 | 0.12 | ||
Meat | 4 | 0.16 | ||
One Dish Meal | 3 | 0.12 | ||
Pork | 1 | 0.04 | ||
Potato | 1 | 0.04 | ||
Vegetable | 3 | 0.12 |
Breakfast
to
Breakfast meal
REPLACE
UPDATE data2
SET category= REPLACE(category,'Breakfast','Breakfast meal')
WHERE category='Breakfast'
SELECT category, COUNT(category) as n_per_category
FROM data2
GROUP BY category
category | n_per_category |
---|---|
Beverages | 1 |
Breakfast meal | 4 |
Chicken | 2 |
Chicken Breast | 3 |
Dessert | 3 |
Meat | 4 |
One Dish Meal | 3 |
Pork | 1 |
Potato | 1 |
Vegetable | 3 |
Breakfast
has updated to
Breakfast meal
NULL
represents a missing or unknown value. You
can check for NULL values using the expression IS NULLIS NULL
AND IS NOT NULL
SELECT *
FROM data2
WHERE Protein IS NULL
recipe | calories | sugar | protein | category |
---|---|---|---|---|
912 | NA | NA | NA | Dessert |
609 | NA | NA | NA | Chicken Breast |
117 | NA | NA | NA | Chicken Breast |
SELECT *
FROM data2
WHERE Protein IS NOT NULL
recipe | calories | sugar | protein | category |
---|---|---|---|---|
624 | 236.62 | 0.81 | 9.07 | Potato |
102 | 198.98 | 0.39 | 39.12 | Vegetable |
427 | 187.41 | 86.97 | 4.49 | Dessert |
324 | 49.50 | 4.69 | 53.33 | Breakfast meal |
380 | 75.89 | 8.18 | 17.64 | Meat |
671 | 481.88 | 1.34 | 51.90 | Chicken |
683 | 147.24 | 0.94 | 54.00 | Chicken |
208 | 20.98 | 9.80 | 0.82 | Beverages |
633 | 105.95 | 0.07 | 7.55 | Vegetable |
859 | 388.44 | 4.62 | 13.93 | Pork |
distinct
items in SQLSELECT COUNT(DISTINCT category) AS unique_categories
FROM data2
unique_categories |
---|
10 |
SELECT *
FROM data2 WHERE sugar > 1 AND sugar < 5
AND category='Breakfast meal'
recipe | calories | sugar | protein | category |
---|---|---|---|---|
324 | 49.5 | 4.69 | 53.33 | Breakfast meal |
598 | 212.4 | 2.25 | 23.78 | Breakfast meal |
526 | 93.5 | 2.57 | 23.68 | Breakfast meal |
BETWEEN
and
AND
SELECT *
FROM data2
WHERE sugar BETWEEN 1 AND 5
AND category='Breakfast meal'
recipe | calories | sugar | protein | category |
---|---|---|---|---|
324 | 49.5 | 4.69 | 53.33 | Breakfast meal |
598 | 212.4 | 2.25 | 23.78 | Breakfast meal |
526 | 93.5 | 2.57 | 23.68 | Breakfast meal |
LIKE
LIKE
operator can be used in a WHERE clause to
search for a pattern in a column.The
%
wildcard will match zero, one, or many characters in text
The
_
wildcard will match a single character
SELECT *
FROM data2
WHERE category LIKE 'Chicken%'
recipe | calories | sugar | protein | category |
---|---|---|---|---|
609 | NA | NA | NA | Chicken Breast |
671 | 481.88 | 1.34 | 51.90 | Chicken |
683 | 147.24 | 0.94 | 54.00 | Chicken |
419 | 1830.28 | 1.83 | 44.74 | Chicken Breast |
117 | NA | NA | NA | Chicken Breast |
SELECT AVG(sugar) AS avg_sugar,
MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar,
VARIANCE(sugar) AS variance,
AS stnd_deviation
STDEV(sugar) FROM data2;
avg_sugar | max_sugar | min_sugar | variance | stnd_deviation |
---|---|---|---|---|
11.24864 | 86.97 | 0.07 | 426.5449 | 20.65296 |
SELECT category,AVG(sugar) AS avg_sugar,
MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2
GROUP BY category;
category | avg_sugar | max_sugar | min_sugar |
---|---|---|---|
Beverages | 9.8000000 | 9.80 | 9.80 |
Breakfast meal | 5.1600000 | 11.13 | 2.25 |
Chicken | 1.1400000 | 1.34 | 0.94 |
Chicken Breast | 1.8300000 | 1.83 | 1.83 |
Dessert | 69.5700000 | 86.97 | 52.17 |
Meat | 13.5525000 | 30.06 | 5.21 |
One Dish Meal | 3.9066667 | 6.09 | 1.46 |
Pork | 4.6200000 | 4.62 | 4.62 |
Potato | 0.8100000 | 0.81 | 0.81 |
Vegetable | 0.8066667 | 1.96 | 0.07 |
we can order the data by a certain column using
ORDER BY
clause
SELECT category,AVG(sugar) AS avg_sugar,
MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2
GROUP BY category
ORDER BY avg_sugar;
category | avg_sugar | max_sugar | min_sugar |
---|---|---|---|
Vegetable | 0.8066667 | 1.96 | 0.07 |
Potato | 0.8100000 | 0.81 | 0.81 |
Chicken | 1.1400000 | 1.34 | 0.94 |
Chicken Breast | 1.8300000 | 1.83 | 1.83 |
One Dish Meal | 3.9066667 | 6.09 | 1.46 |
Pork | 4.6200000 | 4.62 | 4.62 |
Breakfast meal | 5.1600000 | 11.13 | 2.25 |
Beverages | 9.8000000 | 9.80 | 9.80 |
Meat | 13.5525000 | 30.06 | 5.21 |
Dessert | 69.5700000 | 86.97 | 52.17 |
the above output shows results arranged in ascending order of
avg_sugar
, if we needed to begin with the largest descending we could useORDER BY...DESC
SELECT category,AVG(sugar) AS avg_sugar,
MAX(sugar) AS max_sugar,
MIN(sugar) AS min_sugar
FROM data2
GROUP BY category
ORDER BY avg_sugar DESC;
category | avg_sugar | max_sugar | min_sugar |
---|---|---|---|
Dessert | 69.5700000 | 86.97 | 52.17 |
Meat | 13.5525000 | 30.06 | 5.21 |
Beverages | 9.8000000 | 9.80 | 9.80 |
Breakfast meal | 5.1600000 | 11.13 | 2.25 |
Pork | 4.6200000 | 4.62 | 4.62 |
One Dish Meal | 3.9066667 | 6.09 | 1.46 |
Chicken Breast | 1.8300000 | 1.83 | 1.83 |
Chicken | 1.1400000 | 1.34 | 0.94 |
Potato | 0.8100000 | 0.81 | 0.81 |
Vegetable | 0.8066667 | 1.96 | 0.07 |
Sometimes you may want to change a column to all upper cases
SELECT recipe,category , UPPER(category) AS upper_cat
FROM data2;
recipe | category | upper_cat |
---|---|---|
624 | Potato | POTATO |
102 | Vegetable | VEGETABLE |
427 | Dessert | DESSERT |
324 | Breakfast meal | BREAKFAST MEAL |
912 | Dessert | DESSERT |
380 | Meat | MEAT |
609 | Chicken Breast | CHICKEN BREAST |
671 | Chicken | CHICKEN |
683 | Chicken | CHICKEN |
208 | Beverages | BEVERAGES |
+, -, *, and /
.SELECT category,(sugar-protein) AS diff_sugar_protein
FROM data2;
category | diff_sugar_protein |
---|---|
Potato | -8.26 |
Vegetable | -38.73 |
Dessert | 82.48 |
Breakfast meal | -48.64 |
Dessert | NA |
Meat | -9.46 |
Chicken Breast | NA |
Chicken | -50.56 |
Chicken | -53.06 |
Beverages | 8.98 |
CASE
CASE WHEN
to categorise or group data as
followsSELECT
….CASE WHEN
condition THEN
categoryWHEN
condition THEN
categoryELSE
some_categoryEND AS
variable nameSELECT recipe , sugar,
CASE WHEN sugar BETWEEN 0 AND 50 THEN '0-50'
WHEN sugar BETWEEN 51 AND 100 THEN '51-100'
WHEN sugar BETWEEN 101 AND 200 THEN '101-200'
WHEN sugar IS NULL THEN 'missing'
ELSE '>200'
END AS sugar_category
FROM data2;
recipe | sugar | sugar_category |
---|---|---|
624 | 0.81 | 0-50 |
102 | 0.39 | 0-50 |
427 | 86.97 | 51-100 |
324 | 4.69 | 0-50 |
912 | NA | missing |
380 | 8.18 | 0-50 |
609 | NA | missing |
671 | 1.34 | 0-50 |
683 | 0.94 | 0-50 |
208 | 9.80 | 0-50 |
we can letter perform aggregations on the new column for instance get the count per each category
SELECT COUNT(*) AS count_per_category,
CASE WHEN sugar BETWEEN 0 AND 50 THEN '0-50'
WHEN sugar BETWEEN 51 AND 100 THEN '51-100'
WHEN sugar BETWEEN 101 AND 200 THEN '101-200'
WHEN sugar IS NULL THEN 'missing'
ELSE '>200'
END AS sugar_category
FROM data2
GROUP BY sugar_category;
count_per_category | sugar_category |
---|---|
20 | 0-50 |
2 | 51-100 |
3 | missing |
SELECT recipe,calories
FROM data2
WHERE calories >
SELECT AVG(calories) FROM data1); (
recipe | calories |
---|---|
671 | 481.88 |
859 | 388.44 |
171 | 431.28 |
282 | 409.99 |
91 | 388.37 |
868 | 307.36 |
885 | 504.09 |
419 | 1830.28 |
639 | 321.95 |
Sometimes you want to understand how a value varies across groups. For example, how does the maximum value per group vary across groups?
To find out, first summarize by group, and then compute summary statistics of the group results. One way to do this is to compute group values in a subquery, and then summarize the results of the subquery.
what is the standard deviation across meal cateries in the maximum amount of sugar? What about the mean, min, and max of the maximums as well?
Start by writing a subquery to compute the
max()
per categoy; alias the subquery result as sugar_max. Then compute the standard deviation of sugar_max withstddev()
. Compute themin()
,max()
, andavg()
of sugar_max too.
SELECT AVG(sugar_max) AS avg_sugar,
MAX(sugar_max) AS max_sugar,
MIN(sugar_max) AS min_sugar,
VARIANCE(sugar_max) AS variance,
AS stnd_deviation
STDEV(sugar_max) FROM (SELECT MAX(sugar) AS sugar_max
FROM data2
-- Compute max by...
GROUP BY category) AS max_results
avg_sugar | max_sugar | min_sugar | variance | stnd_deviation |
---|---|---|---|---|
15.461 | 86.97 | 0.81 | 707.2168 | 26.59355 |