DROP SCHEMA IF EXISTS popular_movies CASCADE;
CREATE SCHEMA popular_movies;Week2A SQL and R
Approach
pgSQL
I know I want to start with pgSQL. I’ve always try reasons to use it and this is a good excuse. I already have a server set up, so creating a schema for it is simple. I’ll use it to create three different tables:
- people
- user_id
- INT
- SERIAL
- PRIMARY KEY
- name
- TEXT
- user_id
- movies
- movie_id
- INT
- SERIAL
- PRIMARY KEY
- title
- TEXT
- movie_id
- ratings
- rating_id
- INT
- SERIAL
- PRIMARY_KEY
- person_id
- INT
- SERIAL
- FOREIGN_KEY
- movie_id
- INT
- SERIAL
- FOREIGN_KEY
- rating
- INT
- rated_at
- DATE
- rating_id
I’ll combine all the data into a singular view by using the Primary_Keys and Foreign_Keys. I’ll create a raw view which will include all the data and another view with the null rows intact, maybe I can use R to get rid of those via a filter.
I’m not going to get into much of the data-entry angle. Like at work we do an ETL of data via REST to then convert it into a table, then SELECT into a view. So, I’ll just use SQL INSERT to add in the data, while I’m building out the table. This should be enough for the assignment.
R
I’m going to use the RPostgres package instead of dbplyr. I heard that it’s one of the go-to packages for using postgres within R so I’d like to use it. I’ll use it with DBI and perform the following:
- Connect to SQL database
- Select both views as variables using RPostgres
- Return each, but clean up the raw dataset to remove null entries
- Create a summarization table
- Create a chart
Codebase
pgSQL
Schema Creation
Table Creation
CREATE TABLE popular_movies.movies(
movie_id SERIAL PRIMARY KEY,
title TEXT NOT NULL,
release_year INT NOT NULL
);
CREATE TABLE popular_movies.people(
person_id SERIAL PRIMARY KEY,
name TEXT NOT NULL UNIQUE
);
CREATE TABLE popular_movies.ratings(
rating_id SERIAL PRIMARY KEY,
person_id INT NOT NULL,
movie_id INT NOT NULL,
-- Assume that everyone answering the survey has watched every movie
rating INT CHECK (rating BETWEEN 1 AND 5),
rated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
CONSTRAINT unique_person_movie UNIQUE (person_id, movie_id),
CONSTRAINT fk_person
FOREIGN KEY (person_id)
REFERENCES popular_movies.people(person_id),
CONSTRAINT fk_movie
FOREIGN KEY (movie_id)
REFERENCES popular_movies.movies(movie_id)
);Data Creation
TRUNCATE popular_movies.ratings, popular_movies.people, popular_movies.movies
RESTART IDENTITY;
INSERT INTO popular_movies.movies (title, release_year) VALUES
('One Battle After Another', 2025),
('Begonia', 2025),
('Wicked for Good', 2025),
('The Materialist', 2025),
('Sinners', 2025);
INSERT INTO popular_movies.people (name) VALUES
('Alex'),
('Bri'),
('Chen'),
('Devi'),
('Eli'),
('Fran');
INSERT INTO popular_movies.ratings (person_id, movie_id, rating) VALUES
(1,1,5),
(1,2,4),
(1,3,4),
(1,4,3),
(1,5,NULL),
(2,1,4),
(2,2,3),
(2,3,5),
(2,4,NULL),
(2,5,4),
(3,1,NULL),
(3,2,5),
(3,3,4),
(3,4,3),
(3,5,4),
(4,1,3),
(4,2,NULL),
(4,3,3),
(4,4,4),
(4,5,5),
(5,1,4),
(5,2,3),
(5,3,NULL),
(5,4,5),
(5,5,3),
(6,1,5),
(6,2,3),
(6,3,4),
(6,4,4),
(6,5,NULL);Create View
-- raw retains nulls
CREATE OR REPLACE VIEW popular_movies.v_ratings_raw AS
SELECT
p.name,
m.title,
m.release_year,
r.rating,
r.rated_at
FROM popular_movies.ratings r
JOIN popular_movies.people p ON p.person_id = r.person_id
JOIN popular_movies.movies m ON m.movie_id = r.movie_id;
-- clean filters rows where ratings = NULL
CREATE OR REPLACE VIEW popular_movies.v_ratings_clean AS
SELECT *
FROM popular_movies.v_ratings_raw
WHERE rating IS NOT NULL;R
Loading Packages
library(DBI)
library(RPostgres)
library(tidyverse)Creating connection variable
con <- dbConnect(
RPostgres::Postgres(),
dbname = "learning",
host = "localhost",
user = 'postgres',
password = 'admin'
)Connection Test
dbGetQuery(con, "SELECT version();") version
1 PostgreSQL 17.6 on x86_64-windows, compiled by msvc-19.44.35213, 64-bit
View Schema Contents
dbGetQuery(con,"
SELECT table_name
FROM information_schema.views
WHERE table_schema = 'popular_movies'
") table_name
1 v_ratings_raw
2 v_ratings_clean
Query
query <- "SELECT * FROM popular_movies.v_ratings_raw"
df_view_raw <- dbGetQuery(con, query) |>
as_tibble()
query <- "SELECT * FROM popular_movies.v_ratings_clean"
df_view_clean <- dbGetQuery(con, query) |>
as_tibble()
df_view_raw# A tibble: 30 × 5
name title release_year rating rated_at
<chr> <chr> <int> <int> <dttm>
1 Alex One Battle After Another 2025 5 2026-02-04 13:57:13
2 Alex Begonia 2025 4 2026-02-04 13:57:13
3 Alex Wicked for Good 2025 4 2026-02-04 13:57:13
4 Alex The Materialist 2025 3 2026-02-04 13:57:13
5 Alex Sinners 2025 NA 2026-02-04 13:57:13
6 Bri One Battle After Another 2025 4 2026-02-04 13:57:13
7 Bri Begonia 2025 3 2026-02-04 13:57:13
8 Bri Wicked for Good 2025 5 2026-02-04 13:57:13
9 Bri The Materialist 2025 NA 2026-02-04 13:57:13
10 Bri Sinners 2025 4 2026-02-04 13:57:13
# ℹ 20 more rows
df_view_clean# A tibble: 24 × 5
name title release_year rating rated_at
<chr> <chr> <int> <int> <dttm>
1 Alex One Battle After Another 2025 5 2026-02-04 13:57:13
2 Alex Begonia 2025 4 2026-02-04 13:57:13
3 Alex Wicked for Good 2025 4 2026-02-04 13:57:13
4 Alex The Materialist 2025 3 2026-02-04 13:57:13
5 Bri One Battle After Another 2025 4 2026-02-04 13:57:13
6 Bri Begonia 2025 3 2026-02-04 13:57:13
7 Bri Wicked for Good 2025 5 2026-02-04 13:57:13
8 Bri Sinners 2025 4 2026-02-04 13:57:13
9 Chen Begonia 2025 5 2026-02-04 13:57:13
10 Chen Wicked for Good 2025 4 2026-02-04 13:57:13
# ℹ 14 more rows
Viewing NA values
df_view_raw |>
filter(is.na(rating))# A tibble: 6 × 5
name title release_year rating rated_at
<chr> <chr> <int> <int> <dttm>
1 Alex Sinners 2025 NA 2026-02-04 13:57:13
2 Bri The Materialist 2025 NA 2026-02-04 13:57:13
3 Chen One Battle After Another 2025 NA 2026-02-04 13:57:13
4 Devi Begonia 2025 NA 2026-02-04 13:57:13
5 Eli Wicked for Good 2025 NA 2026-02-04 13:57:13
6 Fran Sinners 2025 NA 2026-02-04 13:57:13
Practice, pretend v_ratings_clean doesn’t exist
df_view_clean <- df_view_raw |>
filter(!is.na(rating))
df_view_clean# A tibble: 24 × 5
name title release_year rating rated_at
<chr> <chr> <int> <int> <dttm>
1 Alex One Battle After Another 2025 5 2026-02-04 13:57:13
2 Alex Begonia 2025 4 2026-02-04 13:57:13
3 Alex Wicked for Good 2025 4 2026-02-04 13:57:13
4 Alex The Materialist 2025 3 2026-02-04 13:57:13
5 Bri One Battle After Another 2025 4 2026-02-04 13:57:13
6 Bri Begonia 2025 3 2026-02-04 13:57:13
7 Bri Wicked for Good 2025 5 2026-02-04 13:57:13
8 Bri Sinners 2025 4 2026-02-04 13:57:13
9 Chen Begonia 2025 5 2026-02-04 13:57:13
10 Chen Wicked for Good 2025 4 2026-02-04 13:57:13
# ℹ 14 more rows
Analysis
df_view_clean |>
arrange(desc(rating))# A tibble: 24 × 5
name title release_year rating rated_at
<chr> <chr> <int> <int> <dttm>
1 Alex One Battle After Another 2025 5 2026-02-04 13:57:13
2 Bri Wicked for Good 2025 5 2026-02-04 13:57:13
3 Chen Begonia 2025 5 2026-02-04 13:57:13
4 Devi Sinners 2025 5 2026-02-04 13:57:13
5 Eli The Materialist 2025 5 2026-02-04 13:57:13
6 Fran One Battle After Another 2025 5 2026-02-04 13:57:13
7 Alex Begonia 2025 4 2026-02-04 13:57:13
8 Alex Wicked for Good 2025 4 2026-02-04 13:57:13
9 Bri One Battle After Another 2025 4 2026-02-04 13:57:13
10 Bri Sinners 2025 4 2026-02-04 13:57:13
# ℹ 14 more rows
titles <- df_view_clean |>
distinct(title)
df <- df_view_clean |>
group_by(title) |>
summarize(
avg_rating = mean(rating, na.rm = TRUE),
vote_count = sum(!is.na(rating)),
low_ratings = sum(rating <= 2, na.rm = TRUE),
medium_ratings = sum(rating == 3, na.rm = TRUE),
high_ratings = sum(rating >= 4, na.rm = TRUE),
max_rating = sum(rating == 5, na.rm = TRUE),
min_rating = sum(rating == 1, na.rm = TRUE)
) |>
arrange(desc(avg_rating), desc(vote_count)) |>
mutate(ranking = row_number()) |>
relocate(ranking, .after = title)
df# A tibble: 5 × 9
title ranking avg_rating vote_count low_ratings medium_ratings high_ratings
<chr> <int> <dbl> <int> <int> <int> <int>
1 One Bat… 1 4.2 5 0 1 4
2 Wicked … 2 4 5 0 1 4
3 Sinners 3 4 4 0 1 3
4 The Mat… 4 3.8 5 0 2 3
5 Begonia 5 3.6 5 0 3 2
# ℹ 2 more variables: max_rating <int>, min_rating <int>
df_report <- df |>
select(title, ranking, avg_rating)
df_report# A tibble: 5 × 3
title ranking avg_rating
<chr> <int> <dbl>
1 One Battle After Another 1 4.2
2 Wicked for Good 2 4
3 Sinners 3 4
4 The Materialist 4 3.8
5 Begonia 5 3.6
df_plot <- df_report |>
ggplot(aes(x = reorder(title, -avg_rating), y = avg_rating, fill = title)) +
geom_col() +
scale_y_continuous(limits = c(0, 5)) +
geom_text(aes(label = avg_rating), nudge_y = 0.2, size = 3) +
labs(
title = "Average Movie Ratings",
x = "Movies",
y = "Average Rating") +
theme_minimal() +
theme(plot.title = element_text(face = 'bold', size = 16, hjust = 0.5),
axis.title.x = element_text(margin = margin(t = 20)),
axis.title.y = element_text(margin = margin(r = 20))
)
df_plot