Week2A SQL and R

Approach

pgSQL

I know I want to start with pgSQL. I’ve always try reasons to use it and this is a good excuse. I already have a server set up, so creating a schema for it is simple. I’ll use it to create three different tables:

  • people
    • user_id
      • INT
      • SERIAL
      • PRIMARY KEY
    • name
      • TEXT
  • movies
    • movie_id
      • INT
      • SERIAL
      • PRIMARY KEY
    • title
      • TEXT
  • ratings
    • rating_id
      • INT
      • SERIAL
      • PRIMARY_KEY
    • person_id
      • INT
      • SERIAL
      • FOREIGN_KEY
    • movie_id
      • INT
      • SERIAL
      • FOREIGN_KEY
    • rating
      • INT
    • rated_at
      • DATE

I’ll combine all the data into a singular view by using the Primary_Keys and Foreign_Keys. I’ll create a raw view which will include all the data and another view with the null rows intact, maybe I can use R to get rid of those via a filter.

I’m not going to get into much of the data-entry angle. Like at work we do an ETL of data via REST to then convert it into a table, then SELECT into a view. So, I’ll just use SQL INSERT to add in the data, while I’m building out the table. This should be enough for the assignment.

R

I’m going to use the RPostgres package instead of dbplyr. I heard that it’s one of the go-to packages for using postgres within R so I’d like to use it. I’ll use it with DBI and perform the following:

  • Connect to SQL database
  • Select both views as variables using RPostgres
  • Return each, but clean up the raw dataset to remove null entries
  • Create a summarization table
  • Create a chart

Codebase

pgSQL

Schema Creation


DROP SCHEMA IF EXISTS popular_movies CASCADE;
CREATE SCHEMA popular_movies;

Table Creation


CREATE TABLE popular_movies.movies(
    movie_id SERIAL PRIMARY KEY,
    title TEXT NOT NULL,
    release_year INT NOT NULL
);

CREATE TABLE popular_movies.people(
    person_id SERIAL PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE popular_movies.ratings(
    rating_id SERIAL PRIMARY KEY,
    person_id INT NOT NULL,
    movie_id INT NOT NULL,
    -- Assume that everyone answering the survey has watched every movie
    rating INT CHECK (rating BETWEEN 1 AND 5),
    rated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,

    CONSTRAINT unique_person_movie UNIQUE (person_id, movie_id),
    
    CONSTRAINT fk_person
        FOREIGN KEY (person_id)
        REFERENCES popular_movies.people(person_id),
        
    CONSTRAINT fk_movie
        FOREIGN KEY (movie_id)
        REFERENCES popular_movies.movies(movie_id)
);

Data Creation


TRUNCATE popular_movies.ratings, popular_movies.people, popular_movies.movies
RESTART IDENTITY;

INSERT INTO popular_movies.movies (title, release_year) VALUES
('One Battle After Another', 2025),
('Begonia', 2025),
('Wicked for Good', 2025),
('The Materialist', 2025),
('Sinners', 2025);

INSERT INTO popular_movies.people (name) VALUES
('Alex'),
('Bri'),
('Chen'),
('Devi'),
('Eli'),
('Fran');

INSERT INTO popular_movies.ratings (person_id, movie_id, rating) VALUES

(1,1,5),
(1,2,4),
(1,3,4),
(1,4,3),
(1,5,NULL),
(2,1,4),
(2,2,3),
(2,3,5),
(2,4,NULL),
(2,5,4),
(3,1,NULL),
(3,2,5),
(3,3,4),
(3,4,3),
(3,5,4),
(4,1,3),
(4,2,NULL),
(4,3,3),
(4,4,4),
(4,5,5),
(5,1,4),
(5,2,3),
(5,3,NULL),
(5,4,5),
(5,5,3),
(6,1,5),
(6,2,3),
(6,3,4),
(6,4,4),
(6,5,NULL);

Create View


-- raw retains nulls
CREATE OR REPLACE VIEW popular_movies.v_ratings_raw AS
SELECT
  p.name,
  m.title,
  m.release_year,
  r.rating,
  r.rated_at
FROM popular_movies.ratings r
JOIN popular_movies.people p ON p.person_id = r.person_id
JOIN popular_movies.movies m ON m.movie_id = r.movie_id;

-- clean filters rows where ratings = NULL
CREATE OR REPLACE VIEW popular_movies.v_ratings_clean AS
SELECT *
FROM popular_movies.v_ratings_raw
WHERE rating IS NOT NULL;

R

Loading Packages

library(DBI)
library(RPostgres)
library(tidyverse)

Creating connection variable

con <- dbConnect(
  RPostgres::Postgres(),
  dbname = "learning",
  host = "localhost",
  user = 'postgres',
  password = 'admin'
)

Connection Test

dbGetQuery(con, "SELECT version();")
                                                                  version
1 PostgreSQL 17.6 on x86_64-windows, compiled by msvc-19.44.35213, 64-bit

View Schema Contents

dbGetQuery(con,"
          SELECT table_name
          FROM information_schema.views
          WHERE table_schema = 'popular_movies'
          ")
       table_name
1   v_ratings_raw
2 v_ratings_clean

Query

query <- "SELECT * FROM popular_movies.v_ratings_raw"
df_view_raw <- dbGetQuery(con, query) |>
  as_tibble()

query <- "SELECT * FROM popular_movies.v_ratings_clean"
df_view_clean <- dbGetQuery(con, query) |>
  as_tibble()

df_view_raw
# A tibble: 30 × 5
   name  title                    release_year rating rated_at           
   <chr> <chr>                           <int>  <int> <dttm>             
 1 Alex  One Battle After Another         2025      5 2026-02-04 13:57:13
 2 Alex  Begonia                          2025      4 2026-02-04 13:57:13
 3 Alex  Wicked for Good                  2025      4 2026-02-04 13:57:13
 4 Alex  The Materialist                  2025      3 2026-02-04 13:57:13
 5 Alex  Sinners                          2025     NA 2026-02-04 13:57:13
 6 Bri   One Battle After Another         2025      4 2026-02-04 13:57:13
 7 Bri   Begonia                          2025      3 2026-02-04 13:57:13
 8 Bri   Wicked for Good                  2025      5 2026-02-04 13:57:13
 9 Bri   The Materialist                  2025     NA 2026-02-04 13:57:13
10 Bri   Sinners                          2025      4 2026-02-04 13:57:13
# ℹ 20 more rows
df_view_clean
# A tibble: 24 × 5
   name  title                    release_year rating rated_at           
   <chr> <chr>                           <int>  <int> <dttm>             
 1 Alex  One Battle After Another         2025      5 2026-02-04 13:57:13
 2 Alex  Begonia                          2025      4 2026-02-04 13:57:13
 3 Alex  Wicked for Good                  2025      4 2026-02-04 13:57:13
 4 Alex  The Materialist                  2025      3 2026-02-04 13:57:13
 5 Bri   One Battle After Another         2025      4 2026-02-04 13:57:13
 6 Bri   Begonia                          2025      3 2026-02-04 13:57:13
 7 Bri   Wicked for Good                  2025      5 2026-02-04 13:57:13
 8 Bri   Sinners                          2025      4 2026-02-04 13:57:13
 9 Chen  Begonia                          2025      5 2026-02-04 13:57:13
10 Chen  Wicked for Good                  2025      4 2026-02-04 13:57:13
# ℹ 14 more rows

Viewing NA values

df_view_raw |>
  filter(is.na(rating))
# A tibble: 6 × 5
  name  title                    release_year rating rated_at           
  <chr> <chr>                           <int>  <int> <dttm>             
1 Alex  Sinners                          2025     NA 2026-02-04 13:57:13
2 Bri   The Materialist                  2025     NA 2026-02-04 13:57:13
3 Chen  One Battle After Another         2025     NA 2026-02-04 13:57:13
4 Devi  Begonia                          2025     NA 2026-02-04 13:57:13
5 Eli   Wicked for Good                  2025     NA 2026-02-04 13:57:13
6 Fran  Sinners                          2025     NA 2026-02-04 13:57:13

Practice, pretend v_ratings_clean doesn’t exist

df_view_clean <- df_view_raw |>
  filter(!is.na(rating))

df_view_clean
# A tibble: 24 × 5
   name  title                    release_year rating rated_at           
   <chr> <chr>                           <int>  <int> <dttm>             
 1 Alex  One Battle After Another         2025      5 2026-02-04 13:57:13
 2 Alex  Begonia                          2025      4 2026-02-04 13:57:13
 3 Alex  Wicked for Good                  2025      4 2026-02-04 13:57:13
 4 Alex  The Materialist                  2025      3 2026-02-04 13:57:13
 5 Bri   One Battle After Another         2025      4 2026-02-04 13:57:13
 6 Bri   Begonia                          2025      3 2026-02-04 13:57:13
 7 Bri   Wicked for Good                  2025      5 2026-02-04 13:57:13
 8 Bri   Sinners                          2025      4 2026-02-04 13:57:13
 9 Chen  Begonia                          2025      5 2026-02-04 13:57:13
10 Chen  Wicked for Good                  2025      4 2026-02-04 13:57:13
# ℹ 14 more rows

Analysis

df_view_clean |>
  arrange(desc(rating))
# A tibble: 24 × 5
   name  title                    release_year rating rated_at           
   <chr> <chr>                           <int>  <int> <dttm>             
 1 Alex  One Battle After Another         2025      5 2026-02-04 13:57:13
 2 Bri   Wicked for Good                  2025      5 2026-02-04 13:57:13
 3 Chen  Begonia                          2025      5 2026-02-04 13:57:13
 4 Devi  Sinners                          2025      5 2026-02-04 13:57:13
 5 Eli   The Materialist                  2025      5 2026-02-04 13:57:13
 6 Fran  One Battle After Another         2025      5 2026-02-04 13:57:13
 7 Alex  Begonia                          2025      4 2026-02-04 13:57:13
 8 Alex  Wicked for Good                  2025      4 2026-02-04 13:57:13
 9 Bri   One Battle After Another         2025      4 2026-02-04 13:57:13
10 Bri   Sinners                          2025      4 2026-02-04 13:57:13
# ℹ 14 more rows
titles <- df_view_clean |>
  distinct(title)
df <- df_view_clean |>
  group_by(title) |>
  summarize(
    avg_rating = mean(rating, na.rm = TRUE),
    vote_count = sum(!is.na(rating)),
    low_ratings = sum(rating <= 2, na.rm = TRUE),
    medium_ratings = sum(rating == 3, na.rm = TRUE), 
    high_ratings = sum(rating >= 4, na.rm = TRUE),
    max_rating = sum(rating == 5, na.rm = TRUE),
    min_rating = sum(rating == 1, na.rm = TRUE)
    ) |>
  arrange(desc(avg_rating), desc(vote_count)) |>
  mutate(ranking = row_number()) |>
  relocate(ranking, .after = title)

df
# A tibble: 5 × 9
  title    ranking avg_rating vote_count low_ratings medium_ratings high_ratings
  <chr>      <int>      <dbl>      <int>       <int>          <int>        <int>
1 One Bat…       1        4.2          5           0              1            4
2 Wicked …       2        4            5           0              1            4
3 Sinners        3        4            4           0              1            3
4 The Mat…       4        3.8          5           0              2            3
5 Begonia        5        3.6          5           0              3            2
# ℹ 2 more variables: max_rating <int>, min_rating <int>
df_report <- df |>
  select(title, ranking, avg_rating)
df_report
# A tibble: 5 × 3
  title                    ranking avg_rating
  <chr>                      <int>      <dbl>
1 One Battle After Another       1        4.2
2 Wicked for Good                2        4  
3 Sinners                        3        4  
4 The Materialist                4        3.8
5 Begonia                        5        3.6
df_plot <- df_report |>
  ggplot(aes(x = reorder(title, -avg_rating), y = avg_rating, fill = title)) +
  geom_col() +
  scale_y_continuous(limits = c(0, 5)) +
  geom_text(aes(label = avg_rating), nudge_y = 0.2, size = 3) +
  labs(
    title = "Average Movie Ratings",
    x = "Movies",
    y = "Average Rating") +
  theme_minimal() +
  theme(plot.title = element_text(face = 'bold', size = 16, hjust = 0.5),
        axis.title.x = element_text(margin = margin(t = 20)),
        axis.title.y = element_text(margin = margin(r = 20))
        )

df_plot