##Objective Creating a normalized SQLite database for movie ratings, loading the data into R, handling missing data, and standardizing ratings.

##Populate

library(RSQLite)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
con <- dbConnect(RSQLite::SQLite(), "movie_ratings.db")

dbExecute(con, "
CREATE TABLE IF NOT EXISTS Movies (
    movie_id INTEGER PRIMARY KEY,
    title TEXT NOT NULL,
    release_year INTEGER
)")
## [1] 0
dbExecute(con, "
CREATE TABLE IF NOT EXISTS Friends (
    friend_id TEXT PRIMARY KEY,
    friend_name TEXT NOT NULL
)")
## [1] 0
dbExecute(con, "
CREATE TABLE IF NOT EXISTS Ratings (
    rating_id INTEGER PRIMARY KEY AUTOINCREMENT,
    friend_id TEXT,
    movie_id INTEGER,
    rating INTEGER,
    FOREIGN KEY (friend_id) REFERENCES Friends(friend_id),
    FOREIGN KEY (movie_id) REFERENCES Movies(movie_id)
)")
## [1] 0
movies_data <- data.frame(
    movie_id = 1:6,
    title = c("Dune", "Spider-Man: No Way Home", "The Batman", 
              "Top Gun: Maverick", "Avatar: The Way of Water", "Oppenheimer"),
    release_year = c(2021, 2021, 2022, 2022, 2022, 2023)
)

dbWriteTable(con, "Movies", movies_data, append = TRUE, row.names = FALSE)

friends_data <- data.frame(
    friend_id = c("F1", "F2", "F3", "F4", "F5"),
    friend_name = c("Friend A", "Friend B", "Friend C", "Friend D", "Friend E")
)

dbWriteTable(con, "Friends", friends_data, append = TRUE, row.names = FALSE)

ratings_data <- data.frame(
    friend_id = c("F1", "F1", "F1", "F1", "F1", "F2", "F2", "F2", "F2", "F2", "F3", "F3", "F3", "F3", "F3", "F4", "F4", "F4", "F4", "F5", "F5", "F5", "F5", "F5"),
    movie_id = c(1, 2, 3, 4, 6, 1, 2, 3, 5, 6, 1, 3, 4, 5, 6, 2, 3, 4, 5, 1, 2, 4, 5, 6),
    rating = c(4, 5, 3, 4, 5, 5, 4, 4, 5, 4, 3, 5, 4, 4, 5, 5, 4, 5, 5, 4, 4, 3, 4, 5)
)

dbWriteTable(con, "Ratings", ratings_data, append = TRUE, row.names = FALSE)

##Load

ratings_df <- dbGetQuery(con, "SELECT * FROM Ratings")
movies_df <- dbGetQuery(con, "SELECT * FROM Movies")

head(ratings_df)
##   rating_id friend_id movie_id rating
## 1         1        F1        1      4
## 2         2        F1        2      5
## 3         3        F1        3      3
## 4         4        F1        4      4
## 5         5        F1        6      5
## 6         6        F2        1      5
head(movies_df)
##   movie_id                    title release_year
## 1        1                     Dune         2021
## 2        2  Spider-Man: No Way Home         2021
## 3        3               The Batman         2022
## 4        4        Top Gun: Maverick         2022
## 5        5 Avatar: The Way of Water         2022
## 6        6              Oppenheimer         2023

##Matrix

ratings_wide <- dbGetQuery(con, "
    SELECT f.friend_id, m.title,
           CASE WHEN r.rating IS NULL THEN 'NA' ELSE CAST(r.rating AS TEXT) END as rating
    FROM Friends f
    CROSS JOIN Movies m
    LEFT JOIN Ratings r ON f.friend_id = r.friend_id AND m.movie_id = r.movie_id
    ORDER BY f.friend_id, m.movie_id
")

ratings_matrix <- ratings_wide %>%
    pivot_wider(names_from = title, values_from = rating) %>%
    as.data.frame()

ratings_matrix
##   friend_id Dune Spider-Man: No Way Home The Batman Top Gun: Maverick
## 1        F1    4                       5          3                 4
## 2        F2    5                       4          4                NA
## 3        F3    3                      NA          5                 4
## 4        F4   NA                       5          4                 5
## 5        F5    4                       4         NA                 3
##   Avatar: The Way of Water Oppenheimer
## 1                       NA           5
## 2                        5           4
## 3                        4           5
## 4                        5          NA
## 5                        4           5

##Standardize

ratings_standardized <- dbGetQuery(con, "
    WITH MovieStats AS (
        SELECT movie_id,
               AVG(CAST(rating AS FLOAT)) as mean_rating,
               SQRT(AVG(CAST(rating AS FLOAT) * CAST(rating AS FLOAT)) - 
                    AVG(CAST(rating AS FLOAT)) * AVG(CAST(rating AS FLOAT))) as std_dev
        FROM Ratings
        GROUP BY movie_id
    )
    SELECT r.friend_id,
           m.title,
           r.rating as original_rating,
           ROUND(CAST((r.rating - ms.mean_rating) / CASE WHEN ms.std_dev = 0 THEN 1 ELSE ms.std_dev END AS FLOAT), 2) as z_score
    FROM Ratings r
    JOIN Movies m ON r.movie_id = m.movie_id
    JOIN MovieStats ms ON r.movie_id = ms.movie_id
    ORDER BY r.friend_id, m.movie_id
")

ratings_standardized
##    friend_id                    title original_rating z_score
## 1         F1                     Dune               4    0.00
## 2         F1  Spider-Man: No Way Home               5    1.00
## 3         F1               The Batman               3   -1.41
## 4         F1        Top Gun: Maverick               4    0.00
## 5         F1              Oppenheimer               5    0.58
## 6         F2                     Dune               5    1.41
## 7         F2  Spider-Man: No Way Home               4   -1.00
## 8         F2               The Batman               4    0.00
## 9         F2 Avatar: The Way of Water               5    1.00
## 10        F2              Oppenheimer               4   -1.73
## 11        F3                     Dune               3   -1.41
## 12        F3               The Batman               5    1.41
## 13        F3        Top Gun: Maverick               4    0.00
## 14        F3 Avatar: The Way of Water               4   -1.00
## 15        F3              Oppenheimer               5    0.58
## 16        F4  Spider-Man: No Way Home               5    1.00
## 17        F4               The Batman               4    0.00
## 18        F4        Top Gun: Maverick               5    1.41
## 19        F4 Avatar: The Way of Water               5    1.00
## 20        F5                     Dune               4    0.00
## 21        F5  Spider-Man: No Way Home               4   -1.00
## 22        F5        Top Gun: Maverick               3   -1.41
## 23        F5 Avatar: The Way of Water               4   -1.00
## 24        F5              Oppenheimer               5    0.58

##Summary

rating_summary <- dbGetQuery(con, "
    SELECT m.title,
           COUNT(r.rating) as num_ratings,
           ROUND(AVG(CAST(r.rating AS FLOAT)), 2) as avg_rating,
           MIN(r.rating) as min_rating,
           MAX(r.rating) as max_rating
    FROM Movies m
    LEFT JOIN Ratings r ON m.movie_id = r.movie_id
    GROUP BY m.title
    ORDER BY m.movie_id
")

rating_summary
##                      title num_ratings avg_rating min_rating max_rating
## 1                     Dune           4       4.00          3          5
## 2  Spider-Man: No Way Home           4       4.50          4          5
## 3               The Batman           4       4.00          3          5
## 4        Top Gun: Maverick           4       4.00          3          5
## 5 Avatar: The Way of Water           4       4.50          4          5
## 6              Oppenheimer           4       4.75          4          5

##Conclusion