Week 2 task

Choose six recent popular movies. Ask at least five people that you know (friends, family, classmates, imaginary friends) to rate each of these movie that they have seen on a scale of 1 to 5. Take the results (observations) and store them in a SQL database. Load the information into an R dataframe. Your deliverables should include your SQL scripts and your R Markdown code, posted to GitHub.

Setup

# Load packages
library(RMySQL)
library(reshape2)
library(plyr)

Data initialization

Since this assignment is centered around using R with SQL, I have decided to enter initial data manually. Every year I attemp to watch all movies nominated as Best Picture for the Academy Awards. This data set includes my ratings and ratings of several review aggregators for 9 movies nominated this year.

movies <- c("Arrival", "Fences", "Hacksaw Ridge", "Hell or High Water", "Hidden Figures",
            "La La Land", "Lion", "Manchester by the Sea", "Moonlight")
ilya <- c(3, NA, 3, NA, 4, 3, 3, 5, 4)
imdb <- c(8.1, 7.5, 8.3, 7.7, 7.9, 8.5, 8, 8.2, 8.1)
rotten_tomatoes <- c(94, 95, 86, 98, 92, 93, 86, 96, 98)
google_users <- c(86, 83, 94, 91, 92, 89, 84, 74, 88)
metacritic <- c(81, 79, 71, 88, 74, 93, 69, 96, 99)

ratings <- data.frame(movies, ilya, imdb, rotten_tomatoes, google_users, metacritic)

# Normalize all ratings to a 5 point scale
ratings$imdb <- round(ratings$imdb / 2, 0)
ratings$rotten_tomatoes <- round(ratings$rotten_tomatoes / 20, 0)
ratings$google_users <- round(ratings$google_users / 20, 0)
ratings$metacritic <- round(ratings$metacritic / 20, 0)
knitr::kable(ratings, col.names = c("Movies", "My Ratings", "IMDB", "Rotten Tomatoes", "Google Users", "Metacritic"))
Movies My Ratings IMDB Rotten Tomatoes Google Users Metacritic
Arrival 3 4 5 4 4
Fences NA 4 5 4 4
Hacksaw Ridge 3 4 4 5 4
Hell or High Water NA 4 5 5 4
Hidden Figures 4 4 5 5 4
La La Land 3 4 5 4 5
Lion 3 4 4 4 3
Manchester by the Sea 5 4 5 4 5
Moonlight 4 4 5 4 5

Just for practicing I have decided to normalize the main table into three smaller tables containing information about movies, critics and ratings.

# Transpose the table
movie_ratings <- melt(ratings, id=c("movies"))
colnames(movie_ratings) <- c("movie", "critic", "rating")

# Extract unique movies and assign ID to each
movies <- data.frame(1:nlevels(movie_ratings$movie), levels(movie_ratings$movie))
colnames(movies) <- c("id", "movie")

# Extract unique critics and assign ID to each
critics <- data.frame(1:nlevels(movie_ratings$critic), levels(movie_ratings$critic))
colnames(critics) <- c("id", "critic")

# Add corresponding movie and critic IDs to ratings
movie_ratings <- join(movie_ratings, critics, by = "critic")
movie_ratings <- join(movie_ratings, movies, by = "movie")
# Drop unneeded columns
movie_ratings <- movie_ratings[ -c(1:2)]
# Add ID to ratings and adjust column names
movie_ratings <- data.frame(1:nrow(movie_ratings), movie_ratings)
colnames(movie_ratings) <- c("id", "rating", "critic_id", "movie_id")

# Adjust critic names
critics$critic <- as.character(critics$critic)
critics$critic[critics$critic == 'ilya'] <- 'Ilya Kats'
critics$critic[critics$critic == 'imdb'] <- 'IMDb'
critics$critic[critics$critic == 'rotten_tomatoes'] <- 'Rotten Tomatoes'
critics$critic[critics$critic == 'google_users'] <- 'Google Users'
critics$critic[critics$critic == 'metacritic'] <- 'Metacritic'
critics$critic <- as.factor(critics$critic)
Table 1. Movies
ID Movie
1 Arrival
2 Fences
3 Hacksaw Ridge
4 Hell or High Water
5 Hidden Figures
6 La La Land
7 Lion
8 Manchester by the Sea
9 Moonlight
Table 2. Critics
ID Critic
1 Ilya Kats
2 IMDb
3 Rotten Tomatoes
4 Google Users
5 Metacritic
Table 3. Ratings
ID Rating Critic ID Movie ID
1 3 1 1
2 NA 1 2
3 3 1 3
4 NA 1 4
5 4 1 5
6 3 1 6
7 3 1 7
8 5 1 8
9 4 1 9
10 4 2 1
11 4 2 2
12 4 2 3

Database connection

# Prompt for server username and password and establish connection
# getPass package is required

username <- readline("Please enter MySQL server username: ")
password <- getPass::getPass("Please enter MySQL server password: ")

con <- dbConnect(MySQL(), "flights", username=username, password=password)

Database creation, setup and data import

dbSendQuery(con, "CREATE DATABASE IF NOT EXISTS MovieRatings;")
dbSendQuery(con, "USE MovieRatings;")

# MOVIES
dbSendQuery(con, "DROP TABLE IF EXISTS Movies;")
dbSendQuery(con, "CREATE TABLE Movies 
            (
            ID int PRIMARY KEY, 
            Title varchar(255)
            );")
values <- paste("(",movies$id,",'", movies$movie,"')", sep="", collapse=",")
sqlst <- paste("INSERT INTO Movies VALUES",values)
dbSendQuery(con, sqlst)

# CRITICS
dbSendQuery(con, "DROP TABLE IF EXISTS Critics;")
dbSendQuery(con, "CREATE TABLE Critics 
            (
            ID int PRIMARY KEY, 
            Name varchar(255)
            );")
values <- paste("(",critics$id,",'", critics$critic,"')", sep="", collapse=",")
sqlst <- paste("INSERT INTO Critics VALUES",values)
dbSendQuery(con, sqlst)

# RATINGS
dbSendQuery(con, "DROP TABLE IF EXISTS Ratings;")
dbSendQuery(con, "CREATE TABLE Ratings 
            (
            ID int PRIMARY KEY, 
            CriticID int,
            MovieID int,
            Rating int
            );")
values <- paste("(",movie_ratings$id,",", movie_ratings$critic_id,",", 
                movie_ratings$movie_id,",", 
                ifelse(!is.na(movie_ratings$rating),movie_ratings$rating,"NULL"),
                ")", sep="", collapse=",")
sqlst <- paste("INSERT INTO Ratings VALUES",values)
dbSendQuery(con, sqlst)

# Create a view to easily retrieve all relevant ratings information
dbSendQuery(con, "DROP VIEW IF EXISTS ExpandedRatings;")
sqlst <- "CREATE VIEW ExpandedRatings AS
          SELECT c.Name AS 'Critic Name', m.Title AS 'Movie Title', r.Rating FROM Ratings r
          INNER JOIN Critics c ON r.CriticID = c.ID
          INNER JOIN Movies m ON r.MovieID = m.ID;"
dbSendQuery(con, sqlst)

Data access

allratings <- dbReadTable(con, "ExpandedRatings")
knitr::kable(allratings)
Critic.Name Movie.Title Rating
Ilya Kats Arrival 3
Ilya Kats Fences NA
Ilya Kats Hacksaw Ridge 3
Ilya Kats Hell or High Water NA
Ilya Kats Hidden Figures 4
Ilya Kats La La Land 3
Ilya Kats Lion 3
Ilya Kats Manchester by the Sea 5
Ilya Kats Moonlight 4
IMDb Arrival 4
IMDb Fences 4
IMDb Hacksaw Ridge 4
IMDb Hell or High Water 4
IMDb Hidden Figures 4
IMDb La La Land 4
IMDb Lion 4
IMDb Manchester by the Sea 4
IMDb Moonlight 4
Rotten Tomatoes Arrival 5
Rotten Tomatoes Fences 5
Rotten Tomatoes Hacksaw Ridge 4
Rotten Tomatoes Hell or High Water 5
Rotten Tomatoes Hidden Figures 5
Rotten Tomatoes La La Land 5
Rotten Tomatoes Lion 4
Rotten Tomatoes Manchester by the Sea 5
Rotten Tomatoes Moonlight 5
Google Users Arrival 4
Google Users Fences 4
Google Users Hacksaw Ridge 5
Google Users Hell or High Water 5
Google Users Hidden Figures 5
Google Users La La Land 4
Google Users Lion 4
Google Users Manchester by the Sea 4
Google Users Moonlight 4
Metacritic Arrival 4
Metacritic Fences 4
Metacritic Hacksaw Ridge 4
Metacritic Hell or High Water 4
Metacritic Hidden Figures 4
Metacritic La La Land 5
Metacritic Lion 3
Metacritic Manchester by the Sea 5
Metacritic Moonlight 5

Other options for reading data from SQL database into R dataframe are dbGetQuery to implicitly fetch all results of a SELECT statement and dbSendQuery and fetch combination to access results, but only fetch them when explicitly specified.

Cleanup

dbDisconnect(con)

Conclusion

The code above connects to a MySQL server and creates a simple relational database completely by passing SQL statements through R code. Alternatively, data could have been saved in an external CSV file and loaded into a SQL database by executing the LOAD DATA SQL statement.

LOAD DATA INFILE 'c:/Data/ratings.csv' 
INTO TABLE ratings 
FIELDS TERMINATED BY ',' 
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;