Choose six recent popular movies. Ask at least five people that you know (friends, family, classmates, imaginary friends) to rate each of these movie that they have seen on a scale of 1 to 5. Take the results (observations) and store them in a SQL database. Load the information into an R dataframe. Your deliverables should include your SQL scripts and your R Markdown code, posted to GitHub.
# Load packages
library(RMySQL)
library(reshape2)
library(plyr)
Since this assignment is centered around using R with SQL, I have decided to enter initial data manually. Every year I attemp to watch all movies nominated as Best Picture for the Academy Awards. This data set includes my ratings and ratings of several review aggregators for 9 movies nominated this year.
movies <- c("Arrival", "Fences", "Hacksaw Ridge", "Hell or High Water", "Hidden Figures",
"La La Land", "Lion", "Manchester by the Sea", "Moonlight")
ilya <- c(3, NA, 3, NA, 4, 3, 3, 5, 4)
imdb <- c(8.1, 7.5, 8.3, 7.7, 7.9, 8.5, 8, 8.2, 8.1)
rotten_tomatoes <- c(94, 95, 86, 98, 92, 93, 86, 96, 98)
google_users <- c(86, 83, 94, 91, 92, 89, 84, 74, 88)
metacritic <- c(81, 79, 71, 88, 74, 93, 69, 96, 99)
ratings <- data.frame(movies, ilya, imdb, rotten_tomatoes, google_users, metacritic)
# Normalize all ratings to a 5 point scale
ratings$imdb <- round(ratings$imdb / 2, 0)
ratings$rotten_tomatoes <- round(ratings$rotten_tomatoes / 20, 0)
ratings$google_users <- round(ratings$google_users / 20, 0)
ratings$metacritic <- round(ratings$metacritic / 20, 0)
knitr::kable(ratings, col.names = c("Movies", "My Ratings", "IMDB", "Rotten Tomatoes", "Google Users", "Metacritic"))
| Movies | My Ratings | IMDB | Rotten Tomatoes | Google Users | Metacritic |
|---|---|---|---|---|---|
| Arrival | 3 | 4 | 5 | 4 | 4 |
| Fences | NA | 4 | 5 | 4 | 4 |
| Hacksaw Ridge | 3 | 4 | 4 | 5 | 4 |
| Hell or High Water | NA | 4 | 5 | 5 | 4 |
| Hidden Figures | 4 | 4 | 5 | 5 | 4 |
| La La Land | 3 | 4 | 5 | 4 | 5 |
| Lion | 3 | 4 | 4 | 4 | 3 |
| Manchester by the Sea | 5 | 4 | 5 | 4 | 5 |
| Moonlight | 4 | 4 | 5 | 4 | 5 |
Just for practicing I have decided to normalize the main table into three smaller tables containing information about movies, critics and ratings.
# Transpose the table
movie_ratings <- melt(ratings, id=c("movies"))
colnames(movie_ratings) <- c("movie", "critic", "rating")
# Extract unique movies and assign ID to each
movies <- data.frame(1:nlevels(movie_ratings$movie), levels(movie_ratings$movie))
colnames(movies) <- c("id", "movie")
# Extract unique critics and assign ID to each
critics <- data.frame(1:nlevels(movie_ratings$critic), levels(movie_ratings$critic))
colnames(critics) <- c("id", "critic")
# Add corresponding movie and critic IDs to ratings
movie_ratings <- join(movie_ratings, critics, by = "critic")
movie_ratings <- join(movie_ratings, movies, by = "movie")
# Drop unneeded columns
movie_ratings <- movie_ratings[ -c(1:2)]
# Add ID to ratings and adjust column names
movie_ratings <- data.frame(1:nrow(movie_ratings), movie_ratings)
colnames(movie_ratings) <- c("id", "rating", "critic_id", "movie_id")
# Adjust critic names
critics$critic <- as.character(critics$critic)
critics$critic[critics$critic == 'ilya'] <- 'Ilya Kats'
critics$critic[critics$critic == 'imdb'] <- 'IMDb'
critics$critic[critics$critic == 'rotten_tomatoes'] <- 'Rotten Tomatoes'
critics$critic[critics$critic == 'google_users'] <- 'Google Users'
critics$critic[critics$critic == 'metacritic'] <- 'Metacritic'
critics$critic <- as.factor(critics$critic)
| ID | Movie |
|---|---|
| 1 | Arrival |
| 2 | Fences |
| 3 | Hacksaw Ridge |
| 4 | Hell or High Water |
| 5 | Hidden Figures |
| 6 | La La Land |
| 7 | Lion |
| 8 | Manchester by the Sea |
| 9 | Moonlight |
| ID | Critic |
|---|---|
| 1 | Ilya Kats |
| 2 | IMDb |
| 3 | Rotten Tomatoes |
| 4 | Google Users |
| 5 | Metacritic |
| ID | Rating | Critic ID | Movie ID |
|---|---|---|---|
| 1 | 3 | 1 | 1 |
| 2 | NA | 1 | 2 |
| 3 | 3 | 1 | 3 |
| 4 | NA | 1 | 4 |
| 5 | 4 | 1 | 5 |
| 6 | 3 | 1 | 6 |
| 7 | 3 | 1 | 7 |
| 8 | 5 | 1 | 8 |
| 9 | 4 | 1 | 9 |
| 10 | 4 | 2 | 1 |
| 11 | 4 | 2 | 2 |
| 12 | 4 | 2 | 3 |
# Prompt for server username and password and establish connection
# getPass package is required
username <- readline("Please enter MySQL server username: ")
password <- getPass::getPass("Please enter MySQL server password: ")
con <- dbConnect(MySQL(), "flights", username=username, password=password)
dbSendQuery(con, "CREATE DATABASE IF NOT EXISTS MovieRatings;")
dbSendQuery(con, "USE MovieRatings;")
# MOVIES
dbSendQuery(con, "DROP TABLE IF EXISTS Movies;")
dbSendQuery(con, "CREATE TABLE Movies
(
ID int PRIMARY KEY,
Title varchar(255)
);")
values <- paste("(",movies$id,",'", movies$movie,"')", sep="", collapse=",")
sqlst <- paste("INSERT INTO Movies VALUES",values)
dbSendQuery(con, sqlst)
# CRITICS
dbSendQuery(con, "DROP TABLE IF EXISTS Critics;")
dbSendQuery(con, "CREATE TABLE Critics
(
ID int PRIMARY KEY,
Name varchar(255)
);")
values <- paste("(",critics$id,",'", critics$critic,"')", sep="", collapse=",")
sqlst <- paste("INSERT INTO Critics VALUES",values)
dbSendQuery(con, sqlst)
# RATINGS
dbSendQuery(con, "DROP TABLE IF EXISTS Ratings;")
dbSendQuery(con, "CREATE TABLE Ratings
(
ID int PRIMARY KEY,
CriticID int,
MovieID int,
Rating int
);")
values <- paste("(",movie_ratings$id,",", movie_ratings$critic_id,",",
movie_ratings$movie_id,",",
ifelse(!is.na(movie_ratings$rating),movie_ratings$rating,"NULL"),
")", sep="", collapse=",")
sqlst <- paste("INSERT INTO Ratings VALUES",values)
dbSendQuery(con, sqlst)
# Create a view to easily retrieve all relevant ratings information
dbSendQuery(con, "DROP VIEW IF EXISTS ExpandedRatings;")
sqlst <- "CREATE VIEW ExpandedRatings AS
SELECT c.Name AS 'Critic Name', m.Title AS 'Movie Title', r.Rating FROM Ratings r
INNER JOIN Critics c ON r.CriticID = c.ID
INNER JOIN Movies m ON r.MovieID = m.ID;"
dbSendQuery(con, sqlst)
allratings <- dbReadTable(con, "ExpandedRatings")
knitr::kable(allratings)
| Critic.Name | Movie.Title | Rating |
|---|---|---|
| Ilya Kats | Arrival | 3 |
| Ilya Kats | Fences | NA |
| Ilya Kats | Hacksaw Ridge | 3 |
| Ilya Kats | Hell or High Water | NA |
| Ilya Kats | Hidden Figures | 4 |
| Ilya Kats | La La Land | 3 |
| Ilya Kats | Lion | 3 |
| Ilya Kats | Manchester by the Sea | 5 |
| Ilya Kats | Moonlight | 4 |
| IMDb | Arrival | 4 |
| IMDb | Fences | 4 |
| IMDb | Hacksaw Ridge | 4 |
| IMDb | Hell or High Water | 4 |
| IMDb | Hidden Figures | 4 |
| IMDb | La La Land | 4 |
| IMDb | Lion | 4 |
| IMDb | Manchester by the Sea | 4 |
| IMDb | Moonlight | 4 |
| Rotten Tomatoes | Arrival | 5 |
| Rotten Tomatoes | Fences | 5 |
| Rotten Tomatoes | Hacksaw Ridge | 4 |
| Rotten Tomatoes | Hell or High Water | 5 |
| Rotten Tomatoes | Hidden Figures | 5 |
| Rotten Tomatoes | La La Land | 5 |
| Rotten Tomatoes | Lion | 4 |
| Rotten Tomatoes | Manchester by the Sea | 5 |
| Rotten Tomatoes | Moonlight | 5 |
| Google Users | Arrival | 4 |
| Google Users | Fences | 4 |
| Google Users | Hacksaw Ridge | 5 |
| Google Users | Hell or High Water | 5 |
| Google Users | Hidden Figures | 5 |
| Google Users | La La Land | 4 |
| Google Users | Lion | 4 |
| Google Users | Manchester by the Sea | 4 |
| Google Users | Moonlight | 4 |
| Metacritic | Arrival | 4 |
| Metacritic | Fences | 4 |
| Metacritic | Hacksaw Ridge | 4 |
| Metacritic | Hell or High Water | 4 |
| Metacritic | Hidden Figures | 4 |
| Metacritic | La La Land | 5 |
| Metacritic | Lion | 3 |
| Metacritic | Manchester by the Sea | 5 |
| Metacritic | Moonlight | 5 |
Other options for reading data from SQL database into R dataframe are dbGetQuery to implicitly fetch all results of a SELECT statement and dbSendQuery and fetch combination to access results, but only fetch them when explicitly specified.
dbDisconnect(con)
The code above connects to a MySQL server and creates a simple relational database completely by passing SQL statements through R code. Alternatively, data could have been saved in an external CSV file and loaded into a SQL database by executing the LOAD DATA SQL statement.
LOAD DATA INFILE 'c:/Data/ratings.csv'
INTO TABLE ratings
FIELDS TERMINATED BY ','
ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS;