This code depends on a public MYSQL database created in the Google Cloud Platform. The project is reproducible as long the the database and connection are running.
The code establishes the connection, creates the table and the joins to output a data frame of the results.
Load the packages and libraries needed for this project. (GGPLOT2 library is only referenced; assumes package has been previously installed). The project used RMYSQL to connect and query the database.
install.packages("RMySQL", repos = "http://cran.us.r-project.org")
install.packages("DBI", repos = "http://cran.us.r-project.org")
library(DBI)
library(RMySQL)
library(ggplot2)
The GCP MYSQL connection depends on generic service account created on GCP MYSQL database. The connection was established in the MYSQL Workbench client.
mydb = dbConnect(MySQL(), user='dataserviceaccount', password='data607', dbname='mymovies', host='34.86.31.176')
summary(mydb)
## <MySQLConnection:0,0>
## User: dataserviceaccount
## Host: 34.86.31.176
## Dbname: mymovies
## Connection type: 34.86.31.176 via TCP/IP
##
## Results:
The code drops any existing tables to prevent issues when running the rest of the code. A check at the end that there are no tables in mymovies database.
#Drops Tables
dbSendQuery(mydb, "
Drop Table If Exists movie_list,
delilah_ratings,
emily_ratings,
eva_ratings,
johnny_ratings,
keyla_ratings,
kleber_ratings,
ratings_def ")
## <MySQLResult:3,0,0>
#Checks for table list
dbListTables(mydb)
## character(0)
We create the tables for the mymovies database. The tables are: - Movie List table containing the 6 movies we rate. - Movie Rating Definition noting what the numerical rating mean. - Individual Movie Ratings tables by 6 Family Members
#Creates Movie List Table
dbSendQuery(mydb, "
create table movie_list (
movie_id serial PRIMARY KEY,
title character(50));")
#Creates Individual Rating Tables
dbSendQuery(mydb, "
create table johnny_ratings (
movie_id integer,
rating numeric);")
dbSendQuery(mydb, "
create table keyla_ratings (
movie_id integer,
rating numeric);")
dbSendQuery(mydb, "
create table delilah_ratings (
movie_id integer,
rating numeric);")
dbSendQuery(mydb, "
create table eva_ratings (
movie_id integer,
rating numeric);")
dbSendQuery(mydb, "
create table emily_ratings (
movie_id integer,
rating numeric);")
dbSendQuery(mydb, "
create table kleber_ratings (
movie_id integer,
rating numeric);")
#Creates the Ratings Definition Table
dbSendQuery(mydb, "
create table ratings_def (
rating numeric,
definitions character(50));")
#Populates the Movie List
dbSendQuery(mydb, "
insert into movie_list (title)
values
('Finding Nemo'),
('Finding Dory'),
('Sonic the Hedgehog 2'),
('Beauty and the Beast'),
('Frozen 2'),
('The Little Mermaid');")
#Populates the Ratings Definitions
dbSendQuery(mydb, "
insert into ratings_def (rating, definitions)
values
('1', 'Poor'),
('2', 'Fair'),
('3', 'Mediocre'),
('4', 'Good'),
('5', 'Excellent');")
#Populates Keyla's Ratings
dbSendQuery(mydb, "
insert into keyla_ratings (movie_id, rating)
values
('1', '5'),
('2', '5'),
('3', '3'),
('4', '5'),
('5', '4'),
('6', '4');")
#Populates Delilah's Ratings
dbSendQuery(mydb, "
insert into delilah_ratings (movie_id, rating)
values
('1', '4'),
('2', '4'),
('3', '5'),
('4', '4'),
('5', '5'),
('6', '4');")
#Populates Eva's Ratings
dbSendQuery(mydb, "
insert into eva_ratings (movie_id, rating)
values
('1', '1'),
('2', '1'),
('3', '5'),
('4', '1'),
('5', '5'),
('6', '1');")
#Populates Johnny's Ratings
dbSendQuery(mydb, "
insert into johnny_ratings (movie_id, rating)
values
('1', '4'),
('2', '4'),
('3', '5'),
('4', '2'),
('5', '2'),
('6', '1');")
#Populates Emily's Ratings
dbSendQuery(mydb, "
insert into emily_ratings (movie_id, rating)
values
('1', '4'),
('2', '4'),
('3', NULL),
('4', NULL),
('5', '4'),
('6', NULL);")
#Populates Kleber's Ratings
dbSendQuery(mydb, "
insert into kleber_ratings (movie_id, rating)
values
('1', '4'),
('2', NULL),
('3', '4'),
('4', NULL),
('5', NULL),
('6', NULL);")
We can now see the movies we are rating and what the ratings definitions. (SuppressWarnings is used to suppress the coercion as numeric of the first column.)
suppressWarnings(
dbGetQuery(mydb, "SELECT * FROM movie_list LIMIT 10;"))
## movie_id title
## 1 1 Finding Nemo
## 2 2 Finding Dory
## 3 3 Sonic the Hedgehog 2
## 4 4 Beauty and the Beast
## 5 5 Frozen 2
## 6 6 The Little Mermaid
suppressWarnings(
dbGetQuery(mydb, "SELECT * FROM ratings_def LIMIT 10;"))
## rating definitions
## 1 1 Poor
## 2 2 Fair
## 3 3 Mediocre
## 4 4 Good
## 5 5 Excellent
We join the tables and generate the R dataframe of the movies and the individual ratings.
(SuppressWarnings is used to suppress the coercion as numeric of the rating column.)
suppressWarnings(
mymoviedf <- dbGetQuery(mydb, "
Select movie_list.title as Title,
delilah_ratings.rating as Delilah,
eva_ratings.rating as Eva,
emily_ratings.rating as Emily,
keyla_ratings.rating as Keyla,
johnny_ratings.rating as Johnny,
kleber_ratings.rating as Kleber
From movie_list
Left Join keyla_ratings On movie_list.movie_id = keyla_ratings.movie_id
Left Join johnny_ratings On movie_list.movie_id = johnny_ratings.movie_id
Left Join kleber_ratings On movie_list.movie_id = kleber_ratings.movie_id
Left Join emily_ratings On movie_list.movie_id = emily_ratings.movie_id
Left Join delilah_ratings On movie_list.movie_id = delilah_ratings.movie_id
Left Join eva_ratings On movie_list.movie_id = eva_ratings.movie_id;
")
)
The R dataframe and its structure.
mymoviedf
## Title Delilah Eva Emily Keyla Johnny Kleber
## 1 Finding Nemo 4 1 4 5 4 4
## 2 Finding Dory 4 1 4 5 4 NA
## 3 Sonic the Hedgehog 2 5 5 NA 3 5 4
## 4 Beauty and the Beast 4 1 NA 5 2 NA
## 5 Frozen 2 5 5 4 4 2 NA
## 6 The Little Mermaid 4 1 NA 4 1 NA
str(mymoviedf)
## 'data.frame': 6 obs. of 7 variables:
## $ Title : chr "Finding Nemo" "Finding Dory" "Sonic the Hedgehog 2" "Beauty and the Beast" ...
## $ Delilah: num 4 4 5 4 5 4
## $ Eva : num 1 1 5 1 5 1
## $ Emily : num 4 4 NA NA 4 NA
## $ Keyla : num 5 5 3 5 4 4
## $ Johnny : num 4 4 5 2 2 1
## $ Kleber : num 4 NA 4 NA NA NA
Once the dataframe is created, the average score for each movie is calculated and plotted on a bar. The average rating excludes missing values (NULLS) in the calculation.
#Calculate average score and append to dataframe
mymoviedf$AvgRating <- rowMeans(mymoviedf[,2:7], na.rm=TRUE)
#Bar Plot
ggplot(mymoviedf, aes(x=reorder(Title, +AvgRating), y = AvgRating)) +
stat_summary(fun = median, geom = "col") +
coord_flip()+
xlab("Movie Title") +
ylab("Average Family" ) +
ggtitle("Average Family Movie Ratings")