The Task

Choose six recent popular movies. Ask at least five people that you know (friends, family, classmates, imaginary friends) to rate each of these movie that they have seen on a scale of 1 to 5. Take the results (observations) and store them in a SQL database. Load the information into an R dataframe.

Your deliverables should include your SQL scripts and your R Markdown code, posted to GitHub.

This is by design a very open ended assignment. A variety of reasonable approaches are acceptable. You can (and should) blank out your SQL password if your solution requires it; otherwise, full credit requires that your code is “reproducible,” with the assumption that I have the same database server and R software.

Load Packages

knitr::opts_chunk$set(#echo=FALSE, 
                      warning=FALSE, 
                      message=FALSE,
                      tidy=TRUE,
                      #comment = "",
                      dev="png", 
                      dev.args=list(type="cairo"))

#https://www.rstudio.com/wp-content/uploads/2015/03/rmarkdown-reference.pdf

load.packages <- c("RMySQL", "RCurl", "knitr")

ipak <- function(pkg){
    #FUNCTION SOURCE: https://gist.github.com/stevenworthington/3178163
    new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
    if (length(new.pkg)) 
        install.packages(new.pkg, dependencies = TRUE)
    sapply(pkg, require, character.only = TRUE)
}
ipak(load.packages)
## Loading required package: RMySQL
## Loading required package: DBI
## Loading required package: RCurl
## Loading required package: bitops
## Loading required package: knitr

Connect to the DB

my.db <- dbConnect(MySQL(), user = "root", password = "kmgkmg", dbname = "movie_ratings", 
    host = "localhost", port = 3306)

Create & Load Tables

### This code doesn't seem necessary create.tables <-
### getURL('https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/movie_ratings/movie%20ratings.sql')
### create.tables <- unlist(strsplit(gsub('[\n\t]', '',create.tables),
### split=';')) dbGetQuery(my.db, create.tables)
### dbGetQuery(my.db,create.tables[5]) Do you have any idea why I have to run
### the 5th query on its own to create that table?

my.URLs <- getURL(gsub(" ", "", paste("https://raw.githubusercontent.com/kylegilde/D607-Data-Acquistion/master/data-sets/movie_ratings/", 
    c("friends.csv", "movies.csv", "movies_ratings_friends.csv", "ratings.csv"))))

# https://www.r-bloggers.com/accessing-mysql-through-r/

friends <- read.csv(text = my.URLs[1], sep = ",")
movies <- read.csv(text = my.URLs[2], sep = ",")
movies_ratings_friends <- read.csv(text = my.URLs[3], sep = ",")
ratings <- read.csv(text = my.URLs[4], sep = ",")

dbWriteTable(my.db, "friends", friends, overwrite = T, row.names = F)
dbWriteTable(my.db, "movies", movies, overwrite = T, row.names = F)
dbWriteTable(my.db, "movies_ratings_friends", movies_ratings_friends, overwrite = T, 
    row.names = F)
dbWriteTable(my.db, "ratings", ratings, overwrite = T, row.names = F)

List the Tables

my.tables <- dbGetQuery(my.db, "show tables")
kable(my.tables, caption = "Tables")
Tables
Tables_in_movie_ratings
friends
movies
movies_ratings_friends
ratings

Join all tables into one aggregate table. View the 1st 10 rows.

agg_table <- dbGetQuery(my.db, "select m.movie
          , f.name
          , r.rating 
          , r.rating_score
          from movies_ratings_friends mrf
          join movies m on mrf.movie_id = m.id
          join friends f on mrf.friend_id = f.id
          join ratings r on mrf.rating_id = r.id")

kable(head(agg_table, 10))
movie name rating rating_score
Arrival Kyle It was Ok 3
Arrival Tomas It was Ok 3
Arrival Dave Eh… kinda boring 2
Arrival Rachel Eh… kinda boring 2
Arrival Chris Horrible! 1
Fences Kyle Great! 5
Fences Dave Good…but not great 4
Fences Chris Good…but not great 4
Fences Tomas It was Ok 3
Fences Rachel Horrible! 1

The rating scores are 1 through 5, 1 being the lowest score.

my.ratings <- dbGetQuery(my.db, "select rating
                          , rating_score
                          from ratings
                          order by 2")

kable(my.ratings)
rating rating_score
Horrible! 1
Eh… kinda boring 2
It was Ok 3
Good…but not great 4
Great! 5

Which movie received the highest average rating from my friends?

highest_rating <- dbGetQuery(my.db, "select m.movie
          , avg(r.rating_score) avg_score
          from movies_ratings_friends mrf
          join movies m on mrf.movie_id = m.id
          join ratings r on mrf.rating_id = r.id
          group by 1
          order by 2 desc")

kable(highest_rating, caption = "We have a tie between La La Land and Hell or High Water!")
We have a tie between La La Land and Hell or High Water!
movie avg_score
Hell or High Water 4.0
La La Land 4.0
Fences 3.4
Hidden Figures 3.2
Lion 3.0
Manchester by the Sea 2.6
Hacksaw Ridge 2.4
Arrival 2.2

Which of my friends gave out the lowest ratings on average?

most_critical_friend <- dbGetQuery(my.db, "select f.name
          , avg(r.rating_score) avg_rating
          from movies_ratings_friends mrf
          join friends f on mrf.friend_id = f.id
          join ratings r on mrf.rating_id = r.id
          group by 1
          order by 2")

kable(most_critical_friend, caption = "Tomas gave out the lowest ratings!")
Tomas gave out the lowest ratings!
name avg_rating
Tomas 2.750
Chris 2.875
Rachel 3.000
Kyle 3.000
Dave 3.875

Disconnect from DB

dbDisconnect(my.db)