Cue up libraries

# install.packages("RMySQL")
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.4.3
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.3

Connect to MySQL database

# Pass the magic words to the connector function
dbname <- "data607_week2_films"
dbuser <- "root"
dbpass <- "pass8"
dbhost <- "localhost"
con <- dbConnect(RMySQL::MySQL(),
                  user = dbuser,
                  password = dbpass,
                  host = dbhost,
                  dbname = dbname
                  )
# Check the connection
summary(con)
## <MySQLConnection:0,0>
##   User:   root 
##   Host:   localhost 
##   Dbname: data607_week2_films 
##   Connection type: localhost via TCP/IP 
## 
## Results:
dbGetInfo(con)
## $host
## [1] "localhost"
## 
## $user
## [1] "root"
## 
## $dbname
## [1] "data607_week2_films"
## 
## $conType
## [1] "localhost via TCP/IP"
## 
## $serverVersion
## [1] "5.7.21-log"
## 
## $protocolVersion
## [1] 10
## 
## $threadId
## [1] 35
## 
## $rsId
## list()

Check MySQL tables, fields therein - confirm we’re in business

dbListTables(con)
## [1] "films"     "ratings"   "reviewers"
dbListFields(con, "films")
## [1] "FilmID" "Title"  "Year"
dbListFields(con, "Reviewers")
## [1] "ReviewerID" "FirstName"
dbListFields(con, "Ratings")
## [1] "FilmID"     "Rating"     "ReviewerID"

Read MySQL tables into R

# Read MySQL tables into R
Films <- dbReadTable(con, "films")
Ratings <- dbReadTable(con, "ratings")
Reviewers <- dbReadTable(con, "reviewers")
# Factorize ratings so they read as ordinal variable
Ratings$Rating <- as.factor(Ratings$Rating)
# Create a title key
TitleKey <- c(Films[,2])
# Check out the three data frames
str(Films)
## 'data.frame':    11 obs. of  3 variables:
##  $ FilmID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title : chr  "The Disaster Artist" "Alien: Covenant" "Atomic Blonde" "The Lost City of Z" ...
##  $ Year  : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
str(Reviewers)
## 'data.frame':    12 obs. of  2 variables:
##  $ ReviewerID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FirstName : chr  "Aubert\r" "Penelope\r" "Seo-Joon\r" "Eulalia\r" ...
str(Ratings)
## 'data.frame':    54 obs. of  3 variables:
##  $ FilmID    : int  1 1 1 1 1 2 2 3 3 3 ...
##  $ Rating    : Factor w/ 5 levels "1","2","3","4",..: 4 3 4 3 3 5 2 3 2 5 ...
##  $ ReviewerID: int  2 5 2 9 11 3 10 4 4 5 ...

Combine ratings into scores using SQL

# Pull in average score by title and calculate number of reviews received for context
sql_1 <- "SELECT    F.Title AS FilmTitle, AVG(RA.Rating) AS AvgScore, COUNT(RA.Rating) AS TotalReviews
          FROM Films AS F
          INNER JOIN Ratings AS RA
          ON F.FilmID = RA.FilmID
          GROUP BY FilmTitle
          ORDER BY AvgScore DESC; "
ScoresSQL <- dbGetQuery(con, sql_1)
## Warning in .local(conn, statement, ...): Decimal MySQL column 1 imported as
## numeric
ScoresSQL
##               FilmTitle AvgScore TotalReviews
## 1     Blade Runner 2049   4.0000            6
## 2     Beatriz at Dinner   3.8000            5
## 3               Get Out   3.7500            4
## 4       Alien: Covenant   3.5000            2
## 5   The Disaster Artist   3.4000            5
## 6  Call Me By Your Name   3.2000            5
## 7         Atomic Blonde   3.2000            5
## 8                  Okja   3.0000            5
## 9    The Lost City of Z   2.8000            5
## 10            Lady Bird   2.6667            6
## 11              Dunkirk   2.3333            6

Examine dispersion of ratings for each film using SQL

# Pull in all ratings by title and look at distribution
sql_2 <- "SELECT    F.Title AS FilmTitle, RA.Rating AS Rating
          FROM Films AS F
          INNER JOIN Ratings AS RA
          ON RA.FilmID = F.FilmID
          ORDER BY FilmTitle; "
RatingsbyTitle <- dbGetQuery(con, sql_2)
# Take a peek at the distribution - trying out a stripchart to change things up
par(las=3,
    cex.main=1,
    cex.lab=0.5,
    font.lab=2,
    cex.axis=0.45,
    font.axis=2,
    col.axis="grey50")
stripchart(Rating ~ FilmTitle, 
           data = RatingsbyTitle, 
           vertical=TRUE, 
           method = "stack", 
           offset=0.8, 
           pch=19, 
           main = "Score Dispersion by Film",
           ylab = "")

Rank films by total reviews using SQL

# Pull in all number fo reviews by title and rank to discern viewership
sql_3 <- "SELECT    F.Title AS FilmTitle, COUNT(RA.Rating) AS TotalReviews
          FROM Films AS F
          INNER JOIN Ratings AS RA
          ON F.FilmID = RA.FilmID
          INNER JOIN Reviewers AS RE
          ON RE.ReviewerID = RA.ReviewerID
          GROUP BY F.Title
          ORDER BY TotalReviews DESC; "
RankByReviewCountSQL <- dbGetQuery(con, sql_3)
RankByReviewCountSQL
##               FilmTitle TotalReviews
## 1     Blade Runner 2049            6
## 2             Lady Bird            6
## 3               Dunkirk            6
## 4                  Okja            5
## 5     Beatriz at Dinner            5
## 6  Call Me By Your Name            5
## 7    The Lost City of Z            5
## 8   The Disaster Artist            5
## 9         Atomic Blonde            5
## 10              Get Out            4
## 11      Alien: Covenant            2

Examine ratings by Reviewer

sql_4 <- "SELECT RE.FirstName AS Reviewer, COUNT(RA.Rating) AS FilmsReviewed, AVG(RA.Rating) AS AvgScore
          FROM Ratings AS RA
          INNER JOIN Reviewers AS RE
          ON RE.ReviewerID = RA.ReviewerID
          GROUP BY Reviewer
          ORDER BY FilmsReviewed DESC; "
RatingByReviewerSQL <- dbGetQuery(con, sql_4)
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
# Clean up SQL line terminating /r cruft
RatingByReviewerSQL$Reviewer <- str_sub(RatingByReviewerSQL$Reviewer, 1, str_length(RatingByReviewerSQL$Reviewer)-2)
RatingByReviewerSQL
##    Reviewer FilmsReviewed AvgScore
## 1       Rav             7   3.0000
## 2   Penelop             7   3.1429
## 3       Tin             6   3.6667
## 4    Eulali             6   3.0000
## 5   Seo-Joo             5   3.6000
## 6   Alistai             5   2.6000
## 7     Haroo             4   2.7500
## 8     Victo             4   4.7500
## 9     Sholt             3   2.6667
## 10   Kirsti             3   1.6667
## 11        Y             2   4.0000
## 12    Auber             2   4.0000

Visualize whether people who review more rate films higher / lower

par(las=1,
    cex.main=1,
    cex.lab=0.75,
    font.lab=2,
    cex.axis=0.75,
    font.axis=2,
    col.axis="grey50")
plot(RatingByReviewerSQL$FilmsReviewed, 
     RatingByReviewerSQL$AvgScore,
     main = "Relationship Between Films Reviewed and Scores Given",
     xlab = "Films Reviewed",
     ylab = "Average Score", 
     xlim = c(1,8), 
     ylim = c(1,5))
xaxis <- seq(1, 5, by = 1)
yaxis <- seq(1, 8, by = 1)
axis(1, at = xaxis)
axis(2, at = yaxis)

# Doesn't look like there's any association