Data 607_Week 2 Assignment

Cue up libraries

# install.packages("RMySQL")
library(RMySQL)

## Warning: package 'RMySQL' was built under R version 3.4.3

## Loading required package: DBI

## Warning: package 'DBI' was built under R version 3.4.3

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.4.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(stringr)

## Warning: package 'stringr' was built under R version 3.4.3

Connect to MySQL database

# Pass the magic words to the connector function
dbname <- "data607_week2_films"
dbuser <- "root"
dbpass <- "pass8"
dbhost <- "localhost"
con <- dbConnect(RMySQL::MySQL(),
                  user = dbuser,
                  password = dbpass,
                  host = dbhost,
                  dbname = dbname
                  )
# Check the connection
summary(con)

## <MySQLConnection:0,0>
##   User:   root 
##   Host:   localhost 
##   Dbname: data607_week2_films 
##   Connection type: localhost via TCP/IP 
## 
## Results:

dbGetInfo(con)

## $host
## [1] "localhost"
## 
## $user
## [1] "root"
## 
## $dbname
## [1] "data607_week2_films"
## 
## $conType
## [1] "localhost via TCP/IP"
## 
## $serverVersion
## [1] "5.7.21-log"
## 
## $protocolVersion
## [1] 10
## 
## $threadId
## [1] 35
## 
## $rsId
## list()

Check MySQL tables, fields therein - confirm we’re in business

dbListTables(con)

## [1] "films"     "ratings"   "reviewers"

dbListFields(con, "films")

## [1] "FilmID" "Title"  "Year"

dbListFields(con, "Reviewers")

## [1] "ReviewerID" "FirstName"

dbListFields(con, "Ratings")

## [1] "FilmID"     "Rating"     "ReviewerID"

Read MySQL tables into R

# Read MySQL tables into R
Films <- dbReadTable(con, "films")
Ratings <- dbReadTable(con, "ratings")
Reviewers <- dbReadTable(con, "reviewers")
# Factorize ratings so they read as ordinal variable
Ratings$Rating <- as.factor(Ratings$Rating)
# Create a title key
TitleKey <- c(Films[,2])
# Check out the three data frames
str(Films)

## 'data.frame':    11 obs. of  3 variables:
##  $ FilmID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Title : chr  "The Disaster Artist" "Alien: Covenant" "Atomic Blonde" "The Lost City of Z" ...
##  $ Year  : int  2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...

str(Reviewers)

## 'data.frame':    12 obs. of  2 variables:
##  $ ReviewerID: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FirstName : chr  "Aubert\r" "Penelope\r" "Seo-Joon\r" "Eulalia\r" ...

str(Ratings)

## 'data.frame':    54 obs. of  3 variables:
##  $ FilmID    : int  1 1 1 1 1 2 2 3 3 3 ...
##  $ Rating    : Factor w/ 5 levels "1","2","3","4",..: 4 3 4 3 3 5 2 3 2 5 ...
##  $ ReviewerID: int  2 5 2 9 11 3 10 4 4 5 ...

Combine ratings into scores using SQL

# Pull in average score by title and calculate number of reviews received for context
sql_1 <- "SELECT    F.Title AS FilmTitle, AVG(RA.Rating) AS AvgScore, COUNT(RA.Rating) AS TotalReviews
          FROM Films AS F
          INNER JOIN Ratings AS RA
          ON F.FilmID = RA.FilmID
          GROUP BY FilmTitle
          ORDER BY AvgScore DESC; "
ScoresSQL <- dbGetQuery(con, sql_1)

## Warning in .local(conn, statement, ...): Decimal MySQL column 1 imported as
## numeric

ScoresSQL

##               FilmTitle AvgScore TotalReviews
## 1     Blade Runner 2049   4.0000            6
## 2     Beatriz at Dinner   3.8000            5
## 3               Get Out   3.7500            4
## 4       Alien: Covenant   3.5000            2
## 5   The Disaster Artist   3.4000            5
## 6  Call Me By Your Name   3.2000            5
## 7         Atomic Blonde   3.2000            5
## 8                  Okja   3.0000            5
## 9    The Lost City of Z   2.8000            5
## 10            Lady Bird   2.6667            6
## 11              Dunkirk   2.3333            6

Examine dispersion of ratings for each film using SQL

# Pull in all ratings by title and look at distribution
sql_2 <- "SELECT    F.Title AS FilmTitle, RA.Rating AS Rating
          FROM Films AS F
          INNER JOIN Ratings AS RA
          ON RA.FilmID = F.FilmID
          ORDER BY FilmTitle; "
RatingsbyTitle <- dbGetQuery(con, sql_2)
# Take a peek at the distribution - trying out a stripchart to change things up
par(las=3,
    cex.main=1,
    cex.lab=0.5,
    font.lab=2,
    cex.axis=0.45,
    font.axis=2,
    col.axis="grey50")
stripchart(Rating ~ FilmTitle, 
           data = RatingsbyTitle, 
           vertical=TRUE, 
           method = "stack", 
           offset=0.8, 
           pch=19, 
           main = "Score Dispersion by Film",
           ylab = "")

Rank films by total reviews using SQL

# Pull in all number fo reviews by title and rank to discern viewership
sql_3 <- "SELECT    F.Title AS FilmTitle, COUNT(RA.Rating) AS TotalReviews
          FROM Films AS F
          INNER JOIN Ratings AS RA
          ON F.FilmID = RA.FilmID
          INNER JOIN Reviewers AS RE
          ON RE.ReviewerID = RA.ReviewerID
          GROUP BY F.Title
          ORDER BY TotalReviews DESC; "
RankByReviewCountSQL <- dbGetQuery(con, sql_3)
RankByReviewCountSQL

##               FilmTitle TotalReviews
## 1     Blade Runner 2049            6
## 2             Lady Bird            6
## 3               Dunkirk            6
## 4                  Okja            5
## 5     Beatriz at Dinner            5
## 6  Call Me By Your Name            5
## 7    The Lost City of Z            5
## 8   The Disaster Artist            5
## 9         Atomic Blonde            5
## 10              Get Out            4
## 11      Alien: Covenant            2

Examine ratings by Reviewer

sql_4 <- "SELECT RE.FirstName AS Reviewer, COUNT(RA.Rating) AS FilmsReviewed, AVG(RA.Rating) AS AvgScore
          FROM Ratings AS RA
          INNER JOIN Reviewers AS RE
          ON RE.ReviewerID = RA.ReviewerID
          GROUP BY Reviewer
          ORDER BY FilmsReviewed DESC; "
RatingByReviewerSQL <- dbGetQuery(con, sql_4)

## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric

# Clean up SQL line terminating /r cruft
RatingByReviewerSQL$Reviewer <- str_sub(RatingByReviewerSQL$Reviewer, 1, str_length(RatingByReviewerSQL$Reviewer)-2)
RatingByReviewerSQL

##    Reviewer FilmsReviewed AvgScore
## 1       Rav             7   3.0000
## 2   Penelop             7   3.1429
## 3       Tin             6   3.6667
## 4    Eulali             6   3.0000
## 5   Seo-Joo             5   3.6000
## 6   Alistai             5   2.6000
## 7     Haroo             4   2.7500
## 8     Victo             4   4.7500
## 9     Sholt             3   2.6667
## 10   Kirsti             3   1.6667
## 11        Y             2   4.0000
## 12    Auber             2   4.0000

Visualize whether people who review more rate films higher / lower

par(las=1,
    cex.main=1,
    cex.lab=0.75,
    font.lab=2,
    cex.axis=0.75,
    font.axis=2,
    col.axis="grey50")
plot(RatingByReviewerSQL$FilmsReviewed, 
     RatingByReviewerSQL$AvgScore,
     main = "Relationship Between Films Reviewed and Scores Given",
     xlab = "Films Reviewed",
     ylab = "Average Score", 
     xlim = c(1,8), 
     ylim = c(1,5))
xaxis <- seq(1, 5, by = 1)
yaxis <- seq(1, 8, by = 1)
axis(1, at = xaxis)
axis(2, at = yaxis)

# Doesn't look like there's any association

Data 607_Week 2 Assignment_Films

Jeremy

February 10, 2018

Cue up libraries

Connect to MySQL database

Check MySQL tables, fields therein - confirm we’re in business

Read MySQL tables into R

Combine ratings into scores using SQL

Examine dispersion of ratings for each film using SQL

Rank films by total reviews using SQL

Examine ratings by Reviewer

Visualize whether people who review more rate films higher / lower