Cue up libraries
# install.packages("RMySQL")
library(RMySQL)
## Warning: package 'RMySQL' was built under R version 3.4.3
## Loading required package: DBI
## Warning: package 'DBI' was built under R version 3.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.3
Connect to MySQL database
# Pass the magic words to the connector function
dbname <- "data607_week2_films"
dbuser <- "root"
dbpass <- "pass8"
dbhost <- "localhost"
con <- dbConnect(RMySQL::MySQL(),
user = dbuser,
password = dbpass,
host = dbhost,
dbname = dbname
)
# Check the connection
summary(con)
## <MySQLConnection:0,0>
## User: root
## Host: localhost
## Dbname: data607_week2_films
## Connection type: localhost via TCP/IP
##
## Results:
dbGetInfo(con)
## $host
## [1] "localhost"
##
## $user
## [1] "root"
##
## $dbname
## [1] "data607_week2_films"
##
## $conType
## [1] "localhost via TCP/IP"
##
## $serverVersion
## [1] "5.7.21-log"
##
## $protocolVersion
## [1] 10
##
## $threadId
## [1] 35
##
## $rsId
## list()
Check MySQL tables, fields therein - confirm we’re in business
dbListTables(con)
## [1] "films" "ratings" "reviewers"
dbListFields(con, "films")
## [1] "FilmID" "Title" "Year"
dbListFields(con, "Reviewers")
## [1] "ReviewerID" "FirstName"
dbListFields(con, "Ratings")
## [1] "FilmID" "Rating" "ReviewerID"
Read MySQL tables into R
# Read MySQL tables into R
Films <- dbReadTable(con, "films")
Ratings <- dbReadTable(con, "ratings")
Reviewers <- dbReadTable(con, "reviewers")
# Factorize ratings so they read as ordinal variable
Ratings$Rating <- as.factor(Ratings$Rating)
# Create a title key
TitleKey <- c(Films[,2])
# Check out the three data frames
str(Films)
## 'data.frame': 11 obs. of 3 variables:
## $ FilmID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Title : chr "The Disaster Artist" "Alien: Covenant" "Atomic Blonde" "The Lost City of Z" ...
## $ Year : int 2017 2017 2017 2017 2017 2017 2017 2017 2017 2017 ...
str(Reviewers)
## 'data.frame': 12 obs. of 2 variables:
## $ ReviewerID: int 1 2 3 4 5 6 7 8 9 10 ...
## $ FirstName : chr "Aubert\r" "Penelope\r" "Seo-Joon\r" "Eulalia\r" ...
str(Ratings)
## 'data.frame': 54 obs. of 3 variables:
## $ FilmID : int 1 1 1 1 1 2 2 3 3 3 ...
## $ Rating : Factor w/ 5 levels "1","2","3","4",..: 4 3 4 3 3 5 2 3 2 5 ...
## $ ReviewerID: int 2 5 2 9 11 3 10 4 4 5 ...
Combine ratings into scores using SQL
# Pull in average score by title and calculate number of reviews received for context
sql_1 <- "SELECT F.Title AS FilmTitle, AVG(RA.Rating) AS AvgScore, COUNT(RA.Rating) AS TotalReviews
FROM Films AS F
INNER JOIN Ratings AS RA
ON F.FilmID = RA.FilmID
GROUP BY FilmTitle
ORDER BY AvgScore DESC; "
ScoresSQL <- dbGetQuery(con, sql_1)
## Warning in .local(conn, statement, ...): Decimal MySQL column 1 imported as
## numeric
ScoresSQL
## FilmTitle AvgScore TotalReviews
## 1 Blade Runner 2049 4.0000 6
## 2 Beatriz at Dinner 3.8000 5
## 3 Get Out 3.7500 4
## 4 Alien: Covenant 3.5000 2
## 5 The Disaster Artist 3.4000 5
## 6 Call Me By Your Name 3.2000 5
## 7 Atomic Blonde 3.2000 5
## 8 Okja 3.0000 5
## 9 The Lost City of Z 2.8000 5
## 10 Lady Bird 2.6667 6
## 11 Dunkirk 2.3333 6
Examine dispersion of ratings for each film using SQL
# Pull in all ratings by title and look at distribution
sql_2 <- "SELECT F.Title AS FilmTitle, RA.Rating AS Rating
FROM Films AS F
INNER JOIN Ratings AS RA
ON RA.FilmID = F.FilmID
ORDER BY FilmTitle; "
RatingsbyTitle <- dbGetQuery(con, sql_2)
# Take a peek at the distribution - trying out a stripchart to change things up
par(las=3,
cex.main=1,
cex.lab=0.5,
font.lab=2,
cex.axis=0.45,
font.axis=2,
col.axis="grey50")
stripchart(Rating ~ FilmTitle,
data = RatingsbyTitle,
vertical=TRUE,
method = "stack",
offset=0.8,
pch=19,
main = "Score Dispersion by Film",
ylab = "")

Rank films by total reviews using SQL
# Pull in all number fo reviews by title and rank to discern viewership
sql_3 <- "SELECT F.Title AS FilmTitle, COUNT(RA.Rating) AS TotalReviews
FROM Films AS F
INNER JOIN Ratings AS RA
ON F.FilmID = RA.FilmID
INNER JOIN Reviewers AS RE
ON RE.ReviewerID = RA.ReviewerID
GROUP BY F.Title
ORDER BY TotalReviews DESC; "
RankByReviewCountSQL <- dbGetQuery(con, sql_3)
RankByReviewCountSQL
## FilmTitle TotalReviews
## 1 Blade Runner 2049 6
## 2 Lady Bird 6
## 3 Dunkirk 6
## 4 Okja 5
## 5 Beatriz at Dinner 5
## 6 Call Me By Your Name 5
## 7 The Lost City of Z 5
## 8 The Disaster Artist 5
## 9 Atomic Blonde 5
## 10 Get Out 4
## 11 Alien: Covenant 2
Examine ratings by Reviewer
sql_4 <- "SELECT RE.FirstName AS Reviewer, COUNT(RA.Rating) AS FilmsReviewed, AVG(RA.Rating) AS AvgScore
FROM Ratings AS RA
INNER JOIN Reviewers AS RE
ON RE.ReviewerID = RA.ReviewerID
GROUP BY Reviewer
ORDER BY FilmsReviewed DESC; "
RatingByReviewerSQL <- dbGetQuery(con, sql_4)
## Warning in .local(conn, statement, ...): Decimal MySQL column 2 imported as
## numeric
# Clean up SQL line terminating /r cruft
RatingByReviewerSQL$Reviewer <- str_sub(RatingByReviewerSQL$Reviewer, 1, str_length(RatingByReviewerSQL$Reviewer)-2)
RatingByReviewerSQL
## Reviewer FilmsReviewed AvgScore
## 1 Rav 7 3.0000
## 2 Penelop 7 3.1429
## 3 Tin 6 3.6667
## 4 Eulali 6 3.0000
## 5 Seo-Joo 5 3.6000
## 6 Alistai 5 2.6000
## 7 Haroo 4 2.7500
## 8 Victo 4 4.7500
## 9 Sholt 3 2.6667
## 10 Kirsti 3 1.6667
## 11 Y 2 4.0000
## 12 Auber 2 4.0000
Visualize whether people who review more rate films higher / lower
par(las=1,
cex.main=1,
cex.lab=0.75,
font.lab=2,
cex.axis=0.75,
font.axis=2,
col.axis="grey50")
plot(RatingByReviewerSQL$FilmsReviewed,
RatingByReviewerSQL$AvgScore,
main = "Relationship Between Films Reviewed and Scores Given",
xlab = "Films Reviewed",
ylab = "Average Score",
xlim = c(1,8),
ylim = c(1,5))
xaxis <- seq(1, 5, by = 1)
yaxis <- seq(1, 8, by = 1)
axis(1, at = xaxis)
axis(2, at = yaxis)

# Doesn't look like there's any association