Pre-Requistes : Available Libraries
Movie Database
Connect to mySQL
## Loading required package: DBI
library(DBI)
conn <- dbConnect(RMySQL::MySQL() # Construct SQL driver
, dbname = "CUNY_DATA607"
, host = "35.231.71.159"
, port = 3306
, user = "root"
, password = "data607")
message("Connected to mysql on ", date())
## Connected to mysql on Sun Feb 10 13:27:43 2019
List of tables
## [1] "movie_ratings" "movies" "persons"
Tables
Movies
dbReadTable(conn, "movies")
## movie_id movie_name movie_genre
## 1 1 TITANIC Romance
## 2 2 TERMINATOR Science Fiction
## 3 3 STAR WARS Science Fiction
## 4 4 DIE HARD Action
## 5 5 GODFATHER Drama
## 6 6 THE HAUNTING Horror
## 7 7 BABY'S DAY OUT Comedy
## 8 8 TOY STORY Animation
## 9 9 AVENGERS Superhero
## 10 10 LA LA LAND Musical
Persons
#dbReadTable(conn, "")
dbGetQuery(conn, "SELECT * FROM persons ORDER BY person_id")
## person_id person_first_name person_last_name
## 1 1 John Doe
## 2 2 Jack Holmes
## 3 3 Elizabeth Dawn
## 4 4 Jessica Parker
## 5 5 Emanuel Banner
## 6 6 Diana Hayden
## 7 7 Jared Hill
## 8 8 Chloe Sue
## 9 9 Shawn Williams
## 10 10 Adam Becker
## 11 11 Teresa May
## 12 12 Aaron Flinch
## 13 13 Harry Dickens
Ratings
moviesQuery <- fetch(dbSendQuery(conn, "SELECT movie_name,movie_genre,person_first_name,person_last_name,movie_rating,(select avg(mr.movie_rating) FROM movie_ratings mr WHERE mr.movie_id=mrs.movie_id) avg_rating FROM movies LEFT JOIN movie_ratings mrs USING (movie_id) LEFT JOIN persons USING(person_id) ORDER BY movie_id,person_id"))
library(DT)
DT::datatable(moviesQuery, options = list(pagelength=5))
Visualizations
Bar Graph
library(ggplot2)
ggplot(moviesQuery,mapping=aes(x=moviesQuery$movie_name,y=moviesQuery$avg_rating, fill= avg_rating)) +
geom_col() +
ggtitle("Ratings on the Movies") +
coord_flip()
## Warning: Removed 4 rows containing missing values (position_stack).

Box Plot
ggplot(moviesQuery,mapping=aes(x=moviesQuery$movie_name, y=moviesQuery$movie_rating)) +
geom_boxplot() +
coord_flip()
## Warning: Removed 4 rows containing non-finite values (stat_boxplot).

Scatter PLot
#plot(x=movieRatings$movie_name, y=movieRatings$avg_rating, xlim=c(min(movieRatings$avg_rating, na.rm = TRUE), max(movieRatings$avg_rating, na.rm = TRUE)), type='p', xlab='Movie Name', ylab='Average Rating', main='Movie versus Average Rating')
plot(x=moviesQuery$movie_name, y=moviesQuery$movie_rating, xlim=c(min(moviesQuery$movie_rating, na.rm = TRUE), max(moviesQuery$movie_rating, na.rm = TRUE)), xlab='Movie Name', ylab='Movie Rating', main='Movie versus Rating')
## Warning in xy.coords(x, y, xlabel, ylabel, log): NAs introduced by coercion

Histogram
hist(moviesQuery$movie_rating)

movieAvgRatings <- unique(moviesQuery[c("movie_name", "avg_rating")])
hist(movieAvgRatings$avg_rating)

Geom Smooth