D607_A02_RickRN

DATA607 Week 2 Assignment: SQL & R

Each month, the popular Rotten Tomatoes website ranks the 100 best movies on Netflix as ranked by the Tomatometer.

September 2020: See https://editorial.rottentomatoes.com/guide/best-netflix-movies-to-watch-right-now/

An informal survey was created selecting the 5th film from each of set of 10 films among the top 60.

THe Movie Survey is available at https://www.surveymonkey.com/. Note: Not working but can be here.

# 
library(RMySQL)

## Loading required package: DBI

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ----------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Setup MySQL database connection

Prompt user for database username and password then setup database connection.

#prompt for input 

#vardb_schema <- readline(prompt = "Enter your DB schema: ")
#vardb_user <- readline(prompt = "Enter your DB username: ")
#vardb_password <- readline(prompt = "Enter your password: ")


vardb_user <- "DATA607"
vardb_password <- "hello"
vardb_schema <- "a02"

cat("Schema=", vardb_schema, " username=", vardb_user, " password=", vardb_password)

## Schema= a02  username= DATA607  password= hello

mydb = dbConnect(RMySQL::MySQL(), user=vardb_user,  password=vardb_password, port=3306, dbname=vardb_schema, host='localhost')

summary(mydb)

## <MySQLConnection:0,0>
##   User:   DATA607 
##   Host:   localhost 
##   Dbname: a02 
##   Connection type: localhost via TCP/IP 
## 
## Results:

Load popular movie data table

The .csv data will be loaded into the

# put code from SQL script here. 

pop_movie_df <- dbGetQuery(mydb, "select * from survey_popmovies")

dim(pop_movie_df)

## [1] 6 5

str(pop_movie_df)

## 'data.frame':    6 obs. of  5 variables:
##  $ movie_id        : int  1 2 3 4 5 6
##  $ movie_name      : chr  "Ladybird" "Homecoming: A Film by Beyonce" "Aquarius" "The Florida Project" ...
##  $ movie_year      : int  2017 2019 2016 2017 2019 2018
##  $ movie_rank      : int  5 15 25 35 45 55
##  $ movie_reviewsite: chr  "https://www.rottentomatoes.com/m/lady_bird/\r" "https://www.rottentomatoes.com/m/homecoming_a_film_by_beyonce\r" "https://www.rottentomatoes.com/m/aquarius/\r" "https://www.rottentomatoes.com/m/the_florida_project/\r" ...

pop_movie_df

##   movie_id                    movie_name movie_year movie_rank
## 1        1                      Ladybird       2017          5
## 2        2 Homecoming: A Film by Beyonce       2019         15
## 3        3                      Aquarius       2016         25
## 4        4           The Florida Project       2017         35
## 5        5                I Lost My Body       2019         45
## 6        6           The Death of Stalin       2018         55
##                                                  movie_reviewsite
## 1                   https://www.rottentomatoes.com/m/lady_bird/\r
## 2 https://www.rottentomatoes.com/m/homecoming_a_film_by_beyonce\r
## 3                    https://www.rottentomatoes.com/m/aquarius/\r
## 4         https://www.rottentomatoes.com/m/the_florida_project/\r
## 5              https://www.rottentomatoes.com/m/i_lost_my_body/\r
## 6         https://www.rottentomatoes.com/m/the_death_of_stalin/\r

Load movie survey data table

The .csv data will be loaded into the and handle missing data

# put code from SQL script here. 

survey_responses_df <- dbGetQuery(mydb, "select * from survey_responses")

dim(survey_responses_df)

## [1] 15  5

str(survey_responses_df)

## 'data.frame':    15 obs. of  5 variables:
##  $ survey_obs       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ movie_id         : int  1 2 3 4 5 1 2 3 4 5 ...
##  $ survey_id        : int  123 123 123 123 123 234 234 234 234 234 ...
##  $ survey_seen_movie: int  1 1 1 1 1 1 0 1 0 1 ...
##  $ survey_likert    : int  5 4 3 2 1 1 0 3 0 4 ...

survey_responses_df

##    survey_obs movie_id survey_id survey_seen_movie survey_likert
## 1           1        1       123                 1             5
## 2           2        2       123                 1             4
## 3           3        3       123                 1             3
## 4           4        4       123                 1             2
## 5           5        5       123                 1             1
## 6           6        1       234                 1             1
## 7           7        2       234                 0             0
## 8           8        3       234                 1             3
## 9           9        4       234                 0             0
## 10         10        5       234                 1             4
## 11         11        1       345                 1             5
## 12         12        2       345                 1             0
## 13         13        3       345                 1             3
## 14         14        4       345                 0             0
## 15         15        5       345                 1             3

Plots of movie survey data

add narrative

survey_responses_df <- dbGetQuery(mydb, "select * from survey_responses WHERE survey_seen_movie = 1")

dim(survey_responses_df)

## [1] 12  5

str(survey_responses_df)

## 'data.frame':    12 obs. of  5 variables:
##  $ survey_obs       : int  1 2 3 4 5 6 8 10 11 12 ...
##  $ movie_id         : int  1 2 3 4 5 1 3 5 1 2 ...
##  $ survey_id        : int  123 123 123 123 123 234 234 234 345 345 ...
##  $ survey_seen_movie: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ survey_likert    : int  5 4 3 2 1 1 3 4 5 0 ...

survey_responses_df

##    survey_obs movie_id survey_id survey_seen_movie survey_likert
## 1           1        1       123                 1             5
## 2           2        2       123                 1             4
## 3           3        3       123                 1             3
## 4           4        4       123                 1             2
## 5           5        5       123                 1             1
## 6           6        1       234                 1             1
## 7           8        3       234                 1             3
## 8          10        5       234                 1             4
## 9          11        1       345                 1             5
## 10         12        2       345                 1             0
## 11         13        3       345                 1             3
## 12         15        5       345                 1             3

#ggplot(data=survey_responses_df, aes(x=survey_likert, y=movie_id)) + geom_line()

g <- ggplot(survey_responses_df, aes(x = survey_likert, y = movie_id)) + geom_line() + xlab('Likert Rating') + ylab('Popular Movies') + facet_wrap(~movie_id) + labs(title = "Popular Movie Likert Results")
print(g)

## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

Summary

narrative