DATA607 Week 2 Assignment: SQL & R

Each month, the popular Rotten Tomatoes website ranks the 100 best movies on Netflix as ranked by the Tomatometer.

September 2020: See https://editorial.rottentomatoes.com/guide/best-netflix-movies-to-watch-right-now/

An informal survey was created selecting the 5th film from each of set of 10 films among the top 60.

THe Movie Survey is available at https://www.surveymonkey.com/. Note: Not working but can be here.

# 
library(RMySQL)
## Loading required package: DBI
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ----------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

Setup MySQL database connection

Prompt user for database username and password then setup database connection.

#prompt for input 

#vardb_schema <- readline(prompt = "Enter your DB schema: ")
#vardb_user <- readline(prompt = "Enter your DB username: ")
#vardb_password <- readline(prompt = "Enter your password: ")


vardb_user <- "DATA607"
vardb_password <- "hello"
vardb_schema <- "a02"

cat("Schema=", vardb_schema, " username=", vardb_user, " password=", vardb_password)
## Schema= a02  username= DATA607  password= hello
mydb = dbConnect(RMySQL::MySQL(), user=vardb_user,  password=vardb_password, port=3306, dbname=vardb_schema, host='localhost')

summary(mydb)
## <MySQLConnection:0,0>
##   User:   DATA607 
##   Host:   localhost 
##   Dbname: a02 
##   Connection type: localhost via TCP/IP 
## 
## Results:

Load movie survey data table

The .csv data will be loaded into the and handle missing data

# put code from SQL script here. 

survey_responses_df <- dbGetQuery(mydb, "select * from survey_responses")

dim(survey_responses_df)
## [1] 15  5
str(survey_responses_df)
## 'data.frame':    15 obs. of  5 variables:
##  $ survey_obs       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ movie_id         : int  1 2 3 4 5 1 2 3 4 5 ...
##  $ survey_id        : int  123 123 123 123 123 234 234 234 234 234 ...
##  $ survey_seen_movie: int  1 1 1 1 1 1 0 1 0 1 ...
##  $ survey_likert    : int  5 4 3 2 1 1 0 3 0 4 ...
survey_responses_df
##    survey_obs movie_id survey_id survey_seen_movie survey_likert
## 1           1        1       123                 1             5
## 2           2        2       123                 1             4
## 3           3        3       123                 1             3
## 4           4        4       123                 1             2
## 5           5        5       123                 1             1
## 6           6        1       234                 1             1
## 7           7        2       234                 0             0
## 8           8        3       234                 1             3
## 9           9        4       234                 0             0
## 10         10        5       234                 1             4
## 11         11        1       345                 1             5
## 12         12        2       345                 1             0
## 13         13        3       345                 1             3
## 14         14        4       345                 0             0
## 15         15        5       345                 1             3

Plots of movie survey data

add narrative

survey_responses_df <- dbGetQuery(mydb, "select * from survey_responses WHERE survey_seen_movie = 1")

dim(survey_responses_df)
## [1] 12  5
str(survey_responses_df)
## 'data.frame':    12 obs. of  5 variables:
##  $ survey_obs       : int  1 2 3 4 5 6 8 10 11 12 ...
##  $ movie_id         : int  1 2 3 4 5 1 3 5 1 2 ...
##  $ survey_id        : int  123 123 123 123 123 234 234 234 345 345 ...
##  $ survey_seen_movie: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ survey_likert    : int  5 4 3 2 1 1 3 4 5 0 ...
survey_responses_df
##    survey_obs movie_id survey_id survey_seen_movie survey_likert
## 1           1        1       123                 1             5
## 2           2        2       123                 1             4
## 3           3        3       123                 1             3
## 4           4        4       123                 1             2
## 5           5        5       123                 1             1
## 6           6        1       234                 1             1
## 7           8        3       234                 1             3
## 8          10        5       234                 1             4
## 9          11        1       345                 1             5
## 10         12        2       345                 1             0
## 11         13        3       345                 1             3
## 12         15        5       345                 1             3
#ggplot(data=survey_responses_df, aes(x=survey_likert, y=movie_id)) + geom_line()

g <- ggplot(survey_responses_df, aes(x = survey_likert, y = movie_id)) + geom_line() + xlab('Likert Rating') + ylab('Popular Movies') + facet_wrap(~movie_id) + labs(title = "Popular Movie Likert Results")
print(g)
## geom_path: Each group consists of only one observation. Do you need to adjust
## the group aesthetic?

Summary

narrative