Source files: https://github.com/djlofland/DATA607_F2019/tree/master/Assignment2
Choose six recent popular movies. Ask at least five people that you know (friends, family, classmates, imaginary friends) to rate each of these movie that they have seen on a scale of 1 to 5. Take the results (observations) and store them in a SQL database. Load the information into an R dataframe. Your deliverables should include your SQL scripts and your R Markdown code, posted to GitHub. This is by design a very open ended assignment. A variety of reasonable approaches are acceptable. You can (and should) blank out your SQL password if your solution requires it; otherwise, full credit requires that your code is “reproducible,” with the assumption that I have the same database server and R software.
This is the survey I shared with friends and family. I created it with SurveyMonkey and shared the link via FB, email , Slack and text.
library(formattable) # Format output options: percent()
library(lubridate) # Date types and processing##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ lubridate::union() masks base::union()
## Loading required package: DBI
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
To prevent sensitive information from being exposed, I created a file called .Renviron in my home directory. RStudio will read this file and load system environment variables. We can then just use Sys.getenv(“VAR_NAME”) to reference the value without exposing it.
Renviron settings
# Connect to our MySQL DB
con <- dbConnect(RMySQL::MySQL(), user=Sys.getenv("MYSQL_USER"), password=Sys.getenv("MYSQL_PASSWORD"), host=Sys.getenv("MYSQL_HOST"))
# Attempt to connect to movie_reviews DB, Create it if it's missing
result = tryCatch({
res <- dbSendQuery(con, 'USE movie_reviews;')
}, error = function(e) {
res <- dbSendQuery(con, 'CREATE DATABASE movie_reviews;')
})Source: sql/movies.sql, sql/options.sql, sql/responses.sql, sql/ages.sql
# Create the movies table and load movies
movies_sql <- getSQL('sql/movies.sql')
dbSendQuery(con, movies_sql)
# Create the options table and load options
options_sql <- getSQL('sql/options.sql')
dbSendQuery(con, options_sql)
# Create the responses table (note we will load data from csv in later step)
responses_sql <- getSQL('sql/responses.sql')
dbSendQuery(con, responses_sql)
# Create the ages table (note we will load data from csv in later step)
ages_sql <- getSQL('sql/ages.sql')
dbSendQuery(con, ages_sql)# Load the list of allowed options respondants could choose when ranking each movie
sql <- "SELECT Response FROM options ORDER BY ID;"
res <- dbSendQuery(con, sql)
response_options <- dbFetch(res)
response_options$Response## [1] "I didn't see it" "Terrible Movie"
## [3] "Not Good" "I Don't have an opinion"
## [5] "Decent Movie" "Great Movie"
## [7] "Best Movie Ever!"
# Load the list of movies (we need this to map MovieName's and MovieID's)
sql <- "SELECT MovieID, MovieName FROM movies;"
res <- dbSendQuery(con, sql)
movie_options <- dbFetch(res)
movie_options## MovieID MovieName
## 1 AnnabelleComeHome Annabelle Comes Home
## 2 HobbsAndShaw Hobbs & Shaw
## 3 LionKing2019 The Lion King (2019)
## 4 MetersDown47 47 Meters Down
## 5 SpidermanFarFromHome Spiderman: Far From Home
## 6 ToyStory4 Toy Story 4
# Load the list of movies (we need this to map MovieName's and MovieID's)
sql <- "SELECT AgeBucket FROM ages ORDER BY ID;"
res <- dbSendQuery(con, sql)
age_options <- dbFetch(res)
age_options$AgeBucket## [1] "Under 18" "18-24" "25-34" "35-44" "45-54" "55-64"
## [7] "65+"
## Parsed with column specification:
## cols(
## `Respondent ID` = col_double(),
## `Collector ID` = col_double(),
## `Start Date` = col_character(),
## `End Date` = col_character(),
## `IP Address` = col_character(),
## `Email Address` = col_logical(),
## `First Name` = col_logical(),
## `Last Name` = col_logical(),
## `Custom Data 1` = col_logical(),
## `Spiderman: Far from Home` = col_character(),
## `Hobbs & Shaw` = col_character(),
## `The Lion King (2019 Recent Version)` = col_character(),
## `Annabelle Come Home` = col_character(),
## `Toy Story 4` = col_character(),
## `47 Meters Down` = col_character(),
## `Your Gender` = col_character(),
## `Your Age Bracket` = col_character()
## )
# Let' set base column names (that are R safe)
names(survey) <- c('RespondentID', 'CollectorID', 'StartDate', 'EndDate', 'IPAddress', 'EmailAddress', 'FirstName', 'LastName', 'CustomData1', 'SpidermanFarFromHome', 'HobbsAndShaw', 'LionKing2019', 'AnnabelleComeHome', 'ToyStory4', 'MetersDown47', 'Gender', 'Age')# Remove a few unnecessary columns
survey <- survey %>% select(-IPAddress, -EmailAddress, -FirstName, -LastName, -CustomData1)
survey <- survey[2:nrow(survey),]
survey## # A tibble: 21 x 12
## RespondentID CollectorID StartDate EndDate SpidermanFarFro… HobbsAndShaw
## <dbl> <dbl> <chr> <chr> <chr> <chr>
## 1 10976555523 245709500 09/07/20… 09/07/… Decent Movie I didn't se…
## 2 10972842969 245715358 09/05/20… 09/05/… I didn't see it I didn't se…
## 3 10972794947 245715358 09/05/20… 09/05/… I didn't see it I didn't se…
## 4 10972751462 245715358 09/05/20… 09/05/… I didn't see it I didn't se…
## 5 10972471453 245715358 09/05/20… 09/05/… I Don't have an… I Don't hav…
## 6 10971873184 245715358 09/05/20… 09/05/… Not Good I didn't se…
## 7 10971861372 245715358 09/05/20… 09/05/… I didn't see it I didn't se…
## 8 10971809011 245715358 09/05/20… 09/05/… Best Movie Ever! Decent Movie
## 9 10971398086 245709500 09/05/20… 09/05/… I didn't see it I didn't se…
## 10 10971350360 245715358 09/05/20… 09/05/… Great Movie I didn't se…
## # … with 11 more rows, and 6 more variables: LionKing2019 <chr>,
## # AnnabelleComeHome <chr>, ToyStory4 <chr>, MetersDown47 <chr>,
## # Gender <chr>, Age <chr>
# Convert the two date columns from charater to date objects (using lubridate library)
survey$StartDate <- dmy_hms(survey$StartDate)
survey$EndDate <- dmy_hms(survey$EndDate)
# Cache the processed CSV
write_delim(survey, 'data/processed_movies.csv', delim = ", ", na = "NA", append = FALSE, quote_escape = "double")# Loop thru survey response rows and build SQL INSERT rows for each
rows <- ''
for (x in 1:nrow(survey)) {
row_data <- paste(t(survey[x,]), collapse='","')
rows <- paste(rows, sprintf('("%s"),',row_data))
}
# Run our INSERT statement to load the DB table
query <- paste("INSERT INTO responses VALUES", rows)
query <- substr(query, 1, str_length(query)-1)
res <- dbSendQuery(con, query)
data <- dbFetch(res)
# Cache off our SQL statement in case we need to troubleshoot
write(query, 'sql/insert_data.sql')# Load the list of movies (we need this to map MovieName's and MovieID's)
sql <- "SELECT * FROM responses;"
res <- dbSendQuery(con, sql)
survey <- as_tibble((dbFetch(res)))
survey## # A tibble: 21 x 12
## RespondentID CollectorID StartDate EndDate SpidermanFarFro… HobbsAndShaw
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 10970180147 245709500 2019-04-… 2019-0… Great Movie Decent Movie
## 2 10970192077 245709500 2019-04-… 2019-0… Decent Movie Decent Movie
## 3 10970285008 245709500 2019-04-… 2019-0… Decent Movie I didn't se…
## 4 10970920112 245715358 2019-05-… 2019-0… Great Movie Great Movie
## 5 10971077610 245709500 2019-05-… 2019-0… Great Movie Decent Movie
## 6 10971094944 245709500 2019-05-… 2019-0… Decent Movie I didn't se…
## 7 10971112419 245709500 2019-05-… 2019-0… I didn't see it I didn't se…
## 8 10971157469 245709500 2019-05-… 2019-0… I didn't see it I didn't se…
## 9 10971160987 245709500 2019-05-… 2019-0… I didn't see it I didn't se…
## 10 10971322807 245715358 2019-05-… 2019-0… I didn't see it I didn't se…
## # … with 11 more rows, and 6 more variables: LionKing2019 <chr>,
## # AnnabelleComeHome <chr>, ToyStory4 <chr>, MetersDown47 <chr>,
## # Gender <chr>, Age <chr>
# Fix column datatypes
survey$RespondentID <- as.character(survey$RespondentID)
survey$CollectorID <- factor(survey$CollectorID)
survey$StartDate <- ymd_hms(survey$StartDate)
survey$EndDate <- ymd_hms(survey$EndDate)
# Convert our movie review ranks into an ordinal factor
survey$SpidermanFarFromHome <- factor(survey$SpidermanFarFromHome, levels=response_options$Response, ordered=TRUE)
survey$HobbsAndShaw <- factor(survey$HobbsAndShaw, levels=response_options$Response, ordered=TRUE)
survey$LionKing2019 <- factor(survey$LionKing2019, levels=response_options$Response, ordered=TRUE)
survey$AnnabelleComeHome <- factor(survey$AnnabelleComeHome, levels=response_options$Response, ordered=TRUE)
survey$ToyStory4 <- factor(survey$ToyStory4, levels=response_options$Response, ordered=TRUE)
survey$MetersDown47 <- factor(survey$MetersDown47, levels=response_options$Response, ordered=TRUE)
survey$Gender <- factor(survey$Gender)
# Convert our ages (charater) into an ordinal factor
survey$Age <- factor(survey$Age, levels=age_options$AgeBucket, ordered=TRUE)
survey## # A tibble: 21 x 12
## RespondentID CollectorID StartDate EndDate
## <chr> <fct> <dttm> <dttm>
## 1 10970180147 245709500 2019-04-09 22:00:52 2019-04-09 22:01:59
## 2 10970192077 245709500 2019-04-09 22:07:14 2019-04-09 22:09:46
## 3 10970285008 245709500 2019-04-09 23:11:45 2019-04-09 23:12:31
## 4 10970920112 245715358 2019-05-09 06:42:04 2019-05-09 06:42:40
## 5 10971077610 245709500 2019-05-09 07:57:59 2019-05-09 07:58:41
## 6 10971094944 245709500 2019-05-09 08:05:00 2019-05-09 08:05:40
## 7 10971112419 245709500 2019-05-09 08:11:36 2019-05-09 08:12:32
## 8 10971157469 245709500 2019-05-09 08:29:44 2019-05-09 08:30:27
## 9 10971160987 245709500 2019-05-09 08:31:02 2019-05-09 08:31:52
## 10 10971322807 245715358 2019-05-09 09:30:08 2019-05-09 09:30:50
## # … with 11 more rows, and 8 more variables: SpidermanFarFromHome <ord>,
## # HobbsAndShaw <ord>, LionKing2019 <ord>, AnnabelleComeHome <ord>,
## # ToyStory4 <ord>, MetersDown47 <ord>, Gender <fct>, Age <ord>
# Get the total movies reported seen per person
survey <- survey %>%
mutate(MovieCount = purrr::pmap_dbl(list(SpidermanFarFromHome != "I didn't see it",
HobbsAndShaw != "I didn't see it",
LionKing2019 != "I didn't see it",
AnnabelleComeHome != "I didn't see it",
ToyStory4 != "I didn't see it",
MetersDown47 != "I didn't see it"
), sum))
# Simple boolean to filter people who haven't seen any movies
survey$hasSeenAny <- (survey$MovieCount > 0)
survey$hasSeenAny## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
## [12] TRUE FALSE TRUE TRUE TRUE TRUE FALSE FALSE FALSE TRUE
##
## Man Woman
## 11 10
##
## Under 18 18-24 25-34 35-44 45-54 55-64 65+
## 2 3 3 4 5 3 1
##
## Call:
## density.default(x = table(survey$Age))
##
## Data: table(survey$Age) (7 obs.); Bandwidth 'bw' = 0.4551
##
## x y
## Min. :-0.3653 Min. :0.001406
## 1st Qu.: 1.3173 1st Qu.:0.064824
## Median : 3.0000 Median :0.139551
## Mean : 3.0000 Mean :0.148373
## 3rd Qu.: 4.6827 3rd Qu.:0.190352
## Max. : 6.3653 Max. :0.398055
survey$hasSeenAny <- factor(survey$hasSeenAny, levels=c(TRUE,FALSE))
# Seen any by Gender
table(survey$hasSeenAny, survey$Gender)##
## Man Woman
## TRUE 10 4
## FALSE 1 6
##
## Man Woman
## TRUE 0.47619048 0.19047619
## FALSE 0.04761905 0.28571429
p1 <- ggplot(data=survey) +
geom_mosaic(aes(x = product(Gender), fill=hasSeenAny), na.rm=TRUE) +
scale_fill_manual(values=palette_TF) +
labs(x="Gender", y="Has Seen Any Movies", title='Seen Any Movies by Gender') +
theme(plot.title = element_text(hjust = 0.5))
# Seen any by Age Bucket
prop.table(table(survey$hasSeenAny, survey$Age))##
## Under 18 18-24 25-34 35-44 45-54 55-64
## TRUE 0.04761905 0.09523810 0.14285714 0.09523810 0.19047619 0.09523810
## FALSE 0.04761905 0.04761905 0.00000000 0.09523810 0.04761905 0.04761905
##
## 65+
## TRUE 0.00000000
## FALSE 0.04761905
p2 <- ggplot(data=survey) +
geom_mosaic(aes(x = product(Age), fill=hasSeenAny), na.rm=TRUE) +
scale_fill_manual(values=palette_TF) +
labs(x="Age", y="Has Seen Any Movies", title='Seen Any Movies by Age') +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90, hjust = 1))
grid.arrange(p1, p2, nrow = 1)# Filter to only include people who have seen 1+ movies
survey_viewers <- subset(survey, hasSeenAny == TRUE)
# Movies by Gender
table(survey_viewers$MovieCount, survey_viewers$Age)##
## Under 18 18-24 25-34 35-44 45-54 55-64 65+
## 1 0 1 1 0 1 0 0
## 2 1 1 0 2 1 1 0
## 3 0 0 1 0 1 1 0
## 4 0 0 0 0 1 0 0
## 6 0 0 1 0 0 0 0
##
## Under 18 18-24 25-34 35-44 45-54 55-64
## 1 0.00000000 0.07142857 0.07142857 0.00000000 0.07142857 0.00000000
## 2 0.07142857 0.07142857 0.00000000 0.14285714 0.07142857 0.07142857
## 3 0.00000000 0.00000000 0.07142857 0.00000000 0.07142857 0.07142857
## 4 0.00000000 0.00000000 0.00000000 0.00000000 0.07142857 0.00000000
## 6 0.00000000 0.00000000 0.07142857 0.00000000 0.00000000 0.00000000
##
## 65+
## 1 0.00000000
## 2 0.00000000
## 3 0.00000000
## 4 0.00000000
## 6 0.00000000
p3 <- ggplot(data=survey_viewers, aes(x=MovieCount, fill=Age)) +
geom_bar(stat="count", width=0.7, position=position_dodge()) +
palette_Ages +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Movies Seen", y="Respondants", title='Movie Count by Age')
# Movies by Gender
table(survey_viewers$MovieCount, survey_viewers$Gender)##
## Man Woman
## 1 2 1
## 2 4 2
## 3 3 0
## 4 0 1
## 6 1 0
##
## Man Woman
## 1 0.14285714 0.07142857
## 2 0.28571429 0.14285714
## 3 0.21428571 0.00000000
## 4 0.00000000 0.07142857
## 6 0.07142857 0.00000000
p4 <- ggplot(data=survey_viewers, aes(x=MovieCount, fill=Gender)) +
geom_bar(stat="count", width=0.7, position=position_dodge()) +
scale_fill_manual(values=palette_MF) +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x="Movies Seen", y="Respondants", title='Movie Count by Gender')
grid.arrange(p3, p4, nrow = 1)p5 <- ggplot(data=survey, aes(x=Age, y=MovieCount, fill=Gender)) +
geom_boxplot() +
scale_fill_manual(values=palette_MF) +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x="Movies Seen", y="Respondants", title='Movie Count by Age (All)')
p6 <- ggplot(data=survey_viewers, aes(x=Age, y=MovieCount, fill=Gender)) +
geom_boxplot() +
scale_fill_manual(values=palette_MF) +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = 90, hjust = 1)) +
labs(x="Movies Seen", y="Respondants", title='Movie Count by Age (Only Viewers)')
grid.arrange(p5, p6, nrow = 1)# Loop thru each movie and plot the response segmented by gender
plist <- lapply(movie_options$MovieID, function(col) {
ggplot(data=survey, aes_string(x=col, fill="Gender")) +
geom_bar(stat="count", width=0.7, position=position_dodge()) +
scale_fill_manual(values=palette_MF) +
theme(plot.title = element_text(hjust = 0.5), axis.text.x = element_text(angle = -45, hjust = 1)) +
labs(x="Movies Seen", y="Respondants",
title=movie_options$MovieName[movie_options$MovieID==col])
})
ml <- marrangeGrob(plist, nrow=3, ncol=2)
ml