knitr::opts_chunk$set(echo = TRUE)
options(warn=-1)
This notebook retrieves data from the movie surveys database and uses the information retrieved to answer the following questions: Surveys: 1. How many surveys have been completed?
Participants in Surveys 2. How many participants were in each survey? 3. What is the min, max, and average age of the participants? 4. What is the distribution of favorite genres among the survey participants?
The movies included in the survey: 5. What is the distribution of responses for each movie?
library(RMySQL)
## Loading required package: DBI
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
User and Password information is removed from this markdown.
rs = dbSendQuery(moviesdb, "select * from people")
dfpeople = fetch(rs, n=-1)
rs = dbSendQuery(moviesdb, "select * from questions")
dfquestions = fetch(rs, n=-1)
rs = dbSendQuery(moviesdb, "select * from responses")
dfresponses = fetch(rs, n=-1)
rs = dbSendQuery(moviesdb, "select * from surveys")
dfsurveys = fetch(rs, n=-1)
cat(nrow(dfsurveys), 'survey/s completed')
## 1 survey/s completed
dfpeople_age <- dfpeople %>% filter(!is.na(person_age))
dfpeople_age <- select(dfpeople_age, person_age)
cat('The number of participants in the survey was', nrow(dfpeople), '\n')
## The number of participants in the survey was 12
cat('The participant ages ranged from', min(dfpeople_age[,1]),'to',max(dfpeople_age[,1]),',with an average age of ', mean(dfpeople_age[,1]) )
## The participant ages ranged from 12 to 54 ,with an average age of 28.1
dfpeople_genre <- dfpeople %>% filter(!is.na(person_movie_genre))
dfpeople_genre <- select(dfpeople_genre, person_movie_genre)
qplot(dfpeople_genre$person_movie_genre, stat = 'count', main = 'Histogram for Genre', xlab = "Genre", fill=I("blue"))
## Responses from Participants ### 5. What is the distribution of responses for each movie?
dfquestions2 <- select(dfquestions, question_id, question_short)
dfresponses2 <- select(dfresponses, question_id, response)
dfcombined <- merge(x=dfresponses2, y=dfquestions2, by = 'question_id', all.x = TRUE)
dfcombined_count <- summarise_at(group_by(dfcombined,question_short,response),vars(question_id),funs(n()))
head(dfcombined_count, 2)
## # A tibble: 2 x 3
## # Groups: question_short [1]
## question_short response question_id
## <chr> <chr> <int>
## 1 "Avengers\r" Dissatisfied 3
## 2 "Avengers\r" Neutral 3
library(reshape2)
p <- ggplot(dfcombined_count, aes(x=response, y=question_id))
p <- p + geom_bar(stat = "identity", color = 'darkgreen', fill = 'darkgreen', width = 0.2)
# Divide by levels of "sex", in the vertical direction
p <- p + facet_grid(question_short ~ .)
p