knitr::opts_chunk$set(echo = TRUE)
options(warn=-1)

This notebook retrieves data from the movie surveys database and uses the information retrieved to answer the following questions: Surveys: 1. How many surveys have been completed?

Participants in Surveys 2. How many participants were in each survey? 3. What is the min, max, and average age of the participants? 4. What is the distribution of favorite genres among the survey participants?

The movies included in the survey: 5. What is the distribution of responses for each movie?

library(RMySQL)
## Loading required package: DBI
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)

User and Password information is removed from this markdown.

Create a dataframe of each table in the movies database

rs = dbSendQuery(moviesdb, "select * from people")
dfpeople = fetch(rs, n=-1)

rs = dbSendQuery(moviesdb, "select * from questions")
dfquestions = fetch(rs, n=-1)

rs = dbSendQuery(moviesdb, "select * from responses")
dfresponses = fetch(rs, n=-1)

rs = dbSendQuery(moviesdb, "select * from surveys")
dfsurveys = fetch(rs, n=-1)

Surveys:

1. How many surveys have been completed?

cat(nrow(dfsurveys), 'survey/s completed')
## 1 survey/s completed

Participants in Surveys

2. How many participants were in each survey?

3. What is the min, max, and average age of the participants?

dfpeople_age <- dfpeople %>% filter(!is.na(person_age))
dfpeople_age <- select(dfpeople_age, person_age)
cat('The number of participants in the survey was', nrow(dfpeople), '\n')
## The number of participants in the survey was 12
cat('The participant ages ranged from', min(dfpeople_age[,1]),'to',max(dfpeople_age[,1]),',with an average age of ', mean(dfpeople_age[,1]) )
## The participant ages ranged from 12 to 54 ,with an average age of  28.1

Genre’s

4. The preferred genre’s of the participants is illustrated in the chart below…

dfpeople_genre <- dfpeople %>% filter(!is.na(person_movie_genre))
dfpeople_genre <- select(dfpeople_genre, person_movie_genre)
qplot(dfpeople_genre$person_movie_genre, stat = 'count', main = 'Histogram for Genre', xlab = "Genre", fill=I("blue"))

## Responses from Participants ### 5. What is the distribution of responses for each movie?

dfquestions2 <- select(dfquestions, question_id, question_short)
dfresponses2 <- select(dfresponses, question_id, response)
dfcombined <- merge(x=dfresponses2, y=dfquestions2, by = 'question_id', all.x = TRUE)
dfcombined_count <- summarise_at(group_by(dfcombined,question_short,response),vars(question_id),funs(n()))
head(dfcombined_count, 2)
## # A tibble: 2 x 3
## # Groups:   question_short [1]
##   question_short response     question_id
##   <chr>          <chr>              <int>
## 1 "Avengers\r"   Dissatisfied           3
## 2 "Avengers\r"   Neutral                3
library(reshape2)

p <- ggplot(dfcombined_count, aes(x=response, y=question_id))

p <- p + geom_bar(stat = "identity", color = 'darkgreen', fill = 'darkgreen', width = 0.2)

# Divide by levels of "sex", in the vertical direction
p <- p + facet_grid(question_short ~ .)

p