week_2

Load in packages

knitr::opts_chunk$set(echo = TRUE)
library(kableExtra)
library(RMySQL)

## Loading required package: DBI

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

SQL code to create a db/table in SQL

The code should not be executed, it needs to be run in sql

#DROP TABLE IF EXISTS Ratings;

#Create database Movies_db;
#use Movies_db;
#CREATE TABLE Ratings (
#`Person` VARCHAR(100) PRIMARY KEY,
#`Gladiator`  int  NOT NULL,
#`Dark Knight`  int  NOT NULL,
#`Grandmas Boy`  int NOT NULL,
#`Shaun Of The Dead`  int NOT NULL,
#`Kill Bill`  int NOT NULL,
#`ZombieLand`  int NOT NULL
#);


#INSERT into Ratings (`Person`,`Gladiator`, `Dark Knight`,`Grandmas Boy`,   `Shaun Of The Dead`,`Kill Bill`,`ZombieLand`)
#VALUES ("Miller",5,5,5,3,5,4),
#("brian",5,5,4,5,5,4),
#("Ariel",4,4,5,4,3,2),
#("Justin",5,4,5,3,5,4),
#("Joia",4,5,1,4,3,5),
#("Stephan",5,5,3,3,3,4),
#("Briana",3,5,5,2,1,2),
#("Rotten_Tomatoes",76,94,16,92,85,90),
#("User_Scores_RT",87,94,85,93,81,90);



#select * from Ratings

Connect to db

Run query to select entire table

mydb = dbConnect(MySQL(), user='root', password=my_pass, dbname='movies_db', host='localhost')
user_movie_ratings <- dbGetQuery(mydb, "select * from Ratings ")
user_movie_ratings

##            Person Gladiator Dark Knight Grandmas Boy Shaun Of The Dead
## 1           Ariel         4           4            5                 4
## 2           brian         5           5            4                 5
## 3          Briana         3           5            5                 2
## 4            Joia         4           5            1                 4
## 5          Justin         5           4            5                 3
## 6          Miller         5           5            5                 3
## 7 Rotten_Tomatoes        76          94           16                92
## 8         Stephan         5           5            3                 3
## 9  User_Scores_RT        87          94           85                93
##   Kill Bill ZombieLand
## 1         3          2
## 2         5          4
## 3         1          2
## 4         3          5
## 5         5          4
## 6         5          4
## 7        85         90
## 8         3          4
## 9        81         90

Alternatively, we can use R markdown SQL Chunk to run sql code query after we setup our connection

SELECT * FROM Ratings Where `Shaun Of The Dead` =3

3 records
Person	Gladiator	Dark Knight	Grandmas Boy	Shaun Of The Dead	Kill Bill	ZombieLand
Justin	5	4	5	3	5	4
Miller	5	5	5	3	5	4
Stephan	5	5	3	3	3	4

Data Manipulation

Add user scores and critics scores from Rotten Tomatoes
These scores are on a scale of 1-100
- Scale them as 1-5

RT_SCORE	replacement
0-20	1
21-40	2
41-60	3
61-80	4
81-100	5

user_movie_ratings[c(7,9),2:7]<- sweep(user_movie_ratings[c(7,9),2:7],1,c(20,20,20,20,20,20),"/")

## Warning in sweep(user_movie_ratings[c(7, 9), 2:7], 1, c(20, 20, 20, 20, :
## STATS is longer than the extent of 'dim(x)[MARGIN]'

user_movie_ratings[c(7,9),2:7] <- ceiling(user_movie_ratings[c(7,9),2:7])

Display how each movie does with all of our “critics”

users <- c("Ariel","Brian","Briana","Joia","Justin","Miller","Rotten_Tomatoes","Stephan","User_scores_RT")
movie_list <- list(Gladiator=user_movie_ratings$Gladiator, Dark_Knight=user_movie_ratings$`Dark Knight`,Grandmas_Boy=user_movie_ratings$`Grandmas Boy`,Shaun_Of_The_dead=user_movie_ratings$`Shaun Of The Dead`,Kill_Bill=user_movie_ratings$`Kill Bill`,Zombie_Land =user_movie_ratings$ZombieLand)
                 
dat <- lapply(movie_list, function(x) cbind(x = seq_along(x), y = x))
list_my_names <- names(dat)
lns <- sapply(dat, nrow)
dat <- as.data.frame(do.call("rbind", dat))
dat$group <- rep(list_my_names, lns)
my_vector=user_movie_ratings$Person
my_big_vec <- rep(my_vector,6)
dat$x <- users

library(ggplot2)
ggplot(dat, aes(x = dat$x, y = dat$y, colour = dat$group)) +
    theme_bw() +
    geom_jitter(size=5)+
    theme(axis.text.x=element_text(angle=60, hjust=1))

#matplot(user_movie_ratings$Gladiator)

Observations

Movies like Zombie Land and Grandmas Boy have very polarized results
Gladiator and Dark Knight are very highly rated
Shaun of the dead has the most scores of 3(3)
Grouping results by movie will very quickly become difficult to interpret in visual form

Lets explore the overall tendencies in our critics ratings

ggplot(dat, aes(x = dat$x, y = dat$y, colour = dat$x)) +
    theme_bw() +
    geom_jitter(size=7)+
  theme(axis.text.x=element_text(angle=60, hjust=1))

Much Cleaner: lets summarize some results

All of the females in the list tend to give more unfavorable ratings
Rotten Tomatoes User_scores give every movie on this list a 5
Stephan tends to give the most scores of around 3
Briana gives the lowest scores overall

Average the scores my friends submitted

Join average friends score with Rotten Tomatoes scores
Added inline comments

# Create an empty list and fill with average score of each movie
N <- 7
average_scores <- vector("list", N)
for(i in 2:N) {
    Ps <- mean(user_movie_ratings[,i]) 
    average_scores[[i]] <- Ps
}

# Add average scores to an unlisted vector and round scores to appropriate values
average_scores <- (average_scores[2:7])
friends_avg <- unlist(average_scores)
friends_avg <- ceiling(friends_avg)
names <- "Friends_avg" 
friends_avg <- append(friends_avg, names, after=0)

## Subset old data to get the RT user and critics scores
rotten_tomatoes_subset <- subset(user_movie_ratings,Person== "Rotten_Tomatoes" | Person == "User_Scores_RT")

##combine subset data with friends_avg vector 
comparison_df <- rbind(rotten_tomatoes_subset,friends_avg)
comparison_df

##            Person Gladiator Dark Knight Grandmas Boy Shaun Of The Dead
## 7 Rotten_Tomatoes         4           5            1                 5
## 9  User_Scores_RT         5           5            5                 5
## 3     Friends_avg         5           5            4                 4
##   Kill Bill ZombieLand
## 7         5          5
## 9         5          5
## 3         4          4

Graph clustering our scores by the the reviewers(RT_users,RT_critics, My_friends)

movie_list <- list(Gladiator=comparison_df$Gladiator, Dark_Knight=comparison_df$`Dark Knight`,Grandmas_Boy=comparison_df$`Grandmas Boy`,Shaun_Of_The_dead=comparison_df$`Shaun Of The Dead`,Kill_Bill=comparison_df$`Kill Bill`,Zombie_Land =comparison_df$ZombieLand)

dat_2 <- lapply(movie_list, function(x) cbind(x = seq_along(x), y = x))
list_my_names <- names(dat_2)
lns <- sapply(dat_2, nrow)
dat_2 <- as.data.frame(do.call("rbind", dat_2))
dat_2$group <- rep(list_my_names, lns)
my_vector_2=comparison_df$Person
my_big_vec <- rep(my_vector_2,6)
dat_2$x <- my_big_vec

ggplot(dat_2, aes(x = dat_2$x, y = dat_2$y, colour = dat_2$x)) +
    theme_bw() +
    geom_jitter(size=10)

That was a fun experiment. Lets Expore problems with the experimental design

Discrete rating system

Having only 5 ratings to choose from, leaves a very large range for what each rating really means
- Questions such as “is 5 perfect?” and “what if i want to rate it by 100?” were presented by respondents
Presenting these movies together may Bias our results
- Many of my friends asked questions such as “am I comparing these movies to each other?”
  - As a large scaled survey, we would want to choose from a larger pool of movies and randomize which selections people were presented.
- Another Issue brought up was " Am i rating these movies by genre, or overall movie"
  - Perhaps we would want to create strata by genre and present genre based rating surveys

Our study wasn’t blinded

Non randomized movie selection
- Overall poor practice for experimental design.
- It is possible our low women scores can be attributed to my gender bias in movie selection

week_2_607

Justin Herman

February 8, 2018

Load in packages

SQL code to create a db/table in SQL

Connect to db

Data Manipulation

Display how each movie does with all of our “critics”

Observations

Lets explore the overall tendencies in our critics ratings

Much Cleaner: lets summarize some results

Average the scores my friends submitted

Graph clustering our scores by the the reviewers(RT_users,RT_critics, My_friends)

That was a fun experiment. Lets Expore problems with the experimental design

Discrete rating system

Our study wasn’t blinded