#clean up the environment
# NOTE(review): this removes every object that ls() lists in the environment
# the script is run in (dot-prefixed objects are not listed, so they remain).
# It does not detach packages or reset options, so it is not equivalent to a
# fresh R session; consider restarting R instead of clearing the workspace.
rm(list=ls())
#install and load the rio package for import
#install.packages("rio")
library(rio)
library(bitops)
library(RCurl)
library(ggplot2)
# Load the movie review data.
# The local MySQL export is preferred so the script can run without the manual
# step of first uploading the .csv to GitHub. When that file is not present on
# the current machine, fall back to the copy hosted on GitHub instead of
# failing (presumably the same export -- verify it is kept up to date).
local_csv <- "C:/ProgramData/MySQL/MySQL Server 5.7/Uploads/MoviesAndReviews.csv"
github_csv <- "https://raw.githubusercontent.com/excelsiordata/DATA607/master/MoviesAndReviews.csv"
movies_source <- if (file.exists(local_csv)) local_csv else github_csv
movies <- import(movies_source, stringsAsFactors = FALSE)
# Take a peek at the data
head(movies)
## Movie ID Movie Title Movie Length (in minutes)
## 1 1 Zootopia 108
## 2 1 Zootopia 108
## 3 1 Zootopia 108
## 4 1 Zootopia 108
## 5 1 Zootopia 108
## 6 2 Moonlight 110
## Rotten Tomatoes Link Reviewer Name Rating
## 1 https://www.rottentomatoes.com/m/zootopi Noah 5
## 2 https://www.rottentomatoes.com/m/zootopi Emma 4.9000000954
## 3 https://www.rottentomatoes.com/m/zootopi Olivia 5
## 4 https://www.rottentomatoes.com/m/zootopi Liam 4.8000001907
## 5 https://www.rottentomatoes.com/m/zootopi Sophia 4.9000000954
## 6 https://www.rottentomatoes.com/m/moonlig Noah 4.9000000954
# Coerce the rating and movie-length columns to numeric. Going through
# as.character() first keeps the conversion based on the displayed values
# even if a column happens to be a factor.
numeric_cols <- c("Rating", "Movie Length (in minutes)")
for (col_name in numeric_cols) {
  movies[[col_name]] <- as.numeric(as.character(movies[[col_name]]))
}
# Build the working data frame. data.frame() rewrites the headers into
# syntactic names, so spaces and parentheses become dots.
movies.df <- data.frame(movies, stringsAsFactors = FALSE)
# Give the movie-length column a tidier name than the auto-generated one
is_length_col <- names(movies.df) == "Movie.Length..in.minutes."
names(movies.df)[is_length_col] <- "Movie.Length.In.Minutes"
#Take a peek at the data frame
head(movies.df)
## Movie.ID Movie.Title Movie.Length.In.Minutes
## 1 1 Zootopia 108
## 2 1 Zootopia 108
## 3 1 Zootopia 108
## 4 1 Zootopia 108
## 5 1 Zootopia 108
## 6 2 Moonlight 110
## Rotten.Tomatoes.Link Reviewer.Name Rating
## 1 https://www.rottentomatoes.com/m/zootopi Noah 5.0
## 2 https://www.rottentomatoes.com/m/zootopi Emma 4.9
## 3 https://www.rottentomatoes.com/m/zootopi Olivia 5.0
## 4 https://www.rottentomatoes.com/m/zootopi Liam 4.8
## 5 https://www.rottentomatoes.com/m/zootopi Sophia 4.9
## 6 https://www.rottentomatoes.com/m/moonlig Noah 4.9
# Calculate the mean rating for each movie title.
# Rating is selected by name rather than by column position, so the result
# does not silently change if the column order of movies.df changes.
# No stringsAsFactors argument is passed: aggregate() forwards extra arguments
# to FUN (here mean), where that argument had no effect.
meanRating <- aggregate(movies.df$Rating,
                        by = list(movies.df$Movie.Title),
                        FUN = mean)
# Show the raw aggregate() result (default column names Group.1 and x)
meanRating
# Replace the default column names with descriptive ones
names(meanRating)[names(meanRating) == "Group.1"] <- "Movie Title"
names(meanRating)[names(meanRating) == "x"] <- "Average Rating"
print(meanRating)
## Movie Title Average Rating
## 1 Arrival 4.40
## 2 Hell or High Water 4.70
## 3 La La Land 4.60
## 4 Moonlight 4.82
## 5 Zootopia 4.92
# Calculate the mean rating given by each reviewer.
# Rating is selected by name rather than by column position, and no
# stringsAsFactors argument is passed: aggregate() forwards extra arguments to
# FUN (here mean), where that argument had no effect.
# The result keeps aggregate()'s default column names (Group.1, x).
meanReviewer <- aggregate(movies.df$Rating,
                          by = list(movies.df$Reviewer.Name),
                          FUN = mean)
print(meanReviewer)
## Group.1 x
## 1 Emma 4.68
## 2 Liam 4.60
## 3 Noah 4.74
## 4 Olivia 4.70
## 5 Sophia 4.72
# Bar chart of the average rating per movie title, highest-rated first.
# The per-title means are computed explicitly before plotting. Stacking one
# Rating/5 segment per review (stat = "identity" on the raw rows) only equals
# the mean when every title has exactly five reviews.
# Note: the formula interface of aggregate() drops rows with a missing Rating.
title_means <- aggregate(Rating ~ Movie.Title, data = movies.df, FUN = mean)
# Columns are referenced by bare name inside aes(); reorder() sorts the titles
# by descending mean rating.
M1 <- ggplot(title_means, aes(x = reorder(Movie.Title, -Rating), y = Rating)) +
  geom_bar(stat = "identity", fill = "dodgerblue4") +
  # Labels are passed as named arguments. NOTE(review): wrapping them in
  # list() appears not to be applied by ggplot2 3.x -- confirm against the
  # installed version.
  labs(title = "Average Movie Rating by Title",
       x = "Movie Title", y = "Average Rating")
print(M1)
# Bar chart of the average rating given by each reviewer, highest first.
# The per-reviewer means are computed explicitly before plotting. Stacking one
# Rating/5 segment per review (stat = "identity" on the raw rows) only equals
# the mean when every reviewer has rated exactly five movies.
# Note: the formula interface of aggregate() drops rows with a missing Rating.
reviewer_means <- aggregate(Rating ~ Reviewer.Name, data = movies.df, FUN = mean)
# Columns are referenced by bare name inside aes(); reorder() sorts the
# reviewers by descending mean rating.
M2 <- ggplot(reviewer_means, aes(x = reorder(Reviewer.Name, -Rating), y = Rating)) +
  geom_bar(stat = "identity", fill = "dodgerblue4") +
  # Labels are passed as named arguments. NOTE(review): wrapping them in
  # list() appears not to be applied by ggplot2 3.x -- confirm against the
  # installed version.
  labs(title = "Average Score by Reviewer",
       x = "Reviewer", y = "Average Rating")
print(M2)
# The highest rated movie in our study was “Zootopia”, with “Arrival” coming in last.
# All of the reviewers’ scores seem to be in the same ballpark with the exception of Liam, who has a noticeably lower average rating than the other reviewers. Seems he’s a bit pickier than the others.