# Work from the folder that holds the downloaded CSV.
setwd("C:/Users/Hai/Downloads")

# Load the data set. Per the author's note it contains critic/user ratings
# from Rotten Tomatoes for every movie released between 2007 and 2011.
movies <- read.csv("P2-Movie-Ratings.csv")

# Replace the original headers with short names (one per column, in order).
names(movies) <- c(
  "Film", "Genre", "CriticRating", "AudienceRating", "BudgetMillions", "Year"
)

# Year would otherwise be interpreted as a numeric variable; both Year and
# Genre are categories, so convert the two columns to factors.
movies[c("Year", "Genre")] <- lapply(movies[c("Year", "Genre")], factor)

# Show the first rows as a sample of the data set.
head(movies)
## Film Genre CriticRating AudienceRating BudgetMillions
## 1 (500) Days of Summer Comedy 87 81 8
## 2 10,000 B.C. Adventure 9 44 105
## 3 12 Rounds Action 30 52 20
## 4 127 Hours Adventure 93 84 18
## 5 17 Again Comedy 55 70 20
## 6 2012 Action 39 63 200
## Year
## 1 2009
## 2 2008
## 3 2009
## 4 2010
## 5 2009
## 6 2009
#enable ggplot2 package
library(ggplot2)
# Plot 1: critic vs. audience ratings over time.
# One panel per Genre (rows) and Year (columns); point size shows the budget
# and a smoother is fitted per panel. The plot is printed inside
# suppressWarnings() so warnings raised while rendering are silenced.
suppressWarnings(print(
  ggplot(movies, aes(CriticRating, AudienceRating, colour = Genre)) +
    labs(title = "Critics vs Audience Ratings over Time") +
    geom_point(aes(size = BudgetMillions)) +
    geom_smooth() +
    facet_grid(Genre ~ Year) +
    # Zoom the y axis to 0-100 without dropping any observations.
    coord_cartesian(ylim = c(0, 100)) +
    theme(plot.title = element_text(hjust = 0.5))
))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Plot 2: audience rating vs. movie budget.
# x is mapped to BudgetMillions directly at the plot level. The previous
# version mapped x to CriticRating there and then overrode it inside
# geom_point(), so CriticRating was never drawn and the mapping was misleading.
# Colour encodes Genre and point size encodes the budget.
ggplot(
  data = movies,
  aes(
    x = BudgetMillions,
    y = AudienceRating,
    color = Genre,
    size = BudgetMillions
  )
) +
  ggtitle("Movie Budget vs Audience Ratings") +
  geom_point() +
  xlab("Budget Millions $") +
  # Centre the title above the panel.
  theme(plot.title = element_text(hjust = 0.5))

# Plot 3: movie budget distribution.
# Stacked histogram of budgets in bins of 10 (million), filled by Genre.
# The plot is printed *inside* suppressWarnings(), as plot 1 does: rendering
# happens at print time, so wrapping only the plot construction would let
# render-time warnings escape once suppressWarnings() has already returned.
suppressWarnings(print(
  ggplot(data = movies, aes(x = BudgetMillions)) +
    geom_histogram(binwidth = 10, aes(fill = Genre), color = "black") +
    xlab("Budget in Millions") +
    ylab("Num of Movies") +
    ggtitle("Movie Budget Distribution") +
    theme(
      axis.title.x = element_text(color = "DarkGreen", size = 15),
      axis.title.y = element_text(color = "Red", size = 15),
      # Anchor the legend's top-right corner to the top-right of the plot.
      legend.position = c(1, 1),
      legend.justification = c(1, 1),
      plot.title = element_text(color = "Black", hjust = 0.5, size = 16)
    )
))

# Plot 4: audience rating distribution.
# Histogram of AudienceRating with a bin width of 10, white bars, black outline.
ggplot(movies, aes(AudienceRating)) +
  geom_histogram(binwidth = 10, fill = "white", colour = "black") +
  labs(
    title = "Audience Ratings Distribution",
    y = "Count of movies rated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Plot 5: critic rating distribution.
# Histogram of CriticRating with a bin width of 10, white bars, black outline.
ggplot(movies, aes(CriticRating)) +
  geom_histogram(binwidth = 10, fill = "white", colour = "black") +
  labs(
    title = "Critic Ratings Distribution",
    y = "Count of movies rated"
  ) +
  theme(plot.title = element_text(hjust = 0.5))

# Plot 6: distribution of audience ratings by genre.
# Jittered points are drawn first, then a semi-transparent boxplot on top,
# so the layer order below matters.
ggplot(movies, aes(Genre, AudienceRating, colour = Genre)) +
  geom_jitter() +
  geom_boxplot(size = 1, alpha = 0.5) +
  labs(title = "Audience Ratings Distribution By Genre") +
  theme(plot.title = element_text(hjust = 0.5))
