#full path to the input file -- built with file.path() so the session's
#working directory is left untouched (no setwd() side effect on later code)
movies_csv <- file.path("C:/Users/Hai/Downloads", "P2-Movie-Ratings.csv")

#fail early with a readable message if the file is not where we expect it
if (!file.exists(movies_csv)) {
  stop("Movie ratings file not found: ", movies_csv, call. = FALSE)
}

#load data set -- contains critic/user ratings from Rotten Tomatoes
#for every movie released between 2007 and 2011
movies <- read.csv(movies_csv)

#give the six columns short, syntactic names (order must match the CSV)
names(movies) <- c(
  "Film",
  "Genre",
  "CriticRating",
  "AudienceRating",
  "BudgetMillions",
  "Year"
)

#Year is read in as a number but is used as a category (e.g. for faceting),
#so convert it to a factor; Genre is treated as a category as well
movies[c("Year", "Genre")] <- lapply(movies[c("Year", "Genre")], factor)

#preview the first six rows of the prepared data set
head(movies, n = 6L)
##                    Film     Genre CriticRating AudienceRating BudgetMillions
## 1 (500) Days of Summer     Comedy           87             81              8
## 2           10,000 B.C. Adventure            9             44            105
## 3            12 Rounds     Action           30             52             20
## 4             127 Hours Adventure           93             84             18
## 5             17 Again     Comedy           55             70             20
## 6                  2012    Action           39             63            200
##   Year
## 1 2009
## 2 2008
## 3 2009
## 4 2010
## 5 2009
## 6 2009
#load the ggplot2 package
library(ggplot2)

#plot visualization #1 - critics vs audience ratings over time
#one panel per Genre (rows) and Year (columns); point size shows the budget,
#and a smoothed trend line is drawn per panel. The y axis is zoomed to 0-100
#with coord_cartesian() so no data is dropped before smoothing.
#print() is called inside suppressWarnings() so that warnings raised while
#the plot is being drawn are silenced too.
suppressWarnings(
  print(
    ggplot(movies, aes(CriticRating, AudienceRating, colour = Genre)) +
      geom_point(aes(size = BudgetMillions)) +
      geom_smooth() +
      facet_grid(Genre ~ Year) +
      coord_cartesian(ylim = c(0, 100)) +
      labs(title = "Critics vs Audience Ratings over Time") +
      theme(plot.title = element_text(hjust = 0.5))
  )
)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#plot visualization #2 - audience rating vs movie budget
#budget is mapped to x directly (it was previously mapped to CriticRating
#and then overridden inside geom_point, which drew the same plot but read as
#if critic ratings were involved). Point size also encodes the budget and
#colour encodes the genre.
ggplot(data = movies,
       aes(x = BudgetMillions, y = AudienceRating,
           color = Genre,
           size = BudgetMillions)) +
  ggtitle("Movie Budget vs Audience Ratings") +
  geom_point() + xlab("Budget Millions $") +
  theme(plot.title = element_text(hjust = 0.5))

#plot visualization #3 - movie budget distribution
#stacked histogram of budgets in bins of 10 (millions), filled by genre.
#print() is called inside suppressWarnings() because a ggplot is only drawn
#when it is printed; without it the plot would be rendered by the top-level
#auto-print after suppressWarnings() had already returned, so warnings raised
#while drawing would not be silenced.
#NOTE(review): numeric legend.position is deprecated in ggplot2 >= 3.5.0 in
#favour of legend.position.inside; left as-is to keep working on older
#releases -- confirm the ggplot2 version this script targets.
suppressWarnings(print(
  ggplot(data = movies, aes(x = BudgetMillions)) +
    geom_histogram(binwidth = 10, aes(fill = Genre), color = "black") +
    xlab("Budget in Millions") + ylab("Num of Movies") +
    ggtitle("Movie Budget Distribution") +
    theme(axis.title.x = element_text(color = "DarkGreen", size = 15),
          axis.title.y = element_text(color = "Red", size = 15),
          #anchor the legend's top-right corner to the panel's top-right corner
          legend.position = c(1, 1), legend.justification = c(1, 1),
          plot.title = element_text(color = "Black", hjust = 0.5, size = 16))
))

#plot visualization #4 - audience rating distribution
#histogram of AudienceRating in bins 10 points wide; white bars with a black
#outline and a centred title
ggplot(movies, aes(AudienceRating)) +
  geom_histogram(binwidth = 10, colour = "black", fill = "white") +
  labs(title = "Audience Ratings Distribution", y = "Count of movies rated") +
  theme(plot.title = element_text(hjust = 0.5))

#plot visualization #5 - critic rating distribution
#histogram of CriticRating in bins 10 points wide; white bars with a black
#outline and a centred title
ggplot(movies, aes(CriticRating)) +
  geom_histogram(binwidth = 10, colour = "black", fill = "white") +
  labs(title = "Critic Ratings Distribution", y = "Count of movies rated") +
  theme(plot.title = element_text(hjust = 0.5))

#plot visualization #6 - distribution of audience ratings by genre
#every movie is drawn as a jittered point, with a semi-transparent boxplot on
#top summarising each genre. geom_jitter() already shows every observation,
#so the boxplot's own outlier points are hidden (outlier.shape = NA);
#otherwise each outlier would be drawn twice in the same colour.
ggplot(data = movies, aes(x = Genre, y = AudienceRating, color = Genre)) +
  ggtitle("Audience Ratings Distribution By Genre") +
  geom_jitter() +
  geom_boxplot(size = 1, alpha = 0.5, outlier.shape = NA) +
  theme(plot.title = element_text(hjust = 0.5))