Assignment 3

Click Code and Visualization to see the underlying R code and the resulting visual, respectively.

Code

Movie Dataset: The dataset includes IMDb movie information for movies from 1990-2017, along with their respective ratings, director information, etc.

#Importing required packages
library(readr)
library(tidyr)
library(plyr)
library(dplyr)
library(plotly)
library(gapminder)
library(crosstalk)
library(ggplot2)
#Packages to plot
#install.packages("hrbrthemes")
#install.packages("gganimate")
library(gganimate)
library(hrbrthemes)
#install.packages("gifski")
library(gifski)

# Directory holding the Kaggle "movies-1990-to-2017" CSV files. It defaults to
# the path used when this report was written; set the MOVIES_DATA_DIR
# environment variable to run the script on another machine without editing it.
data_dir <- Sys.getenv("MOVIES_DATA_DIR", unset = NA)
if (is.na(data_dir) || !nzchar(data_dir)) {
  data_dir <- "C:/Users/shubhdeep/Desktop/movies-1990-to-2017"
}

# Movie-level table: keep only the columns listed below, then print the number
# of missing values in each column.
moviesdf <- read_csv(file.path(data_dir, "movie.csv"))
moviesdf <- moviesdf %>%
  select(c("Country", "Director", "Language", "Released", "Runtime", "Title", "Type", "Year", "imdbID"))
colSums(is.na(moviesdf))

##  Country Director Language Released  Runtime    Title     Type     Year 
##    12689    27330    25491    61713    59190        1        1        1 
##   imdbID 
##        1

# Genre table: keep only the imdbID and Genre columns.
genredf <- read_csv(file.path(data_dir, "Movie_Genres.csv"))
genredf <- genredf %>% select(c("imdbID", "Genre"))

# Left join keeps every genre row and attaches the movie columns by imdbID;
# genre rows without a matching movie get NA in the movie columns.
moviedf <- genredf %>% left_join(moviesdf, by = "imdbID")
colSums(is.na(moviedf))

##   imdbID    Genre  Country Director Language Released  Runtime    Title 
##        0        0    19411    39241    40964   104705    85839       12 
##     Type     Year 
##       12       12

# Narrow the joined table to the five columns used from here on, then print
# the per-column NA counts again.
moviedf <- select(moviedf, imdbID, Genre, Title, Year, Country)
moviedf %>% is.na() %>% colSums()

##  imdbID   Genre   Title    Year Country 
##       0       0      12      12   19411

# Label missing countries as "Unknown" instead of leaving them NA, so that the
# na.omit() step later in the script does not drop these rows.
moviedf$Country <- replace(moviedf$Country, is.na(moviedf$Country), "Unknown")
moviedf %>% is.na() %>% colSums()

##  imdbID   Genre   Title    Year Country 
##       0       0      12      12       0

# Drop every row that still contains an NA in any column.
moviedf <- na.omit(moviedf)


library(reshape2)
# Count rows per Year/Genre pair, then widen to one row per Year and one
# column per Genre; pairs that never occur are filled with 0.
genredf <- spread(
  dplyr::count(moviedf, Year, Genre),
  key = Genre,
  value = n,
  fill = 0
)

# Restrict to the years 1900-2018 and to the fourteen major genres.
gen <- genredf %>%
  filter(Year > 1899, Year < 2019) %>%
  select(
    Year, Action, Adventure, Animation, Comedy, Crime, Documentary, Drama,
    Fantasy, Horror, Mystery, Romance, `Sci-Fi`, Sport, Thriller
  )

# Back to long form: one row per Year/genre pair with its count.
# (Unused alternative covering all genres: gather(genredf, Action:Western, ...).)
gendf <- gather(gen, key = "genre", value = "count", Action:Thriller)


# Sort all rows by descending count, then number the rows within each Year so
# that rank 1 is that year's largest genre; keep only the top 10 per year.
# The result stays grouped by Year.
ggdf <- gendf %>%
  arrange(desc(count)) %>%
  group_by(Year) %>%
  mutate(rank = row_number()) %>%
  filter(rank <= 10)

# Distribution of the raw counts.
hist(ggdf$count)

# Keep an untouched copy of the ranked table so the transformation below can
# be re-run from a clean starting point.
gendf2 <- ggdf
#install.packages("forecast")
library(forecast)
##############################################
# Box-Cox transformation of the counts, with lambda selected automatically.
ggdf <- gendf2
count_movies <- BoxCox(x = ggdf$count, lambda = "auto")
hist(count_movies)

# Preserve the raw counts in movie_count, then overwrite count with the
# transformed values, replacing any negative result with 0.
ggdf$movie_count <- ggdf$count
ggdf$count <- replace(count_movies, count_movies < 0, 0)

Data Reference * Movie Collection Analysis Dataset. (2019). Kaggle Datasets. Retrieved October 23, 2019, from Kaggle website: https://www.kaggle.com/beyjin/movies-1990-to-2017

Visualization

#Plotting
# Animated horizontal bar chart of the top-ranked genres, one state per Year.
#   * x = -rank, so after coord_flip() the rank-1 genre is the top bar.
#   * Bar length uses `count`, which at this point holds the Box-Cox
#     transformed value clamped at 0.
#   * The numeric label uses `movie_count`, the raw number of movies saved
#     before the transformation, so viewers read real counts rather than
#     unit-less transformed values.
p3 <- ggdf %>%
  ggplot(aes(x = -rank, y = count, group = genre, fill = genre)) +
  # A tile centred at count / 2 with height = count spans 0..count.
  geom_tile(aes(y = count / 2, height = count), width = 0.9) +
  # Genre name, right-aligned at the end of its bar.
  geom_text(
    aes(label = genre),
    hjust = "right", colour = "black", fontface = "bold", size = 8
  ) +
  # Raw movie count, placed just past the end of the bar.
  geom_text(
    aes(label = scales::comma(movie_count)),
    hjust = "left", nudge_y = 0.5, colour = "black", fontface = "bold"
  ) +
  # clip = "off" lets the labels extend beyond the panel area.
  coord_flip(clip = "off") +
  scale_x_discrete("") +
  scale_y_continuous("", labels = scales::comma) +
  hrbrthemes::theme_ipsum(
    plot_title_size = 32, subtitle_size = 24, caption_size = 20, base_size = 20
  ) +
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    legend.position = "none",
    plot.margin = margin(1, 1, 1, 2, "cm"),
    axis.text.y = element_blank()
  ) +
  # Animate over Year; frame_time is rounded for display in the subtitle.
  transition_time(Year) +
  ease_aes('cubic-in-out') +
  labs(title='Popularity of movie genres over the years(1900-2018)', subtitle='Proportion of particular genre movies in {round(frame_time,0)}')

# Render 750 frames at 10 fps, holding the final frame for 100 frames.
animate(p3, nframes = 750, fps = 10, end_pause = 100, width = 1200, height = 900)

Assignment 3

Popularity of movie genres over the years.

Shubhdeep Singh - s3764546 and Naveen Gundelli - s3788271

Code

Visualization