Netflix dataset (1) - exploring dataset and visualisation

Acknowledgements

Example dataset source: https://www.kaggle.com/datasets/shivamb/netflix-shows/versions/4?resource=download
Based on tutorials by David Robinson (YouTube: www.youtube.com/@safe4democracy)

Import data and load libraries

# Load the Netflix titles CSV into a base data.frame and list its columns.
# NOTE(review): the absolute path below is specific to one machine; the file
# must exist at this location for the script to run.
netflix_dataset <- read.csv(file = "/Users/macbookpro/Desktop/netflix_titles.csv")
names(netflix_dataset)

##  [1] "show_id"      "type"         "title"        "director"     "cast"        
##  [6] "country"      "date_added"   "release_year" "rating"       "duration"    
## [11] "listed_in"    "description"

library(ggplot2)
library(dplyr)
library(hrbrthemes)
library(tidyr)
library(tidyverse)
library(forcats) 
library(gridExtra) 

# Make theme_ipsum() the default ggplot2 theme for the plots that follow
# (plots that explicitly add theme_bw() override it).
theme_set(theme_ipsum())

Change in movie duration over years

# List the distinct values found in the `type` column
unique(netflix_dataset[["type"]])

## [1] "TV Show" "Movie"

# Keep only the rows describing movies
netflix_movies <- filter(netflix_dataset, type == "Movie")

# Preview the raw duration strings of the first five movies
select(netflix_movies, duration)[1:5, , drop = FALSE]

##   duration
## 1   93 min
## 2   78 min
## 3   80 min
## 4  123 min
## 5   95 min

# Split strings such as "93 min" at the space into a `duration` column and a
# `duration units` column; convert = TRUE lets the new columns be type-converted
# (so the minutes can be treated as numbers).
netflix_movies <- separate(
  netflix_movies,
  col = duration,
  into = c("duration", "duration units"),
  sep = " ",
  convert = TRUE
)

# Boxplot of movie duration per release decade
# (release years are binned into decades for a cleaner visualisation)
netflix_movies %>%
  mutate(decades = floor(release_year / 10) * 10) %>%
  ggplot(mapping = aes(x = decades, y = duration, group = decades)) +
  geom_boxplot() +
  labs(title = "Change in movie duration over decades")

Change in number by type over years

# For each type, plot the share of that type's titles released in each year
netflix_dataset %>%
  count(year = release_year, type) %>%
  group_by(type) %>%
  mutate(percent = prop.table(n)) %>%
  ggplot(mapping = aes(x = year, y = percent, colour = type)) +
  geom_line() +
  labs(title = "Change in numbers over years by type")

The plot shows, for each type, the share of its titles released in each year; the share for TV shows rises more steeply than that for movies from roughly 2015 onwards.

Explore TV show and movie genres

# Genre frequencies for TV shows.
# Each title can list several comma-separated genres in `listed_in`, so the
# column is first split into one row per (title, genre) pair.
# Result: an ungrouped table with columns type, genre, n and percent, sorted by
# descending n.
tvshow_genres <- netflix_dataset %>%
  # filter first so movie genres are never computed only to be discarded
  filter(type == "TV Show") %>%
  separate_rows(listed_in, sep = ", ") %>%
  group_by(type, genre = listed_in) %>%
  # explicit .groups avoids the summarise() message and returns ungrouped data
  summarise(n = n(), .groups = "drop") %>%
  arrange(desc(n)) %>%
  # share of all TV-show genre tags; the generic "TV Shows" tag still counts
  # towards the denominator and is only removed from the output afterwards
  mutate(percent = n / sum(n)) %>%
  filter(genre != "TV Shows")

## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.

# Horizontal bar chart of TV-show genre shares, with genres ordered by count
tvshow_genres %>%
  mutate(genre = fct_reorder(.f = genre, .x = n)) %>%
  ggplot(mapping = aes(x = percent, y = genre)) +
  geom_col(fill = "#69b3a2", colour = "#e9ecef") +
  labs(title = "Percentage of TV show genres") +
  theme_bw()

# Same processing for movies, but genre shares are computed within each
# release decade instead of over the whole catalogue.
# Result: an ungrouped table with columns decades, genre, n and percent.
movie_genres <- netflix_dataset %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  mutate(decades = 10 * (release_year %/% 10)) %>%
  group_by(decades, genre = listed_in) %>%
  # drop_last keeps the per-decade grouping needed for the share below and
  # avoids the summarise() `.groups` message
  summarise(n = n(), .groups = "drop_last") %>%
  # share of all movie genre tags within the decade; the generic "Movies" tag
  # counts towards the denominator and is removed from the output afterwards
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  filter(genre != "Movies")

## `summarise()` has grouped output by 'decades'. You can override using the
## `.groups` argument.

## now can select and compare between decades (1980s and 2010s as an example)

# Bar chart of movie genre shares for a single decade of `movie_genres`.
#   decade_start: value of `decades` to keep (e.g. 1980)
#   bar_fill:     fill colour of the bars
#   plot_title:   title shown above the chart
plot_decade_genres <- function(decade_start, bar_fill, plot_title) {
  movie_genres %>%
    filter(decades == decade_start) %>%
    mutate(genre = fct_reorder(genre, n)) %>%
    ggplot(mapping = aes(x = percent, y = genre)) +
    geom_col(fill = bar_fill, colour = "#e9ecef") +
    labs(y = "", title = plot_title) +
    theme_bw()
}

plot_1980 <- plot_decade_genres(1980, "#8de2eb", "Movie genres 1980s")
plot_2010 <- plot_decade_genres(2010, "#8da2eb", "Movie genres 2010s")

# Show both decades side by side
grid.arrange(plot_1980, plot_2010, ncol = 2)

Explore added-date and ratings

library(lubridate)

# Parse the month-day-year strings in date_added into dates
netflix_dataset$date_added <- mdy(netflix_dataset$date_added)

# Titles added per calendar year and type, ignoring rows without a parsed date
count(
  filter(netflix_dataset, !is.na(date_added)),
  year(date_added),
  type
)

##    year(date_added)    type    n
## 1              2008   Movie    1
## 2              2008 TV Show    1
## 3              2009   Movie    2
## 4              2010   Movie    1
## 5              2011   Movie   13
## 6              2012   Movie    3
## 7              2013   Movie    6
## 8              2013 TV Show    5
## 9              2014   Movie   19
## 10             2014 TV Show    6
## 11             2015   Movie   58
## 12             2015 TV Show   30
## 13             2016   Movie  258
## 14             2016 TV Show  185
## 15             2017   Movie  864
## 16             2017 TV Show  361
## 17             2018   Movie 1255
## 18             2018 TV Show  430
## 19             2019   Movie 1497
## 20             2019 TV Show  656
## 21             2020   Movie 1312
## 22             2020 TV Show  697
## 23             2021   Movie   88
## 24             2021 TV Show   29

## Most titles were added from 2015 onwards, so every earlier year is folded
## into 2015
netflix_dataset$year_added <- pmax(year(netflix_dataset$date_added), 2015)

# Stacked area chart of the number of titles added per year, split by type
netflix_dataset %>%
  filter(!is.na(date_added)) %>%
  count(year_added, type) %>%
  ggplot(mapping = aes(x = year_added, y = n, fill = type)) +
  geom_area()

## explore ratings by type and added years (top 4 rating types)
# Within each type, the four most frequent ratings are kept and the rest are
# lumped into "Other"; the chart shows each rating's share of the titles added
# per year.
netflix_dataset %>%
  # read.csv() loads blank fields as "" rather than NA, so blank ratings have
  # to be excluded explicitly or they end up lumped in with real ratings
  filter(!is.na(date_added), !is.na(rating), rating != "") %>%
  group_by(type) %>%
  mutate(rating = fct_lump(rating, 4)) %>%
  count(type, year_added, rating) %>%
  group_by(type, year_added) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(aes(year_added, percent, fill = rating)) +
  geom_area() +
  facet_wrap(~type) +
  xlab("Year of being added") +
  theme_bw()

Country mapping

# Count titles per production country. A title listing several countries
# contributes one count to each of them.
# Result: a table with columns `region` (named to match the map data) and `n`.
netflix_country <- netflix_dataset %>%
  separate_rows(country, sep = ", ") %>%
  mutate(
    # defensive: strip stray commas/whitespace left at either end after
    # splitting, so such entries are not counted as separate countries
    country = gsub("^[,[:space:]]+|[,[:space:]]+$", "", country),
    # the world map names these regions "USA" and "UK"; without recoding, the
    # join below leaves them unmatched and they are drawn as missing
    # NOTE(review): other naming mismatches with the map data may remain
    region = recode(country,
                    "United States" = "USA",
                    "United Kingdom" = "UK")
  ) %>%
  filter(region != "") %>%
  count(region)

# World polygons (map_data() needs the `maps` package to be installed)
world <- map_data("world")

# Attach the title counts to every polygon vertex; regions without any titles
# keep n = NA
mapdata <- left_join(world, netflix_country, by = "region")

map1 <- ggplot(mapdata, aes(x = long, y = lat, group = group)) +
  geom_polygon(aes(fill = n), color = "black")

# Colour scale plus removal of axes and background, which carry no information
# on a map
map2 <- map1 +
  scale_fill_gradient(name = "total number of Netflix releases",
                      na.value = "grey50") +
  theme(axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank(),
        axis.title.y = element_blank(),
        axis.title.x = element_blank(),
        rect = element_blank())

## Bar plot would be a more direct overall representation.
# Stacked bars of title counts by type for the nine most frequent values of
# `country`; every other value is lumped together. The country field is used
# as-is here, i.e. it is not split into individual countries.
country_bar <- netflix_dataset %>%
  filter(!(is.na(country) | country == "")) %>%
  mutate(country = fct_lump(country, 9)) %>%
  count(country, type, sort = TRUE) %>%
  mutate(country = fct_reorder(country, n)) %>%
  ggplot(mapping = aes(x = n, y = country, fill = type)) +
  geom_col(alpha = 0.75)

# Display the map, then the bar chart
map2

country_bar

Relationship between country and mature ratings

# Share of titles with a mature rating (R or TV-MA), split by type, for the
# nine most frequent `country` values (all others lumped together).
netflix_dataset %>%
  # read.csv() loads blank fields as "" rather than NA, so blank ratings and
  # countries are excluded explicitly; otherwise unrated titles would inflate
  # the denominator
  filter(!is.na(rating), rating != "", !is.na(country), country != "") %>%
  group_by(type, country = fct_lump(country, 9)) %>%
  summarise(n_mature = sum(rating %in% c("R", "TV-MA")),
            n = n(),
            .groups = "drop") %>%
  mutate(percent_mature = n_mature / n,
         # 95% interval from Beta(n_mature + 0.5, n - n_mature + 0.5)
         # quantiles (Jeffreys interval for a binomial proportion)
         conf_low = qbeta(.025, n_mature + .5, n - n_mature + .5),
         conf_high = qbeta(.975, n_mature + .5, n - n_mature + .5)) %>%
  ggplot(aes(percent_mature, country, color = type)) +
  geom_point(aes(size = n)) +
  geom_errorbar(aes(xmin = conf_low, xmax = conf_high)) +
  scale_x_continuous(labels = scales::percent) +
  expand_limits(x = 0) +
  xlab("% of works that are mature rated (R/TV-MA)")

Netflix dataset (1) - exploring dataset and visualisation

YP

2024-12-10

Import data and load libraries

Change in movie duration over years

Change in number by type over years

Explore TV show and movie genres

Explore added-date and ratings

Country mapping

Relationship between country and mature ratings