Acknowledgements
# Load the Netflix titles catalogue from CSV and list its column names.
netflix_dataset <- read.csv(
  file = "/Users/macbookpro/Desktop/netflix_titles.csv"
)
names(netflix_dataset)
## [1] "show_id" "type" "title" "director" "cast"
## [6] "country" "date_added" "release_year" "rating" "duration"
## [11] "listed_in" "description"
library(ggplot2)
library(dplyr)
library(hrbrthemes)
library(tidyr)
library(tidyverse)
library(forcats)
library(gridExtra)
# Make the hrbrthemes "ipsum" theme the default for the plots that follow.
theme_set(theme_ipsum())
# Inspect data: which distinct title types does the catalogue contain?
unique(netflix_dataset[["type"]])
## [1] "TV Show" "Movie"
# Keep only the rows whose type is "Movie".
netflix_movies <- filter(netflix_dataset, type == "Movie")
# Preview the raw duration values of the first five movies.
select(netflix_movies[1:5, ], duration)
## duration
## 1 93 min
## 2 78 min
## 3 80 min
## 4 123 min
## 5 95 min
# Split the duration string at the space into a value column ("duration")
# and a unit column ("duration units"); convert = TRUE lets tidyr re-type
# the resulting columns (e.g. "93" becomes a number).
netflix_movies <- separate(
  data = netflix_movies,
  col = duration,
  into = c("duration", "duration units"),
  sep = " ",
  convert = TRUE
)
# Box plot of movie duration, one box per release decade.
netflix_movies %>%
  # bucket release years into decades for a cleaner visualisation
  mutate(decades = (release_year %/% 10) * 10) %>%
  ggplot(mapping = aes(x = decades, y = duration, group = decades)) +
  geom_boxplot() +
  labs(title = "Change in movie duration over decades")
# Line chart: for each type, the fraction of that type's titles released in
# each year (counts are normalised within type, not across the catalogue).
netflix_dataset %>%
  rename(year = release_year) %>%
  count(year, type) %>%
  group_by(type) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(mapping = aes(x = year, y = percent, colour = type)) +
  geom_line() +
  labs(title = "Change in numbers over years by type")
The plot shows that TV shows saw a steeper increase than movies from roughly 2015 onwards.
# Genre breakdown for TV shows.
# listed_in holds a comma-separated genre list, so each title is expanded to
# one row per genre before counting. Shares are computed within each type
# (summarise() leaves the result grouped by type) and before the catch-all
# "TV Shows" label is removed.
tvshow_genres <- netflix_dataset %>%
  separate_rows(listed_in, sep = ", ") %>%
  rename(genre = listed_in) %>%
  group_by(type, genre) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n)) %>%
  arrange(desc(n)) %>%
  filter(type == "TV Show", genre != "TV Shows")
## `summarise()` has grouped output by 'type'. You can override using the
## `.groups` argument.
# Horizontal bar chart of TV-show genre shares, bars ordered by genre count.
tvshow_genres %>%
  mutate(genre = fct_reorder(genre, n)) %>%
  ggplot(mapping = aes(x = percent, y = genre)) +
  geom_col(colour = "#e9ecef", fill = "#69b3a2") +
  labs(title = "Percentage of TV show genres") +
  theme_bw()
# Same genre breakdown for movies, except that shares are computed within
# each release decade (summarise() leaves the result grouped by decades).
# The catch-all "Movies" label is dropped after the shares are computed.
movie_genres <- netflix_dataset %>%
  filter(type == "Movie") %>%
  separate_rows(listed_in, sep = ", ") %>%
  transmute(
    decades = (release_year %/% 10) * 10,
    genre = listed_in
  ) %>%
  group_by(decades, genre) %>%
  summarise(n = n()) %>%
  mutate(percent = n / sum(n)) %>%
  filter(genre != "Movies")
## `summarise()` has grouped output by 'decades'. You can override using the
## `.groups` argument.
## Compare genre shares between decades (1980s and 2010s as an example).

# Build a horizontal bar chart of genre shares for one release decade.
#   genre_data: data frame with columns decades, genre, n and percent
#   decade:     first year of the decade to plot, e.g. 1980
#   bar_fill:   fill colour of the bars
# Returns a ggplot object titled "Movie genres <decade>s", with genres
# ordered by their count within that decade.
plot_decade_genres <- function(genre_data, decade, bar_fill) {
  genre_data %>%
    filter(decades == decade) %>%
    mutate(genre = fct_reorder(genre, n)) %>%
    ggplot(aes(percent, genre)) +
    geom_col(fill = bar_fill, color = "#e9ecef") +
    ylab("") +
    ggtitle(paste0("Movie genres ", decade, "s")) +
    theme_bw()
}

plot_1980 <- plot_decade_genres(movie_genres, 1980, "#8de2eb")
plot_2010 <- plot_decade_genres(movie_genres, 2010, "#8da2eb")

# Show both decades side by side.
grid.arrange(plot_1980, plot_2010, ncol = 2)
library(lubridate)
# Parse date_added with lubridate's month-day-year parser.
netflix_dataset$date_added <- mdy(netflix_dataset$date_added)
# Titles added per calendar year and type; rows without a parsed date are
# dropped first.
netflix_dataset %>%
  drop_na(date_added) %>%
  count(year(date_added), type)
## year(date_added) type n
## 1 2008 Movie 1
## 2 2008 TV Show 1
## 3 2009 Movie 2
## 4 2010 Movie 1
## 5 2011 Movie 13
## 6 2012 Movie 3
## 7 2013 Movie 6
## 8 2013 TV Show 5
## 9 2014 Movie 19
## 10 2014 TV Show 6
## 11 2015 Movie 58
## 12 2015 TV Show 30
## 13 2016 Movie 258
## 14 2016 TV Show 185
## 15 2017 Movie 864
## 16 2017 TV Show 361
## 17 2018 Movie 1255
## 18 2018 TV Show 430
## 19 2019 Movie 1497
## 20 2019 TV Show 656
## 21 2020 Movie 1312
## 22 2020 TV Show 697
## 23 2021 Movie 88
## 24 2021 TV Show 29
## It shows that most movies were added post 2015
# Year each title was added, clamped to a minimum of 2015 so that the few
# earlier additions are folded into the 2015 bucket.
netflix_dataset$year_added <- pmax(year(netflix_dataset$date_added), 2015)
# Stacked area chart of the number of titles added per (clamped) year by type.
netflix_dataset %>%
  drop_na(date_added) %>%
  count(year_added, type) %>%
  ggplot(mapping = aes(x = year_added, y = n, fill = type)) +
  geom_area()
## Explore ratings by type and year added. Within each type the 4 most
## common ratings are kept and the remainder is lumped together.
netflix_dataset %>%
  # read.csv() leaves blank cells as "" rather than NA, so a missing rating
  # has to be excluded explicitly (same guard the country code uses).
  filter(!is.na(date_added), !is.na(rating), rating != "") %>%
  group_by(type) %>%
  mutate(rating = fct_lump(rating, 4)) %>%
  count(type, year_added, rating) %>%
  # share of each rating within a given type and year
  group_by(type, year_added) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot(aes(year_added, percent, fill = rating)) +
  geom_area() +
  facet_wrap(~type) +
  xlab("Year of being added") +
  theme_bw()
# Count titles per producing country and attach the counts to world-map
# polygons. A title with several countries contributes to each of them.
netflix_country <- netflix_dataset %>%
  # split on the bare comma and trim afterwards so that entries with a
  # trailing comma or stray whitespace do not become distinct "countries"
  separate_rows(country, sep = ",") %>%
  mutate(country = str_trim(country)) %>%
  filter(!is.na(country), country != "") %>%
  # map_data("world") labels these regions "USA" and "UK"; without the
  # recode the two join as NA and are drawn as missing on the map.
  # NOTE(review): other names (e.g. historical states) may still not match
  # a map region and will stay unmatched.
  mutate(region = case_when(
    country == "United States" ~ "USA",
    country == "United Kingdom" ~ "UK",
    TRUE ~ country
  )) %>%
  count(region)
world <- map_data("world")
# Left join keeps every map polygon; regions without titles get n = NA.
mapdata <- left_join(world, netflix_country, by = "region")
# Base choropleth: one polygon per map group, filled by the title count n.
map1 <- ggplot(
  data = mapdata,
  mapping = aes(x = long, y = lat, group = group)
) +
  geom_polygon(mapping = aes(fill = n), colour = "black")
# Final map: labelled fill scale (grey for regions without data) with axis
# text, ticks, titles and background rectangles removed.
map2 <- map1 +
  scale_fill_gradient(
    na.value = "grey50",
    name = "total number of Netflix releases"
  ) +
  theme(
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    axis.text.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks = element_blank(),
    rect = element_blank()
  )
## A bar plot is a more direct overall representation than the map.
# Stacked bars of title counts for the 9 most frequent country values (all
# others lumped together), split by type. The country field is used as-is
# here, i.e. multi-country entries are not split.
country_bar <- netflix_dataset %>%
  filter(!is.na(country), country != "") %>%
  mutate(country = fct_lump(country, 9)) %>%
  count(country, type, sort = TRUE) %>%
  mutate(country = fct_reorder(country, n)) %>%
  ggplot(mapping = aes(x = n, y = country, fill = type)) +
  geom_col(alpha = 0.75)
# Render the map followed by the bar chart.
map2
country_bar
# Share of mature-rated titles (R / TV-MA) per country and type, for the 9
# most frequent country values plus a lumped remainder.
netflix_dataset %>%
  # read.csv() leaves blank cells as "" rather than NA; a blank rating must
  # be dropped too, otherwise it inflates the denominator n.
  filter(
    !is.na(rating), rating != "",
    !is.na(country), country != ""
  ) %>%
  group_by(type, country = fct_lump(country, 9)) %>%
  summarise(
    n_mature = sum(rating %in% c("R", "TV-MA")),
    n = n(),
    .groups = "drop"
  ) %>%
  mutate(
    percent_mature = n_mature / n,
    # 95% Jeffreys interval for a binomial proportion (Beta(0.5, 0.5) prior)
    conf_low = qbeta(.025, n_mature + .5, n - n_mature + .5),
    conf_high = qbeta(.975, n_mature + .5, n - n_mature + .5)
  ) %>%
  ggplot(aes(percent_mature, country, color = type)) +
  geom_point(aes(size = n)) +
  geom_errorbar(aes(xmin = conf_low, xmax = conf_high)) +
  scale_x_continuous(labels = scales::percent) +
  expand_limits(x = 0) +
  xlab("% of works that are mature rated (R/TV-MA)")