Background
About this Dataset: Netflix is one of the most popular media and
video streaming platforms. It has over 8000 movies and TV shows
available on its platform and, as of mid-2021, over 200M
subscribers globally. This tabular dataset consists of listings of all
the movies and TV shows available on Netflix, along with details such as
cast, directors, ratings, release year, duration, etc.
library(ggplot2)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Loading required package: timechange
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the Netflix titles listing; readr guesses the column types and
# reports the column specification it settled on
netflix <- read_csv(file = "netflix_titles.csv")
## Rows: 8807 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): show_id, type, title, director, cast, country, date_added, rating,...
## dbl (1): release_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# date_added is read in as text; convert it to Date values with
# lubridate's month-day-year parser
netflix[["date_added"]] <- mdy(netflix[["date_added"]])
Main Questions
Which 5 countries had the most films added to Netflix in 2021?
What are the trends in releasing movies and TV shows in each continent?
What are the trends in the content released on Netflix over time in
each continent?
How are the ratings distributed across continents?
Trends of movies and TV shows being released on Netflix in each
continent over time
# The region-level analysis needs a lookup from country name to region, so
# read a second CSV file that carries that mapping
continents <- read_csv(file = "continents2.csv")
## Rows: 249 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, alpha-2, alpha-3, iso_3166-2, region, sub-region, intermediat...
## dbl (4): country-code, region-code, sub-region-code, intermediate-region-code
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep only what the later steps need: the region, the sub-region and the
# country name (used as the join key).
# The sub-region column is renamed to the syntactic name `sub_region` right
# inside select(). The previous follow-up statement assigned
# colnames("sub_region") to the column; colnames() of a character vector is
# NULL, so that statement deleted the column instead of renaming it.
region_n_sub <- continents %>%
  select(region, sub_region = `sub-region`, name)
# Discard lookup rows that have no region recorded
region_n_sub_clean <- filter(region_n_sub, !is.na(region))
# Attach the region information to every title by matching the title's
# country against the lookup's name column. An inner join keeps only rows
# with a match on both sides, so titles whose country value has no exact
# counterpart in the lookup are left out.
netflix_join <- inner_join(
  netflix,
  region_n_sub_clean,
  by = c("country" = "name")
)
# Count the titles for every combination of country, release year and region
netflix_join_region <- summarize(
  group_by(netflix_join, country, release_year, region),
  total = n()
)
## `summarise()` has grouped output by 'country', 'release_year'. You can override
## using the `.groups` argument.
# Scatterplot of the per-country title counts against release year, coloured
# by region so the regions can be compared; the y axis uses a log10 scale
ggplot(
  data = netflix_join_region,
  mapping = aes(x = release_year, y = total, color = region)
) +
  geom_point(alpha = 0.5) +
  scale_y_log10()

# Input table for the line chart: number of titles per region and release
# year.
# netflix_join_region already holds one row per country/year/region with its
# title count in `total`, so the counts are added up here. Using n() at this
# step would count the contributing countries instead of the titles.
netflix_cont <- netflix_join_region %>%
  group_by(region, release_year) %>%
  summarize(total = sum(total))
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
# One line per region showing how `total` changes with release year.
# coord_fixed() pins the scale ratio between the y and x axes to 1/2.
ggplot(
  data = netflix_cont,
  mapping = aes(x = release_year, y = total, color = region)
) +
  geom_line() +
  coord_fixed(ratio = 1/2)

Trends of the content released on Netflix over time in each
continent
There are only two types of content in this dataset: movies
and TV shows.
# Explore the type column: number of titles of each content type within
# each region
netflix_type_count <- summarize(
  group_by(netflix_join, region, type),
  total = n()
)
## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.
# For each content type, show the share of titles contributed by each region;
# position = "fill" stretches every bar to the same height
ggplot(
  data = netflix_type_count,
  mapping = aes(x = type, y = total, fill = region)
) +
  geom_col(position = "fill")

# Content types over time per region: count the titles by region, type and
# the year in which they were added to Netflix (extracted from date_added)
netflix_type_time <- netflix_join %>%
  group_by(region, type, year_added = year(date_added)) %>%
  summarize(total = n())
## `summarise()` has grouped output by 'region', 'type'. You can override using
## the `.groups` argument.
# Yearly counts as one line per content type, with a separate panel for
# every region
ggplot(
  data = netflix_type_time,
  mapping = aes(x = year_added, y = total, color = type)
) +
  geom_line() +
  facet_wrap(vars(region))

Rating distribution across continents
# Start the rating analysis with a simple count: number of titles per
# rating, leaving out titles that have no rating
netflix_rating <- netflix_join %>%
  filter(!is.na(rating)) %>%
  group_by(rating) %>%
  summarise(total = n())
# Sort the table from the most to the least frequent rating and turn rating
# into a factor whose levels follow the counts, so the bar chart is drawn
# in count order
netflix_rating <- netflix_rating %>%
  arrange(desc(total)) %>%
  mutate(rating = reorder(rating, total))
# Bar chart of the counts per rating; the axes are flipped so the rating
# labels are easier to read
ggplot(data = netflix_rating, mapping = aes(x = rating, y = total)) +
  geom_col() +
  coord_flip()

# Break the rating counts down by region, again leaving out titles that
# have no rating
netflix_rating_by_region <- netflix_join %>%
  filter(!is.na(rating)) %>%
  group_by(rating, region) %>%
  summarise(total = n())
## `summarise()` has grouped output by 'rating'. You can override using the
## `.groups` argument.
# For each rating, show the share of titles coming from each region as a
# full-height stacked bar, with the axes flipped for readability
ggplot(
  data = netflix_rating_by_region,
  mapping = aes(x = rating, y = total, fill = region)
) +
  geom_col(position = "fill") +
  coord_flip()

# Ratings over time per region: count the titles for each combination of
# region, year added and rating. The grouping expression is unnamed, so the
# resulting year column is literally called `year(date_added)`.
netflix_region_time <- summarize(
  group_by(netflix_join, region, year(date_added), rating),
  total = n()
)
## `summarise()` has grouped output by 'region', 'year(date_added)'. You can
## override using the `.groups` argument.
# Stacked bars of the yearly counts, filled by rating, with one panel per
# region
ggplot(
  data = netflix_region_time,
  mapping = aes(x = `year(date_added)`, y = total, fill = rating)
) +
  geom_col() +
  facet_wrap(vars(region))
## Warning: Removed 7 rows containing missing values (`position_stack()`).
