Background

About this dataset: Netflix is one of the most popular media and video streaming platforms. As of mid-2021, it has over 8,000 movies and TV shows available on its platform and over 200M subscribers globally. This tabular dataset consists of listings of all the movies and TV shows available on Netflix, along with details such as cast, directors, ratings, release year, and duration.

library(ggplot2)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(lubridate)

## Loading required package: timechange

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

# Load the Netflix title listings from the working directory.
netflix <- readr::read_csv(file = "netflix_titles.csv")

## Rows: 8807 Columns: 12

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): show_id, type, title, director, cast, country, date_added, rating,...
## dbl  (1): release_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Parse the month-day-year text in `date_added` into Date values.
netflix[["date_added"]] <- lubridate::mdy(netflix[["date_added"]])

Main Questions

Which 5 countries had the most films added to Netflix in 2021?

What are the trends in movie and TV show releases in each continent?

What are the trends in the content released on Netflix over time in each continent?

How are ratings distributed across continents?

Top 5 countries that have the most films added to Netflix in 2021

Trends of movies and TV shows being released on Netflix in each continent over time

# The region/continent analysis needs a country-to-region lookup, which is
# read from a second CSV file.
continents <- readr::read_csv(file = "continents2.csv")

## Rows: 249 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): name, alpha-2, alpha-3, iso_3166-2, region, sub-region, intermediat...
## dbl (4): country-code, region-code, sub-region-code, intermediate-region-code
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Keep only the lookup columns used later: the two region fields plus `name`,
# which serves as the join key.
region_n_sub <- dplyr::select(continents, region, `sub-region`, name)

# `sub-region` is not a syntactic R name (it needs backticks everywhere), so
# rename it to `sub_region`. rename() keeps the column's values; assigning
# colnames("sub_region") instead would assign NULL (a character string has no
# column names) and silently delete the column.
region_n_sub <- region_n_sub %>%
  rename(sub_region = `sub-region`)

# Drop lookup rows that have no region assigned.
region_n_sub_clean <- dplyr::filter(region_n_sub, !is.na(region))

# Attach region information to every title by matching the title's `country`
# against the lookup's `name`. Being an inner join, titles whose `country`
# has no exact match in the lookup are dropped.
netflix_join <- inner_join(
  x = netflix,
  y = region_n_sub_clean,
  by = c("country" = "name")
)

# Count titles for every country / release year / region combination.
netflix_join_region <- summarize(
  group_by(netflix_join, country, release_year, region),
  total = n()
)

## `summarise()` has grouped output by 'country', 'release_year'. You can override
## using the `.groups` argument.

# Scatterplot of the per-country yearly counts, coloured by region, to make
# release-year trends easier to compare between regions (log-scaled y axis).
ggplot(
  data = netflix_join_region,
  mapping = aes(x = release_year, y = total, colour = region)
) +
  geom_point(alpha = 0.5) +
  scale_y_log10()

# Line chart per region: first roll the per-country counts up to one row per
# region and release year.
# The input already holds one aggregated row per country/year, so the title
# count is the sum of its `total` column; n() here would count those rows
# (countries per region-year) rather than titles.
netflix_cont <- netflix_join_region %>%
  group_by(region, release_year) %>%
  summarize(total = sum(total))

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

# One line per region showing its yearly total.
ggplot(
  data = netflix_cont,
  mapping = aes(x = release_year, y = total, colour = region)
) +
  geom_line() +
  coord_fixed(ratio = 1 / 2)

Trends of the contents released on Netflix over time in each continent

There are only two types of content in this dataset: movies and TV shows.

# Explore the `type` column: count titles per region and content type.
netflix_type_count <- summarize(
  group_by(netflix_join, region, type),
  total = n()
)

## `summarise()` has grouped output by 'region'. You can override using the
## `.groups` argument.

# Bars normalised to proportions, showing each region's share of every type.
ggplot(
  data = netflix_type_count,
  mapping = aes(x = type, y = total, fill = region)
) +
  geom_col(position = "fill")

# Content type over time in each region.
# Build a table that also carries the year each title was added to Netflix
# (derived from `date_added`), then count titles per region, type and year.
netflix_type_time <- netflix_join %>%
  group_by(region, type, year_added = year(date_added)) %>%
  summarize(total = n())

## `summarise()` has grouped output by 'region', 'type'. You can override using
## the `.groups` argument.

# One panel per region, one line per content type.
ggplot(
  data = netflix_type_time,
  mapping = aes(x = year_added, y = total, colour = type)
) +
  geom_line() +
  facet_wrap(vars(region))

Rating distribution across continents

# Rating analysis: start with a simple count of titles per rating, leaving out
# titles whose rating is missing.
netflix_rating <- netflix_join %>%
  filter(!is.na(rating)) %>%
  count(rating, name = "total")

# Sort ratings from most to least frequent and turn `rating` into a factor
# whose levels follow the counts, so the bar chart is drawn in count order.
netflix_rating <- netflix_rating %>%
  arrange(desc(total)) %>%
  mutate(rating = reorder(rating, total))

# Bar chart of the counts; the axes are flipped so the rating labels are
# easier to read.
ggplot(data = netflix_rating, mapping = aes(x = rating, y = total)) +
  geom_col() +
  coord_flip()

# Break each rating down by region, ignoring titles with a missing rating.
netflix_rating_by_region <- netflix_join %>%
  filter(!is.na(rating)) %>%
  group_by(rating, region) %>%
  summarise(total = n())

## `summarise()` has grouped output by 'rating'. You can override using the
## `.groups` argument.

 # Proportion of every rating's titles that comes from each region, drawn as
 # horizontal bars.
 ggplot(
   data = netflix_rating_by_region,
   mapping = aes(x = rating, y = total, fill = region)
 ) +
   geom_col(position = "fill") +
   coord_flip()

# Rating mix over time, to be plotted as one subplot per region.
# First build the counts per region, year added and rating. The year grouping
# expression is unnamed, so its column is called `year(date_added)`.
netflix_region_time <- summarize(
  group_by(netflix_join, region, year(date_added), rating),
  total = n()
)

## `summarise()` has grouped output by 'region', 'year(date_added)'. You can
## override using the `.groups` argument.

# Then draw stacked bars of the yearly counts, filled by rating, with one
# panel per region.
ggplot(
  data = netflix_region_time,
  mapping = aes(x = `year(date_added)`, y = total, fill = rating)
) +
  geom_col() +
  facet_wrap(vars(region))

## Warning: Removed 7 rows containing missing values (`position_stack()`).

Netflix Analysis

Khoi Nguyen

2023-03-09