Netflix Data Analysis

Libraries

library(readr)
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──

## ✔ ggplot2 3.4.0     ✔ purrr   1.0.1
## ✔ tibble  3.2.1     ✔ stringr 1.5.0
## ✔ tidyr   1.3.0     ✔ forcats 1.0.0

## Warning: package 'tibble' was built under R version 4.2.3

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(ggplot2)
library(data.table)

## 
## Attaching package: 'data.table'
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(lubridate)

## Warning: package 'lubridate' was built under R version 4.2.3

## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

Importing dataset

# Location of the raw Netflix titles CSV.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
netflix_csv_path <- "C:/Users/HP/Downloads/netflix_titles.csv/netflix_titles.csv"

# Read the raw data into a tibble (column types are guessed by readr).
netflix_titles <- read_csv(file = netflix_csv_path)

## Rows: 8807 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): show_id, type, title, director, cast, country, date_added, rating,...
## dbl  (1): release_year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Open the imported data in the data viewer for a visual check.
# NOTE(review): presumably only useful in an interactive session - confirm
# before running this script in batch mode.
View(netflix_titles)

Making a copy of the dataset and omitting missing values

# Keep the raw import untouched and work on a copy.
netflix <- netflix_titles

# Preview the rows that have no missing values.
# The result is only printed here; it is not stored.
netflix %>%
  na.omit()

## # A tibble: 5,332 × 12
##    show_id type    title   director cast  country date_added release_year rating
##    <chr>   <chr>   <chr>   <chr>    <chr> <chr>   <chr>             <dbl> <chr> 
##  1 s8      Movie   Sankofa Haile G… Kofi… United… September…         1993 TV-MA 
##  2 s9      TV Show The Gr… Andy De… Mel … United… September…         2021 TV-14 
##  3 s10     Movie   The St… Theodor… Meli… United… September…         2021 PG-13 
##  4 s13     Movie   Je Sui… Christi… Luna… German… September…         2021 TV-MA 
##  5 s25     Movie   Jeans   S. Shan… Pras… India   September…         1998 TV-14 
##  6 s28     Movie   Grown … Dennis … Adam… United… September…         2010 PG-13 
##  7 s29     Movie   Dark S… Scott S… Keri… United… September…         2013 PG-13 
##  8 s30     Movie   Parano… Robert … Liam… United… September…         2013 PG-13 
##  9 s39     Movie   Birth … George … Bill… China,… September…         2017 PG-13 
## 10 s42     Movie   Jaws    Steven … Roy … United… September…         1975 PG    
## # ℹ 5,322 more rows
## # ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>

Remove duplicate rows

# Rebuild the working copy from the raw import:
# drop exact duplicate rows first, then every row with a missing value.
netflix <- netflix_titles %>%
  unique() %>%
  na.omit()

Checking which movies and TV shows were made by which director

# Split the catalogue by content type ----
movies <- filter(netflix, type == "Movie")
view(movies)

tvseries <- filter(netflix, type == "TV Show")
view(tvseries)

# Titles by selected directors ----
# Movies directed by S. Shankar
shankar_movie <- filter(movies, director == "S. Shankar")
view(shankar_movie)

# All titles directed by Rajiv Chilaka
# (note: the variable is named `vinod`, but the filter is on Rajiv Chilaka)
vinod <- filter(netflix, director == "Rajiv Chilaka")
view(vinod)

# All titles directed by Joseph Sargent
JosephSargent <- filter(netflix, director == "Joseph Sargent")
view(JosephSargent)

# Movies directed by Jay Chapman
Jaychapman <- filter(movies, director == "Jay Chapman")
view(Jaychapman)

Check the count of movies and TV shows.

# Number of rows (titles) per content type.
# Despite the "directors" in the variable names, these are title counts:
# each value is the number of rows of `netflix` with the given type.
tv_show_directors_count <- nrow(filter(netflix, type == "TV Show"))
view(tv_show_directors_count)

movie_directors_count <- nrow(filter(netflix, type == "Movie"))
view(movie_directors_count)

Count rows and columns in the Netflix dataset

# Row count as a one-row table with a single column `n` (dplyr::count).
count_rows <- netflix %>%
  count()

# Column count as a plain integer (second element of the dimensions).
count_cols <- dim(netflix)[2]

Data cleaning

# Data cleaning ----
# Convert to a data.table so columns can be added/updated by reference with
# `:=` (no `netflix <- netflix[...]` reassignment needed).
netflix <- as.data.table(netflix)

# Parse the date-added text (month-day-year) into a Date.
netflix[, date_added := mdy(date_added)]

# Year in which the title was added to Netflix.
netflix[, year_added := year(date_added)]

# Split `duration` (e.g. "90 min", "2 Seasons") into its number and its unit.
# tstrsplit() transposes the split result column-wise and pads short rows
# with NA, instead of silently recycling values the way Map(c, ...) does.
netflix[, c("duration", "duration_unit") := tstrsplit(duration, " ", fixed = TRUE)]

# The numeric part of the duration is still text after the split.
netflix[, duration := as.numeric(duration)]

# Combined description: content type followed by its listed genres.
netflix[, content_details := paste0(type, ", ", listed_in)]

# Release decade, filled in for movies only (NA for every other row).
# The result of `10 * (...)` is already numeric, so no conversion is needed.
netflix[type == "Movie", decade := 10 * (release_year %/% 10)]

Creating new table

# Number of titles per (country string, year added), to be used for mapping.
# `country` can still hold several comma-separated countries at this point.
countries <- netflix[, .(count = .N), by = .(country, year_added)]
countries <- drop_na(countries)

# One row per individual country and year, with the number of titles added.
# The per-combination title counts are SUMMED here: using n() would only
# count how many distinct country combinations mention a country, not how
# many titles it has.
countries <- countries %>%
  as_tibble() %>%
  separate_rows(country, sep = ",") %>%
  mutate(country = str_trim(country)) %>%
  filter(country != "NA", country != "") %>%
  group_by(year_added, country) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  arrange(year_added, desc(count))

# Flag the rows of `countries` that contain at least one missing value.
# This is a logical vector (one entry per row), not a function.
# complete.cases() checks the columns directly, without coercing the whole
# table to a character matrix the way apply() does.
row.has.na <- !complete.cases(countries)

# Number of rows with a missing value.
sum(row.has.na)

## [1] 0

Create country codes

# Keep only the rows without missing values, as a data.table.
countries <- as.data.table(countries[!row.has.na, ])

# Add the two- and three-letter ISO country codes derived from the name.
countries[, `:=`(
  iso2 = countrycode::countryname(country, destination = "iso2c"),
  iso3 = countrycode::countryname(country, destination = "iso3c")
)]

# Rename the column `year_added` to `year`.
setnames(countries, "year_added", "year", skip_absent = TRUE)

# Rating labels of the MPA (Motion Picture Association) film rating system.
MPA_ratings <- c("G", "PG", "PG-13", "R", "NC-17")

Create world correlations: data summary

# Data summaries ----
# NOTE: calling summary() on a formula only describes the formula object
# (Length / Class / Mode) and ignores `data` and `title`, so the summary
# tables are built explicitly with dplyr instead.

# Netflix Content Type: number and share of titles per type
netflix %>%
  as_tibble() %>%
  count(Type = type, name = "N") %>%
  mutate(Percent = 100 * N / sum(N))

# Rating Categories: number and share of titles per rating
netflix %>%
  as_tibble() %>%
  count(Rating = rating, name = "N") %>%
  mutate(Percent = 100 * N / sum(N))

# Duration Summary: duration statistics per type
# (`duration` is in minutes for movies and in seasons for TV shows)
netflix %>%
  as_tibble() %>%
  group_by(Type = type) %>%
  summarise(
    Min = min(duration, na.rm = TRUE),
    Max = max(duration, na.rm = TRUE),
    Mean = mean(duration, na.rm = TRUE),
    Median = median(duration, na.rm = TRUE),
    N = n(),
    .groups = "drop"
  )

# Year Added Summary: first and last year of addition per type
netflix %>%
  as_tibble() %>%
  group_by(Type = type) %>%
  summarise(
    Min = min(year_added, na.rm = TRUE),
    Max = max(year_added, na.rm = TRUE),
    N = n(),
    .groups = "drop"
  )

Content Distributions.

# Stacked bar chart: number of titles added per year, coloured by type.
ggplot(netflix, aes(x = year_added, fill = type)) +
  geom_bar() +
  labs(
    title = "Netflix Content Distribution",
    x = "Year",
    y = "Count",
    fill = "Type"
  ) +
  theme(
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 8),
    panel.background = element_rect(fill = NA),
    panel.border = element_blank(),
    legend.position = "top"
  )

Graph showing the distribution of movie durations.

# Bar chart: number of movies for each duration value (in minutes).
ggplot(netflix[type == "Movie"], aes(x = duration)) +
  geom_bar(fill = "red", alpha = 0.8) +
  labs(
    title = "Netflix Movies Duration Distribution",
    x = "Duration (in minutes) ",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 8),
    panel.background = element_rect(fill = NA),
    panel.border = element_blank(),
    legend.position = "top"
  )

TV show duration (number of seasons) distribution.

# Bar chart: number of TV shows for each season count.
ggplot(netflix[type == "TV Show"], aes(x = duration)) +
  geom_bar(fill = "purple", alpha = 0.8) +
  labs(
    title = "Netflix TV Shows Seasons Distribution",
    x = "Number of Seasons",
    y = "Count"
  ) +
  theme(
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    axis.text = element_text(size = 8),
    panel.background = element_rect(fill = NA),
    panel.border = element_blank(),
    legend.position = "top"
  )

Netflix Data Analysis

Simran

2023-09-18