Libraries
library(readr)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.2.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2
## ──
## ✔ ggplot2 3.4.0 ✔ purrr 1.0.1
## ✔ tibble 3.2.1 ✔ stringr 1.5.0
## ✔ tidyr 1.3.0 ✔ forcats 1.0.0
## Warning: package 'tibble' was built under R version 4.2.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(ggplot2)
library(data.table)
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.2.3
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
Importing dataset
# Read the raw Netflix catalogue from the local CSV export.
netflix_titles <- read_csv(
  file = "C:/Users/HP/Downloads/netflix_titles.csv/netflix_titles.csv"
)
## Rows: 8807 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): show_id, type, title, director, cast, country, date_added, rating,...
## dbl (1): release_year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the imported data in the interactive data viewer for a visual check.
View(netflix_titles)
Making a copy of the dataset and omitting missing values
# Work on a copy so the imported `netflix_titles` object stays untouched.
netflix <- netflix_titles
# Preview of the rows without any missing value; the result is only printed,
# it is not stored back into `netflix`.
netflix %>% na.omit()
## # A tibble: 5,332 × 12
## show_id type title director cast country date_added release_year rating
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl> <chr>
## 1 s8 Movie Sankofa Haile G… Kofi… United… September… 1993 TV-MA
## 2 s9 TV Show The Gr… Andy De… Mel … United… September… 2021 TV-14
## 3 s10 Movie The St… Theodor… Meli… United… September… 2021 PG-13
## 4 s13 Movie Je Sui… Christi… Luna… German… September… 2021 TV-MA
## 5 s25 Movie Jeans S. Shan… Pras… India September… 1998 TV-14
## 6 s28 Movie Grown … Dennis … Adam… United… September… 2010 PG-13
## 7 s29 Movie Dark S… Scott S… Keri… United… September… 2013 PG-13
## 8 s30 Movie Parano… Robert … Liam… United… September… 2013 PG-13
## 9 s39 Movie Birth … George … Bill… China,… September… 2017 PG-13
## 10 s42 Movie Jaws Steven … Roy … United… September… 1975 PG
## # ℹ 5,322 more rows
## # ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>
Remove duplicate rows
# Start again from the raw import: drop exact duplicate rows, then drop every
# row that has a missing value in any column.
netflix <- netflix_titles %>%
  unique() %>%
  na.omit()
Check which movies and TV shows were made by selected directors
# Split the cleaned catalogue by content type.
movies <- filter(netflix, type == "Movie")
view(movies)

tvseries <- filter(netflix, type == "TV Show")
view(tvseries)

# Titles of individual directors. `shankar_movie` and `Jaychapman` look only
# at movies; `vinod` and `JosephSargent` look at the whole catalogue.
shankar_movie <- filter(movies, director == "S. Shankar")
view(shankar_movie)

# Note: `vinod` holds the titles directed by Rajiv Chilaka.
vinod <- filter(netflix, director == "Rajiv Chilaka")
view(vinod)

JosephSargent <- filter(netflix, director == "Joseph Sargent")
view(JosephSargent)

Jaychapman <- filter(movies, director == "Jay Chapman")
view(Jaychapman)
Check the count of movies and TV shows.
# Number of TV-show rows in the cleaned data. Despite the variable name this
# counts titles (rows), not distinct directors.
tv_show_directors_count <- nrow(filter(netflix, type == "TV Show"))
view(tv_show_directors_count)

# Number of movie rows in the cleaned data (again a count of titles).
movie_directors_count <- nrow(filter(netflix, type == "Movie"))
view(movie_directors_count)
Count rows and columns in the netflix data
# Dimensions of the cleaned data. `count_rows` is the one-row tibble returned
# by count() (row total in column `n`); `count_cols` is a plain integer.
count_cols <- ncol(netflix)
count_rows <- netflix %>% count()
Data cleaning
# Transforming to data table
netflix <- as.data.table(netflix)

# Parse the "Month day, year" strings in date_added into dates
netflix[, date_added := mdy(date_added)]

# Year in which the content was added
netflix[, year_added := year(date_added)]

# Split duration (e.g. "90 min", "2 Seasons") on the space into its number
# and its unit. tstrsplit() transposes the split pieces directly and pads
# short entries with NA instead of silently recycling them.
netflix[
  ,
  c("duration", "duration_unit") :=
    tstrsplit(duration, " ", fixed = TRUE, keep = 1:2)
]

# The numeric part of the duration is still text after the split
netflix[, duration := as.numeric(duration)]

# Content details: "<type>, <genres>"
netflix[, content_details := paste0(type, ", ", listed_in)]

# Release decade, filled in for movies only (stays NA for TV shows).
# 10 * (x %/% 10) is already numeric, so no further conversion is needed.
netflix[type == "Movie", decade := 10 * (release_year %/% 10)]
Creating new table
# creating new data table of countries to be used for further mapping:
# one row per (raw country string, year_added) with the number of titles
countries <- netflix[, .(count = .N), by = .(country, year_added)]
countries <- drop_na(countries)

# extracting each country and the year their content was added to Netflix.
# A title can list several comma-separated countries; each of them gets its
# own row and inherits the title count of the combined entry.
countries <- countries %>%
  ungroup() %>%
  separate_rows(country, sep = ",") %>%
  mutate(country = str_trim(country)) %>%
  filter(country != "NA", country != "") %>%
  group_by(year_added, country) %>%
  # Add up the title counts. n() would only count how many distinct country
  # combinations mention the country, not how many titles there are.
  summarize(count = sum(count)) %>%
  ungroup() %>%
  arrange(year_added, desc(count))
## `summarise()` has grouped output by 'year_added'. You can override using the
## `.groups` argument.
# Flag the rows of `countries` that contain at least one missing value.
# complete.cases() checks the columns directly, without the character-matrix
# coercion that apply() performs on a data frame.
row.has.na <- !complete.cases(countries)
# Number of rows with missing values
sum(row.has.na)
## [1] 0
Create country codes
# Keep only the rows without missing values and continue as a data.table
countries <- as.data.table(countries[!row.has.na, ])

# Look up the two- and three-letter ISO codes from the country names
countries[, `:=`(
  iso2 = countrycode::countryname(country, destination = "iso2c"),
  iso3 = countrycode::countryname(country, destination = "iso3c")
)]

# The year column is called "year" from here on
names(countries)[names(countries) == "year_added"] <- "year"

# MPA film rating categories
MPA_ratings <- c("G", "PG", "PG-13", "R", "NC-17")
Create world correlations: data summary
# data summary
# NOTE(review): the previous calls passed a formula to summary(), which only
# describes the formula object itself (Length / Class / Mode) and never reads
# `netflix`. The formula syntax resembles modelsummary::datasummary(); that
# package is not loaded in this file, so the tables are computed with
# data.table instead. Confirm that this matches the intended report.

# Netflix content type: number of titles and share of the catalogue (%)
netflix[
  ,
  .(N = .N, Percent = 100 * .N / nrow(netflix)),
  by = .(Type = type)
]

# Rating categories: number of titles and share of the catalogue (%)
netflix[
  ,
  .(N = .N, Percent = 100 * .N / nrow(netflix)),
  by = .(Rating = rating)
]

# Duration summary per content type
netflix[
  ,
  .(
    Min = min(duration, na.rm = TRUE),
    Max = max(duration, na.rm = TRUE),
    Mean = mean(duration, na.rm = TRUE),
    Median = median(duration, na.rm = TRUE),
    N = sum(!is.na(duration))
  ),
  by = .(Type = type)
]

# Year-added summary per content type
netflix[
  ,
  .(
    Min = min(year_added, na.rm = TRUE),
    Max = max(year_added, na.rm = TRUE),
    N = sum(!is.na(year_added))
  ),
  by = .(Type = type)
]
Content Distributions.
# content distribution
# Stacked bar chart: number of titles per year_added, coloured by type
ggplot(netflix, aes(x = year_added, fill = type)) +
  geom_bar() +
  labs(
    title = "Netflix Content Distribution",
    x = "Year",
    y = "Count",
    fill = "Type"
  ) +
  theme(
    legend.position = "top",
    panel.border = element_blank(),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = NA)
  )
Graph shows movie duration distributions.
# movies duration distribution
# Bar chart of movie durations: one bar per distinct duration value
ggplot(netflix[type == "Movie"], aes(x = duration)) +
  geom_bar(fill = "red", alpha = 0.8) +
  labs(
    title = "Netflix Movies Duration Distribution",
    x = "Duration (in minutes) ",
    y = "Count"
  ) +
  theme(
    legend.position = "top",
    panel.border = element_blank(),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = NA)
  )
Tv shows duration distributions.
# tv shows seasons distribution
# Bar chart of TV shows: one bar per distinct value of the duration column
ggplot(netflix[type == "TV Show"], aes(x = duration)) +
  geom_bar(fill = "purple", alpha = 0.8) +
  labs(
    title = "Netflix TV Shows Seasons Distribution",
    x = "Number of Seasons",
    y = "Count"
  ) +
  theme(
    legend.position = "top",
    panel.border = element_blank(),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = NA)
  )
Graph of the top content-providing countries.
# Total content per country over all years, keeping the ten largest.
# head(x, 10L) is used on the sorted table itself: the previous
# `[head(1:10)]` index evaluated to 1:6 because head() defaults to n = 6,
# so only six countries were ever plotted.
ggplot(
  head(
    countries[, .(total = sum(count)), by = country][order(-total)],
    10L
  ),
  aes(x = country, y = total)
) +
  geom_col(fill = "#440154", alpha = 0.8) +
  xlab("Country") +
  ylab("Number of contents") +
  labs(title = "Number of Netflix Content per Country") +
  theme(
    legend.position = "top",
    panel.border = element_blank(),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5),
    panel.background = element_rect(fill = NA)
  )
How has the production of movies and TV shows changed across the years?
# Make sure `netflix` is a data.table before using the [i, j, by] syntax
netflix <- as.data.table(netflix)

# Line chart: number of titles per release year, one line per content type
ggplot(
  netflix[, .(count = .N), by = .(type, release_year)],
  aes(x = release_year, y = count, group = type, color = type)
) +
  geom_line() +
  geom_point() +
  labs(
    title = "Netflix Movies and TV shows per year",
    x = "Release Year",
    y = "Number of Movies / TV shows",
    color = "Type",
    group = "Type"
  ) +
  theme(
    legend.position = "top",
    panel.background = element_rect(fill = NA),
    panel.border = element_blank(),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5)
  )
How has the average movie duration changed across the decades?
# Box plot of movie durations, one box per release decade
ggplot(
  netflix[type == "Movie"],
  aes(x = decade, y = duration, group = decade, fill = decade)
) +
  geom_boxplot() +
  labs(
    title = "Movies Duration across Decades",
    x = "Decade",
    y = "Duration",
    fill = "Decade",
    group = "Decade"
  ) +
  theme(
    legend.position = "top",
    panel.background = element_rect(fill = NA),
    panel.border = element_blank(),
    axis.text = element_text(size = 8),
    plot.title = element_text(size = 12L, face = "bold", hjust = 0.5)
  )