Netflix is known for its strong recommendation engines, which use a mix of content-based and collaborative filtering models to recommend TV shows and movies. In this task, one can create a recommendation engine based on text/description similarity techniques.
# Call library
library(ggplot2)
library(GGally)
library(ggthemes)
library(ggpubr)
library(leaflet)
library(lubridate)
library(readr)
library(dplyr)
library(tidyr)
library(glue)
library(plotly)
library(tidyverse)
library(flexdashboard)
library(shiny)
library(shinydashboard)
# Import data
# The two reads are independent: the lookup table is loaded with base R
# (plain data.frame), the Netflix catalogue with readr (tibble).
lat <- read.csv(file = "countries_lat.csv")
netflix1 <- read_csv(file = "netflix_titles.csv")
# Print each column's type together with a preview of its first values.
utils::str(netflix1)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 6234 obs. of 12 variables:
## $ show_id : num 81145628 80117401 70234439 80058654 80125979 ...
## $ type : chr "Movie" "Movie" "TV Show" "TV Show" ...
## $ title : chr "Norm of the North: King Sized Adventure" "Jandino: Whatever it Takes" "Transformers Prime" "Transformers: Robots in Disguise" ...
## $ director : chr "Richard Finn, Tim Maltby" NA NA NA ...
## $ cast : chr "Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, Jennifer Cameron, Jonathan Holmes, Lee Tockar, Lisa Duru"| __truncated__ "Jandino Asporaat" "Peter Cullen, Sumalee Montano, Frank Welker, Jeffrey Combs, Kevin Michael Richardson, Tania Gunadi, Josh Keaton"| __truncated__ "Will Friedle, Darren Criss, Constance Zimmer, Khary Payton, Mitchell Whitfield, Stuart Allan, Ted McGinley, Peter Cullen" ...
## $ country : chr "United States, India, South Korea, China" "United Kingdom" "United States" "United States" ...
## $ date_added : chr "9-Sep-19" "9-Sep-16" "8-Sep-18" "8-Sep-18" ...
## $ release_year: num 2019 2016 2013 2016 2017 ...
## $ rating : chr "TV-PG" "TV-MA" "TV-Y7-FV" "TV-Y7" ...
## $ duration : chr "90 min" "94 min" "1 Season" "1 Season" ...
## $ listed_in : chr "Children & Family Movies, Comedies" "Stand-Up Comedy" "Kids' TV" "Kids' TV" ...
## $ description : chr "Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from"| __truncated__ "Jandino Asporaat riffs on the challenges of raising kids and serenades the audience with a rousing rendition of"| __truncated__ "With the help of three human allies, the Autobots once again protect Earth from the onslaught of the Decepticon"| __truncated__ "When a prison ship crash unleashes hundreds of Decepticons on Earth, Bumblebee leads a new Autobot force to protect humankind." ...
## - attr(*, "spec")=
## .. cols(
## .. show_id = col_double(),
## .. type = col_character(),
## .. title = col_character(),
## .. director = col_character(),
## .. cast = col_character(),
## .. country = col_character(),
## .. date_added = col_character(),
## .. release_year = col_double(),
## .. rating = col_character(),
## .. duration = col_character(),
## .. listed_in = col_character(),
## .. description = col_character()
## .. )
# Count the missing values in every column of the raw data.
netflix1 %>%
  is.na() %>%
  colSums()
## show_id type title director cast country
## 0 0 0 1969 570 476
## date_added release_year rating duration listed_in description
## 11 0 10 0 0 0
# Per-column summary of the raw data.
netflix1 %>%
  summary()
## show_id type title director
## Min. : 247747 Length:6234 Length:6234 Length:6234
## 1st Qu.:80035802 Class :character Class :character Class :character
## Median :80163367 Mode :character Mode :character Mode :character
## Mean :76703679
## 3rd Qu.:80244889
## Max. :81235729
## cast country date_added release_year
## Length:6234 Length:6234 Length:6234 Min. :1925
## Class :character Class :character Class :character 1st Qu.:2013
## Mode :character Mode :character Mode :character Median :2016
## Mean :2013
## 3rd Qu.:2018
## Max. :2020
## rating duration listed_in description
## Length:6234 Length:6234 Length:6234 Length:6234
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
# 1. change data format of date_added
# The raw strings are day-month-year (e.g. "9-Sep-19"), so they must be parsed
# with dmy(); mdy() cannot read them and turns every value into NA.
netflix1$date_added <- dmy(netflix1$date_added)
# 2. Drop column Show ID
# Assign the result back: a bare pipeline only prints a modified copy and
# leaves `netflix1` itself unchanged, so the column would never be dropped.
netflix1 <- netflix1 %>%
  dplyr::select(-show_id)
# Preview the table without the id column.
netflix1
## # A tibble: 6,234 x 11
## type title director cast country date_added release_year rating duration
## <chr> <chr> <chr> <chr> <chr> <date> <dbl> <chr> <chr>
## 1 Movie Norm~ Richard~ Alan~ United~ NA 2019 TV-PG 90 min
## 2 Movie Jand~ <NA> Jand~ United~ NA 2016 TV-MA 94 min
## 3 TV S~ Tran~ <NA> Pete~ United~ NA 2013 TV-Y7~ 1 Season
## 4 TV S~ Tran~ <NA> Will~ United~ NA 2016 TV-Y7 1 Season
## 5 Movie #rea~ Fernand~ Nest~ United~ NA 2017 TV-14 99 min
## 6 TV S~ Apac~ <NA> Albe~ Spain NA 2016 TV-MA 1 Season
## 7 Movie Auto~ Gabe Ib~ Anto~ Bulgar~ NA 2014 R 110 min
## 8 Movie Fabr~ Rodrigo~ Fabr~ Chile NA 2017 TV-MA 60 min
## 9 TV S~ Fire~ <NA> <NA> United~ NA 2017 TV-MA 1 Season
## 10 Movie Good~ Henrik ~ Jame~ United~ NA 2014 R 90 min
## # ... with 6,224 more rows, and 2 more variables: listed_in <chr>,
## # description <chr>
# 3. Drop duplicate shows (same title, country, type and release year), then
# 4. drop the rows with a missing rating.
# Both steps run in a single pipeline so that step 4 builds on the
# de-duplicated data; restarting from `netflix1` would silently throw away
# the result of step 3.
netflix <- netflix1 %>%
  distinct(title, country, type, release_year, .keep_all = TRUE) %>%
  drop_na(rating)
netflix
## # A tibble: 6,224 x 12
## show_id type title director cast country date_added release_year rating
## <dbl> <chr> <chr> <chr> <chr> <chr> <date> <dbl> <chr>
## 1 8.11e7 Movie Norm~ Richard~ Alan~ United~ NA 2019 TV-PG
## 2 8.01e7 Movie Jand~ <NA> Jand~ United~ NA 2016 TV-MA
## 3 7.02e7 TV S~ Tran~ <NA> Pete~ United~ NA 2013 TV-Y7~
## 4 8.01e7 TV S~ Tran~ <NA> Will~ United~ NA 2016 TV-Y7
## 5 8.01e7 Movie #rea~ Fernand~ Nest~ United~ NA 2017 TV-14
## 6 8.02e7 TV S~ Apac~ <NA> Albe~ Spain NA 2016 TV-MA
## 7 7.03e7 Movie Auto~ Gabe Ib~ Anto~ Bulgar~ NA 2014 R
## 8 8.02e7 Movie Fabr~ Rodrigo~ Fabr~ Chile NA 2017 TV-MA
## 9 8.01e7 TV S~ Fire~ <NA> <NA> United~ NA 2017 TV-MA
## 10 7.03e7 Movie Good~ Henrik ~ Jame~ United~ NA 2014 R
## # ... with 6,214 more rows, and 3 more variables: duration <chr>,
## # listed_in <chr>, description <chr>
# Data processing: give every title a single headline category.
# case_when() keeps the first branch that matches, so the order below is the
# priority order when `listed_in` mentions several of these genres. Titles
# matching none of the five patterns get NA and are removed.
netflix_r1 <- netflix %>%
  mutate(
    category = case_when(
      str_detect(listed_in, "Action") ~ "Action",
      str_detect(listed_in, "Drama") ~ "Drama",
      str_detect(listed_in, "Anime") ~ "Anime",
      str_detect(listed_in, "Documentaries") ~ "Documentaries",
      str_detect(listed_in, "Comedies") ~ "Comedies"
    )
  ) %>%
  filter(!is.na(category))
# Number of titles for every category/type pair, plus the label shown in the
# plot tooltip. count() returns an ungrouped table with one row per pair.
netflix_r12 <- netflix_r1 %>%
  count(category, type, name = "total") %>%
  mutate(text = glue("Number of Movie/TV = {total}"))
# Horizontal dodged bar chart: title counts per category, split by type.
# The x/y arguments of labs() name the aesthetics (x = category, y = total),
# not the screen axes, so each label is attached to the aesthetic it
# describes; coord_flip() then draws the categories on the vertical axis and
# the counts on the horizontal one.
plot1 <- ggplot(
  data = netflix_r12,
  aes(x = reorder(category, total), y = total, text = text)
) +
  geom_col(aes(fill = type), position = "dodge") +
  coord_flip() +
  labs(
    title = "Number of Movie/TV Shows Released on Netflix",
    x = "Movie/ TV Shows Category",
    y = "Total Movie/TV Shows",
    caption = "Made by Meinari Claudia"
  )
# Interactive version; the tooltip shows only the prepared `text` aesthetic.
ggplotly(plot1, tooltip = "text")
There is a huge number of countries in the country column, so we will focus on the most productive ones: United States, India, United Kingdom, Japan, Canada, and Spain.
# Assign each title to one of the six focus countries.
# `country` can list several countries; case_when() keeps the first branch
# that matches, so the order below decides which one wins. Titles that mention
# none of the six (or have no country at all) get NA and are removed.
netflix_r2 <- netflix_r1 %>%
  mutate(
    country1 = case_when(
      str_detect(country, "United States") ~ "US",
      str_detect(country, "India") ~ "India",
      str_detect(country, "United Kingdom") ~ "UK",
      str_detect(country, "Japan") ~ "Japan",
      str_detect(country, "Canada") ~ "Canada",
      str_detect(country, "Spain") ~ "Spain"
    )
  ) %>%
  filter(!is.na(country1))
netflix_r2
## # A tibble: 3,182 x 14
## show_id type title director cast country date_added release_year rating
## <dbl> <chr> <chr> <chr> <chr> <chr> <date> <dbl> <chr>
## 1 8.11e7 Movie Norm~ Richard~ Alan~ United~ NA 2019 TV-PG
## 2 8.01e7 Movie #rea~ Fernand~ Nest~ United~ NA 2017 TV-14
## 3 7.03e7 Movie Good~ Henrik ~ Jame~ United~ NA 2014 R
## 4 7.03e7 Movie Kidn~ Daniel ~ Jim ~ Nether~ NA 2015 R
## 5 8.01e7 Movie Manh~ Tom O'B~ Tom ~ United~ NA 2014 TV-14
## 6 8.00e7 Movie Roll~ Mitch D~ <NA> United~ NA 2015 TV-MA
## 7 8.01e7 Movie The ~ Austin ~ Nico~ United~ NA 2015 R
## 8 8.00e7 Movie 6 Ye~ Hannah ~ Tais~ United~ NA 2015 NR
## 9 8.02e7 Movie City~ Madelei~ <NA> United~ NA 2018 TV-MA
## 10 8.10e7 Movie Next~ Kevin R~ John~ China,~ NA 2018 TV-PG
## # ... with 3,172 more rows, and 5 more variables: duration <chr>,
## # listed_in <chr>, description <chr>, category <chr>, country1 <chr>
# Titles per focus country with a tooltip label, enriched with the columns of
# the `lat` lookup table (matched on `country1`).
netflix_r22 <- netflix_r2 %>%
  count(country1, name = "total") %>%
  mutate(text = glue("Number of Movie/TV = {total}")) %>%
  left_join(lat, by = "country1")
TV-MA: This program is specifically designed to be viewed by adults and therefore may be unsuitable for children under 17. TV-14: This program contains some material that many parents would find unsuitable for children under 14 years of age. TV-PG: This program contains material that parents may find unsuitable for younger children. R: Under 17 requires accompanying parent or adult guardian; parents are urged to learn more about the film before taking their young children with them. PG-13: Some material may be inappropriate for children under 13. Parents are urged to be cautious. Some material may be inappropriate for pre-teenagers. NR or UR: The film has not been submitted for a rating, or is an uncut version of a film that was submitted. PG: Some material may not be suitable for children; may contain some material parents might not like for their young children. TV-Y7: This program is designed for children age 7 and above. TV-G: This program is suitable for all ages. TV-Y: Programs rated TV-Y are designed to be appropriate for children of all ages. The thematic elements portrayed in programs with this rating are specifically designed for a very young audience, including children ages 2-6. TV-Y7-FV: Recommended for ages 7 and older, with the unique advisory that the program contains fantasy violence. G: All ages admitted. Nothing that would offend parents for viewing by children. NC-17: No one 17 and under admitted. Clearly adult. Children are not admitted. Here we discover that UR and NR are the same rating (Unrated / Not Rated). Uncut/extended versions of films that are labeled “Unrated” also contain warnings saying that the uncut version of the film contains content that differs from the theatrical release and might not be suitable for minors, so we have to fix this.
# Number of titles for every rating/type pair, plus the tooltip label.
# Rows without a rating and the two "unrated" codes (UR, NR) are left out.
netflix_r3 <- netflix_r1 %>%
  group_by(rating, type) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  mutate(text = glue("Number of Movie/TV = {total}")) %>%
  drop_na(rating) %>%
  # `rating != "UR" | rating != "NR"` is TRUE for every value (nothing equals
  # both codes at once), so it filtered nothing; exclude the set instead.
  filter(!rating %in% c("UR", "NR"))
# Horizontal dodged bar chart: title counts per rating, split by type.
# The aesthetic must be spelled `text` (not `tex`) for the tooltip below to
# find it. The x/y arguments of labs() name the aesthetics (x = rating,
# y = total), which coord_flip() then draws on the vertical and horizontal
# axes respectively.
plot3 <- ggplot(
  data = netflix_r3,
  aes(x = reorder(rating, total), y = total, text = text)
) +
  geom_col(aes(fill = type), position = "dodge") +
  coord_flip() +
  labs(
    title = "Number of Movie/TV Shows Released on Netflix based on TV rating",
    x = "TV Rating",
    y = "Total Movie/TV Shows",
    caption = "Made by Meinari Claudia"
  )
# Interactive version; the tooltip shows only the prepared `text` aesthetic.
ggplotly(plot3, tooltip = "text")