Introduction

Netflix is known for its strong recommendation engines. It uses a mix of content-based and collaborative filtering models to recommend TV shows and movies. In this task, one can create a recommendation engine based on text/description similarity techniques.

# Call library
library(ggplot2)
library(GGally)
library(ggthemes)
library(ggpubr)
library(leaflet)
library(lubridate)
library(readr) 
library(dplyr)
library(tidyr)
library(glue)
library(plotly)
library(tidyverse)
library(flexdashboard)
library(shiny)
library(shinydashboard)

Research Objective

# Import data
# netflix1: the Netflix titles catalogue, read as a tibble by readr.
# lat: lookup table read with base R; it is joined on `country1` further down.
netflix1 <- readr::read_csv(file = "netflix_titles.csv")
lat <- utils::read.csv(file = "countries_lat.csv")
# Print the column types and a preview of each column
utils::str(netflix1)
## Classes 'spec_tbl_df', 'tbl_df', 'tbl' and 'data.frame': 6234 obs. of  12 variables:
##  $ show_id     : num  81145628 80117401 70234439 80058654 80125979 ...
##  $ type        : chr  "Movie" "Movie" "TV Show" "TV Show" ...
##  $ title       : chr  "Norm of the North: King Sized Adventure" "Jandino: Whatever it Takes" "Transformers Prime" "Transformers: Robots in Disguise" ...
##  $ director    : chr  "Richard Finn, Tim Maltby" NA NA NA ...
##  $ cast        : chr  "Alan Marriott, Andrew Toth, Brian Dobson, Cole Howard, Jennifer Cameron, Jonathan Holmes, Lee Tockar, Lisa Duru"| __truncated__ "Jandino Asporaat" "Peter Cullen, Sumalee Montano, Frank Welker, Jeffrey Combs, Kevin Michael Richardson, Tania Gunadi, Josh Keaton"| __truncated__ "Will Friedle, Darren Criss, Constance Zimmer, Khary Payton, Mitchell Whitfield, Stuart Allan, Ted McGinley, Peter Cullen" ...
##  $ country     : chr  "United States, India, South Korea, China" "United Kingdom" "United States" "United States" ...
##  $ date_added  : chr  "9-Sep-19" "9-Sep-16" "8-Sep-18" "8-Sep-18" ...
##  $ release_year: num  2019 2016 2013 2016 2017 ...
##  $ rating      : chr  "TV-PG" "TV-MA" "TV-Y7-FV" "TV-Y7" ...
##  $ duration    : chr  "90 min" "94 min" "1 Season" "1 Season" ...
##  $ listed_in   : chr  "Children & Family Movies, Comedies" "Stand-Up Comedy" "Kids' TV" "Kids' TV" ...
##  $ description : chr  "Before planning an awesome wedding for his grandfather, a polar bear king must take back a stolen artifact from"| __truncated__ "Jandino Asporaat riffs on the challenges of raising kids and serenades the audience with a rousing rendition of"| __truncated__ "With the help of three human allies, the Autobots once again protect Earth from the onslaught of the Decepticon"| __truncated__ "When a prison ship crash unleashes hundreds of Decepticons on Earth, Bumblebee leads a new Autobot force to protect humankind." ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   show_id = col_double(),
##   ..   type = col_character(),
##   ..   title = col_character(),
##   ..   director = col_character(),
##   ..   cast = col_character(),
##   ..   country = col_character(),
##   ..   date_added = col_character(),
##   ..   release_year = col_double(),
##   ..   rating = col_character(),
##   ..   duration = col_character(),
##   ..   listed_in = col_character(),
##   ..   description = col_character()
##   .. )
# Number of missing values in each column
netflix1 %>% is.na() %>% colSums()
##      show_id         type        title     director         cast      country 
##            0            0            0         1969          570          476 
##   date_added release_year       rating     duration    listed_in  description 
##           11            0           10            0            0            0
# Per-column summary (type/length for character columns, quantiles for numeric)
netflix1 %>% summary()
##     show_id             type              title             director        
##  Min.   :  247747   Length:6234        Length:6234        Length:6234       
##  1st Qu.:80035802   Class :character   Class :character   Class :character  
##  Median :80163367   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :76703679                                                           
##  3rd Qu.:80244889                                                           
##  Max.   :81235729                                                           
##      cast             country           date_added         release_year 
##  Length:6234        Length:6234        Length:6234        Min.   :1925  
##  Class :character   Class :character   Class :character   1st Qu.:2013  
##  Mode  :character   Mode  :character   Mode  :character   Median :2016  
##                                                           Mean   :2013  
##                                                           3rd Qu.:2018  
##                                                           Max.   :2020  
##     rating            duration          listed_in         description       
##  Length:6234        Length:6234        Length:6234        Length:6234       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
## 

Data Cleaning

# 1. Parse date_added into Date values.
# The raw strings are day-month-year (e.g. "9-Sep-19"), so dmy() is the
# matching parser; mdy() cannot read this layout and returns NA for every row.
netflix1$date_added <- dmy(netflix1$date_added)

# 2. Preview the data without the show_id column.
# NOTE(review): the result is only printed, not assigned, so show_id is still
# present in netflix1 afterwards -- assign the result back if the column
# should really be dropped.
netflix1 %>%
  dplyr::select(-show_id)
## # A tibble: 6,234 x 11
##    type  title director cast  country date_added release_year rating duration
##    <chr> <chr> <chr>    <chr> <chr>   <date>            <dbl> <chr>  <chr>   
##  1 Movie Norm~ Richard~ Alan~ United~ NA                 2019 TV-PG  90 min  
##  2 Movie Jand~ <NA>     Jand~ United~ NA                 2016 TV-MA  94 min  
##  3 TV S~ Tran~ <NA>     Pete~ United~ NA                 2013 TV-Y7~ 1 Season
##  4 TV S~ Tran~ <NA>     Will~ United~ NA                 2016 TV-Y7  1 Season
##  5 Movie #rea~ Fernand~ Nest~ United~ NA                 2017 TV-14  99 min  
##  6 TV S~ Apac~ <NA>     Albe~ Spain   NA                 2016 TV-MA  1 Season
##  7 Movie Auto~ Gabe Ib~ Anto~ Bulgar~ NA                 2014 R      110 min 
##  8 Movie Fabr~ Rodrigo~ Fabr~ Chile   NA                 2017 TV-MA  60 min  
##  9 TV S~ Fire~ <NA>     <NA>  United~ NA                 2017 TV-MA  1 Season
## 10 Movie Good~ Henrik ~ Jame~ United~ NA                 2014 R      90 min  
## # ... with 6,224 more rows, and 2 more variables: listed_in <chr>,
## #   description <chr>
# 3. Drop duplicate shows (same title, country, type and release year),
#    keeping the first occurrence and all of its columns.
# 4. Drop the rows whose rating is missing.
# Both steps are chained on purpose: restarting step 4 from netflix1 would
# throw away the de-duplication done in step 3.
netflix <- netflix1 %>%
  distinct(title, country, type, release_year, .keep_all = TRUE) %>%
  drop_na(rating)
netflix
## # A tibble: 6,224 x 12
##    show_id type  title director cast  country date_added release_year rating
##      <dbl> <chr> <chr> <chr>    <chr> <chr>   <date>            <dbl> <chr> 
##  1  8.11e7 Movie Norm~ Richard~ Alan~ United~ NA                 2019 TV-PG 
##  2  8.01e7 Movie Jand~ <NA>     Jand~ United~ NA                 2016 TV-MA 
##  3  7.02e7 TV S~ Tran~ <NA>     Pete~ United~ NA                 2013 TV-Y7~
##  4  8.01e7 TV S~ Tran~ <NA>     Will~ United~ NA                 2016 TV-Y7 
##  5  8.01e7 Movie #rea~ Fernand~ Nest~ United~ NA                 2017 TV-14 
##  6  8.02e7 TV S~ Apac~ <NA>     Albe~ Spain   NA                 2016 TV-MA 
##  7  7.03e7 Movie Auto~ Gabe Ib~ Anto~ Bulgar~ NA                 2014 R     
##  8  8.02e7 Movie Fabr~ Rodrigo~ Fabr~ Chile   NA                 2017 TV-MA 
##  9  8.01e7 TV S~ Fire~ <NA>     <NA>  United~ NA                 2017 TV-MA 
## 10  7.03e7 Movie Good~ Henrik ~ Jame~ United~ NA                 2014 R     
## # ... with 6,214 more rows, and 3 more variables: duration <chr>,
## #   listed_in <chr>, description <chr>

Research Question Number One: Which category is shown the most on Netflix?

# data processing
# Assign each title to the first genre keyword found in listed_in, checked in
# this order: Action, Drama, Anime, Documentaries, Comedies. Titles matching
# none of the keywords get NA and are removed.
netflix_r1 <- netflix %>%
  mutate(
    category = case_when(
      str_detect(listed_in, "Action") ~ "Action",
      str_detect(listed_in, "Drama") ~ "Drama",
      str_detect(listed_in, "Anime") ~ "Anime",
      str_detect(listed_in, "Documentaries") ~ "Documentaries",
      str_detect(listed_in, "Comedies") ~ "Comedies"
    )
  ) %>%
  dplyr::filter(!is.na(category))
# Count titles per genre and type, and build the tooltip text shown by plotly
netflix_r12 <- netflix_r1 %>%
  group_by(category, type) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  mutate(text = glue("Number of Movie/TV = {total}"))

# Horizontal dodged bars: one bar per type within each genre, genres ordered
# by their count.
# labs() names the aesthetics as mapped, before coord_flip() swaps the drawn
# axes, so `x` must describe the category and `y` the count.
plot1 <- ggplot(
  data = netflix_r12,
  aes(x = reorder(category, total), y = total, text = text)
) +
  geom_col(aes(fill = type), position = "dodge") +
  coord_flip() +
  labs(
    title = "Number of Movie/TV Shows Released on Netflix",
    x = "Movie/ TV Shows Category",
    y = "Total Movie/TV Shows",
    caption = "Made by Meinari Claudia"
  )

ggplotly(plot1, tooltip = "text")

Research Question Number Two: Which country released the most movies and TV shows?

There is a huge number of countries in the country column, so we will focus on the most productive countries: United States, India, United Kingdom, Japan, Canada and Spain.

# Tag each title with the first of the six focus countries found in its
# country string, checked in this order: US, India, UK, Japan, Canada, Spain.
# Titles that mention none of them (or have no country) are removed.
netflix_r2 <- netflix_r1 %>%
  mutate(
    country1 = case_when(
      str_detect(country, "United States") ~ "US",
      str_detect(country, "India") ~ "India",
      str_detect(country, "United Kingdom") ~ "UK",
      str_detect(country, "Japan") ~ "Japan",
      str_detect(country, "Canada") ~ "Canada",
      str_detect(country, "Spain") ~ "Spain"
    )
  ) %>%
  dplyr::filter(!is.na(country1))

netflix_r2
## # A tibble: 3,182 x 14
##    show_id type  title director cast  country date_added release_year rating
##      <dbl> <chr> <chr> <chr>    <chr> <chr>   <date>            <dbl> <chr> 
##  1  8.11e7 Movie Norm~ Richard~ Alan~ United~ NA                 2019 TV-PG 
##  2  8.01e7 Movie #rea~ Fernand~ Nest~ United~ NA                 2017 TV-14 
##  3  7.03e7 Movie Good~ Henrik ~ Jame~ United~ NA                 2014 R     
##  4  7.03e7 Movie Kidn~ Daniel ~ Jim ~ Nether~ NA                 2015 R     
##  5  8.01e7 Movie Manh~ Tom O'B~ Tom ~ United~ NA                 2014 TV-14 
##  6  8.00e7 Movie Roll~ Mitch D~ <NA>  United~ NA                 2015 TV-MA 
##  7  8.01e7 Movie The ~ Austin ~ Nico~ United~ NA                 2015 R     
##  8  8.00e7 Movie 6 Ye~ Hannah ~ Tais~ United~ NA                 2015 NR    
##  9  8.02e7 Movie City~ Madelei~ <NA>  United~ NA                 2018 TV-MA 
## 10  8.10e7 Movie Next~ Kevin R~ John~ China,~ NA                 2018 TV-PG 
## # ... with 3,172 more rows, and 5 more variables: duration <chr>,
## #   listed_in <chr>, description <chr>, category <chr>, country1 <chr>
# Number of titles per focus country, with the tooltip text, joined to the
# `lat` lookup table on the country1 key
netflix_r22 <- netflix_r2 %>%
  dplyr::count(country1) %>%
  dplyr::rename(total = n) %>%
  mutate(text = glue("Number of Movie/TV = {total}")) %>%
  left_join(lat, by = "country1")

Research Question Number Three: Which TV rating category is shown the most on Netflix?

TV-MA: This program is specifically designed to be viewed by adults and therefore may be unsuitable for children under 17. TV-14: This program contains some material that many parents would find unsuitable for children under 14 years of age. TV-PG: This program contains material that parents may find unsuitable for younger children. R: Under 17 requires accompanying parent or adult guardian; parents are urged to learn more about the film before taking their young children with them. PG-13: Some material may be inappropriate for children under 13. Parents are urged to be cautious. Some material may be inappropriate for pre-teenagers. NR or UR: The film has not been submitted for a rating, or it is an uncut version of a film that was submitted. PG: Some material may not be suitable for children; it may contain some material parents might not like for their young children. TV-Y7: This program is designed for children age 7 and above. TV-G: This program is suitable for all ages. TV-Y: Programs rated TV-Y are designed to be appropriate for children of all ages. The thematic elements portrayed in programs with this rating are specifically designed for a very young audience, including children ages 2-6. TV-Y7-FV: Recommended for ages 7 and older, with the unique advisory that the program contains fantasy violence. G: All ages admitted. Nothing that would offend parents for viewing by children. NC-17: No one 17 and under admitted. Clearly adult. Children are not admitted. Here we discover that UR and NR are the same rating (Unrated, Not Rated). Uncut/extended versions of films that are labeled “Unrated” also contain warnings saying that the uncut version of the film contains content that differs from the theatrical release and might not be suitable for minors, so we have to fix this.

# Count titles per rating and type, leaving out unrated entries.
# "UR" and "NR" both mean the title carries no rating, so both are excluded.
# The exclusion uses %in% because `rating != "UR" | rating != "NR"` is TRUE
# for every value and therefore filters nothing.
netflix_r3 <- netflix_r1 %>%
  drop_na(rating) %>%
  dplyr::filter(!rating %in% c("UR", "NR")) %>%
  group_by(rating, type) %>%
  summarise(total = n()) %>%
  ungroup() %>%
  mutate(text = glue("Number of Movie/TV = {total}"))

# Horizontal dodged bars: one bar per type within each rating.
# The aesthetic must be spelled `text` so that ggplotly(tooltip = "text")
# can pick it up. labs() names the aesthetics as mapped, before coord_flip()
# swaps the drawn axes, so `x` describes the rating and `y` the count.
plot3 <- ggplot(
  data = netflix_r3,
  aes(x = reorder(rating, total), y = total, text = text)
) +
  geom_col(aes(fill = type), position = "dodge") +
  coord_flip() +
  labs(
    title = "Number of Movie/TV Shows Released on Netflix based on TV rating",
    x = "TV Rating",
    y = "Total Movie/TV Shows",
    caption = "Made by Meinari Claudia"
  )

ggplotly(plot3, tooltip = "text")