Data Visualization using Netflix dataset

Reynaldi

23/05/2022

About dataset

Overview

Netflix is a streaming service that offers a wide variety of award-winning TV shows, movies, anime, and documentaries. The company’s primary business is a subscription-based streaming service offering online streaming from a library of films and television series, including those produced in-house. This dataset contains detailed information on the titles available on Netflix; you can find it here - https://www.kaggle.com/shivamb/netflix-shows

# Load the Netflix titles catalogue from the local CSV file
netflix <- read.csv(file = "data/netflix_titles.csv")

Metadata

  • show_id - Unique ID for every Movie / Tv Show
  • type - Identifier - A Movie or TV Show
  • title - Title of the Movie / Tv Show
  • director - Director of the Movie
  • cast - Actors involved in the movie / show
  • country - Country where the movie / show was produced
  • date_added - Date it was added on Netflix
  • release_year - Actual release year of the movie / show
  • rating - TV Rating of the movie / show
  • duration - Total Duration - in minutes or number of seasons
  • listed_in - Genre
  • description - The summary description
# Compact overview of the data: row/column counts, then each column's type
# and first few values
netflix %>% glimpse()
## Rows: 8,807
## Columns: 12
## $ show_id      <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1~
## $ type         <chr> "Movie", "TV Show", "TV Show", "TV Show", "TV Show", "TV ~
## $ title        <chr> "Dick Johnson Is Dead", "Blood & Water", "Ganglands", "Ja~
## $ director     <chr> "Kirsten Johnson", "", "Julien Leclercq", "", "", "Mike F~
## $ cast         <chr> "", "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Mola~
## $ country      <chr> "United States", "South Africa", "", "", "India", "", "",~
## $ date_added   <chr> "September 25, 2021", "September 24, 2021", "September 24~
## $ release_year <int> 2020, 2021, 2021, 2021, 2021, 2021, 2021, 1993, 2021, 202~
## $ rating       <chr> "PG-13", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "PG~
## $ duration     <chr> "90 min", "2 Seasons", "1 Season", "1 Season", "2 Seasons~
## $ listed_in    <chr> "Documentaries", "International TV Shows, TV Dramas, TV M~
## $ description  <chr> "As her father nears the end of his life, filmmaker Kirst~

Let’s clean up guys

Let’s clean guys

# Count NA entries per column.
# Note: only true NA values are counted here; blank fields that were read in
# as empty strings ("") are not NA and therefore do not show up in this count.
netflix %>%
  is.na() %>%
  colSums()
##      show_id         type        title     director         cast      country 
##            0            0            0            0            0            0 
##   date_added release_year       rating     duration    listed_in  description 
##            0            0            0            0            0            0
# Impute missing values with mode
#
# getmode(v): return the most frequent value of vector `v`.
# When several values share the highest count, the one that appears first in
# `v` is returned. NA is counted like any other value.
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element of `v` within the distinct values, then the
  # number of times each position occurs
  occurrences <- tabulate(match(v, distinct_values))
  most_common <- which.max(occurrences)
  distinct_values[most_common]
}

# Fill missing country, date_added and rating values with the column's mode.
# read.csv() keeps blank fields of character columns as "" (by default only
# the literal text NA becomes NA), so an is.na() test alone never matches
# them. A value is therefore treated as missing when it is NA or "".
# The mode is computed from the observed (non-missing) values only, so the
# replacement can never itself be a missing value marker.
for (col in c("country", "date_added", "rating")) {
  is_missing <- is.na(netflix[[col]]) | netflix[[col]] == ""
  netflix[[col]][is_missing] <- getmode(netflix[[col]][!is_missing])
}

# A little data wrangling: store the categorical text columns as factors
netflix <- netflix %>%
  mutate(
    listed_in = as.factor(listed_in),
    type = as.factor(type)
  )

# Drop rows whose duration is missing.
# Blank fields are read in as "" rather than NA, so a pure NA-based drop
# would keep them; both NA and "" are removed here.
netflix <- netflix %>%
  filter(!is.na(duration), duration != "")

# Remove duplicated rows, using title, country, type and release_year as the
# identifying key; all other columns of the retained row are kept
netflix <- netflix %>%
  distinct(title, country, type, release_year, .keep_all = TRUE)

Data Visualization

1. Movies or TV Shows ?

# Number of titles per type, plus each type's share of the total formatted
# as a percentage label (two decimals, e.g. "69.62%")
type_count <- netflix %>%
  count(type) %>%
  mutate(percen = paste0(round(n / sum(n) * 100, 2), "%"))
type_count
##      type    n percen
## 1   Movie 6131 69.62%
## 2 TV Show 2676 30.38%
# Pie chart of the Movie / TV Show split.
# Slice sizes are driven by the numeric count `n`. `percen` is a character
# label (e.g. "69.62%"), so it is only used as the text drawn on each slice;
# mapping it to `y` would put a discrete scale on the angle axis and the
# slices would no longer be proportional to the counts.
type_count %>%
  ggplot(aes(x = "", y = n, fill = type)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y", start = 0) +
  scale_fill_brewer(palette = "Spectral") +
  # Centre each label inside its own slice
  geom_text(aes(label = percen),
            position = position_stack(vjust = 0.5)) +
  labs(title = 'Number of TV Shows and Movies',
       subtitle = "Comparison  of movies and tv show on Netflix",
       fill = "Movies/Tv Show ?") +
  theme_void()

There are far more movies than TV shows on Netflix.

Duration

# Box plot of movie run times (in minutes) for India and the United States,
# restricted to movies released from 2000 to 2020 inclusive.
# release_year is an integer column, so it is compared with numeric years.
# Comparing it with date-like strings such as "2000-01-01" coerces the year
# to text and compares lexicographically, which silently drops year 2000.
netflix %>%
  filter(
    country %in% c("India", "United States"),
    type == "Movie",
    release_year >= 2000,
    release_year <= 2020
  ) %>%
  # Movie durations look like "90 min": strip the unit and keep the number.
  # A blank duration becomes NA and is left out of the box plot.
  mutate(movie_duration = as.integer(sub(" min$", "", duration))) %>%
  ggplot(aes(x = country, y = movie_duration, fill = country)) +
  geom_boxplot() +
  labs(x = "Country",
       y = "Movie Duration",
       fill = "Best two country",
       title = "Movie Duration") +
  theme_minimal()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).

Of the two countries that contributed the most films, films made in India tend to have a longer duration than those made in the United States. This may be because many Indian films are filled with songs and dances.

Best words to make a title

# Join every title into one long string. paste()'s `sep` only acts between
# separate arguments, so `collapse` is what joins the elements of one vector.
title_vector <- paste(netflix$title, collapse = " ")
# Break the string into single words
title_words <- tokenize_words(title_vector)
# Frequency of each distinct word, stored as a words / countt data frame
title_table <- table(unlist(title_words))
frequ <- data.frame(
  words = names(title_table),
  countt = as.integer(title_table)
)

# Keep only words longer than three characters, most frequent first
wc <- frequ %>%
  filter(nchar(as.character(words)) > 3) %>%
  arrange(desc(countt))

# Word cloud of up to 200 title words, with the most frequent words
# placed first (random.order = FALSE)
cloud <- wordcloud(
  words = wc$words,
  freq = wc$countt,
  rot.per = 0.40,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  colors = brewer.pal(8, "Set1")
)

There are so many titles that contain the words love, movies, stories, life, and christmas