About dataset
Overview
Netflix is a streaming service that offers a wide variety of award-winning TV shows, movies, anime, and documentaries. The company’s primary business is a subscription-based streaming service offering online streaming from a library of films and television series, including those produced in-house. This data set contains detailed information on the titles available on Netflix and can be found here - https://www.kaggle.com/shivamb/netflix-shows
# Load the Netflix titles catalogue from the project's data directory
netflix <- read.csv(file = "data/netflix_titles.csv")
Metadata
- show_id - Unique ID for every Movie / Tv Show
- type - Identifier - A Movie or TV Show
- title - Title of the Movie / Tv Show
- director - Director of the Movie
- cast - Actors involved in the movie / show
- country - Country where the movie / show was produced
- date_added - Date it was added on Netflix
- release_year - Actual release year of the movie / show
- rating - TV Rating of the movie / show
- duration - Total Duration - in minutes or number of seasons
- listed_in - Genre
- description - The summary description
# Preview each column's type and its first few values
glimpse(netflix)
## Rows: 8,807
## Columns: 12
## $ show_id <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1~
## $ type <chr> "Movie", "TV Show", "TV Show", "TV Show", "TV Show", "TV ~
## $ title <chr> "Dick Johnson Is Dead", "Blood & Water", "Ganglands", "Ja~
## $ director <chr> "Kirsten Johnson", "", "Julien Leclercq", "", "", "Mike F~
## $ cast <chr> "", "Ama Qamata, Khosi Ngema, Gail Mabalane, Thabang Mola~
## $ country <chr> "United States", "South Africa", "", "", "India", "", "",~
## $ date_added <chr> "September 25, 2021", "September 24, 2021", "September 24~
## $ release_year <int> 2020, 2021, 2021, 2021, 2021, 2021, 2021, 1993, 2021, 202~
## $ rating <chr> "PG-13", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "TV-MA", "PG~
## $ duration <chr> "90 min", "2 Seasons", "1 Season", "1 Season", "2 Seasons~
## $ listed_in <chr> "Documentaries", "International TV Shows, TV Dramas, TV M~
## $ description <chr> "As her father nears the end of his life, filmmaker Kirst~
Let’s clean up guys
Let’s clean guys
# Check missing values
# NOTE(review): this counts true NA values only. The glimpse() preview shows
# blank fields stored as "" (e.g. director, cast, country); those are not NA,
# so every count below is 0 even though values are missing.
colSums(is.na(netflix))
## show_id type title director cast country
## 0 0 0 0 0 0
## date_added release_year rating duration listed_in description
## 0 0 0 0 0 0
# Impute missing values with the mode
# read.csv() keeps blank fields as "" rather than NA, so a plain is.na() test
# never fires on this data; blank (or whitespace-only) strings are therefore
# treated as missing as well.
is_missing <- function(v) {
  is.na(v) | !nzchar(trimws(v))
}

# Return the most frequent non-missing value of `v` (the first one seen wins
# on ties). Yields NA when `v` has no usable values.
getmode <- function(v) {
  v <- v[!is_missing(v)]
  uniqv <- unique(v)
  uniqv[which.max(tabulate(match(v, uniqv)))]
}

# Replace the missing entries of `v` with its mode; other entries are kept.
impute_mode <- function(v) {
  v[is_missing(v)] <- getmode(v)
  v
}

netflix$country <- impute_mode(netflix$country)
netflix$date_added <- impute_mode(netflix$date_added)
netflix$rating <- impute_mode(netflix$rating)
# Light data wrangling, done as one pipeline:
#   1. store the genre list and the title type as factors;
#   2. drop rows whose duration is NA;
#   3. keep the first row for each title / country / type / release_year
#      combination (all other columns are retained).
# NOTE(review): drop_na() removes true NA values only; blank "" durations,
# if present, stay in the data -- confirm that is intended.
netflix <- netflix %>%
  mutate(listed_in = as.factor(listed_in),
         type = as.factor(type)) %>%
  drop_na(duration) %>%
  distinct(title, country, type, release_year, .keep_all = TRUE)
Data Visualization
1. Movies or TV Shows ?
# Tally titles per type and attach each type's share of the catalogue as a
# percentage label rounded to two decimals
type_count <- netflix %>%
  count(type) %>%
  mutate(percen = paste0(round((n / sum(n)) * 100, 2), "%"))
type_count
## type n percen
## 1 Movie 6131 69.62%
## 2 TV Show 2676 30.38%
# Pie chart of the Movie / TV Show split.
# Slice size comes from the numeric count `n`. `percen` is a character label
# such as "69.62%"; mapping it to y would put the bars on a discrete scale, so
# the slices would not be proportional to the counts. `percen` is used only
# as the text drawn in the middle of each stacked slice.
type_count %>%
  ggplot(aes(x = "", y = n, fill = type)) +
  geom_bar(stat = "identity") +
  coord_polar("y", start = 0) +
  scale_fill_brewer(palette = "Spectral") +
  geom_text(aes(label = percen),
            position = position_stack(vjust = 0.5)) +
  labs(title = "Number of TV Shows and Movies",
       subtitle = "Comparison of movies and tv show on Netflix",
       fill = "Movies/Tv Show ?") +
  theme_void()
There are far more movies than TV shows on Netflix.
Duration
# Box plot of movie running time for India vs. the United States, restricted
# to movies released from 2000 through 2020.
# release_year is an integer column, so it is compared with numbers. Comparing
# it with date-like strings ("2000-01-01") coerces the year to character and
# compares lexicographically, which e.g. drops the year 2000 ("2000" sorts
# before "2000-01-01").
netflix %>%
  filter(country %in% c("India", "United States"),
         type == "Movie",
         release_year >= 2000,
         release_year <= 2020) %>%
  # Movie durations look like "90 min"; strip the unit and parse the number.
  # Values that are not a number after stripping (e.g. blank) become NA.
  mutate(movie_duration = as.integer(sub(" min$", "", duration))) %>%
  ggplot(aes(x = country, y = movie_duration, fill = country)) +
  geom_boxplot() +
  labs(x = "Country",
       y = "Movie Duration",
       fill = "Best two country",
       title = "Movie Duration") +
  theme_minimal()
## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
Of the two countries that contributed the most films, films made in India tend to run longer than those made in the United States. This may be because many Indian films include song and dance numbers.
Best words to make a title
# Join all titles into one string. paste()'s `sep` only joins separate
# arguments; `collapse` is what joins the elements of a single vector.
# The column is selected by name rather than by position so the code keeps
# working if the column order changes.
title_vector <- paste(netflix$title, collapse = " ")
title_words <- tokenize_words(title_vector) # break into single words

# Count how often each word occurs
title_table <- table(unlist(title_words))
frequ <- data.frame(words = names(title_table),
                    countt = as.integer(title_table))

# Keep words longer than three characters, most frequent first
wc <- frequ %>%
  filter(nchar(as.character(words)) > 3) %>%
  arrange(desc(countt))
# Draw a word cloud of the title words from `wc`, capped at 200 words.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved word.
cloud <- wordcloud(words = wc$words,
                   freq = wc$countt,
                   rot.per = 0.40,
                   min.freq = 1,
                   max.words = 200,
                   random.order = FALSE,
                   colors = brewer.pal(8, "Set1"))
Many titles contain the words love, movies, stories, life, and christmas.