The Data

Dataset Selection

I decided to change my dataset because I wasn’t fully satisfied with the original one.

Movies and series, on the other hand, are something I am extremely passionate about.

In this new project, I focus on streaming platforms: I analyze which ones offer the best movies, series, and more, to help you choose the platform that best matches your taste.

I will combine four datasets collected from Kaggle (Netflix, Hulu, Amazon, and Disney+) by stacking their rows into a single table.

Data Joining

library(tidyverse)          # Load (attach) the tidyverse; this does not install it. tidyr::separate_rows() is used below.
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   1.0.0 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
# ---- Load one title table per streaming platform ----------------------------
amazon <- read.csv("data/amazon.csv")
disney <- read.csv("data/disney.csv")
hulu <- read.csv("data/hulu.csv")
netflix <- read.csv("data/netflix.csv")

# Tag every row with its source so it stays attributable after stacking.
amazon$platform <- "amazon"
disney$platform <- "disney"
hulu$platform <- "hulu"
netflix$platform <- "netflix"

# ---- Combined table: one row per title per platform -------------------------
# Stack the four tables once and convert the categorical columns once; the
# genre and country tables below start from copies of this result.
df <- rbind(amazon, disney, hulu, netflix)
df$platform <- factor(df$platform)
df$type <- factor(df$type)

# ---- df_g: one row per (title, genre) ---------------------------------------
# `genres` holds a bracketed, quoted, comma-separated list in a single string.
# Strip the brackets, split on ", " into one row per entry, then drop the
# quotes around each entry.
df_g <- df
df_g$genres <- gsub("\\[", "", df_g$genres)
df_g$genres <- gsub("\\]", "", df_g$genres)
df_g <- separate_rows(df_g, genres, sep = ", ")
df_g$genres <- gsub("'", "", df_g$genres)
df_g$genres <- factor(df_g$genres)

# ---- df_c: one row per (title, production country) --------------------------
# `production_countries` uses the same list-in-a-string layout, e.g. "['US']".
# NOTE(review): these patterns only remove "['" and "']", so a value with no
# quotes inside the brackets (such as an empty list "[]") is kept unchanged,
# whereas the genre cleaning above reduces it to "". Left as-is so the saved
# values do not change; confirm which behaviour downstream code expects.
df_c <- df
df_c$production_countries <- gsub("\\['", "", df_c$production_countries)
df_c$production_countries <- gsub("'\\]", "", df_c$production_countries)
df_c <- separate_rows(df_c, production_countries, sep = ", ")
df_c$production_countries <- gsub("'", "", df_c$production_countries)
df_c$production_countries <- factor(df_c$production_countries)

# ---- Persist the three tables -----------------------------------------------
# Each file stores a single object under its original name (df, df_g, df_c),
# which is the name load() will restore it as.
save(df, file = "df.RData")
save(df_g, file = "df_g.RData")
save(df_c, file = "df_c.RData")
# Sanity-check the table sizes (rows, columns).
# df: the four platform tables stacked row-wise.
df |> dim()
## [1] 19654    16
# df_c: df expanded to one row per entry in production_countries.
df_c |> dim()
## [1] 22380    16
# df_g: df expanded to one row per entry in genres.
df_g |> dim()
## [1] 48377    16
# Column types of the genre table; genres, type and platform are factors.
str(df_g)
## tibble [48,377 × 16] (S3: tbl_df/tbl/data.frame)
##  $ id                  : chr [1:48377] "ts20945" "ts20945" "ts20945" "ts20945" ...
##  $ title               : chr [1:48377] "The Three Stooges" "The Three Stooges" "The Three Stooges" "The Three Stooges" ...
##  $ type                : Factor w/ 2 levels "MOVIE","SHOW": 2 2 2 2 2 2 1 1 1 1 ...
##  $ description         : chr [1:48377] "The Three Stooges were an American vaudeville and comedy team active from 1922 until 1970, best known for their"| __truncated__ "The Three Stooges were an American vaudeville and comedy team active from 1922 until 1970, best known for their"| __truncated__ "The Three Stooges were an American vaudeville and comedy team active from 1922 until 1970, best known for their"| __truncated__ "The Three Stooges were an American vaudeville and comedy team active from 1922 until 1970, best known for their"| __truncated__ ...
##  $ release_year        : int [1:48377] 1934 1934 1934 1934 1934 1934 1926 1926 1926 1926 ...
##  $ age_certification   : chr [1:48377] "TV-PG" "TV-PG" "TV-PG" "TV-PG" ...
##  $ runtime             : int [1:48377] 19 19 19 19 19 19 78 78 78 78 ...
##  $ genres              : Factor w/ 20 levels "","action","animation",..: 4 9 3 2 10 12 2 7 19 20 ...
##  $ production_countries: chr [1:48377] "['US']" "['US']" "['US']" "['US']" ...
##  $ seasons             : num [1:48377] 26 26 26 26 26 26 NA NA NA NA ...
##  $ imdb_id             : chr [1:48377] "tt0850645" "tt0850645" "tt0850645" "tt0850645" ...
##  $ imdb_score          : num [1:48377] 8.6 8.6 8.6 8.6 8.6 8.6 8.2 8.2 8.2 8.2 ...
##  $ imdb_votes          : num [1:48377] 1092 1092 1092 1092 1092 ...
##  $ tmdb_popularity     : num [1:48377] 15.4 15.4 15.4 15.4 15.4 ...
##  $ tmdb_score          : num [1:48377] 7.6 7.6 7.6 7.6 7.6 7.6 8 8 8 8 ...
##  $ platform            : Factor w/ 4 levels "amazon","disney",..: 1 1 1 1 1 1 1 1 1 1 ...

For more details about the app and instructions, you can check the Welcome page of the app!