Netflix Data Analysis

Published By: Shivam Singla

Loading the Libraries

# Attach pacman and use it to load the packages this analysis relies on.
library(pacman)
p_load(tidyverse, lubridate, showtext)

# Register the Google font used for the plot titles, then switch showtext on
# so that the font is used when plots are rendered.
font_add_google(name = "Bebas Neue", family = "Bebas Neue")
showtext_auto()

Loading the Dataset

# Read the Netflix Originals CSV into a tibble; readr guesses the column types.
netflix <- read_csv(file = "./netflix-data/NetflixOriginals.csv")
## Rows: 584 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Title, Genre, Premiere, Language
## dbl (2): Runtime, IMDB Score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows to sanity-check the import.
netflix %>% head(n = 6)
## # A tibble: 6 × 6
##   Title           Genre                 Premiere   Runtime `IMDB Score` Language
##   <chr>           <chr>                 <chr>        <dbl>        <dbl> <chr>   
## 1 Enter the Anime Documentary           August 5,…      58          2.5 English…
## 2 Dark Forces     Thriller              August 21…      81          2.6 Spanish 
## 3 The App         Science fiction/Drama December …      79          2.6 Italian 
## 4 The Open House  Horror thriller       January 1…      94          3.2 English 
## 5 Kaali Khuhi     Mystery               October 3…      90          3.4 Hindi   
## 6 Drive           Action                November …     147          3.5 Hindi
# Show the class of every column as a one-column tibble.
netflix %>%
  sapply(class) %>%
  as_tibble()
## # A tibble: 6 × 1
##   value    
##   <chr>    
## 1 character
## 2 character
## 3 character
## 4 numeric  
## 5 numeric  
## 6 character
# Parse the text premiere date, then derive the calendar parts used by the
# charts below. A single mutate() is enough because later columns may refer
# to columns created earlier in the same call.
netflix <- netflix %>%
  mutate(
    Released = mdy(Premiere),
    Year = year(Released),
    Month = month(Released, label = TRUE),
    Date = day(Released),
    Day = wday(Released, label = TRUE, abbr = FALSE)
  )

Data Visualisation

Number of Movies Released Each Year

# Number of titles released per year.
n <- netflix %>% count(Year, name = "total")

# Bar chart of releases per year with the busiest year highlighted in red.
# The fill is a logical flag and the palette is keyed by name, so the colours
# no longer depend on the alphabetical order of dummy labels and cannot swap
# when only one of the two levels is present (e.g. every bar ties for the max).
n_graph <- ggplot(data = n) +
  geom_col(mapping = aes(x = Year, y = total, fill = total == max(total))) +
  labs(title = "Netflix Movies released each year") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

n_graph

Number of Movies Released Each Month

# Number of titles released per calendar month.
n1 <- netflix %>% count(Month, name = "total")

# Bar chart of releases per month with the busiest month highlighted in red.
# A logical fill flag plus a name-keyed palette keeps the highlight colour
# correct even if only one of the two levels occurs in the data.
n1_graph <- ggplot(data = n1) +
  geom_col(mapping = aes(x = Month, y = total, fill = total == max(total))) +
  labs(title = "Netflix Movies Released Each Month") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n1_graph

Number of Movies Released on Each Day of the Month

# Number of titles released on each day of the month (1-31).
n2 <- netflix %>% count(Date, name = "total")

# Bar chart of releases per day of the month; the most common day is red.
# A logical fill flag plus a name-keyed palette keeps the highlight colour
# correct even if only one of the two levels occurs in the data.
n2_graph <- ggplot(data = n2) +
  geom_col(mapping = aes(x = Date, y = total, fill = total == max(total))) +
  labs(title = "Netflix Movies released by date of each month") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n2_graph

Number of Movies Released on Each Day of the Week

# Number of titles released on each weekday.
n3 <- netflix %>% count(Day, name = "total")

# Bar chart of releases per weekday with the busiest weekday highlighted.
# A logical fill flag plus a name-keyed palette keeps the highlight colour
# correct even if only one of the two levels occurs in the data.
n3_graph <- ggplot(data = n3) +
  geom_col(mapping = aes(x = Day, y = total, fill = total == max(total))) +
  labs(title = "Netflix Movies released by day of the week") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n3_graph

IMDB Score Distribution

# Dot plot of IMDB scores across all titles.
# geom_dotplot() stacks dots along y, and the ggplot2 documentation notes that
# the numbers on that axis are not meaningful, so the y axis is hidden rather
# than left showing a misleading scale. No aesthetic produces a legend here,
# so no legend setting is needed.
n6_graph <- ggplot(netflix) +
  geom_dotplot(
    mapping = aes(x = `IMDB Score`),
    binwidth = 0.3,
    fill = "#2d2d2d",
    color = "#e9ecef"
  ) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(title = "IMDB Score Distribution") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank()
  )

n6_graph

Highest Rated Movies

# The five titles with the highest IMDB score.
n7 <- netflix %>%
  arrange(desc(`IMDB Score`)) %>%
  head(5)

# Horizontal bar chart of the top five; the best-rated title is red and,
# because bars are ordered by score before the flip, sits at the top.
# A logical fill flag plus a name-keyed palette keeps the highlight colour
# correct even if all five titles share the same score.
n7_graph <- ggplot(data = n7) +
  geom_col(mapping = aes(
    x = reorder(`Title`, `IMDB Score`),
    y = `IMDB Score`,
    fill = `IMDB Score` == max(`IMDB Score`)
  )) +
  labs(title = "Highest Rated Movies") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  coord_flip() +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank()
  )

n7_graph

Lowest Rated Movies

# The five titles with the lowest IMDB score.
# arrange() sorts ascending by default, which is what desc(-x) amounted to.
n8 <- netflix %>%
  arrange(`IMDB Score`) %>%
  head(5)

# Horizontal bar chart of the bottom five; the worst-rated title is red and,
# because bars are ordered by descending score before the flip, sits at the top.
# A logical fill flag plus a name-keyed palette keeps the highlight colour
# correct even if all five titles share the same score.
n8_graph <- ggplot(data = n8) +
  geom_col(mapping = aes(
    x = reorder(`Title`, -`IMDB Score`),
    y = `IMDB Score`,
    fill = `IMDB Score` == min(`IMDB Score`)
  )) +
  labs(title = "Lowest Rated Movies") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  coord_flip() +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank()
  )

n8_graph

Movie Runtime

# Dot plot of runtimes across all titles.
# geom_dotplot() stacks dots along y, and the ggplot2 documentation notes that
# the numbers on that axis are not meaningful, so the y axis is hidden rather
# than left showing a misleading scale. No aesthetic produces a legend here,
# so no legend setting is needed.
n9_graph <- ggplot(data = netflix) +
  geom_dotplot(
    mapping = aes(x = Runtime),
    binwidth = 2.25,
    fill = "#2d2d2d",
    color = "#e9ecef"
  ) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(title = "Movie Runtime") +
  theme_minimal() +
  theme(
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank()
  )

n9_graph

Longest Movies

# The five titles with the longest runtime.
n10 <- netflix %>%
  arrange(desc(Runtime)) %>%
  head(5)

# Horizontal bar chart of the five longest titles; the longest is red and,
# because bars are ordered by runtime before the flip, sits at the top.
# A logical fill flag plus a name-keyed palette keeps the highlight colour
# correct even if all five titles share the same runtime.
n10_graph <- ggplot(data = n10) +
  geom_col(mapping = aes(
    x = reorder(Title, Runtime),
    y = Runtime,
    fill = Runtime == max(Runtime)
  )) +
  labs(title = "Longest Movies") +
  theme_minimal() +
  scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
  coord_flip() +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank()
  )

n10_graph

Shortest Movies

 # The five titles with the shortest runtime.
 # arrange() sorts ascending by default, which is what desc(-x) amounted to.
 n11 <- netflix %>%
   arrange(Runtime) %>%
   head(5)

 # Horizontal bar chart of the five shortest titles; the shortest is red.
 # Bars are ordered by descending runtime before the flip so the highlighted
 # bar sits at the top, matching the other ranking charts in this report.
 # A logical fill flag plus a name-keyed palette keeps the highlight colour
 # correct even if all five titles share the same runtime.
 n11_graph <- ggplot(data = n11) +
   geom_col(mapping = aes(
     x = reorder(Title, -Runtime),
     y = Runtime,
     fill = Runtime == min(Runtime)
   )) +
   labs(title = "Shortest Movies") +
   theme_minimal() +
   scale_fill_manual(values = c("TRUE" = "#E50914", "FALSE" = "#2d2d2d")) +
   coord_flip() +
   theme(
     legend.position = "none",
     plot.title = element_text(
       family = "Bebas Neue",
       size = 25,
       color = "#E50914"
     ),
     axis.title.x = element_blank(),
     axis.title.y = element_blank(),
     panel.grid.major.x = element_blank(),
     text = element_text(size = 20)
   )

 n11_graph

Runtime vs IMDB Rating

 # Scatter plot of runtime against IMDB score with a fitted linear trend line.
 # No fill aesthetic is mapped, so the manual fill scale and the legend
 # setting had no effect and are dropped. Axis titles are shown because,
 # unlike the bar charts, nothing else tells the reader which variable is on
 # which axis.
 n12_graph <- ggplot(data = netflix, aes(x = `IMDB Score`, y = Runtime)) +
   geom_point() +
   geom_smooth(method = "lm", color = "#E50914") +
   labs(
     title = "Runtime vs IMDB Rating",
     x = "IMDB Score",
     y = "Runtime"
   ) +
   theme_minimal() +
   theme(
     plot.title = element_text(
       family = "Bebas Neue",
       size = 25,
       color = "#E50914"
     ),
     panel.grid.major.x = element_blank()
   )

 n12_graph
## `geom_smooth()` using formula = 'y ~ x'

Thank You