Netflix Movies Data

Loading the libraries

# Install pacman only when it is missing, so that re-running the report does
# not reinstall the package on every execution.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)

# Attach pacman and use p_load() to bring in the packages used below.
library(pacman)
p_load(tidyverse, lubridate, showtext)

# Register the Google font under the family name referenced by the plot
# titles, then switch on showtext rendering.
font_add_google(name = "Bebas Neue", family = "Bebas Neue")
showtext_auto()

Loading the dataset

# Read the CSV (path is relative to the working directory) into `netflix`.
netflix <- read_csv(file = "NetflixOriginals.csv")

## Rows: 584 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Title, Genre, Premiere, Language
## dbl (2): Runtime, IMDB Score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Looking at the data in brief

# Preview the first rows of the data.
netflix %>% head()

## # A tibble: 6 × 6
##   Title           Genre                 Premiere   Runtime `IMDB Score` Language
##   <chr>           <chr>                 <chr>        <dbl>        <dbl> <chr>   
## 1 Enter the Anime Documentary           August 5,…      58          2.5 English…
## 2 Dark Forces     Thriller              August 21…      81          2.6 Spanish 
## 3 The App         Science fiction/Drama December …      79          2.6 Italian 
## 4 The Open House  Horror thriller       January 1…      94          3.2 English 
## 5 Kaali Khuhi     Mystery               October 3…      90          3.4 Hindi   
## 6 Drive           Action                November …     147          3.5 Hindi

Data type of each column

# Collect the class of every column and show the result as a tibble.
netflix %>%
  sapply(class) %>%
  as_tibble()

## # A tibble: 6 × 1
##   value    
##   <chr>    
## 1 character
## 2 character
## 3 character
## 4 numeric  
## 5 numeric  
## 6 character

Does the data contain any N/A values?

# TRUE when at least one cell of the data frame is missing.
anyNA(netflix)

## [1] FALSE

Converting the “Premiere” column from character to date

# Parse the month-day-year text in Premiere into a new Released column.
netflix <- mutate(netflix, Released = mdy(Premiere))

Splitting each release date into year, month, and day of the month, and adding the corresponding day of the week for each release.

# Derive the calendar components of the release date in one mutate() call:
# Year and Date (day of the month) are numeric, Month and Day are labelled.
netflix <- netflix %>%
  mutate(
    Year = year(Released),
    Month = month(Released, label = TRUE),
    Date = day(Released),
    Day = wday(Released, label = TRUE, abbr = FALSE)
  )

When were the movies released?

Number of Movies released each year

# Count the movies released in each year.
n <- netflix %>% count(Year, name = "total")

# Bar chart of the yearly counts; the year(s) with the highest count are
# filled with #E50914 and all other years with #2d2d2d.
n_graph <- ggplot(data = n) +
  geom_col(
    mapping = aes(
      x = Year,
      y = total,
      fill = ifelse(total == max(total), "red", "grey")
    )
  ) +
  labs(title = "Netflix Movies released each year") +
  theme_minimal() +
  # Name the palette entries so each label always gets its intended colour,
  # instead of depending on the alphabetical order of the labels present
  # (an unnamed palette would paint a lone "red" level dark grey).
  scale_fill_manual(values = c(grey = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

n_graph

Number of Movies released each month

# Count the movies released in each calendar month.
n1 <- netflix %>% count(Month, name = "total")

# Bar chart of the monthly counts; the month(s) with the highest count are
# filled with #E50914 and all other months with #2d2d2d.
n1_graph <- ggplot(data = n1) +
  geom_col(
    mapping = aes(
      x = Month,
      y = total,
      fill = ifelse(total == max(total), "red", "grey")
    )
  ) +
  labs(title = "Netflix Movies Released Each Month") +
  theme_minimal() +
  # Name the palette entries so each label always gets its intended colour,
  # instead of depending on the alphabetical order of the labels present.
  scale_fill_manual(values = c(grey = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n1_graph

Number of movies released by date of the month

# Count the movies released on each day of the month.
n2 <- netflix %>% count(Date, name = "total")

# Bar chart of the counts per day of the month; the day(s) with the highest
# count are filled with #E50914 and all other days with #2d2d2d.
n2_graph <- ggplot(data = n2) +
  geom_col(
    mapping = aes(
      x = Date,
      y = total,
      fill = ifelse(total == max(total), "red", "grey")
    )
  ) +
  labs(title = "Netflix Movies released by date of each month") +
  theme_minimal() +
  # Name the palette entries so each label always gets its intended colour,
  # instead of depending on the alphabetical order of the labels present.
  scale_fill_manual(values = c(grey = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n2_graph

Number of movies released each day of the week

# Count the movies released on each day of the week.
n3 <- netflix %>% count(Day, name = "total")

# Bar chart of the counts per weekday; the weekday(s) with the highest count
# are filled with #E50914 and all other weekdays with #2d2d2d.
n3_graph <- ggplot(data = n3) +
  geom_col(
    mapping = aes(
      x = Day,
      y = total,
      fill = ifelse(total == max(total), "red", "black")
    )
  ) +
  labs(title = "Netflix Movies released by day of the week") +
  theme_minimal() +
  # Name the palette entries so each label always gets its intended colour,
  # instead of depending on the alphabetical order of the labels present.
  scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n3_graph

5 Most popular Genres

    # Count the movies per genre and keep the five largest counts.
    n4 <- netflix %>%
      count(Genre, name = "Movies") %>%
      arrange(desc(Movies)) %>%
      head(5)

    # Bar chart of those genres, ordered from most to fewest movies; the
    # genre(s) with the highest count are filled with #E50914.
    n4_graph <- ggplot(data = n4) +
      geom_col(
        mapping = aes(
          x = reorder(Genre, -Movies),
          y = Movies,
          fill = ifelse(Movies == max(Movies), "red", "black")
        )
      ) +
      labs(title = "Most Popular Genres") +
      theme_minimal() +
      # Name the palette entries so each label always gets its intended
      # colour, instead of depending on the alphabetical order of the labels
      # present.
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank(),
        panel.grid.minor = element_blank(),
        text = element_text(size = 20)
      )

    n4_graph

5 Most Popular Languages

# Count the movies per language and keep the five largest counts.
n5 <- netflix %>%
  count(Language, name = "Movies") %>%
  arrange(desc(Movies)) %>%
  head(5)

# Bar chart of those languages, ordered from most to fewest movies; the
# language(s) with the highest count are filled with #E50914.
n5_graph <- ggplot(data = n5) +
  geom_col(
    mapping = aes(
      x = reorder(Language, -Movies),
      y = Movies,
      fill = ifelse(Movies == max(Movies), "red", "black")
    )
  ) +
  labs(title = "Most Popular Languages") +
  theme_minimal() +
  # Name the palette entries so each label always gets its intended colour,
  # instead of depending on the alphabetical order of the labels present.
  scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    # `text`, not `title`: every title element here is either blanked or
    # explicitly sized, so sizing `title` had no visible effect. `text`
    # enlarges the axis labels the same way the genre chart does.
    text = element_text(size = 20)
  )

n5_graph

IMDB Scores - How were most movies rated?

    # Dot plot of the IMDB score distribution, binned along x in steps of 0.3.
    n6_graph <- ggplot(netflix) +
      geom_dotplot(
        mapping = aes(x = `IMDB Score`),
        binwidth = 0.3,
        fill = "#2d2d2d",
        color = "#e9ecef"
      ) +
      # The y axis of an x-binned dot plot does not carry meaningful numbers,
      # so hide it rather than display a misleading scale.
      scale_y_continuous(NULL, breaks = NULL) +
      labs(title = "IMDB Score Distribution") +
      theme_minimal() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n6_graph

Highest Rated Movies

# Keep the five rows with the highest IMDB score.
n7 <- netflix %>%
  arrange(desc(`IMDB Score`)) %>%
  head(5)

# Horizontal bar chart of those titles; the title(s) with the highest score
# are filled with #E50914 and the rest with #2d2d2d.
n7_graph <- ggplot(data = n7) +
  geom_col(
    mapping = aes(
      x = reorder(Title, `IMDB Score`),
      y = `IMDB Score`,
      fill = ifelse(`IMDB Score` == max(`IMDB Score`), "red", "black")
    )
  ) +
  labs(title = "Highest Rated Movies") +
  theme_minimal() +
  # Name the palette entries so each label always gets its intended colour,
  # instead of depending on the alphabetical order of the labels present
  # (an unnamed palette would paint a lone "red" level dark grey if all five
  # scores tie).
  scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
  coord_flip() +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank()
  )

n7_graph

Lowest Rated Movies

    # Keep the five rows with the lowest IMDB score (plain ascending sort
    # instead of the double negation desc(-x)).
    n8 <- netflix %>%
      arrange(`IMDB Score`) %>%
      head(5)

    # Horizontal bar chart of those titles; the title(s) with the lowest
    # score are filled with #E50914 and the rest with #2d2d2d.
    n8_graph <- ggplot(data = n8) +
      geom_col(
        mapping = aes(
          x = reorder(Title, -`IMDB Score`),
          y = `IMDB Score`,
          fill = ifelse(`IMDB Score` == min(`IMDB Score`), "red", "black")
        )
      ) +
      labs(title = "Lowest Rated Movies") +
      theme_minimal() +
      # Name the palette entries so each label always gets its intended
      # colour, instead of depending on the alphabetical order of the labels
      # present.
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n8_graph

Runtime - How long are the movies?

    # Dot plot of the runtime distribution, binned along x in steps of 2.25.
    n9_graph <- ggplot(data = netflix) +
      geom_dotplot(
        mapping = aes(x = Runtime),
        binwidth = 2.25,
        fill = "#2d2d2d",
        color = "#e9ecef"
      ) +
      # The y axis of an x-binned dot plot does not carry meaningful numbers,
      # so hide it rather than display a misleading scale.
      scale_y_continuous(NULL, breaks = NULL) +
      labs(title = "Movie Runtime") +
      theme_minimal() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n9_graph

Longest Movies

    # Keep the five rows with the longest runtime.
    n10 <- netflix %>%
      arrange(desc(Runtime)) %>%
      head(5)

    # Horizontal bar chart of those titles; the title(s) with the longest
    # runtime are filled with #E50914 and the rest with #2d2d2d.
    n10_graph <- ggplot(data = n10) +
      geom_col(
        mapping = aes(
          x = reorder(Title, Runtime),
          y = Runtime,
          fill = ifelse(Runtime == max(Runtime), "red", "black")
        )
      ) +
      labs(title = "Longest Movies") +
      theme_minimal() +
      # Name the palette entries so each label always gets its intended
      # colour, instead of depending on the alphabetical order of the labels
      # present.
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n10_graph

Shortest Movies

    # Keep the five rows with the shortest runtime (plain ascending sort
    # instead of the double negation desc(-x)).
    n11 <- netflix %>%
      arrange(Runtime) %>%
      head(5)

    # Horizontal bar chart of those titles; the title(s) with the shortest
    # runtime are filled with #E50914 and the rest with #2d2d2d.
    n11_graph <- ggplot(data = n11) +
      geom_col(
        mapping = aes(
          # Order by descending runtime so that, after coord_flip(), the
          # shortest (highlighted) title sits at the top, matching the
          # layout of the "Lowest Rated Movies" chart.
          x = reorder(Title, -Runtime),
          y = Runtime,
          fill = ifelse(Runtime == min(Runtime), "red", "black")
        )
      ) +
      labs(title = "Shortest Movies") +
      theme_minimal() +
      # Name the palette entries so each label always gets its intended
      # colour, instead of depending on the alphabetical order of the labels
      # present.
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank(),
        text = element_text(size = 20)
      )

    n11_graph

Runtime vs IMDB-Score

    # Scatter plot of Runtime (y) against IMDB Score (x) with a fitted
    # linear trend line drawn in #E50914.
    # No fill scale is added: nothing in this plot maps the fill aesthetic,
    # so a manual fill palette would be dead code.
    n12_graph <- ggplot(data = netflix, aes(x = `IMDB Score`, y = Runtime)) +
      geom_point() +
      geom_smooth(method = "lm", color = "#E50914") +
      labs(title = "Runtime vs IMDB Rating") +
      theme_minimal() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        # Axis titles are kept (not blanked): on a scatter plot they are the
        # only cue to which variable is on which axis.
        panel.grid.major.x = element_blank()
      )

    n12_graph

## `geom_smooth()` using formula 'y ~ x'

Basic Statistical Analysis

Linear Models

    # Simple linear regression of Runtime on IMDB Score, followed by the
    # coefficient table and fit statistics.
    model <- lm(Runtime ~ `IMDB Score`, data = netflix)

    summary(model)

## 
## Call:
## lm(formula = Runtime ~ `IMDB Score`, data = netflix)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -91.399  -7.439   3.398  14.467 117.195 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   100.849      7.453  13.531   <2e-16 ***
## `IMDB Score`   -1.159      1.174  -0.987    0.324    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.76 on 582 degrees of freedom
## Multiple R-squared:  0.001673,   Adjusted R-squared:  -4.283e-05 
## F-statistic: 0.975 on 1 and 582 DF,  p-value: 0.3238

Correlation Test

# Pearson correlation test between Runtime and IMDB Score.
res <- cor.test(
  x = netflix$Runtime,
  y = netflix$`IMDB Score`,
  method = "pearson"
)

res

## 
##  Pearson's product-moment correlation
## 
## data:  netflix$Runtime and netflix$`IMDB Score`
## t = -0.98744, df = 582, p-value = 0.3238
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.12162699  0.04037194
## sample estimates:
##         cor 
## -0.04089629

P-Value

# p-value of the correlation test.
res[["p.value"]]

## [1] 0.3238393

Correlation Coefficient

# Estimated correlation coefficient from the test.
res[["estimate"]]

##         cor 
## -0.04089629

Netflix Data Analysis

Tarek Atassi

2022-04-26

Netflix Movies Data

Loading the libraries

Loading the dataset

Looking at the data in brief

Data type of each column

Does the data contain any N/A values?

Converting the “Premiere” column from character to date

Splitting each release date into year, month, and day of the month, and adding the corresponding day of the week for each release.

When were the movies released?

Number of Movies released each year

Number of Movies released each month

Number of movies released by date of the month

Number of movies released each day of the week

5 Most popular Genres

5 Most Popular Languages

IMDB Scores - How were most movies rated?

Highest Rated Movies

Lowest Rated Movies

Runtime - How long are the movies?

Longest Movies

Shortest Movies

Runtime vs IMDB-Score

Basic Statistical Analysis

Linear Models

Correlation Test

P-Value

Correlation Coefficient

Thank You