Netflix Movies Data

Loading the libraries

# Install pacman only when it is missing, so re-running the script does not
# reinstall the package (and hit the network) on every run.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'
## (as 'lib' is unspecified)
# Attach pacman, then let p_load() install (if needed) and attach the
# packages used throughout the analysis.
library(pacman)
p_load(
  tidyverse,
  lubridate,
  showtext
)

# Turn on showtext rendering and register the Google font used for the
# plot titles.
showtext_auto()
font_add_google(name = "Bebas Neue", family = "Bebas Neue")

Loading the dataset

# Read the CSV from the working directory into a tibble.
netflix <- read_csv(file = "NetflixOriginals.csv")
## Rows: 584 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Title, Genre, Premiere, Language
## dbl (2): Runtime, IMDB Score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Looking at the data in brief

# Show the first six rows.
head(netflix, n = 6L)
## # A tibble: 6 × 6
##   Title           Genre                 Premiere   Runtime `IMDB Score` Language
##   <chr>           <chr>                 <chr>        <dbl>        <dbl> <chr>   
## 1 Enter the Anime Documentary           August 5,…      58          2.5 English…
## 2 Dark Forces     Thriller              August 21…      81          2.6 Spanish 
## 3 The App         Science fiction/Drama December …      79          2.6 Italian 
## 4 The Open House  Horror thriller       January 1…      94          3.2 English 
## 5 Kaali Khuhi     Mystery               October 3…      90          3.4 Hindi   
## 6 Drive           Action                November …     147          3.5 Hindi

Data type of each column

# Class of every column, one row per column in column order.
# vapply() enforces exactly one class string per column, whereas sapply()
# silently changes its return type when the input is not what was expected.
as_tibble(vapply(netflix, class, character(1)))
## # A tibble: 6 × 1
##   value    
##   <chr>    
## 1 character
## 2 character
## 3 character
## 4 numeric  
## 5 numeric  
## 6 character

Does the data contain any N/A values?

# TRUE if at least one cell in the data frame is missing.
anyNA(netflix)
## [1] FALSE

Converting the “Premiere” column from character to date

# Parse the month-day-year text in Premiere into a new Released column.
netflix <- mutate(netflix, Released = mdy(Premiere))

Separating each release date into year, month, and day of the month, and
adding the corresponding day of the week for each release.

# Split the release date into calendar parts and add the full weekday name.
netflix <- mutate(
  netflix,
  Year = year(Released),
  Month = month(Released, label = TRUE),
  Date = day(Released),
  Day = wday(Released, label = TRUE, abbr = FALSE)
)

When were the movies released?

Number of Movies released each year

# Number of releases per year.
n <- netflix %>% count(Year, name = "total")

# Bar chart of releases per year; the year with the most releases is
# highlighted in red, every other year is dark grey.
n_graph <- ggplot(data = n) +
  geom_col(
    mapping = aes(
      x = Year,
      y = total,
      fill = ifelse(total == max(total), "red", "grey")
    )
  ) +
  labs(title = "Netflix Movies released each year") +
  theme_minimal() +
  # Named values bind each label to its colour explicitly. Unnamed values are
  # assigned by the order of the labels, which would colour the highlighted
  # bar grey if "red" were the only label present.
  scale_fill_manual(values = c(grey = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank()
  )

n_graph

Number of Movies released each month

# Number of releases per calendar month.
n1 <- netflix %>% count(Month, name = "total")

# Bar chart of releases per month; the month with the most releases is
# highlighted in red, every other month is dark grey.
n1_graph <- ggplot(data = n1) +
  geom_col(
    mapping = aes(
      x = Month,
      y = total,
      fill = ifelse(total == max(total), "red", "grey")
    )
  ) +
  labs(title = "Netflix Movies Released Each Month") +
  theme_minimal() +
  # Named values bind each label to its colour explicitly. Unnamed values are
  # assigned by the order of the labels, which would colour the highlighted
  # bar grey if "red" were the only label present.
  scale_fill_manual(values = c(grey = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n1_graph

Number of movies released by date of the month

# Number of releases per day of the month.
n2 <- netflix %>% count(Date, name = "total")

# Bar chart of releases per day of the month; the day with the most
# releases is highlighted in red, every other day is dark grey.
n2_graph <- ggplot(data = n2) +
  geom_col(
    mapping = aes(
      x = Date,
      y = total,
      fill = ifelse(total == max(total), "red", "grey")
    )
  ) +
  labs(title = "Netflix Movies released by date of each month") +
  theme_minimal() +
  # Named values bind each label to its colour explicitly. Unnamed values are
  # assigned by the order of the labels, which would colour the highlighted
  # bar grey if "red" were the only label present.
  scale_fill_manual(values = c(grey = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n2_graph

Number of movies released each day of the week

# Number of releases per weekday.
n3 <- netflix %>% count(Day, name = "total")

# Bar chart of releases per weekday; the weekday with the most releases is
# highlighted in red, every other weekday is dark grey.
n3_graph <- ggplot(data = n3) +
  geom_col(
    mapping = aes(
      x = Day,
      y = total,
      fill = ifelse(total == max(total), "red", "black")
    )
  ) +
  labs(title = "Netflix Movies released by day of the week") +
  theme_minimal() +
  # Named values bind each label to its colour explicitly. Unnamed values are
  # assigned by the order of the labels, which would colour the highlighted
  # bar grey if "red" were the only label present.
  scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
  theme(
    legend.position = "none",
    plot.title = element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    ),
    axis.title.x = element_blank(),
    axis.title.y = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor = element_blank(),
    text = element_text(size = 20)
  )

n3_graph

IMDB Scores - How were most movies rated?

    # Dot plot of the IMDB score distribution, binned in steps of 0.3.
    score_title_style <- element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    )

    n6_graph <- ggplot(netflix) +
      geom_dotplot(
        mapping = aes(x = `IMDB Score`),
        binwidth = 0.3,
        fill = "#2d2d2d",
        color = "#e9ecef"
      ) +
      labs(title = "IMDB Score Distribution") +
      theme_minimal() +
      theme(
        legend.position = "none",
        plot.title = score_title_style,
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n6_graph

Highest Rated Movies

# Five rows with the highest IMDB score.
n7 <- netflix %>% arrange(desc(`IMDB Score`)) %>% head(5)

    # Horizontal bar chart of the five highest-rated titles, best at the top
    # and highlighted in red.
    n7_graph <- ggplot(data = n7) +
      geom_col(
        mapping = aes(
          x = reorder(`Title`, `IMDB Score`),
          y = `IMDB Score`,
          fill = ifelse(`IMDB Score` == max(`IMDB Score`), "red", "black")
        )
      ) +
      labs(title = "Highest Rated Movies") +
      theme_minimal() +
      # Named values bind each label to its colour explicitly. Unnamed values
      # are assigned by the order of the labels, which would colour the
      # highlighted bars grey if "red" were the only label present (e.g. a
      # five-way tie).
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n7_graph

Lowest Rated Movies

    # Five rows with the lowest IMDB score. A plain ascending sort replaces
    # the double negation desc(-x), which produced the same order.
    n8 <- netflix %>% arrange(`IMDB Score`) %>% head(5)

    # Horizontal bar chart of the five lowest-rated titles, worst at the top
    # and highlighted in red.
    n8_graph <- ggplot(data = n8) +
      geom_col(
        mapping = aes(
          x = reorder(`Title`, -`IMDB Score`),
          y = `IMDB Score`,
          fill = ifelse(`IMDB Score` == min(`IMDB Score`), "red", "black")
        )
      ) +
      labs(title = "Lowest Rated Movies") +
      theme_minimal() +
      # Named values bind each label to its colour explicitly. Unnamed values
      # are assigned by the order of the labels, which would colour the
      # highlighted bars grey if "red" were the only label present (e.g. a
      # five-way tie).
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n8_graph

Runtime - How long are the movies?

    # Dot plot of the runtime distribution, binned in steps of 2.25.
    runtime_title_style <- element_text(
      family = "Bebas Neue",
      size = 25,
      color = "#E50914"
    )

    n9_graph <- ggplot(data = netflix) +
      geom_dotplot(
        mapping = aes(x = Runtime),
        binwidth = 2.25,
        fill = "#2d2d2d",
        color = "#e9ecef"
      ) +
      labs(title = "Movie Runtime") +
      theme_minimal() +
      theme(
        legend.position = "none",
        plot.title = runtime_title_style,
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n9_graph

Longest Movies

    # Five rows with the longest runtime.
    n10 <- netflix %>% arrange(desc(Runtime)) %>% head(5)

    # Horizontal bar chart of the five longest titles, longest at the top
    # and highlighted in red.
    n10_graph <- ggplot(data = n10) +
      geom_col(
        mapping = aes(
          x = reorder(`Title`, `Runtime`),
          y = `Runtime`,
          fill = ifelse(Runtime == max(`Runtime`), "red", "black")
        )
      ) +
      labs(title = "Longest Movies") +
      theme_minimal() +
      # Named values bind each label to its colour explicitly. Unnamed values
      # are assigned by the order of the labels, which would colour the
      # highlighted bars grey if "red" were the only label present (e.g. a
      # five-way tie).
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n10_graph

Shortest Movies

    # Five rows with the shortest runtime. A plain ascending sort replaces
    # the double negation desc(-x), which produced the same order.
    n11 <- netflix %>% arrange(Runtime) %>% head(5)

    # Horizontal bar chart of the five shortest titles, highlighted in red.
    n11_graph <- ggplot(data = n11) +
      geom_col(
        mapping = aes(
          # Order by negated runtime so the highlighted (shortest) bar sits
          # at the top after coord_flip(), matching the "Lowest Rated Movies"
          # chart, which orders by negated score.
          x = reorder(`Title`, -`Runtime`),
          y = `Runtime`,
          fill = ifelse(Runtime == min(`Runtime`), "red", "black")
        )
      ) +
      labs(title = "Shortest Movies") +
      theme_minimal() +
      # Named values bind each label to its colour explicitly. Unnamed values
      # are assigned by the order of the labels, which would colour the
      # highlighted bars grey if "red" were the only label present (e.g. a
      # five-way tie).
      scale_fill_manual(values = c(black = "#2d2d2d", red = "#E50914")) +
      coord_flip() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank(),
        text = element_text(size = 20)
      )

    n11_graph

Runtime vs IMDB-Score

    # Scatter plot of runtime (y) against IMDB score (x) with a fitted
    # linear trend line. No fill aesthetic is mapped in this plot, so the
    # former scale_fill_manual() call had no effect and has been removed.
    n12_graph <- ggplot(data = netflix, aes(x = `IMDB Score`, y = Runtime)) +
      geom_point() +
      geom_smooth(method = "lm", color = "#E50914") +
      labs(title = "Runtime vs IMDB Rating") +
      theme_minimal() +
      theme(
        legend.position = "none",
        plot.title = element_text(
          family = "Bebas Neue",
          size = 25,
          color = "#E50914"
        ),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        panel.grid.major.x = element_blank()
      )

    n12_graph
## `geom_smooth()` using formula 'y ~ x'

Basic Statistical Analysis

Linear Models

    # Ordinary least squares fit of runtime on IMDB score, then its summary.
    model <- lm(Runtime ~ `IMDB Score`, data = netflix)

    summary(model)
## 
## Call:
## lm(formula = Runtime ~ `IMDB Score`, data = netflix)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -91.399  -7.439   3.398  14.467 117.195 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   100.849      7.453  13.531   <2e-16 ***
## `IMDB Score`   -1.159      1.174  -0.987    0.324    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 27.76 on 582 degrees of freedom
## Multiple R-squared:  0.001673,   Adjusted R-squared:  -4.283e-05 
## F-statistic: 0.975 on 1 and 582 DF,  p-value: 0.3238

Correlation Test

# Pearson correlation test between runtime and IMDB score.
res <- cor.test(
  x = netflix$Runtime,
  y = netflix$`IMDB Score`,
  method = "pearson"
)

res
## 
##  Pearson's product-moment correlation
## 
## data:  netflix$Runtime and netflix$`IMDB Score`
## t = -0.98744, df = 582, p-value = 0.3238
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.12162699  0.04037194
## sample estimates:
##         cor 
## -0.04089629

P-Value

# p-value of the correlation test.
res[["p.value"]]
## [1] 0.3238393

Correlation Coefficient

# Estimated correlation coefficient.
res[["estimate"]]
##         cor 
## -0.04089629

Thank You