# Load the movies metadata from the working directory.
data <- read.csv(file = "movies_metadata.csv")

# Treat every cell that holds the literal string "[]" as missing.
# NOTE(review): "[]" presumably marks an empty list-valued field - confirm.
data <- replace(data, data == "[]", NA)

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.3
## Warning: package 'ggplot2' was built under R version 4.3.3
## Warning: package 'tibble' was built under R version 4.3.3
## Warning: package 'tidyr' was built under R version 4.3.3
## Warning: package 'readr' was built under R version 4.3.3
## Warning: package 'purrr' was built under R version 4.3.3
## Warning: package 'dplyr' was built under R version 4.3.3
## Warning: package 'stringr' was built under R version 4.3.3
## Warning: package 'forcats' was built under R version 4.3.3
## Warning: package 'lubridate' was built under R version 4.3.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

Taking Random Samples

# Draw five random subsamples of `data`.
# Each subsample has half as many rows as `data` and is drawn WITH replacement,
# so the same row may occur more than once within a subsample.

# Fix the RNG state so the subsamples, and every summary and plot derived from
# them, are identical from one run to the next.
set.seed(2024)

sample_size <- floor(0.5 * nrow(data))

# Return `n` rows of `df`, chosen uniformly at random with replacement.
draw_subsample <- function(df, n) {
  df[sample(nrow(df), n, replace = TRUE), , drop = FALSE]
}

df_1 <- draw_subsample(data, sample_size)
df_2 <- draw_subsample(data, sample_size)
df_3 <- draw_subsample(data, sample_size)
df_4 <- draw_subsample(data, sample_size)
df_5 <- draw_subsample(data, sample_size)

Summary of Budgets Across Subsamples

# Budget summary for df_1: minimum, maximum, mean and median.
# Budgets that cannot be parsed as numbers become NA and are dropped before
# the statistics are computed.
df_1 |>
  mutate(budget = as.numeric(budget)) |>
  filter(!is.na(budget)) |>
  summarise(
    across(
      budget,
      list(Min = min, Max = max, Mean = mean, Median = median),
      .names = "{.fn}"
    )
  )
##   Min     Max    Mean Median
## 1   0 3.8e+08 4375417      0
# Budget summary for df_2: minimum, maximum, mean and median.
# Budgets that cannot be parsed as numbers become NA and are dropped before
# the statistics are computed.
df_2 |>
  mutate(budget = as.numeric(budget)) |>
  filter(!is.na(budget)) |>
  summarise(
    across(
      budget,
      list(Min = min, Max = max, Mean = mean, Median = median),
      .names = "{.fn}"
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `budget = as.numeric(budget)`.
## Caused by warning:
## ! NAs introduced by coercion
##   Min     Max    Mean Median
## 1   0 3.8e+08 4241655      0
# Budget summary for df_3: minimum, maximum, mean and median.
# Budgets that cannot be parsed as numbers become NA and are dropped before
# the statistics are computed.
df_3 |>
  mutate(budget = as.numeric(budget)) |>
  filter(!is.na(budget)) |>
  summarise(
    across(
      budget,
      list(Min = min, Max = max, Mean = mean, Median = median),
      .names = "{.fn}"
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `budget = as.numeric(budget)`.
## Caused by warning:
## ! NAs introduced by coercion
##   Min     Max    Mean Median
## 1   0 3.8e+08 4316634      0
# Budget summary for df_4: minimum, maximum, mean and median.
# Budgets that cannot be parsed as numbers become NA and are dropped before
# the statistics are computed.
df_4 |>
  mutate(budget = as.numeric(budget)) |>
  filter(!is.na(budget)) |>
  summarise(
    across(
      budget,
      list(Min = min, Max = max, Mean = mean, Median = median),
      .names = "{.fn}"
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `budget = as.numeric(budget)`.
## Caused by warning:
## ! NAs introduced by coercion
##   Min     Max    Mean Median
## 1   0 3.8e+08 4254095      0
# Budget summary for df_5: minimum, maximum, mean and median.
# Budgets that cannot be parsed as numbers become NA and are dropped before
# the statistics are computed.
df_5 |>
  mutate(budget = as.numeric(budget)) |>
  filter(!is.na(budget)) |>
  summarise(
    across(
      budget,
      list(Min = min, Max = max, Mean = mean, Median = median),
      .names = "{.fn}"
    )
  )
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `budget = as.numeric(budget)`.
## Caused by warning:
## ! NAs introduced by coercion
##   Min     Max    Mean Median
## 1   0 3.8e+08 4300509      0


# Make `budget` numeric in every subsample, then keep only the rows whose
# budget parsed successfully. Values that are not valid numbers become NA
# (R reports this with a coercion warning) and those rows are removed.
df_1[["budget"]] <- as.numeric(df_1[["budget"]])
df_1 <- df_1[!is.na(df_1[["budget"]]), ]

df_2[["budget"]] <- as.numeric(df_2[["budget"]])
## Warning: NAs introduced by coercion
df_2 <- df_2[!is.na(df_2[["budget"]]), ]

df_3[["budget"]] <- as.numeric(df_3[["budget"]])
## Warning: NAs introduced by coercion
df_3 <- df_3[!is.na(df_3[["budget"]]), ]

df_4[["budget"]] <- as.numeric(df_4[["budget"]])
## Warning: NAs introduced by coercion
df_4 <- df_4[!is.na(df_4[["budget"]]), ]

df_5[["budget"]] <- as.numeric(df_5[["budget"]])
## Warning: NAs introduced by coercion
df_5 <- df_5[!is.na(df_5[["budget"]]), ]

# One boxplot of the budget column per subsample, drawn in order df_1..df_5.
budget_samples <- list(
  df_1 = df_1$budget,
  df_2 = df_2$budget,
  df_3 = df_3$budget,
  df_4 = df_4$budget,
  df_5 = df_5$budget
)

for (sample_name in names(budget_samples)) {
  boxplot(
    budget_samples[[sample_name]],
    main = paste("Budget Distribution in", sample_name),
    ylab = "Budget"
  )
}


# Histogram of the budgets in df_1, with the x-axis labelled in dollars.
# No bin count is supplied, so ggplot2 uses its default of 30 bins.
ggplot(data = df_1, mapping = aes(x = budget)) +
  geom_histogram(colour = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  ggtitle("Budgets of Movies in df_1") +
  theme(plot.subtitle = element_text(colour = "darkgray"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the budgets in df_2, with the x-axis labelled in dollars.
# No bin count is supplied, so ggplot2 uses its default of 30 bins.
ggplot(data = df_2, mapping = aes(x = budget)) +
  geom_histogram(colour = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  ggtitle("Budgets of Movies in df_2") +
  theme(plot.subtitle = element_text(colour = "darkgray"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the budgets in df_3, with the x-axis labelled in dollars.
# No bin count is supplied, so ggplot2 uses its default of 30 bins.
ggplot(data = df_3, mapping = aes(x = budget)) +
  geom_histogram(colour = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  ggtitle("Budgets of Movies in df_3") +
  theme(plot.subtitle = element_text(colour = "darkgray"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the budgets in df_4, with the x-axis labelled in dollars.
# No bin count is supplied, so ggplot2 uses its default of 30 bins.
ggplot(data = df_4, mapping = aes(x = budget)) +
  geom_histogram(colour = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  ggtitle("Budgets of Movies in df_4") +
  theme(plot.subtitle = element_text(colour = "darkgray"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of the budgets in df_5, with the x-axis labelled in dollars.
# No bin count is supplied, so ggplot2 uses its default of 30 bins.
ggplot(data = df_5, mapping = aes(x = budget)) +
  geom_histogram(colour = "white") +
  scale_x_continuous(labels = scales::dollar_format()) +
  ggtitle("Budgets of Movies in df_5") +
  theme(plot.subtitle = element_text(colour = "darkgray"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Insights

  • The data suggest that the majority of movies in each subsample have a $0 budget (the median budget is $0 in every subsample), indicating that they may be indie movies or fan films.

  • When looking for anomalies in the budget, I found that the averages are very consistent across all samples. One anomaly, though, is that some subsamples have a maximum budget of $380,000,000, while the rest have a maximum budget of $270,000,000 or $300,000,000.

  • The $380,000,000 budgets are also outliers in their respective subsamples, indicating that budgets of this size are very rare.

# Scatterplot of runtime against popularity for df_1.
# Both columns are coerced to numeric, and rows where either value is missing
# or fails to parse are removed. This overwrites df_1, so later code sees the
# reduced data frame.
df_1$runtime <- as.numeric(df_1$runtime)
df_1$popularity <- as.numeric(df_1$popularity)
df_1 <- df_1[!is.na(df_1$runtime) & !is.na(df_1$popularity), ]

ggplot(df_1, aes(x = runtime, y = popularity)) +
  geom_point(color = "#69b3a2", size = 3, alpha = 0.7) +
  # The title names the subsample so the five otherwise identical plots can be
  # told apart, matching the boxplot and histogram titles.
  labs(
    title = "Movie Runtime vs Popularity Rating in df_1",
    x = "Movie Runtime (minutes)",
    y = "Popularity Rating"
  ) +
  theme_minimal() +
  # Axis ticks every 50 units, from 0 up to the largest observed value.
  scale_x_continuous(breaks = seq(0, max(df_1$runtime, na.rm = TRUE), by = 50)) +
  scale_y_continuous(breaks = seq(0, max(df_1$popularity, na.rm = TRUE), by = 50))

# Scatterplot of runtime against popularity for df_2.
# Both columns are coerced to numeric, and rows where either value is missing
# or fails to parse are removed. This overwrites df_2, so later code sees the
# reduced data frame.
df_2$runtime <- as.numeric(df_2$runtime)
df_2$popularity <- as.numeric(df_2$popularity)
df_2 <- df_2[!is.na(df_2$runtime) & !is.na(df_2$popularity), ]

ggplot(df_2, aes(x = runtime, y = popularity)) +
  geom_point(color = "#69b3a2", size = 3, alpha = 0.7) +
  # The title names the subsample so the five otherwise identical plots can be
  # told apart, matching the boxplot and histogram titles.
  labs(
    title = "Movie Runtime vs Popularity Rating in df_2",
    x = "Movie Runtime (minutes)",
    y = "Popularity Rating"
  ) +
  theme_minimal() +
  # Axis ticks every 50 units, from 0 up to the largest observed value.
  scale_x_continuous(breaks = seq(0, max(df_2$runtime, na.rm = TRUE), by = 50)) +
  scale_y_continuous(breaks = seq(0, max(df_2$popularity, na.rm = TRUE), by = 50))

# Scatterplot of runtime against popularity for df_3.
# Both columns are coerced to numeric, and rows where either value is missing
# or fails to parse are removed. This overwrites df_3, so later code sees the
# reduced data frame.
df_3$runtime <- as.numeric(df_3$runtime)
df_3$popularity <- as.numeric(df_3$popularity)
df_3 <- df_3[!is.na(df_3$runtime) & !is.na(df_3$popularity), ]

ggplot(df_3, aes(x = runtime, y = popularity)) +
  geom_point(color = "#69b3a2", size = 3, alpha = 0.7) +
  # The title names the subsample so the five otherwise identical plots can be
  # told apart, matching the boxplot and histogram titles.
  labs(
    title = "Movie Runtime vs Popularity Rating in df_3",
    x = "Movie Runtime (minutes)",
    y = "Popularity Rating"
  ) +
  theme_minimal() +
  # Axis ticks every 50 units, from 0 up to the largest observed value.
  scale_x_continuous(breaks = seq(0, max(df_3$runtime, na.rm = TRUE), by = 50)) +
  scale_y_continuous(breaks = seq(0, max(df_3$popularity, na.rm = TRUE), by = 50))

# Scatterplot of runtime against popularity for df_4.
# Both columns are coerced to numeric, and rows where either value is missing
# or fails to parse are removed. This overwrites df_4, so later code sees the
# reduced data frame.
df_4$runtime <- as.numeric(df_4$runtime)
df_4$popularity <- as.numeric(df_4$popularity)
df_4 <- df_4[!is.na(df_4$runtime) & !is.na(df_4$popularity), ]

ggplot(df_4, aes(x = runtime, y = popularity)) +
  geom_point(color = "#69b3a2", size = 3, alpha = 0.7) +
  # The title names the subsample so the five otherwise identical plots can be
  # told apart, matching the boxplot and histogram titles.
  labs(
    title = "Movie Runtime vs Popularity Rating in df_4",
    x = "Movie Runtime (minutes)",
    y = "Popularity Rating"
  ) +
  theme_minimal() +
  # Axis ticks every 50 units, from 0 up to the largest observed value.
  scale_x_continuous(breaks = seq(0, max(df_4$runtime, na.rm = TRUE), by = 50)) +
  scale_y_continuous(breaks = seq(0, max(df_4$popularity, na.rm = TRUE), by = 50))

# Scatterplot of runtime against popularity for df_5.
# Both columns are coerced to numeric, and rows where either value is missing
# or fails to parse are removed. This overwrites df_5, so later code sees the
# reduced data frame.
df_5$runtime <- as.numeric(df_5$runtime)
df_5$popularity <- as.numeric(df_5$popularity)
df_5 <- df_5[!is.na(df_5$runtime) & !is.na(df_5$popularity), ]

ggplot(df_5, aes(x = runtime, y = popularity)) +
  geom_point(color = "#69b3a2", size = 3, alpha = 0.7) +
  # The title names the subsample so the five otherwise identical plots can be
  # told apart, matching the boxplot and histogram titles.
  labs(
    title = "Movie Runtime vs Popularity Rating in df_5",
    x = "Movie Runtime (minutes)",
    y = "Popularity Rating"
  ) +
  theme_minimal() +
  # Axis ticks every 50 units, from 0 up to the largest observed value.
  scale_x_continuous(breaks = seq(0, max(df_5$runtime, na.rm = TRUE), by = 50)) +
  scale_y_continuous(breaks = seq(0, max(df_5$popularity, na.rm = TRUE), by = 50))

Insights:

  • The dataset is skewed towards movies with runtimes of around 100-150 minutes.

  • Some data points have runtimes well above the average but very low popularity; these can be considered outliers.

  • The majority of the data points are clustered in roughly the same area across all subsamples when we compare runtime with popularity ratings.

How Might this Investigation Affect Drawing Conclusions