Data Preprocessing Functions

# Coerce a column to numeric and blank out values that cannot be valid.
#
# x: a vector (numeric, character, or factor); it is passed through
#    as.character() first so factors convert by label, not by level code.
#
# Returns a numeric vector of the same length in which negative values and
# +/-Inf have been replaced by NA.
clean_numeric_column <- function(x) {
  values <- as.numeric(as.character(x))
  # One mask for both rules; NA entries in the mask select nothing on assignment
  invalid <- values < 0 | is.infinite(values)
  values[invalid] <- NA
  values
}

# Handle outliers in a numeric vector using the 1.5 * IQR rule.
#
# x:      numeric vector; NAs are ignored when computing the quartiles and
#         are left in place in the result.
# method: "cap" (default) replaces values outside the fences with the fence
#         value; "remove" replaces them with NA.
#
# Returns a vector the same length as x. An unrecognised `method` returns x
# unchanged (as before) but now raises a warning so the no-op is not silent.
handle_outliers <- function(x, method = "cap") {
  known_method <- is.character(method) && length(method) == 1L &&
    method %in% c("cap", "remove")
  if (!known_method) {
    warning(
      "Unknown `method`; expected \"cap\" or \"remove\". Returning `x` unchanged.",
      call. = FALSE
    )
    return(x)
  }

  # names = FALSE keeps the fences as plain numbers (no "25%"/"75%" labels)
  q1 <- quantile(x, 0.25, na.rm = TRUE, names = FALSE)
  q3 <- quantile(x, 0.75, na.rm = TRUE, names = FALSE)
  # Named `iqr_width` rather than `IQR` to avoid masking stats::IQR()
  iqr_width <- q3 - q1
  lower_bound <- q1 - 1.5 * iqr_width
  upper_bound <- q3 + 1.5 * iqr_width

  if (method == "cap") {
    x[x < lower_bound] <- lower_bound
    x[x > upper_bound] <- upper_bound
  } else {
    x[x < lower_bound | x > upper_bound] <- NA
  }
  x
}

# Summarise selected numeric columns into one row per variable.
#
# df:           data frame containing the columns to summarise.
# numeric_cols: character vector of column names (selected with all_of()).
#
# Returns a table with a `variable` column plus one column per statistic:
# mean, median, sd, skew, kurtosis and missing (count of NAs).
# skewness() and kurtosis() are not defined in this file and must come from
# an attached package.
create_summary_stats <- function(df, numeric_cols) {
  # Step 1: one wide row with columns named "<variable>_<statistic>"
  wide_stats <- summarise(
    select(df, all_of(numeric_cols)),
    across(
      everything(),
      list(
        mean = ~mean(.x, na.rm = TRUE),
        median = ~median(.x, na.rm = TRUE),
        sd = ~sd(.x, na.rm = TRUE),
        skew = ~skewness(.x, na.rm = TRUE),
        kurtosis = ~kurtosis(.x, na.rm = TRUE),
        missing = ~sum(is.na(.x))
      )
    )
  )

  # Step 2: split each column name at its last underscore into
  # (variable, statistic); the greedy first group keeps names such as
  # "budget_million" intact
  long_stats <- pivot_longer(
    wide_stats,
    everything(),
    names_to = c("variable", "statistic"),
    names_pattern = "(.*)_(.*)"
  )

  # Step 3: spread the statistics back out, one column each
  pivot_wider(long_stats, names_from = statistic, values_from = value)
}

# Build a ggplot scatter of two columns and convert it to an interactive
# widget with ggplotly().
#
# df:        data frame holding the columns to plot.
# x_var:     string naming the x-axis column.
# y_var:     string naming the y-axis column.
# color_var: optional string naming a column to map to point colour;
#            NULL (the default) adds no colour mapping.
#
# Returns the value of ggplotly() applied to the finished plot.
# NOTE(review): aes_string() is deprecated in recent ggplot2 releases; it is
# kept here so behaviour stays untouched -- confirm the effect on the
# interactive tooltips before migrating.
create_interactive_scatter <- function(df, x_var, y_var, color_var = NULL) {
  # "budget_million" -> "Budget Million" for the axis titles
  humanise <- function(name) {
    str_to_title(str_replace_all(name, "_", " "))
  }

  scatter <- ggplot(df, aes_string(x = x_var, y = y_var)) +
    geom_point(alpha = 0.6) +
    theme_minimal() +
    labs(
      title = paste(y_var, "vs", x_var),
      x = humanise(x_var),
      y = humanise(y_var)
    )

  if (is.null(color_var)) {
    return(ggplotly(scatter))
  }
  ggplotly(scatter + aes_string(color = color_var))
}

Data Import and Initial Cleaning

# Create sample movie dataset (replace with your actual data)
# The seed is fixed so the simulated values are the same on every run.
# The order of the random draws below determines those values, so
# reordering the columns changes the data.
set.seed(42)
n <- 1000

# Every column is drawn independently of the others, so any relationship
# found between them later is due to chance.
movies_df <- data.frame(
  title = paste("Movie", 1:n),
  # Normal draws are unbounded, so a few budgets/revenues can come out
  # negative; clean_numeric_column() converts negatives to NA.
  budget = rnorm(n, mean = 50e6, sd = 20e6),
  revenue = rnorm(n, mean = 100e6, sd = 40e6),
  runtime = rnorm(n, mean = 120, sd = 20),
  vote_average = rnorm(n, mean = 7, sd = 1),
  vote_count = rpois(n, lambda = 1000),
  # Log-normal draw: strictly positive and right-skewed
  popularity = rlnorm(n, meanlog = 2, sdlog = 1),
  # Languages are sampled with replacement using the given weights ("en" = 0.6)
  original_language = sample(c("en", "es", "fr", "de", "ja", "ko", "hi"), 
                           n, replace = TRUE, prob = c(0.6, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05)),
  # Dates are sampled without replacement (sample()'s default), so each is unique
  release_date = sample(seq(as.Date("2000/01/01"), as.Date("2023/12/31"), by = "day"), n)
)

# Initial data cleaning: standardise the column names, sanitise the numeric
# fields, derive year / million-scaled / ROI columns, and keep only rows
# where budget, revenue and runtime are all present.
movies_clean <- movies_df %>%
  clean_names() %>%
  mutate(
    # mutate() evaluates its arguments in order, so the derived columns
    # below are computed from the already-cleaned values
    across(
      c(budget, revenue, runtime, vote_average, vote_count, popularity),
      clean_numeric_column
    ),
    release_year = year(release_date),
    budget_million = budget / 1e6,
    revenue_million = revenue / 1e6,
    roi = (revenue - budget) / budget,
    original_language = factor(original_language)
  ) %>%
  filter(!(is.na(budget) | is.na(revenue) | is.na(runtime)))

# Print initial summary
print("Initial data summary:")

## [1] "Initial data summary:"

skim(movies_clean)

Data summary
Name	movies_clean
Number of rows	987
Number of columns	13
_______________________
Column type frequency:
character	1
Date	1
factor	1
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
title	0	1	7	10	0	987	0

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
release_date	0	1	2000-01-02	2023-12-31	2011-08-28	987

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
original_language	0	1	FALSE	7	en: 595, fr: 115, es: 88, de: 58

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
budget	1	49878978.04	19538024.92	773290.50	36791823.11	49818870.49	63289462.81	119906085.28	▂▇▇▂▁
revenue	1	100492070.09	38619655.77	1828910.60	73990337.23	100013135.09	126375606.96	243386379.72	▂▇▇▂▁
runtime	1	119.99	20.52	56.01	107.33	120.25	132.78	189.42	▁▃▇▃▁
vote_average	1	6.98	0.98	4.05	6.31	6.96	7.65	10.17	▁▅▇▃▁
vote_count	1	999.52	31.52	905.00	977.00	1001.00	1020.50	1107.00	▁▅▇▃▁
popularity	1	12.54	20.56	0.15	3.69	7.57	14.39	326.40	▇▁▁▁▁
release_year	1	2011.44	7.04	2000.00	2005.00	2011.00	2018.00	2023.00	▇▇▆▇▇
budget_million	1	49.88	19.54	0.77	36.79	49.82	63.29	119.91	▂▇▇▂▁
revenue_million	1	100.49	38.62	1.83	73.99	100.01	126.38	243.39	▂▇▇▂▁
roi	1	1.81	5.20	-0.97	0.38	0.98	1.93	113.85	▇▁▁▁▁

Exploratory Data Analysis

Numerical Variables Distribution

# Create distribution plots for numerical variables
numeric_vars <- c("budget_million", "revenue_million", "runtime", "vote_average", "popularity")

# One histogram (on the density scale) with a density curve per variable.
# The column is looked up by name with the .data pronoun and the density
# is requested with after_stat(); these replace the deprecated aes_string()
# and `..density..` forms.
plots <- lapply(numeric_vars, function(var) {
  ggplot(movies_clean, aes(x = .data[[var]])) +
    # bins = 30 is the default made explicit, which silences the
    # "pick a better value" message without changing the histogram
    geom_histogram(aes(y = after_stat(density)), bins = 30,
                   fill = "steelblue", alpha = 0.7) +
    geom_density(color = "red") +
    theme_minimal() +
    # x is set explicitly so the axis keeps the raw column name as its label
    labs(title = paste("Distribution of", str_to_title(str_replace_all(var, "_", " "))),
         x = var)
})

# Display plots in a grid
do.call(grid.arrange, c(plots, ncol = 2))

Correlation Analysis

# Pairwise correlations among the key numeric measures, computed on rows
# with no missing value in any of the selected columns
cor_matrix <- cor(
  select(movies_clean, budget_million, revenue_million, runtime, vote_average, popularity),
  use = "complete.obs"
)

# Draw the upper triangle (diagonal omitted) as coloured cells with the
# coefficients printed on top
corrplot(
  cor_matrix,
  type = "upper",
  diag = FALSE,
  method = "color",
  addCoef.col = "black",
  tl.srt = 45,
  tl.col = "black"
)

Revenue Analysis by Language

# Revenue spread for each original language, one box per language
ggplot(
  data = movies_clean,
  mapping = aes(x = original_language, y = revenue_million)
) +
  geom_boxplot(alpha = 0.7, fill = "steelblue") +
  labs(
    x = "Original Language",
    y = "Revenue (Millions)",
    title = "Revenue Distribution by Original Language"
  ) +
  theme_minimal() +
  # Slant the language codes; this must follow theme_minimal(), which
  # would otherwise reset the axis text
  theme(axis.text.x = element_text(hjust = 1, angle = 45))

Time Series Analysis

# Per-year averages of revenue and budget (in millions) plus the number of
# movies released that year
yearly_revenue <- summarise(
  group_by(movies_clean, release_year),
  avg_revenue = mean(revenue_million, na.rm = TRUE),
  avg_budget = mean(budget_million, na.rm = TRUE),
  n_movies = n()
)

# Plot both averages over time. The lines get their colours through the
# manual scale (which also produces the legend); the points are coloured
# directly to match.
ggplot(data = yearly_revenue, mapping = aes(x = release_year)) +
  geom_line(mapping = aes(y = avg_revenue, color = "Revenue"), size = 1) +
  geom_line(mapping = aes(y = avg_budget, color = "Budget"), size = 1) +
  geom_point(mapping = aes(y = avg_revenue), color = "blue") +
  geom_point(mapping = aes(y = avg_budget), color = "red") +
  scale_color_manual(values = c(Revenue = "blue", Budget = "red")) +
  labs(
    x = "Release Year",
    y = "Amount (Millions)",
    color = "Metric",
    title = "Average Revenue and Budget Trends"
  ) +
  theme_minimal()

ROI Analysis

# Histogram of ROI; roi is stored as a ratio, so the axis is labelled in percent
ggplot(data = movies_clean, mapping = aes(x = roi)) +
  geom_histogram(fill = "steelblue", alpha = 0.7, bins = 50) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    x = "ROI",
    y = "Count",
    title = "Distribution of Return on Investment (ROI)"
  ) +
  theme_minimal()

# Ten highest-ROI movies: keep the display columns, sort descending, take
# the first ten rows, then format roi as a percentage string
top_roi <- movies_clean %>%
  select(title, budget_million, revenue_million, roi, release_year) %>%
  arrange(desc(roi)) %>%
  head(10) %>%
  mutate(roi = scales::percent(roi, accuracy = 0.1))

print("Top 10 Movies by ROI:")

## [1] "Top 10 Movies by ROI:"

datatable(top_roi)

Statistical Analysis

# Summary statistics for the main numeric measures
summary_stats <- movies_clean %>%
  create_summary_stats(
    numeric_cols = c(
      "budget_million", "revenue_million", "runtime",
      "vote_average", "popularity"
    )
  )

# Show the statistics as an interactive table
print("Summary Statistics:")

## [1] "Summary Statistics:"

datatable(summary_stats)

# Two-sample t-test of revenue, grouping movies by whether the original
# language is English (TRUE) or not (FALSE)
language_test <- t.test(
  revenue_million ~ original_language == "en",
  data = movies_clean
)
print("T-test Results (English vs Non-English Revenue):")

## [1] "T-test Results (English vs Non-English Revenue):"

print(language_test)

## 
##  Welch Two Sample t-test
## 
## data:  revenue_million by original_language == "en"
## t = 0.94961, df = 811.01, p-value = 0.3426
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -2.568744  7.383396
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##           101.94329            99.53597

Interactive Visualizations

# Interactive budget-vs-revenue scatter, coloured by original language
budget_revenue_plot <- create_interactive_scatter(
  df = movies_clean,
  x_var = "budget_million",
  y_var = "revenue_million",
  color_var = "original_language"
)
budget_revenue_plot

Conclusions

This analysis reveals several key insights about the movie dataset:

Distribution patterns of key metrics
Correlation between budget and revenue
Language-specific performance trends
Temporal trends in the industry
Return on Investment patterns

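The interactive visualizations and statistical tests provide a comprehensive view of the dataset. Because the data used here are simulated (each column is generated independently in the sample dataset above), these results demonstrate the analysis workflow rather than real movie industry dynamics; substitute actual data before drawing substantive conclusions.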

Enhanced Movie Data Analysis

Eduardo Pérez Martínez

2025-05-02