Data Preprocessing Functions

# Coerce a column to numeric and blank out implausible values.
#
# The input is first converted to character and then to numeric, so factors
# are parsed by their labels rather than their integer codes. Negative
# values and +/-Inf are replaced with NA; existing NA/NaN entries are left
# untouched.
#
# @param x A vector (numeric, character, or factor).
# @return A numeric vector of the same length as `x`.
clean_numeric_column <- function(x) {
  values <- as.numeric(as.character(x))
  # Build the mask only over non-missing entries so NA never reaches `[<-`.
  invalid <- !is.na(values) & (values < 0 | is.infinite(values))
  values[invalid] <- NA
  values
}

# Handle outliers using the IQR (Tukey fence) method.
#
# Fences are placed 1.5 interquartile ranges below the first quartile and
# above the third quartile; quartiles are computed with missing values
# ignored.
#
# @param x A numeric vector; NA entries are allowed and are left as NA.
# @param method "cap" replaces values outside the fences with the nearest
#   fence; "remove" replaces them with NA. Any other value leaves `x`
#   unchanged and raises a warning.
# @return A vector of the same length as `x`.
handle_outliers <- function(x, method = "cap") {
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Named `spread` rather than `IQR` so stats::IQR() is not shadowed.
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - 1.5 * spread
  upper_bound <- quartiles[2] + 1.5 * spread

  # which() drops NA comparisons, so missing values are never indexed.
  below <- which(x < lower_bound)
  above <- which(x > upper_bound)

  if (identical(method, "cap")) {
    x[below] <- lower_bound
    x[above] <- upper_bound
  } else if (identical(method, "remove")) {
    x[c(below, above)] <- NA
  } else {
    # Previously an unknown method was ignored silently; keep returning the
    # input unchanged but make the mistake visible.
    warning(
      "Unknown outlier method; expected \"cap\" or \"remove\". ",
      "Returning `x` unchanged.",
      call. = FALSE
    )
  }
  x
}

# Build a table of summary statistics with one row per variable.
#
# For every column named in `numeric_cols`, computes mean, median, standard
# deviation, skewness, kurtosis (all with NAs removed) and the number of
# missing values, then reshapes the result so each statistic is a column.
# NOTE(review): skewness() and kurtosis() are not base R; presumably they
# come from a package attached elsewhere in the document -- confirm.
#
# @param df A data frame.
# @param numeric_cols Character vector of column names to summarise.
# @return A data frame with a `variable` column plus one column per
#   statistic.
create_summary_stats <- function(df, numeric_cols) {
  stat_fns <- list(
    mean = ~mean(.x, na.rm = TRUE),
    median = ~median(.x, na.rm = TRUE),
    sd = ~sd(.x, na.rm = TRUE),
    skew = ~skewness(.x, na.rm = TRUE),
    kurtosis = ~kurtosis(.x, na.rm = TRUE),
    missing = ~sum(is.na(.x))
  )

  # One wide row: columns are named "<variable>_<statistic>".
  wide_stats <- summarise(
    select(df, all_of(numeric_cols)),
    across(everything(), stat_fns)
  )

  # The first capture group is greedy, so names are split at the LAST
  # underscore; variables such as "budget_million" stay intact.
  long_stats <- pivot_longer(
    wide_stats,
    everything(),
    names_to = c("variable", "statistic"),
    names_pattern = "(.*)_(.*)"
  )

  pivot_wider(long_stats, names_from = statistic, values_from = value)
}

# Build an interactive scatter plot of two columns.
#
# Axis labels are derived from the column names by replacing underscores
# with spaces and title-casing the result; the plot title is
# "<y_var> vs <x_var>" using the raw column names.
# NOTE(review): aes_string() is deprecated in recent ggplot2 releases.
# Switching to aes(.data[[...]]) may change the labels ggplotly() shows in
# tooltips -- verify before migrating.
#
# @param df A data frame.
# @param x_var,y_var Column names (strings) for the x and y axes.
# @param color_var Optional column name (string) mapped to point colour.
# @return The object returned by ggplotly().
create_interactive_scatter <- function(df, x_var, y_var, color_var = NULL) {
  pretty_label <- function(var_name) {
    str_to_title(str_replace_all(var_name, "_", " "))
  }

  scatter <- ggplot(df, aes_string(x = x_var, y = y_var)) +
    geom_point(alpha = 0.6) +
    theme_minimal() +
    labs(
      title = paste(y_var, "vs", x_var),
      x = pretty_label(x_var),
      y = pretty_label(y_var)
    )

  # Without a colour variable the base plot is converted as-is.
  if (is.null(color_var)) {
    return(ggplotly(scatter))
  }

  ggplotly(scatter + aes_string(color = color_var))
}

Data Import and Initial Cleaning

# Create sample movie dataset (replace with your actual data).
# Every column is drawn independently from a random distribution, so any
# relationship between columns in this sample is due to chance.
# The seed is fixed so the sample is reproducible; the random draws happen
# in the order the columns are listed, so reordering them changes the data.
set.seed(42)
n <- 1000

movies_df <- data.frame(
  title = paste("Movie", 1:n),
  # Normal draws can be negative; negative values are turned into NA by the
  # cleaning step that follows.
  budget = rnorm(n, mean = 50e6, sd = 20e6),
  revenue = rnorm(n, mean = 100e6, sd = 40e6),
  runtime = rnorm(n, mean = 120, sd = 20),
  vote_average = rnorm(n, mean = 7, sd = 1),
  vote_count = rpois(n, lambda = 1000),
  # Log-normal, hence strictly positive and right-skewed.
  popularity = rlnorm(n, meanlog = 2, sdlog = 1),
  # Languages are sampled with replacement; "en" has probability 0.6.
  original_language = sample(c("en", "es", "fr", "de", "ja", "ko", "hi"), 
                           n, replace = TRUE, prob = c(0.6, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05)),
  # Dates are sampled without replacement, so each movie gets a distinct day.
  release_date = sample(seq(as.Date("2000/01/01"), as.Date("2023/12/31"), by = "day"), n)
)

# Initial data cleaning.
# Steps: standardise column names, coerce the numeric columns (negative and
# infinite values become NA), derive year / million-scaled / ROI columns,
# and finally drop rows missing budget, revenue or runtime.
# NOTE(review): clean_names() and year() are presumably janitor::clean_names
# and lubridate::year, attached elsewhere in the document -- confirm.
movies_clean <- movies_df %>%
  clean_names() %>%  # Standardize column names
  mutate(across(c(budget, revenue, runtime, vote_average, vote_count, popularity),
                clean_numeric_column)) %>%
  mutate(
    release_year = year(release_date),
    budget_million = budget / 1e6,
    revenue_million = revenue / 1e6,
    # A zero budget survives cleaning and would make the ratio Inf (or NaN
    # when revenue is also zero); report ROI as missing in that case.
    roi = if_else(budget > 0, (revenue - budget) / budget, NA_real_),
    original_language = factor(original_language)
  ) %>%
  filter(!is.na(budget), !is.na(revenue), !is.na(runtime))

# Print initial summary: a header line followed by a per-column overview of
# the cleaned data.
print("Initial data summary:")
## [1] "Initial data summary:"
# NOTE(review): skim() is presumably skimr::skim(). The "Name" field in the
# recorded output matches the object name passed here, so the direct call
# form is kept.
skim(movies_clean)
Data summary
Name movies_clean
Number of rows 987
Number of columns 13
_______________________
Column type frequency:
character 1
Date 1
factor 1
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
title 0 1 7 10 0 987 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
release_date 0 1 2000-01-02 2023-12-31 2011-08-28 987

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
original_language 0 1 FALSE 7 en: 595, fr: 115, es: 88, de: 58

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
budget 0 1 49878978.04 19538024.92 773290.50 36791823.11 49818870.49 63289462.81 119906085.28 ▂▇▇▂▁
revenue 0 1 100492070.09 38619655.77 1828910.60 73990337.23 100013135.09 126375606.96 243386379.72 ▂▇▇▂▁
runtime 0 1 119.99 20.52 56.01 107.33 120.25 132.78 189.42 ▁▃▇▃▁
vote_average 0 1 6.98 0.98 4.05 6.31 6.96 7.65 10.17 ▁▅▇▃▁
vote_count 0 1 999.52 31.52 905.00 977.00 1001.00 1020.50 1107.00 ▁▅▇▃▁
popularity 0 1 12.54 20.56 0.15 3.69 7.57 14.39 326.40 ▇▁▁▁▁
release_year 0 1 2011.44 7.04 2000.00 2005.00 2011.00 2018.00 2023.00 ▇▇▆▇▇
budget_million 0 1 49.88 19.54 0.77 36.79 49.82 63.29 119.91 ▂▇▇▂▁
revenue_million 0 1 100.49 38.62 1.83 73.99 100.01 126.38 243.39 ▂▇▇▂▁
roi 0 1 1.81 5.20 -0.97 0.38 0.98 1.93 113.85 ▇▁▁▁▁

Exploratory Data Analysis

Numerical Variables Distribution

# Create distribution plots for numerical variables: a density-scaled
# histogram with a kernel density curve overlaid, one plot per variable.
numeric_vars <- c("budget_million", "revenue_million", "runtime", "vote_average", "popularity")

plots <- lapply(numeric_vars, function(var) {
  # `.data[[var]]` replaces the deprecated aes_string(), and
  # after_stat(density) replaces the deprecated `..density..` notation.
  ggplot(movies_clean, aes(x = .data[[var]])) +
    # bins = 30 is the stat_bin default; stating it silences the
    # "pick better value" message without changing the histogram.
    geom_histogram(aes(y = after_stat(density)), bins = 30,
                   fill = "steelblue", alpha = 0.7) +
    geom_density(color = "red") +
    theme_minimal() +
    # Axis labels are set explicitly so they do not depend on how the
    # mapping expressions are deparsed.
    labs(title = paste("Distribution of", str_to_title(str_replace_all(var, "_", " "))),
         x = var,
         y = "density")
})

# Display plots in a grid
do.call(grid.arrange, c(plots, ncol = 2))

Correlation Analysis

# Pairwise correlations between the main numeric measures, using only rows
# that are complete across all five columns.
cor_matrix <- cor(
  select(movies_clean, budget_million, revenue_million, runtime, vote_average, popularity),
  use = "complete.obs"
)

# Draw the upper triangle as coloured tiles with the coefficients printed
# on top; the diagonal is omitted.
corrplot(
  cor_matrix,
  type = "upper",
  diag = FALSE,
  method = "color",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)

Revenue Analysis by Language

# Boxplot of revenue (in millions) for each original language.
ggplot(
  data = movies_clean,
  mapping = aes(original_language, revenue_million)
) +
  geom_boxplot(alpha = 0.7, fill = "steelblue") +
  labs(
    title = "Revenue Distribution by Original Language",
    x = "Original Language",
    y = "Revenue (Millions)"
  ) +
  theme_minimal() +
  # Must follow theme_minimal(), which would otherwise reset the rotation.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Time Series Analysis

# Per-year averages of revenue and budget (in millions) plus the number of
# movies released that year.
yearly_revenue <- summarise(
  group_by(movies_clean, release_year),
  avg_revenue = mean(revenue_million, na.rm = TRUE),
  avg_budget = mean(budget_million, na.rm = TRUE),
  n_movies = n()
)

# Plot both series against release year. The lines carry the legend via a
# constant colour label; the points are coloured directly to match.
ggplot(data = yearly_revenue, mapping = aes(x = release_year)) +
  geom_line(aes(y = avg_revenue, color = "Revenue"), size = 1) +
  geom_line(aes(y = avg_budget, color = "Budget"), size = 1) +
  geom_point(aes(y = avg_revenue), color = "blue") +
  geom_point(aes(y = avg_budget), color = "red") +
  scale_color_manual(values = c("Revenue" = "blue", "Budget" = "red")) +
  labs(
    title = "Average Revenue and Budget Trends",
    x = "Release Year",
    y = "Amount (Millions)",
    color = "Metric"
  ) +
  theme_minimal()

ROI Analysis

# Histogram of ROI across all movies, with the x axis labelled as
# percentages (an ROI of 1 is shown as 100%).
movies_clean %>%
  ggplot(aes(x = roi)) +
  geom_histogram(alpha = 0.7, fill = "steelblue", bins = 50) +
  scale_x_continuous(labels = scales::percent) +
  labs(
    title = "Distribution of Return on Investment (ROI)",
    x = "ROI",
    y = "Count"
  ) +
  theme_minimal()

# Ten movies with the highest ROI. Sorting happens while `roi` is still
# numeric; it is formatted as a percentage string only for display.
top_roi <- movies_clean %>%
  select(title, budget_million, revenue_million, roi, release_year) %>%
  arrange(desc(roi)) %>%
  head(10) %>%
  mutate(roi = scales::percent(roi, accuracy = 0.1))

print("Top 10 Movies by ROI:")
## [1] "Top 10 Movies by ROI:"
datatable(top_roi)

Statistical Analysis

# Summary statistics (one row per variable) for the main numeric measures.
summary_stats <- movies_clean %>%
  create_summary_stats(
    c("budget_million", "revenue_million", "runtime", "vote_average", "popularity")
  )

# Show the table under a printed heading.
print("Summary Statistics:")
## [1] "Summary Statistics:"
datatable(summary_stats)
# Perform t-test comparing revenue of English vs non-English movies.
# The grouping term `original_language == "en"` is logical, so the two
# groups are FALSE (non-English) and TRUE (English). The recorded output
# below reports this as a Welch two-sample t-test.
language_test <- t.test(revenue_million ~ original_language == "en", data = movies_clean)
print("T-test Results (English vs Non-English Revenue):")
## [1] "T-test Results (English vs Non-English Revenue):"
print(language_test)
## 
##  Welch Two Sample t-test
## 
## data:  revenue_million by original_language == "en"
## t = 0.94961, df = 811.01, p-value = 0.3426
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -2.568744  7.383396
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##           101.94329            99.53597

Interactive Visualizations

# Interactive budget-vs-revenue scatter, with points coloured by original
# language.
budget_revenue_plot <- create_interactive_scatter(
  df = movies_clean,
  x_var = "budget_million",
  y_var = "revenue_million",
  color_var = "original_language"
)
# Auto-print the widget.
budget_revenue_plot

Conclusions

This analysis examines the movie dataset from five angles:

  1. Distribution patterns of key metrics
  2. Correlation between budget and revenue
  3. Language-specific performance trends
  4. Temporal trends in the industry
  5. Return on Investment patterns

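The interactive visualizations and statistical tests provide a comprehensive view of the dataset. Because the data used here are simulated placeholders (each column is drawn independently at random), rerun the analysis on real data before drawing conclusions about movie industry dynamics.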