# Coerce a column to numeric and blank out values that are not usable
# measurements.
#
# x: a vector (character, factor or numeric) holding numeric data.
# Returns a numeric vector of the same length as `x` in which negative and
# infinite entries have been replaced by NA. Entries that cannot be parsed
# as numbers become NA through as.numeric().
clean_numeric_column <- function(x) {
  values <- as.numeric(as.character(x))
  # which() ignores NA comparisons, so already-missing entries stay missing.
  invalid <- which(values < 0 | is.infinite(values))
  is.na(values) <- invalid
  values
}
# Treat outliers in a numeric vector using IQR-based fences.
#
# x:      numeric vector. NA values are ignored when the quartiles are
#         computed and are left untouched in the result.
# method: "cap" (default) clamps values outside the fences to the nearest
#         fence; "remove" replaces them with NA. Any other value leaves `x`
#         unchanged and raises a warning so a typo is not silently ignored.
# k:      multiplier applied to the IQR to position the fences
#         (default 1.5).
# Returns a vector of the same length as `x`.
handle_outliers <- function(x, method = "cap", k = 1.5) {
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Called `spread` rather than `IQR` so stats::IQR() is not shadowed.
  spread <- quartiles[2] - quartiles[1]
  lower_bound <- quartiles[1] - k * spread
  upper_bound <- quartiles[2] + k * spread

  if (method == "cap") {
    x[x < lower_bound] <- lower_bound
    x[x > upper_bound] <- upper_bound
  } else if (method == "remove") {
    x[x < lower_bound | x > upper_bound] <- NA
  } else {
    warning(
      "Unknown outlier method '", method, "'; returning `x` unchanged. ",
      "Use \"cap\" or \"remove\".",
      call. = FALSE
    )
  }
  x
}
# Build a table of descriptive statistics with one row per variable.
#
# df:           data frame containing the columns to summarise.
# numeric_cols: character vector naming the columns of `df` to include.
# Returns a data frame with a `variable` column followed by one column per
# statistic: mean, median, sd, skew, kurtosis and missing (NA count).
# skewness() and kurtosis() must be provided by an attached package.
create_summary_stats <- function(df, numeric_cols) {
  stat_funs <- list(
    mean = function(v) mean(v, na.rm = TRUE),
    median = function(v) median(v, na.rm = TRUE),
    sd = function(v) sd(v, na.rm = TRUE),
    skew = function(v) skewness(v, na.rm = TRUE),
    kurtosis = function(v) kurtosis(v, na.rm = TRUE),
    missing = function(v) sum(is.na(v))
  )

  selected <- select(df, all_of(numeric_cols))
  # across() names each result "<column>_<statistic>".
  wide_stats <- summarise(selected, across(everything(), stat_funs))

  # The greedy first group splits each name at its LAST underscore, so
  # column names that contain underscores themselves stay intact.
  long_stats <- pivot_longer(
    wide_stats,
    everything(),
    names_to = c("variable", "statistic"),
    names_pattern = "(.*)_(.*)"
  )
  pivot_wider(long_stats, names_from = statistic, values_from = value)
}
# Build an interactive scatter plot of two columns.
#
# df:        data frame to plot.
# x_var:     name of the x-axis column, given as a string.
# y_var:     name of the y-axis column, given as a string.
# color_var: optional name (string) of a column mapped to point colour;
#            NULL (default) leaves the points uncoloured by any variable.
# Returns the result of ggplotly() applied to the ggplot object.
#
# Column names are turned into symbols and unquoted inside aes() because
# aes_string() is deprecated in ggplot2. Names are treated as literal
# column names, so non-syntactic names work too.
create_interactive_scatter <- function(df, x_var, y_var, color_var = NULL) {
  p <- ggplot(df, aes(x = !!sym(x_var), y = !!sym(y_var))) +
    geom_point(alpha = 0.6) +
    theme_minimal() +
    labs(
      title = paste(y_var, "vs", x_var),
      x = str_to_title(str_replace_all(x_var, "_", " ")),
      y = str_to_title(str_replace_all(y_var, "_", " "))
    )
  if (!is.null(color_var)) {
    # Adding an aes() to the plot extends its default mapping.
    p <- p + aes(color = !!sym(color_var))
  }
  ggplotly(p)
}
# Create sample movie dataset (replace with your actual data)
# Simulate a stand-in movie dataset. The seed is fixed and the random
# columns are drawn in the order they are listed, so every run produces
# the same values.
set.seed(42)
n <- 1000
movies_df <- data.frame(
  title = paste("Movie", seq_len(n)),
  # Monetary and rating columns are normal draws and can come out negative.
  budget = rnorm(n, mean = 5e7, sd = 2e7),
  revenue = rnorm(n, mean = 1e8, sd = 4e7),
  runtime = rnorm(n, mean = 120, sd = 20),
  vote_average = rnorm(n, mean = 7, sd = 1),
  vote_count = rpois(n, lambda = 1000),
  popularity = rlnorm(n, meanlog = 2, sdlog = 1),
  # Language codes are sampled with the listed probabilities.
  original_language = sample(
    x = c("en", "es", "fr", "de", "ja", "ko", "hi"),
    size = n,
    replace = TRUE,
    prob = c(0.6, 0.1, 0.1, 0.05, 0.05, 0.05, 0.05)
  ),
  # Release dates are drawn without replacement from the daily calendar.
  release_date = sample(
    x = seq(from = as.Date("2000/01/01"), to = as.Date("2023/12/31"), by = "day"),
    size = n
  )
)
# Initial data cleaning: standardise the column names, sanitise the numeric
# columns, derive the analysis variables, then keep only rows that still
# have a budget, a revenue and a runtime.
movies_clean <- movies_df %>%
  clean_names() %>%
  mutate(
    across(
      c(budget, revenue, runtime, vote_average, vote_count, popularity),
      clean_numeric_column
    ),
    # Derived columns are computed from the already-cleaned values above.
    release_year = year(release_date),
    budget_million = budget / 1e6,
    revenue_million = revenue / 1e6,
    roi = (revenue - budget) / budget,
    original_language = factor(original_language)
  ) %>%
  filter(!(is.na(budget) | is.na(revenue) | is.na(runtime)))
# Print initial summary
print("Initial data summary:")
## [1] "Initial data summary:"
| Name | movies_clean |
| Number of rows | 987 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| Date | 1 |
| factor | 1 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| title | 0 | 1 | 7 | 10 | 0 | 987 | 0 |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| release_date | 0 | 1 | 2000-01-02 | 2023-12-31 | 2011-08-28 | 987 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| original_language | 0 | 1 | FALSE | 7 | en: 595, fr: 115, es: 88, de: 58 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| budget | 0 | 1 | 49878978.04 | 19538024.92 | 773290.50 | 36791823.11 | 49818870.49 | 63289462.81 | 119906085.28 | ▂▇▇▂▁ |
| revenue | 0 | 1 | 100492070.09 | 38619655.77 | 1828910.60 | 73990337.23 | 100013135.09 | 126375606.96 | 243386379.72 | ▂▇▇▂▁ |
| runtime | 0 | 1 | 119.99 | 20.52 | 56.01 | 107.33 | 120.25 | 132.78 | 189.42 | ▁▃▇▃▁ |
| vote_average | 0 | 1 | 6.98 | 0.98 | 4.05 | 6.31 | 6.96 | 7.65 | 10.17 | ▁▅▇▃▁ |
| vote_count | 0 | 1 | 999.52 | 31.52 | 905.00 | 977.00 | 1001.00 | 1020.50 | 1107.00 | ▁▅▇▃▁ |
| popularity | 0 | 1 | 12.54 | 20.56 | 0.15 | 3.69 | 7.57 | 14.39 | 326.40 | ▇▁▁▁▁ |
| release_year | 0 | 1 | 2011.44 | 7.04 | 2000.00 | 2005.00 | 2011.00 | 2018.00 | 2023.00 | ▇▇▆▇▇ |
| budget_million | 0 | 1 | 49.88 | 19.54 | 0.77 | 36.79 | 49.82 | 63.29 | 119.91 | ▂▇▇▂▁ |
| revenue_million | 0 | 1 | 100.49 | 38.62 | 1.83 | 73.99 | 100.01 | 126.38 | 243.39 | ▂▇▇▂▁ |
| roi | 0 | 1 | 1.81 | 5.20 | -0.97 | 0.38 | 0.98 | 1.93 | 113.85 | ▇▁▁▁▁ |
# Create distribution plots for numerical variables: a density-scaled
# histogram with a density curve overlaid, one plot per variable.
numeric_vars <- c("budget_million", "revenue_million", "runtime", "vote_average", "popularity")
plots <- lapply(numeric_vars, function(var) {
  # aes_string() and the `..density..` notation are deprecated in ggplot2;
  # the column name is unquoted as a symbol and the computed density is
  # referenced with after_stat() instead.
  ggplot(movies_clean, aes(x = !!sym(var))) +
    # bins = 30 is ggplot2's default, stated explicitly to silence the
    # "pick a better value" message.
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 30,
      fill = "steelblue",
      alpha = 0.7
    ) +
    geom_density(color = "red") +
    theme_minimal() +
    labs(title = paste("Distribution of", str_to_title(str_replace_all(var, "_", " "))))
})
# Display plots in a grid
do.call(grid.arrange, c(plots, ncol = 2))
# Create correlation matrix
# Pairwise correlations between the main numeric variables, computed on
# rows that are complete across all selected columns.
cor_matrix <- cor(
  select(
    movies_clean,
    budget_million, revenue_million, runtime, vote_average, popularity
  ),
  use = "complete.obs"
)
# Create correlation plot: upper triangle only, coloured cells with the
# coefficients printed in black and the diagonal omitted.
corrplot(
  cor_matrix,
  type = "upper",
  method = "color",
  diag = FALSE,
  addCoef.col = "black",
  tl.srt = 45,
  tl.col = "black"
)
# Create boxplot of revenue by language
# One box per original language showing the spread of revenue.
ggplot(movies_clean, aes(original_language, revenue_million)) +
  geom_boxplot(alpha = 0.7, fill = "steelblue") +
  ggtitle("Revenue Distribution by Original Language") +
  xlab("Original Language") +
  ylab("Revenue (Millions)") +
  theme_minimal() +
  # Must come after theme_minimal(), which would otherwise reset it.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Create time series plot of average revenue by year
# Average revenue and budget (in millions) and number of movies per
# release year.
yearly_revenue <- movies_clean %>%
  group_by(release_year) %>%
  summarise(
    avg_revenue = mean(revenue_million, na.rm = TRUE),
    avg_budget = mean(budget_million, na.rm = TRUE),
    n_movies = n()
  )
# Plot trends
ggplot(yearly_revenue, aes(x = release_year)) +
  # Line thickness is set with `linewidth`; using `size` for lines is
  # deprecated since ggplot2 3.4.0.
  geom_line(aes(y = avg_revenue, color = "Revenue"), linewidth = 1) +
  geom_line(aes(y = avg_budget, color = "Budget"), linewidth = 1) +
  # Point colours are fixed to match the manual line colours below.
  geom_point(aes(y = avg_revenue), color = "blue") +
  geom_point(aes(y = avg_budget), color = "red") +
  theme_minimal() +
  labs(
    title = "Average Revenue and Budget Trends",
    x = "Release Year",
    y = "Amount (Millions)",
    color = "Metric"
  ) +
  scale_color_manual(values = c("Revenue" = "blue", "Budget" = "red"))
# Create ROI distribution plot
# Histogram of ROI with the x axis labelled as percentages.
ggplot(movies_clean, aes(roi)) +
  geom_histogram(fill = "steelblue", alpha = 0.7, bins = 50) +
  scale_x_continuous(labels = scales::percent) +
  ggtitle("Distribution of Return on Investment (ROI)") +
  xlab("ROI") +
  ylab("Count") +
  theme_minimal()
# Calculate and display top 10 movies by ROI
# The ten rows with the highest ROI, with ROI formatted as a percentage
# string to one decimal place.
top_roi <- movies_clean %>%
  select(title, budget_million, revenue_million, roi, release_year) %>%
  arrange(desc(roi)) %>%
  head(n = 10) %>%
  mutate(roi = scales::percent(roi, accuracy = 0.1))
print("Top 10 Movies by ROI:")
## [1] "Top 10 Movies by ROI:"
# Calculate summary statistics for the main numeric variables
summary_stats <- movies_clean %>%
  create_summary_stats(
    c("budget_million", "revenue_million", "runtime", "vote_average", "popularity")
  )
# Display summary statistics
print("Summary Statistics:")
## [1] "Summary Statistics:"
# Perform t-test comparing revenue of English vs non-English movies.
# The right-hand side of the formula is a logical, so the two groups are
# FALSE (non-English) and TRUE (English).
language_test <- t.test(
  revenue_million ~ original_language == "en",
  data = movies_clean
)
print("T-test Results (English vs Non-English Revenue):")
## [1] "T-test Results (English vs Non-English Revenue):"
##
## Welch Two Sample t-test
##
## data: revenue_million by original_language == "en"
## t = 0.94961, df = 811.01, p-value = 0.3426
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -2.568744 7.383396
## sample estimates:
## mean in group FALSE mean in group TRUE
## 101.94329 99.53597
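This analysis reveals several key insights about the movie dataset:
- After cleaning, 987 of the 1,000 simulated movies remain, with no missing values in any column.
- ROI and popularity are strongly right-skewed (ROI: median 0.98, mean 1.81, maximum 113.85; popularity: median 7.57, maximum 326.40).
- Mean revenue does not differ significantly between non-English and English movies (101.94 vs. 99.54 million; Welch t-test, p = 0.34).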
The interactive visualizations and statistical tests provide a comprehensive view of the movie industry dynamics.