library(readxl) library(ggplot2) library(dplyr) library(tidyr) library(caret) library(viridisLite)

getwd() setwd(“/Users/cross/Desktop”)

#Merging the Data

Read excel.csv file (save Excel file as CSV UTF-8 (Comma delimited))

Movie1 <- read_excel(“Movie Dataset_Financials.xlsx”) # Display the structure of Movie1 (data frame) str(Movie1) # Display the first few rows of Movie1 (data frame) head(Movie1, n=5)

Read excel.csv file (save Excel file as CSV UTF-8 (Comma delimited))

Movie2 <- read_excel(“Movie Dataset_General Audience.xlsx”) # Display the structure of Movie2 (data frame) str(Movie2) # Display the first few rows of Movie2 (data frame) head(Movie2, n=10)

Create a Master Dataset (combined Movie Dataset_Financials and Movie Dataset_General Audience)

Rename Unique ID in Movie2 to match with Movie1

Rename the first column in Movie2 as “Movie_ID”

(to match with the first column name in Movie1)

names(Movie2)[1] <- c(“Movie_ID”) head(Movie2, n=1)

Rename the first column in both datasets to “Movie_ID” (if necessary)

names(Movie1)[1] <- “Movie_ID” names(Movie2)[1] <- “Movie_ID”

Merge the datasets by “Movie_ID”

Movie_Total <- merge(Movie1, Movie2, by = “Movie_ID”)

Merge Movie1 and Movie2 into one dataset

Movie_Total <- merge(Movie1, Movie2, by = “Movie_ID”) str(Movie_Total) head(Movie_Total)

Save the merged data (Movie_Total) to a file

Save as CSV

write.csv(Movie_Total, “Movie_Total.csv”, row.names = FALSE) # row.names = FALSE prevents R from adding row numbers View(Movie_Total)

#Read Excel library(readxl)

Replace “file_path.xlsx” with the path to your file

Movie_data_1 <- read_excel(“Movie Dataset_Financials.xlsx”) Movie_data_2 <- read_excel(“Movie Dataset_General Audience.xlsx”)

Replace missing values

Check the master dataset

summary(Movie_Total)

#Replace missing values

Calculate mean of a specific column (example: Revenue). If output is NA, this means there is a null value in the column.

mean(Movie_Total$Revenue, na.rm = TRUE)

Find rows with NA in Revenue

na_rows <- Movie_Total[is.na(Movie_Total$Revenue), ] print(na_rows)

Find mean of Revenue without NA values

meanRevenue <- mean(Movie_Total$Revenue, na.rm = TRUE) print(meanRevenue)

Step 1: Ensure column names are correct

colnames(Movie_Total) <- make.names(colnames(Movie_Total)) # Fix column names to ensure validity

Step 2: Check the structure of the column

str(Movie_Total$Revenue)

Step 3: Convert the column to numeric

If the column contains non-numeric data, first coerce it into numeric format

Movie_Total$Revenue <- as.numeric(as.character(Movie_Total$revenue..Millions.)) # Adjust to your column name

Step 4: Check for NA values

If there are invalid values in the column, they will be converted to NA

summary(Movie_Total$Revenue) # Check for NA values

Step 5: Calculate the mean, handling NA values

if (any(is.na(Movie_Total$Revenue))) { cat("Missing values detected in Revenue column. Replacing NA with column mean.\n") } mean_revenue <- mean(Movie_Total$Revenue, na.rm = TRUE) cat(“Mean Revenue:”, mean_revenue, “”)

Step 6: Replace missing values with the mean (optional)

Movie_Total$Revenue[is.na(Movie_Total$Revenue)] <- mean_revenue

Step 7: Verify the result

summary(Movie_Total$Revenue)

write.csv(Movie_Total, “Updated_Movie_Total.csv”, row.names = FALSE)

Replace NA values with calculated mean of Revenue

Movie_Total[is.na(Movie_Total$Revenue), “Revenue”] <- meanRevenue

Check to see if NA rows are replaced with calculated mean

Movie_Total[c(rownames(na_rows)), ]

5- Create a new column to calculate Profit Margin

Movie_Total$Profit_Margin <- (Movie_Total$Revenue - Movie_Total$Budget) / Movie_Total$Revenue

Step 1: Ensure column names are correct

colnames(Movie_Total) <- make.names(colnames(Movie_Total)) # Fix column names if they contain invalid characters

Step 2: Verify and convert columns to numeric

Movie_Total$Revenue <- as.numeric(as.character(Movie_Total$revenue..Millions.)) # Replace ‘revenue..Millions.’ with actual column name Movie_Total$Budget <- as.numeric(as.character(Movie_Total$budget..Millions.)) # Replace ‘budget..Millions.’ with actual column name

Step 3: Check for and handle missing or invalid data

Replace NA values with column means, or handle as needed

Movie_Total$Revenue[is.na(Movie_Total$Revenue)] <- mean(Movie_Total$Revenue, na.rm = TRUE) Movie_Total$Budget[is.na(Movie_Total$Budget)] <- mean(Movie_Total$Budget, na.rm = TRUE)

Step 4: Ensure no division by zero

Movie_Total$Profit_Margin <- ifelse( Movie_Total$Revenue > 0, # Check if Revenue is greater than zero (Movie_Total$Revenue - Movie_Total$Budget) / Movie_Total$Revenue, # Calculate Profit Margin NA # Assign NA if Revenue is zero or negative )

Step 5: Check the results

summary(Movie_Total$Profit_Margin) # View summary statistics for Profit_Margin head(Movie_Total) # View the first few rows of the dataset

View the new column for Profit Margin

View(Movie_Total[c(“Revenue”, “Budget”, “Profit_Margin”)]) # Use this view to check calculation

#Merge genre

Movie_Total <- Movie_Total %>% mutate(genre_category = case_when( genre == “Action & Adventure” ~ “Action/Adventure”, genre == “Comedy” ~ “Comedy”, genre == “Drama” ~ “Drama”, genre %in% c(“Horror”, “Mystery & Suspense”) ~ “Horror/Thriller”, genre %in% “Animation” ~ “Animation”, genre == “Documentary” ~ “Documentary”, genre == “Science Fiction & Fantasy” ~ “Sci-Fi/Fantasy”, genre %in% c(“Art House & International”, “Musical & Performing Arts”) ~ “Arts”, TRUE ~ “Others” # Default category )) table(Movie_Total$genre_category)

#Multiple Regression

predictors <- c(“budget..Millions.”, “critics_score”, “audience_score”, “Facebook_Likes”) target <- “revenue..Millions.”

Subset and clean data

movie_cleaned <- Movie_Total %>% select(all_of(c(predictors, target))) %>% drop_na() # Corrected function name

Split the data into training and testing sets

set.seed(42) # For reproducibility train_indices <- createDataPartition(movie_cleaned[[target]], p = 0.8, list = FALSE) train_data <- movie_cleaned[train_indices, ] test_data <- movie_cleaned[-train_indices, ]

Train a multiple linear regression model

revenue_model <- train( as.formula(paste(target, “~”, paste(predictors, collapse = “+”))), data = train_data, method = “lm” )

Display model summary

summary(revenue_model$finalModel)

Make predictions on the test set

test_predictions <- predict(revenue_model, newdata = test_data)

Calculate evaluation metrics

mse <- mean((test_predictions - test_data[[target]])^2) r2 <- cor(test_predictions, test_data[[target]])^2

cat(“Mean Squared Error:”, mse, “”) cat(“R-squared:”, r2, “”)

Extract model coefficients

coefficients <- summary(revenue_model$finalModel)$coefficients coefficients_df <- data.frame( Feature = rownames(coefficients), Coefficient = coefficients[, “Estimate”] ) %>% arrange(desc(abs(Coefficient)))

print(coefficients_df)

Display model summary

summary(revenue_model$finalModel)

#Creating numerical values for “best_pic_nom” Movie_Total$best_pic_nom <- ifelse(Movie_Total$best_pic_nom == “yes”, 1, 0)

#Creating new variable “Avg_Score”, by taking the average values of both “critics_score” and “audience_score” Movie_Total <- Movie_Total %>% mutate(Avg_Score = (critics_score + audience_score) / 2)

Movie_Total <- Movie_Total %>% mutate(Avg_Score = rowMeans(cbind(critics_score, audience_score), na.rm = TRUE))

Perform the independent t-test

Subset Avg_Score for movies with and without Best Picture nominations

avg_score_nom <- Movie_Total$Avg_Score[Movie_Total$best_pic_nom == 1] avg_score_no_nom <- Movie_Total$Avg_Score[Movie_Total$best_pic_nom == 0]

Remove NAs from both groups

avg_score_nom <- na.omit(avg_score_nom) avg_score_no_nom <- na.omit(avg_score_no_nom)

t_test_avg <- t.test(avg_score_nom, avg_score_no_nom)

Print the results

print(t_test_avg)

library(ggbeeswarm) library(beeswarm) # Beeswarm plot ggplot(avg_score_data, aes(x = best_pic_nom, y = Avg_Score, color = best_pic_nom)) + geom_beeswarm(size = 2, alpha = 0.7) + stat_summary(fun = mean, geom = “point”, size = 4, shape = 18, color = “black”) + labs( title = “Average Scores by Best Picture Nomination”, x = “Best Picture Nomination”, y = “Average Score” ) + scale_color_manual(values = c(“Yes” = “green”, “No” = “gray”)) + theme_minimal()

#one way anova

Perform one-way ANOVA for Revenue vs. MPAA Rating

anova_mpaa <- aov(revenue..Millions. ~ mpaa_rating, data = Movie_Total)

Summary of the ANOVA

summary(anova_mpaa)

Tukey’s HSD Post-Hoc Test

tukey_mpaa <- TukeyHSD(anova_mpaa) print(tukey_mpaa)

library(ggplot2)

Boxplot for Revenue by MPAA Rating

ggplot(Movie_Total, aes(x = mpaa_rating, y = revenue..Millions., fill = mpaa_rating)) + geom_boxplot() + labs( title = “Revenue by MPAA Rating”, x = “MPAA Rating”, y = “Revenue (Millions)” ) + theme_minimal()

T-test Audience vs critics score

Perform a paired t-test

paired_t_test <- t.test( Movie_Total$critics_score, Movie_Total$audience_score, paired = TRUE # Paired test for critics vs audience scores )

Print results

print(paired_t_test)

library(ggplot2)

Reshape the data to long format for ggplot

scores_long <- Movie_Total %>% select(critics_score, audience_score) %>% pivot_longer(cols = everything(), names_to = “Score_Type”, values_to = “Score”)

Boxplot

ggplot(scores_long, aes(x = Score_Type, y = Score, fill = Score_Type)) + geom_boxplot(outlier.color = “red”, outlier.shape = 16, alpha = 0.7) + labs( title = “Comparison of Critics’ and Audience Scores”, x = “Score Type”, y = “Score” ) + theme_minimal() + scale_fill_manual(values = c(“critics_score” = “skyblue”, “audience_score” = “red”))

#one way anova #2

#Perform one-way ANOVA for revenue across genres anova_result <- aov(revenue..Millions. ~ genre_category, data = Movie_Total)

Display the ANOVA summary

summary(anova_result)

Calculate mean revenues for each genre

mean_revenues <- Movie_Total %>% group_by(genre_category) %>% summarize(Mean_Revenue = mean(revenue..Millions., na.rm = TRUE))

Create a bar chart for mean revenue

ggplot(mean_revenues, aes(x = genre_category, y = Mean_Revenue, fill = genre_category)) + geom_bar(stat = “identity”, alpha = 0.7) + labs( title = “Mean Revenue by Genre”, x = “Genre”, y = “Mean Revenue (Millions)” ) + theme_minimal() + scale_fill_brewer(palette = “Set3”) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels

#Descriptive analysis 1

Create a boxplot to visualize revenue distribution by country

library(viridisLite)

Calculate average revenue by country

average_revenue <- Movie_Total %>% group_by(country) %>% summarize(Average_Revenue = mean(revenue..Millions., na.rm = TRUE))

ggplot(average_revenue, aes(x = reorder(country, -Average_Revenue), y = Average_Revenue, fill = country)) + geom_bar(stat = “identity”) + labs( title = “Average Revenue by Country”, x = “Country”, y = “Average Revenue (Millions)” ) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_viridis_d()

#Descriptive analysis 2 average_facebook_likes <- Movie_Total %>% group_by(genre_category) %>% summarise(Average_Likes = mean(Facebook_Likes, na.rm = TRUE))

Print the summary table

print(average_facebook_likes)

Create a bar chart for average Facebook likes by genre category

ggplot(average_facebook_likes, aes(x = reorder(genre_category, -Average_Likes), y = Average_Likes, fill = genre_category)) + geom_bar(stat = “identity”) + labs( title = “Distribution of Average Facebook Likes by Genre Category”, x = “Genre Category”, y = “Average Facebook Likes” ) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_brewer(palette = “Set3”)

##Average imdb_rating by genre #descriptive analysis 3

Calculate the average IMDb rating for each genre category

average_imdb_rating <- Movie_Total %>% group_by(genre_category) %>% summarise( Average_IMDb_Rating = mean(imdb_rating, na.rm = TRUE), Count = n() # Optional: Number of movies in each genre )

View the result

print(average_imdb_rating)

Create a bar chart to represent the average IMDb rating by genre

ggplot(average_imdb_rating, aes(x = reorder(genre_category, -Average_IMDb_Rating), y = Average_IMDb_Rating, fill = genre_category)) + geom_bar(stat = “identity”) + labs( title = “Average IMDb Rating by Genre Category”, x = “Genre Category”, y = “Average IMDb Rating” ) + theme_minimal() + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + scale_fill_brewer(palette = “Set3”)

Movie Dataset Analysis

Logan, Drew, Riley

Dec 2nd 2024

Read excel.csv file (save Excel file as CSV UTF-8 (Comma delimited))

Read excel.csv file (save Excel file as CSV UTF-8 (Comma delimited))

Create a Master Dataset (combined Movie Dataset_Financials and Movie Dataset_General Audience)

Rename Unique ID in Movie2 to match with Movie1

Rename the first column in Movie2 as “Movie_ID”

(to match with the first column name in Movie1)

Rename the first column in both datasets to “Movie_ID” (if necessary)

Merge the datasets by “Movie_ID”

Merge Movie1 and Movie2 into one dataset

Save the merged data (Movie_Total) to a file

Save as CSV

Replace “file_path.xlsx” with the path to your file

Replace missing values

Check the master dataset

Calculate mean of a specific column (example: Revenue). If output is NA, this means there is a null value in the column.

Find rows with NA in Revenue

Find mean of Revenue without NA values

Step 1: Ensure column names are correct

Step 2: Check the structure of the column

Step 3: Convert the column to numeric

If the column contains non-numeric data, first coerce it into numeric format

Step 4: Check for NA values

If there are invalid values in the column, they will be converted to NA

Step 5: Calculate the mean, handling NA values

Step 6: Replace missing values with the mean (optional)

Step 7: Verify the result

Replace NA values with calculated mean of Revenue

Check to see if NA rows are replaced with calculated mean

5- Create a new column to calculate Profit Margin

Step 1: Ensure column names are correct

Step 2: Verify and convert columns to numeric

Step 3: Check for and handle missing or invalid data

Replace NA values with column means, or handle as needed

Step 4: Ensure no division by zero

Step 5: Check the results

View the new column for Profit Margin

Subset and clean data

Split the data into training and testing sets

Train a multiple linear regression model

Display model summary

Make predictions on the test set

Calculate evaluation metrics

Extract model coefficients

Display model summary

Perform the independent t-test

Subset Avg_Score for movies with and without Best Picture nominations

Remove NAs from both groups

Print the results

Perform one-way ANOVA for Revenue vs. MPAA Rating

Summary of the ANOVA

Tukey’s HSD Post-Hoc Test

Boxplot for Revenue by MPAA Rating

T-test Audience vs critics score

Perform a paired t-test

Print results

Reshape the data to long format for ggplot

Boxplot

Display the ANOVA summary

Calculate mean revenues for each genre

Create a bar chart for mean revenue

Create a boxplot to visualize revenue distribution by country

Calculate average revenue by country

Print the summary table

Create a bar chart for average Facebook likes by genre category

Calculate the average IMDb rating for each genre category

View the result

Create a bar chart to represent the average IMDb rating by genre

`Dec 2nd 2024`