# Dec 2nd 2024

# Packages ----
library(readxl)
library(ggplot2)
library(dplyr)
library(tidyr)
library(caret)
library(viridisLite)

# Working directory ----
# Every file name used below is relative to this directory.
# NOTE(review): the path is machine-specific; consider running the script
# from the data folder (or an RStudio project) instead of calling setwd().
getwd()
setwd("/Users/cross/Desktop")
# Merging the Data ----

# Load the first workbook and inspect its structure and first rows.
Movie1 <- read_excel("Movie Dataset_Financials.xlsx")
str(Movie1)
head(Movie1, n = 5)

# Load the second workbook and inspect it the same way.
Movie2 <- read_excel("Movie Dataset_General Audience.xlsx")
str(Movie2)
head(Movie2, n = 10)

# Give the first column of both tables the same name so that it can serve
# as the merge key.
names(Movie1)[1] <- "Movie_ID"
names(Movie2)[1] <- "Movie_ID"
head(Movie2, n = 1)

# merge() keeps only the Movie_ID values present in both tables.
Movie_Total <- merge(Movie1, Movie2, by = "Movie_ID")
str(Movie_Total)
head(Movie_Total)

# row.names = FALSE prevents R from adding row numbers to the file.
write.csv(Movie_Total, "Movie_Total.csv", row.names = FALSE)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(Movie_Total)
}
# Read Excel ----
library(readxl)

# Separate, unmerged copies of the two workbooks.
Movie_data_1 <- read_excel("Movie Dataset_Financials.xlsx")
Movie_data_2 <- read_excel("Movie Dataset_General Audience.xlsx")

# Summary statistics for every column of the merged table.
summary(Movie_Total)
# Replace missing values ----

# Make the column names syntactically valid first; this is what produces
# the `revenue..Millions.` / `budget..Millions.` names used below.
colnames(Movie_Total) <- make.names(colnames(Movie_Total))

# Numeric working copy of revenue. It has to exist before anything reads
# Movie_Total$Revenue, otherwise `$` returns NULL and the mean is NA.
# Adjust `revenue..Millions.` if the source column is named differently.
Movie_Total$Revenue <- as.numeric(as.character(Movie_Total$revenue..Millions.))
str(Movie_Total$Revenue)
summary(Movie_Total$Revenue) # Check for NA values

# Remember which rows are missing revenue so they can be inspected after
# the imputation.
na_rows <- Movie_Total[is.na(Movie_Total$Revenue), ]
print(na_rows)

if (any(is.na(Movie_Total$Revenue))) {
  cat("Missing values detected in Revenue column. Replacing NA with column mean.\n")
}

mean_revenue <- mean(Movie_Total$Revenue, na.rm = TRUE)
meanRevenue <- mean_revenue # earlier name for the same value, kept as an alias
cat("Mean Revenue:", mean_revenue, "\n")

# Mean imputation: fill every missing revenue with the column mean.
Movie_Total$Revenue[is.na(Movie_Total$Revenue)] <- mean_revenue
summary(Movie_Total$Revenue)

write.csv(Movie_Total, "Updated_Movie_Total.csv", row.names = FALSE)

# Show the rows that were missing revenue, now holding the imputed value.
Movie_Total[rownames(na_rows), ]

# Profit margin ----

# Numeric working copy of budget, imputed the same way as revenue.
# Adjust `budget..Millions.` if the source column is named differently.
Movie_Total$Budget <- as.numeric(as.character(Movie_Total$budget..Millions.))
Movie_Total$Budget[is.na(Movie_Total$Budget)] <-
  mean(Movie_Total$Budget, na.rm = TRUE)

# Profit margin = (Revenue - Budget) / Revenue. Rows whose revenue is zero
# or negative get NA instead of a division by zero or a meaningless ratio.
Movie_Total$Profit_Margin <- ifelse(
  Movie_Total$Revenue > 0,
  (Movie_Total$Revenue - Movie_Total$Budget) / Movie_Total$Revenue,
  NA
)

summary(Movie_Total$Profit_Margin) # Summary statistics for Profit_Margin
head(Movie_Total) # First few rows of the dataset

# Use this view to check the calculation (interactive sessions only).
if (interactive()) {
  View(Movie_Total[c("Revenue", "Budget", "Profit_Margin")])
}
# Merge genre ----

# Collapse the detailed `genre` values into broader categories. Any genre
# not listed here (including a missing one) falls through to "Others".
Movie_Total <- Movie_Total %>%
  mutate(
    genre_category = case_when(
      genre == "Action & Adventure" ~ "Action/Adventure",
      genre == "Comedy" ~ "Comedy",
      genre == "Drama" ~ "Drama",
      genre %in% c("Horror", "Mystery & Suspense") ~ "Horror/Thriller",
      genre == "Animation" ~ "Animation",
      genre == "Documentary" ~ "Documentary",
      genre == "Science Fiction & Fantasy" ~ "Sci-Fi/Fantasy",
      genre %in% c("Art House & International", "Musical & Performing Arts") ~ "Arts",
      TRUE ~ "Others" # Default category
    )
  )

# Number of movies per merged category.
table(Movie_Total$genre_category)
# Multiple Regression ----

# Column names are the make.names() versions of the original headers.
predictors <- c(
  "budget..Millions.",
  "critics_score",
  "audience_score",
  "Facebook_Likes"
)
target <- "revenue..Millions."

# Keep only the modelling columns and drop every row with a missing value.
movie_cleaned <- Movie_Total %>%
  select(all_of(c(predictors, target))) %>%
  drop_na()

# 80/20 train/test split; the seed makes the partition reproducible.
set.seed(42)
train_indices <- createDataPartition(
  movie_cleaned[[target]],
  p = 0.8,
  list = FALSE
)
train_data <- movie_cleaned[train_indices, ]
test_data <- movie_cleaned[-train_indices, ]

# Linear model fitted through caret: target ~ predictor1 + predictor2 + ...
revenue_model <- train(
  as.formula(paste(target, "~", paste(predictors, collapse = "+"))),
  data = train_data,
  method = "lm"
)

model_summary <- summary(revenue_model$finalModel)
print(model_summary)

# Evaluate on the held-out rows.
test_predictions <- predict(revenue_model, newdata = test_data)
mse <- mean((test_predictions - test_data[[target]])^2)
r2 <- cor(test_predictions, test_data[[target]])^2

cat("Mean Squared Error:", mse, "\n")
cat("R-squared:", r2, "\n")

# Coefficient estimates, largest absolute value first.
coefficients <- model_summary$coefficients
coefficients_df <- data.frame(
  Feature = rownames(coefficients),
  Coefficient = coefficients[, "Estimate"]
) %>%
  arrange(desc(abs(Coefficient)))

print(coefficients_df)
# Creating numerical values for "best_pic_nom": 1 for "yes", 0 otherwise.
# The guard keeps a second run of this section from turning every value
# into 0 once the column has already been recoded.
if (!is.numeric(Movie_Total$best_pic_nom)) {
  Movie_Total$best_pic_nom <- ifelse(Movie_Total$best_pic_nom == "yes", 1, 0)
}

# Creating new variable "Avg_Score", by taking the average values of
# both "critics_score" and "audience_score". With na.rm = TRUE a row that
# has only one of the two scores keeps that score; a row missing both
# becomes NaN and is dropped by na.omit() below.
Movie_Total <- Movie_Total %>%
  mutate(
    Avg_Score = rowMeans(cbind(critics_score, audience_score), na.rm = TRUE)
  )

# Split the average scores by nomination status and drop missing values.
avg_score_nom <- Movie_Total$Avg_Score[Movie_Total$best_pic_nom == 1]
avg_score_no_nom <- Movie_Total$Avg_Score[Movie_Total$best_pic_nom == 0]

avg_score_nom <- na.omit(avg_score_nom)
avg_score_no_nom <- na.omit(avg_score_no_nom)

# Two-sample t-test (t.test() defaults to Welch's unequal-variance form).
t_test_avg <- t.test(avg_score_nom, avg_score_no_nom)
print(t_test_avg)

library(ggbeeswarm)
library(beeswarm)

# Plot data: complete rows only, with the 0/1 flag relabelled "No"/"Yes" so
# that it matches the keys of the manual colour scale.
avg_score_data <- Movie_Total %>%
  filter(!is.na(Avg_Score), !is.na(best_pic_nom)) %>%
  mutate(
    best_pic_nom = factor(best_pic_nom, levels = c(0, 1), labels = c("No", "Yes"))
  ) %>%
  select(best_pic_nom, Avg_Score)

# Beeswarm plot; the black diamond marks each group's mean.
ggplot(avg_score_data, aes(x = best_pic_nom, y = Avg_Score, color = best_pic_nom)) +
  geom_beeswarm(size = 2, alpha = 0.7) +
  stat_summary(fun = mean, geom = "point", size = 4, shape = 18, color = "black") +
  labs(
    title = "Average Scores by Best Picture Nomination",
    x = "Best Picture Nomination",
    y = "Average Score"
  ) +
  scale_color_manual(values = c("Yes" = "green", "No" = "gray")) +
  theme_minimal()
# One-way ANOVA: revenue by MPAA rating ----

# TukeyHSD() only compares levels of factor terms, so make sure the
# grouping column is a factor rather than plain text.
Movie_Total$mpaa_rating <- as.factor(Movie_Total$mpaa_rating)

anova_mpaa <- aov(revenue..Millions. ~ mpaa_rating, data = Movie_Total)
summary(anova_mpaa)

# Pairwise comparisons between ratings.
tukey_mpaa <- TukeyHSD(anova_mpaa)
print(tukey_mpaa)

library(ggplot2)

ggplot(Movie_Total, aes(x = mpaa_rating, y = revenue..Millions., fill = mpaa_rating)) +
  geom_boxplot() +
  labs(
    title = "Revenue by MPAA Rating",
    x = "MPAA Rating",
    y = "Revenue (Millions)"
  ) +
  theme_minimal()
# Paired t-test: critics vs audience scores ----

# Paired because both scores are measured on the same movie (same row).
paired_t_test <- t.test(
  Movie_Total$critics_score,
  Movie_Total$audience_score,
  paired = TRUE
)
print(paired_t_test)

library(ggplot2)

# Long format: one row per (movie, score type) so both score columns can
# share a single boxplot axis.
scores_long <- Movie_Total %>%
  select(critics_score, audience_score) %>%
  pivot_longer(
    cols = everything(),
    names_to = "Score_Type",
    values_to = "Score"
  )

ggplot(scores_long, aes(x = Score_Type, y = Score, fill = Score_Type)) +
  geom_boxplot(outlier.color = "red", outlier.shape = 16, alpha = 0.7) +
  labs(
    title = "Comparison of Critics’ and Audience Scores",
    x = "Score Type",
    y = "Score"
  ) +
  theme_minimal() +
  scale_fill_manual(values = c("critics_score" = "skyblue", "audience_score" = "red"))
# One-way ANOVA #2: revenue by genre category ----

# Perform one-way ANOVA for revenue across genres.
anova_result <- aov(revenue..Millions. ~ genre_category, data = Movie_Total)
summary(anova_result)

# Mean revenue per genre category, ignoring missing revenues.
mean_revenues <- Movie_Total %>%
  group_by(genre_category) %>%
  summarize(Mean_Revenue = mean(revenue..Millions., na.rm = TRUE))

ggplot(mean_revenues, aes(x = genre_category, y = Mean_Revenue, fill = genre_category)) +
  geom_bar(stat = "identity", alpha = 0.7) +
  labs(
    title = "Mean Revenue by Genre",
    x = "Genre",
    y = "Mean Revenue (Millions)"
  ) +
  theme_minimal() +
  scale_fill_brewer(palette = "Set3") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels
# Descriptive analysis 1: average revenue by country ----
library(viridisLite)

average_revenue <- Movie_Total %>%
  group_by(country) %>%
  summarize(Average_Revenue = mean(revenue..Millions., na.rm = TRUE))

# reorder(country, -Average_Revenue) sorts the bars from highest to lowest.
ggplot(
  average_revenue,
  aes(x = reorder(country, -Average_Revenue), y = Average_Revenue, fill = country)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average Revenue by Country",
    x = "Country",
    y = "Average Revenue (Millions)"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_viridis_d()
# Descriptive analysis 2: average Facebook likes by genre category ----
average_facebook_likes <- Movie_Total %>%
  group_by(genre_category) %>%
  summarise(Average_Likes = mean(Facebook_Likes, na.rm = TRUE))

print(average_facebook_likes)

# Bars are ordered from the most-liked category to the least.
ggplot(
  average_facebook_likes,
  aes(x = reorder(genre_category, -Average_Likes), y = Average_Likes, fill = genre_category)
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Distribution of Average Facebook Likes by Genre Category",
    x = "Genre Category",
    y = "Average Facebook Likes"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3")
# Descriptive analysis 3: average imdb_rating by genre ----
average_imdb_rating <- Movie_Total %>%
  group_by(genre_category) %>%
  summarise(
    Average_IMDb_Rating = mean(imdb_rating, na.rm = TRUE),
    Count = n() # Optional: number of movies in each genre
  )

print(average_imdb_rating)

# Bars are ordered from the highest-rated category to the lowest.
ggplot(
  average_imdb_rating,
  aes(
    x = reorder(genre_category, -Average_IMDb_Rating),
    y = Average_IMDb_Rating,
    fill = genre_category
  )
) +
  geom_bar(stat = "identity") +
  labs(
    title = "Average IMDb Rating by Genre Category",
    x = "Genre Category",
    y = "Average IMDb Rating"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set3")