# Calling libraries
library(units)
## udunits database from /Library/Frameworks/R.framework/Versions/4.3-arm64/Resources/library/units/share/udunits/udunits2.xml
library(visdat)
library(DataExplorer)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.0 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringr)
library(tidyr)
library(readr)
library(ggplot2)
movies <- read.csv("/Users/anapaualvear/Desktop/r business analytics/movies_metadata.csv")
# Find how many NAs there are in the database
sum(is.na(movies))
## [1] 281
A total of 281 NAs were found in the database. To decide how to treat them, we first identify which columns they come from and how many there are per column.
# Count how many NAs exist in each column to decide how to treat them
sapply(movies, function(x) sum(is.na(x)))
## adult belongs_to_collection budget
## 0 0 0
## genres homepage id
## 0 0 0
## imdb_id original_language original_title
## 0 0 0
## overview popularity poster_path
## 0 0 0
## production_companies production_countries release_date
## 0 0 0
## revenue runtime spoken_languages
## 6 263 0
## status tagline title
## 0 0 0
## video vote_average vote_count
## 0 6 6
Six NAs were found in the "revenue" column, 263 in "runtime", 6 in "vote_average", and 6 in "vote_count". We now replace them column by column with the column mean.
# Cleaning and summary of column "runtime"
movies2 <- movies
movies2$runtime[is.na(movies2$runtime)] <- mean(movies2$runtime, na.rm=TRUE)
summary(movies2$runtime)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 85.00 95.00 94.13 107.00 1256.00
# Cleaning and summary of column "revenue"
movies3 <- movies2
movies3$revenue[is.na(movies3$revenue)] <- mean(movies3$revenue, na.rm=TRUE)
summary(movies3$revenue)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000e+00 0.000e+00 0.000e+00 1.121e+07 0.000e+00 2.788e+09
# Cleaning and summary of column "vote count"
movies4 <- movies3
movies4$vote_count[is.na(movies4$vote_count)] <- mean(movies4$vote_count, na.rm=TRUE)
summary(movies4$vote_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 3.0 10.0 109.9 34.0 14075.0
# Cleaning and summary of column "vote average"
movies_wo_na <- movies4
movies_wo_na$vote_average[is.na(movies_wo_na$vote_average)] <- mean(movies_wo_na$vote_average, na.rm=TRUE)
summary(movies_wo_na$vote_average)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.000 6.000 5.618 6.800 10.000
# Verify the NA were eliminated
sum(is.na(movies_wo_na))
## [1] 0
The NAs were handled by mean imputation, a choice made for two reasons:
+ The column analysis showed that the variables containing NAs (runtime, vote_average, vote_count, revenue) are all numeric, so replacing missing entries with the column mean is straightforward.
+ The affected rows carry other useful information, and imputation preserves the general structure of the data.
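The same imputation can be written as a single hedged dplyr step across the four affected columns (an alternative sketch, not the pipeline used above):
# One-step mean imputation across the four columns containing NAs
movies_wo_na_alt <- movies %>%
  mutate(across(c(runtime, revenue, vote_count, vote_average),
                ~ coalesce(.x, mean(.x, na.rm = TRUE))))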
# Check how many information are totally duplicated
sum(duplicated(movies_wo_na))
## [1] 17
In the database we found 17 fully duplicated rows, which have to be removed.
# Eliminate information that is totally duplicated
movies_wo_dup <- movies_wo_na
movies_wo_dup <- distinct(movies_wo_dup)
# Check if duplicate information is eliminated
sum(duplicated(movies_wo_dup))
## [1] 0
In this example the exact duplicates were eliminated, since keeping them would count the same movie twice. As for the partially duplicated rows, "popularity" was found to be the differing variable, and for now the decision was to keep the rows with the lower rating, given that movie reviews tend to skew toward the lower side.
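A hedged sketch of that rule, assuming title plus release_date identify a movie (an assumption for illustration) and noting that popularity is still stored as text at this stage:
# Keep the row with the lowest popularity among otherwise-identical movies
movies_part_dedup <- movies_wo_dup %>%
  group_by(title, release_date) %>%
  slice_min(order_by = suppressWarnings(as.numeric(popularity)),
            n = 1, with_ties = FALSE) %>%
  ungroup()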
# Split each genres string into a character vector of genre names
# Use a regular expression to match the genre names
genre_pattern <- "(?<=name': ')[^']*(?=')"
# Apply the extraction to the entire column and store the result in a new object
genre_data <- lapply(movies_wo_dup$genres, function(x) {
genres <- str_extract_all(x, genre_pattern)[[1]]
split_genres <- strsplit(genres, ", ")
unlist(split_genres)
})
# Calculate the maximum number of genres a movie can have
max_genre <- max(sapply(genre_data, length))
# Convert the list to a data frame
genre_df <- do.call(rbind, lapply(genre_data, function(x) {
tmp <- rep(NA, max_genre)
tmp[seq_along(x)] <- x
as.data.frame(t(tmp))
}))
# Rename the columns
names(genre_df) <- paste0("genre", seq_len(ncol(genre_df)))
# Join the new genre data frame with the original data frame
movies_wo_dup <- cbind(movies_wo_dup, genre_df)
# Delete the genres column
movies_w_genres <- subset(movies_wo_dup, select = -genres)
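For reference, tidyr can produce the same wide genre layout more compactly; a hedged sketch reusing the genre_pattern defined above:
# Extract all genre names into a list-column, then spread them into
# columns genre_list1, genre_list2, ... in one step
genres_alt <- movies_wo_dup %>%
  mutate(genre_list = str_extract_all(genres, genre_pattern)) %>%
  unnest_wider(genre_list, names_sep = "")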
# Feature to clear production company names
clean_production_companies <- function(companies) {
# Apply regular expression to extract company name
cleaned_names <- str_extract(companies, "'name': '([^']+)'")
# Delete the extra part
cleaned_names <- gsub("'name': '", "", cleaned_names)
cleaned_names <- gsub("'", "", cleaned_names)
return(cleaned_names)
}
# Apply the function to the production_companies column and store the results in a new column
movies_w_genres$Clean_production_companies <- clean_production_companies(movies_w_genres$production_companies)
# Delete the production companies column
movies_w_prodcom <- subset(movies_w_genres, select = -production_companies)
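Note that str_extract() returns only the first match, so each movie keeps just its first listed company. A hedged one-step equivalent reuses the lookbehind pattern from the genre extraction:
# One-step equivalent of clean_production_companies() (first company only)
first_company_alt <- str_extract(movies_w_genres$production_companies,
                                 "(?<=name': ')[^']*(?=')")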
# Function to clear the names of the countries of production
clean_production_countries <- function(countries) {
# Apply regular expression to extract country name
cleaned_names <- str_extract(countries, "'name': '([^']+)'")
# Delete the extra part
cleaned_names <- gsub("'name': '", "", cleaned_names)
cleaned_names <- gsub("'", "", cleaned_names)
return(cleaned_names)
}
# Apply the function to the production_countries column and store the results in a new column
movies_w_prodcom$Clean_production_countries <- clean_production_countries(movies_w_prodcom$production_countries)
# Delete the production countries column
movies_w_prodcoun <- subset(movies_w_prodcom, select = -production_countries)
# Use a regular expression to match the language names
language_pattern <- "(?<=name': ')[^']*(?=')"
# Apply the extraction to the entire column and store the result in a new object
language_data <- lapply(movies_w_prodcoun$spoken_languages, function(x) {
languages <- str_extract_all(x, language_pattern)[[1]]
split_languages <- strsplit(languages, ", ")
unlist(split_languages)
})
# Calculate the maximum number of languages a movie can have
max_language <- max(sapply(language_data, length))
# Convert the list to a data frame
language_df <- do.call(rbind, lapply(language_data, function(x) {
tmp <- rep(NA, max_language)
tmp[seq_along(x)] <- x
as.data.frame(t(tmp))
}))
# Rename the columns
names(language_df) <- paste("spoken_lang", seq_len(ncol(language_df)), sep = "_")
# Join the new language data frame with the original data frame
movies_w_lang <- cbind(movies_w_prodcoun, language_df)
# Eliminate original column
movies_w_lang$spoken_languages <- NULL
# Function to separate the 'belongs_to_collection' column
separate_belongs_to_collection <- function(movies_w_lang) {
# Create a new DataFrame to perform separation
collection_data <- movies_w_lang
# Apply regular expression to extract collection name
collection_names <- str_extract(collection_data$belongs_to_collection, "'name': '([^']+)'")
# Delete the extra part
collection_names <- gsub("'name': '", "", collection_names)
collection_names <- gsub("'", "", collection_names)
# Create a new column in the new DataFrame with the extracted collection names
collection_data$Collection <- collection_names
# Remove the original 'belongs_to_collection' column from the new DataFrame
collection_data$belongs_to_collection <- NULL
return(collection_data)
}
# Create a new DataFrame for separation
collection_data <- movies_w_lang
# Apply the function to the new DataFrame
collection_data <- separate_belongs_to_collection(collection_data)
# Runtime histogram
histogram_runtime <- ggplot(collection_data, aes(x = runtime)) +
geom_histogram(fill = "skyblue", color = "black") +
labs(title = "Histogram of Runtime")
# Display histogram
print(histogram_runtime)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram of Runtime: Suggests that most movies have a short runtime, with an average of 94.13 minutes, and only a few movies have exceptionally long runtimes, up to 1,256 minutes. In addition, the presence of movies with a runtime of 0 minutes should be investigated, as it could indicate errors or missing data.
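The stat_bin() message above suggests choosing an explicit binwidth; a hedged refinement using 10-minute bins:
# Explicit 10-minute bins instead of the default bins = 30
ggplot(collection_data, aes(x = runtime)) +
  geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
  labs(title = "Histogram of Runtime (10-minute bins)")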
# Revenue histogram
histogram_revenue <- ggplot(collection_data, aes(x = revenue)) +
geom_histogram(fill = "lightgreen", color = "black") +
labs(title = "Histogram of Revenue")
# Display histogram
print(histogram_revenue)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram of Revenue: It indicates that most movies have relatively low revenues compared to a few blockbuster movies with very high revenues. There is a significant gap between the majority of revenue values and the outliers.
# Vote Count histogram
histogram_vote_count <- ggplot(collection_data, aes(x = vote_count)) +
geom_histogram(fill = "lightpink", color = "black") +
labs(title = "Histogram of Vote Count")
# Display histogram
print(histogram_vote_count)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram of Vote Count: It shows the distribution of "vote_count". The majority of movies in the dataset received a relatively low number of votes, with a long tail indicating that a few movies received significantly higher numbers of votes, so there is a considerable gap between the bulk of the data and the outliers.
# Visualize the missingness structure of a sampled slice of the data
vis_miss(slice_sample(collection_data))
vis_miss() on the sampled data reveals that 58.3% of the values are missing, largely driven by the sparse genre and spoken-language columns. This amount of missing information can make analysis and interpretation challenging, as it may lead to biased results or incomplete insights.
# Vote Average histogram
histogram_vote_average <- ggplot(collection_data, aes(x = vote_average)) +
geom_histogram(fill = "lightyellow", color = "black") +
labs(title = "Histogram of Vote Average")
# Display histogram
print(histogram_vote_average)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Histogram of Vote Average: It shows a relatively symmetrical distribution around the average vote of 5.6, with values ranging from 0 to 10. A spike in the distribution might occur around whole numbers due to the nature of voting systems.
Now that we have identified certain outliers, we can consider various approaches to handle them, such as deleting them, treating them separately, or transforming the data. The best approach depends on the nature of the data and the context of our analysis.
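As one hedged option from that list, the conventional 1.5 x IQR rule flags the runtime outliers without deleting anything yet:
# Flag runtimes outside the 1.5*IQR fences and count them
q <- quantile(collection_data$runtime, c(0.25, 0.75))
fences <- c(q[1] - 1.5 * IQR(collection_data$runtime),
            q[2] + 1.5 * IQR(collection_data$runtime))
sum(collection_data$runtime < fences[1] | collection_data$runtime > fences[2])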
# Identify categorical variables and convert them to factors
cols_to_factor <- c("adult", paste0("genre", 1:8), "Clean_production_companies",
"Clean_production_countries", paste0("spoken_lang_", 1:18))
# Convert columns to factors
collection_data[cols_to_factor] <- lapply(collection_data[cols_to_factor], factor)
head(cols_to_factor)
## [1] "adult" "genre1" "genre2" "genre3" "genre4" "genre5"
Addressing factor levels becomes important when dealing with categorical variables that have a specific order associated with their levels. Many statistical functions in R treat factors differently than character vectors; regression analysis, for example, requires accurate factor levels, and incorrect factor levels can lead to misleading visual representations of the data.
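As a hedged illustration, setting an explicit reference level changes how model coefficients are read; "Drama" here is an arbitrary choice for demonstration:
# Set "Drama" as the baseline level without altering the stored data
genre1_releveled <- relevel(collection_data$genre1, ref = "Drama")
levels(genre1_releveled)[1]  # "Drama" is now the reference category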
# Replace unusual entries in the adult column with "False"
collection_data$adult[grep("written by|bikini contest|casino connected", tolower(collection_data$adult))] <- "False"
# Recalculate the frequency of each category in the adult column and update the data frame
adult_freq <-as.data.frame(table(collection_data$adult))
# Filter the rows that have a frequency greater than 0
adult_freq_filtered <- adult_freq[adult_freq$Freq > 0, ]
# Show the updated categories and their frequencies
print(adult_freq)
## Var1
## 1 - Written by Ørnås
## 2 Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.
## 3 Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.
## 4 False
## 5 True
## Freq
## 1 0
## 2 0
## 3 0
## 4 45440
## 5 9
# Merge collection_data with adult_freq_filtered based on the 'Var1' column
collection_data <- merge(collection_data, adult_freq_filtered, by.x = "adult", by.y = "Var1", all.x = TRUE)
# Remove the 'Freq' column (since it's not needed)
collection_data <- collection_data[, -ncol(collection_data)]
In the "adult" column, there were 3 rows that did not respect the TRUE/FALSE format (their values were overflow text from other fields), so those entries were recoded to "False" to maintain consistency.
The same goes for the original_language column, where instead of a character abbreviation some rows contain integers. We also add a threshold: languages whose frequency is below 10 are relabeled "other", to simplify and consolidate the data.
# Create a dataframe with the frequency table of original languages
org_lang_freq <- as.data.frame(table(collection_data$original_language))
# See the distribution of language frequencies
summary(org_lang_freq$Freq)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 2.0 10.0 488.7 78.0 32259.0
# Find rows with frequency less than the threshold (10) and combine them into 'other'
threshold <- 10
other_freq <- sum(org_lang_freq$Freq[org_lang_freq$Freq < threshold])
org_lang_freq$Var1[org_lang_freq$Freq < threshold] <- "other"
## Warning in `[<-.factor`(`*tmp*`, org_lang_freq$Freq < threshold, value =
## structure(c(1L, : invalid factor level, NA generated
org_lang_freq$Freq[org_lang_freq$Freq < threshold] <- other_freq
# Remove duplicates caused by combining into 'other'
org_lang_freq <- org_lang_freq[!duplicated(org_lang_freq$Var1), ]
# Print the modified dataframe
head(org_lang_freq)
## Var1 Freq
## 1 11
## 2 <NA> 113
## 5 ab 10
## 8 ar 39
## 10 bg 10
## 12 bn 29
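The warning above appears because "other" is not an existing level of the Var1 factor. A hedged alternative uses forcats, which lumps rare levels directly:
# fct_lump_min() groups languages seen fewer than `threshold` times into "other"
lang_lumped <- forcats::fct_lump_min(collection_data$original_language,
                                     min = threshold, other_level = "other")
head(sort(table(lang_lumped), decreasing = TRUE))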
library(forcats)
# Low-frequency (miscoded) categories found in genre1
other_categories = c("Carousel Productions", "Aniplex", "Odyssey Media")
# Collapse them into 'other'
collection_data <- collection_data %>%
mutate(genre1 = as.factor(genre1),
genre1 = fct_collapse(genre1, other = other_categories))
# Low-frequency (miscoded) categories found in genre2
other_categories = c("Pulser Productions", "GoHands", "Vision View Entertainment")
# Collapse them into 'other'
collection_data <- collection_data %>%
mutate(genre2 = as.factor(genre2),
genre2 = fct_collapse(genre2, other = other_categories))
# Low-frequency (miscoded) categories found in genre3
other_categories = c("Telescene Film Group Productions", "Rogue State", "BROSTA TV")
# Collapse them into 'other'
collection_data <- collection_data %>%
mutate(genre3 = as.factor(genre3),
genre3 = fct_collapse(genre3, other = other_categories))
Lists of less frequent (miscoded) categories were defined in the three main movie genre columns ("genre1", "genre2", and "genre3") and grouped under the label "other", simplifying and consolidating the information. This helps streamline the data and avoids cluttering the analysis with overly specific categories.
# Function to clean the revenue
clean_revenue <- function(revenue) {
# Format entries with commas and add money symbol
formatted_revenue <- paste0("$", format(revenue, big.mark = ",", scientific = FALSE))
return(formatted_revenue)
}
# Apply the function to the revenue column and store the results in a new column
collection_data$Clean_revenue <- clean_revenue(collection_data$revenue)
# Function to clean the budget
clean_budget <- function(budget) {
# Format budget with commas and add money symbol
formatted_budget <- paste0("$", format(budget, big.mark = ",", scientific = FALSE))
return(formatted_budget)
}
# Apply the function to the budget column and store the results in a new column
collection_data$Clean_budget <- clean_budget(collection_data$budget)
# Remove the previous budget column
collection_data <- subset(collection_data, select = -c(budget))
# Convert the "popularity" column to numeric
collection_data$popularity <- as.numeric(collection_data$popularity)
## Warning: NAs introduced by coercion
# Round popularity to whole numbers
collection_data$round_popularity <- round(collection_data$popularity)
# Remove the previous popularity column
collection_data <- subset(collection_data, select = -c(popularity))
Functions were applied to clean and format the revenue and budget columns, adding currency symbols and thousands separators for better readability. In addition, the "popularity" column was converted to numeric format and rounded to whole numbers for easier interpretation.
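For reference, the scales package (installed alongside ggplot2) produces the same currency formatting in a single hedged call:
# scales::dollar() yields "$1,234,567"-style strings in one step
revenue_dollars <- scales::dollar(collection_data$revenue)
head(revenue_dollars)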
As for the univariate distributions, budget, popularity, vote count, and revenue are concentrated at low values, so the "successful" movies are a small minority, which calls into question the feasibility of a prediction model. Runtime sits around 100 minutes for most films, and vote average roughly follows a normal distribution centered around 6. None of the other variables follow a normal distribution.
As for what variables are most correlated to revenue, here is what we found. Strongest: budget and vote count; 2nd level: popularity, genre1_adventure and genre2_action; 3rd level: genre1_animation and genre2_fantasy.
There is a positive correlation between vote count, budget and popularity to revenue. No clear trend with runtime and vote average.
# Check if runtime column is already assigned units
if (!inherits(collection_data$runtime, "units")) {
# Convert runtime column to numeric and then to minutes with units
collection_data$runtime <- as.numeric(collection_data$runtime)
collection_data$runtime <- set_units(collection_data$runtime, "minutes")
}
# Create a new column called 'num_genres' that counts the non-missing genre and spoken lang columns
collection_data$num_genres <- rowSums(!is.na(collection_data[, paste0("genre", 1:8)]))
collection_data <- collection_data %>%
mutate(num_spoken_languages = rowSums(!is.na(select(., starts_with("spoken_lang")))))
This code ensures consistent units for runtime and counts the genre and spoken-language columns, making the dataset more standardized. The unit conversion guarantees that the 'runtime' column is uniformly represented in minutes: it checks whether 'runtime' already carries units and, if not, converts the column to numeric and assigns minutes. Finally, two new columns are added: 'num_genres', which counts the non-missing genre columns for each movie, and 'num_spoken_languages', which counts the non-missing spoken-language columns.
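Since runtime now carries explicit units, conversions are checked automatically; a hedged illustration (not stored back into the data):
# With units attached, minutes convert to hours explicitly and safely
head(set_units(collection_data$runtime, "hours"))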
# Create a backup df
copy_data <- collection_data
# Define the revenue categories
revenue_bins <- c(0, 100000000, 500000000, 1000000000, 2000000000, 3000000000)
revenue_labels <- c('<100M', '100M-500M', '500M-1B', '1B-2B', '2B-3B')
# Create the 'revenue_group' column
copy_data$revenue_group <- cut(copy_data$revenue, breaks = revenue_bins, labels = revenue_labels, right = FALSE)
table(copy_data$revenue_group)
##
## <100M 100M-500M 500M-1B 1B-2B 2B-3B
## 44076 1203 141 27 2
table(copy_data$revenue_group, copy_data$genre1)
##
## Action Adventure Animation other Comedy Crime Documentary Drama
## <100M 4184 1309 1039 3 8581 1639 3412 11751
## 100M-500M 251 156 59 0 230 45 2 202
## 500M-1B 40 42 23 0 6 0 0 5
## 1B-2B 10 6 3 0 0 0 0 2
## 2B-3B 2 0 0 0 0 0 0 0
##
## Family Fantasy Foreign History Horror Music Mystery Romance
## <100M 495 647 118 272 2567 479 544 1162
## 100M-500M 21 49 0 7 52 8 9 28
## 500M-1B 4 7 0 0 0 0 1 1
## 1B-2B 4 0 0 0 0 0 0 0
## 2B-3B 0 0 0 0 0 0 0 0
##
## Science Fiction Thriller TV Movie War Western
## <100M 607 1616 390 374 446
## 100M-500M 29 46 0 4 4
## 500M-1B 9 1 0 1 1
## 1B-2B 2 0 0 0 0
## 2B-3B 0 0 0 0 0
Lower-grossing films span a wide array of genres, while the 500M-1B through 2B-3B brackets are concentrated mostly in Action and Adventure.
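Row-wise proportions (a hedged refinement of the raw counts) make that concentration easier to read:
# Share of each genre within every revenue bracket
round(prop.table(table(copy_data$revenue_group, copy_data$genre1), margin = 1), 2)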
# Calculate the frequencies of each production company
company_frequencies <- copy_data %>%
count(Clean_production_companies) %>%
arrange(desc(n)) # Sort in descending order of frequency
# Select the top 10 production companies
top_10_companies <- head(company_frequencies, 10)
top_10_companies
## Clean_production_companies n
## 1 <NA> 11934
## 2 Paramount Pictures 998
## 3 Metro-Goldwyn-Mayer (MGM) 878
## 4 Twentieth Century Fox Film Corporation 780
## 5 Warner Bros. 757
## 6 Universal Pictures 754
## 7 Columbia Pictures 429
## 8 Columbia Pictures Corporation 401
## 9 RKO Radio Pictures 290
## 10 United Artists 272
# Calculate the count of each company
company_counts <- table(copy_data$Clean_production_companies)
# Sort the companies by count in descending order and select the top eight
top_companies <- names(sort(company_counts, decreasing = TRUE)[1:8])
# Filter the dataset to include only the top eight companies
filtered_data <- copy_data[copy_data$Clean_production_companies %in% top_companies, ]
# Create the bar plot
ggplot(filtered_data, aes(x = Clean_production_companies)) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
This bar graph analyzes the "Clean_production_companies" variable and shows the 8 most frequent values, identifying which companies are making the most movies. It lets us see which companies have the most extensive presence in terms of movie production.
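The table above lists "Columbia Pictures" and "Columbia Pictures Corporation" separately; assuming these label the same studio (a hedged assumption), forcats can merge them before counting:
# Merge the plausibly-duplicated studio labels into one level
companies_merged <- fct_collapse(copy_data$Clean_production_companies,
  "Columbia Pictures" = c("Columbia Pictures", "Columbia Pictures Corporation"))
sort(table(companies_merged), decreasing = TRUE)[1:8]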
# Calculate the count of each country
country_counts <- table(collection_data$Clean_production_countries)
# Sort the countries by count in descending order and select the top eight
top_countries <- names(sort(country_counts, decreasing = TRUE)[1:8])
# Filter the dataset to include only the top eight countries
filtered_data <- collection_data[collection_data$Clean_production_countries %in% top_countries, ]
# Create the density plot
ggplot(filtered_data, aes(x = Clean_production_countries)) +
geom_density() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
This density plot analyzes the "Clean_production_countries" variable for the 8 most frequent values, identifying the countries where the most movies are produced. It gives a deeper understanding of the geographical landscape of the film industry represented by the dataset.
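Because Clean_production_countries is categorical, a bar chart is arguably a more natural display than a density curve; a hedged alternative:
# Bar chart of the same top-8 producing countries
ggplot(filtered_data, aes(x = Clean_production_countries)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))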
# Load the corrplot package
library(corrplot)
## corrplot 0.92 loaded
# Extract numeric columns for correlation analysis
numeric_data <- copy_data[, sapply(copy_data, is.numeric)]
# Calculate the correlation matrix
correlation_matrix <- cor(numeric_data)
# Create the corrplot with the correlation matrix
corrplot(correlation_matrix, method = "color")
This shows the relationships between numerical variables within the
dataset copy_data by visualizing their correlations. The correlation
plot offers a comprehensive overview of the interdependencies among
variables.
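If any numeric column still contains NAs (round_popularity, for instance, picked up coercion NAs earlier), cor() propagates them into blank cells; a hedged refinement restricts each pair to complete observations:
# Ignore NAs pairwise so a single NA column does not blank out the plot
correlation_matrix <- cor(numeric_data, use = "pairwise.complete.obs")
corrplot(correlation_matrix, method = "color")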
popularity_summary <- aggregate(round_popularity ~ genre2, data = collection_data, FUN = mean)
ggplot(data = popularity_summary, aes(x = genre2, y = round_popularity)) +
geom_bar(stat = "identity", fill = "skyblue") +
labs(title = "Average Popularity by Genre", x = "Genre", y = "Average Popularity") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
What can be interpreted from the bar graph is that three genres have a noticeably higher average popularity than the others: Adventure in first place, Animation in second, and Fantasy in third; the remaining genres show similar, consistent levels.
# Frequency table for a categorical variable
genre_freq <- table(collection_data$genre1)
head(genre_freq)
##
## Action Adventure Animation other Comedy Crime
## 4487 1513 1124 3 8817 1684
This table provides a summary of the counts of the different genres in the dataset, so we can understand the distribution of movies across genres. For example, there are 4,487 movies in the Action genre, 1,513 in Adventure, 1,124 in Animation, 8,817 in Comedy, 1,684 in Crime, and 3 labeled "other".
ggplot(data = collection_data, aes(x = genre1)) +
geom_bar() +
labs(title = "Frequency of Genres") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
This bar chart visualizes the frequency of genres in the dataset. It gives a clear visual representation of how many movies belong to each genre, helping to identify the most and least common genres.
# Plot genre against revenue
ggplot(data = collection_data, aes(x = genre1, y = revenue)) +
geom_point(aes(color = genre1)) +
labs(title = "Genre vs Revenue") +
theme_minimal() +
theme(legend.position = "bottom") +
scale_x_discrete(drop = FALSE)
This is a visual representation of the relationship between movie genres
and revenue. Each point on the plot corresponds to a specific movie
genre, with the position along the y-axis indicating the revenue
generated by movies belonging to that genre. We can gain insights into
which genres tend to generate higher or lower revenues, for example,
“Action” is the genre with the highest revenue.
# Load libraries
library("tm")
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library("SnowballC")
library("wordcloud")
## Loading required package: RColorBrewer
library("RColorBrewer")
# Identify the movies that earned the most money
# Define the revenue intervals
revenue_bins <- c(500000000, 1000000000, 2000000000, 3000000000)
# Filter the collection data based on the revenue intervals
profit_data <- collection_data %>%
filter(revenue >= revenue_bins[1] & revenue < revenue_bins[2] |
revenue >= revenue_bins[2] & revenue < revenue_bins[3] |
revenue >= revenue_bins[3] & revenue < revenue_bins[4])
#We need to convert the text to a corpus
docs <- Corpus(VectorSource(profit_data$original_title))
#General text cleaning
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(docs, content_transformer(tolower)):
## transformation drops documents
# Remove numbers
docs <- tm_map(docs, removeNumbers)
## Warning in tm_map.SimpleCorpus(docs, removeNumbers): transformation drops
## documents
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, stopwords("english")):
## transformation drops documents
# Remove your own stop word
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("blabla1", "blabla2"))
## Warning in tm_map.SimpleCorpus(docs, removeWords, c("blabla1", "blabla2")):
## transformation drops documents
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
## Warning in tm_map.SimpleCorpus(docs, removePunctuation): transformation drops
## documents
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
## Warning in tm_map.SimpleCorpus(docs, stripWhitespace): transformation drops
## documents
# Term-document matrix: a table containing the frequency of the words
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
#head(d, 100)
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 5,
max.words=Inf, random.order=T, rot.per=0.5,
colors=brewer.pal(8, "Dark2"))
#https://cran.r-project.org/web/packages/wordcloud/wordcloud.pdf
The trend for the most profitable titles is to be a part of a known franchise like “Pirates of the Caribbean”, “Star Wars”, “Spiderman”, “Harry Potter”, “Transformers”, etc.
copy_data <- collection_data
# Strip the currency symbol (and any thousands separators) before converting
copy_data$Clean_budget_numeric <- as.numeric(gsub("[$,]", "", copy_data$Clean_budget))
## Warning: NAs introduced by coercion
# Create a measure for profit (gain or loss)
copy_data <- copy_data %>%
mutate(profit = revenue - Clean_budget_numeric)
revenue_grouped <- copy_data %>%
group_by(revenue) %>%
summarize(mean(profit, na.rm = TRUE),
sd(profit, na.rm = TRUE),
median(profit, na.rm = TRUE),
quantile(profit, (.90), na.rm = TRUE),
n())
revenue_grouped
## # A tibble: 6,864 × 6
## revenue mean(profit, na.rm = …¹ sd(profit, na.rm = T…² median(profit, na.rm…³
## <dbl> <dbl> <dbl> <dbl>
## 1 0 -650401. 4340513. 0
## 2 1 -49.6 171. 1
## 3 2 1.67 0.577 2
## 4 3 -148886. 263412. 2
## 5 4 -91.5 166. -13.5
## 6 5 2.2 3.42 4
## 7 6 -5201150. 1695421. -5201150.
## 8 7 -499995. 999999. 3.5
## 9 8 -19.2 57.5 8
## 10 9 9 NA 9
## # ℹ 6,854 more rows
## # ℹ abbreviated names: ¹`mean(profit, na.rm = TRUE)`,
## # ²`sd(profit, na.rm = TRUE)`, ³`median(profit, na.rm = TRUE)`
## # ℹ 2 more variables: `quantile(profit, (0.9), na.rm = TRUE)` <dbl>,
## # `n()` <int>
summary(copy_data$profit)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -165710090 0 0 6987363 0 2550965087 3
collection_data %>%
ggplot(aes(x = genre1, fill = runtime)) +
geom_density(alpha = 0.2) + labs(title = "Density in genre1 and runtime", x = "genre1", y = "runtime") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
In this graph we can see that the density over the "genre1" and "runtime" variables is right-skewed, showing which movie genres tend to have the longest runtimes and which the shortest.
ggplot(collection_data, aes(y = runtime)) +
geom_boxplot()
## Warning: The `scale_name` argument of `continuous_scale()` is deprecated as of ggplot2
## 3.5.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Here we can see that most runtimes are concentrated far below 600 minutes (10 hours); values above 10 hours are therefore treated as outliers in the "runtime_hours" variable.
# Convert runtime (minutes) to hours so the 10-hour outlier filter below works
collection_data$runtime_hours <- as.numeric(collection_data$runtime) / 60
filtered_data <- collection_data %>%
filter(runtime_hours <= 10)
#Create the density plot with the filtered data
filtered_data %>%
ggplot(aes(x = genre1, fill = runtime)) +
geom_density(alpha = 0.2) +
labs(title = "Density in genre1 and runtime - hours", x = "genre1", y = "runtime") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
We can see the result is similar even without the outliers; the distribution still behaves in a right-skewed form.
ggplot(copy_data, aes(y = revenue, x = Clean_budget_numeric)) + geom_point() + geom_smooth(method = "lm", se = TRUE)
## `geom_smooth()` using formula = 'y ~ x'
There is a weak positive linear correlation between the budget and revenue of a movie.
ggplot(copy_data,
aes(x = cut(Clean_budget_numeric, breaks = 5), y = revenue)) +
geom_boxplot()
We can visualize the distribution of revenue across different intervals
of the Clean_budget_numeric variable.
# Create measure for character number
copy_data <- copy_data %>%
mutate(title_num = nchar(original_title))
ggplot(copy_data, aes(y = profit, x = title_num)) + geom_point() + stat_smooth(method = "lm", formula = y ~ x + I(x^2), linewidth = 1)
There is no discernible relationship between the title character count and the profit of a movie, so the hypothesis of a quadratic correlation is rejected.
ggplot(copy_data,
aes(x = title_num, y = cut(profit, breaks = 5))) +
geom_boxplot()
However, the breaks allow us to see that the lower profit brackets contain a greater number of outliers with high character counts, while the highest profit bracket has a narrow IQR situated at a low character count.
# Load packages
library("e1071") #for skewness measure
copy_data %>%
ggplot(aes(x = profit)) +
geom_density()
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_density()`).
#Logarithmic transformation
log_data <- copy_data %>%
mutate(log_prof = log(profit))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `log_prof = log(profit)`.
## Caused by warning in `log()`:
## ! NaNs produced
# Convert into a left-skewed distribution
log_data %>%
ggplot(aes(x = log_prof)) +
geom_density()
log_data %>%
summarize(mean(profit, na.rm = TRUE),
sd(profit, na.rm = TRUE),
IQR(profit, na.rm = TRUE),
sum(profit))
## mean(profit, na.rm = TRUE) sd(profit, na.rm = TRUE) IQR(profit, na.rm = TRUE)
## 1 6987363 52149123 0
## sum(profit)
## 1 NA
The distribution of profits was visualized using a density plot and a logarithmic transformation was applied to achieve a left-skewed distribution. Summary statistics were computed for the transformed profit variable, including mean, standard deviation, interquartile range, and total profit sum. Probabilities were calculated to determine the likelihood of profits exceeding specific thresholds based on the log-transformed profit distribution.
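As a hedged sketch of that last step, assuming the log-profits of profitable movies are roughly normal:
# Estimate P(profit > $10M) among movies with positive profit, via a normal
# approximation to the log-transformed profits (hedged assumption)
pos <- log_data$log_prof[is.finite(log_data$log_prof)]
1 - pnorm(log(1e7), mean = mean(pos), sd = sd(pos))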
The analysis also involved creating a new column called ‘profit’ and calculating summary statistics within revenue groups. A binary column was introduced to indicate whether a movie’s production country falls within North America, and summary statistics were computed based on movie geographical origin. The exploration of runtime characteristics involved density plots and boxplots. A scatter plot was used to examine the correlation between budget and revenue, and boxplot analysis was conducted to compare revenue distributions across budget categories. The relationship between the number of title letters and movie profit was explored using scatter plots and boxplots. A faceted scatter plot was used to analyze the relationships between profit, title letters, and revenue brackets.
The distribution of profits was visualized using a density plot and summary statistics were computed to gain a deeper understanding of its distribution characteristics. Finally, probabilities were calculated to determine the likelihood of profits exceeding specific thresholds.
Overall, these analyses provide insights into the distribution, central tendency, shape, outliers, correlations, and probabilities related to various variables in the movie dataset, helping to understand key patterns and relationships within the data.
# Convert variables to their correct types
collection_data$revenue <- as.numeric(as.character(collection_data$revenue))
collection_data$release_date <- as.Date(as.character(collection_data$release_date))
collection_data$runtime <- as.numeric(as.character(collection_data$runtime))
collection_data$vote_average <- as.numeric(as.character(collection_data$vote_average))
collection_data$vote_count <- as.numeric(as.character(collection_data$vote_count))
collection_data$round_popularity <- as.numeric(as.character(collection_data$round_popularity))
collection_data$num_spoken_languages <- as.numeric(as.character(collection_data$num_spoken_languages))
collection_data$num_genres <- as.numeric(as.character(collection_data$num_genres))
# Strip the currency formatting before converting the budget to numeric
collection_data$Clean_budget <- as.numeric(gsub("[$,]", "", as.character(collection_data$Clean_budget)))
## Warning: NAs introduced by coercion
regression <- lm(revenue ~ release_date + runtime + vote_average + vote_count + round_popularity + num_spoken_languages, data = collection_data)
summary(regression)
##
## Call:
## lm(formula = revenue ~ release_date + runtime + vote_average +
## vote_count + round_popularity + num_spoken_languages, data = collection_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -925289146 -2715159 157493 1986071 1414896099
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 945290.45 683783.35 1.382 0.166843
## release_date -32.14 20.20 -1.591 0.111659
## runtime 17442.15 4722.49 3.693 0.000222 ***
## vote_average -842682.47 94121.83 -8.953 < 2e-16 ***
## vote_count 100983.16 431.15 234.217 < 2e-16 ***
## round_popularity 823160.60 35612.85 23.114 < 2e-16 ***
## num_spoken_languages 119760.66 244145.49 0.491 0.623761
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37340000 on 45352 degrees of freedom
## (90 observations deleted due to missingness)
## Multiple R-squared: 0.6639, Adjusted R-squared: 0.6638
## F-statistic: 1.493e+04 on 6 and 45352 DF, p-value: < 2.2e-16
The results first show the variables that most affect movie revenue:
+ Runtime
+ Vote Average
+ Vote Count
+ Round Popularity
In addition, we obtained an adjusted R-squared of 0.6638, indicating that approximately 66.38% of the variability in the dependent variable (revenue) can be explained by the independent variables included in the regression model, especially the ones listed above.
# Adjust the regression for greater precision
adjusted_regression <- lm(revenue ~ runtime + vote_average + vote_count + round_popularity, data = collection_data)
summary(adjusted_regression)
##
## Call:
## lm(formula = revenue ~ runtime + vote_average + vote_count +
## round_popularity, data = collection_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -925000790 -2604782 175898 2009199 1415655991
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 799240.7 648741.5 1.232 0.217960
## runtime 17197.5 4658.3 3.692 0.000223 ***
## vote_average -832503.6 93038.9 -8.948 < 2e-16 ***
## vote_count 100961.4 430.4 234.593 < 2e-16 ***
## round_popularity 819845.1 35323.3 23.210 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 37310000 on 45438 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6639, Adjusted R-squared: 0.6638
## F-statistic: 2.243e+04 on 4 and 45438 DF, p-value: < 2.2e-16
# Predictive model
predictive_model <- data.frame(runtime=94, vote_average=6, vote_count=110, round_popularity=3)
predict(adjusted_regression, predictive_model)
## 1
## 10986075
The model uses the input values of "runtime", "vote_average", "vote_count", and "round_popularity" to predict the revenue of a movie. The predicted revenue of approximately $10,986,075 is what the model estimates from these inputs.
According to the adjusted R-squared, the model explains about 66.38% of the variability in revenue, with coefficients assessed at the 95% confidence level.
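A hedged complement to the point estimate: predict() can also return a 95% prediction interval, which conveys the uncertainty around the $10,986,075 figure:
# 95% prediction interval around the single-movie revenue estimate
predict(adjusted_regression, predictive_model, interval = "prediction", level = 0.95)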
# Load necessary libraries
library(ggplot2)
colnames(collection_data)
## [1] "adult" "homepage"
## [3] "id" "imdb_id"
## [5] "original_language" "original_title"
## [7] "overview" "poster_path"
## [9] "release_date" "revenue"
## [11] "runtime" "status"
## [13] "tagline" "title"
## [15] "video" "vote_average"
## [17] "vote_count" "genre1"
## [19] "genre2" "genre3"
## [21] "genre4" "genre5"
## [23] "genre6" "genre7"
## [25] "genre8" "Clean_production_companies"
## [27] "Clean_production_countries" "spoken_lang_1"
## [29] "spoken_lang_2" "spoken_lang_3"
## [31] "spoken_lang_4" "spoken_lang_5"
## [33] "spoken_lang_6" "spoken_lang_7"
## [35] "spoken_lang_8" "spoken_lang_9"
## [37] "spoken_lang_10" "spoken_lang_11"
## [39] "spoken_lang_12" "spoken_lang_13"
## [41] "spoken_lang_14" "spoken_lang_15"
## [43] "spoken_lang_16" "spoken_lang_17"
## [45] "spoken_lang_18" "Collection"
## [47] "Clean_revenue" "Clean_budget"
## [49] "round_popularity" "num_genres"
## [51] "num_spoken_languages" "runtime_hours"
# Plotting the correlation between runtime and revenue
ggplot(collection_data, aes(x = runtime, y = revenue)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Correlation between Runtime and Revenue",
x = "Runtime",
y = "Revenue") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
This statistical analysis explores the relationship between certain variables and movie success, in this case runtime and revenue. The graph shows a positive correlation, indicating that longer movies tend to generate higher revenue.
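A hedged numeric check on the visual trend:
# Pearson correlation between runtime and revenue (units dropped for the test)
cor.test(as.numeric(collection_data$runtime), collection_data$revenue)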
# Load necessary libraries
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:tm':
##
## inspect
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
# Data preparation
# Assume we have a movie dataset with attributes including release_date and success metrics (e.g., revenue)
# Convert release_date into categorical variables representing seasons or holidays
# For example, create binary variables for seasons: Spring, Summer, Fall, Winter
# Frequent pattern generation (with the default target, apriori() returns rules)
frequent_itemsets <- apriori(copy_data, parameter = list(support = 0.01, minlen = 2))
## Warning: Column(s) 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 46,
## 47, 48, 49, 50, 51, 52, 53, 54 not logical or factor. Applying default
## discretization (see '? discretizeDF').
## Warning in discretize(x = c(373554033, 262797249, 0, 81452156, 76578911, : The calculated breaks are: 0, 0, 0, 2787965087
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(`1` = 1, `2` = 2, `3` = 1, `4` = 1, `5` = 1, : The calculated breaks are: 0, 1, 1, 18
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(3e+07, 6.5e+07, 0, 1.6e+07, 0, 6e+07, 5.8e+07, : The calculated breaks are: 0, 0, 0, 3.8e+08
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(343554033, 197797249, 0, 65452156, 76578911, : The calculated breaks are: -165710090, 0, 0, 2550965087
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.01 2
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 454
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[332200 item(s), 45449 transaction(s)] done [0.49s].
## sorting and recoding items ... [115 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7
## Warning in apriori(copy_data, parameter = list(support = 0.01, minlen = 2)):
## Mining stopped (time limit reached). Only patterns up to a length of 7
## returned!
## done [10.33s].
## writing ... [8634150 rule(s)] done [0.40s].
## creating S4 object ... done [2.12s].
# Rule generation
association_rules <- apriori(copy_data, parameter = list(support = 0.01, confidence = 0.5))
## Warning: Column(s) 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 46,
## 47, 48, 49, 50, 51, 52, 53, 54 not logical or factor. Applying default
## discretization (see '? discretizeDF').
## Warning in discretize(x = c(373554033, 262797249, 0, 81452156, 76578911, : The calculated breaks are: 0, 0, 0, 2787965087
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(`1` = 1, `2` = 2, `3` = 1, `4` = 1, `5` = 1, : The calculated breaks are: 0, 1, 1, 18
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(3e+07, 6.5e+07, 0, 1.6e+07, 0, 6e+07, 5.8e+07, : The calculated breaks are: 0, 0, 0, 3.8e+08
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Warning in discretize(x = c(343554033, 197797249, 0, 65452156, 76578911, : The calculated breaks are: -165710090, 0, 0, 2550965087
## Only unique breaks are used reducing the number of intervals. Look at ? discretize for details.
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.01 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 454
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[332200 item(s), 45449 transaction(s)] done [0.36s].
## sorting and recoding items ... [115 item(s)] done [0.02s].
## creating transaction tree ... done [0.01s].
## checking subsets of size 1 2 3 4 5 6 7
## Warning in apriori(copy_data, parameter = list(support = 0.01, confidence =
## 0.5)): Mining stopped (time limit reached). Only patterns up to a length of 7
## returned!
## done [10.60s].
## writing ... [11336013 rule(s)] done [0.50s].
## creating S4 object ... done [1.56s].
# Rule evaluation
summary(association_rules)
## set of 11336013 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2 3 4 5 6 7
## 14 1536 25154 193425 919086 3015594 7181204
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 6.000 7.000 6.511 7.000 7.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.01001 Min. :0.5000 Min. :0.01001 Min. : 0.5541
## 1st Qu.:0.01322 1st Qu.:0.8111 1st Qu.:0.01503 1st Qu.: 1.0000
## Median :0.01938 Median :0.9688 Median :0.02251 Median : 1.0063
## Mean :0.03391 Mean :0.8890 Mean :0.03931 Mean : 1.3514
## 3rd Qu.:0.03595 3rd Qu.:1.0000 3rd Qu.:0.04200 3rd Qu.: 1.1592
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :49.4778
## count
## Min. : 455
## 1st Qu.: 601
## Median : 881
## Mean : 1541
## 3rd Qu.: 1634
## Max. :45449
##
## mining info:
## data ntransactions support confidence
## copy_data 45449 0.01 0.5
## call
## apriori(data = copy_data, parameter = list(support = 0.01, confidence = 0.5))
Association Rule Mining is a technique that helps us find patterns in our movie dataset that might contribute to a movie's success by uncovering relationships among large sets of data items. For this problem we use it to look for patterns in release dates, such as seasons or holidays, associated with successful movie releases. Using the arules package in R, the apriori() function generates frequent itemsets and rules from our dataset, and a summary shows the outcomes.
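A hedged next step is to look at the strongest rules rather than the overall summary:
# Inspect the five rules with the highest lift
top_rules <- head(sort(association_rules, by = "lift"), 5)
inspect(top_rules)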
# Load necessary libraries
library(lubridate)
# Assume 'collection_data' is your movie dataset with a 'release_date' column
# Convert 'release_date' to Date format
collection_data$release_date <- as.Date(collection_data$release_date)
# Extract month from release_date
collection_data$release_month <- month(collection_data$release_date)
# Create binary variables for seasons
collection_data$Winter <- ifelse(collection_data$release_month %in% c(12, 1, 2), 1, 0)
collection_data$Spring <- ifelse(collection_data$release_month %in% c(3, 4, 5), 1, 0)
collection_data$Summer <- ifelse(collection_data$release_month %in% c(6, 7, 8), 1, 0)
collection_data$Fall <- ifelse(collection_data$release_month %in% c(9, 10, 11), 1, 0)
# Check the first few rows to verify the new variables
head(collection_data[, c("release_date", "release_month", "Winter", "Spring", "Summer", "Fall")])
## release_date release_month Winter Spring Summer Fall
## 1 1995-10-30 10 0 0 0 1
## 2 1995-12-15 12 1 0 0 0
## 3 1995-12-22 12 1 0 0 0
## 4 1995-12-22 12 1 0 0 0
## 5 1995-02-10 2 1 0 0 0
## 6 1995-12-15 12 1 0 0 0
Finally, using the lubridate library, we created variables assigning each movie's release date to a season of the year, so we can see whether a movie was released in a particular season.
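A hedged follow-up uses the new indicators to compare average revenue by release season:
# Compare mean revenue across release seasons
collection_data %>%
  mutate(season = case_when(Winter == 1 ~ "Winter",
                            Spring == 1 ~ "Spring",
                            Summer == 1 ~ "Summer",
                            Fall == 1 ~ "Fall")) %>%
  group_by(season) %>%
  summarize(mean_revenue = mean(revenue, na.rm = TRUE), n_movies = n())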