#Importing the dataset and libraries.
#Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(pastecs)
##
## Attaching package: 'pastecs'
## The following objects are masked from 'package:dplyr':
##
## first, last
# Import the full dataset (5043 movies, 28 variables)
# The location is kept in its own variable so the read call stays short;
# character columns are read as plain strings, not factors.
movies_path <- "C:/Users/Žabica/Desktop/IMB/bootcamp/R Take Home Exam 2025/movie_metadata.csv"
movies <- read.csv(file = movies_path, stringsAsFactors = FALSE)

# Preview the first six rows to check that the import worked.
head(movies, n = 6)
## color director_name num_critic_for_reviews duration
## 1 Color James Cameron 723 178
## 2 Color Gore Verbinski 302 169
## 3 Color Sam Mendes 602 148
## 4 Color Christopher Nolan 813 164
## 5 Doug Walker NA NA
## 6 Color Andrew Stanton 462 132
## director_facebook_likes actor_3_facebook_likes actor_2_name
## 1 0 855 Joel David Moore
## 2 563 1000 Orlando Bloom
## 3 0 161 Rory Kinnear
## 4 22000 23000 Christian Bale
## 5 131 NA Rob Walker
## 6 475 530 Samantha Morton
## actor_1_facebook_likes gross genres
## 1 1000 760505847 Action|Adventure|Fantasy|Sci-Fi
## 2 40000 309404152 Action|Adventure|Fantasy
## 3 11000 200074175 Action|Adventure|Thriller
## 4 27000 448130642 Action|Thriller
## 5 131 NA Documentary
## 6 640 73058679 Action|Adventure|Sci-Fi
## actor_1_name movie_title
## 1 CCH Pounder Avatar
## 2 Johnny Depp Pirates of the Caribbean: At World's End
## 3 Christoph Waltz Spectre
## 4 Tom Hardy The Dark Knight Rises
## 5 Doug Walker Star Wars: Episode VII - The Force Awakens
## 6 Daryl Sabara John Carter
## num_voted_users cast_total_facebook_likes actor_3_name
## 1 886204 4834 Wes Studi
## 2 471220 48350 Jack Davenport
## 3 275868 11700 Stephanie Sigman
## 4 1144337 106759 Joseph Gordon-Levitt
## 5 8 143
## 6 212204 1873 Polly Walker
## facenumber_in_poster
## 1 0
## 2 0
## 3 1
## 4 0
## 5 0
## 6 1
## plot_keywords
## 1 avatar|future|marine|native|paraplegic
## 2 goddess|marriage ceremony|marriage proposal|pirate|singapore
## 3 bomb|espionage|sequel|spy|terrorist
## 4 deception|imprisonment|lawlessness|police officer|terrorist plot
## 5
## 6 alien|american civil war|male nipple|mars|princess
## movie_imdb_link num_user_for_reviews
## 1 http://www.imdb.com/title/tt0499549/?ref_=fn_tt_tt_1 3054
## 2 http://www.imdb.com/title/tt0449088/?ref_=fn_tt_tt_1 1238
## 3 http://www.imdb.com/title/tt2379713/?ref_=fn_tt_tt_1 994
## 4 http://www.imdb.com/title/tt1345836/?ref_=fn_tt_tt_1 2701
## 5 http://www.imdb.com/title/tt5289954/?ref_=fn_tt_tt_1 NA
## 6 http://www.imdb.com/title/tt0401729/?ref_=fn_tt_tt_1 738
## language country content_rating budget title_year actor_2_facebook_likes
## 1 English USA PG-13 237000000 2009 936
## 2 English USA PG-13 300000000 2007 5000
## 3 English UK PG-13 245000000 2015 393
## 4 English USA PG-13 250000000 2012 23000
## 5 NA NA 12
## 6 English USA PG-13 263700000 2012 632
## imdb_score aspect_ratio movie_facebook_likes
## 1 7.9 1.78 33000
## 2 7.1 2.35 0
## 3 6.8 2.35 85000
## 4 8.5 2.35 164000
## 5 7.1 NA 0
## 6 6.6 2.35 24000
I will focus on a subset that meets the exam requirement (≥4 variables, mostly numeric plus ≥1 categorical):
imdb_score: IMDB Score of the movie on IMDB (numeric, average rating 0–10),
duration: Duration in minutes (numeric),
budget: Production budget of the movie in Dollars (numeric),
gross: Gross earnings of the movie in Dollars (numeric),
color: Film colorization (categorical, “Black and White” or “Color”)
#Data cleaning and manipulation
# Build the analysis table `movies_clean` from the raw `movies` data.
#
# Steps:
#   1) keep only the variables used later,
#   2) normalise the `color` labels,
#   3) drop rows with missing or unusable key values,
#   4) add ROI (gross / budget),
#   5) rename imdb_score -> imdb,
#   6) recode color as the factor ColorBinary ("0" = Color,
#      "1" = Black and White).
movies_clean <-
  movies %>%
  # 1) Keep only variables we actually use later
  select(gross, budget, duration, imdb_score, color, language) %>%
  # 2) Strip surrounding whitespace from `color`. Without this, the exact
  #    level match in step 6 found no "Black and White" rows at all (the
  #    rendered summary showed 1: 0 and 133 NAs), which indicates the raw
  #    labels are padded -- confirm with unique(movies$color).
  mutate(color = trimws(color)) %>%
  # 3) Remove rows with missing or unusable key values. Blank strings are
  #    not NA after read.csv(), so `color` needs the extra nzchar() check.
  filter(
    !is.na(imdb_score),
    !is.na(duration),
    !is.na(budget),
    !is.na(gross),
    !is.na(color),
    nzchar(color)
  ) %>%
  # 4) Create ROI (return on investment); protect against zero budgets
  mutate(roi = ifelse(budget > 0, gross / budget, NA_real_)) %>%
  # 5) Rename for convenience
  rename(imdb = imdb_score) %>%
  # 6) Recode color to a binary factor: Color = 0, Black and White = 1.
  #    Any value other than these two labels becomes NA.
  mutate(
    ColorBinary = factor(
      color,
      levels = c("Color", "Black and White"),
      labels = c(0, 1)
    )
  )
# English-language films only (used later for comparison).
# Rows whose language is NA are dropped as well, because filter() keeps
# only rows where the condition is TRUE.
movies_eng <- filter(movies_clean, language == "English")
I removed rows with missing values in key variables to ensure consistency.
I created new variable: roi = gross/budget, a simple profitability measure.
I renamed imdb_score to imdb to simplify code.
I recoded the categorical variable color into ColorBinary: 0 = Color, 1 = Black and White.
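I stored the recoded color variable (ColorBinary) as a factor so it can be used in grouped summaries and boxplots; content_rating is not among the selected variables, so it is not converted in this step.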
I prepared a subset of English-language films for comparison.
#Descriptive statistics
# Five-number summaries (plus mean) for the numeric variables and level
# counts for the ColorBinary factor.
summary_vars <- c("imdb", "duration", "budget", "gross", "roi", "ColorBinary")
summary(movies_clean[, summary_vars])

# summary() does not report standard deviations, but the interpretation
# below discusses them, so compute them explicitly for the numeric columns.
# na.rm = TRUE because roi is NA wherever budget is not positive.
numeric_vars <- c("imdb", "duration", "budget", "gross", "roi")
vapply(movies_clean[numeric_vars], sd, numeric(1), na.rm = TRUE)
## imdb duration budget gross
## Min. :1.600 Min. : 34.0 Min. :2.180e+02 Min. : 162
## 1st Qu.:5.900 1st Qu.: 95.0 1st Qu.:1.000e+07 1st Qu.: 6844452
## Median :6.600 Median :106.0 Median :2.400e+07 Median : 27996968
## Mean :6.464 Mean :109.9 Mean :4.520e+07 Mean : 51068087
## 3rd Qu.:7.200 3rd Qu.:120.0 3rd Qu.:5.000e+07 3rd Qu.: 65406486
## Max. :9.300 Max. :330.0 Max. :1.222e+10 Max. :760505847
## roi ColorBinary
## Min. : 0.0000 0 :3757
## 1st Qu.: 0.4513 1 : 0
## Median : 1.0708 NA's: 133
## Mean : 6.2554
## 3rd Qu.: 2.2286
## Max. :7194.4855
Interpretations:
Mean and median for IMDB ratings: the mean is approximately 6.5 and the median is 6.6, so typical ratings sit in the 6–7 range. Most films are rated positively, but the mean is pulled slightly below the median by a few poorly rated movies. Because the mean and the median are very close, the ratings are fairly symmetric around the center.
Quartiles for duration: Q1 ≈ 95 minutes and Q3 ≈ 120 minutes. This means that the middle 50% of all movies run between 95 and 120 minutes.
Minimum and maximum for budget and gross: the minimum budget is only a few hundred dollars (218), while the maximum is well over $1 billion (about 1.2e+10); gross ranges from 162 to roughly 760 million. This big gap reflects extreme outliers and a highly right-skewed distribution: a few blockbusters dominate the upper tail.
Standard deviation for budget and gross: The SDs are extremely large, confirming that financial variables are highly variable and spread out, with big differences between typical films and blockbuster outliers.
ROI: very skewed; most films near ROI ≈ 1, but some very high outliers indicate runaway hits.
Frequency for ColorBinary: Majority of films are Color = 0, while Black & White = 1 represents only a small share of the dataset.
#Graphs
##Histogram of IMDB ratings
# Histogram of IMDB ratings in half-point bins.
movies_clean %>%
  ggplot(mapping = aes(x = imdb)) +
  geom_histogram(
    binwidth = 0.5,
    color = "mediumseagreen", # bar outline
    fill = "darkgoldenrod1"   # bar interior
  ) +
  labs(
    title = "Distribution of IMDB ratings",
    x = "IMDB rating",
    y = "Count"
  )
Interpretation: Asymmetrical – skewed slightly to the left, unimodal.
Ratings concentrate between 6 and 8; tail is thinnest at very low
scores.
##Budget vs Gross
# Scatterplot of gross revenue against budget; points are semi-transparent
# so that dense regions remain readable.
movies_clean %>%
  ggplot(mapping = aes(x = budget, y = gross)) +
  geom_point(alpha = 0.4) +
  labs(
    title = "Budget vs Gross revenue",
    x = "Budget (USD)",
    y = "Gross (USD)"
  )
Interpretation: A clear positive relationship with large
dispersion—higher budgets often earn more, but not always.
# Boxplot of ROI by ColorBinary (design-focused)
# Upper limit of the visible y-range: 95th percentile of ROI, ignoring NAs.
p95 <- quantile(movies_clean[["roi"]], probs = 0.95, na.rm = TRUE)

movies_clean %>%
  ggplot(mapping = aes(x = ColorBinary, y = roi, fill = ColorBinary)) +
  # Notched boxes with a black border; outliers drawn as white-filled circles.
  geom_boxplot(
    notch = TRUE,
    width = 0.6,
    alpha = 0.8,
    color = "black",
    outlier.shape = 21,
    outlier.size = 2,
    outlier.fill = "white"
  ) +
  # Group mean as a red diamond.
  stat_summary(
    geom = "point",
    fun = mean,
    shape = 23,
    size = 3,
    fill = "red"
  ) +
  # Zoom the view to [0, p95]; coord_cartesian() does not drop any rows,
  # so the box statistics are still computed on all data.
  coord_cartesian(ylim = c(0, p95)) +
  # Blue for Color ("0"), orange for Black & White ("1").
  scale_fill_manual(values = c("0" = "#1f77b4", "1" = "#ff7f0e")) +
  labs(
    title = "ROI by Film Color",
    subtitle = "Boxplot trimmed at 95th percentile",
    x = "Film color (0 = Color, 1 = Black & White)",
    y = "ROI (gross / budget)",
    caption = "◇ = mean • Notch ≈ 95% CI around median"
  ) +
  theme_minimal(base_size = 13) +
  theme(
    legend.position = "none",
    panel.grid.minor = element_blank(),
    panel.grid.major.y = element_line(color = "grey85", linetype = "dashed"),
    axis.title.x = element_text(margin = margin(t = 10)),
    axis.title.y = element_text(margin = margin(r = 10)),
    plot.title = element_text(face = "bold", size = 14, color = "navy"),
    plot.subtitle = element_text(size = 11, color = "grey30")
  )
Interpretation: ROI distributions are highly skewed in both groups (long
upper tails). Medians can be compared directly; trimming at the 95th
percentile keeps the central pattern readable while acknowledging
extreme successes.
(Design:
Color palette: blue for Color = 0, orange for B&W = 1 (clear, colorblind-safe).
Box style: notched, semi-transparent, black border.
Outliers: hollow circles with white fill to stand out.
Mean: red diamond marker.
Theme: theme_minimal() with custom title, subtitle, dashed gridlines.)