Code
library(tidyverse)
library(corrplot)
library(gtsummary)
library(patchwork)
library(kableExtra)
library(visdat)
library(scales)Rpubs: https://rpubs.com/riquebaron/Filmes
Shiny App: https://riquebaron.shinyapps.io/Filmes/
library(tidyverse)
library(corrplot)
library(gtsummary)
library(patchwork)
library(kableExtra)
library(visdat)
library(scales)Escolhida a base de dados do TMDB por interesse pessoal em projetos cinematográficos, seus orçamentos e receitas em comparação com a avaliação final e popularidade. https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data
dados_inicial <- read.csv("C:/Users/ts2m/OneDrive - TRANSPETRO/Documentos/Pós/04 EDA/PD/TMDB_movie_dataset_v11.csv", na = "")dados_inicial |>
select(where(is.numeric)) |>
glimpse ()Rows: 1,393,169
Columns: 7
$ id <int> 27205, 157336, 155, 19995, 24428, 293660, 299536, 550, 11…
$ vote_average <dbl> 8.364, 8.417, 8.512, 7.573, 7.710, 7.606, 8.255, 8.438, 7…
$ vote_count <int> 34495, 32571, 30619, 29815, 29166, 28894, 27713, 27238, 2…
$ revenue <dbl> 825532764, 701729206, 1004558444, 2923706026, 1518815515,…
$ runtime <int> 148, 169, 152, 162, 143, 108, 149, 139, 121, 154, 142, 15…
$ budget <int> 160000000, 165000000, 185000000, 237000000, 220000000, 58…
$ popularity <dbl> 83.952, 140.241, 130.643, 79.932, 98.082, 72.735, 154.340…
dados_inicial |>
select(where(is.character)) |>
glimpse ()Rows: 1,393,169
Columns: 17
$ title <chr> "Inception", "Interstellar", "The Dark Knight", "…
$ status <chr> "Released", "Released", "Released", "Released", "…
$ release_date <chr> "2010-07-15", "2014-11-05", "2008-07-16", "2009-1…
$ adult <chr> "False", "False", "False", "False", "False", "Fal…
$ backdrop_path <chr> "/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg", "/pbrkL804c8y…
$ homepage <chr> "https://www.warnerbros.com/movies/inception", "h…
$ imdb_id <chr> "tt1375666", "tt0816692", "tt0468569", "tt0499549…
$ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "…
$ original_title <chr> "Inception", "Interstellar", "The Dark Knight", "…
$ overview <chr> "Cobb, a skilled thief who commits corporate espi…
$ poster_path <chr> "/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg", "/gEU2QniE6E7…
$ tagline <chr> "Your mind is the scene of the crime.", "Mankind …
$ genres <chr> "Action, Science Fiction, Adventure", "Adventure,…
$ production_companies <chr> "Legendary Pictures, Syncopy, Warner Bros. Pictur…
$ production_countries <chr> "United Kingdom, United States of America", "Unit…
$ spoken_languages <chr> "English, French, Japanese, Swahili", "English", …
$ keywords <chr> "rescue, mission, dream, airplane, paris, france,…
Limpeza da lista dos filmes com poucos dados usando apenas a quantidade de votos acima de 10 e com valor de orçamento (budget) e receita (revenue) acima de zero, visto que é de onde vamos levantar os questionamentos. Sem dados suficientemente preenchidos, perdemos a confiabilidade da avaliação.
dados <- dados_inicial |>
filter(
vote_count > 10,
adult == "False",
budget > 0,
revenue > 0,
!is.na(genres)
) |>
select(
-id, -runtime, -status, -adult, -backdrop_path, -homepage,
-imdb_id, -original_title, -overview, -poster_path, -tagline, -popularity
) |>
mutate(budget = as.double(budget)) |>
mutate(budget_nivel = ifelse(budget >= median(budget, na.rm = TRUE), "Alto", "Baixo"))dados |>
select(where(is.numeric)) |>
glimpse ()Rows: 8,866
Columns: 4
$ vote_average <dbl> 8.364, 8.417, 8.512, 7.573, 7.710, 7.606, 8.255, 8.438, 7…
$ vote_count <int> 34495, 32571, 30619, 29815, 29166, 28894, 27713, 27238, 2…
$ revenue <dbl> 825532764, 701729206, 1004558444, 2923706026, 1518815515,…
$ budget <dbl> 1.60e+08, 1.65e+08, 1.85e+08, 2.37e+08, 2.20e+08, 5.80e+0…
dados |>
select(where(is.character)) |>
glimpse ()Rows: 8,866
Columns: 9
$ title <chr> "Inception", "Interstellar", "The Dark Knight", "…
$ release_date <chr> "2010-07-15", "2014-11-05", "2008-07-16", "2009-1…
$ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "…
$ genres <chr> "Action, Science Fiction, Adventure", "Adventure,…
$ production_companies <chr> "Legendary Pictures, Syncopy, Warner Bros. Pictur…
$ production_countries <chr> "United Kingdom, United States of America", "Unit…
$ spoken_languages <chr> "English, French, Japanese, Swahili", "English", …
$ keywords <chr> "rescue, mission, dream, airplane, paris, france,…
$ budget_nivel <chr> "Alto", "Alto", "Alto", "Alto", "Alto", "Alto", "…
dados |>
vis_miss(sort_miss = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))na_resumo <- dados |>
summarise(across(everything(), ~ mean(is.na(.)))) |>
pivot_longer(everything(), names_to = "variavel",
values_to = "pct_na") |>
filter(pct_na > 0) |>
arrange(desc(pct_na)) |>
mutate(pct_na_fmt = percent(pct_na, accuracy = 0.01))
na_resumo |>
kbl(col.names = c("Variável", "% NA", "% Formatado"),
align = "lrr") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
row_spec(which(na_resumo$pct_na > 0.4), background = "#FAECE7")| Variável | % NA | % Formatado |
|---|---|---|
| keywords | 0.0601173 | 6.01% |
| production_companies | 0.0136476 | 1.36% |
| production_countries | 0.0064291 | 0.64% |
| spoken_languages | 0.0010151 | 0.10% |
na_resumo |>
ggplot(aes(x = reorder(variavel, pct_na), y = pct_na)) +
geom_col(fill = "#C04828", alpha = 0.85, width = 0.7) +
geom_text(aes(label = pct_na_fmt), hjust = -0.15, size = 3.2,
color = "#73726C") +
coord_flip() +
scale_y_continuous(
labels = percent_format(),
limits = c(0, max(na_resumo$pct_na) * 1.15),
expand = c(0, 0)
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"),
panel.grid.major.y = element_blank())As variáveis com NAs não serão utilizadas neste PD, mas serão preenchidas para avaliação.
dados <- dados |>
mutate(
across(
all_of(na_resumo$variavel),
~ ifelse(is.na(.), "não disponível", as.character(.))
)
)dados |>
vis_miss(sort_miss = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))Apenas para conhecimento e exemplificação, visto que de fato os outliers existem e devem ser considerados nas produções cinematográficas, quando comparadas com suas avaliações.
dados |>
ggplot(aes(y = budget)) +
geom_boxplot(outlier.alpha = 0.3) +
scale_y_continuous(labels = dollar_format())options(scipen = 999)
q1 <- quantile(dados$budget, 0.25)
q3 <- quantile(dados$budget, 0.75)
iqr <- q3 - q1
inferior <- q1 - 1.5 * iqr
superior <- q3 + 1.5 * iqr
inferior 25%
-35000000
superior 75%
69000000
dados_num <- dados |>
select(where(is.numeric))
dados_num |>
tbl_summary()| Characteristic | N = 8,8661 |
|---|---|
| vote_average | 6.47 (5.90, 7.01) |
| vote_count | 554 (141, 1,813) |
| revenue | 18,195,890 (3,721,911, 70,000,000) |
| budget | 12,000,000 (4,000,000, 30,000,000) |
| 1 Median (Q1, Q3) | |
matriz <- cor (dados_num, use = "complete.obs")
corrplot(
matriz,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.8,
number.cex = 0.8
)O mais interessante aqui é ver que podemos questionar se os filmes que tiveram maior orçamento (budget) também tiveram maior receita (revenue). Também podemos levantar se realmente os filmes com maior receita (revenue) são realmente os mais votados na internet (vote_count).
p1 <- ggplot(dados, aes(x = vote_average)) +
geom_histogram(bins = 30)
p2 <- ggplot(dados, aes(sample = vote_average)) +
stat_qq() +
stat_qq_line()
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(vote_average) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), vote_average)
W = 0.97985, p-value = 0.000000000153
p1 <- ggplot(dados, aes(x = vote_count)) +
geom_histogram(bins = 30)
p2 <- ggplot(dados, aes(sample = vote_count)) +
stat_qq() +
stat_qq_line()
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(vote_count) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), vote_count)
W = 0.55756, p-value < 0.00000000000000022
p1 <- ggplot(dados, aes(x = revenue)) +
geom_histogram(bins = 30) +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p2 <- ggplot(dados, aes(sample = revenue)) +
stat_qq() +
stat_qq_line() +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M")) +
scale_y_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(revenue) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), revenue)
W = 0.41191, p-value < 0.00000000000000022
p1 <- ggplot(dados, aes(x = budget)) +
geom_histogram(bins = 30) +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p2 <- ggplot(dados, aes(sample = budget)) +
stat_qq() +
stat_qq_line() +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M")) +
scale_y_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(budget) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), budget)
W = 0.64568, p-value < 0.00000000000000022
Nenhuma variável seguiu perfeitamente uma distribuição normal, mas vote_average foi a que visualmente mais se aproximou.
dados_num |>
ggplot(aes(x = revenue, y = vote_count,
color = vote_average)) +
geom_point(alpha = 0.4, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#C04828", fill = "#C04828",
alpha = 0.15, linewidth = 1) +
scale_color_gradient(low = "#B5D4F4", high = "#0C447C",
name = "Nota") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(title = "Receita x Vote Count",
x = "Receita", y = "Número de avaliações") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))r_pearson <- cor(dados$revenue,
dados$vote_count,
method = "pearson")
r_spearman <- cor(dados$revenue,
dados$vote_count,
method = "spearman")
r_pearson[1] 0.745075
r_spearman[1] 0.7324397
Análise entre receita e número de votos ou orçamento indicam associação positiva forte.
dados_num |>
ggplot(aes(x = revenue, y = budget)) +
geom_point(alpha = 0.4, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#C04828", fill = "#C04828",
alpha = 0.15, linewidth = 1) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(title = "Receita x Orçamento",
x = "Receita", y = "Orçamento") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))cor(dados$revenue,
dados$budget,
method = "pearson")[1] 0.7368986
cor(dados$revenue,
dados$budget,
method = "spearman")[1] 0.7058273
dados |>
select(budget_nivel, revenue, vote_average, vote_count) |>
tbl_summary(by = budget_nivel) |>
add_p()| Characteristic | Alto N = 4,5741 |
Baixo N = 4,2921 |
p-value2 |
|---|---|---|---|
| revenue | 55,494,815 (18,593,156, 149,281,606) | 5,000,000 (1,248,962, 16,495,540) | <0.001 |
| vote_average | 6.44 (5.92, 6.98) | 6.50 (5.86, 7.06) | 0.5 |
| vote_count | 1,268 (444, 3,268) | 202 (58, 656) | <0.001 |
| 1 Median (Q1, Q3) | |||
| 2 Wilcoxon rank sum test | |||
dados |>
ggplot(aes(x = budget_nivel, y = revenue)) +
geom_boxplot(outlier.alpha = 0.3) +
stat_summary(fun = mean, geom = "point", color = "red", size = 2) +
scale_y_continuous(labels = scales::label_dollar(scale = 1/1e6, suffix = "M")) +
labs(x = "Nível de budget", y = "Receita") +
theme_minimal()t.test(revenue ~ budget_nivel, data = dados)
Welch Two Sample t-test
data: revenue by budget_nivel
t = 35.58, df = 4867.1, p-value < 0.00000000000000022
alternative hypothesis: true difference in means between group Alto and group Baixo is not equal to 0
95 percent confidence interval:
106677230 119118353
sample estimates:
mean in group Alto mean in group Baixo
129342507 16444715
wilcox.test(revenue ~ budget_nivel, data = dados)
Wilcoxon rank sum test with continuity correction
data: revenue by budget_nivel
W = 16475261, p-value < 0.00000000000000022
alternative hypothesis: true location shift is not equal to 0
Filmes com budget alto apresentaram revenue significativamente maior do que filmes com budget baixo (p < 0,001).
dados_genero <- dados |>
mutate(genero_principal = sub(",.*", "", genres))dados_genero |>
select(genero_principal, revenue, vote_average, vote_count) |>
tbl_summary(by = genero_principal) |>
add_p()| Characteristic | Action N = 1,3361 |
Adventure N = 5291 |
Animation N = 2701 |
Comedy N = 1,9391 |
Crime N = 4271 |
Documentary N = 681 |
Drama N = 2,0551 |
Family N = 1831 |
Fantasy N = 2021 |
History N = 531 |
Horror N = 6001 |
Music N = 771 |
Mystery N = 1161 |
Romance N = 2351 |
Science Fiction N = 2211 |
Thriller N = 3951 |
TV Movie N = 11 |
War N = 921 |
Western N = 671 |
p-value2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| revenue | 31,276,312 (7,257,282, 125,320,293) | 51,842,679 (9,001,826, 209,073,645) | 63,940,555 (11,373,501, 272,912,430) | 17,287,309 (3,970,078, 56,722,693) | 12,633,747 (2,062,066, 44,862,187) | 3,654,411 (293,066, 19,754,790) | 10,719,357 (2,152,738, 37,807,625) | 49,276,818 (16,951,702, 135,680,000) | 50,515,476 (8,016,708, 155,010,032) | 12,200,000 (2,998,545, 44,922,302) | 17,716,903 (3,485,000, 60,191,880) | 14,859,394 (4,358,000, 38,682,707) | 16,415,126 (2,919,250, 50,068,547) | 11,336,986 (3,280,152, 58,878,723) | 33,000,000 (5,086,909, 136,159,423) | 13,551,174 (2,250,000, 57,891,803) | 42,000,000 (42,000,000, 42,000,000) | 11,189,449 (1,724,244, 58,000,125) | 7,900,000 (2,015,000, 32,192,570) | <0.001 |
| vote_average | 6.28 (5.73, 6.84) | 6.52 (6.00, 7.08) | 6.89 (6.30, 7.36) | 6.25 (5.70, 6.76) | 6.63 (6.12, 7.17) | 6.92 (6.50, 7.35) | 6.78 (6.25, 7.24) | 6.51 (5.92, 7.04) | 6.41 (5.82, 7.04) | 6.78 (6.21, 7.25) | 6.11 (5.52, 6.58) | 6.81 (6.20, 7.36) | 6.34 (5.87, 7.02) | 6.48 (5.90, 6.97) | 6.25 (5.74, 6.88) | 6.39 (5.81, 6.88) | 6.20 (6.20, 6.20) | 6.80 (6.29, 7.26) | 6.84 (6.41, 7.37) | <0.001 |
| vote_count | 751 (158, 2,731) | 1,299 (298, 4,236) | 1,392 (289, 4,339) | 395 (102, 1,310) | 522 (154, 1,541) | 115 (40, 274) | 406 (106, 1,333) | 740 (265, 2,212) | 1,319 (231, 4,060) | 228 (74, 1,057) | 906 (344, 2,116) | 273 (95, 790) | 543 (176, 1,524) | 365 (76, 1,260) | 1,227 (316, 4,891) | 712 (230, 1,769) | 195 (195, 195) | 241 (90, 921) | 390 (183, 1,023) | <0.001 |
| 1 Median (Q1, Q3) | ||||||||||||||||||||
| 2 Kruskal-Wallis rank sum test | ||||||||||||||||||||
dados_genero |>
ggplot(aes(x = reorder(genero_principal, revenue, FUN = mean), y = revenue)) +
geom_boxplot() +
stat_summary(fun = mean, geom = "point", color = "red", size = 2) +
coord_flip() +
scale_y_continuous(labels = scales::label_dollar(scale = 1/1e6, suffix = "M")) +
labs(x = "Gênero principal", y = "Receita")kruskal.test(revenue ~ genero_principal, data = dados_genero)
Kruskal-Wallis rank sum test
data: revenue by genero_principal
Kruskal-Wallis chi-squared = 588.79, df = 18, p-value <
0.00000000000000022
Os resultados indicam que os gêneros apresentam diferenças significativas de receita, nota média e número de votos, sugerindo que o desempenho dos filmes varia conforme o gênero.
\(H_0:\) não há correlação linear entre o budget e a média das notas
\(H_1:\) há correlação linear entre o budget e a média das notas
dados |>
ggplot(aes(x = budget, y = vote_average)) +
geom_point(alpha = 0.25, color = "#185FA5", size = 1.2) +
geom_smooth(method = "lm", se = FALSE, color = "#C04828",
linewidth = 1, linetype = "solid") +
geom_smooth(method = "loess", se = FALSE, color = "#0F6E56",
linewidth = 1, linetype = "dashed") +
scale_x_continuous(labels = dollar_format(prefix = "US$", scale = 1e-3, suffix = "K")) +
labs(title = "Orçamento x Nota Média",
subtitle = "Vermelho = linear \n Verde = LOESS",
x = "Orçamento",
y = "Nota média")+
theme_minimal() +
theme(plot.title = element_text(face = "bold"))cor.test(dados$budget,
dados$vote_average,
method = "pearson")
Pearson's product-moment correlation
data: dados$budget and dados$vote_average
t = 8.0791, df = 8864, p-value = 0.0000000000000007381
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.06479709 0.10612468
sample estimates:
cor
0.08549766
cor.test(dados$budget,
dados$vote_average,
method = "spearman")
Spearman's rank correlation rho
data: dados$budget and dados$vote_average
S = 115924593131, p-value = 0.8529
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.001969864
Pearson indicou uma relação positiva muito fraca entre budget e nota média, mas Spearman não mostrou relação significativa. Na prática, isso sugere que budget e nota média quase não se relacionam.
vars_interesse <- c("revenue", "vote_count", "vote_average")
correlacoes <- vars_interesse |>
map_dfr(function(var) {
r_p <- cor.test(dados[[var]], dados$budget, method = "pearson")
r_s <- cor.test(dados[[var]], dados$budget, method = "spearman",
exact = FALSE)
tibble(
variavel = var,
pearson_r = round(r_p$estimate, 3),
spearman_r = round(r_s$estimate, 3),
p_valor = ifelse(r_p$p.value < 0.001, "<0.001", format(round(r_p$p.value, 4), nsmall = 4)),
sig = case_when(
r_p$p.value < 0.001 ~ "***",
r_p$p.value < 0.01 ~ "**",
r_p$p.value < 0.05 ~ "*",
TRUE ~ "ns"
),
interpretacao = case_when(
abs(r_p$estimate) >= 0.70 ~ "Forte",
abs(r_p$estimate) >= 0.40 ~ "Moderada",
abs(r_p$estimate) >= 0.20 ~ "Fraca",
TRUE ~ "Desprezível"
)
)
}) |>
arrange(desc(abs(pearson_r)))
correlacoes |>
kbl(col.names = c("Variável", "Pearson r", "Spearman ρ",
"p-valor", "Sig.", "Força"),
align = "lrrrcl",
caption = "Correlação com budget — ordenado por |r| de Pearson") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) |>
row_spec(which(abs(correlacoes$pearson_r) >= 0.60),
background = "#D6E4F5", bold = TRUE)| Variável | Pearson r | Spearman ρ | p-valor | Sig. | Força |
|---|---|---|---|---|---|
| revenue | 0.737 | 0.706 | <0.001 | *** | Forte |
| vote_count | 0.579 | 0.597 | <0.001 | *** | Moderada |
| vote_average | 0.085 | 0.002 | <0.001 | *** | Desprezível |
Com isso, vemos que filmes com maior budget tendem a arrecadar mais e receber mais votos, mas não necessariamente ter nota média maior.
dados_modelo <- dados |>
select(budget, revenue, vote_average, vote_count)
saveRDS(dados_modelo, "C:/Users/ts2m/OneDrive - TRANSPETRO/Documentos/Pós/04 EDA/PD/Filmes/dados.rds")