Code
library(tidyverse)
library(corrplot)
library(gtsummary)
library(patchwork)
library(kableExtra)
library(visdat)
library(scales)Rpubs: https://rpubs.com/riquebaron/Filmes
Shiny App: https://riquebaron.shinyapps.io/Filmes/
library(tidyverse)
library(corrplot)
library(gtsummary)
library(patchwork)
library(kableExtra)
library(visdat)
library(scales)Escolhida a base de dados do TMDB por interesse pessoal em projetos cinematográficos, seus orçamentos e receitas em comparação com a avaliação final e popularidade. https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies/data
dados_inicial <- read.csv("C:/Users/ts2m/OneDrive - TRANSPETRO/Documentos/Pós/04 EDA/PD/TMDB_movie_dataset_v11.csv", na = "")dados_inicial |>
select(where(is.numeric)) |>
glimpse ()Rows: 1,393,169
Columns: 7
$ id <int> 27205, 157336, 155, 19995, 24428, 293660, 299536, 550, 11…
$ vote_average <dbl> 8.364, 8.417, 8.512, 7.573, 7.710, 7.606, 8.255, 8.438, 7…
$ vote_count <int> 34495, 32571, 30619, 29815, 29166, 28894, 27713, 27238, 2…
$ revenue <dbl> 825532764, 701729206, 1004558444, 2923706026, 1518815515,…
$ runtime <int> 148, 169, 152, 162, 143, 108, 149, 139, 121, 154, 142, 15…
$ budget <int> 160000000, 165000000, 185000000, 237000000, 220000000, 58…
$ popularity <dbl> 83.952, 140.241, 130.643, 79.932, 98.082, 72.735, 154.340…
dados_inicial |>
select(where(is.character)) |>
glimpse ()Rows: 1,393,169
Columns: 17
$ title <chr> "Inception", "Interstellar", "The Dark Knight", "…
$ status <chr> "Released", "Released", "Released", "Released", "…
$ release_date <chr> "2010-07-15", "2014-11-05", "2008-07-16", "2009-1…
$ adult <chr> "False", "False", "False", "False", "False", "Fal…
$ backdrop_path <chr> "/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg", "/pbrkL804c8y…
$ homepage <chr> "https://www.warnerbros.com/movies/inception", "h…
$ imdb_id <chr> "tt1375666", "tt0816692", "tt0468569", "tt0499549…
$ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "…
$ original_title <chr> "Inception", "Interstellar", "The Dark Knight", "…
$ overview <chr> "Cobb, a skilled thief who commits corporate espi…
$ poster_path <chr> "/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg", "/gEU2QniE6E7…
$ tagline <chr> "Your mind is the scene of the crime.", "Mankind …
$ genres <chr> "Action, Science Fiction, Adventure", "Adventure,…
$ production_companies <chr> "Legendary Pictures, Syncopy, Warner Bros. Pictur…
$ production_countries <chr> "United Kingdom, United States of America", "Unit…
$ spoken_languages <chr> "English, French, Japanese, Swahili", "English", …
$ keywords <chr> "rescue, mission, dream, airplane, paris, france,…
Limpeza da lista dos filmes com poucos dados usando apenas a quantidade de votos acima de 10 e com valor de orçamento (budget) e receita (revenue) acima de zero, visto que é de onde vamos levantar os questionamentos. Sem dados suficientemente preenchidos, perdemos a confiabilidade da avaliação.
dados <- dados_inicial |>
filter(
vote_count > 10,
adult == "False",
budget > 0,
revenue > 0
) |>
select(
-id, -runtime, -status, -adult, -backdrop_path, -homepage,
-imdb_id, -original_title, -overview, -poster_path, -tagline, -popularity
) |>
mutate(budget = as.double(budget)) |>
mutate(budget_nivel = ifelse(budget >= median(budget, na.rm = TRUE), "Alto", "Baixo"))dados |>
select(where(is.numeric)) |>
glimpse ()Rows: 8,870
Columns: 4
$ vote_average <dbl> 8.364, 8.417, 8.512, 7.573, 7.710, 7.606, 8.255, 8.438, 7…
$ vote_count <int> 34495, 32571, 30619, 29815, 29166, 28894, 27713, 27238, 2…
$ revenue <dbl> 825532764, 701729206, 1004558444, 2923706026, 1518815515,…
$ budget <dbl> 1.60e+08, 1.65e+08, 1.85e+08, 2.37e+08, 2.20e+08, 5.80e+0…
dados |>
select(where(is.character)) |>
glimpse ()Rows: 8,870
Columns: 9
$ title <chr> "Inception", "Interstellar", "The Dark Knight", "…
$ release_date <chr> "2010-07-15", "2014-11-05", "2008-07-16", "2009-1…
$ original_language <chr> "en", "en", "en", "en", "en", "en", "en", "en", "…
$ genres <chr> "Action, Science Fiction, Adventure", "Adventure,…
$ production_companies <chr> "Legendary Pictures, Syncopy, Warner Bros. Pictur…
$ production_countries <chr> "United Kingdom, United States of America", "Unit…
$ spoken_languages <chr> "English, French, Japanese, Swahili", "English", …
$ keywords <chr> "rescue, mission, dream, airplane, paris, france,…
$ budget_nivel <chr> "Alto", "Alto", "Alto", "Alto", "Alto", "Alto", "…
dados |>
vis_miss(sort_miss = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))na_resumo <- dados |>
summarise(across(everything(), ~ mean(is.na(.)))) |>
pivot_longer(everything(), names_to = "variavel",
values_to = "pct_na") |>
filter(pct_na > 0) |>
arrange(desc(pct_na)) |>
mutate(pct_na_fmt = percent(pct_na, accuracy = 0.01))
na_resumo |>
kbl(col.names = c("Variável", "% NA", "% Formatado"),
align = "lrr") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = FALSE) |>
row_spec(which(na_resumo$pct_na > 0.4), background = "#FAECE7")| Variável | % NA | % Formatado |
|---|---|---|
| keywords | 0.0605411 | 6.05% |
| production_companies | 0.0139797 | 1.40% |
| production_countries | 0.0067644 | 0.68% |
| spoken_languages | 0.0012401 | 0.12% |
| genres | 0.0004510 | 0.05% |
| release_date | 0.0003382 | 0.03% |
na_resumo |>
ggplot(aes(x = reorder(variavel, pct_na), y = pct_na)) +
geom_col(fill = "#C04828", alpha = 0.85, width = 0.7) +
geom_text(aes(label = pct_na_fmt), hjust = -0.15, size = 3.2,
color = "#73726C") +
coord_flip() +
scale_y_continuous(
labels = percent_format(),
limits = c(0, max(na_resumo$pct_na) * 1.15),
expand = c(0, 0)
) +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"),
panel.grid.major.y = element_blank())As variáveis com NAs não serão utilizadas neste PD, mas serão preenchidas para avaliação.
dados <- dados |>
mutate(
across(
all_of(na_resumo$variavel),
~ ifelse(is.na(.), "não disponível", as.character(.))
)
)dados |>
vis_miss(sort_miss = TRUE) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, size = 9))Apenas para conhecimento e exemplificação, visto que de fato os outliers existem e devem ser considerados nas produções cinematográficas, quando comparadas com suas avaliações.
dados |>
ggplot(aes(y = budget)) +
geom_boxplot(outlier.alpha = 0.3) +
scale_y_continuous(labels = dollar_format())options(scipen = 999)
q1 <- quantile(dados$budget, 0.25)
q3 <- quantile(dados$budget, 0.75)
iqr <- q3 - q1
inferior <- q1 - 1.5 * iqr
superior <- q3 + 1.5 * iqr
inferior 25%
-35000000
superior 75%
69000000
dados_num <- dados |>
select(where(is.numeric))
dados_num |>
tbl_summary()| Characteristic | N = 8,8701 |
|---|---|
| vote_average | 6.47 (5.90, 7.01) |
| vote_count | 553 (141, 1,813) |
| revenue | 18,162,535 (3,716,598, 70,000,000) |
| budget | 12,000,000 (4,000,000, 30,000,000) |
| 1 Median (Q1, Q3) | |
matriz <- cor (dados_num, use = "complete.obs")
corrplot(
matriz,
method = "color",
type = "upper",
addCoef.col = "black",
tl.col = "black",
tl.srt = 45,
tl.cex = 0.8,
number.cex = 0.8
)O mais interessante aqui é ver que podemos questionar se os filmes que tiveram maior orçamento (budget) também tiveram maior receita (revenue). Também podemos levantar se realmente os filmes com maior receita (revenue) são realmente os mais votados na internet (vote_count).
p1 <- ggplot(dados, aes(x = vote_average)) +
geom_histogram(bins = 30)
p2 <- ggplot(dados, aes(sample = vote_average)) +
stat_qq() +
stat_qq_line()
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(vote_average) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), vote_average)
W = 0.98155, p-value = 0.0000000006088
p1 <- ggplot(dados, aes(x = vote_count)) +
geom_histogram(bins = 30)
p2 <- ggplot(dados, aes(sample = vote_count)) +
stat_qq() +
stat_qq_line()
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(vote_count) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), vote_count)
W = 0.56561, p-value < 0.00000000000000022
p1 <- ggplot(dados, aes(x = revenue)) +
geom_histogram(bins = 30) +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p2 <- ggplot(dados, aes(sample = revenue)) +
stat_qq() +
stat_qq_line() +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M")) +
scale_y_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(revenue) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), revenue)
W = 0.43615, p-value < 0.00000000000000022
p1 <- ggplot(dados, aes(x = budget)) +
geom_histogram(bins = 30) +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p2 <- ggplot(dados, aes(sample = budget)) +
stat_qq() +
stat_qq_line() +
scale_x_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M")) +
scale_y_continuous(labels = label_dollar(scale = 1/1000000, suffix = "M"))
p1 + p2dados |>
slice_sample(n = 1000) |>
pull(budget) |>
shapiro.test()
Shapiro-Wilk normality test
data: pull(slice_sample(dados, n = 1000), budget)
W = 0.68471, p-value < 0.00000000000000022
Nenhuma variável seguiu perfeitamente uma distribuição normal, mas vote_average foi a que visualmente mais se aproximou.
dados_num |>
ggplot(aes(x = revenue, y = vote_count,
color = vote_average)) +
geom_point(alpha = 0.4, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#C04828", fill = "#C04828",
alpha = 0.15, linewidth = 1) +
scale_color_gradient(low = "#B5D4F4", high = "#0C447C",
name = "Nota") +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(title = "Revenue x Vote Count",
x = "Revenue", y = "Número de avaliações") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))r_pearson <- cor(dados$revenue,
dados$vote_count,
method = "pearson")
r_spearman <- cor(dados$revenue,
dados$vote_count,
method = "spearman")
r_pearson[1] 0.745102
r_spearman[1] 0.732704
Análise entre receita e número de votos ou orçamento indicam associação positiva forte.
dados_num |>
ggplot(aes(x = revenue, y = budget)) +
geom_point(alpha = 0.4, size = 1.5) +
geom_smooth(method = "lm", se = TRUE,
color = "#C04828", fill = "#C04828",
alpha = 0.15, linewidth = 1) +
scale_x_continuous(labels = comma) +
scale_y_continuous(labels = comma) +
labs(title = "Revenue x Budget",
x = "Revenue", y = "Budget") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(face = "bold"))cor(dados$revenue,
dados$budget,
method = "pearson")[1] 0.7369269
cor(dados$revenue,
dados$budget,
method = "spearman")[1] 0.7061715
dados |>
select(budget_nivel, revenue, vote_average, vote_count) |>
tbl_summary(by = budget_nivel) |>
add_p()| Characteristic | Alto N = 4,5741 |
Baixo N = 4,2961 |
p-value2 |
|---|---|---|---|
| revenue | 55,494,815 (18,593,156, 149,281,606) | 5,000,000 (1,245,051, 16,458,776) | <0.001 |
| vote_average | 6.44 (5.92, 6.98) | 6.50 (5.85, 7.06) | 0.5 |
| vote_count | 1,268 (444, 3,268) | 201 (57, 655) | <0.001 |
| 1 Median (Q1, Q3) | |||
| 2 Wilcoxon rank sum test | |||
dados |>
ggplot(aes(x = budget_nivel, y = revenue)) +
geom_boxplot()t.test(revenue ~ budget_nivel, data = dados)
Welch Two Sample t-test
data: revenue by budget_nivel
t = 35.586, df = 4866.7, p-value < 0.00000000000000022
alternative hypothesis: true difference in means between group Alto and group Baixo is not equal to 0
95 percent confidence interval:
106692332 119133129
sample estimates:
mean in group Alto mean in group Baixo
129342507 16429776
wilcox.test(revenue ~ budget_nivel, data = dados)
Wilcoxon rank sum test with continuity correction
data: revenue by budget_nivel
W = 16493363, p-value < 0.00000000000000022
alternative hypothesis: true location shift is not equal to 0
Filmes com budget alto apresentaram revenue significativamente maior do que filmes com budget baixo (p < 0,001).
dados_genero <- dados |>
separate_rows(genres, sep = ",\\s*")dados_genero |>
select(genres, revenue, vote_average, vote_count) |>
tbl_summary(by = genres) |>
add_p()| Characteristic | Action N = 2,1981 |
Adventure N = 1,5221 |
Animation N = 4871 |
Comedy N = 3,2191 |
Crime N = 1,3831 |
Documentary N = 951 |
Drama N = 4,1251 |
Family N = 8991 |
Fantasy N = 8581 |
History N = 4701 |
Horror N = 1,0221 |
Music N = 3191 |
Mystery N = 7671 |
não disponível N = 41 |
Romance N = 1,6891 |
Science Fiction N = 9391 |
Thriller N = 2,2261 |
TV Movie N = 131 |
War N = 3411 |
Western N = 1521 |
p-value2 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| revenue | 35,147,235 (8,000,000, 136,333,522) | 59,544,841 (12,000,000, 215,663,859) | 65,146,020 (12,000,000, 250,397,798) | 21,630,088 (4,634,062, 74,151,346) | 17,986,781 (3,500,000, 59,284,015) | 4,074,023 (589,244, 22,730,842) | 12,638,526 (2,520,000, 46,122,355) | 50,500,000 (12,506,188, 176,104,344) | 49,009,522 (9,938,268, 179,179,718) | 17,817,433 (4,000,000, 58,972,904) | 14,572,091 (2,228,115, 53,785,551) | 14,859,394 (2,956,000, 44,922,302) | 18,196,170 (4,000,000, 68,349,884) | 253 (3, 800,250) | 14,822,346 (3,355,000, 54,830,779) | 32,589,624 (5,735,963, 159,773,545) | 20,581,985 (4,298,184, 76,066,841) | 110,000 (37, 8,900,000) | 16,361,885 (3,470,487, 68,129,518) | 9,673,089 (2,917,300, 38,632,139) | <0.001 |
| vote_average | 6.32 (5.80, 6.87) | 6.46 (5.90, 7.05) | 6.83 (6.24, 7.30) | 6.30 (5.76, 6.83) | 6.50 (5.98, 7.00) | 6.88 (6.46, 7.35) | 6.69 (6.15, 7.19) | 6.50 (5.87, 7.07) | 6.50 (5.90, 7.10) | 6.98 (6.50, 7.35) | 6.09 (5.50, 6.59) | 6.68 (6.13, 7.16) | 6.40 (5.90, 6.91) | 5.65 (5.25, 5.95) | 6.50 (5.96, 6.99) | 6.29 (5.74, 6.90) | 6.33 (5.81, 6.86) | 5.96 (5.52, 6.48) | 6.83 (6.33, 7.37) | 6.67 (6.10, 7.16) | <0.001 |
| vote_count | 866 (189, 3,040) | 1,220 (292, 4,069) | 1,349 (320, 4,339) | 479 (130, 1,556) | 608 (169, 1,770) | 118 (37, 326) | 425 (113, 1,385) | 888 (235, 3,024) | 1,258 (328, 3,957) | 441 (141, 1,533) | 779 (280, 1,925) | 271 (85, 823) | 824 (235, 1,985) | 20 (18, 32) | 363 (86, 1,276) | 1,212 (355, 4,378) | 823 (242, 2,070) | 55 (21, 182) | 450 (142, 1,436) | 467 (136, 1,086) | <0.001 |
| 1 Median (Q1, Q3) | |||||||||||||||||||||
| 2 Kruskal-Wallis rank sum test | |||||||||||||||||||||
dados_genero |>
ggplot(aes(x = genres, y = revenue)) +
geom_boxplot() +
coord_flip()kruskal.test(revenue ~ genres, data = dados_genero)
Kruskal-Wallis rank sum test
data: revenue by genres
Kruskal-Wallis chi-squared = 1336.4, df = 19, p-value <
0.00000000000000022
Os resultados indicam que os gêneros apresentam diferenças significativas de receita, nota média e número de votos, sugerindo que o desempenho dos filmes varia conforme o gênero.
\(H_0:\) não há correlação linear entre o budget e a média das notas
\(H_1:\) há correlação linear entre o budget e a média das notas
dados |>
ggplot(aes(x = budget, y = vote_average)) +
geom_point(alpha = 0.25, color = "#185FA5", size = 1.2) +
geom_smooth(method = "lm", se = FALSE, color = "#C04828",
linewidth = 1, linetype = "solid") +
geom_smooth(method = "loess", se = FALSE, color = "#0F6E56",
linewidth = 1, linetype = "dashed") +
scale_x_continuous(labels = dollar_format(prefix = "US$", scale = 1e-3, suffix = "K")) +
labs(title = "Budget x Nota Média",
subtitle = "Vermelho = linear \n Verde = LOESS",
x = "Budget",
y = "Nota média")+
theme_minimal() +
theme(plot.title = element_text(face = "bold"))cor.test(dados$budget,
dados$vote_average,
method = "pearson")
Pearson's product-moment correlation
data: dados$budget and dados$vote_average
t = 8.105, df = 8868, p-value = 0.0000000000000005975
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.06505553 0.10637199
sample estimates:
cor
0.08575062
cor.test(dados$budget,
dados$vote_average,
method = "spearman")
Spearman's rank correlation rho
data: dados$budget and dados$vote_average
S = 115984665544, p-value = 0.7918
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
0.002802983
Pearson indicou uma relação positiva muito fraca entre budget e nota média, mas Spearman não mostrou relação significativa. Na prática, isso sugere que budget e nota média quase não se relacionam.
vars_interesse <- c("revenue", "vote_count", "vote_average")
correlacoes <- vars_interesse |>
map_dfr(function(var) {
r_p <- cor.test(dados[[var]], dados$budget, method = "pearson")
r_s <- cor.test(dados[[var]], dados$budget, method = "spearman",
exact = FALSE)
tibble(
variavel = var,
pearson_r = round(r_p$estimate, 3),
spearman_r = round(r_s$estimate, 3),
p_valor = ifelse(r_p$p.value < 0.001, "<0.001", format(round(r_p$p.value, 4), nsmall = 4)),
sig = case_when(
r_p$p.value < 0.001 ~ "***",
r_p$p.value < 0.01 ~ "**",
r_p$p.value < 0.05 ~ "*",
TRUE ~ "ns"
),
interpretacao = case_when(
abs(r_p$estimate) >= 0.70 ~ "Forte",
abs(r_p$estimate) >= 0.40 ~ "Moderada",
abs(r_p$estimate) >= 0.20 ~ "Fraca",
TRUE ~ "Desprezível"
)
)
}) |>
arrange(desc(abs(pearson_r)))
correlacoes |>
kbl(col.names = c("Variável", "Pearson r", "Spearman ρ",
"p-valor", "Sig.", "Força"),
align = "lrrrcl",
caption = "Correlação com budget — ordenado por |r| de Pearson") |>
kable_styling(bootstrap_options = c("striped", "hover"), full_width = TRUE) |>
row_spec(which(abs(correlacoes$pearson_r) >= 0.60),
background = "#D6E4F5", bold = TRUE)| Variável | Pearson r | Spearman ρ | p-valor | Sig. | Força |
|---|---|---|---|---|---|
| revenue | 0.737 | 0.706 | <0.001 | *** | Forte |
| vote_count | 0.579 | 0.597 | <0.001 | *** | Moderada |
| vote_average | 0.086 | 0.003 | <0.001 | *** | Desprezível |
Com isso, vemos que filmes com maior budget tendem a arrecadar mais e receber mais votos, mas não necessariamente ter nota média maior.
dados_modelo <- dados |>
select(budget, revenue, vote_average, vote_count)
saveRDS(dados_modelo, "C:/Users/ts2m/OneDrive - TRANSPETRO/Documentos/Pós/04 EDA/PD/Filmes/dados.rds")