Init

library(kirkegaard)

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +

load_packages(
  #neg binom count
  MASS,
  #plotting
  patchwork,
  #bayes
  ebpm
)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## Attaching package: 'patchwork'
## 
## The following object is masked from 'package:MASS':
## 
##     area

#prefer select() from dplyr
select = dplyr::select

theme_set(theme_bw())

options(
    digits = 3
)

Data

Oscars

#oscar wins for non-US movies
#manually imported from https://en.wikipedia.org/wiki/List_of_countries_by_number_of_Academy_Awards_for_Best_International_Feature_Film
d = read_csv("data/oscar.csv") %>% 
  set_colnames(
    c("country_dirty", "wins", "nominations", "submissions", "first_submission", "last_submission")
  )

## Rows: 135 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): V1, V3, V5
## dbl (3): V2, V4, V6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#fix encoding
d %<>% mutate(
  #clean off the text in brackets
  country = str_remove(country_dirty, "\\[.*\\]"),
  
  #country is half the length of the string
  country = str_sub(country, 1, str_length(country) / 2) %>% str_trim(),
  
  #as numeric vars
  wins = as.numeric(wins),
  nominations = as.numeric(nominations), #Uruguay cheated
  submissions = as.numeric(submissions)
)

## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `nominations = as.numeric(nominations)`.
## Caused by warning:
## ! NAs introduced by coercion

#manually fix a few issues
d$country[33] = "Yugoslavia"
d$country[49] = "Georgia"
d$country[59] = "Ireland"

#ISO encoding
d$ISO3 = pu_translate(d$country)

#Uruguay 0 nominations, their 1 is cheating
d$nominations[which(d$country == "Uruguay")] = 0

#wins before
sum(d$wins)

## [1] 77

sum(d$nominations)

## [1] 353

#manually deal with merged and split countries
#Russia 3 wins 5 nominations from Soviet Union
d %>% filter(country == "Soviet Union") %>% select(wins, nominations)

d$wins[which(d$country == "Russia")] = d$wins[which(d$country == "Russia")] + 3
d$nominations[which(d$country == "Russia")] = d$nominations[which(d$country == "Russia")] + 3 + 5 #wins are also nominations
#Ukraine 1 nomination from Soviet Union
d$nominations[which(d$country == "Ukraine")] = d$nominations[which(d$country == "Ukraine")] + 1

#Czechoslovakia
#Czech Republic 1.5 wins and 4 nominations from Czechoslovakia
d %>% filter(country == "Czechoslovakia") %>% select(wins, nominations)

d$wins[which(d$country == "Czech Republic")] = d$wins[which(d$country == "Czech Republic")] + 1.5
d$nominations[which(d$country == "Czech Republic")] = d$nominations[which(d$country == "Czech Republic")] + 4 + 1.5 #wins are also nominations
#Slovakia 0.5 wins and 0 nomination from Czechoslovakia
d$wins[which(d$country == "Slovakia")] = d$wins[which(d$country == "Slovakia")] + 0.5
d$nominations[which(d$country == "Slovakia")] = d$nominations[which(d$country == "Slovakia")] + 0.5 #wins are also nominations

#Germany, merge from West and East
#west germany 1 win, 8 nominations
#east germany, 0 wins, 1 nomination
d %>% filter(country %in% c("West Germany", "East Germany")) %>% select(country, wins, nominations)

d$wins[which(d$country == "Germany")] = d$wins[which(d$country == "Germany")] + 1
d$nominations[which(d$country == "Germany")] = d$nominations[which(d$country == "Germany")] + 9  #wins are also nominations

#Yugoslavia
#6 nominations, 0 wins, 3 Serbia, 1 Montenegro, 1 Slovenia, 1 Italy
d$nominations[which(d$country == "Serbia")] = d$nominations[which(d$country == "Serbia")] + 3
d$nominations[which(d$country == "Montenegro")] = d$nominations[which(d$country == "Montenegro")] + 1
d$nominations[which(d$country == "Slovenia")] = d$nominations[which(d$country == "Slovenia")] + 1
d$nominations[which(d$country == "Italy")] = d$nominations[which(d$country == "Italy")] + 1

#remove these cases
d = d %>% filter(!country %in% c("Soviet Union", "Yugoslavia", "Czechoslovakia", "East Germany", "West Germany"))

#wins after, must be the same
sum(d$wins)

## [1] 77

sum(d$nominations)

## [1] 353

Covariates

#population size
pop = read_csv("data/population.csv") %>% 
  set_colnames(c("country", "year", "population")) %>% 
  filter(year == 2023)

## Rows: 18944 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Entity
## dbl (2): Year, Population - Sex: all - Age: all - Variant: estimates
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#ISO3
pop$ISO3 = pu_translate(pop$country, fuzzy = F)

## No exact match: Africa (UN)
## No exact match: Americas (UN)
## No exact match: Asia (UN)
## No exact match: Bonaire Sint Eustatius and Saba
## No exact match: Europe (UN)
## No exact match: High-income countries
## No exact match: Land-locked developing countries (LLDC)
## No exact match: Latin America and the Caribbean (UN)
## No exact match: Least developed countries
## No exact match: Less developed regions
## No exact match: Less developed regions, excluding China
## No exact match: Less developed regions, excluding least developed countries
## No exact match: Low-income countries
## No exact match: Lower-middle-income countries
## No exact match: Micronesia (country)
## No exact match: More developed regions
## No exact match: Northern America (UN)
## No exact match: Oceania (UN)
## No exact match: Small island developing states (SIDS)
## No exact match: Upper-middle-income countries
## No exact match: Vatican
## No exact match: World

#population in millions
pop$population_m = pop$population / 1e6

Join data

d = d %>% left_join(pop %>% select(ISO3, population_m), by = "ISO3")

#double the count to avoid half-values
d$wins2 = d$wins * 2
d$nominations2 = d$nominations * 2

#log counts
d$log_wins = log(d$wins + 1)
d$log_nominations = log(d$nominations + 1)

#simple per million adjustment
d$wins_per_million = d$wins / d$population_m
d$nominations_per_million = d$nominations / d$population_m

Analysis

Simple population model

#wins per capita
p_wins_pm = d %>% arrange(desc(wins_per_million)) %>% head(20) %>% 
  mutate(
    #sort country by highest wins per million
    country = fct_reorder(country, wins_per_million)
  ) %>% 
  ggplot(aes(wins_per_million, country)) +
  geom_bar(stat = "identity")

#nominations per capita
p_noms_pm = d %>% arrange(desc(nominations_per_million)) %>% head(20) %>% 
  mutate(
    #sort country by highest nominations per million
    country = fct_reorder(country, nominations_per_million)
  ) %>% 
  ggplot(aes(nominations_per_million, country)) +
  geom_bar(stat = "identity")

#plot together
patchwork::wrap_plots(p_wins_pm, p_noms_pm + theme(axis.title.y = element_blank()), ncol = 2) +
  plot_annotation(
    title = "Oscar wins and nominations per million inhabitants"
  )

GG_save("figs/oscar foreign wins and nominations per million.png")

#pm for both
d %>% 
  GG_scatter("wins_per_million", "nominations_per_million", case_names = "country")

## `geom_smooth()` using formula = 'y ~ x'

Top values

#simulate two populations, one with fixed size of 100, the other with varying size, and calculate how many of the top 10 are from the second population
set.seed(1)
extreme_values_top10 = map_dfr(c(100, 250, 500, 1000, 1500, 2000) %>% rep(1000), function(n2) {
  #sim sample 1
  dd = tibble(
    pop = rep(c(1, 2), c(100, n2)),
    values = c(rnorm(100), rnorm(n2))
  ) %>% arrange(desc(values))
  
  tibble(
    n2 = n2,
    top10_frac = mean(dd$pop[1:10] == 2)
  )
})

#plot
p_extreme_vals1 = extreme_values_top10 %>% 
  group_by(n2) %>%
  summarise(top10_frac = mean(top10_frac)) %>%
  ggplot(aes(n2, top10_frac)) +
  geom_line() +
  scale_y_continuous(labels = scales::percent, limits = c(0.5, 1)) +
  labs(
    x = "Sample size of sample 2",
    y = "% of top 10 from second population"
  )

#as proportion of total population restores linearity
p_extreme_vals2 = extreme_values_top10 %>% 
  group_by(n2) %>%
  summarise(top10_frac = mean(top10_frac), n2_prop_of_total = n2[1]/(100+n2[1])) %>%
  ggplot(aes(n2_prop_of_total, top10_frac)) +
  geom_line() +
  scale_x_continuous(labels = scales::percent, limits = c(0.5, 1)) +
  scale_y_continuous(labels = scales::percent, limits = c(0.5, 1)) +
  labs(
    x = "Sample 2 share of total population",
    y = "% of top 10 from second population"
  )

p_extreme_vals1 + p_extreme_vals2

GG_save("figs/extreme values in top 10.png")

Sampling error issues

#simulate varying sample sizes of normal distribution, compute the mean
set.seed(1)
resamples = map_dfr(1:10000, ~{
  #sample size
  n = sample(5:200, 1)
  
  #sample
  x = rnorm(n, 0, 1)
  
  #return mean
  tibble(n = n, 
         mean = mean(x)
         )
})

#plot them
resamples %>% ggplot(aes(n, mean)) +
  geom_point(alpha = 0.2) +
  #ribbon for quantiles
  geom_ribbon(mapping = aes(
    ymin = quantile_smooth(resamples$n, resamples$mean, quantile = .10, method = "Rq"),
    ymax = quantile_smooth(resamples$n, resamples$mean, quantile = .90, method = "Rq")
  ), alpha = 0.4) +
  labs(
    title = "Sampling error and extreme estimates of the mean",
    x = "Sample size",
    y = "Mean"
  )

## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5

## Warning: Use of `resamples$n` is discouraged.
## ℹ Use `n` instead.

## Warning: Use of `resamples$mean` is discouraged.
## ℹ Use `mean` instead.

## Warning: Use of `resamples$n` is discouraged.
## ℹ Use `n` instead.

## Warning: Use of `resamples$mean` is discouraged.
## ℹ Use `mean` instead.

GG_save("figs/sampling error simulation.png")

## number of knots in rcs defaulting to 5
## number of knots in rcs defaulting to 5

## Warning: Use of `resamples$n` is discouraged.
## ℹ Use `n` instead.
## Use of `resamples$mean` is discouraged.
## ℹ Use `mean` instead.

## Warning: Use of `resamples$n` is discouraged.
## ℹ Use `n` instead.

## Warning: Use of `resamples$mean` is discouraged.
## ℹ Use `mean` instead.

Models

#simple expectation model using worldwide values
global_rates = d %>% summarise(
  wins = sum(wins),
  nominations = sum(nominations),
  population_m = sum(population_m)
) %>% mutate(
  wins_per_million = wins / population_m,
  nominations_per_million = nominations / population_m
)

global_rates

#add global rate predictions
d$wins_global_pred = global_rates$wins_per_million * d$population_m
d$nominations_global_pred = global_rates$nominations_per_million * d$population_m

#residuals
d$wins_global_resid = d$wins - d$wins_global_pred
d$nominations_global_resid = d$nominations - d$nominations_global_pred

#resid per million
d$wins_global_resid_pm = d$wins_global_resid / d$population_m
d$nominations_global_resid_pm = d$nominations_global_resid / d$population_m

#make a table
d %>% arrange(-nominations_global_resid) %>% select(country, nominations, population_m, nominations_global_pred, nominations_global_resid, nominations_global_resid_pm) %>% 
  head(20) %>% 
  knitr::kable(caption = "Global model residuals for nominations")

Global model residuals for nominations
country	nominations	population_m	nominations_global_pred	nominations_global_resid	nominations_global_resid_pm
France	42.0	66.44	3.192	38.81	0.584
Italy	34.0	59.50	2.859	31.14	0.523
Germany	23.0	84.55	4.063	18.94	0.224
Spain	21.0	47.91	2.302	18.70	0.390
Sweden	16.0	10.55	0.507	15.49	1.468
Denmark	15.0	5.95	0.286	14.71	2.474
Japan	18.0	124.37	5.976	12.02	0.097
Poland	13.0	38.76	1.863	11.14	0.287
Israel	10.0	9.26	0.445	9.55	1.032
Hungary	10.0	9.69	0.465	9.54	0.984
Russia	15.0	145.44	6.988	8.01	0.055
Czech Republic	8.5	10.81	0.519	7.98	0.738
Belgium	8.0	11.71	0.563	7.44	0.635
Netherlands	7.0	18.09	0.869	6.13	0.339
Argentina	8.0	45.54	2.188	5.81	0.128
Norway	6.0	5.52	0.265	5.74	1.039
Canada	7.0	39.30	1.888	5.11	0.130
Switzerland	5.0	8.87	0.426	4.57	0.516
Greece	5.0	10.24	0.492	4.51	0.440
Austria	4.0	9.13	0.439	3.56	0.390

#highest excess wins per million?
d %>% arrange(-nominations_global_resid_pm) %>% select(country, nominations, population_m, nominations_global_pred, nominations_global_resid, nominations_global_resid_pm) %>% 
  head(20)

#compare
d %>% 
  GG_scatter("nominations_global_pred", "nominations", case_names = "country", text_pos = "tr")

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/nominations global model.png")

## `geom_smooth()` using formula = 'y ~ x'

#Poisson model
oscar_noms_poisson = glm(nominations2 ~ log(population_m), data = d, family = poisson)
oscar_noms_poisson %>% summary()

## 
## Call:
## glm(formula = nominations2 ~ log(population_m), family = poisson, 
##     data = d)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         0.7537     0.0898    8.39   <2e-16 ***
## log(population_m)   0.3031     0.0234   12.94   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1973.7  on 129  degrees of freedom
## Residual deviance: 1808.2  on 128  degrees of freedom
## AIC: 2039
## 
## Number of Fisher Scoring iterations: 6

#save residuals and predictions
d$oscar_noms_poisson_resid = residuals(oscar_noms_poisson) / 2
d$oscar_noms_poisson_pred = predict(oscar_noms_poisson, type = "response") / 2

#negative binomial model
oscar_noms_negbin = MASS::glm.nb(nominations2 ~ log(population_m), data = d)
oscar_noms_negbin %>% summary()

## 
## Call:
## MASS::glm.nb(formula = nominations2 ~ log(population_m), data = d, 
##     init.theta = 0.225750888, link = log)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          0.322      0.386    0.84  0.40365    
## log(population_m)    0.441      0.121    3.64  0.00027 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(0.226) family taken to be 1)
## 
##     Null deviance: 126.09  on 129  degrees of freedom
## Residual deviance: 116.27  on 128  degrees of freedom
## AIC: 589.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  0.2258 
##           Std. Err.:  0.0364 
## 
##  2 x log-likelihood:  -583.3220

#save residuals and predictions
d$oscar_noms_negbin_resid = residuals(oscar_noms_negbin) / 2
d$oscar_noms_negbin_pred = predict(oscar_noms_negbin, type = "response") / 2

#log(1+count) OLS
oscar_noms_ols = lm(log_nominations ~ log(population_m), data = d)
oscar_noms_ols %>% summary()

## 
## Call:
## lm(formula = log_nominations ~ log(population_m), data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -1.036 -0.719 -0.320  0.407  2.892 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)  
## (Intercept)         0.3831     0.1590    2.41    0.017 *
## log(population_m)   0.1159     0.0505    2.30    0.023 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.926 on 128 degrees of freedom
## Multiple R-squared:  0.0396, Adjusted R-squared:  0.0321 
## F-statistic: 5.27 on 1 and 128 DF,  p-value: 0.0233

#save residuals and predictions
d$oscar_noms_ols_resid = residuals(oscar_noms_ols)
d$oscar_noms_ols_pred = predict(oscar_noms_ols)

#correlations
d %>% select(nominations, log_nominations, contains("_pred")) %>% cor_matrix()

##                         nominations log_nominations wins_global_pred
## nominations                  1.0000          0.8638           0.0362
## log_nominations              0.8638          1.0000           0.0959
## wins_global_pred             0.0362          0.0959           1.0000
## nominations_global_pred      0.0362          0.0959           1.0000
## oscar_noms_poisson_pred      0.1739          0.1761           0.7844
## oscar_noms_negbin_pred       0.1432          0.1585           0.8760
## oscar_noms_ols_pred          0.2111          0.1989           0.5357
##                         nominations_global_pred oscar_noms_poisson_pred
## nominations                              0.0362                   0.174
## log_nominations                          0.0959                   0.176
## wins_global_pred                         1.0000                   0.784
## nominations_global_pred                  1.0000                   0.784
## oscar_noms_poisson_pred                  0.7844                   1.000
## oscar_noms_negbin_pred                   0.8760                   0.985
## oscar_noms_ols_pred                      0.5357                   0.925
##                         oscar_noms_negbin_pred oscar_noms_ols_pred
## nominations                              0.143               0.211
## log_nominations                          0.159               0.199
## wins_global_pred                         0.876               0.536
## nominations_global_pred                  0.876               0.536
## oscar_noms_poisson_pred                  0.985               0.925
## oscar_noms_negbin_pred                   1.000               0.850
## oscar_noms_ols_pred                      0.850               1.000

#how does it look like
d %>% 
  GG_scatter("oscar_noms_ols_pred", "log_nominations", case_names = "country")

## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/nominations log OLS model.png")

## `geom_smooth()` using formula = 'y ~ x'

#compare with poisson and negbin
p_poisson = d %>% 
  GG_scatter("oscar_noms_poisson_pred", "nominations", case_names = "country", text_pos = "tr") +
  labs(title = "Poisson model")

p_negbin = d %>%
  GG_scatter("oscar_noms_negbin_pred", "nominations", case_names = "country", text_pos = "tr") +
  labs(title = "Negative binomial model")

patchwork::wrap_plots(p_poisson, p_negbin, ncol = 2) +
  plot_annotation(
    title = "Poisson and negative binomial models for Oscar nominations"
  )

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/nominations poisson and negbin models.png")

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Bayesian models

fit_ebayes = ebpm_gamma(d$nominations, d$population_m)
fit_ebayes %>% summary()

##                Length Class       Mode   
## fitted_g       3      point_gamma list   
## posterior      2      data.frame  list   
## log_likelihood 1      -none-      numeric

d$nominations_post = fit_ebayes$posterior$mean

#how do these compare to raw pm?
d %>% 
  select(country, population_m, nominations, nominations_global_pred, nominations_global_resid, nominations_post, nominations_per_million) %>% 
  arrange(-nominations_post)

#raw and posterior nominations per million
d %>% 
  arrange(-nominations_post) %>%
  head(30) %>% 
  mutate(
    country = fct_reorder(country, nominations_per_million)
  ) %>% 
  select(country, nominations_per_million, nominations_post) %>%
  pivot_longer(-country) %>%
  ggplot(aes(value, country, fill = name)) +
  geom_bar(stat = "identity", position = "dodge") +
  scale_fill_discrete(labels = c("Raw per million", "Bayesian posterior")) +
  labs(
    title = "Raw and posterior nominations per million inhabitants",
    x = "Nominations per million",
    fill = "Type of estimate"
  )

GG_save("figs/nominations raw and posterior per million.png")

Which country makes the best movies per capita?

08 February, 2025

Init

Data

Oscars

Covariates

Join data

Analysis

Simple population model

Top values

Sampling error issues

Models

Bayesian models

Meta