#Start

options(digits = 3)
library(pacman)
p_load(kirkegaard, rvest, rms, ISOcodes, metafor, broom)
theme_set(theme_bw())
source("R/functions.R")

#Data

Chess

I downlaoded these from https://2700chess.com/all-fide-players using a curl command fetched using Firefox’s great networking tool (F12). The site’s UI only allows one to select the top 100 players. I figured that this was just a UI limitation, so I modified the url request POST param to fetch more. This can be done with R’s tools (e.g. curl where one has to set a billion headers), but it is bothersome compared to CLI curl.

#call curl
#only works on unix
if (!file.exists("data/fide.html")) {
  system("curl 'https://2700chess.com/all-fide-players' -H 'Host: 2700chess.com' -H 'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0' -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' -H 'Accept-Language: en-US,en;q=0.5' --compressed -H 'Referer: https://2700chess.com/all-fide-players' -H 'Content-Type: application/x-www-form-urlencoded' -H 'Cookie: PHPSESSID=0n3dtmlqujcdd11r3uqnmtuu66; _csrf=a799b5e90a10a176eabfa682e6534e52a954b4ced1bcd561977c2378d2a3e5fba%3A2%3A%7Bi%3A0%3Bs%3A5%3A%22_csrf%22%3Bi%3A1%3Bs%3A32%3A%22BD9ImGubxcUFZZXQt8fTtGDI7p-HZ_Lk%22%3B%7D' -H 'Connection: keep-alive' -H 'Upgrade-Insecure-Requests: 1' -H 'DNT: 1' --data '_csrf=ejFody1IVGw4dVE.QA8hDgJSPTF3Egw9DgkOI1kPECVNQUU%2FdxcYBw%3D%3D&official_search=1&name=&country=&title=&rating_from=&rating_to=&format=standard&birth_from=1901&birth_to=2017&activity=1&sex=&count=5000' > data/fide.html")
  
  fide = read_html("data/fide.html") %>% html_node("#official-ratings-table") %>% html_table()
} else {
  fide = read_html("data/fide.html") %>% html_node("#official-ratings-table") %>% html_table()
}

#better colnames
names(fide) = c("rank", "title", "name", "country", "classic", "rapid", "blitz", "age")

#clean names
fide$name %<>% str_replace("\n.*", "")

#duplicates?
fide$name[which(duplicated(fide$name))]
## [1] "Nikolic, Nebojsa"
#remove
fide %<>% filter(!duplicated(name))

#recode a few problem cases
fide$country %<>% subs_to_mother_name()

#count by country
(fide_country = table2(fide$country, include_NA = F) %>% 
    select(-Percent) %>% 
    rename(
      country = Group,
      chess_count = Count
      )
  )
#merge data
#countries to ISO
fide_country$ISO = pu_translate(fide_country$country, superunit = "world")
## No exact match: Iran, Islamic Republic of
## No exact match: Macedonia, the Former Yugoslav Republic of
## No exact match: Korea, Republic of
## Best fuzzy match found: Iran, Islamic Republic of -> Iran Islamic Republic of with distance 1.00
## Best fuzzy match found: Macedonia, the Former Yugoslav Republic of -> Macedonia, the former Yugoslav Republic of with distance 1.00
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00

Scrabble

#download in R
if (!file.exists("data/WESPA.rds")) {
  WESPA = read_html("http://www.wespa.org/aardvark/cgi-bin/rating.cgi") %>% html_node(".table") %>% html_table() %>% df_legalize_names()
  write_rds(WESPA, "data/WESPA.rds")
} else {
  WESPA = read_rds("data/WESPA.rds")
}

#duplicates?
length(WESPA$Name[which(duplicated(WESPA$Name))])
## [1] 0
#remove
WESPA %<>% filter(!duplicated(Name))

#recode a few problem cases
WESPA$Country %<>% subs_to_mother_name()

#count by country
(scrabble_country = table2(WESPA$Country, include_NA = F) %>% 
    select(-Percent) %>% 
    rename(
      country = Group,
      scrabble_count = Count
      )
  )
#merge data
#countries to ISO
scrabble_country$ISO = pu_translate(scrabble_country$country, superunit = "world")

#join
assert_that(!any(duplicated(scrabble_country$ISO)))
## [1] TRUE

Go

#download in R
if (!file.exists("data/go.rds")) {
  go_html = read_html("https://www.goratings.org/en/")
  
  #get table
  go_table_html = go_html %>% html_nodes("table") %>% .[[3]]
  
  #parse
  go = go_table_html %>% html_table() %>% df_legalize_names()
  
  #get countries from flags
  go$Flag = str_match_all(go_table_html, '(..)\\.svg') %>% .[[1]] %>% .[, 2]
  
  #rename
  names(go) = c("rank", "name", "sex", "country", "rating")
  
  write_rds(go, "data/go.rds")
} else {
  go = read_rds("data/go.rds")
}

#duplicates?
length(go$name[which(duplicated(go$name))])
## [1] 0
#remove
go %<>% filter(!duplicated(name))

#recode to ISO3 from ISO2
go$ISO = plyr::mapvalues(go$country, ISO_3166_1$Alpha_2 %>% str_to_lower(), 
                               ISO_3166_1$Alpha_3,
                               warn_missing = F)

#subs to mother
go$ISO %<>% subs_to_mother_ISO()

#count by country
go_country = table2(go$ISO, include_NA = F) %>% 
    rename(
      ISO = Group,
      go_count = Count
      )
  
#join
assert_that(!any(duplicated(go_country$ISO)))
## [1] TRUE

Poker

#load from disk if possible
if (!file.exists("data/poker.rds")) {
  #annoying, needs a few manual urls and then merge
  poker_urls = c("http://www.globalpokerindex.com/rankings/300/",
                 "http://www.globalpokerindex.com/rankings/301-600/", 
                 "http://www.globalpokerindex.com/rankings/601-900/",
                 "http://www.globalpokerindex.com/rankings/901-1200/",
                 "http://www.globalpokerindex.com/rankings/1201-1500/",
                 "http://www.globalpokerindex.com/rankings/1501-1800/")
  
  #download and extract
  poker = map_df(poker_urls, function(x) {
    this = read_html(x)
    
    t_table_html = this %>% html_nodes("table") %>% .[[2]]
    t_table = t_table_html %>% html_table()
    
    #get flag
    t_table$ISO2 = t_table_html %>% as.character() %>% str_match_all('gpiflag-(\\w\\w)') %>% .[[1]] %>% .[, 2]
    
    t_table
  })
  
  #ISO3
  poker$ISO = plyr::mapvalues(poker$ISO2,
                              ISO_3166_1$Alpha_2 %>% str_to_lower(),
                              ISO_3166_1$Alpha_3,
                              warn_missing = F)
  
  #join some countries
  poker$ISO %<>% subs_to_mother_ISO()
  
  #count
  poker_country = table2(poker$ISO, include_NA = F) %>% 
    rename(
      poker_count = Count,
      ISO = Group
    )
  poker_country$country = pu_translate(poker_country$ISO, reverse = T)
  
  #to full country
  #to ensure that they are all countries, not subdivisions
  poker_country$country = pu_translate(poker_country$ISO, superunit = "world", reverse = T)
  
  #save
  write_rds(poker, "data/poker.rds")
  write_rds(poker_country, "data/poker_country.rds")
} else {
  #read
  poker = read_rds("data/poker.rds")
  poker_country = read_rds("data/poker_country.rds")
}

Esportsearnings games

#get games
ese_games = list(
  sc2 = ese_get("sc2", "https://www.esportsearnings.com/games/151-starcraft-ii/countries"),
  lol = ese_get("lol", "https://www.esportsearnings.com/games/164-league-of-legends/countries"),
  dota2 = ese_get("dota2", "https://www.esportsearnings.com/games/231-dota-2/countries"),
  csgo = ese_get("csgo", "https://www.esportsearnings.com/games/245-counter-strike-global-offensive/countries"),
  cs = ese_get("cs", "https://www.esportsearnings.com/games/162-counter-strike/countries"),
  hs = ese_get("hs", "https://www.esportsearnings.com/games/328-hearthstone-heroes-of-warcraft/countries"),
  ow = ese_get("ow", "https://www.esportsearnings.com/games/426-overwatch/countries"),
  ssbm = ese_get("ssbm", "https://www.esportsearnings.com/games/204-super-smash-bros-melee/countries")
)

#merge subnational units
ese_games = map(ese_games, ese_merge_subs)
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
## No exact match: Korea, Republic of
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00
## No exact match: Korea, Republic of
## No exact match: Macedonia, The Former Yugoslav Republic of
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00
## Best fuzzy match found: Macedonia, The Former Yugoslav Republic of -> Macedonia, the former Yugoslav Republic of with distance 2.00
## No exact match: Iran, Islamic Republic of
## No exact match: Korea, Republic of
## No exact match: Macedonia, The Former Yugoslav Republic of
## Best fuzzy match found: Iran, Islamic Republic of -> Iran Islamic Republic of with distance 1.00
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00
## Best fuzzy match found: Macedonia, The Former Yugoslav Republic of -> Macedonia, the former Yugoslav Republic of with distance 2.00
## No exact match: Iran, Islamic Republic of
## No exact match: Korea, Republic of
## No exact match: Macedonia, The Former Yugoslav Republic of
## Best fuzzy match found: Iran, Islamic Republic of -> Iran Islamic Republic of with distance 1.00
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00
## Best fuzzy match found: Macedonia, The Former Yugoslav Republic of -> Macedonia, the former Yugoslav Republic of with distance 2.00
## No exact match: Korea, Republic of
## No exact match: Macedonia, The Former Yugoslav Republic of
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00
## Best fuzzy match found: Macedonia, The Former Yugoslav Republic of -> Macedonia, the former Yugoslav Republic of with distance 2.00
## No exact match: Korea, Republic of
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00
## No exact match: Korea, Republic of
## Best fuzzy match found: Korea, Republic of -> Korea Republic of with distance 1.00

Other national data

#national data collection
natdata = silence(read_csv("data/Megadataset_v2.0p.csv"))
names(natdata)[1] = "ISO"
natdata$IQ = natdata$LV2012estimatedIQ

#population data
worldpop = read_html("data/wiki_worldpop.html") %>% html_table() %>% .[[2]] %>% df_legalize_names()
worldpop %<>% mutate(
  population2017 = Population_1_July_2017_2 %>% str_replace_all(",", "") %>% as.integer(),
  pop = population2017,
  log_pop = log10(population2017),
  country = str_replace(Country_or_area, "\\[.*", ""),
  ISO = pu_translate(country)
) %>% na.omit()
## Warning in function_list[[k]](value): NAs introduced by coercion to integer
## range
## No exact match: World
## No exact match: Guernsey and  Jersey
## Best fuzzy match found: World -> world with distance 1.00
## Best fuzzy match found: Guernsey and  Jersey -> Guernsey (GBR) with distance 11.00
#merge, keep only countries
natdata = right_join(natdata, worldpop, by = "ISO")

#internet users
internetusers = read_csv("data/internet users 2016.csv")
## Parsed with column specification:
## cols(
##   region = col_character(),
##   country = col_character(),
##   iso = col_character(),
##   pop2010 = col_double(),
##   pop2016 = col_double(),
##   i99h2010 = col_double(),
##   i99h2016 = col_double(),
##   i2010 = col_double(),
##   i2016 = col_double()
## )
internetusers$ISO = internetusers$country %>% pu_translate()
## No exact match: Central African Rep.
## No exact match: Congo (Dem. Rep.)
## No exact match: Congo (Rep.)
## No exact match: S. Tome & Principe
## No exact match: Hong Kong, China
## No exact match: Korea (Rep.)
## No exact match: Lao P.D.R.
## No exact match: Macao, China
## Best fuzzy match found: Central African Rep. -> Central African Republic with distance 5.00
## Best fuzzy match found: Congo (Dem. Rep.) -> Congo, Dem. Rep. with distance 3.00
## Best fuzzy match found: Congo (Rep.) -> Congo, Rep. with distance 3.00
## Best fuzzy match found: S. Tome & Principe -> Sao Tome & Principe with distance 2.00
## Best fuzzy match found: Hong Kong, China -> Hong Kong-China with distance 2.00
## Best fuzzy match found: Korea (Rep.) -> Korea Rep. with distance 2.00
## Best fuzzy match found: Lao P.D.R. -> Lao PDR with distance 3.00
## Best fuzzy match found: Macao, China -> Macao-China with distance 2.00
internetusers$internet_users_pct = internetusers$i99h2016

#merge
natdata = left_join(natdata, internetusers %>% select(ISO, internet_users_pct), by = "ISO")

#recode
natdata %<>% mutate(
  UN_continent = UN_continental_region_1 %>% fct_relevel("Europe"),
  UN_region = UN_statistical_region_1 %>% fct_relevel("Western Europe")
)

#merge stat areas to
natdata %<>% mutate(
  UN_macroregion = plyr::revalue(UN_region, c("Australia and New Zealand" = "N & W Europe + offshoots",
                          "Europe" = "N & W Europe + offshoots",
                          "Northern Europe" = "N & W Europe + offshoots",
                          "Eastern Africa" = "Africa",
                          "Middle Africa" = "Africa",
                          "Western Africa" = "Africa",
                          "Southern Africa" = "Africa",
                          "Europe" = "North & Western Europe",
                          "Northern America" = "N & W Europe + offshoots",
                          "South America" = "Latin America",
                          "Central America" = "Latin America",
                          "Western Asia" = "MENA",
                          "Northern Africa" = "MENA",
                          "Western Europe" = "N & W Europe + offshoots"
                          ), warn_missing = F)
)

#exclude some quasi subunits
natdata %<>% filter(!ISO %in% sub_names_ISO)

Join all game data and mutate

#games
egames_names = names(ese_games)

all_games = c(ese_games,
              list(
                chess = fide_country,
                go = go_country,
                scrabble = scrabble_country,
                poker = poker_country
                )
              )
all_names = names(all_games)

#check for quasi-countries that we want to merge into motherland
map_df(all_games, function(x) {
  x %>% 
    filter(ISO %in% c("HKG", "MAC", "GGY", "IMN", "Greenland", "USA_PRI", "FRO", "WSM"))
})
#join all
for (g in all_names) {
  #extract cols of interest, then join to main
  count_var = g + "_count"
  this_df = all_games[[g]][c("ISO", count_var)]
  
  #join
  natdata = full_join(natdata, this_df, by = "ISO")
  
  #NA to 0
  natdata[[count_var]] %<>% plyr::mapvalues(NA, 0, warn_missing = F)
  
  #mutate
  natdata[["log_" + count_var]] = log10(natdata[[count_var]] + 1)
  natdata[[count_var + "_per_million"]] = natdata[[count_var]] / natdata$population2017
}

Analyses

Plot data with IQ in informative ways.

Basic plots

We do more analyses on this as this game had the most data.

#game stats
map2_df(all_games, names(all_games), function(x, g) {
  data_frame(
    game = g,
    esport = (g %in% egames_names) %>% plyr::mapvalues(c(T, F), c("yes", "no"), warn_missing = F),
    top_players = sum(x[[g + "_count"]]) %>% as.integer(),
    n_unique_countries = nrow(x)
  )
}) %>% write_clipboard()
##        Game Esport Top players N unique countries
## 1       sc2    yes        2179                 64
## 2       lol    yes        5072                 80
## 3     dota2    yes        2219                 71
## 4      csgo    yes        9076                 92
## 5        cs    yes        2624                 58
## 6        hs    yes        1744                 66
## 7        ow    yes        2104                 63
## 8      ssbm    yes        2107                 29
## 9     chess     no        4942                103
## 10       go     no         946                 14
## 11 scrabble     no        1321                 48
## 12    poker     no        1800                 66
#regions
table2(natdata$UN_continent, include_NA = F)
table2(natdata$UN_region, include_NA = F)
table2(natdata$UN_macroregion, include_NA = F)

Sampling theory

Iceland example.

#simple
set.seed(1)
rbinom(1e6, 3.3e5, prob = 1/1e6) %>% 
  table2() %>% 
  rename(
    event_count = Group,
    percent = Percent
  ) %>% write_clipboard()
##   Event count     Count Percent
## 1           0 719259.00   71.93
## 2           1 237116.00   23.71
## 3           2  39105.00    3.91
## 4           3   4157.00    0.42
## 5           4    335.00    0.03
## 6           5     28.00    0.00
## 7                  0.00    0.00

Simulate world-wide data

Realistic data.

#params
world_pops = natdata$population2017
rate_min = 1/1e9
rate_max = 1/1e6

#simulate world
sim_world_data = function(pop_factor = 1, world_size = 200, cause_effect_r = .7) {
  #two normals for cause and effect
  d_ = MASS::mvrnorm(n = world_size, mu = c(0, 0), Sigma = matrix(c(1, cause_effect_r, cause_effect_r, 1), nrow = 2))
  
  #data frame
  d_ = as_data_frame(d_) %>% 
    set_colnames(c("person_rate", "cause"))
  
  #adjust pop sizes?
  world_pops_local = as.numeric(world_pops) * pop_factor

  #mutate
  d_ %<>% mutate(
    #pop sizes
    pop = world_pops_local %>% sample(size = world_size, replace = T),
    
    #rescale person rate
    person_rate = rescale(person_rate, new_min = rate_min, new_max = rate_max),
    
    #sample for each country
    count = map2_int(person_rate, pop, function(rate, pop) {
      rbinom(n = 1, size = pop, prob = rate)
      }),
    
    #logs
    log_pop = log10(pop),
    log_count = log10(count + 1),
    
    #per capita.
    count_per_million = (count / pop) * 1e6,
    
    #resids
    count_resid = resid(lm(count ~ pop, data = d_)),
    log_count_resid = resid(lm(log_count ~ log_pop, data = d_))
  )

  
  d_
}

sim_world_reg = function(pop_factor = 1, world_size = 200, cause_effect_r = .7) {
  d_ = sim_world_data(pop_factor = pop_factor, world_size = world_size, cause_effect_r = cause_effect_r)
  
  #models
  fit1 = lm(count ~ cause + pop, data = d_)
  fit2 = lm(log_count ~ cause + log_pop, data = d_)
  fit3 = lm(count_per_million ~ cause, data = d_)

  #save
  #don't bother with R2's here because they wont be comparable. the count approaches includes population size as a predictor, and population size explains most of the variance!
  data_frame(
    count_beta = fit1 %>% summary() %>% .$coef %>% .[2, 1],
    count_p = fit1 %>% summary() %>% .$coef %>% .[2, 4],
    log_count_beta = fit2 %>% summary() %>% .$coef %>% .[2, 1],
    log_count_p = fit2 %>% summary() %>% .$coef %>% .[2, 4],
    per_capita_beta = fit3 %>% summary() %>% .$coef %>% .[2, 1],
    per_capita_p = fit3 %>% summary() %>% .$coef %>% .[2, 4]
  )
}

#simulate
set.seed(1)
sim_results = map_df(rep(1, 1000), sim_world_reg)
## Warning: `as_data_frame()` is deprecated, use `as_tibble()` (but mind the new semantics).
## This warning is displayed once per session.
#stats
sim_result_desc = describe(sim_results)

#coef of variation
sim_result_desc %>% 
  rownames_to_column("variable") %>% 
  mutate(cv = sd / mean) %>% select(-n, -trimmed, -range, -vars, -se) %>% 
  write_clipboard()
##          Variable Mean   Sd Median  Mad   Min   Max  Skew Kurtosis   Cv
## 1      count_beta 3.66 2.07   3.17 1.30  0.19 18.17  2.25     7.85 0.56
## 2         count_p 0.03 0.08   0.00 0.00  0.00  0.94  5.60    40.52 3.08
## 3  log_count_beta 0.08 0.03   0.08 0.03 -0.01  0.17 -0.17     0.12 0.34
## 4     log_count_p 0.03 0.10   0.00 0.00  0.00  1.00  5.49    34.76 3.25
## 5 per_capita_beta 0.12 0.14   0.11 0.06 -2.02  1.26 -3.97    82.87 1.19
## 6    per_capita_p 0.17 0.25   0.04 0.06  0.00  1.00  1.63     1.63 1.46
#p cutoffs
sim_results %>% 
  select(contains("_p")) %>% 
  map_df(~data_frame("<.05" = mean(. < .05), "<.01" = mean(. < .01), "<.005" = mean(.< .005), "<.001" = mean(.<.001))) %>% 
  mutate(
    method = sim_results %>% 
  select(contains("_p")) %>% names()
  )
sim_results %>% 
  select(contains("_p")) %>% 
  gather(method, p_value, count_p, log_count_p, per_capita_p) %>% 
  mutate(method = str_replace(method, "_p", "")) %>% 
  GG_denhist("p_value", group = "method", vline = NULL) +
  scale_x_log10("p value") +
  geom_vline(xintercept = c(.05, .01, .005, .001), linetype = "dashed") +
  scale_fill_discrete(labels = function(x) str_replace(x, "_", " "))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/sim_p_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Very large world data

#very large pops to determine conversion function with certainty
#pars
runs_large = 1000
set.seed(1)

#simulate
sim_large_results = map2_df(rep(1, runs_large), runif(runs_large, .2, .9), .f = ~sim_world_reg(cause_effect_r = .y, pop_factor = 1e4))

#describe/examine
describe(sim_large_results)
#cors
wtd.cors(sim_large_results[c("count_beta", "log_count_beta", "per_capita_beta")])
##                 count_beta log_count_beta per_capita_beta
## count_beta           1.000          0.464           0.483
## log_count_beta       0.464          1.000           0.950
## per_capita_beta      0.483          0.950           1.000
#plots
GG_scatter(sim_large_results, "count_beta", "per_capita_beta")

GG_scatter(sim_large_results, "log_count_beta", "per_capita_beta")

#fit model
rms::ols(per_capita_beta ~ log_count_beta, data = sim_large_results)
## Linear Regression Model
##  
##  rms::ols(formula = per_capita_beta ~ log_count_beta, data = sim_large_results)
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs    1000    LR chi2    2324.57    R2       0.902    
##  sigma0.0124    d.f.             1    R2 adj   0.902    
##  d.f.    998    Pr(> chi2)  0.0000    g        0.043    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -0.0397028 -0.0074611  0.0003148  0.0081496  0.0420562 
##  
##  
##                 Coef   S.E.   t     Pr(>|t|)
##  Intercept      0.0117 0.0010 11.69 <0.0001 
##  log_count_beta 0.7467 0.0078 95.94 <0.0001 
## 
#one very large world
large_world = sim_world_data(pop_factor = 1e4, world_size = 1e4)

#cors
wtd.cors(large_world)
##                   person_rate    cause      pop  count   log_pop log_count
## person_rate           1.00000  0.70047 1.22e-02 0.0794 -3.23e-03   0.10672
## cause                 0.70047  1.00000 1.18e-02 0.0583 -1.66e-03   0.07518
## pop                   0.01215  0.01180 1.00e+00 0.9598  3.76e-01   0.37531
## count                 0.07935  0.05833 9.60e-01 1.0000  3.54e-01   0.36082
## log_pop              -0.00323 -0.00166 3.76e-01 0.3543  1.00e+00   0.99312
## log_count             0.10672  0.07518 3.75e-01 0.3608  9.93e-01   1.00000
## count_per_million     0.97937  0.68515 1.26e-02 0.0784  3.04e-03   0.11444
## count_resid           0.24113  0.16745 3.84e-17 0.2807 -2.40e-02   0.00215
## log_count_resid       0.93876  0.65607 1.45e-02 0.0761  2.34e-16   0.11711
##                   count_per_million count_resid log_count_resid
## person_rate                 0.97937    2.41e-01        9.39e-01
## cause                       0.68515    1.67e-01        6.56e-01
## pop                         0.01257    3.84e-17        1.45e-02
## count                       0.07837    2.81e-01        7.61e-02
## log_pop                     0.00304   -2.40e-02        2.34e-16
## log_count                   0.11444    2.15e-03        1.17e-01
## count_per_million           1.00000    2.36e-01        9.51e-01
## count_resid                 0.23617    1.00e+00        2.22e-01
## log_count_resid             0.95145    2.22e-01        1.00e+00
#some plots
GG_scatter(large_world, "log_count", "count_per_million")

GG_scatter(large_world, "count_resid", "count_per_million")

ggplot(large_world, aes_string("log_count_resid", "count_per_million")) +
  geom_point() +
  geom_smooth(se = F) +
  # geom_line(data = data_frame(x = large_world$log_count_resid, pred = predict(line_fit)), mapping = aes(x = x, y = pred)) +
  theme_bw()
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'

#fit model
rms::ols(count_per_million ~ rcs(log_count, 3), data = large_world)
## Linear Regression Model
##  
##  rms::ols(formula = count_per_million ~ rcs(log_count, 3), data = large_world)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs   10000    LR chi2    160.22    R2       0.016    
##  sigma0.1428    d.f.            2    R2 adj   0.016    
##  d.f.   9997    Pr(> chi2) 0.0000    g        0.020    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -0.500823 -0.095570 -0.001154  0.096204  0.638505 
##  
##  
##             Coef   S.E.   t     Pr(>|t|)
##  Intercept  0.4952 0.0079 62.41 <0.0001 
##  log_count  0.0026 0.0024  1.07 0.2825  
##  log_count' 0.0139 0.0026  5.33 <0.0001 
## 

Countries across datasets

Fit model without IQ and save residuals. They should correlate and reflect GCA.

#cors
wtd.cors(natdata[c("IQ", "population2017")])
##                   IQ population2017
## IQ             1.000          0.118
## population2017 0.118          1.000
#using the 0 imputed data
models_popsize = map_df(all_names, function(game) {
  #form
  t_form = as.formula(sprintf("log_%s_count ~ log_pop", game))
  
  #fit
  t_fit = rms::ols(t_form, data = natdata)
  
  #save resids
  data_frame(
    game = game,
    country = natdata$country,
    ISO = natdata$ISO,
    resid = resid(t_fit)
  )
})

#aggregate
models_popsize_countries = models_popsize %>% spread(game, resid) 

#factor analysis
(game_cors = wtd.cors(models_popsize_countries %>% select(-country, -ISO)))
##           chess    cs  csgo dota2     go    hs   lol    ow poker   sc2 scrabble
## chess    1.0000 0.645 0.754 0.581 0.0534 0.660 0.661 0.519 0.669 0.566   0.0650
## cs       0.6449 1.000 0.813 0.699 0.3639 0.832 0.785 0.784 0.777 0.833   0.2035
## csgo     0.7536 0.813 1.000 0.765 0.2792 0.864 0.857 0.813 0.775 0.823   0.2476
## dota2    0.5808 0.699 0.765 1.000 0.2241 0.736 0.707 0.638 0.623 0.711   0.2921
## go       0.0534 0.364 0.279 0.224 1.0000 0.434 0.376 0.485 0.329 0.486   0.0586
## hs       0.6604 0.832 0.864 0.736 0.4343 1.000 0.911 0.876 0.829 0.909   0.2125
## lol      0.6613 0.785 0.857 0.707 0.3761 0.911 1.000 0.861 0.760 0.864   0.2274
## ow       0.5191 0.784 0.813 0.638 0.4849 0.876 0.861 1.000 0.746 0.876   0.3066
## poker    0.6689 0.777 0.775 0.623 0.3292 0.829 0.760 0.746 1.000 0.795   0.2420
## sc2      0.5661 0.833 0.823 0.711 0.4861 0.909 0.864 0.876 0.795 1.000   0.2744
## scrabble 0.0650 0.203 0.248 0.292 0.0586 0.212 0.227 0.307 0.242 0.274   1.0000
## ssbm     0.4227 0.623 0.524 0.356 0.1913 0.584 0.521 0.579 0.675 0.642   0.2587
##           ssbm
## chess    0.423
## cs       0.623
## csgo     0.524
## dota2    0.356
## go       0.191
## hs       0.584
## lol      0.521
## ow       0.579
## poker    0.675
## sc2      0.642
## scrabble 0.259
## ssbm     1.000
describe(game_cors %>% {diag(.) = NA; .})
game_cors %>% MAT_half() %>% describe()
(gms_factor = models_popsize_countries %>% .[all_names] %>% fa())
## Factor Analysis using method =  minres
## Call: fa(r = .)
## Standardized loadings (pattern matrix) based upon correlation matrix
##           MR1    h2    u2 com
## sc2      0.94 0.885 0.115   1
## lol      0.92 0.845 0.155   1
## dota2    0.76 0.573 0.427   1
## csgo     0.92 0.841 0.159   1
## cs       0.89 0.794 0.206   1
## hs       0.96 0.924 0.076   1
## ow       0.90 0.807 0.193   1
## ssbm     0.63 0.398 0.602   1
## chess    0.69 0.471 0.529   1
## go       0.39 0.151 0.849   1
## scrabble 0.27 0.073 0.927   1
## poker    0.87 0.753 0.247   1
## 
##                 MR1
## SS loadings    7.52
## Proportion Var 0.63
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  66  and the objective function was  13.1 with Chi Square of  2888
## The degrees of freedom for the model are 54  and the objective function was  1.64 
## 
## The root mean square of the residuals (RMSR) is  0.06 
## The df corrected root mean square of the residuals is  0.07 
## 
## The harmonic number of observations is  227 with the empirical chi square  111  with prob <  8.8e-06 
## The total number of observations was  227  with Likelihood Chi Square =  361  with prob <  5.1e-47 
## 
## Tucker Lewis Index of factoring reliability =  0.867
## RMSEA index =  0.161  and the 90 % confidence intervals are  0.143 0.174
## BIC =  68.2
## Fit based upon off diagonal values = 0.99
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.99
## Multiple R square of scores with factors          0.98
## Minimum correlation of possible factor scores     0.97
#save scores
models_popsize_countries$general_mental_sports = gms_factor$scores %>% as.vector()

#merge to main
natdata = full_join(natdata,
                    models_popsize_countries %>% select(-country),
                       by = "ISO")

#cors
wtd.cors(natdata[c("IQ",
                   "general_mental_sports"
                   )],
         weight = sqrt(natdata$population2017))
##                          IQ general_mental_sports
## IQ                    1.000                 0.792
## general_mental_sports 0.792                 1.000
#plots
GG_scatter(natdata, "IQ", "general_mental_sports",
           weights = sqrt(natdata$population2017),
           case_names = "country"
           ) +
  geom_smooth(se = F, span = 1) +
  scale_y_continuous("General mental gaming skill")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_save("figs/gmgs_IQ.png")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#Jensen's method
fa_Jensens_method(gms_factor, natdata, "IQ") +
  scale_y_continuous("Correlation with national IQ")
## Using Pearson correlations for the criterion-indicators relationships.

GG_save("figs/Jensens_method.png")

#models
#base
(model_1a = rms::ols(general_mental_sports ~ IQ, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ             (weights) 
##                     0                    32                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ IQ, data = natdata, 
##      weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    192.68    R2       0.628    
##  sigma53.2267    d.f.            1    R2 adj   0.626    
##  d.f.     193    Pr(> chi2) 0.0000    g        1.186    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -3.0638 -0.5878 -0.1587  0.3940  1.9984 
##  
##  
##            Coef    S.E.   t      Pr(>|t|)
##  Intercept -8.0283 0.4595 -17.47 <0.0001 
##  IQ         0.0958 0.0053  18.04 <0.0001 
## 
#nonlinear
(model_1b = rms::ols(general_mental_sports ~ rcs(IQ, 3), data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ             (weights) 
##                     0                    32                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3), data = natdata, 
##      weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    221.62    R2       0.679    
##  sigma49.5484    d.f.            2    R2 adj   0.676    
##  d.f.     192    Pr(> chi2) 0.0000    g        1.099    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.53472 -0.42170 -0.04563  0.34492  2.02523 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -3.1654 0.9761 -3.24 0.0014  
##  IQ         0.0309 0.0127  2.43 0.0161  
##  IQ'        0.0765 0.0138  5.54 <0.0001 
## 
#real increase?
lrtest(model_1a,
      model_1b)
## 
## Model 1: general_mental_sports ~ IQ
## Model 2: general_mental_sports ~ rcs(IQ, 3)
## 
## L.R. Chisq       d.f.          P 
##   2.89e+01   1.00e+00   7.46e-08
#add regional controls
(model_2a = rms::ols(general_mental_sports ~ rcs(IQ, 3) + UN_region, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ             UN_region 
##                     0                    32                     0 
##             (weights) 
##                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3) + UN_region, 
##      data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    330.20    R2       0.816    
##  sigma39.8613    d.f.           24    R2 adj   0.790    
##  d.f.     170    Pr(> chi2) 0.0000    g        1.246    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.73752 -0.38474 -0.01435  0.29236  1.37945 
##  
##  
##                                      Coef    S.E.   t     Pr(>|t|)
##  Intercept                            0.0828 1.7583  0.05 0.9625  
##  IQ                                  -0.0025 0.0219 -0.11 0.9095  
##  IQ'                                  0.1006 0.0253  3.97 0.0001  
##  UN_region=Australia and New Zealand -0.2965 0.5233 -0.57 0.5717  
##  UN_region=Caribbean                 -0.7295 0.5045 -1.45 0.1500  
##  UN_region=Central America           -0.6606 0.4369 -1.51 0.1324  
##  UN_region=Central Asia              -0.5331 0.4931 -1.08 0.2812  
##  UN_region=Eastern Africa            -0.9817 0.4575 -2.15 0.0333  
##  UN_region=Eastern Asia              -1.3095 0.3140 -4.17 <0.0001 
##  UN_region=Eastern Europe             0.0237 0.3044  0.08 0.9380  
##  UN_region=Europe                    -1.0336 1.2425 -0.83 0.4066  
##  UN_region=Melanesia                 -0.8173 0.6502 -1.26 0.2105  
##  UN_region=Micronesia                -0.2036 1.2435 -0.16 0.8701  
##  UN_region=Middle Africa             -0.9388 0.5257 -1.79 0.0759  
##  UN_region=Northern Africa           -0.9145 0.4387 -2.08 0.0386  
##  UN_region=Northern America           1.0949 0.3402  3.22 0.0015  
##  UN_region=Northern Europe           -0.0172 0.3373 -0.05 0.9593  
##  UN_region=Polynesia                 -0.5803 1.3690 -0.42 0.6722  
##  UN_region=South America              0.1381 0.3845  0.36 0.7198  
##  UN_region=South-Eastern Asia        -0.7036 0.3447 -2.04 0.0428  
##  UN_region=Southern Africa            0.2218 0.5581  0.40 0.6916  
##  UN_region=Southern Asia             -0.9917 0.4091 -2.42 0.0164  
##  UN_region=Southern Europe           -0.4129 0.3311 -1.25 0.2141  
##  UN_region=Western Africa            -0.9411 0.4818 -1.95 0.0524  
##  UN_region=Western Asia              -0.7471 0.3880 -1.93 0.0559  
## 
(model_2b = rms::ols(general_mental_sports ~ rcs(IQ, 3) + UN_macroregion, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ        UN_macroregion 
##                     0                    32                     0 
##             (weights) 
##                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3) + UN_macroregion, 
##      data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    294.25    R2       0.779    
##  sigma42.5975    d.f.           15    R2 adj   0.760    
##  d.f.     179    Pr(> chi2) 0.0000    g        1.248    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.7355 -0.3812 -0.0395  0.2730  1.6502 
##  
##  
##                                    Coef    S.E.   t     Pr(>|t|)
##  Intercept                         -0.3981 1.7446 -0.23 0.8198  
##  IQ                                 0.0056 0.0218  0.26 0.7979  
##  IQ'                                0.0981 0.0258  3.80 0.0002  
##  UN_macroregion=Caribbean          -0.8678 0.4869 -1.78 0.0764  
##  UN_macroregion=Latin America      -0.3229 0.3367 -0.96 0.3388  
##  UN_macroregion=Central Asia       -0.6974 0.4737 -1.47 0.1427  
##  UN_macroregion=Africa             -0.9587 0.4226 -2.27 0.0245  
##  UN_macroregion=Eastern Asia       -1.5971 0.2806 -5.69 <0.0001 
##  UN_macroregion=Eastern Europe     -0.2231 0.2559 -0.87 0.3845  
##  UN_macroregion=Melanesia          -1.0046 0.6562 -1.53 0.1275  
##  UN_macroregion=Micronesia         -0.3899 1.3091 -0.30 0.7662  
##  UN_macroregion=MENA               -0.9987 0.3446 -2.90 0.0042  
##  UN_macroregion=Polynesia          -0.7886 1.4461 -0.55 0.5862  
##  UN_macroregion=South-Eastern Asia -0.9207 0.2984 -3.09 0.0024  
##  UN_macroregion=Southern Asia      -1.1646 0.3717 -3.13 0.0020  
##  UN_macroregion=Southern Europe    -0.6531 0.2892 -2.26 0.0252  
## 
(model_2c = rms::ols(general_mental_sports ~ rcs(IQ, 3) + UN_continent, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ          UN_continent 
##                     0                    32                     0 
##             (weights) 
##                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3) + UN_continent, 
##      data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    273.56    R2       0.754    
##  sigma43.8292    d.f.            6    R2 adj   0.746    
##  d.f.     188    Pr(> chi2) 0.0000    g        1.208    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.07332 -0.51577 -0.02894  0.27535  1.67390 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -1.3971 1.3218 -1.06 0.2919  
##  IQ                     0.0148 0.0162  0.92 0.3613  
##  IQ'                    0.0850 0.0144  5.89 <0.0001 
##  UN_continent=Africa   -0.6435 0.2355 -2.73 0.0069  
##  UN_continent=Americas  0.1360 0.1826  0.75 0.4572  
##  UN_continent=Asia     -0.8145 0.1500 -5.43 <0.0001 
##  UN_continent=Oceania  -0.4043 0.3806 -1.06 0.2896  
## 
#final, with linear
(model_2d = rms::ols(general_mental_sports ~ IQ + UN_macroregion, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ        UN_macroregion 
##                     0                    32                     0 
##             (weights) 
##                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ IQ + UN_macroregion, 
##      data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    279.11    R2       0.761    
##  sigma44.1596    d.f.           14    R2 adj   0.742    
##  d.f.     180    Pr(> chi2) 0.0000    g        1.286    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.77716 -0.51179 -0.06577  0.30740  1.58381 
##  
##  
##                                    Coef    S.E.   t     Pr(>|t|)
##  Intercept                         -4.6415 1.3897 -3.34 0.0010  
##  IQ                                 0.0704 0.0140  5.01 <0.0001 
##  UN_macroregion=Caribbean          -1.4389 0.4801 -3.00 0.0031  
##  UN_macroregion=Latin America      -1.0968 0.2779 -3.95 0.0001  
##  UN_macroregion=Central Asia       -1.4872 0.4413 -3.37 0.0009  
##  UN_macroregion=Africa             -1.2835 0.4291 -2.99 0.0032  
##  UN_macroregion=Eastern Asia       -1.0161 0.2439 -4.17 <0.0001 
##  UN_macroregion=Eastern Europe     -0.4636 0.2570 -1.80 0.0729  
##  UN_macroregion=Melanesia          -1.8600 0.6390 -2.91 0.0041  
##  UN_macroregion=Micronesia         -1.2389 1.3372 -0.93 0.3554  
##  UN_macroregion=MENA               -1.7861 0.2854 -6.26 <0.0001 
##  UN_macroregion=Polynesia          -1.5579 1.4843 -1.05 0.2953  
##  UN_macroregion=South-Eastern Asia -1.5479 0.2577 -6.01 <0.0001 
##  UN_macroregion=Southern Asia      -1.9865 0.3134 -6.34 <0.0001 
##  UN_macroregion=Southern Europe    -0.9746 0.2867 -3.40 0.0008  
## 
#compare
lrtest(model_2d, model_2b)
## 
## Model 1: general_mental_sports ~ IQ + UN_macroregion
## Model 2: general_mental_sports ~ rcs(IQ, 3) + UN_macroregion
## 
## L.R. Chisq       d.f.          P 
##    15.1317     1.0000     0.0001
#output
broom::tidy(summary.lm(model_2b), conf.int = T) %>% cbind(model_2b %>% confint()) %>% write_clipboard(digits = 3)
##                                                                Term Estimate
## Intercept                                                 Intercept   -0.398
## IQ                                                               IQ    0.006
## IQ'                                                             IQ'    0.098
## UN macroregion=Caribbean                   UN_macroregion=Caribbean   -0.868
## UN macroregion=Latin America           UN_macroregion=Latin America   -0.323
## UN macroregion=Central Asia             UN_macroregion=Central Asia   -0.697
## UN macroregion=Africa                         UN_macroregion=Africa   -0.959
## UN macroregion=Eastern Asia             UN_macroregion=Eastern Asia   -1.597
## UN macroregion=Eastern Europe         UN_macroregion=Eastern Europe   -0.223
## UN macroregion=Melanesia                   UN_macroregion=Melanesia   -1.005
## UN macroregion=Micronesia                 UN_macroregion=Micronesia   -0.390
## UN macroregion=MENA                             UN_macroregion=MENA   -0.999
## UN macroregion=Polynesia                   UN_macroregion=Polynesia   -0.789
## UN macroregion=South-Eastern Asia UN_macroregion=South-Eastern Asia   -0.921
## UN macroregion=Southern Asia           UN_macroregion=Southern Asia   -1.165
## UN macroregion=Southern Europe       UN_macroregion=Southern Europe   -0.653
##                                   Std error Statistic P value  2 5 % 97 5 %
## Intercept                             1.745    -0.228   0.820 -3.841  3.045
## IQ                                    0.022     0.256   0.798 -0.037  0.049
## IQ'                                   0.026     3.800   0.000  0.047  0.149
## UN macroregion=Caribbean              0.487    -1.782   0.076 -1.829  0.093
## UN macroregion=Latin America          0.337    -0.959   0.339 -0.987  0.341
## UN macroregion=Central Asia           0.474    -1.472   0.143 -1.632  0.237
## UN macroregion=Africa                 0.423    -2.268   0.025 -1.793 -0.125
## UN macroregion=Eastern Asia           0.281    -5.692   0.000 -2.151 -1.043
## UN macroregion=Eastern Europe         0.256    -0.872   0.384 -0.728  0.282
## UN macroregion=Melanesia              0.656    -1.531   0.128 -2.299  0.290
## UN macroregion=Micronesia             1.309    -0.298   0.766 -2.973  2.193
## UN macroregion=MENA                   0.345    -2.899   0.004 -1.679 -0.319
## UN macroregion=Polynesia              1.446    -0.545   0.586 -3.642  2.065
## UN macroregion=South-Eastern Asia     0.298    -3.085   0.002 -1.510 -0.332
## UN macroregion=Southern Asia          0.372    -3.133   0.002 -1.898 -0.431
## UN macroregion=Southern Europe        0.289    -2.258   0.025 -1.224 -0.082
#relative effect sizes
model_2d$coefficients[2] / model_1a$coefficients[2]
##    IQ 
## 0.735
#regions alone
(model_3a = rms::ols(general_mental_sports ~ UN_continent, data = natdata, weights = sqrt(natdata$population2017)))
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ UN_continent, data = natdata, 
##      weights = sqrt(natdata$population2017))
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      227    LR chi2    114.77    R2       0.397    
##  sigma63.2950    d.f.            4    R2 adj   0.386    
##  d.f.     222    Pr(> chi2) 0.0000    g        0.995    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.1068 -0.9033 -0.3923  0.1609  2.6257 
##  
##  
##                        Coef    S.E.   t      Pr(>|t|)
##  Intercept              1.6167 0.1697   9.53 <0.0001 
##  UN_continent=Africa   -2.5270 0.2174 -11.62 <0.0001 
##  UN_continent=Americas -0.9285 0.2443  -3.80 0.0002  
##  UN_continent=Asia     -1.5522 0.2051  -7.57 <0.0001 
##  UN_continent=Oceania  -1.0239 0.5208  -1.97 0.0505  
## 
(model_3b = rms::ols(general_mental_sports ~ UN_macroregion, data = natdata, weights = sqrt(natdata$population2017)))
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ UN_macroregion, data = natdata, 
##      weights = sqrt(natdata$population2017))
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      227    LR chi2    292.19    R2       0.724    
##  sigma43.7156    d.f.           13    R2 adj   0.707    
##  d.f.     213    Pr(> chi2) 0.0000    g        1.190    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.81097 -0.43876 -0.04281  0.36487  1.78954 
##  
##  
##                                    Coef    S.E.   t      Pr(>|t|)
##  Intercept                          2.2622 0.1453  15.57 <0.0001 
##  UN_macroregion=Caribbean          -2.8168 0.3495  -8.06 <0.0001 
##  UN_macroregion=Latin America      -1.9819 0.2081  -9.53 <0.0001 
##  UN_macroregion=Central Asia       -2.7227 0.3588  -7.59 <0.0001 
##  UN_macroregion=Africa             -3.2036 0.1779 -18.01 <0.0001 
##  UN_macroregion=Eastern Asia       -0.5191 0.2222  -2.34 0.0204  
##  UN_macroregion=Eastern Europe     -0.6420 0.2508  -2.56 0.0112  
##  UN_macroregion=Melanesia          -2.8604 0.5991  -4.77 <0.0001 
##  UN_macroregion=Micronesia         -2.2355 1.0433  -2.14 0.0333  
##  UN_macroregion=MENA               -2.7462 0.2039 -13.47 <0.0001 
##  UN_macroregion=Polynesia          -2.2499 0.9975  -2.26 0.0251  
##  UN_macroregion=South-Eastern Asia -2.1596 0.2216  -9.75 <0.0001 
##  UN_macroregion=Southern Asia      -3.1356 0.2058 -15.24 <0.0001 
##  UN_macroregion=Southern Europe    -1.2572 0.2761  -4.55 <0.0001 
## 
(model_3c = rms::ols(general_mental_sports ~ UN_region, data = natdata, weights = sqrt(natdata$population2017)))
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ UN_region, data = natdata, 
##      weights = sqrt(natdata$population2017))
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      227    LR chi2    332.36    R2       0.769    
##  sigma40.8862    d.f.           22    R2 adj   0.744    
##  d.f.     204    Pr(> chi2) 0.0000    g        1.200    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.00009 -0.43755  0.01496  0.39681  1.78954 
##  
##  
##                                      Coef    S.E.   t      Pr(>|t|)
##  Intercept                            2.1025 0.2298   9.15 <0.0001 
##  UN_region=Australia and New Zealand -0.2601 0.5365  -0.48 0.6284  
##  UN_region=Caribbean                 -2.6570 0.3758  -7.07 <0.0001 
##  UN_region=Central America           -2.4827 0.3342  -7.43 <0.0001 
##  UN_region=Central Asia              -2.5630 0.3833  -6.69 <0.0001 
##  UN_region=Eastern Africa            -3.1275 0.2737 -11.43 <0.0001 
##  UN_region=Eastern Asia              -0.3594 0.2784  -1.29 0.1983  
##  UN_region=Eastern Europe            -0.4823 0.2990  -1.61 0.1082  
##  UN_region=Europe                    -2.0736 1.2618  -1.64 0.1018  
##  UN_region=Melanesia                 -2.7007 0.5902  -4.58 <0.0001 
##  UN_region=Micronesia                -2.0758 0.9932  -2.09 0.0379  
##  UN_region=Middle Africa             -3.1266 0.3275  -9.55 <0.0001 
##  UN_region=Northern Africa           -2.8533 0.3165  -9.02 <0.0001 
##  UN_region=Northern America           0.9667 0.3474   2.78 0.0059  
##  UN_region=Northern Europe           -0.2016 0.3421  -0.59 0.5562  
##  UN_region=Polynesia                 -2.0901 0.9512  -2.20 0.0291  
##  UN_region=South America             -1.4976 0.2859  -5.24 <0.0001 
##  UN_region=South-Eastern Asia        -1.9998 0.2780  -7.19 <0.0001 
##  UN_region=Southern Africa           -1.9685 0.4225  -4.66 <0.0001 
##  UN_region=Southern Asia             -2.9759 0.2672 -11.14 <0.0001 
##  UN_region=Southern Europe           -1.0975 0.3178  -3.45 0.0007  
##  UN_region=Western Africa            -3.1318 0.2826 -11.08 <0.0001 
##  UN_region=Western Asia              -2.4243 0.2856  -8.49 <0.0001 
## 
#predict IQ from regions
(model_4a = rms::ols(IQ ~ UN_continent, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
##           IQ UN_continent    (weights) 
##           32            0            0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = IQ ~ UN_continent, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                   Model Likelihood     Discrimination    
##                      Ratio Test           Indexes        
##  Obs       195    LR chi2    161.52    R2       0.563    
##  sigma480.3925    d.f.            4    R2 adj   0.554    
##  d.f.      190    Pr(> chi2) 0.0000    g        9.445    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -24.902  -5.889  -2.204   2.161  17.760 
##  
##  
##                        Coef     S.E.   t      Pr(>|t|)
##  Intercept              96.3769 1.2937  74.49 <0.0001 
##  UN_continent=Africa   -23.7726 1.6638 -14.29 <0.0001 
##  UN_continent=Americas  -9.4749 1.8714  -5.06 <0.0001 
##  UN_continent=Asia      -7.0365 1.5616  -4.51 <0.0001 
##  UN_continent=Oceania   -5.0029 4.1545  -1.20 0.2300  
## 
(model_4b = rms::ols(IQ ~ UN_macroregion, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
##             IQ UN_macroregion      (weights) 
##             32              0              0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = IQ ~ UN_macroregion, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                   Model Likelihood     Discrimination    
##                      Ratio Test           Indexes        
##  Obs       195    LR chi2    452.06    R2       0.902    
##  sigma233.6685    d.f.           13    R2 adj   0.894    
##  d.f.      181    Pr(> chi2) 0.0000    g       11.492    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -15.3156  -2.6951   0.1014   2.1771  17.7614 
##  
##  
##                                    Coef     S.E.   t      Pr(>|t|)
##  Intercept                          98.3697 0.7809 125.97 <0.0001 
##  UN_macroregion=Caribbean          -21.0541 2.0010 -10.52 <0.0001 
##  UN_macroregion=Latin America      -12.8711 1.1170 -11.52 <0.0001 
##  UN_macroregion=Central Asia       -17.8884 1.9194  -9.32 <0.0001 
##  UN_macroregion=Africa             -27.6866 0.9589 -28.87 <0.0001 
##  UN_macroregion=Eastern Asia         6.7104 1.1904   5.64 <0.0001 
##  UN_macroregion=Eastern Europe      -2.8796 1.3431  -2.14 0.0334  
##  UN_macroregion=Melanesia          -14.5514 3.2034  -4.54 <0.0001 
##  UN_macroregion=Micronesia         -14.6908 6.9908  -2.10 0.0370  
##  UN_macroregion=MENA               -13.9946 1.0950 -12.78 <0.0001 
##  UN_macroregion=Polynesia          -10.9511 7.8121  -1.40 0.1627  
##  UN_macroregion=South-Eastern Asia  -9.0310 1.1871  -7.61 <0.0001 
##  UN_macroregion=Southern Asia      -16.6625 1.1030 -15.11 <0.0001 
##  UN_macroregion=Southern Europe     -4.2314 1.4842  -2.85 0.0049  
## 
(model_4c = rms::ols(IQ ~ UN_region, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
##        IQ UN_region (weights) 
##        32         0         0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = IQ ~ UN_region, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                   Model Likelihood     Discrimination    
##                      Ratio Test           Indexes        
##  Obs       195    LR chi2    472.11    R2       0.911    
##  sigma227.6908    d.f.           22    R2 adj   0.900    
##  d.f.      172    Pr(> chi2) 0.0000    g       11.668    
##  
##  Residuals
##  
##         Min         1Q     Median         3Q        Max 
##  -15.315574  -2.809845  -0.002582   1.879562  17.761353 
##  
##  
##                                      Coef     S.E.   t      Pr(>|t|)
##  Intercept                            98.9497 1.2838  77.08 <0.0001 
##  UN_region=Australia and New Zealand   0.1588 2.9892   0.05 0.9577  
##  UN_region=Caribbean                 -21.6341 2.2070  -9.80 <0.0001 
##  UN_region=Central America           -15.2926 1.8639  -8.20 <0.0001 
##  UN_region=Central Asia              -18.4684 2.1371  -8.64 <0.0001 
##  UN_region=Eastern Africa            -26.4858 1.5434 -17.16 <0.0001 
##  UN_region=Eastern Asia                6.1304 1.5539   3.95 0.0001  
##  UN_region=Eastern Europe             -3.4596 1.6679  -2.07 0.0395  
##  UN_region=Europe                     -7.1497 7.0273  -1.02 0.3104  
##  UN_region=Melanesia                 -15.1314 3.2882  -4.60 <0.0001 
##  UN_region=Micronesia                -15.2708 6.8900  -2.22 0.0280  
##  UN_region=Middle Africa             -31.1463 1.8268 -17.05 <0.0001 
##  UN_region=Northern Africa           -16.5471 1.7742  -9.33 <0.0001 
##  UN_region=Northern America           -0.8734 1.9390  -0.45 0.6530  
##  UN_region=Northern Europe            -0.9418 1.9221  -0.49 0.6248  
##  UN_region=Polynesia                 -11.5312 7.6821  -1.50 0.1352  
##  UN_region=South America             -12.5371 1.5982  -7.84 <0.0001 
##  UN_region=South-Eastern Asia         -9.6110 1.5515  -6.19 <0.0001 
##  UN_region=Southern Africa           -27.1290 2.3550 -11.52 <0.0001 
##  UN_region=Southern Asia             -17.2425 1.4914 -11.56 <0.0001 
##  UN_region=Southern Europe            -4.8114 1.7778  -2.71 0.0075  
##  UN_region=Western Africa            -29.1176 1.5775 -18.46 <0.0001 
##  UN_region=Western Asia              -13.4011 1.5938  -8.41 <0.0001 
## 
#summary stats
list(model_1a, model_1b, model_2a, model_2b, model_2c, model_2d, model_3a, model_3b, model_3c) %>% map_df(function(.) {
  data_frame(
    R2_adj = glance(.) %>% pull(adj.r.squared)
    )
})
#internet users
(model_4a = rms::ols(general_mental_sports ~ IQ + UN_region + internet_users_pct, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ             UN_region 
##                     0                    32                     0 
##    internet_users_pct             (weights) 
##                    37                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ IQ + UN_region + internet_users_pct, 
##      data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      187    LR chi2    320.90    R2       0.820    
##  sigma40.1955    d.f.           24    R2 adj   0.794    
##  d.f.     162    Pr(> chi2) 0.0000    g        1.270    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.29531 -0.51771 -0.04888  0.31471  1.25504 
##  
##  
##                                      Coef    S.E.   t     Pr(>|t|)
##  Intercept                           -3.5328 1.3683 -2.58 0.0107  
##  IQ                                   0.0443 0.0143  3.09 0.0024  
##  UN_region=Australia and New Zealand -0.2827 0.5277 -0.54 0.5929  
##  UN_region=Caribbean                 -1.2029 0.4901 -2.45 0.0152  
##  UN_region=Central America           -1.2245 0.3938 -3.11 0.0022  
##  UN_region=Central Asia              -1.1172 0.4563 -2.45 0.0154  
##  UN_region=Eastern Africa            -0.9520 0.4636 -2.05 0.0416  
##  UN_region=Eastern Asia              -0.2410 0.3151 -0.76 0.4455  
##  UN_region=Eastern Europe            -0.0658 0.3019 -0.22 0.8277  
##  UN_region=Europe                    -1.5947 1.2443 -1.28 0.2018  
##  UN_region=Melanesia                 -1.0733 0.6588 -1.63 0.1052  
##  UN_region=Micronesia                -0.5713 1.3827 -0.41 0.6800  
##  UN_region=Middle Africa             -0.6844 0.5395 -1.27 0.2065  
##  UN_region=Northern Africa           -1.4710 0.3923 -3.75 0.0002  
##  UN_region=Northern America           1.1763 0.3453  3.41 0.0008  
##  UN_region=Northern Europe           -0.1551 0.3398 -0.46 0.6486  
##  UN_region=Polynesia                 -0.9175 1.4782 -0.62 0.5357  
##  UN_region=South America             -0.5145 0.3315 -1.55 0.1226  
##  UN_region=South-Eastern Asia        -0.9346 0.3206 -2.92 0.0041  
##  UN_region=Southern Africa           -0.1520 0.5545 -0.27 0.7844  
##  UN_region=Southern Asia             -1.3441 0.3719 -3.61 0.0004  
##  UN_region=Southern Europe           -0.6240 0.3228 -1.93 0.0549  
##  UN_region=Western Africa            -0.8633 0.4905 -1.76 0.0803  
##  UN_region=Western Asia              -1.4259 0.3362 -4.24 <0.0001 
##  internet_users_pct                   0.0143 0.0033  4.32 <0.0001 
## 
(model_4b = rms::ols(general_mental_sports ~ IQ + UN_macroregion + internet_users_pct, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ        UN_macroregion 
##                     0                    32                     0 
##    internet_users_pct             (weights) 
##                    37                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ IQ + UN_macroregion + 
##      internet_users_pct, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      187    LR chi2    288.41    R2       0.786    
##  sigma42.6738    d.f.           15    R2 adj   0.767    
##  d.f.     171    Pr(> chi2) 0.0000    g        1.307    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.53274 -0.55993 -0.07711  0.30538  1.48396 
##  
##  
##                                    Coef    S.E.   t     Pr(>|t|)
##  Intercept                         -3.5431 1.3663 -2.59 0.0103  
##  IQ                                 0.0459 0.0146  3.15 0.0019  
##  UN_macroregion=Caribbean          -1.3622 0.4676 -2.91 0.0041  
##  UN_macroregion=Latin America      -0.9299 0.2727 -3.41 0.0008  
##  UN_macroregion=Central Asia       -1.2787 0.4304 -2.97 0.0034  
##  UN_macroregion=Africa             -0.9359 0.4247 -2.20 0.0289  
##  UN_macroregion=Eastern Asia       -0.4579 0.2705 -1.69 0.0922  
##  UN_macroregion=Eastern Europe     -0.2759 0.2525 -1.09 0.2760  
##  UN_macroregion=Melanesia          -1.2157 0.6624 -1.84 0.0682  
##  UN_macroregion=Micronesia         -0.7213 1.4512 -0.50 0.6198  
##  UN_macroregion=MENA               -1.6191 0.2799 -5.78 <0.0001 
##  UN_macroregion=Polynesia          -1.0806 1.5547 -0.70 0.4880  
##  UN_macroregion=South-Eastern Asia -1.1102 0.2688 -4.13 <0.0001 
##  UN_macroregion=Southern Asia      -1.4915 0.3245 -4.60 <0.0001 
##  UN_macroregion=Southern Europe    -0.8328 0.2795 -2.98 0.0033  
##  internet_users_pct                 0.0153 0.0033  4.57 <0.0001 
## 
(model_4c = rms::ols(general_mental_sports ~ IQ + UN_continent + internet_users_pct, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ          UN_continent 
##                     0                    32                     0 
##    internet_users_pct             (weights) 
##                    37                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ IQ + UN_continent + 
##      internet_users_pct, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      187    LR chi2    249.57    R2       0.737    
##  sigma46.1461    d.f.            6    R2 adj   0.728    
##  d.f.     180    Pr(> chi2) 0.0000    g        1.326    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.0891 -0.7170 -0.1079  0.3268  1.4773 
##  
##  
##                        Coef    S.E.   t      Pr(>|t|)
##  Intercept             -7.3187 0.7168 -10.21 <0.0001 
##  IQ                     0.0812 0.0084   9.69 <0.0001 
##  UN_continent=Africa    0.1827 0.2421   0.75 0.4515  
##  UN_continent=Americas  0.1330 0.1931   0.69 0.4917  
##  UN_continent=Asia     -0.5275 0.1746  -3.02 0.0029  
##  UN_continent=Oceania  -0.2379 0.4148  -0.57 0.5671  
##  internet_users_pct     0.0145 0.0032   4.53 <0.0001 
## 
(model_4d = rms::ols(general_mental_sports ~ rcs(IQ, 3) + UN_region + internet_users_pct, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ             UN_region 
##                     0                    32                     0 
##    internet_users_pct             (weights) 
##                    37                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3) + UN_region + 
##      internet_users_pct, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      187    LR chi2    335.03    R2       0.833    
##  sigma38.8257    d.f.           25    R2 adj   0.807    
##  d.f.     161    Pr(> chi2) 0.0000    g        1.259    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.25314 -0.42977 -0.02946  0.31131  1.31611 
##  
##  
##                                      Coef    S.E.   t     Pr(>|t|)
##  Intercept                            0.3568 1.7160  0.21 0.8356  
##  IQ                                  -0.0143 0.0215 -0.66 0.5085  
##  IQ'                                  0.0884 0.0249  3.55 0.0005  
##  UN_region=Australia and New Zealand -0.2948 0.5097 -0.58 0.5639  
##  UN_region=Caribbean                 -0.6933 0.4946 -1.40 0.1629  
##  UN_region=Central America           -0.5190 0.4291 -1.21 0.2282  
##  UN_region=Central Asia              -0.4117 0.4834 -0.85 0.3957  
##  UN_region=Eastern Africa            -0.6108 0.4580 -1.33 0.1843  
##  UN_region=Eastern Asia              -0.7545 0.3369 -2.24 0.0265  
##  UN_region=Eastern Europe             0.1738 0.2993  0.58 0.5622  
##  UN_region=Europe                    -1.0857 1.2104 -0.90 0.3711  
##  UN_region=Melanesia                 -0.3355 0.6694 -0.50 0.6169  
##  UN_region=Micronesia                 0.1704 1.3518  0.13 0.8998  
##  UN_region=Middle Africa             -0.5883 0.5219 -1.13 0.2613  
##  UN_region=Northern Africa           -0.7318 0.4322 -1.69 0.0924  
##  UN_region=Northern America           1.2232 0.3338  3.67 0.0003  
##  UN_region=Northern Europe           -0.0799 0.3289 -0.24 0.8085  
##  UN_region=Polynesia                 -0.2300 1.4409 -0.16 0.8734  
##  UN_region=South America              0.1881 0.3763  0.50 0.6179  
##  UN_region=South-Eastern Asia        -0.3786 0.3469 -1.09 0.2768  
##  UN_region=Southern Africa            0.2074 0.5451  0.38 0.7041  
##  UN_region=Southern Asia             -0.6282 0.4119 -1.53 0.1292  
##  UN_region=Southern Europe           -0.3105 0.3240 -0.96 0.3393  
##  UN_region=Western Africa            -0.6425 0.4778 -1.34 0.1806  
##  UN_region=Western Asia              -0.7276 0.3795 -1.92 0.0570  
##  internet_users_pct                   0.0132 0.0032  4.11 <0.0001 
## 
(model_4e = rms::ols(general_mental_sports ~ rcs(IQ, 3) + UN_macroregion + internet_users_pct, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ        UN_macroregion 
##                     0                    32                     0 
##    internet_users_pct             (weights) 
##                    37                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3) + UN_macroregion + 
##      internet_users_pct, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      187    LR chi2    300.61    R2       0.800    
##  sigma41.4265    d.f.           16    R2 adj   0.781    
##  d.f.     170    Pr(> chi2) 0.0000    g        1.287    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.5379 -0.4451 -0.0728  0.2606  1.5423 
##  
##  
##                                    Coef    S.E.   t     Pr(>|t|)
##  Intercept                          0.0638 1.7016  0.04 0.9701  
##  IQ                                -0.0088 0.0215 -0.41 0.6834  
##  IQ'                                0.0858 0.0253  3.38 0.0009  
##  UN_macroregion=Caribbean          -0.8632 0.4773 -1.81 0.0723  
##  UN_macroregion=Latin America      -0.2599 0.3306 -0.79 0.4328  
##  UN_macroregion=Central Asia       -0.5966 0.4639 -1.29 0.2002  
##  UN_macroregion=Africa             -0.6685 0.4198 -1.59 0.1132  
##  UN_macroregion=Eastern Asia       -1.0094 0.3090 -3.27 0.0013  
##  UN_macroregion=Eastern Europe     -0.0763 0.2521 -0.30 0.7625  
##  UN_macroregion=Melanesia          -0.5102 0.6759 -0.75 0.4514  
##  UN_macroregion=Micronesia         -0.0132 1.4242 -0.01 0.9926  
##  UN_macroregion=MENA               -0.9371 0.3383 -2.77 0.0062  
##  UN_macroregion=Polynesia          -0.4305 1.5214 -0.28 0.7776  
##  UN_macroregion=South-Eastern Asia -0.5920 0.3025 -1.96 0.0520  
##  UN_macroregion=Southern Asia      -0.8026 0.3751 -2.14 0.0338  
##  UN_macroregion=Southern Europe    -0.5586 0.2832 -1.97 0.0501  
##  internet_users_pct                 0.0142 0.0033  4.34 <0.0001 
## 
(model_4f = rms::ols(general_mental_sports ~ rcs(IQ, 3) + UN_continent + internet_users_pct, data = natdata, weights = sqrt(natdata$population2017)))
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ          UN_continent 
##                     0                    32                     0 
##    internet_users_pct             (weights) 
##                    37                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3) + UN_continent + 
##      internet_users_pct, data = natdata, weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      187    LR chi2    287.38    R2       0.785    
##  sigma41.8249    d.f.            7    R2 adj   0.777    
##  d.f.     179    Pr(> chi2) 0.0000    g        1.266    
##  
##  Residuals
##  
##      Min      1Q  Median      3Q     Max 
##  -2.2665 -0.5647 -0.1021  0.2758  1.3893 
##  
##  
##                        Coef    S.E.   t     Pr(>|t|)
##  Intercept             -0.3489 1.2779 -0.27 0.7851  
##  IQ                    -0.0088 0.0161 -0.54 0.5873  
##  IQ'                    0.0878 0.0139  6.33 <0.0001 
##  UN_continent=Africa   -0.3204 0.2334 -1.37 0.1715  
##  UN_continent=Americas  0.2390 0.1758  1.36 0.1757  
##  UN_continent=Asia     -0.4662 0.1585 -2.94 0.0037  
##  UN_continent=Oceania  -0.1704 0.3761 -0.45 0.6511  
##  internet_users_pct     0.0151 0.0029  5.20 <0.0001 
## 
#test LR
#linear variants
lrtest(model_2d, model_4b)
## 
## Model 1: general_mental_sports ~ IQ + UN_macroregion
## Model 2: general_mental_sports ~ IQ + UN_macroregion + internet_users_pct
## 
## L.R. Chisq       d.f.          P 
##    9.29986    1.00000    0.00229
1 - (model_4b$coefficients["IQ"] / model_2d$coefficients["IQ"])
##    IQ 
## 0.348
#spline variants
lrtest(model_2b, model_4e)
## 
## Model 1: general_mental_sports ~ rcs(IQ, 3) + UN_macroregion
## Model 2: general_mental_sports ~ rcs(IQ, 3) + UN_macroregion + internet_users_pct
## 
## L.R. Chisq       d.f.          P 
##     6.3590     1.0000     0.0117
1 - (model_4e$coefficients["IQ'"] / model_2b$coefficients["IQ'"])
##   IQ' 
## 0.125
#average
((1 - (model_4b$coefficients["IQ"] / model_2d$coefficients["IQ"])) + (1 - (model_4e$coefficients["IQ'"] / model_2b$coefficients["IQ'"]))) / 2
##    IQ 
## 0.237
#Jensen's method
fa_Jensens_method(gms_factor, natdata, "internet_users_pct") +
  scale_y_continuous("Correlation with internet users")
## Using Pearson correlations for the criterion-indicators relationships.

Nonlinear IQ relationshiop visualization

#simple spline
model_1b
## Frequencies of Missing Values Due to Each Variable
## general_mental_sports                    IQ             (weights) 
##                     0                    32                     0 
## 
## Linear Regression Model
##  
##  rms::ols(formula = general_mental_sports ~ rcs(IQ, 3), data = natdata, 
##      weights = sqrt(natdata$population2017))
##  
##  
##                  Model Likelihood     Discrimination    
##                     Ratio Test           Indexes        
##  Obs      195    LR chi2    221.62    R2       0.679    
##  sigma49.5484    d.f.            2    R2 adj   0.676    
##  d.f.     192    Pr(> chi2) 0.0000    g        1.099    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -3.53472 -0.42170 -0.04563  0.34492  2.02523 
##  
##  
##            Coef    S.E.   t     Pr(>|t|)
##  Intercept -3.1654 0.9761 -3.24 0.0014  
##  IQ         0.0309 0.0127  2.43 0.0161  
##  IQ'        0.0765 0.0138  5.54 <0.0001 
## 
#visualize model nonlinear IQ relationship
model_vis = tibble(
  IQ = seq(min(natdata$IQ, na.rm=T), max(natdata$IQ, na.rm=T), by = 1),
  UN_macroregion = natdata$UN_macroregion[3]
)
model_vis$fit = predict(model_1b, newdata = model_vis)
ggplot(model_vis, aes(IQ, fit)) +
  geom_line()

#knot IQs
model_1b$Design$parms$IQ
## [1] 69.6 84.6 98.7
#predictions
knot_preds = predict(model_1b, newdata = tibble(IQ = model_1b$Design$parms$IQ))
knot_preds
##      1      2      3 
## -1.016 -0.247  1.590
#slopes
(knot_preds[2] - knot_preds[1]) / (model_1b$Design$parms$IQ[2] - model_1b$Design$parms$IQ[1])
##      2 
## 0.0512
(knot_preds[3] - knot_preds[2]) / (model_1b$Design$parms$IQ[3] - model_1b$Design$parms$IQ[2])
##    3 
## 0.13
#maybe it is slope at the midpoint of the interval
knot_midpoints = c(
  model_1b$Design$parms$IQ[2] - ((model_1b$Design$parms$IQ[2] - model_1b$Design$parms$IQ[1])/2),
  model_1b$Design$parms$IQ[2] + ((model_1b$Design$parms$IQ[3] - model_1b$Design$parms$IQ[2])/2)
)
knot_midpoints
## [1] 77.1 91.7
knot_midpoints_preds = predict(model_1b, newdata = tibble(IQ = c(knot_midpoints - .1, knot_midpoints + .1)))
knot_midpoints_preds
##      1      2      3      4 
## -0.751  0.557 -0.741  0.584
#slopes
(knot_midpoints_preds[3] - knot_midpoints_preds[1]) / .2
##      3 
## 0.0461
(knot_midpoints_preds[4] - knot_midpoints_preds[2]) / .2
##     4 
## 0.135

Nigeria and other specific countries

#residuals
natdata %>% filter(Country_EN == "Nigeria") %>% select(!!all_names)
natdata %>% filter(Country_EN == "Nigeria") %>% select(general_mental_sports)
natdata$model_1a_resid = model_1a %>% resid() %>% standardize()
natdata %>% filter(Country_EN == "Nigeria") %>% select(model_1a_resid)
#game resids
game_resids = natdata[c("country", all_names)]

#dispersion by country
games_sum_stats = natdata %>% plyr::adply(.margins = 1, .expand = F, .id = NULL, .fun = function(.) {
  nums = .[all_names] %>% unlist()
  
  data_frame(
    country = .$country,
    factor_score = .$general_mental_sports,
    mean = mean(nums),
    median = median(nums),
    sd = sd(nums),
    mad = mad(nums),
  )
})

wtd.cors(games_sum_stats[-1])
##              factor_score  mean median    sd   mad
## factor_score        1.000 0.986  0.990 0.496 0.498
## mean                0.986 1.000  0.985 0.518 0.488
## median              0.990 0.985  1.000 0.488 0.484
## sd                  0.496 0.518  0.488 1.000 0.868
## mad                 0.498 0.488  0.484 0.868 1.000
#predict Nigerian IQ
natdata$pred_IQ = ols(IQ ~ rcs(general_mental_sports, 3), data = natdata, weights = sqrt(natdata$population2017)) %>% predict()
natdata %>% filter(country == "Nigeria") %>% pull(pred_IQ)
## [1] 73.1

All models

#models to fit
models = expand.grid(
  complexity = c("basic", "full"),
  game = all_names,
  zeros = c(T, F)
)

#fit
models$fit = plyr::alply(models, 1, function(r) {
  #make model
  y_var = sprintf("%s_count", r$game)
  if (r$complexity == "basic") {
    this_model = as.formula(sprintf("%s ~ pop * IQ", y_var))
  } else {
    this_model = as.formula(sprintf("%s ~ pop * IQ + UN_macroregion", y_var))
  }

  #fit
  if (r$zeros) {
    y = rms::ols(this_model, data = natdata)
  } else {
    y = rms::ols(this_model, data = natdata %>% filter(.[[y_var]] != 0))
  }
  
  y
})

#add metrics
models$R2a = NA
models$n = NA
models$IQ_beta = NA
models$IQ_p = NA
models$IQ_se = NA

for (i in 1:nrow(models)) {
  #fit
  this_fit = models$fit[[i]]
  
  #hack solution
  mod_print = capture.output(print(this_fit))
  
  #meta
  models$n[i] = this_fit$stats[1]
  models$R2a[i] = mod_print %>% str_subset("R2 adj") %>% str_match("\\d+\\.\\d+") %>% as.numeric()
  
  models$IQ_beta[i] = this_fit$coefficients[3]
  
  #line with IQ
  IQ_line = mod_print[which(str_detect(mod_print, "^ IQ"))]
  IQ_nums = str_match_all(IQ_line, "\\d+\\.\\d+") %>% .[[1]] %>% as.vector() %>% as.numeric()
  
  models$IQ_p[i] = IQ_nums[4]
  models$IQ_se[i] = IQ_nums[2]
}

#meta-analyze
#simple
full_imp = models %>% filter(complexity == "full", zeros)
print(describe(full_imp$IQ_beta), digits = 4)
##    vars  n   mean     sd median trimmed    mad     min   max range   skew
## X1    1 12 0.7619 0.7279 0.5696  0.6847 0.4569 0.01963 2.276 2.257 0.9055
##    kurtosis     se
## X1  -0.7137 0.2101
#RE
(RE_fit = rma(full_imp$IQ_beta,
    sei = full_imp$IQ_se))
## 
## Random-Effects Model (k = 12; tau^2 estimator: REML)
## 
## tau^2 (estimated amount of total heterogeneity): 0 (SE = 0.0745)
## tau (square root of estimated tau^2 value):      0
## I^2 (total heterogeneity / total variability):   0.00%
## H^2 (total variability / sampling variability):  1.00
## 
## Test for Heterogeneity:
## Q(df = 11) = 7.6702, p-val = 0.7425
## 
## Model Results:
## 
## estimate      se    zval    pval   ci.lb   ci.ub 
##   0.4583  0.1362  3.3654  0.0008  0.1914  0.7253  *** 
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#forest
GG_forest(RE_fit, .names = full_imp$game)

GG_save("figs/RE_forest.png")

#no Go
full_imp_noGo = models %>% filter(complexity == "full", zeros, game != "go")
print(describe(full_imp_noGo$IQ_beta), digits = 4)
##    vars  n   mean    sd median trimmed    mad    min   max range   skew
## X1    1 11 0.8294 0.723 0.5736  0.7493 0.3022 0.1036 2.276 2.173 0.8511
##    kurtosis    se
## X1  -0.9037 0.218
(RE_fit = rma(full_imp_noGo$IQ_beta,
    sei = full_imp_noGo$IQ_se))
## 
## Random-Effects Model (k = 11; tau^2 estimator: REML)
## 
## tau^2 (estimated amount of total heterogeneity): 0 (SE = 0.0855)
## tau (square root of estimated tau^2 value):      0
## I^2 (total heterogeneity / total variability):   0.00%
## H^2 (total variability / sampling variability):  1.00
## 
## Test for Heterogeneity:
## Q(df = 10) = 5.6659, p-val = 0.8425
## 
## Model Results:
## 
## estimate      se    zval    pval   ci.lb   ci.ub 
##   0.5431  0.1488  3.6506  0.0003  0.2515  0.8347  *** 
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#other designs
#no regions, but with 0's
basic_Imp = models %>% filter(complexity == "basic", zeros)
print(describe(basic_Imp$IQ_beta), digits = 4)
##    vars  n  mean    sd median trimmed    mad    min   max range  skew kurtosis
## X1    1 12 1.365 1.131 0.9811   1.182 0.5956 0.3116 4.252 3.941 1.325   0.7882
##        se
## X1 0.3265
(RE_fit = rma(basic_Imp$IQ_beta,
    sei = basic_Imp$IQ_se))
## 
## Random-Effects Model (k = 12; tau^2 estimator: REML)
## 
## tau^2 (estimated amount of total heterogeneity): 0.7625 (SE = 0.3791)
## tau (square root of estimated tau^2 value):      0.8732
## I^2 (total heterogeneity / total variability):   93.56%
## H^2 (total variability / sampling variability):  15.54
## 
## Test for Heterogeneity:
## Q(df = 11) = 74.6685, p-val < .0001
## 
## Model Results:
## 
## estimate      se    zval    pval   ci.lb   ci.ub 
##   1.2493  0.2735  4.5675  <.0001  0.7132  1.7855  *** 
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#regions and no 0's
full_noZeros = models %>% filter(complexity == "full", !zeros)
print(describe(full_noZeros$IQ_beta), digits = 4)
##    vars  n   mean    sd median trimmed  mad    min   max range   skew kurtosis
## X1    1 12 0.4097 4.434  2.017   1.351 1.62 -12.23 3.633 15.86 -1.901    2.621
##      se
## X1 1.28
(RE_fit = rma(full_noZeros$IQ_beta,
    sei = full_noZeros$IQ_se))
## 
## Random-Effects Model (k = 12; tau^2 estimator: REML)
## 
## tau^2 (estimated amount of total heterogeneity): 0 (SE = 1.2051)
## tau (square root of estimated tau^2 value):      0
## I^2 (total heterogeneity / total variability):   0.00%
## H^2 (total variability / sampling variability):  1.00
## 
## Test for Heterogeneity:
## Q(df = 11) = 5.8896, p-val = 0.8806
## 
## Model Results:
## 
## estimate      se    zval    pval   ci.lb   ci.ub 
##   1.6735  0.5574  3.0022  0.0027  0.5810  2.7661  ** 
## 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Additional regressions

Carried out later in 2019 for talk on Youtube

#models
natdata$IQ_z = natdata$IQ %>% standardize()
natdata$internet_users_pct2 = natdata$internet_users_pct/100
natdata$internet_users_pct_z = natdata$internet_users_pct %>% standardize()

regs = list(
  simple = rms::ols(general_mental_sports ~ IQ_z, data = natdata, weights = sqrt(natdata$population2017)),
  simple_nowt = rms::ols(general_mental_sports ~ IQ_z, data = natdata),
  nonlinear = rms::ols(general_mental_sports ~ rcs(IQ_z, 3), data = natdata, weights = sqrt(natdata$population2017)),
  continents = rms::ols(general_mental_sports ~ IQ_z + UN_continent, data = natdata, weights = sqrt(natdata$population2017)),
  macroregions = rms::ols(general_mental_sports ~ IQ_z + UN_macroregion, data = natdata, weights = sqrt(natdata$population2017)),
  internet = rms::ols(general_mental_sports ~ IQ_z + internet_users_pct2, data = natdata, weights = sqrt(natdata$population2017)),
  internet_z = rms::ols(general_mental_sports ~ IQ_z + internet_users_pct_z, data = natdata, weights = sqrt(natdata$population2017)),
  full = rms::ols(general_mental_sports ~ IQ_z + internet_users_pct_z + UN_macroregion, data = natdata, weights = sqrt(natdata$population2017))
)

#table
regs %>% 
  summarize_models()

Open science

#main data
write_csv(cbind(country = natdata$country, factor_score = natdata$general_mental_sports, games_sum_stats[-1], game_resids[-1], natdata %>% select(ends_with("_count"), -starts_with("log_"))), "data/out/main_data.csv", na = "")

#all data
write_rds(natdata, "data/out/all_data.rds", compress = "xz")

#chess low level data
write_rds(fide, "data/out/fide.rds")