options(digits = 2)
library(pacman)
p_load(kirkegaard, googlesheets, plyr, metafor)
#auth
gs_auth()
#load
gs = gs_url("https://docs.google.com/spreadsheets/d/1drrchXhJs5elWCvtYCOwMX7zG22_A_WD73KcAyBfTn4/edit#gid=211583593")
## Sheet-identifying info appears to be a browser URL.
## googlesheets will attempt to extract sheet key from the URL.
## Putative key: 1drrchXhJs5elWCvtYCOwMX7zG22_A_WD73KcAyBfTn4
## Sheet successfully identified: "Lynn et al, Regional IQ studies review, 2017, Intelligence, dataset"
d = gs_read(gs) %>% df_legalize_names()
## Accessing worksheet titled 'data'.
## Warning: Missing column names filled in: 'X20' [20], 'X21' [21]
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Name = col_character(),
## Level = col_integer(),
## `Unit N` = col_integer(),
## `Population 2010` = col_number(),
## Authors = col_character(),
## X20 = col_character(),
## X21 = col_character(),
## `Study Number` = col_integer()
## )
## See spec(...) for full column specifications.
#ad hoc functions
add_vectorized = function(x, y) {
rowMeans(cbind(x, y), na.rm = T) %>% plyr::mapvalues(NaN, NA)
}
#vec2df
vec2df = function(x) {
y = x %>% as.data.frame() %>% t() %>% as.data.frame()
names(y) = names(x)
y
}
#NaN2NA
NaN2NA = function(x) {
x[is.nan(x)] = NA
x
}
#remove NAs from output, hack solution
removeNAs = function(x) {
#convert all to strings
x %>% map_df(~as.character(.) %>% mapvalues(NA, ""))
}
#unique studies
length(unique(d$Authors))
## [1] 45
#overview of levels
table2(d$Level)
#count by country
table2(d$Country)
To avoid double counting and redundant data.
#first, average the alternate columns
d2 = d %>% mutate(
Income = add_vectorized(Per_capita_income_1, Per_capita_income_2),
Educational_attainment = add_vectorized(Educational_attainment_1, Educational_attainment_2),
Life_expectancy = add_vectorized(Life_expectancy_1, Life_expectancy_2),
Infant_mortality = add_vectorized(Infant_mortality_1, Infant_mortality_2),
Fertility = add_vectorized(Fertility_1, Fertility_2),
Crime = add_vectorized(Crime_1, Crime_2),
Health = add_vectorized(Life_expectancy, Infant_mortality*-1)
)
#vars
primary_vars = c("Income", "Educational_attainment", "Fertility", "Crime", "Health", "General_SES")
#aggregate without countries
d2_country = ddply(d2, "Country", function(x) {
#begin with a copy
y = x[1, c("Country", "Population_2010")]
#add n studies
y$Study_N = nrow(x)
#add average data for numeric cols of interest
y2 = colMeans(x[c("Unit_N", primary_vars)], na.rm = T) %>% NaN2NA() %>% vec2df()
cbind(y, y2)
})
#aggregate without countries
d2_country_levels = ddply(d2, c("Country", "Level"), function(x) {
#begin with a copy
y = x[1, c("Country", "Level", "Population_2010")]
#add n studies
y$Study_N = nrow(x)
#add average data for numeric cols of interest
y2 = colMeans(x[c("Unit_N", primary_vars)], na.rm = T) %>% NaN2NA() %>% vec2df()
cbind(y, y2)
})
#output aggregated tables back to spreadsheet
gs_edit_cells(gs, ws = "aggregated_country", input = d2_country %>% removeNAs())
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## Range affected by the update: "R1C1:R23C10"
## Worksheet "aggregated_country" successfully updated with 230 new value(s).
gs_edit_cells(gs, ws = "aggregated_country_level", input = d2_country_levels %>% removeNAs())
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## The following `from` values were not present in `x`: NA
## Range affected by the update: "R1C1:R27C11"
## Worksheet "aggregated_country_level" successfully updated with 297 new value(s).
#simple means
colMeans(d2_country[primary_vars], na.rm = T)
## Income Educational_attainment Fertility
## 0.56 0.59 -0.52
## Crime Health General_SES
## -0.25 0.49 0.55
colMeans(d2_country_levels[primary_vars], na.rm = T)
## Income Educational_attainment Fertility
## 0.55 0.56 -0.51
## Crime Health General_SES
## -0.30 0.49 0.58
#weighted by population
map_df(d2_country[primary_vars], ~wtd_mean(., w = d2_country$Population_2010))
#weighted by sqrt population
map_df(d2_country[primary_vars], ~wtd_mean(., w = d2_country$Population_2010 %>% sqrt()))
#weighted by sqrt N divisions
map_df(d2_country[primary_vars], ~wtd_mean(., w = d2_country$Unit_N %>% sqrt()))
#overall
d2_country_levels %>%
gather_("outcome", "r", primary_vars) %>%
ggplot(aes(Country, r, color = outcome, shape = factor(Level), group = outcome)) +
geom_point(size = 1.5) +
theme_bw() +
theme(axis.text.x = element_text(angle = -30, hjust = 0))
## Warning: Removed 76 rows containing missing values (geom_point).
GG_save("figs/overview.png")
## Warning: Removed 76 rows containing missing values (geom_point).
#bars
d2_country %>%
gather_("outcome", "r", primary_vars) %>%
ggplot(aes(Country, r, fill = outcome)) +
geom_bar(stat = "identity", position = "dodge") +
theme_bw() +
theme(axis.text.x = element_text(angle = -30, hjust = 0))
## Warning: Removed 63 rows containing missing values (geom_bar).
GG_save("figs/overview2.png")
## Warning: Removed 63 rows containing missing values (geom_bar).