Basics

Data summary

# Per-project overview: distinct surveys, countries and waves, plus the
# earliest and latest survey year. min()/max() are called without na.rm,
# so a missing year in a project yields NA for that project.
metadata1 %>%
  group_by(T_SURVEY_NAME) %>%
  summarise(
    n_surveys   = n_distinct(T_SURVEY_ID),
    n_countries = n_distinct(T_SURVEY_COUNTRY),
    n_waves     = n_distinct(T_SURVEY_ROUND),
    min_year    = min(T_SURVEY_YEAR),
    max_year    = max(T_SURVEY_YEAR)
  )

There are 45 countries altogether. Of those, 17 are not included in all projects:

# Countries that are absent from at least one project. Survey counts are
# widened to one column per project, so a country missing from a project
# has NA in that project's column.
metadata1 %>%
  count(T_SURVEY_NAME, T_SURVEY_COUNTRY) %>%
  pivot_wider(names_from = T_SURVEY_NAME, values_from = n) %>%
  filter(is.na(EQLS) | is.na(ESS) | is.na(EVS) | is.na(ISSP)) %>%
  # pivot_wider() keeps rows in order of first appearance; sort by country
  # so the listing is alphabetical
  arrange(T_SURVEY_COUNTRY)

Excess bias

by project

Proportion of surveys with excess (> 1.96) absolute bias:

# Share of surveys per project whose absolute bias exceeds 1.96.
# The denominator is every survey in the project, including surveys with a
# missing bias value. Projects without any flagged survey are reported with
# a share of 0 rather than being omitted.
# NOTE(review): the plots and the by-country table drop missing bias values
# before computing shares -- confirm which denominator is intended here.
metadata1 %>%
  mutate(is_sig = Q_Abs_bias_internal_criteria > 1.96) %>%
  group_by(T_SURVEY_NAME) %>%
  summarise(prop_sig = round(sum(is_sig, na.rm = TRUE) / n(), 3))

2008 was a difficult year for surveys?

Numbers above the bars give the number of surveys for which bias could be calculated.

# Bar chart of the share of surveys with absolute bias > 1.96 per round,
# one facet per project. The label above each bar is the number of surveys
# in that round with a non-missing bias value.
#   data          survey-level data with the columns used below
#   label_offset  vertical gap between the top of a bar and its label
# Returns a ggplot object.
plot_excess_bias_by_round <- function(data, label_offset) {
  data %>%
    drop_na(Q_Abs_bias_internal_criteria) %>%
    mutate(is_sig = Q_Abs_bias_internal_criteria > 1.96,
           # strip the leading project prefix, e.g. "ESS1" -> "1"
           round = gsub("^[A-Z]{1,4}", "", T_SURVEY_ROUND)) %>%
    group_by(T_SURVEY_NAME, round) %>%
    # summarising (instead of filtering on is_sig) keeps rounds in which no
    # survey exceeds the threshold; they appear as a zero-height bar
    summarise(prop_sig = mean(is_sig),
              nsurveys = n()) %>%
    ungroup() %>%
    ggplot(aes(x = round, y = prop_sig)) +
    geom_col(fill = "gray70") +
    geom_text(aes(y = prop_sig + label_offset, label = nsurveys),
              col = "gray20") +
    theme_bw() +
    facet_wrap("T_SURVEY_NAME", scales = "free_x")
}

# Top row: every project except ISSP.
part1 <- metadata1 %>%
  filter(T_SURVEY_NAME != "ISSP") %>%
  plot_excess_bias_by_round(label_offset = 0.03)

# Bottom row: ISSP only, with the x-axis labels rotated.
part2 <- metadata1 %>%
  filter(T_SURVEY_NAME == "ISSP") %>%
  plot_excess_bias_by_round(label_offset = 0.035) +
  theme(axis.text.x = element_text(angle = 90))

# patchwork: stack the two plots vertically
part1 / part2

by country

Surveys with absolute bias > 1.96 by country.

# Per-country tally of surveys whose absolute bias exceeds 1.96.
# Surveys with a missing bias value are excluded from every count.
#   total_surveys / total_projects  surveys and distinct projects per country
#   bias_surveys  / bias_projects   the same, restricted to flagged surveys
#   prop_bias                       bias_surveys / total_surveys, 3 decimals
# Countries without any flagged survey are kept with zero counts.
metadata1 %>%
  drop_na(Q_Abs_bias_internal_criteria) %>%
  mutate(is_sig = Q_Abs_bias_internal_criteria > 1.96) %>%
  group_by(T_SURVEY_COUNTRY) %>%
  summarise(total_surveys = n(),
            bias_surveys = sum(is_sig),
            prop_bias = round(bias_surveys / total_surveys, 3),
            total_projects = n_distinct(T_SURVEY_NAME),
            bias_projects = n_distinct(T_SURVEY_NAME[is_sig])) %>%
  arrange(desc(prop_bias))

Changes over time

Bias

Orange dots indicate surveys from the waves analyzed in Kohler 2007.

# Signed bias by survey year. K2007 flags surveys from the five waves listed
# in the mutate() call; flagged surveys are drawn in orange, the rest in gray.
metadata1 %>%
  mutate(K2007 = T_SURVEY_ROUND %in%
           c("EQLS1", "ESS1", "ESS2", "EVS1999", "ISSP2002")) %>%
  ggplot(aes(x = T_SURVEY_YEAR, y = Q_Bias_internal_criteria, col = K2007)) +
  geom_point(size = 2, alpha = 0.5) +
  # named values bind each colour to its flag value explicitly
  scale_color_manual(values = c("FALSE" = "gray50", "TRUE" = "darkorange")) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("")

Surveys with absolute bias exceeding 5 are labelled.

# Signed bias by survey year. gghighlight() emphasises the surveys whose
# absolute bias exceeds 5; geom_text_repel() adds survey-ID labels.
ggplot(data = metadata1,
       mapping = aes(x = T_SURVEY_YEAR, y = Q_Bias_internal_criteria)) +
  geom_point(size = 2) +
  gghighlight(Q_Abs_bias_internal_criteria > 5) +
  geom_text_repel(mapping = aes(label = T_SURVEY_ID), size = 3.5) +
  xlab("") +
  theme_bw()

Absolute bias and response rates

Note the different Y axis ranges.

# Response rate and absolute bias over survey years, one panel per
# project x measure, each with a loess smoother. Restricted to surveys with
# mode "f2f"; the "no info", "insuff" and "non-prob" sample types are excluded.
metadata1 %>%
  filter(!S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob"),
         mode == "f2f") %>%
  # filter(Q_Abs_bias_internal_criteria < 4.5) %>%
  select(T_SURVEY_NAME, T_SURVEY_COUNTRY, T_SURVEY_ROUND, T_SURVEY_YEAR,
         Q_Abs_bias_internal_criteria, S_RR1_CALC_VALUE) %>%
  # long format: one row per survey and measure
  pivot_longer(cols = c(S_RR1_CALC_VALUE, Q_Abs_bias_internal_criteria),
               names_to = "var", values_to = "value") %>%
  drop_na(value) %>%
  ggplot(aes(x = T_SURVEY_YEAR, y = value)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "loess") +
  theme_bw() +
  # free_y: each panel gets its own y-axis range
  facet_wrap(T_SURVEY_NAME ~ var, scales = "free_y", ncol = 2)

Absolute bias by sample type

# Boxplots of absolute bias by sample type, one panel per project.
# Left column: ESS and EQLS.
part1 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ESS", "EQLS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  xlab("") +
  theme_bw()

# Right column: ISSP and EVS; the y-axis title is blanked here.
part2 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ISSP", "EVS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  xlab("") +
  ylab("") +
  theme_bw()

# patchwork: side by side, right column twice as wide as the left
part1 + part2 + plot_layout(widths = c(1, 2))

Excluding absolute bias > 4.

# Same boxplots as the uncapped version, with the y scale limited to [0, 4].
# ylim() drops observations outside the limits before the boxplot
# statistics are computed.
# Left column: ESS and EQLS.
part1 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ESS", "EQLS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  ylim(0, 4) +
  xlab("") +
  theme_bw()

# Right column: ISSP and EVS; the y-axis title is blanked here.
part2 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ISSP", "EVS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  ylim(0, 4) +
  xlab("") +
  ylab("") +
  theme_bw()

# patchwork: side by side, right column twice as wide as the left
part1 + part2 + plot_layout(widths = c(1, 2))

Bias and response rates

Overall

Includes face-to-face surveys only (`mode == "f2f"`). Excludes sample types: no information, insufficient, non-probability.

# Absolute bias against response rate for "f2f" surveys, leaving out the
# "no info", "insuff" and "non-prob" sample types. Points are coloured by
# project; a single loess curve is fitted across all points.
ggplot(
  data = filter(metadata1,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(aes(col = T_SURVEY_NAME), size = 2, alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  theme_bw() +
  xlim(0, 1)

Includes face-to-face surveys only (`mode == "f2f"`). Excludes RR >= 0.9 and sample types: no information, insufficient, non-probability.

# Absolute bias against response rate for "f2f" surveys with a response
# rate below 0.9, leaving out the "no info", "insuff" and "non-prob" sample
# types. Points are coloured by project; one loess curve across all points.
ggplot(
  data = filter(metadata1,
                S_RR1_CALC_VALUE < 0.9,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(aes(col = T_SURVEY_NAME), size = 2, alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  theme_bw() +
  xlim(0, 1)

By sample type

Includes face-to-face surveys only (`mode == "f2f"`). Excludes RR >= 0.9 and sample types: no information, insufficient, non-probability.

# Absolute bias against response rate, one panel per sample type, each with
# its own loess curve. Uses "f2f" surveys with a response rate below 0.9 and
# leaves out the "no info", "insuff" and "non-prob" sample types.
ggplot(
  data = filter(metadata1,
                S_RR1_CALC_VALUE < 0.9,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(size = 2, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, size = 1) +
  facet_wrap("S_SAMPLE_TYPE") +
  theme_bw()

By sample type and project

Includes face-to-face surveys only (`mode == "f2f"`). Excludes RR >= 0.9 and sample types: no information, insufficient, non-probability.

# Absolute bias against response rate in a project (rows) x sample type
# (columns) grid, each cell with its own loess curve. Uses "f2f" surveys
# with a response rate below 0.9 and leaves out the "no info", "insuff" and
# "non-prob" sample types.
ggplot(
  data = filter(metadata1,
                S_RR1_CALC_VALUE < 0.9,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(size = 2, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_grid(T_SURVEY_NAME ~ S_SAMPLE_TYPE) +
  theme_bw()

---
title: "Surveys"
date: "7/13/2021"
output:
  html_notebook:
    code_folding: hide
    toc: true
    toc_float: 
      collapsed: false
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

library(tidyverse)
library(haven)
library(patchwork)
library(ggrepel)
library(gghighlight)

# Raw survey-level metadata, read from an SPSS .sav file.
metadata <- rio::import("00_Metadata_EQLS_ESS_EVS_ISSP_21_07_02.sav", user_na = TRUE)


# Analysis data set: value labels stripped, coded variables relabelled,
# derived columns added, sentinel codes turned into NA.
metadata1 <- metadata %>%
  zap_labels() %>%
  mutate(
    # numeric sample-type codes -> short labels; a missing code becomes "no info"
    S_SAMPLE_TYPE = plyr::mapvalues(S_SAMPLE_TYPE, c(NA, 1,2,3,4,5,6,7),
                                    c("no info", "non-prob", "multi RR", "multi HH", "multi ind",
                                      "single stage", "multi unspec", "insuff")),
    # fixed level order, used for axis and facet ordering in the plots
    S_SAMPLE_TYPE = factor(S_SAMPLE_TYPE,
                           levels = c("no info", "insuff", "multi unspec", "non-prob",
                                      "multi RR", "multi HH", "multi ind", "single stage")),
    # within-household selection codes -> short labels; a missing code becomes "miss"
    S_WITHIN_HH_SELECTION = plyr::mapvalues(S_WITHIN_HH_SELECTION, c(0,1,6,7,9,10,NA),
                                            c("not app", "Kish", "birthday",
                                              "birthday self-compl", "insuff",
                                              "non-random", "miss")),
    # survey year expressed as years since 1981
    year = T_SURVEY_YEAR - 1981,
    # single mode label per survey: the first mode flag equal to 1 wins, in
    # the order listed; surveys with no flag set are labelled "missing"
    mode = case_when(
      S_MODE_F2F == 1 ~ "f2f",
      S_MODE_CATI == 1 ~ "cati",
      S_MODE_CAWI == 1 ~ "cawi",
      S_MODE_POST_MAIL == 1 ~ "mail",
      S_MODE_SELF_COMPLETION == 1 ~ "self",
      TRUE ~ "missing"
    )
  ) %>%
  # Sentinel codes -9999 / -8888 in any numeric column become NA.
  # NOTE(review): this runs after the derived columns above are computed, so
  # a sentinel in an input column would already have fed into them -- confirm
  # that the inputs used above never carry these codes.
  mutate(across(where(is.numeric),
                function(x) ifelse(x %in% c(-9999, -8888), NA, x))) %>%
  mutate(across(c(S_BACK_CHECKING, S_SUBSTITUTION), factor))
```

# Basics

Data summary

```{r}
# Per-project overview: distinct surveys, countries and waves, plus the
# earliest and latest survey year. min()/max() are called without na.rm,
# so a missing year in a project yields NA for that project.
metadata1 %>%
  group_by(T_SURVEY_NAME) %>%
  summarise(
    n_surveys   = n_distinct(T_SURVEY_ID),
    n_countries = n_distinct(T_SURVEY_COUNTRY),
    n_waves     = n_distinct(T_SURVEY_ROUND),
    min_year    = min(T_SURVEY_YEAR),
    max_year    = max(T_SURVEY_YEAR)
  )
```

There are 45 countries altogether. Of those, 17 are not included in all projects:

```{r}
# Countries that are absent from at least one project. Survey counts are
# widened to one column per project, so a country missing from a project
# has NA in that project's column.
metadata1 %>%
  count(T_SURVEY_NAME, T_SURVEY_COUNTRY) %>%
  pivot_wider(names_from = T_SURVEY_NAME, values_from = n) %>%
  filter(is.na(EQLS) | is.na(ESS) | is.na(EVS) | is.na(ISSP)) %>%
  # pivot_wider() keeps rows in order of first appearance; sort by country
  # so the listing is alphabetical
  arrange(T_SURVEY_COUNTRY)
```



# Excess bias

## by project

Proportion of surveys with excess (> 1.96) absolute bias:

```{r project-significant-table}
# Share of surveys per project whose absolute bias exceeds 1.96.
# The denominator is every survey in the project, including surveys with a
# missing bias value. Projects without any flagged survey are reported with
# a share of 0 rather than being omitted.
# NOTE(review): the plots and the by-country table drop missing bias values
# before computing shares -- confirm which denominator is intended here.
metadata1 %>%
  mutate(is_sig = Q_Abs_bias_internal_criteria > 1.96) %>%
  group_by(T_SURVEY_NAME) %>%
  summarise(prop_sig = round(sum(is_sig, na.rm = TRUE) / n(), 3))
```


2008 was a difficult year for surveys?

Numbers above the bars give the number of surveys for which bias could be calculated.


```{r, fig.height= 6, fig.width = 10}
# Bar chart of the share of surveys with absolute bias > 1.96 per round,
# one facet per project. The label above each bar is the number of surveys
# in that round with a non-missing bias value.
#   data          survey-level data with the columns used below
#   label_offset  vertical gap between the top of a bar and its label
# Returns a ggplot object.
plot_excess_bias_by_round <- function(data, label_offset) {
  data %>%
    drop_na(Q_Abs_bias_internal_criteria) %>%
    mutate(is_sig = Q_Abs_bias_internal_criteria > 1.96,
           # strip the leading project prefix, e.g. "ESS1" -> "1"
           round = gsub("^[A-Z]{1,4}", "", T_SURVEY_ROUND)) %>%
    group_by(T_SURVEY_NAME, round) %>%
    # summarising (instead of filtering on is_sig) keeps rounds in which no
    # survey exceeds the threshold; they appear as a zero-height bar
    summarise(prop_sig = mean(is_sig),
              nsurveys = n()) %>%
    ungroup() %>%
    ggplot(aes(x = round, y = prop_sig)) +
    geom_col(fill = "gray70") +
    geom_text(aes(y = prop_sig + label_offset, label = nsurveys),
              col = "gray20") +
    theme_bw() +
    facet_wrap("T_SURVEY_NAME", scales = "free_x")
}

# Top row: every project except ISSP.
part1 <- metadata1 %>%
  filter(T_SURVEY_NAME != "ISSP") %>%
  plot_excess_bias_by_round(label_offset = 0.03)

# Bottom row: ISSP only, with the x-axis labels rotated.
part2 <- metadata1 %>%
  filter(T_SURVEY_NAME == "ISSP") %>%
  plot_excess_bias_by_round(label_offset = 0.035) +
  theme(axis.text.x = element_text(angle = 90))

# patchwork: stack the two plots vertically
part1 / part2
```

## by country

Surveys with absolute bias > 1.96 by country.

```{r}
# Per-country tally of surveys whose absolute bias exceeds 1.96.
# Surveys with a missing bias value are excluded from every count.
#   total_surveys / total_projects  surveys and distinct projects per country
#   bias_surveys  / bias_projects   the same, restricted to flagged surveys
#   prop_bias                       bias_surveys / total_surveys, 3 decimals
# Countries without any flagged survey are kept with zero counts.
metadata1 %>%
  drop_na(Q_Abs_bias_internal_criteria) %>%
  mutate(is_sig = Q_Abs_bias_internal_criteria > 1.96) %>%
  group_by(T_SURVEY_COUNTRY) %>%
  summarise(total_surveys = n(),
            bias_surveys = sum(is_sig),
            prop_bias = round(bias_surveys / total_surveys, 3),
            total_projects = n_distinct(T_SURVEY_NAME),
            bias_projects = n_distinct(T_SURVEY_NAME[is_sig])) %>%
  arrange(desc(prop_bias))
```



# Changes over time

## Bias

Orange dots indicate surveys from the waves analyzed in Kohler 2007.

```{r, fig.height= 6, fig.width = 10}
# Signed bias by survey year. K2007 flags surveys from the five waves listed
# in the mutate() call; flagged surveys are drawn in orange, the rest in gray.
metadata1 %>%
  mutate(K2007 = T_SURVEY_ROUND %in%
           c("EQLS1", "ESS1", "ESS2", "EVS1999", "ISSP2002")) %>%
  ggplot(aes(x = T_SURVEY_YEAR, y = Q_Bias_internal_criteria, col = K2007)) +
  geom_point(size = 2, alpha = 0.5) +
  # named values bind each colour to its flag value explicitly
  scale_color_manual(values = c("FALSE" = "gray50", "TRUE" = "darkorange")) +
  theme_bw() +
  theme(legend.position = "none") +
  xlab("")
```

Surveys with absolute bias exceeding 5 are labelled.

```{r, fig.height= 6, fig.width = 10}
# Signed bias by survey year. gghighlight() emphasises the surveys whose
# absolute bias exceeds 5; geom_text_repel() adds survey-ID labels.
ggplot(data = metadata1,
       mapping = aes(x = T_SURVEY_YEAR, y = Q_Bias_internal_criteria)) +
  geom_point(size = 2) +
  gghighlight(Q_Abs_bias_internal_criteria > 5) +
  geom_text_repel(mapping = aes(label = T_SURVEY_ID), size = 3.5) +
  xlab("") +
  theme_bw()
```


## Absolute bias and response rates

Note the different Y axis ranges.

```{r, fig.height= 10, fig.width = 10}
# Response rate and absolute bias over survey years, one panel per
# project x measure, each with a loess smoother. Restricted to surveys with
# mode "f2f"; the "no info", "insuff" and "non-prob" sample types are excluded.
metadata1 %>%
  filter(!S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob"),
         mode == "f2f") %>%
  # filter(Q_Abs_bias_internal_criteria < 4.5) %>%
  select(T_SURVEY_NAME, T_SURVEY_COUNTRY, T_SURVEY_ROUND, T_SURVEY_YEAR,
         Q_Abs_bias_internal_criteria, S_RR1_CALC_VALUE) %>%
  # long format: one row per survey and measure
  pivot_longer(cols = c(S_RR1_CALC_VALUE, Q_Abs_bias_internal_criteria),
               names_to = "var", values_to = "value") %>%
  drop_na(value) %>%
  ggplot(aes(x = T_SURVEY_YEAR, y = value)) +
  geom_point(alpha = 0.1) +
  geom_smooth(method = "loess") +
  theme_bw() +
  # free_y: each panel gets its own y-axis range
  facet_wrap(T_SURVEY_NAME ~ var, scales = "free_y", ncol = 2)
```



# Absolute bias by sample type

```{r, fig.height= 7, fig.width = 10}
# Boxplots of absolute bias by sample type, one panel per project.
# Left column: ESS and EQLS.
part1 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ESS", "EQLS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  xlab("") +
  theme_bw()

# Right column: ISSP and EVS; the y-axis title is blanked here.
part2 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ISSP", "EVS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  xlab("") +
  ylab("") +
  theme_bw()

# patchwork: side by side, right column twice as wide as the left
part1 + part2 + plot_layout(widths = c(1, 2))
```


Excluding absolute bias > 4.


```{r, fig.height= 7, fig.width = 10}
# Same boxplots as the uncapped version, with the y scale limited to [0, 4].
# ylim() drops observations outside the limits before the boxplot
# statistics are computed.
# Left column: ESS and EQLS.
part1 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ESS", "EQLS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  ylim(0, 4) +
  xlab("") +
  theme_bw()

# Right column: ISSP and EVS; the y-axis title is blanked here.
part2 <- ggplot(
  data = filter(metadata1, T_SURVEY_NAME %in% c("ISSP", "EVS")),
  mapping = aes(x = S_SAMPLE_TYPE, y = Q_Abs_bias_internal_criteria)
) +
  geom_boxplot() +
  facet_wrap("T_SURVEY_NAME", ncol = 1) +
  ylim(0, 4) +
  xlab("") +
  ylab("") +
  theme_bw()

# patchwork: side by side, right column twice as wide as the left
part1 + part2 + plot_layout(widths = c(1, 2))
```



# Bias and response rates

## Overall

Includes face-to-face surveys only (`mode == "f2f"`). Excludes sample types: no information, insufficient, non-probability.

```{r, fig.height= 5, fig.width = 10}
# Absolute bias against response rate for "f2f" surveys, leaving out the
# "no info", "insuff" and "non-prob" sample types. Points are coloured by
# project; a single loess curve is fitted across all points.
ggplot(
  data = filter(metadata1,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(aes(col = T_SURVEY_NAME), size = 2, alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  theme_bw() +
  xlim(0, 1)
```

Includes face-to-face surveys only (`mode == "f2f"`). Excludes RR >= 0.9 and sample types: no information, insufficient, non-probability.

```{r, fig.height= 5, fig.width = 10}
# Absolute bias against response rate for "f2f" surveys with a response
# rate below 0.9, leaving out the "no info", "insuff" and "non-prob" sample
# types. Points are coloured by project; one loess curve across all points.
ggplot(
  data = filter(metadata1,
                S_RR1_CALC_VALUE < 0.9,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(aes(col = T_SURVEY_NAME), size = 2, alpha = 0.5) +
  geom_smooth(method = "loess", se = FALSE) +
  theme_bw() +
  xlim(0, 1)
```


## By sample type

Includes face-to-face surveys only (`mode == "f2f"`). Excludes RR >= 0.9 and sample types: no information, insufficient, non-probability.

```{r, fig.height= 6, fig.width = 10}
# Absolute bias against response rate, one panel per sample type, each with
# its own loess curve. Uses "f2f" surveys with a response rate below 0.9 and
# leaves out the "no info", "insuff" and "non-prob" sample types.
ggplot(
  data = filter(metadata1,
                S_RR1_CALC_VALUE < 0.9,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(size = 2, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE, size = 1) +
  facet_wrap("S_SAMPLE_TYPE") +
  theme_bw()
```


## By sample type and project

Includes face-to-face surveys only (`mode == "f2f"`). Excludes RR >= 0.9 and sample types: no information, insufficient, non-probability.

```{r, fig.height= 6, fig.width = 10}
# Absolute bias against response rate in a project (rows) x sample type
# (columns) grid, each cell with its own loess curve. Uses "f2f" surveys
# with a response rate below 0.9 and leaves out the "no info", "insuff" and
# "non-prob" sample types.
ggplot(
  data = filter(metadata1,
                S_RR1_CALC_VALUE < 0.9,
                mode == "f2f",
                !S_SAMPLE_TYPE %in% c("no info", "insuff", "non-prob")),
  mapping = aes(x = S_RR1_CALC_VALUE, y = Q_Abs_bias_internal_criteria)
) +
  geom_point(size = 2, alpha = 0.1) +
  geom_smooth(method = "loess", se = FALSE) +
  facet_grid(T_SURVEY_NAME ~ S_SAMPLE_TYPE) +
  theme_bw()
```

  
  