COVID - Study 2 - Initial plots

Import libraries, set global theme

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)

## ── Attaching packages ─────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(plotrix)
library(ggplot2)
library(ggthemes)
library(viridis)

## Loading required package: viridisLite

setwd("/Users/adkinsty/Box/side_projects/covid/visualization/")
my_theme = theme_wsj(base_size = 10,color = "grey") +
    theme(
        axis.line.y = element_line(size=.5),
        legend.title = element_text(size=12,family="sans",face="bold"),
        axis.text = element_text(face = "plain"),
        axis.title = element_text(size = 12,family="sans",face="bold"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(hjust = .5,size=13,family="sans", face="bold"))
theme_set(my_theme)

Setup data for plotting

data_input = read.csv("../data/prepped/data_long_study2.csv") 
# define and remove 'extreme' values (for better illustration)
extremes = data_input %>%
    group_by(delay,outcome) %>%
    summarise(upper = quantile(est, c(0.05,0.98)[2],na.rm=T)) %>% 
    mutate(lower = ifelse(outcome == "deaths",12692,393782))

data = merge(data_input,extremes,
             intersect(names(data_input),names(extremes))) %>%
    filter(est < upper & est > lower)

wide_data = read.csv("../data/prepped/data_wide_study2.csv") %>% filter(new==1)

Sample details

Location

library(usmap)
loc_data = read.csv("../data/extra/loc/uszips.csv")
all_loc_data = merge(wide_data,loc_data,by="zip") %>% 
    mutate(state = state_id) %>%
    group_by(state) %>%
    summarise(n = n())

# Population by state, 2015 Census
plot_usmap("states",data = statepop,values="pop_2015") +
    scale_fill_viridis("2015 Population") +
    theme(legend.position = "right")

# Study 1 sample size by state
plot_usmap("states",data = all_loc_data,values="n") +
    scale_fill_viridis("Sample size") +
    theme(legend.position = "right")

Other descriptives

oldw <- getOption("warn")
options(warn = -1)
# Age
wide_data %>% ggplot(aes(x = age)) + geom_histogram(stat="count")

# Gender
wide_data %>% ggplot(aes(x = Gender)) + geom_histogram(stat="count")

# Edu
wide_data %>% ggplot(aes(x = edu)) + geom_histogram(stat="count") + 
    scale_x_discrete(limits=c("Some high school","High school","Some college",
                              "2-year degree","4-year degree","Advanced Degree"),
                     labels=c("sHS","HS","sC","2C","4C","A"))

# Conservatism
wide_data %>% ggplot(aes(x = conserv_mu)) + geom_histogram(stat="count")

# Risk-aversion 
wide_data %>% ggplot(aes(x = risk_mu)) + geom_histogram(stat="count")

# Numeracy
wide_data %>% ggplot(aes(x = num_mu)) + geom_histogram(stat="count")

# Know-someone
wide_data %>% ggplot(aes(x = know_someone)) + geom_histogram(stat="count")

options(warn = oldw)

Distributions of estimates

Estimate types: ‘aCases’: actual cases, ‘cCases’: confirmed cases, ‘deaths’: deaths.
Estimate delay: 1: 3 days (Mar. 30), 2: 6 days (Apr. 2), 3: 9 days (Apr. 5)

data %>% 
    ggplot(aes(x = est, group = interaction(outcome,delay))) + 
    geom_histogram(aes(y = ..density.., fill = outcome)) +
    scale_fill_viridis_d(begin = .5) +
    geom_density(adjust=2,size=1) +
    facet_wrap(outcome~delay,nrow = 3,ncol=3,scales="free")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distributions of confidence (in estimates)

data %>% 
    ggplot(aes(x = conf, group = interaction(outcome,delay))) + 
    geom_histogram(aes(y = ..density.., fill = outcome)) +
    scale_fill_viridis_d(begin = .5) +
    geom_density(adjust=2,size=1) +
    facet_wrap(outcome~delay,nrow = 3,ncol=3,scales="free")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Estimates of cases and deaths over time

Estimates are of confirmed cases (‘cCases’), actual cases (‘aCases’), and deaths (‘deaths’). Estimates from Mar. 11 to Apr. 7 (‘truth’) are from worldometers.info. Estimates for Apr. 10, April 13, and April 16 are from MTurk workers who were administered the earlier worldometers.info data in one of text (‘t’), table (‘ta’) or graph (‘g’) form.

p = all_data %>% 
    ggplot(aes(x = factor(date), y = est, 
               colour = group, group = group)) +
    stat_summary(geom="point") +
    stat_summary(geom="line") +
    scale_x_discrete("Date",
                     labels = c("M11","M18",
                                "M25","A1","A7",
                                "A10","A13","A16")) +
    scale_color_colorblind() +
    facet_wrap(.~outcome,ncol = 3,scales="free_y")
suppressMessages(print(p))

p = all_data %>%
    ggplot(aes(x = factor(date), y = est,
               colour = age_bin, group = age_bin)) +
    stat_summary(geom="point") +
    stat_summary(geom="line") +
    scale_x_discrete("Date",
                     labels = c("M11","M18",
                                "M25","A1","A7",
                                "A10","A13","A16")) +
    scale_color_colorblind() +
    facet_wrap(.~outcome,ncol = 3,scales="free_y")
suppressMessages(print(p))

## Warning: Removed 10 rows containing missing values (geom_point).

## Warning: Removed 10 rows containing missing values (geom_path).

Graphical hypothesis tests

Hypotheses adapted from the Study 1 Protocol Google doc.

H: Compared to the graph and table conditions, participants in the text-only condition will report smaller estimates of future cases and deaths.

p = data %>%
    ggplot(aes(x=group,y=est)) +
    stat_summary() +
    facet_wrap(new~outcome,ncol = 3,scales="free")
suppressMessages(print(p))

H: Compared to the other conditions, participants in the graph condition will be more confident in their estimates.

p = data %>%
    ggplot(aes(x=group,y=conf)) +
    stat_summary() +
    facet_wrap(new~outcome,ncol = 3,scales="free")
suppressMessages(print(p))

H: People with higher risk aversion will estimate fewer cases and deaths.

p = data %>%
    ggplot(aes(x=risk_mu,y=est)) + 
    geom_smooth(method="auto")  +
    stat_summary_bin(bins = 10) +
    facet_wrap(.~outcome,ncol = 3,
               scales="free")
suppressMessages(print(p))

H: People with know_someone connections to COVID-19 patients will estimate more cases and deaths

p = data %>% 
    ggplot(aes(x=know_someone,y=est)) + 
    stat_summary() +
    facet_wrap(.~outcome,ncol = 3,scales="free")
suppressMessages(print(p))

H: Compared to older people, younger people will estimate lower probabilities of getting sick with covid, especially if they do not have a know_someone connection to anyone with COVID19.

p = wide_data %>% filter(!is.na(age)) %>%
    ggplot(aes(x=age,y=prob_of_case,
               colour=know_someone,group=know_someone)) + 
    geom_smooth(method="auto") +
    stat_summary_bin(bins = 10)
suppressMessages(print(p))

## Warning: Removed 2 rows containing missing values (geom_pointrange).

p = wide_data %>% filter(!is.na(age)) %>%
    ggplot(aes(x=age,y=prob_of_hosp,
               colour=know_someone,group=know_someone)) + 
    geom_smooth(method="auto") +
    stat_summary_bin(bins = 10)
suppressMessages(print(p))

## Warning: Removed 2 rows containing missing values (geom_pointrange).

p = wide_data %>% filter(!is.na(age)) %>%
    ggplot(aes(x=age,y=prob_of_death,
               colour=know_someone,group=know_someone)) + 
    geom_smooth(method="auto") +
    stat_summary_bin(bins = 10)
suppressMessages(print(p))

## Warning: Removed 2 rows containing missing values (geom_pointrange).

H: Participants with higher conservatism will estimate fewer cases and deaths and estimate lower probabilities of getting sick with covid.

Patterns: Participants with conservatism scores near zero estimated the smallest numbers of cases and deaths. But these participants also saw themselves more likely to get sick with the disease.

p = wide_data %>% 
    ggplot(aes(x=conserv_mu,y=prob_of_case)) + 
    geom_smooth(method="auto") +
    stat_summary_bin(bins = 10)  
suppressMessages(print(p))

p = wide_data %>% 
    ggplot(aes(x=conserv_mu,y=prob_of_hosp)) + 
    geom_smooth(method="auto") +
    stat_summary_bin(bins = 10)
suppressMessages(print(p))

p = wide_data %>% 
    ggplot(aes(x=conserv_mu,y=prob_of_death)) + 
    geom_smooth(method="auto") +
    stat_summary_bin(bins = 10)
suppressMessages(print(p))

Additional contrasts

Case estimates

Below are plots corresponding to predictors included in Tyler’s regression model of case estimates.

ad = data_input %>% filter(outcome == "aCases")
cd = data_input %>% filter(outcome == "cCases") 
d = cd %>% mutate(c_est = est, a_est = ad$est) %>%
    merge(extremes,intersect(names(data_input),names(extremes))) %>%
    filter(c_est < upper & c_est > lower &
           a_est < upper & a_est > lower) %>%
    mutate(bin_cons = conserv_mu > median(conserv_mu,na.rm=T))

library(viridis)
d %>% ggplot(aes(x=c_est/100000,y=a_est/100000,
             colour=bin_cons,group=bin_cons)) + 
    geom_smooth(method="auto") +
    geom_point(size=.5,alpha=.5) +
    geom_abline(slope=1,linetype="dashed") +
    scale_x_continuous("Estimates of confirmed cases (100k)") +
    scale_y_continuous("Estimates of actual cases (100k)") +
    scale_colour_manual("Conservatism > median",values=c("blue","red")) +
    coord_equal(xlim = c(4,22),ylim=c(4,22))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'