COVID - Study 2 - Pre-processing

Import libraries

knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──

## ✔ ggplot2 3.2.1     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0

## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidyselect)
library(magrittr)
## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

Read data

# Load the raw Study 2 survey export; column types are guessed by readr.
raw_data_path <- "../data/raw_data/raw_data_study2_all.csv"
data_input <- read_csv(raw_data_path)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   StartDate = col_datetime(format = ""),
##   EndDate = col_datetime(format = ""),
##   IPAddress = col_character(),
##   RecordedDate = col_datetime(format = ""),
##   ResponseId = col_character(),
##   RecipientLastName = col_logical(),
##   RecipientFirstName = col_logical(),
##   RecipientEmail = col_logical(),
##   ExternalReference = col_logical(),
##   DistributionChannel = col_character(),
##   UserLanguage = col_character(),
##   Confirm_case_est_1 = col_character(),
##   Confirm_case_est_2 = col_character(),
##   Confirm_case_est_3 = col_character(),
##   Actual_case_est_1 = col_character(),
##   Actual_case_est_2 = col_character(),
##   Actual_case_est_3 = col_character(),
##   Deaths_est_2 = col_character(),
##   Deaths_est_3 = col_character(),
##   Deaths_est_4 = col_character()
##   # ... with 25 more columns
## )

## See spec(...) for full column specifications.

## Warning: 1 parsing failure.
##  row col               expected actual                                       file
## 1139 Age no trailing characters    \4 '../data/raw_data/raw_data_study2_all.csv'

Rename columns

# Convert a free-text numeric response (e.g. "1,200") to a number by removing
# commas first. Anything that still cannot be parsed becomes NA.
to_number <- function(x) {
    as.numeric(gsub(",", "", x = x))
}

# Numeric value of the first character of a response.
first_digit <- function(x) {
    as.numeric(substr(x, 1, 1))
}

# Derive analysis variables from the raw survey columns.
#
# The numeric conversions below emit coercion warnings for unparseable
# responses. They are suppressed for this pipeline only, so the session-wide
# `warn` option is never modified and cannot be left switched off if a step
# fails part-way through.
data_renamed <- suppressWarnings(
    data_input %>%
        mutate(id = seq_len(n()),
               total_time = `Duration (in seconds)`) %>%
        # rowwise() is needed so that mean(c(...)) in the composite scores
        # below is computed per participant.
        rowwise() %>%
        mutate(
            trump = trump_approval_1,
            fox = ifelse(news == 3, 1, 0),
            # FILTERS
            president = Q129,
            effort = as.numeric(Effort_1),
            attention = Q122_7,
            start_en = Q165,
            prim_lng = Q166,

            # group: "t" (text) if the text-group item was answered,
            # otherwise "ta" (table) if the table-group item was answered,
            # otherwise "g" (graph)
            group_id = ifelse(!is.na(Confirm_case_est_1), "t",
                              ifelse(!is.na(Q252_1), "ta", "g")),

            # ESTIMATES AND CONFIDENCE
            # text group
            ## confirmed cases
            ### estimates
            cCases_1_est_t = to_number(Confirm_case_est_1),
            cCases_2_est_t = to_number(Confirm_case_est_2),
            cCases_3_est_t = to_number(Confirm_case_est_3),
            ### confidence
            cCases_1_conf_t = Confirm_C_Conf_1,
            cCases_2_conf_t = Confirm_C_Conf_2,
            cCases_3_conf_t = Confirm_C_Conf_3,
            ## actual cases
            ### estimates
            aCases_1_est_t = to_number(Actual_case_est_1),
            aCases_2_est_t = to_number(Actual_case_est_2),
            aCases_3_est_t = to_number(Actual_case_est_3),
            ### confidence
            aCases_1_conf_t = Act_case_conf_1,
            aCases_2_conf_t = Act_case_conf_2,
            aCases_3_conf_t = Act_case_conf_3,
            ## deaths (raw items are numbered 2-4, mapped to delays 1-3)
            ### estimates
            deaths_1_est_t = to_number(Deaths_est_2),
            deaths_2_est_t = to_number(Deaths_est_3),
            deaths_3_est_t = to_number(Deaths_est_4),
            ### confidence
            deaths_1_conf_t = Deaths_conf_1,
            deaths_2_conf_t = Deaths_conf_2,
            deaths_3_conf_t = Deaths_conf_3,

            # table group
            ## confirmed cases
            ### estimates
            cCases_1_est_ta = to_number(Q252_1),
            cCases_2_est_ta = to_number(Q252_2),
            cCases_3_est_ta = to_number(Q252_3),
            ### confidence
            cCases_1_conf_ta = Q253_1,
            cCases_2_conf_ta = Q253_2,
            cCases_3_conf_ta = Q253_3,
            ## actual cases
            ### estimates
            aCases_1_est_ta = to_number(Q255_1),
            aCases_2_est_ta = to_number(Q255_2),
            aCases_3_est_ta = to_number(Q255_3),
            ### confidence
            aCases_1_conf_ta = Q256_1,
            aCases_2_conf_ta = Q256_2,
            aCases_3_conf_ta = Q256_3,
            ## deaths (raw items are numbered 2-4, mapped to delays 1-3)
            ### estimates
            deaths_1_est_ta = to_number(Q257_2),
            deaths_2_est_ta = to_number(Q257_3),
            deaths_3_est_ta = to_number(Q257_4),
            ### confidence
            deaths_1_conf_ta = Q258_1,
            deaths_2_conf_ta = Q258_2,
            deaths_3_conf_ta = Q258_3,

            # graph group
            ## confirmed cases
            ### estimates
            cCases_1_est_g = to_number(Q261_1),
            cCases_2_est_g = to_number(Q261_2),
            cCases_3_est_g = to_number(Q261_3),
            ### confidence
            cCases_1_conf_g = Q262_1,
            cCases_2_conf_g = Q262_2,
            cCases_3_conf_g = Q262_3,
            ## actual cases
            ### estimates
            aCases_1_est_g = to_number(Q264_1),
            aCases_2_est_g = to_number(Q264_2),
            aCases_3_est_g = to_number(Q264_3),
            ### confidence
            aCases_1_conf_g = Q265_1,
            aCases_2_conf_g = Q265_2,
            aCases_3_conf_g = Q265_3,
            ## deaths (raw items are numbered 2-4, mapped to delays 1-3)
            ### estimates
            deaths_1_est_g = to_number(Q266_2),
            deaths_2_est_g = to_number(Q266_3),
            deaths_3_est_g = to_number(Q266_4),
            ### confidence
            deaths_1_conf_g = Q267_1,
            deaths_2_conf_g = Q267_2,
            deaths_3_conf_g = Q267_3,

            # RT
            rt_t = `Q155_Page Submit`,
            rt_ta = `Q156_Page Submit`,
            rt_g = `Q157_Page Submit`,

            # personal risk estimates
            prob_of_case = prob_of_contract_1,
            prob_of_hosp = Q133_1,
            prob_of_death = Q132_1,
            time_to_stop_dist = Q167_1,
            max_new_cases = to_number(Q168),
            # recode: raw value 2 becomes 0, every other value becomes 1
            know_someone = ifelse(know_someone == 2, 0, 1),
            # covariates
            zip = as.numeric(Zip_Code),
            # age
            age = as.numeric(Age),
            age_bin = ifelse(age > 60, 'old',
                             ifelse(age < 30, 'young', 'middle')),
            # education level
            edu = Ed_Level,
            edu_mom = Ed_Level_Mom,
            # any response other than the four listed labels is scored 1
            gen_health = ifelse(general_health_1 == "Very Good", 5,
                                ifelse(general_health_1 == "Quite Good", 4,
                                       ifelse(general_health_1 == "Neither good nor poor", 3,
                                              ifelse(general_health_1 == "Quite Poor", 2, 1)))),
            # any response other than the three listed labels is scored 1
            gen_anxiety = ifelse(gen_anxiety_1 == "Nearly every day", 4,
                                 ifelse(gen_anxiety_1 == "More than half the days", 3,
                                        ifelse(gen_anxiety_1 == "Several days", 2, 1))),
            # conservatism (items 2, 3 and 5 are sign-flipped;
            # presumably reverse-keyed -- confirm against the questionnaire)
            cons1 = as.numeric(Conserv_scale_1),
            cons2 = -as.numeric(Conserv_scale_2),
            cons3 = -as.numeric(Conserv_scale_3),
            cons4 = as.numeric(Conserv_scale_4),
            cons5 = -as.numeric(Conserv_scale_5),
            cons6 = as.numeric(Conserv_scale_6),
            cons7 = as.numeric(Conserv_scale_7),
            # risk aversion: centre each item by subtracting 4
            # (item 5 is sign-flipped)
            risk1 = first_digit(Q122_1) - 4,
            risk2 = first_digit(Q122_2) - 4,
            risk3 = first_digit(Q122_3) - 4,
            risk4 = first_digit(Q122_4) - 4,
            risk5 = -(first_digit(Q122_5) - 4),
            risk6 = first_digit(Q122_6) - 4,
            # numeracy: centre each item by subtracting 3.5
            # (item 7 is sign-flipped)
            num1 = first_digit(Numeracy_1) - 3.5,
            num2 = first_digit(Numeracy_2) - 3.5,
            num3 = first_digit(Numeracy_3) - 3.5,
            num4 = first_digit(Numeracy_4) - 3.5,
            num5 = first_digit(Q124_1) - 3.5,
            num6 = first_digit(Q125_1) - 3.5,
            num7 = -(first_digit(Q126_1) - 3.5),
            num8 = first_digit(Q127_1) - 3.5,
            # composite scores (per-participant means over the items)
            conserv_mu = mean(c(cons1, cons2, cons3, cons4, cons5, cons6, cons7),
                              na.rm = TRUE),
            risk_mu = mean(c(risk1, risk2, risk3, risk4, risk5, risk6),
                           na.rm = TRUE),
            # NOTE(review): unlike the two composites above, this mean has no
            # na.rm, so one missing numeracy item makes num_mu NA. Kept as in
            # the original; confirm whether that is intended.
            num_mu = mean(c(num1, num2, num3, num4, num5, num6, num7, num8))) %>%
        # Drop the row-wise grouping so later steps (e.g. n()) operate on the
        # whole table again instead of on one-row groups.
        ungroup()
)

Select only columns of interest

# Keep only the variables used downstream. The order listed here is the column
# order of the resulting table (and therefore of the exported CSV files).
data_col_trimmed <- data_renamed %>%
    dplyr::select(
        income_down_1, optimism_1, essential_worker,
        total_time,
        news, fox,
        # filters
        attention,
        effort,
        id, age, age_bin, MTurk_ID, group_id,
        # estimates and confidence: text group
        cCases_1_est_t, cCases_2_est_t, cCases_3_est_t,
        cCases_1_conf_t, cCases_2_conf_t, cCases_3_conf_t,
        aCases_1_est_t, aCases_2_est_t, aCases_3_est_t,
        aCases_1_conf_t, aCases_2_conf_t, aCases_3_conf_t,
        deaths_1_est_t, deaths_2_est_t, deaths_3_est_t,
        deaths_1_conf_t, deaths_2_conf_t, deaths_3_conf_t,
        # estimates and confidence: graph group
        cCases_1_est_g, cCases_2_est_g, cCases_3_est_g,
        cCases_1_conf_g, cCases_2_conf_g, cCases_3_conf_g,
        aCases_1_est_g, aCases_2_est_g, aCases_3_est_g,
        aCases_1_conf_g, aCases_2_conf_g, aCases_3_conf_g,
        deaths_1_est_g, deaths_2_est_g, deaths_3_est_g,
        deaths_1_conf_g, deaths_2_conf_g, deaths_3_conf_g,
        # estimates and confidence: table group
        cCases_1_est_ta, cCases_2_est_ta, cCases_3_est_ta,
        cCases_1_conf_ta, cCases_2_conf_ta, cCases_3_conf_ta,
        aCases_1_est_ta, aCases_2_est_ta, aCases_3_est_ta,
        aCases_1_conf_ta, aCases_2_conf_ta, aCases_3_conf_ta,
        deaths_1_est_ta, deaths_2_est_ta, deaths_3_est_ta,
        deaths_1_conf_ta, deaths_2_conf_ta, deaths_3_conf_ta,
        # response times
        rt_t, rt_g, rt_ta,
        # personal risk estimates
        prob_of_case, prob_of_hosp, prob_of_death,
        # covariates (gen_health was previously listed twice; the repeat had
        # no effect on the selection and has been dropped)
        zip,
        Gender,
        gen_health,
        gen_anxiety,
        president,
        edu, edu_mom,
        start_en, prim_lng,
        num1, num2, num3, num4, num5, num6, num7, num8, num_mu,
        cons1, cons2, cons3, cons4, cons5, cons6, cons7, conserv_mu,
        know_someone,
        corona_news_1, corona_anxious_1,
        past_social_iso_1, futur_social_iso_1,
        risk1, risk2, risk3, risk4, risk5, risk6, risk_mu,
        # other DV
        time_to_stop_dist,
        max_new_cases,
        # NOTE(review): `new` is not created in the renaming step, so it must
        # come straight from the raw export -- confirm it is a real survey
        # column and not a typo for `news` (already selected above).
        new,
        trump)

Filter the data

# Return the value from whichever presentation-format column matches the
# participant's group: text ("t"), table ("ta"), otherwise graph.
pick_by_group <- function(group, text, table, graph) {
    ifelse(group == "t", text, ifelse(group == "ta", table, graph))
}

# Collapse the per-group estimate columns into one column per outcome/delay,
# record the sample size before and after exclusions, and apply the exclusion
# criteria.
data_filtered <- data_col_trimmed %>%
    # The input may still carry the rowwise() grouping applied when the
    # variables were derived. Under that grouping n() is evaluated per one-row
    # group, so og_n / new_n would be 1 instead of the sample sizes.
    ungroup() %>%
    mutate(
        cCases_1_est = pick_by_group(group_id, cCases_1_est_t, cCases_1_est_ta, cCases_1_est_g),
        cCases_2_est = pick_by_group(group_id, cCases_2_est_t, cCases_2_est_ta, cCases_2_est_g),
        cCases_3_est = pick_by_group(group_id, cCases_3_est_t, cCases_3_est_ta, cCases_3_est_g),
        aCases_1_est = pick_by_group(group_id, aCases_1_est_t, aCases_1_est_ta, aCases_1_est_g),
        aCases_2_est = pick_by_group(group_id, aCases_2_est_t, aCases_2_est_ta, aCases_2_est_g),
        aCases_3_est = pick_by_group(group_id, aCases_3_est_t, aCases_3_est_ta, aCases_3_est_g),
        deaths_1_est = pick_by_group(group_id, deaths_1_est_t, deaths_1_est_ta, deaths_1_est_g),
        deaths_2_est = pick_by_group(group_id, deaths_2_est_t, deaths_2_est_ta, deaths_2_est_g),
        deaths_3_est = pick_by_group(group_id, deaths_3_est_t, deaths_3_est_ta, deaths_3_est_g)) %>%
    # sample size before exclusions
    mutate(og_n = n()) %>%
    # Rows where any condition is FALSE or NA are dropped.
    filter(
        # president answer must contain "trump", "Trump", "Don" or "don"
        grepl("[Tt]rump|[Dd]on", president),
        attention == 6,
        age > 18, age < 100,
        effort > 5,
        # forecast constraints (no decreases across successive delays)
        cCases_1_est <= cCases_2_est, cCases_2_est <= cCases_3_est,
        aCases_1_est <= aCases_2_est, aCases_2_est <= aCases_3_est,
        deaths_1_est <= deaths_2_est, deaths_2_est <= deaths_3_est) %>%
    # sample size after exclusions
    mutate(new_n = n())

Convert dataset from wide to long (for ease of plotting and modeling)

# Columns that identify one long-format observation: participant, outcome
# (cCases / aCases / deaths), delay (1-3) and presentation group (t / ta / g).
long_keys <- c("id", "outcome", "delay", "group")

# One row per participant x outcome x delay with a non-missing estimate.
data_long_est <- data_filtered %>%
    ungroup() %>%
    pivot_longer(
        cols = contains("_est_"),
        names_to = c("outcome", "delay", "group"),
        names_pattern = "(.*)_(.*)_est_(.*)",
        values_to = "estimate") %>%
    filter(!is.na(estimate))

# One row per participant x outcome x delay with a non-missing confidence.
data_long_conf <- data_filtered %>%
    ungroup() %>%
    pivot_longer(
        cols = contains("_conf_"),
        names_to = c("outcome", "delay", "group"),
        names_pattern = "(.*)_(.*)_conf_(.*)",
        values_to = "confidence") %>%
    filter(!is.na(confidence))

# Attach each confidence rating to its estimate by key, not by row position.
# The two tables are NA-filtered independently, so a participant with an
# estimate but no confidence (or vice versa) would shift every following row
# if the columns were simply bound side by side.
data_long <- data_long_est %>%
    inner_join(
        data_long_conf %>%
            dplyr::select(id, outcome, delay, group, confidence),
        by = long_keys)

Finishing touches

# Final tidy-up of the long table: numeric outcome columns and ordered factors.
#
# Coercion warnings from as.numeric() are suppressed for this pipeline only,
# so the session-wide `warn` option is never modified and cannot be left
# switched off if a step fails.
data_clean <- suppressWarnings(
    data_long %>%
        mutate(
            # remove commas from key outcomes, convert to numeric
            # (columns are referenced through the data mask, not data_long$...)
            est = as.numeric(gsub(",", "", x = estimate)),
            conf = as.numeric(gsub(",", "", x = confidence)),
            # order factor variables
            age_bin = ordered(age_bin, levels = c("young", "middle", "old")),
            delay = ordered(delay, levels = c(1, 2, 3)),
            outcome = ordered(outcome, levels = c("deaths", "aCases", "cCases")),
            group = ordered(group, levels = c("t", "ta", "g"))) %>%
        ungroup()
)

Save

# Export the long (one row per estimate) and wide (one row per participant)
# versions of the prepared data.
long_output_path <- "../data/prepped/data_long_study2.csv"
wide_output_path <- "../data/prepped/data_wide_study2.csv"
write_csv(data_clean, long_output_path)
write_csv(data_filtered, wide_output_path)