COVID - Study 2 - Pre-processing
Import libraries
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
Read data
## Parsed with column specification:
## cols(
## .default = col_double(),
## StartDate = col_datetime(format = ""),
## EndDate = col_datetime(format = ""),
## IPAddress = col_character(),
## RecordedDate = col_datetime(format = ""),
## ResponseId = col_character(),
## RecipientLastName = col_logical(),
## RecipientFirstName = col_logical(),
## RecipientEmail = col_logical(),
## ExternalReference = col_logical(),
## DistributionChannel = col_character(),
## UserLanguage = col_character(),
## Confirm_case_est_1 = col_character(),
## Confirm_case_est_2 = col_character(),
## Confirm_case_est_3 = col_character(),
## Actual_case_est_1 = col_character(),
## Actual_case_est_2 = col_character(),
## Actual_case_est_3 = col_character(),
## Deaths_est_2 = col_character(),
## Deaths_est_3 = col_character(),
## Deaths_est_4 = col_character()
## # ... with 25 more columns
## )
## See spec(...) for full column specifications.
## Warning: 1 parsing failure.
## row col expected actual file
## 1139 Age no trailing characters \4 '../data/raw_data/raw_data_study2_all.csv'
Rename columns
# Rename the raw survey columns to analysis-friendly names and recode
# responses. Produces `data_renamed`: every column of `data_input` plus the
# derived columns below, one row per respondent, with no grouping attached.

# Remove thousands separators ("1,000" -> "1000") and coerce to numeric.
# Anything that still is not a number becomes NA.
parse_count <- function(x) {
  as.numeric(gsub(",", "", x))
}

# Take the first character of a response and coerce it to numeric.
# NOTE(review): presumably each response label starts with its numeric
# code -- confirm against the survey export.
leading_digit <- function(x) {
  as.numeric(substr(x, 1, 1))
}

# Warnings (e.g. from as.numeric() coercion) are silenced while the columns
# are rebuilt; the previous setting is restored right after the pipeline.
oldw <- getOption("warn")
options(warn = -1)
data_renamed <- data_input %>%
  mutate(
    id = row_number(),
    total_time = `Duration (in seconds)`
  ) %>%
  # rowwise() so the mean(c(...)) composites at the end of this mutate() are
  # computed per respondent rather than over whole columns.
  rowwise() %>%
  mutate(
    trump = trump_approval_1,
    fox = ifelse(news == 3, 1, 0),

    # FILTERS
    president = Q129,
    effort = as.numeric(Effort_1),
    attention = Q122_7,
    start_en = Q165,
    prim_lng = Q166,

    # group: "t" if the first text-group estimate was answered, else "ta" if
    # the first table-group estimate was answered, else "g"
    group_id = ifelse(
      !is.na(Confirm_case_est_1), "t",
      ifelse(!is.na(Q252_1), "ta", "g")
    ),

    # ESTIMATES AND CONFIDENCE
    # text group
    ## confirmed cases
    ### estimates
    cCases_1_est_t = parse_count(Confirm_case_est_1),
    cCases_2_est_t = parse_count(Confirm_case_est_2),
    cCases_3_est_t = parse_count(Confirm_case_est_3),
    ### confidence
    cCases_1_conf_t = Confirm_C_Conf_1,
    cCases_2_conf_t = Confirm_C_Conf_2,
    cCases_3_conf_t = Confirm_C_Conf_3,
    ## actual cases
    ### estimates
    aCases_1_est_t = parse_count(Actual_case_est_1),
    aCases_2_est_t = parse_count(Actual_case_est_2),
    aCases_3_est_t = parse_count(Actual_case_est_3),
    ### confidence
    aCases_1_conf_t = Act_case_conf_1,
    aCases_2_conf_t = Act_case_conf_2,
    aCases_3_conf_t = Act_case_conf_3,
    ## deaths
    ### estimates (source items are numbered 2-4, renumbered here to 1-3)
    deaths_1_est_t = parse_count(Deaths_est_2),
    deaths_2_est_t = parse_count(Deaths_est_3),
    deaths_3_est_t = parse_count(Deaths_est_4),
    ### confidence
    deaths_1_conf_t = Deaths_conf_1,
    deaths_2_conf_t = Deaths_conf_2,
    deaths_3_conf_t = Deaths_conf_3,

    # table group
    ## confirmed cases
    ### estimates
    cCases_1_est_ta = parse_count(Q252_1),
    cCases_2_est_ta = parse_count(Q252_2),
    cCases_3_est_ta = parse_count(Q252_3),
    ### confidence
    cCases_1_conf_ta = Q253_1,
    cCases_2_conf_ta = Q253_2,
    cCases_3_conf_ta = Q253_3,
    ## actual cases
    ### estimates
    aCases_1_est_ta = parse_count(Q255_1),
    aCases_2_est_ta = parse_count(Q255_2),
    aCases_3_est_ta = parse_count(Q255_3),
    ### confidence
    aCases_1_conf_ta = Q256_1,
    aCases_2_conf_ta = Q256_2,
    aCases_3_conf_ta = Q256_3,
    ## deaths
    ### estimates (source items are numbered 2-4, renumbered here to 1-3)
    deaths_1_est_ta = parse_count(Q257_2),
    deaths_2_est_ta = parse_count(Q257_3),
    deaths_3_est_ta = parse_count(Q257_4),
    ### confidence
    deaths_1_conf_ta = Q258_1,
    deaths_2_conf_ta = Q258_2,
    deaths_3_conf_ta = Q258_3,

    # graph group
    ## confirmed cases
    ### estimates
    cCases_1_est_g = parse_count(Q261_1),
    cCases_2_est_g = parse_count(Q261_2),
    cCases_3_est_g = parse_count(Q261_3),
    ### confidence
    cCases_1_conf_g = Q262_1,
    cCases_2_conf_g = Q262_2,
    cCases_3_conf_g = Q262_3,
    ## actual cases
    ### estimates
    aCases_1_est_g = parse_count(Q264_1),
    aCases_2_est_g = parse_count(Q264_2),
    aCases_3_est_g = parse_count(Q264_3),
    ### confidence
    aCases_1_conf_g = Q265_1,
    aCases_2_conf_g = Q265_2,
    aCases_3_conf_g = Q265_3,
    ## deaths
    ### estimates (source items are numbered 2-4, renumbered here to 1-3)
    deaths_1_est_g = parse_count(Q266_2),
    deaths_2_est_g = parse_count(Q266_3),
    deaths_3_est_g = parse_count(Q266_4),
    ### confidence
    deaths_1_conf_g = Q267_1,
    deaths_2_conf_g = Q267_2,
    deaths_3_conf_g = Q267_3,

    # RT
    rt_t = `Q155_Page Submit`,
    rt_ta = `Q156_Page Submit`,
    rt_g = `Q157_Page Submit`,

    # personal risk estimates
    prob_of_case = prob_of_contract_1,
    prob_of_hosp = Q133_1,
    prob_of_death = Q132_1,
    time_to_stop_dist = Q167_1,
    max_new_cases = parse_count(Q168),
    # recode: 2 -> 0, any other non-missing value -> 1
    know_someone = ifelse(know_someone == 2, 0, 1),

    # covariates
    zip = as.numeric(Zip_Code),
    # age
    age = as.numeric(Age),
    age_bin = ifelse(age > 60, "old", ifelse(age < 30, "young", "middle")),
    # education level
    edu = Ed_Level,
    edu_mom = Ed_Level_Mom,
    # health: 5 = "Very Good" ... 2 = "Quite Poor"; any other non-missing
    # label falls through to 1
    gen_health = ifelse(
      general_health_1 == "Very Good", 5,
      ifelse(
        general_health_1 == "Quite Good", 4,
        ifelse(
          general_health_1 == "Neither good nor poor", 3,
          ifelse(general_health_1 == "Quite Poor", 2, 1)
        )
      )
    ),
    # anxiety: 4 = "Nearly every day" ... 2 = "Several days"; any other
    # non-missing label falls through to 1
    gen_anxiety = ifelse(
      gen_anxiety_1 == "Nearly every day", 4,
      ifelse(
        gen_anxiety_1 == "More than half the days", 3,
        ifelse(gen_anxiety_1 == "Several days", 2, 1)
      )
    ),

    # conservatism (items 2, 3 and 5 are sign-flipped)
    cons1 = as.numeric(Conserv_scale_1),
    cons2 = -as.numeric(Conserv_scale_2),
    cons3 = -as.numeric(Conserv_scale_3),
    cons4 = as.numeric(Conserv_scale_4),
    cons5 = -as.numeric(Conserv_scale_5),
    cons6 = as.numeric(Conserv_scale_6),
    cons7 = as.numeric(Conserv_scale_7),

    # risk aversion: centred by subtracting the constant 4, not the sample
    # mean (NOTE(review): presumably the midpoint of a 1-7 scale -- confirm);
    # item 5 is sign-flipped
    risk1 = leading_digit(Q122_1) - 4,
    risk2 = leading_digit(Q122_2) - 4,
    risk3 = leading_digit(Q122_3) - 4,
    risk4 = leading_digit(Q122_4) - 4,
    risk5 = -(leading_digit(Q122_5) - 4),
    risk6 = leading_digit(Q122_6) - 4,

    # numeracy: centred by subtracting the constant 3.5, not the sample mean
    # (NOTE(review): presumably the midpoint of a 1-6 scale -- confirm);
    # item 7 is sign-flipped
    num1 = leading_digit(Numeracy_1) - 3.5,
    num2 = leading_digit(Numeracy_2) - 3.5,
    num3 = leading_digit(Numeracy_3) - 3.5,
    num4 = leading_digit(Numeracy_4) - 3.5,
    num5 = leading_digit(Q124_1) - 3.5,
    num6 = leading_digit(Q125_1) - 3.5,
    num7 = -(leading_digit(Q126_1) - 3.5),
    num8 = leading_digit(Q127_1) - 3.5,

    # composite scores (per-respondent means, thanks to rowwise())
    conserv_mu = mean(
      c(cons1, cons2, cons3, cons4, cons5, cons6, cons7),
      na.rm = TRUE
    ),
    risk_mu = mean(c(risk1, risk2, risk3, risk4, risk5, risk6), na.rm = TRUE),
    # NOTE(review): unlike the two composites above, this mean has no
    # na.rm = TRUE, so one missing numeracy item makes num_mu NA -- confirm
    # that this is intended.
    num_mu = mean(c(num1, num2, num3, num4, num5, num6, num7, num8))
  ) %>%
  # Drop the row-wise grouping so later n() calls and summaries see the
  # whole table instead of one row at a time.
  ungroup()
options(warn = oldw)
Select only columns of interest
# Keep only the columns used downstream. Column order in the output follows
# the order in which the names are listed here.
data_col_trimmed <- data_renamed %>%
  dplyr::select(
    income_down_1, optimism_1, essential_worker,
    total_time,
    news, fox,

    # filters
    attention,
    effort,
    id, age, age_bin, MTurk_ID, group_id,

    # estimates and confidence -- text group
    cCases_1_est_t, cCases_2_est_t, cCases_3_est_t,
    cCases_1_conf_t, cCases_2_conf_t, cCases_3_conf_t,
    aCases_1_est_t, aCases_2_est_t, aCases_3_est_t,
    aCases_1_conf_t, aCases_2_conf_t, aCases_3_conf_t,
    deaths_1_est_t, deaths_2_est_t, deaths_3_est_t,
    deaths_1_conf_t, deaths_2_conf_t, deaths_3_conf_t,

    # estimates and confidence -- graph group
    cCases_1_est_g, cCases_2_est_g, cCases_3_est_g,
    cCases_1_conf_g, cCases_2_conf_g, cCases_3_conf_g,
    aCases_1_est_g, aCases_2_est_g, aCases_3_est_g,
    aCases_1_conf_g, aCases_2_conf_g, aCases_3_conf_g,
    deaths_1_est_g, deaths_2_est_g, deaths_3_est_g,
    deaths_1_conf_g, deaths_2_conf_g, deaths_3_conf_g,

    # estimates and confidence -- table group
    cCases_1_est_ta, cCases_2_est_ta, cCases_3_est_ta,
    cCases_1_conf_ta, cCases_2_conf_ta, cCases_3_conf_ta,
    aCases_1_est_ta, aCases_2_est_ta, aCases_3_est_ta,
    aCases_1_conf_ta, aCases_2_conf_ta, aCases_3_conf_ta,
    deaths_1_est_ta, deaths_2_est_ta, deaths_3_est_ta,
    deaths_1_conf_ta, deaths_2_conf_ta, deaths_3_conf_ta,

    # response times
    rt_t, rt_g, rt_ta,

    # personal risk estimates
    prob_of_case, prob_of_hosp, prob_of_death,

    # covariates
    zip,
    Gender,
    gen_health,
    gen_anxiety,
    president,
    edu, edu_mom,
    start_en, prim_lng,
    num1, num2, num3, num4, num5, num6, num7, num8, num_mu,
    cons1, cons2, cons3, cons4, cons5, cons6, cons7, conserv_mu,
    know_someone,
    corona_news_1, corona_anxious_1,
    past_social_iso_1, futur_social_iso_1,
    risk1, risk2, risk3, risk4, risk5, risk6, risk_mu,

    # other DV
    time_to_stop_dist,
    max_new_cases,
    # NOTE(review): `new` is selected in addition to `news` above -- confirm
    # it is a genuine survey column and not a typo.
    new,
    trump
  )
Filter the data
# Collapse the three format-specific estimate columns into one column per
# outcome/delay, then apply the exclusion criteria. `og_n` holds the sample
# size before filtering and `new_n` the sample size after.

# Return the value recorded under the respondent's own presentation format:
# `text` when group is "t", `table` when it is "ta", otherwise `graph`.
pick_group_value <- function(group, text, table, graph) {
  ifelse(group == "t", text, ifelse(group == "ta", table, graph))
}

data_filtered <- data_col_trimmed %>%
  # Drop any row-wise/grouped structure inherited from upstream; otherwise
  # n() below is evaluated per group (1 for row-wise data) and og_n / new_n
  # would not be sample sizes.
  ungroup() %>%
  mutate(
    cCases_1_est = pick_group_value(
      group_id, cCases_1_est_t, cCases_1_est_ta, cCases_1_est_g
    ),
    cCases_2_est = pick_group_value(
      group_id, cCases_2_est_t, cCases_2_est_ta, cCases_2_est_g
    ),
    cCases_3_est = pick_group_value(
      group_id, cCases_3_est_t, cCases_3_est_ta, cCases_3_est_g
    ),
    aCases_1_est = pick_group_value(
      group_id, aCases_1_est_t, aCases_1_est_ta, aCases_1_est_g
    ),
    aCases_2_est = pick_group_value(
      group_id, aCases_2_est_t, aCases_2_est_ta, aCases_2_est_g
    ),
    aCases_3_est = pick_group_value(
      group_id, aCases_3_est_t, aCases_3_est_ta, aCases_3_est_g
    ),
    deaths_1_est = pick_group_value(
      group_id, deaths_1_est_t, deaths_1_est_ta, deaths_1_est_g
    ),
    deaths_2_est = pick_group_value(
      group_id, deaths_2_est_t, deaths_2_est_ta, deaths_2_est_g
    ),
    deaths_3_est = pick_group_value(
      group_id, deaths_3_est_t, deaths_3_est_ta, deaths_3_est_g
    )
  ) %>%
  mutate(og_n = n()) %>%
  filter(
    # Free-text president answer must contain "trump"/"Trump"/"don"/"Don".
    # Matching is case-sensitive beyond the first letter, so all-caps answers
    # are excluded (NOTE(review): confirm that is intended).
    grepl("[Tt]rump|[Dd]on", president) &
      attention == 6 &
      # NOTE(review): strict inequality drops respondents aged exactly 18.
      age > 18 & age < 100 &
      effort > 5 &
      # forecast constraints (no decreases across the three delays)
      cCases_1_est <= cCases_2_est & cCases_2_est <= cCases_3_est &
      aCases_1_est <= aCases_2_est & aCases_2_est <= aCases_3_est &
      deaths_1_est <= deaths_2_est & deaths_2_est <= deaths_3_est
  ) %>%
  mutate(new_n = n())
Convert dataset from wide to long (for ease of plotting and modeling)
# Reshape to long format: one row per respondent x outcome x delay x group.
# Column names such as "cCases_1_est_t" are split by the regex into
# outcome = "cCases", delay = "1", group = "t".

# Long table of estimates; rows with a missing estimate are dropped.
# ungroup() first so the reshape never runs on row-wise/grouped input.
data_long_est <- data_filtered %>%
  ungroup() %>%
  pivot_longer(
    cols = contains("_est_"),
    names_to = c("outcome", "delay", "group"),
    names_pattern = "(.*)_(.*)_est_(.*)",
    values_to = "estimate"
  ) %>%
  filter(!is.na(estimate))

# Long table of confidence ratings; rows with a missing rating are dropped.
data_long_conf <- data_filtered %>%
  ungroup() %>%
  pivot_longer(
    cols = contains("_conf_"),
    names_to = c("outcome", "delay", "group"),
    names_pattern = "(.*)_(.*)_conf_(.*)",
    values_to = "confidence"
  ) %>%
  filter(!is.na(confidence))

# Attach each confidence rating to its estimate by key rather than by row
# position: the two tables are NA-filtered independently, so positional
# binding would error or misalign whenever an estimate lacks a rating (or
# vice versa). Row order follows data_long_est; the result is returned as a
# plain data.frame.
data_long <- data_long_est %>%
  inner_join(
    data_long_conf %>%
      dplyr::select(id, outcome, delay, group, confidence),
    by = c("id", "outcome", "delay", "group")
  ) %>%
  as.data.frame()
Finishing touches
# Final tidy-up of the long table: numeric copies of the key outcomes and
# ordered factors for the categorical variables.
# Warnings are silenced during the conversion and the previous setting is
# restored afterwards.
oldw <- getOption("warn")
options(warn = -1)
data_clean <- data_long %>%
  mutate(
    # strip any commas from the key outcomes and coerce to numeric (double);
    # the piped columns are used directly instead of reaching back into the
    # `data_long` object
    est = as.numeric(gsub(",", "", estimate)),
    conf = as.numeric(gsub(",", "", confidence)),
    # order factor variables
    age_bin = ordered(age_bin, levels = c("young", "middle", "old")),
    delay = ordered(delay, levels = c(1, 2, 3)),
    outcome = ordered(outcome, levels = c("deaths", "aCases", "cCases")),
    group = ordered(group, levels = c("t", "ta", "g"))
  ) %>%
  ungroup()
options(warn = oldw)
Save