About

This is the results page for the Corona study. You can take part in it here:

We now have sufficient data for some of the values of interest. Calculations and output are shown below!

Init

options(digits = 3, scipen = 999)
options(contrasts=c("contr.treatment", "contr.treatment"))
library(pacman)
p_load(kirkegaard, readxl, lubridate, DT, rms, osfr, mirt)
theme_set(theme_bw())

#workaround: force psych::describe (avoids masking) and return a matrix
describe = function(...) psych::describe(...) %>% as.matrix()

#clean string of question fluff
str_remove_fluff = function(x) {
  x %>% 
    str_clean() %>% 
    str_replace("Estimate death tolls of the new coronavirus", "") %>% 
    str_replace(" Note Europe does not include Turkey Russia or Caucasus area countries", "") %>%
                                    str_replace("A state of emergency is a situation in which a government is empowered to perform actions or impose policies that it would normally not be permitted to undertake", "") %>%
                                    str_replace(" The format is YYYY MM DD or click the calendar button", "") %>% str_replace(" 1st degree relatives i e parents children siblings", "") %>% 
                                    str_replace(" Infected means anyone who had the virus currently has it recovered from the virus or died of it as a consequence", "") %>% 
    str_trim() %>% 
    str_c("?")
}
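
#example: applied to the family-infection question, this strips the
#" 1st degree relatives i e parents children siblings" fluff, trims the
#whitespace, and appends "?" to give back a readable question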

Data

#read data
d = read_csv("data/20200404154654-SurveyExport.csv") %>% 
  df_legalize_names() %>% 
  #filter test responses
  filter(Time_Started > ymd_hms("2020-03-11 22:17:11"))
## Warning: Duplicated column names deduplicated: 'Other - Write In:To what extent
## have you altered your behaviour because of the new coronavirus? Check all
## that apply, and only check a box when you have done this as a result of the
## new coronavirus.' => 'Other - Write In:To what extent have you altered your
## behaviour because of the new coronavirus? Check all that apply, and only check a
## box when you have done this as a result of the new coronavirus._1' [112]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Response ID` = col_double(),
##   `Time Started` = col_datetime(format = ""),
##   `Date Submitted` = col_datetime(format = ""),
##   `Contact ID` = col_logical(),
##   `Legacy Comments` = col_logical(),
##   Comments = col_logical(),
##   Tags = col_logical(),
##   Longitude = col_double(),
##   Latitude = col_double(),
##   `What is your age?` = col_double(),
##   `There is a movement of thought called the rationalist or Less Wrong movement, headed by people such as Eliezer Yudkowsky and Scott Alexander (Slate Star Codex). It is based on the idea that humans can become aware of and hopefully counteract various cognitive biases that we possess. To which degree do you consider yourself a rationalist?` = col_double(),
##   `“Genetic variation contributes substantially to psychological and socioeconomic variation between human populations (clusters, races etc.), being ultimately responsible for at least 50% of observed differences.” How likely do you think this statement is true?` = col_double(),
##   `What is your ideal rate of personal income taxation?` = col_double(),
##   `How many of the following do you want to legalise or keep legal? Sex work/prostitution, strip clubs, casinos/gambling, cannabis, amphetamine, methamphetamine, LSD, driving without seatbelt, selling human organs, selling human semen/eggs, homosexual sex. Input a number (min. 0 max. 11).` = col_double(),
##   `How accurate do you think the mainstream media has presented information about Coronavirus?` = col_double(),
##   `The new coronavirus is a variant of the flu.` = col_logical(),
##   `The new coronavirus uses DNA to replicate itself and store genetic information.` = col_logical(),
##   `Some scientists working in fields such as microbiology and virology think the name SARS-CoV-2 is appropriate for the new coronavirus.` = col_logical(),
##   `Washing hands is an effective way to prevent spreading the new coronavirus because soap is able to dissolve proteins, and the viral envelope of the new coronavirus is made out of protein.` = col_logical(),
##   `The previous pages featured 30 questions about science and about coronaviruses. How many answers do you think you got correct?` = col_double()
##   # ... with 12 more columns
## )
## See spec(...) for full column specifications.
#total samples
nrow(d)
## [1] 369
#variable list
d_vars = df_var_table(d)

d_vars #print it
#scoring key for test
test_scoring_key = read_excel("data/test_scoring_key.xlsx")
test_scoring_key$correct %<>% str_trim()

#original data before exclusion
d_original = d

#data collection
d_original %>% 
  ggplot(aes(Date_Submitted)) +
  geom_histogram() +
  scale_x_datetime(date_breaks = "3 days")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

d_original$Date_Submitted %>% mean()
## [1] "2020-03-13 15:05:19 UTC"
d_original$Date_Submitted %>% median()
## [1] "2020-03-12 12:42:35 UTC"
d_original$Date_Submitted %>% date() %>% table2()
#filter out late responses
d = d %>% filter(Date_Submitted <= ymd("2020-03-18"))

#filter out self-reported dishonest replies
d$honest = d$Did_you_honestly_answer_the_science_and_forecasting_questions_If_not_kindly_tell_us_so_we_can_exclude_your_results
d$honest %>% table2()
d = d %>% filter(honest == "Yes, I was honest")

#implausible ages (over 100), remove
d %<>% filter(What_is_your_age <= 100)

#implausible forecasts (US death toll at or above the entire population), remove
d %<>% filter(
  What_do_you_estimate_will_be_the_total_death_toll_of_Coronavirus_in_the_USA_in_the_next_2_years_Estimate_death_tolls_of_the_new_coronavirus < 300e6
)

#check submission dates after exclusions
d$Date_Submitted %>% quantile()
##                        0%                       25%                       50% 
## "2020-03-11 22:43:22 UTC" "2020-03-12 01:24:56 UTC" "2020-03-12 08:19:02 UTC" 
##                       75%                      100% 
## "2020-03-13 12:29:23 UTC" "2020-03-17 14:30:14 UTC"
#derived variables with shorter names
d$survey_duration = (d$Date_Submitted - d$Time_Started)
d$total_score = d$We_have_calculated_your_score %>% as.numeric()
d$age = d$What_is_your_age %>% as.numeric()
d$age_bin = d$age %>% discretize(breaks = 5)
d$sex = d$What_is_your_sex %>% factor()
d$education = d$What_is_the_highest_level_of_education_you_have_completed %>% ordered(c("None", "High school", "Vocational school", "Bachelor", "Master", "Doctorate or higher"))
d$education_num = d$education %>% as.numeric()
d$academic_class = d$What_is_your_current_academic_status   %>% mapvalues(from = c(NA), to = c("I am not a researcher")) %>% factor()
d$GJP = d$Have_you_participated_in_the_Good_Judgment_Project_forecasting_tournament_or_other_prediction_markets %>% ordered(c("No", "Only a little bit", "Quite a bit", "A lot"))
d$rationalist = d$There_is_a_movement_of_thought_called_the_rationalist_or_Less_Wrong_movement_headed_by_people_such_as_Eliezer_Yudkowsky_and_Scott_Alexander_Slate_Star_Codex_It_is_based_on_the_idea_that_humans_can_become_aware_of_and_hopefully_counteract_various_cognitive_biases_that_we_possess_To_which_degree_do_you_consider_yourself_a_rationalist %>% as.numeric()
d$HBD = d$Genetic_variation_contributes_substantially_to_psychological_and_socioeconomic_variation_between_human_populations_clusters_races_etc_being_ultimately_responsible_for_at_least_50pct_of_observed_differences_How_likely_do_you_think_this_statement_is_true  %>% as.numeric()
d$income_tax = d$What_is_your_ideal_rate_of_personal_income_taxation %>% as.numeric()
d$personal_freedom = d$How_many_of_the_following_do_you_want_to_legalise_or_keep_legal_Sex_work_prostitution_strip_clubs_casinos_gambling_cannabis_amphetamine_methamphetamine_LSD_driving_without_seatbelt_selling_human_organs_selling_human_semen_eggs_homosexual_sex_Input_a_number_min_0_max_11    %>% as.numeric()
d$media_coverage_accurate = d$How_accurate_do_you_think_the_mainstream_media_has_presented_information_about_Coronavirus %>% as.numeric()
d$self_score_estimate = d$The_previous_pages_featured_30_questions_about_science_and_about_coronaviruses_How_many_answers_do_you_think_you_got_correct  %>% as.numeric()
d$self_forecast_accuracy = d$How_good_do_you_think_your_forecasts_will_be_relative_to_other_people_taking_this_survey   %>% as.numeric()
d$fox_hedgehog = d$In_a_famous_essay_Isaiah_Berlin_classified_thinkers_as_hedgehogs_and_foxes_The_hedgehog_knows_one_big_thing_and_tries_to_explain_as_much_as_possible_using_that_theory_or_framework_The_fox_knows_many_small_things_and_is_content_to_improvise_explanations_on_a_case_by_case_basis_When_it_comes_to_making_predictions_would_you_describe_yourself_as_more_of_a_hedgehog_or_more_of_a_fox %>% ordered(c("Very much more fox-like", "Somewhat more fox-like", "Neither more fox-like nor more hedgehog-like", "Somewhat more hedgehog-like", "Very much more hedgehog-like"))
d$fox_hedgehog_num = d$fox_hedgehog %>% as.numeric()
d$follow_news_closely = d$On_a_scale_of_0_to_100_how_closely_have_you_been_following_news_related_to_the_the_new_coronavirus    %>% as.numeric()
d$national_security = d$The_new_coronavirus_is_a_threat_to_the_national_security_of_the_country_in_which_I_currently_live %>% ordered(c("Strongly disagree", "Somewhat disagree", "Neither disagree nor agree", "Somewhat agree", "Strongly agree"))
d$italy_govt = d$The_Italian_authorities_initially_did_not_do_enough_to_combat_the_spread_of_the_new_coronavirus %>% ordered(c("Strongly disagree", "Somewhat disagree", "Neither disagree nor agree", "Somewhat agree", "Strongly agree"))
d$close_airports = d$Authorities_across_the_world_should_completely_close_airports_in_order_to_prevent_the_spread_of_the_new_coronavirus    %>% ordered(c("Strongly disagree", "Somewhat disagree", "Neither disagree nor agree", "Somewhat agree", "Strongly agree"))

#countries
d$from_USA = d$Country == "United States"
d$from_UK = d$Country == "United Kingdom"
d$from_France = d$Country == "France"

Results

Basic demographics

A collection of the basic descriptive statistics.

#time to complete (difftime, apparently in minutes)
d$survey_duration %>% as.numeric() %>% describe()
##    vars   n mean   sd median trimmed  mad  min max range skew kurtosis   se
## X1    1 333 28.1 34.3   21.1    22.7 9.49 7.57 497   490 9.12      110 1.88
#countries
d$Country %>% table2()
#sex
d$sex %>% table2()
#education
d$education %>% table2()
#academic class
d$academic_class %>% table2()
#GJP participation
d$GJP %>% table2()
#fox hedgehog
d$fox_hedgehog %>% as.numeric() %>% describe()
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis     se
## X1    1 333 2.37 1.09      2    2.28 1.48   1   5     4 0.734   -0.224 0.0596
ggplot(d, aes(fox_hedgehog)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = -10)) +
  ggtitle("Foxes and hedgehogs")

GG_save("figs/fox_dist.png")

#age
d$age %>% describe()
##    vars   n mean   sd median trimmed mad min max range skew kurtosis    se
## X1    1 333 32.8 11.5     30    31.3 8.9  18  68    50 1.15    0.675 0.629
GG_denhist(d, "age", group = "sex") +
  # ggtitle("Age distribution by sex", "Red line = mean") +
  scale_x_continuous(breaks = seq(0, 200, by = 10))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/age_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#rationalist self-rating
d$rationalist %>% describe()
##    vars   n mean   sd median trimmed  mad min max range   skew kurtosis   se
## X1    1 333 58.2 25.7     65    60.2 22.2   0 100   100 -0.691   -0.386 1.41
GG_denhist(d, "rationalist", group = "sex") +
  ggtitle("Rationalist self-identification distribution by sex", "Red line = mean")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/rationalist_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#HBD endorsement
d$HBD %>% describe()
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 333   76 29.7     89    81.4 16.3   0 100   100 -1.29    0.415 1.63
d$HBD %>% describeBy(d$sex)
## 
##  Descriptive statistics by group 
## group: Female
##    vars  n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 32 62.4 34.6     74      65 37.1   0 100   100 -0.47    -1.27 6.12
## ------------------------------------------------------------ 
## group: Male
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 301 77.4 28.8     90      83 14.8   0 100   100 -1.41     0.78 1.66
GG_denhist(d, "HBD", group = "sex") +
  ggtitle("Probability that population differences at least 50% genetic in origin", "Red line = mean")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/HBD_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Science test

#numerics
d$total_score %>% describe()
##    vars   n mean   sd median trimmed  mad min max range    skew kurtosis    se
## X1    1 333 16.9 4.12     17    16.9 4.45   5  27    22 -0.0837    -0.28 0.226
#map the score distribution to centiles and the IQ scale
total_score_cdf = ecdf(d$total_score)

#centiles
total_score_cdf(0:30) %>% set_names(0:30)
##       0       1       2       3       4       5       6       7       8       9 
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00300 0.00901 0.01201 0.02402 0.03904 
##      10      11      12      13      14      15      16      17      18      19 
## 0.05105 0.09610 0.14414 0.21922 0.27628 0.37538 0.46547 0.55856 0.64264 0.73273 
##      20      21      22      23      24      25      26      27      28      29 
## 0.79880 0.86186 0.90691 0.94294 0.97598 0.98799 0.99399 1.00000 1.00000 1.00000 
##      30 
## 1.00000
#IQ scale
total_score_cdf(0:30) %>% qnorm(mean = 100, sd = 15) %>% set_names(0:30)
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
##  -Inf  -Inf  -Inf  -Inf  -Inf  58.8  64.5  66.1  70.3  73.6  75.5  80.4  84.1 
##    13    14    15    16    17    18    19    20    21    22    23    24    25 
##  88.4  91.1  95.2  98.7 102.2 105.5 109.3 112.6 116.3 119.8 123.7 129.7 133.9 
##    26    27    28    29    30 
## 137.7   Inf   Inf   Inf   Inf
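#the same mapping as a reusable helper (a sketch; hypothetical name, reusing
#the total_score_cdf defined above)
score_to_IQ = function(score, cdf = total_score_cdf) {
  qnorm(cdf(score), mean = 100, sd = 15)
}
#e.g. score_to_IQ(17) gives about 102, matching the table above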
#plots
GG_denhist(d, "total_score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/score_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_denhist(d, "total_score", "sex")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/score_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#cohen d
SMD_matrix(d$total_score, d$sex, extended_output = T)
## $d
##        Female Male
## Female     NA 0.09
## Male     0.09   NA
## 
## $d_string
##        Female              Male               
## Female NA                  "0.09 [-0.27 0.45]"
## Male   "0.09 [-0.27 0.45]" NA                 
## 
## $CI_lower
##        Female   Male
## Female     NA -0.274
## Male   -0.274     NA
## 
## $CI_upper
##        Female  Male
## Female     NA 0.455
## Male    0.455    NA
## 
## $se_z
## [1] 1.96
## 
## $se
##        Female  Male
## Female     NA 0.186
## Male    0.186    NA
## 
## $p
##        Female  Male
## Female     NA 0.628
## Male    0.628    NA
## 
## $pairwise_n
##        Female Male
## Female     NA  333
## Male      333   NA
#age
ggplot(d, aes(total_score, age_bin, fill = age_bin)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Science knowledge score", y = "Age group")

GG_save("figs/sumscore_dist_by_age.png")

#cor
cor.test(d$total_score, d$age)
## 
##  Pearson's product-moment correlation
## 
## data:  d$total_score and d$age
## t = 4, df = 331, p-value = 0.0002
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0951 0.3015
## sample estimates:
##   cor 
## 0.201

Item response theory


#item data
test_items = d %>% 
  #select items
  select(What_is_COVID_19_COVID_19_is_a:In_the_past_growth_of_moulds_on_stored_food_and_drink_was_a_persistent_problem_Who_dispelled_the_general_belief_of_spontaneous_generation_that_is_the_supposed_production_of_living_organisms_from_non_living_matter_as_inferred_from_the_apparent_appearance_of_life_in_some_supposedly_sterile_environments) %>% 
  #remove whitespace
  map_df(str_trim)
test_items_long_names = names(test_items)
names(test_items) = "item_" + 1:ncol(test_items)
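#note: "+" on strings (concatenation) appears to come from the kirkegaard
#package; base-R equivalent: paste0("item_", 1:ncol(test_items))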

#score
test_items_scored = score_items(test_items, key = test_scoring_key$correct)
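#aside (sketch): score_items compares each response column to its key entry;
#a purrr equivalent, assuming one key per item in column order
#(NA responses come out NA here, which may differ from score_items)
test_items_scored_sketch = map2_df(test_items, test_scoring_key$correct,
                                   ~ as.integer(.x == .y))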

#reliability
alpha(test_items_scored)
## 
## Reliability analysis   
## Call: alpha(x = test_items_scored)
## 
##   raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd median_r
##       0.69       0.7    0.72     0.071 2.3 0.024 0.56 0.14    0.067
## 
##  lower alpha upper     95% confidence boundaries
## 0.64 0.69 0.74 
## 
##  Reliability if an item is dropped:
##         raw_alpha std.alpha G6(smc) average_r S/N alpha se  var.r med.r
## item_1       0.69      0.69    0.72     0.072 2.2    0.024 0.0049 0.067
## item_2       0.69      0.69    0.72     0.072 2.2    0.024 0.0048 0.068
## item_3       0.69      0.69    0.72     0.072 2.3    0.024 0.0048 0.067
## item_4       0.68      0.69    0.71     0.071 2.2    0.025 0.0048 0.067
## item_5       0.70      0.70    0.72     0.074 2.3    0.024 0.0047 0.068
## item_6       0.69      0.70    0.72     0.074 2.3    0.024 0.0049 0.068
## item_7       0.70      0.70    0.73     0.074 2.3    0.024 0.0047 0.068
## item_8       0.69      0.70    0.72     0.073 2.3    0.024 0.0049 0.068
## item_9       0.68      0.69    0.71     0.070 2.2    0.025 0.0049 0.066
## item_10      0.68      0.68    0.71     0.069 2.2    0.025 0.0048 0.065
## item_11      0.69      0.69    0.72     0.072 2.3    0.024 0.0049 0.068
## item_12      0.69      0.69    0.72     0.071 2.2    0.025 0.0049 0.067
## item_13      0.69      0.69    0.72     0.071 2.2    0.025 0.0049 0.067
## item_14      0.68      0.69    0.71     0.070 2.2    0.025 0.0048 0.065
## item_15      0.69      0.69    0.72     0.072 2.3    0.024 0.0049 0.068
## item_16      0.67      0.67    0.70     0.067 2.1    0.026 0.0046 0.064
## item_17      0.68      0.69    0.71     0.070 2.2    0.025 0.0048 0.067
## item_18      0.69      0.69    0.72     0.073 2.3    0.024 0.0049 0.068
## item_19      0.67      0.68    0.71     0.068 2.1    0.026 0.0046 0.065
## item_20      0.69      0.69    0.72     0.072 2.2    0.024 0.0047 0.067
## item_21      0.69      0.69    0.71     0.071 2.2    0.025 0.0048 0.067
## item_22      0.68      0.68    0.71     0.069 2.1    0.025 0.0048 0.065
## item_23      0.67      0.68    0.71     0.069 2.1    0.025 0.0046 0.066
## item_24      0.69      0.69    0.72     0.072 2.3    0.024 0.0048 0.068
## item_25      0.68      0.68    0.71     0.068 2.1    0.025 0.0046 0.065
## item_26      0.68      0.68    0.71     0.070 2.2    0.025 0.0048 0.066
## item_27      0.68      0.69    0.71     0.070 2.2    0.025 0.0048 0.066
## item_28      0.67      0.68    0.70     0.067 2.1    0.026 0.0044 0.065
## item_29      0.68      0.68    0.71     0.070 2.2    0.025 0.0048 0.065
## item_30      0.69      0.70    0.72     0.073 2.3    0.024 0.0049 0.068
## 
##  Item statistics 
##           n raw.r std.r r.cor r.drop mean   sd
## item_1  333  0.27  0.28 0.221  0.164 0.29 0.46
## item_2  333  0.26  0.28 0.217  0.173 0.84 0.37
## item_3  333  0.27  0.26 0.202  0.164 0.71 0.45
## item_4  333  0.30  0.33 0.285  0.219 0.85 0.36
## item_5  333  0.19  0.17 0.096  0.068 0.42 0.49
## item_6  333  0.18  0.20 0.121  0.092 0.14 0.35
## item_7  333  0.18  0.17 0.091  0.060 0.62 0.49
## item_8  333  0.23  0.23 0.161  0.123 0.25 0.43
## item_9  333  0.36  0.36 0.313  0.257 0.35 0.48
## item_10 333  0.38  0.39 0.364  0.304 0.14 0.34
## item_11 333  0.24  0.26 0.201  0.170 0.91 0.29
## item_12 333  0.30  0.31 0.257  0.204 0.24 0.43
## item_13 333  0.32  0.31 0.251  0.202 0.54 0.50
## item_14 333  0.32  0.35 0.306  0.242 0.85 0.36
## item_15 333  0.29  0.27 0.204  0.171 0.59 0.49
## item_16 333  0.49  0.50 0.496  0.409 0.78 0.41
## item_17 333  0.34  0.34 0.301  0.248 0.78 0.42
## item_18 333  0.23  0.24 0.170  0.136 0.80 0.40
## item_19 333  0.45  0.44 0.424  0.348 0.63 0.48
## item_20 333  0.30  0.28 0.221  0.185 0.40 0.49
## item_21 333  0.29  0.33 0.285  0.237 0.95 0.22
## item_22 333  0.42  0.41 0.384  0.316 0.68 0.47
## item_23 333  0.44  0.42 0.394  0.336 0.33 0.47
## item_24 333  0.26  0.25 0.191  0.154 0.26 0.44
## item_25 333  0.43  0.42 0.406  0.340 0.73 0.44
## item_26 333  0.38  0.37 0.330  0.275 0.70 0.46
## item_27 333  0.34  0.34 0.294  0.243 0.28 0.45
## item_28 333  0.49  0.47 0.463  0.394 0.41 0.49
## item_29 333  0.38  0.37 0.331  0.272 0.62 0.49
## item_30 333  0.24  0.23 0.162  0.129 0.71 0.46
## 
## Non missing response frequency for each item
##            0    1 miss
## item_1  0.71 0.29    0
## item_2  0.16 0.84    0
## item_3  0.29 0.71    0
## item_4  0.15 0.85    0
## item_5  0.58 0.42    0
## item_6  0.86 0.14    0
## item_7  0.38 0.62    0
## item_8  0.75 0.25    0
## item_9  0.65 0.35    0
## item_10 0.86 0.14    0
## item_11 0.09 0.91    0
## item_12 0.76 0.24    0
## item_13 0.46 0.54    0
## item_14 0.15 0.85    0
## item_15 0.41 0.59    0
## item_16 0.22 0.78    0
## item_17 0.22 0.78    0
## item_18 0.20 0.80    0
## item_19 0.37 0.63    0
## item_20 0.60 0.40    0
## item_21 0.05 0.95    0
## item_22 0.32 0.68    0
## item_23 0.67 0.33    0
## item_24 0.74 0.26    0
## item_25 0.27 0.73    0
## item_26 0.30 0.70    0
## item_27 0.72 0.28    0
## item_28 0.59 0.41    0
## item_29 0.38 0.62    0
## item_30 0.29 0.71    0
#item data
item_stats = tibble(
  item = names(test_items),
  # question = test_items_long_names %>% str_clean(),
  pass_rate = colMeans(test_items_scored)
)

#IRT fit
irt_fit = irt.fa(test_items_scored)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

item_stats$loading = irt_fit$fa$loadings[, 1]
item_stats$discrimination = irt_fit$irt$discrimination[, 1]

#also fit with mirt
#we need it for the reliability estimates
irt_fit_mirt = mirt(test_items_scored, model = 1)
## 
Iteration: 1, Log-Lik: -5421.032, Max-Change: 0.57422
Iteration: 2, Log-Lik: -5358.640, Max-Change: 0.16266
Iteration: 3, Log-Lik: -5352.402, Max-Change: 0.07363
Iteration: 4, Log-Lik: -5351.539, Max-Change: 0.02506
Iteration: 5, Log-Lik: -5351.375, Max-Change: 0.01238
Iteration: 6, Log-Lik: -5351.334, Max-Change: 0.00783
Iteration: 7, Log-Lik: -5351.319, Max-Change: 0.00515
Iteration: 8, Log-Lik: -5351.316, Max-Change: 0.00285
Iteration: 9, Log-Lik: -5351.315, Max-Change: 0.00124
Iteration: 10, Log-Lik: -5351.314, Max-Change: 0.00103
Iteration: 11, Log-Lik: -5351.314, Max-Change: 0.00024
Iteration: 12, Log-Lik: -5351.314, Max-Change: 0.00021
Iteration: 13, Log-Lik: -5351.314, Max-Change: 0.00015
Iteration: 14, Log-Lik: -5351.314, Max-Change: 0.00013
Iteration: 15, Log-Lik: -5351.314, Max-Change: 0.00012
Iteration: 16, Log-Lik: -5351.314, Max-Change: 0.00028
Iteration: 17, Log-Lik: -5351.314, Max-Change: 0.00018
Iteration: 18, Log-Lik: -5351.314, Max-Change: 0.00005
irt_fit_mirt_scores = fscores(irt_fit_mirt, full.scores = T, full.scores.SE = T)
empirical_rxx(irt_fit_mirt_scores)
##    F1 
## 0.745
marginal_rxx(irt_fit_mirt)
## [1] 0.744
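#aside: empirical_rxx estimates reliability from the estimated factor scores
#and their standard errors, whereas marginal_rxx integrates the test
#information over the assumed latent trait distribution; they agree here (~.74)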
#summary
item_stats$pass_rate %>% describe()
##    vars  n mean    sd median trimmed   mad   min   max range   skew kurtosis
## X1    1 30 0.56 0.246  0.619   0.566 0.312 0.135 0.949 0.814 -0.199    -1.38
##        se
## X1 0.0448
item_stats$loading %>% describe()
##    vars  n  mean    sd median trimmed   mad   min   max range   skew kurtosis
## X1    1 30 0.369 0.141  0.359   0.369 0.166 0.125 0.647 0.522 0.0897    -1.12
##        se
## X1 0.0258
#score
irt_fit_scored = scoreIrt(irt_fit, items = test_items_scored)
d$irt_score = irt_fit_scored$theta1 %>% standardize()

#bins
d$irt_score_bin = d$irt_score %>% discretize(breaks = 3, equal_range = F, labels = "integer")
d$irt_score_bin %>% table2()
describeBy(d$irt_score, d$irt_score_bin, mat = T)
#compare
GG_scatter(d, "total_score", "irt_score")
## `geom_smooth()` using formula 'y ~ x'

GG_save("figs/IRT_score_total_score.png")
## `geom_smooth()` using formula 'y ~ x'
#plots
GG_denhist(d, "irt_score", "sex") +
  scale_x_continuous("Science knowledge score")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/IRT_score_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#cohen d
SMD_matrix(d$irt_score, d$sex, extended_output = T)
## $d
##        Female   Male
## Female     NA 0.0273
## Male   0.0273     NA
## 
## $d_string
##        Female               Male                
## Female NA                   "0.027 [-0.34 0.39]"
## Male   "0.027 [-0.34 0.39]" NA                  
## 
## $CI_lower
##        Female   Male
## Female     NA -0.337
## Male   -0.337     NA
## 
## $CI_upper
##        Female  Male
## Female     NA 0.392
## Male    0.392    NA
## 
## $se_z
## [1] 1.96
## 
## $se
##        Female  Male
## Female     NA 0.186
## Male    0.186    NA
## 
## $p
##        Female  Male
## Female     NA 0.883
## Male    0.883    NA
## 
## $pairwise_n
##        Female Male
## Female     NA  333
## Male      333   NA
#age
ggplot(d, aes(irt_score, age_bin, fill = age_bin)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Science knowledge score", y = "Age group")

GG_save("figs/IRT_score_dist_by_age.png")

#cor
cor.test(d$irt_score, d$age)
## 
##  Pearson's product-moment correlation
## 
## data:  d$irt_score and d$age
## t = 3, df = 331, p-value = 0.004
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0493 0.2591
## sample estimates:
##   cor 
## 0.156
ggplot(d %>% filter(age >= 30), aes(irt_score, education, fill = education)) +
  geom_boxplot() +
  coord_flip() +
  labs(x = "Science knowledge score", y = "Highest education attainted")

GG_save("figs/IRT_score_dist_by_education.png")

#cor
cor.test(d$irt_score, d$education_num)
## 
##  Pearson's product-moment correlation
## 
## data:  d$irt_score and d$education_num
## t = 4, df = 331, p-value = 0.00001
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.131 0.334
## sample estimates:
##   cor 
## 0.235
#model
ols(irt_score ~ sex + age + education_num, data = d %>% df_standardize(messages = F))
## Linear Regression Model
##  
##  ols(formula = irt_score ~ sex + age + education_num, data = d %>% 
##      df_standardize(messages = F))
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     333    LR chi2     19.97    R2       0.058    
##  sigma0.9749    d.f.            3    R2 adj   0.050    
##  d.f.    329    Pr(> chi2) 0.0002    g        0.273    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.28570 -0.72773  0.09568  0.71296  2.21863 
##  
##  
##                Coef    S.E.   t     Pr(>|t|)
##  Intercept      0.0056 0.1725  0.03 0.9742  
##  sex=Male      -0.0062 0.1814 -0.03 0.9728  
##  age            0.0617 0.0602  1.03 0.3056  
##  education_num  0.2067 0.0601  3.44 0.0007  
## 
#self estimate
ols(total_score ~ self_score_estimate, data = d)
## Linear Regression Model
##  
##  ols(formula = total_score ~ self_score_estimate, data = d)
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     333    LR chi2    100.85    R2       0.261    
##  sigma3.5480    d.f.            1    R2 adj   0.259    
##  d.f.    331    Pr(> chi2) 0.0000    g        2.393    
##  
##  Residuals
##  
##        Min        1Q    Median        3Q       Max 
##  -11.52278  -2.51051   0.06741   2.47722   8.46496 
##  
##  
##                      Coef    S.E.   t     Pr(>|t|)
##  Intercept           11.5596 0.5290 21.85 <0.0001 
##  self_score_estimate  0.3975 0.0367 10.82 <0.0001 
## 
cor.test(d$total_score, d$self_score_estimate)
## 
##  Pearson's product-moment correlation
## 
## data:  d$total_score and d$self_score_estimate
## t = 11, df = 331, p-value <0.0000000000000002
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.427 0.586
## sample estimates:
##   cor 
## 0.511
GG_scatter(d, "self_score_estimate", "total_score") +
  geom_count() +
  geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_save("figs/self_rate_score_and_score.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#model
ols(irt_score ~ sex + age + education_num + self_score_estimate, data = d %>% df_standardize(messages = F))
## Linear Regression Model
##  
##  ols(formula = irt_score ~ sex + age + education_num + self_score_estimate, 
##      data = d %>% df_standardize(messages = F))
##  
##                 Model Likelihood     Discrimination    
##                    Ratio Test           Indexes        
##  Obs     333    LR chi2    111.24    R2       0.284    
##  sigma0.8513    d.f.            4    R2 adj   0.275    
##  d.f.    328    Pr(> chi2) 0.0000    g        0.604    
##  
##  Residuals
##  
##       Min       1Q   Median       3Q      Max 
##  -2.38044 -0.61784  0.04835  0.59185  2.40931 
##  
##  
##                      Coef    S.E.   t     Pr(>|t|)
##  Intercept           -0.1241 0.1512 -0.82 0.4124  
##  sex=Male             0.1373 0.1591  0.86 0.3889  
##  age                 -0.0202 0.0531 -0.38 0.7041  
##  education_num        0.1329 0.0530  2.51 0.0126  
##  self_score_estimate  0.4957 0.0487 10.17 <0.0001 
## 

Books read

#books
books = d %>% 
  select(Superforecasting_The_Art_and_Science_of_Prediction_by_Philip_E_Tetlock_and_Dan_Gardner_Which_of_the_following_books_have_you_read_Check_all_that_apply:Any_university_level_textbook_introduction_to_medicine_Which_of_the_following_books_have_you_read_Check_all_that_apply) %>% 
  names()

#book question fluff
book_fluff = "_Which_of_the_following_books_have_you_read_Check_all_that_apply"
  
#books endorsed
books_data = d %>% 
  select(!!books) %>% 
  map_df(~!is.na(.))
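
#aside: in the survey export, a checked box comes through as its label text and
#an unchecked box as NA, so !is.na(.) yields a TRUE/FALSE "read it" indicator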

names(books_data) = names(books_data) %>% str_replace(book_fluff, "")

#table of books
books_data %>% 
  map_df(~mean(.)) %>% 
  t() %>% 
  as.data.frame() %>% 
  rownames_to_column("book") %>% 
  rename(read_pct = V1) %>% 
  mutate(book = book %>% str_clean()) %>% 
  arrange(-read_pct) %>% 
  df_round(2) %>% 
  datatable()
#save index score
d$book_index = books_data %>% 
  rowSums()

Politics

We will want to cluster the labels later on.

#names of labels
pol_labels = d %>% 
  select(Neoreactionary_nrx_Check_any_label_you_identify_with_politically:N_A_Check_any_label_you_identify_with_politically) %>% 
  names()

#data
pol_data = d %>% select(!!pol_labels) %>% map_df(~!is.na(.))

#nice names
names(pol_data) = pol_labels %>% str_replace("_Check_any_label_you_identify_with_politically", "")

#endorsement by label
pol_data %>% colMeans()
## Neoreactionary_nrx          alt_right   ethnonationalist          communist 
##             0.2192             0.1441             0.2012             0.0180 
##          socialist            liberal  classical_liberal            leftist 
##             0.0751             0.1261             0.2492             0.0691 
##       conservative        libertarian anarcholibertarian   radical_centrist 
##             0.2613             0.2282             0.0450             0.0871 
## plain_old_centrist              YIMBY              NIMBY           moderate 
##             0.0300             0.1622             0.0210             0.1231 
##       center_right        center_left         apolitical                N_A 
##             0.1502             0.1051             0.0901             0.0961
#labels heatmap
pol_data %>% 
  map_df(as.numeric) %>% 
  mixedCor() %>% 
  .$rho %>% 
  GG_heatmap(font_size = 3.2, color_label = "Latent correlations")
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done

GG_save("figs/political_labels.png")

#summarize politics into fewer dimensions
#by trial and error, we settled on a 3-factor solution
pol_fa = irt.fa(pol_data %>% map_df(as.numeric), nfactors = 3)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.

pol_fa$fa
## Factor Analysis using method =  minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate, 
##     fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                      MR1   MR2   MR3    h2   u2 com
## Neoreactionary_nrx  0.72 -0.11 -0.02 0.528 0.47 1.1
## alt_right           0.77  0.08  0.03 0.595 0.41 1.0
## ethnonationalist    0.66 -0.21 -0.05 0.491 0.51 1.2
## communist           0.27  0.07  0.71 0.563 0.44 1.3
## socialist           0.13 -0.04  0.85 0.712 0.29 1.0
## liberal            -0.33  0.60  0.34 0.626 0.37 2.2
## classical_liberal   0.04  0.71 -0.33 0.583 0.42 1.4
## leftist            -0.20 -0.06  0.86 0.792 0.21 1.1
## conservative        0.40  0.20 -0.32 0.312 0.69 2.5
## libertarian         0.23  0.65 -0.04 0.472 0.53 1.3
## anarcholibertarian  0.38  0.26  0.17 0.243 0.76 2.2
## radical_centrist    0.19  0.26  0.12 0.119 0.88 2.2
## plain_old_centrist  0.00  0.47  0.16 0.260 0.74 1.2
## YIMBY              -0.12  0.54  0.07 0.321 0.68 1.1
## NIMBY               0.67  0.24  0.32 0.592 0.41 1.7
## moderate           -0.22  0.56 -0.03 0.358 0.64 1.3
## center_right       -0.04  0.38 -0.39 0.274 0.73 2.0
## center_left        -0.53  0.19  0.36 0.489 0.51 2.1
## apolitical         -0.03  0.02  0.12 0.017 0.98 1.2
## N_A                -0.26 -0.52  0.02 0.336 0.66 1.5
## 
##                        MR1  MR2  MR3
## SS loadings           3.04 2.88 2.76
## Proportion Var        0.15 0.14 0.14
## Cumulative Var        0.15 0.30 0.43
## Proportion Explained  0.35 0.33 0.32
## Cumulative Proportion 0.35 0.68 1.00
## 
##  With factor correlations of 
##       MR1  MR2   MR3
## MR1  1.00 0.01 -0.07
## MR2  0.01 1.00  0.08
## MR3 -0.07 0.08  1.00
## 
## Mean item complexity =  1.5
## Test of the hypothesis that 3 factors are sufficient.
## 
## The degrees of freedom for the null model are  190  and the objective function was  78.7 with Chi Square of  25526
## The degrees of freedom for the model are 133  and the objective function was  72 
## 
## The root mean square of the residuals (RMSR) is  0.12 
## The df corrected root mean square of the residuals is  0.14 
## 
## The harmonic number of observations is  333 with the empirical chi square  1851  with prob <  0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000043 
## The total number of observations was  333  with Likelihood Chi Square =  23219  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  -0.31
## RMSEA index =  0.722  and the 90 % confidence intervals are  0.715 0.731
## BIC =  22446
## Fit based upon off diagonal values = 0.79
#score
#score using IRT (scoreIrt sometimes errors here?)
pol_fa_scores = scoreIrt(stats = pol_fa$fa, items = pol_data %>% map_df(as.numeric))

#theta columns follow the factor order above (MR1, MR2, MR3)
d$pol_factor_altrightish = pol_fa_scores$theta1 %>% standardize()    #MR1: NRx/alt-right/ethnonationalist
d$pol_factor_libertarianish = pol_fa_scores$theta2 %>% standardize() #MR2: classical liberal/libertarian/moderate
d$pol_factor_leftliberalish = pol_fa_scores$theta3 %>% standardize() #MR3: communist/socialist/leftist

#income tax
d$income_tax %>% describe()
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis    se
## X1    1 333 21.7 14.5     20    20.9 14.8   0 100   100 0.821     2.11 0.793
GG_denhist(d, "income_tax", group = "sex") +
  scale_x_continuous(limits = c(0, 100)) +
  ggtitle("Income tax preference distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

GG_save("figs/income_tax_pref_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#personal freedom
d$personal_freedom %>% describe()
##    vars   n mean   sd median trimmed  mad min max range   skew kurtosis    se
## X1    1 333 6.05 3.32      6    6.13 4.45   0  11    11 -0.175    -1.15 0.182
GG_denhist(d, "personal_freedom", group = "sex") +
  scale_x_continuous(limits = c(0, 11), breaks = 0:100) +
  ggtitle("Personal freedom items endorsed, distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

GG_save("figs/pers_freedom_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#national security
ggplot(d, aes(national_security, fill = sex)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = -10)) +
  ggtitle("The new coronavirus is a threat to the national security of the country in which I currently live.")

GG_save("figs/CV_nat_sec_dist_by_sex.png")

#italy govt
ggplot(d, aes(italy_govt, fill = sex)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = -10)) +
  ggtitle("The Italian authorities initially did not do enough to combat the spread of the new coronavirus.")

GG_save("figs/italy_govt_dist_by_sex.png")

#close airports
ggplot(d, aes(close_airports, fill = sex)) +
  geom_bar(position = "fill") +
  theme(axis.text.x = element_text(angle = -10)) +
  ggtitle("Authorities across the world should\ncompletely close airports in order to prevent the spread of the new coronavirus.")

GG_save("figs/close_airports_dist_by_sex.png")

Media / news

#how closely following
d$follow_news_closely %>% describe()
##    vars   n mean   sd median trimmed  mad min max range   skew kurtosis   se
## X1    1 333 62.8 23.2     69    64.2 22.2   1 100    99 -0.545    -0.59 1.27
GG_denhist(d, "follow_news_closely", group = "sex") +
  scale_x_continuous(limits = c(0, 100)) +
  ggtitle("Following news on Coronavirus, distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#media coverage accuracy
d$media_coverage_accurate %>% describe()
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 333   40 23.9     36    38.9 23.7   0 100   100 0.384   -0.712 1.31
GG_denhist(d, "media_coverage_accurate", group = "sex") +
  scale_x_continuous(limits = c(0, 100)) +
  ggtitle("Mainstream media accuracy rating, distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

#relationships
GG_scatter(d, "follow_news_closely", "media_coverage_accurate") +
  geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

GG_scatter(d, "follow_news_closely", "total_score") +
  geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Personal safety

#safety items
safety_items = d %>% 
  select(contains("Check_all_that_apply_and_only_check_a_box_when_you_have_done_this_as_a_result_of_the_new_coronavirus")) %>% 
  names() %>% 
  .[-c(14:15)]

#data
safety_data = d %>% select(!!safety_items) %>% map_df(~!is.na(.))

#nice names
names(safety_data) = safety_items %>% str_replace("_To_what_extent_have_you_altered_your_behaviour_because_of_the_new_coronavirus_Check_all_that_apply_and_only_check_a_box_when_you_have_done_this_as_a_result_of_the_new_coronavirus", "")

#endorsement by label
safety_data %>% colMeans() %>% sort(decreasing = T)
##                                                                                  I_wash_my_hands_more_often_or_have_started_washing_them 
##                                                                                                                                   0.8258 
##                                                                                      I_have_stopped_shaking_people_s_hands_to_greet_them 
##                                                                                                                                   0.5526 
##                                                                                                        I_built_an_emergency_food_storage 
##                                                                                                                                   0.4925 
##                                                                                                             I_have_canceled_travel_plans 
##                                                                                                                                   0.3844 
##                                                                                    I_have_stopped_going_to_my_place_of_work_or_education 
##                                                                                                                                   0.3093 
##                                                                                 I_have_purchased_face_masks_gas_masks_or_other_equipment 
##                                                                                                                                   0.2883 
##                                                                          I_have_ordered_more_products_on_the_internet_than_I_normally_do 
##                                                                                                                                   0.2492 
##                I_have_stopped_kissing_people_on_the_cheeks_to_greet_them_In_some_cultures_people_are_kissed_on_their_cheek_as_a_greeting 
##                                                                                                                                   0.1892 
## I_have_reduced_the_amount_of_times_I_visit_my_place_of_worship_for_example_a_church_mosque_or_synagogue_or_have_stopped_going_completely 
##                                                                                                                                   0.1441 
##                                                      I_did_not_go_to_a_birthday_party_a_jubilee_party_marriage_ceremony_or_similar_event 
##                                                                                                                                   0.1381 
##                                                                                                                           Other_Write_In 
##                                                                                                                                   0.0751 
##                                                                                                                    I_have_stopped_dating 
##                                                                                                                                   0.0631 
##                                                                     I_have_stopped_touching_my_pet_s_or_have_been_touching_my_pet_s_less 
##                                                                                                                                   0.0240

Forecasts

These are tricky to summarize, probably because the numeric answers span orders of magnitude (hence the log scales below). We can make a start.
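
One way to tame the scale problem (an assumption on our part, not used below) is to summarize the numeric answers on the log scale, e.g. with a geometric mean:

#sketch (hypothetical helper): geometric mean for heavy-tailed forecasts
geo_mean = function(x) {
  x = as.numeric(str_replace(x, "%", "")) #strip percent signs if present
  x = x[!is.na(x) & x > 0]                #the log needs positive values
  10^mean(log10(x))
}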

#forecast variables
forecast_vars = d %>% 
  select(What_do_you_estimate_will_be_the_total_death_toll_of_Coronavirus_in_the_USA_in_the_next_2_years_Estimate_death_tolls_of_the_new_coronavirus:What_is_the_probability_that_anyone_in_your_immediate_genetic_family_will_be_infected_by_the_coronavirus_in_the_next_two_years_1st_degree_relatives_i_e_parents_children_siblings) %>% names()

forecast_vars_date = d %>% 
  select(starts_with("When_do_you_think_the_rate_of_new")) %>% names()

forecast_vars_number = setdiff(forecast_vars, forecast_vars_date)

#descriptive stats for each
map_df(forecast_vars, function(v) {
  x = d[[v]]
  #is time?
  if (d[[v]] %>% is.Date()) return(tibble())
  
  #character? then it's a percent
  if (d[[v]] %>% is.character()) x = d[[v]] %>% str_replace("%", "") %>% as.numeric()

  tibble(
    question = v %>% str_clean()
  ) %>% 
    cbind(describe(x))
}) %>% 
  select(-vars, -n, -se, -range, -skew, -kurtosis) %>% 
  rename(trimmed_mean_10pct = trimmed) %>% 
  df_round(2) %>% 
  datatable()
#plot each of them
for (v in forecast_vars) {
  #is percent?
  is_percent = d[[v]] %>% str_detect("%") %>% any(na.rm = T)
  
  #clean
  local_v = d[[v]]
  if (is_percent) local_v = local_v %>% str_replace("%", "") %>% as.numeric() %>% divide_by(100)
  
  #plot
  if (is_percent) {
    gg = GG_denhist(tibble(x = local_v), "x", vline = "median") +
      scale_x_continuous("Probability", 
                         breaks = seq(0, 1, by = .1), 
                         limits = c(0, 1),
                         labels = scales::percent) +
      ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
  } else if (is.Date(local_v)) {
    gg = ggplot(tibble(x = local_v), aes(x)) +
      geom_histogram(binwidth = 2.5) +
      geom_density(aes(y=2.5 * ..count..)) +
      scale_x_date("Date", date_breaks = "1 month") +
      theme(axis.text.x = element_text(angle = -20)) +
      geom_vline(data = tibble(x = median(local_v, na.rm = T)), mapping = aes(xintercept = x), color = "red", linetype = "dashed", size = 1) +
      ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
  } else {
    gg = GG_denhist(tibble(x = local_v), "x", vline = "median") +
      scale_x_log10("Number", breaks = 10^(0:20)) +
      theme(
        # axis.title.y = element_blank()
        axis.text.y = element_blank(),
        # axis.ticks.y = element_blank()
      ) +
      ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
  }
  
  print(gg)
  GG_save(plot = gg, str_glue("figs/forecast_{which(v == forecast_vars)}.png"))
}
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing missing values (geom_bar).
## (the messages above repeat for each plot produced by the loop)

#all in one plot?
#not possible because some are dates
#but we can do 2 subsets
d %>% 
  #gather to long form
  gather(key = question, value = value, !!forecast_vars_date) %>% 
  mutate(question = mapvalues(question, from = forecast_vars_date, to = c("USA", "Germany", "Europe", "United Kingdom")),
         date = as.Date(value, origin = "1970-01-01")) %>% 
  ggplot(aes(date)) +
  geom_histogram() +
  scale_x_date(limits = c(as.Date("2020-01-01",  "%Y-%m-%d"), as.Date("2022-01-01",  "%Y-%m-%d")),
               date_breaks = "3 months", guide = guide_axis(n.dodge = 2)) +
  facet_wrap("question", nrow = 2)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_bar).

GG_save("figs/forecast_distributions_dates.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).

## Warning: Removed 8 rows containing missing values (geom_bar).
#summary
d %>% 
  select(!!forecast_vars_date) %>% 
  map_df(function(x) {
    tibble(
      mean = mean(x),
      median = median(x)
    )
  })
#forecast general index
#convert to rank format
forecasts_numeric_ranked = d[forecast_vars %>% str_subset("When_do_you_thi", negate = T)] %>% map_df(function(x) {
  if (!is.numeric(x)) x = x %>% str_replace("%", "") %>% as.numeric()
  rank(x)
})
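
#aside: factor-analyzing rank-transformed data amounts to working from Spearman
#correlations, since Spearman's rho is Pearson's r applied to ranks, i.e.
#cor(rank(x), rank(y)) equals cor(x, y, method = "spearman")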

#better names
names(forecasts_numeric_ranked) = c(
  "death_toll_" + c("USA", "Europe", "UK", "Poland"),
  "prevalence_" + c("France", "Italy", "Spain", "USA"),
  "emergency_declared_" + c("USA", "Germany", "UK", "Sweden"),
  "risk_" + c("self", "family")
)

#heatmap
forecasts_numeric_ranked %>% 
  GG_heatmap()

GG_save("figs/forecasts_heatmap.png")

#factor analyze
forecasts_fa = fa(forecasts_numeric_ranked)
forecasts_fa
## Factor Analysis using method =  minres
## Call: fa(r = forecasts_numeric_ranked)
## Standardized loadings (pattern matrix) based upon correlation matrix
##                             MR1    h2   u2 com
## death_toll_USA             0.65 0.420 0.58   1
## death_toll_Europe          0.67 0.449 0.55   1
## death_toll_UK              0.68 0.466 0.53   1
## death_toll_Poland          0.66 0.435 0.57   1
## prevalence_France          0.85 0.718 0.28   1
## prevalence_Italy           0.79 0.622 0.38   1
## prevalence_Spain           0.84 0.700 0.30   1
## prevalence_USA             0.86 0.734 0.27   1
## emergency_declared_USA     0.30 0.091 0.91   1
## emergency_declared_Germany 0.34 0.117 0.88   1
## emergency_declared_UK      0.33 0.112 0.89   1
## emergency_declared_Sweden  0.34 0.118 0.88   1
## risk_self                  0.64 0.412 0.59   1
## risk_family                0.60 0.363 0.64   1
## 
##                 MR1
## SS loadings    5.76
## Proportion Var 0.41
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 factor is sufficient.
## 
## The degrees of freedom for the null model are  91  and the objective function was  17.5 with Chi Square of  5706
## The degrees of freedom for the model are 77  and the objective function was  11.3 
## 
## The root mean square of the residuals (RMSR) is  0.25 
## The df corrected root mean square of the residuals is  0.27 
## 
## The harmonic number of observations is  333 with the empirical chi square  3718  with prob <  0 
## The total number of observations was  333  with Likelihood Chi Square =  3680  with prob <  0 
## 
## Tucker Lewis Index of factoring reliability =  0.24
## RMSEA index =  0.375  and the 90 % confidence intervals are  0.365 0.386
## BIC =  3233
## Fit based upon off diagonal values = 0.73
## Measures of factor score adequacy             
##                                                    MR1
## Correlation of (regression) scores with factors   0.96
## Multiple R square of scores with factors          0.93
## Minimum correlation of possible factor scores     0.86
forecasts_fa$loadings[, 1] %>% describe()
##    vars  n  mean    sd median trimmed   mad   min   max range  skew kurtosis
## X1    1 14 0.611 0.202  0.654   0.616 0.236 0.302 0.857 0.555 -0.38    -1.45
##        se
## X1 0.0539
fa_plot_loadings(forecasts_fa)

GG_save("figs/forecasts_loadings.png")

#save scores
d$general_forecast = forecasts_fa$scores[, 1] %>% standardize()

d$general_forecast %>% describe()
##    vars   n                  mean sd  median trimmed  mad   min  max range
## X1    1 333 0.0000000000000000132  1 -0.0242 0.00627 1.11 -2.16 2.11  4.27
##       skew kurtosis     se
## X1 -0.0454   -0.899 0.0548
#plot
GG_denhist(d, "general_forecast")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

GG_save("figs/general_forecast_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

By science knowledge level

#expertise split
#plot each of them
for (v in forecast_vars) {
  #is percent?
  is_percent = d[[v]] %>% str_detect("%") %>% any(na.rm = T)
  
  #clean
  local_v = d[[v]]
  if (is_percent) local_v = local_v %>% str_replace("%", "") %>% as.numeric() %>% divide_by(100)
  
  #plot
  if (is_percent) {
    
    gg = GG_denhist(tibble(x = local_v, irt_score_bin = d$irt_score_bin), "x", group = "irt_score_bin", vline = "median") +
      scale_x_continuous("Probability", 
                         breaks = seq(0, 1, by = .1), 
                         limits = c(0, 1),
                         labels = scales::percent) +
      ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
    
  } else if (is.Date(local_v)) {
    #remove bad dates
    local_v[local_v < ymd("2020-01-01")] = NA
    local_v[local_v > ymd("2023-01-01")] = NA
    
    #plot
    gg = ggplot(tibble(x = local_v, irt_score_bin = d$irt_score_bin), aes(x, fill = factor(irt_score_bin))) +
      geom_histogram(binwidth = 2.5, position = "dodge") +
      geom_density(aes(y = 2.5 * ..count..), alpha = .2) +
      scale_x_date("Date", date_breaks = "1 month") +
      theme(axis.text.x = element_text(angle = -20)) +
      geom_vline(data = tibble(x = describeBy(local_v %>% as.numeric(), d$irt_score_bin, mat = T)$median,
                               irt_score_bin = factor(1:3)), mapping = aes(xintercept = x, color = irt_score_bin), linetype = "dashed", size = 1) +
      ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90)) +
      guides(color = F)
    
  } else {
    
    gg = GG_denhist(tibble(x = local_v, irt_score_bin = d$irt_score_bin), "x", group = "irt_score_bin", vline = "median") +
      scale_x_log10("Number", breaks = 10^(0:20)) +
      theme(
        # axis.title.y = element_blank(),
        axis.text.y = element_blank()
        # axis.ticks.y = element_blank()
      ) +
      ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
    
  }
  
  print(gg)
  GG_save(plot = gg, str_glue("figs/forecast_{which(v == forecast_vars)}_science_bins.png"))
}
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed rows containing non-finite or missing values (stat_bin,
## stat_density, geom_bar)
## (the same scale/binwidth/missing-value messages repeat for each forecast plot)

#median and trimmed mean by forecast and expertise
map_df(forecast_vars, function(v) {

  #is percent?
  is_percent = d[[v]] %>% str_detect("%") %>% any(na.rm = T)
  
  #clean
  local_v = d[[v]]
  if (is_percent) local_v = local_v %>% str_replace("%", "") %>% as.numeric() %>% divide_by(100)
  
  #manually
  tibble(
    question = v,
    median_low = median(local_v[d$irt_score_bin == 1], na.rm = T),
    median_medium = median(local_v[d$irt_score_bin == 2], na.rm = T),
    median_high = median(local_v[d$irt_score_bin == 3], na.rm = T),
    tmean_low = mean(local_v[d$irt_score_bin == 1], na.rm = T, trim = .1),
    tmean_medium = mean(local_v[d$irt_score_bin == 2], na.rm = T, trim = .1),
    tmean_high = mean(local_v[d$irt_score_bin == 3], na.rm = T, trim = .1)
  ) %>% 
    #convert all to character because otherwise we can't merge results in the end
    map_df(function(x) {
      #already chr, remove fluff
      if (is.character(x)) return(x %>% str_remove_fluff())
      
      #dates to chr
      if (is.Date(x)) return(as.character(x))
      
      #2 digits if 0-1
      if (x < 1) return(format_digits(x, digits = 2))
      
      #round otherwise
      round(x) %>% as.character()
    })
}) ->
  forecast_MM_by_sciknow

forecast_MM_by_sciknow
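
The character conversion inside the map above is needed because the questions mix types (dates, probabilities, counts), and the row-binding done by map_df() refuses to combine a numeric column from one question with a date or character column from another. A minimal illustration of the failure mode (not run):

#errors: column x can't combine <double> with <character>
# bind_rows(tibble(x = 1), tibble(x = "a"))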

Predict forecasts

#individual variables

#models
list(
  ols(general_forecast ~ irt_score, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD +  pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD +  pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax + from_USA + from_UK + from_France, data = d %>% df_standardize(messages = F))
) %>% 
  summarize_models()
#each outcome individually
map_df(names(forecasts_numeric_ranked), function(v) {
  #modify data locally
  d_local = d %>% 
    df_standardize(messages = F)
  #replace outcome variable with the specific one
  d_local$general_forecast = forecasts_numeric_ranked[[v]] %>% standardize()
  
  #regress
  list(
    ols(general_forecast ~ irt_score, data = d_local),
    ols(general_forecast ~ irt_score + age + sex, data = d_local),
    ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD, data = d_local),
    ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD +  pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax, data = d_local),
    ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD +  pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax + from_USA + from_UK + from_France, data = d_local)
  ) %>% 
    summarize_models() %>% 
    mutate(outcome = v)
}) ->
  many_models

#print
many_models

Meta

#write data out
data_out = d %>% 
  #remove variables without data
  {
    .[setdiff(names(.), d_vars %>% filter(miss_prop == 1) %>% pull(var_name))] %>% 
      #privacy
      select(-IP_Address, -User_Agent, -SessionID, -Leave_your_email_address_here_if_we_may_send_you_follow_up_surveys_and_inform_you_of_your_prediction_accuracy_later_Note_This_question_is_NOT_required_and_your_e_mail_will_NOT_be_shared_publicly)
  }

#write
write_rds(test_items_scored, "data/out/scored_items.rds")
write_rds(data_out, "data/out/main_data.rds")

#versions
write_sessioninfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] mirt_1.31          osfr_0.2.8         rms_5.1-4          SparseM_1.78      
##  [5] DT_0.13            lubridate_1.7.4    readxl_1.3.1       kirkegaard_2018.05
##  [9] metafor_2.4-0      Matrix_1.2-18      psych_1.9.12.31    magrittr_1.5      
## [13] assertthat_0.2.1   weights_1.0.1      mice_3.8.0         gdata_2.18.0      
## [17] Hmisc_4.4-0        Formula_1.2-3      survival_3.1-11    lattice_0.20-40   
## [21] forcats_0.5.0      stringr_1.4.0      dplyr_0.8.5        purrr_0.3.3       
## [25] readr_1.3.1        tidyr_1.0.2        tibble_3.0.0       ggplot2_3.3.0     
## [29] tidyverse_1.3.0    pacman_0.5.1      
## 
## loaded via a namespace (and not attached):
##  [1] TH.data_1.0-10        colorspace_1.4-1      ellipsis_0.3.0       
##  [4] htmlTable_1.13.3      base64enc_0.1-3       fs_1.4.1             
##  [7] rstudioapi_0.11       httpcode_0.2.0        farver_2.0.3         
## [10] Deriv_4.0             MatrixModels_0.4-1    fansi_0.4.1          
## [13] mvtnorm_1.1-0         xml2_1.3.1            codetools_0.2-16     
## [16] splines_3.6.3         mnormt_1.5-6          knitr_1.28           
## [19] jsonlite_1.6.1        broom_0.5.5           cluster_2.1.0        
## [22] dbplyr_1.4.2          png_0.1-7             compiler_3.6.3       
## [25] httr_1.4.1            backports_1.1.5       cli_2.0.2            
## [28] acepack_1.4.1         htmltools_0.4.0       quantreg_5.55        
## [31] tools_3.6.3           gtable_0.3.0          glue_1.4.0           
## [34] Rcpp_1.0.4.6          cellranger_1.1.0      vctrs_0.2.4          
## [37] crul_0.9.0            nlme_3.1-145          crosstalk_1.1.0.1    
## [40] multilevel_2.6        xfun_0.12             rvest_0.3.5          
## [43] lifecycle_0.2.0       dcurver_0.9.1         gtools_3.8.2         
## [46] polspline_1.1.17      MASS_7.3-51.5         zoo_1.8-7            
## [49] scales_1.1.0          hms_0.5.3             parallel_3.6.3       
## [52] sandwich_2.5-1        RColorBrewer_1.1-2    psychometric_2.2     
## [55] yaml_2.2.1            curl_4.3              memoise_1.1.0        
## [58] gridExtra_2.3         rpart_4.1-15          latticeExtra_0.6-29  
## [61] stringi_1.4.6         permute_0.9-5         checkmate_2.0.0      
## [64] rlang_0.4.5           pkgconfig_2.0.3       evaluate_0.14        
## [67] labeling_0.3          htmlwidgets_1.5.1     tidyselect_1.0.0     
## [70] plyr_1.8.6            R6_2.4.1              generics_0.0.2       
## [73] multcomp_1.4-12       DBI_1.1.0             mgcv_1.8-31          
## [76] pillar_1.4.3          haven_2.2.0           foreign_0.8-76       
## [79] withr_2.1.2           nnet_7.3-13           modelr_0.1.6         
## [82] crayon_1.3.4          rmarkdown_2.1         jpeg_0.1-8.1         
## [85] grid_3.6.3            data.table_1.12.8     vegan_2.5-6          
## [88] reprex_0.3.0          digest_0.6.25         GPArotation_2014.11-1
## [91] munsell_0.5.0         viridisLite_0.3.0
#update OSF
#manually only
#tutorial https://rpubs.com/EmilOWK/osfr_demo
if (F) {
  #connect
  osf_auth(read_lines("~/.config/osf_token"))
  osf_proj = osf_retrieve_node("https://osf.io/9ecwv/")
  
  #which files to upload
  osf_upload(osf_proj, 
             path = c("notebook.html", "notebook.Rmd", "figs", "data/out", "data/test_scoring_key.xlsx"), 
             conflicts = "overwrite")
}

Extras

Unrelated to the study itself: norms for the 20-item version of the science test.

#pick the 20 best items by factor loading
best20_items_scored = test_items_scored[item_stats %>% arrange(-loading) %>% slice(1:20) %>% pull(item)]
colnames(best20_items_scored)
##  [1] "item_16" "item_10" "item_28" "item_25" "item_21" "item_19" "item_23"
##  [8] "item_22" "item_29" "item_26" "item_14" "item_17" "item_9"  "item_4" 
## [15] "item_12" "item_27" "item_1"  "item_13" "item_2"  "item_11"
#analyze again
best20_items_irt = mirt(best20_items_scored, model = 1, verbose = F)
best20_items_irt_scored = fscores(best20_items_irt, full.scores = T, full.scores.SE = T)

#reliabilities
empirical_rxx(best20_items_irt_scored)
##    F1 
## 0.721
marginal_rxx(best20_items_irt)
## [1] 0.719
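
For reference, the empirical reliability can also be computed by hand from the EAP scores and their standard errors, using the usual true-score-variance over total-variance definition (a sketch; assumes the F1 and SE_F1 columns that fscores() returns when full.scores.SE = T):

#by-hand empirical reliability; should approximately match empirical_rxx() above
theta = best20_items_irt_scored[, "F1"]
theta_se = best20_items_irt_scored[, "SE_F1"]
var(theta) / (var(theta) + mean(theta_se^2))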
#numerics
d$total_score20 = rowSums(best20_items_scored)
d$total_score20 %>% describe()
##    vars   n mean   sd median trimmed  mad min max range   skew kurtosis    se
## X1    1 333 11.9 3.28     12      12 2.97   4  19    15 -0.131   -0.621 0.179
#distribution to centile and IQ
total_score_cdf20 = ecdf(d$total_score20)

#centiles
total_score_cdf20(0:20) %>% set_names(0:20)
##       0       1       2       3       4       5       6       7       8       9 
## 0.00000 0.00000 0.00000 0.00000 0.00901 0.02703 0.05405 0.09009 0.16817 0.24925 
##      10      11      12      13      14      15      16      17      18      19 
## 0.35736 0.44745 0.53153 0.64565 0.76877 0.86787 0.92492 0.95796 0.99099 1.00000 
##      20 
## 1.00000
#IQ scale
total_score_cdf20(0:20) %>% qnorm(mean = 100, sd = 15) %>% set_names(0:20)
##     0     1     2     3     4     5     6     7     8     9    10    11    12 
##  -Inf  -Inf  -Inf  -Inf  64.5  71.1  75.9  79.9  85.6  89.8  94.5  98.0 101.2 
##    13    14    15    16    17    18    19    20 
## 105.6 111.0 116.7 121.6 125.9 135.5   Inf   Inf
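
Because ecdf() returns P(X <= x), the highest observed score maps to the 100th centile and thus to IQ = Inf above. A standard remedy (a sketch, not applied above) is the midpoint percentile rank, averaging P(X < x) and P(X <= x) before the qnorm transform:

#midpoint-corrected centiles; scores are integers, so P(X < x) = F(x - 1)
#scores never observed in the sample still map to +-Inf
p_le = total_score_cdf20(0:20)
p_lt = total_score_cdf20(0:20 - 1)
((p_le + p_lt) / 2) %>% qnorm(mean = 100, sd = 15) %>% set_names(0:20)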
#plots
GG_denhist(d, "total_score20", vline = F) +
  scale_x_continuous("Score", limits = c(0, 20)) +
  ggtitle("Distribution of 20 item scores", "Data based on pilot study with n=333 persons")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

GG_save("figs/score_dist20.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).