This is the results page for the Corona study. You can take part in it here.
We now have sufficient data for some of the values of interest. Calculations and output are shown below!
options(digits = 3, scipen = 999)
options(contrasts = c("contr.treatment", "contr.treatment"))
library(pacman)
p_load(kirkegaard, readxl, lubridate, DT, rms, osfr, mirt)
theme_set(theme_bw())
#fix printing bug: coerce psych::describe output to a plain matrix
describe = function(...) psych::describe(...) %>% as.matrix()
#clean string of question fluff
str_remove_fluff = function(x) {
x %>%
str_clean() %>%
str_replace("Estimate death tolls of the new coronavirus", "") %>%
str_replace(" Note Europe does not include Turkey Russia or Caucasus area countries", "") %>%
str_replace("A state of emergency is a situation in which a government is empowered to perform actions or impose policies that it would normally not be permitted to undertake", "") %>%
str_replace(" The format is YYYY MM DD or click the calendar button", "") %>% str_replace(" 1st degree relatives i e parents children siblings", "") %>%
str_replace(" Infected means anyone who had the virus currently has it recovered from the virus or died of it as a consequence", "") %>%
str_trim() %>%
str_c("?")
}
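A quick sanity check of the cleaner on a raw question name (hypothetical input; expected output in the comment):
#example (hypothetical input): the fluff phrase is stripped and "?" re-appended
"What do you estimate will be the total death toll of Coronavirus in the USA in the next 2 years Estimate death tolls of the new coronavirus" %>%
str_remove_fluff()
#expected: "What do you estimate will be the total death toll of Coronavirus in the USA in the next 2 years?"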
#read data
d = read_csv("data/20200404154654-SurveyExport.csv") %>%
df_legalize_names() %>%
#filter test responses
filter(Time_Started > ymd_hms("2020-03-11 22:17:11"))
## Warning: Duplicated column names deduplicated: 'Other - Write In:To what extent
## have you altered your behaviour because of the new coronavirus? Check all
## that apply, and only check a box when you have done this as a result of the
## new coronavirus.' => 'Other - Write In:To what extent have you altered your
## behaviour because of the new coronavirus? Check all that apply, and only check a
## box when you have done this as a result of the new coronavirus._1' [112]
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Response ID` = col_double(),
## `Time Started` = col_datetime(format = ""),
## `Date Submitted` = col_datetime(format = ""),
## `Contact ID` = col_logical(),
## `Legacy Comments` = col_logical(),
## Comments = col_logical(),
## Tags = col_logical(),
## Longitude = col_double(),
## Latitude = col_double(),
## `What is your age?` = col_double(),
## `There is a movement of thought called the rationalist or Less Wrong movement, headed by people such as Eliezer Yudkowsky and Scott Alexander (Slate Star Codex). It is based on the idea that humans can become aware of and hopefully counteract various cognitive biases that we possess. To which degree do you consider yourself a rationalist?` = col_double(),
## `“Genetic variation contributes substantially to psychological and socioeconomic variation between human populations (clusters, races etc.), being ultimately responsible for at least 50% of observed differences.” How likely do you think this statement is true?` = col_double(),
## `What is your ideal rate of personal income taxation?` = col_double(),
## `How many of the following do you want to legalise or keep legal? Sex work/prostitution, strip clubs, casinos/gambling, cannabis, amphetamine, methamphetamine, LSD, driving without seatbelt, selling human organs, selling human semen/eggs, homosexual sex. Input a number (min. 0 max. 11).` = col_double(),
## `How accurate do you think the mainstream media has presented information about Coronavirus?` = col_double(),
## `The new coronavirus is a variant of the flu.` = col_logical(),
## `The new coronavirus uses DNA to replicate itself and store genetic information.` = col_logical(),
## `Some scientists working in fields such as microbiology and virology think the name SARS-CoV-2 is appropriate for the new coronavirus.` = col_logical(),
## `Washing hands is an effective way to prevent spreading the new coronavirus because soap is able to dissolve proteins, and the viral envelope of the new coronavirus is made out of protein.` = col_logical(),
## `The previous pages featured 30 questions about science and about coronaviruses. How many answers do you think you got correct?` = col_double()
## # ... with 12 more columns
## )
## See spec(...) for full column specifications.
#total samples
nrow(d)
## [1] 369
#variable list
d_vars = df_var_table(d)
d_vars #print it
#scoring key for test
test_scoring_key = read_excel("data/test_scoring_key.xlsx")
test_scoring_key$correct %<>% str_trim()
#original data before exclusion
d_original = d
#data collection
d_original %>%
ggplot(aes(Date_Submitted)) +
geom_histogram() +
scale_x_datetime(date_breaks = "3 days")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
d_original$Date_Submitted %>% mean()
## [1] "2020-03-13 15:05:19 UTC"
d_original$Date_Submitted %>% median()
## [1] "2020-03-12 12:42:35 UTC"
d_original$Date_Submitted %>% date() %>% table2()
#filter out late responses
d = d %>% filter(Date_Submitted <= ymd("2020-03-18"))
#filter out self-reported dishonest responses
d$honest = d$Did_you_honestly_answer_the_science_and_forecasting_questions_If_not_kindly_tell_us_so_we_can_exclude_your_results
d$honest %>% table2()
d = d %>% filter(honest == "Yes, I was honest")
#fake age, remove
d %<>% filter(What_is_your_age <= 100)
#implausible forecasts (more than the entire US population), remove
d %<>% filter(
What_do_you_estimate_will_be_the_total_death_toll_of_Coronavirus_in_the_USA_in_the_next_2_years_Estimate_death_tolls_of_the_new_coronavirus < 300e6
)
#check remaining submission dates after filtering
d$Date_Submitted %>% quantile()
## 0% 25% 50%
## "2020-03-11 22:43:22 UTC" "2020-03-12 01:24:56 UTC" "2020-03-12 08:19:02 UTC"
## 75% 100%
## "2020-03-13 12:29:23 UTC" "2020-03-17 14:30:14 UTC"
#rename
d$survey_duration = (d$Date_Submitted - d$Time_Started)
d$total_score = d$We_have_calculated_your_score %>% as.numeric()
d$age = d$What_is_your_age %>% as.numeric()
d$age_bin = d$age %>% discretize(breaks = 5)
d$sex = d$What_is_your_sex %>% factor()
d$education = d$What_is_the_highest_level_of_education_you_have_completed %>% ordered(c("None", "High school", "Vocational school", "Bachelor", "Master", "Doctorate or higher"))
d$education_num = d$education %>% as.numeric()
d$academic_class = d$What_is_your_current_academic_status %>% mapvalues(from = c(NA), to = c("I am not a researcher")) %>% factor()
d$GJP = d$Have_you_participated_in_the_Good_Judgment_Project_forecasting_tournament_or_other_prediction_markets %>% ordered(c("No", "Only a little bit", "Quite a bit", "A lot"))
d$rationalist = d$There_is_a_movement_of_thought_called_the_rationalist_or_Less_Wrong_movement_headed_by_people_such_as_Eliezer_Yudkowsky_and_Scott_Alexander_Slate_Star_Codex_It_is_based_on_the_idea_that_humans_can_become_aware_of_and_hopefully_counteract_various_cognitive_biases_that_we_possess_To_which_degree_do_you_consider_yourself_a_rationalist %>% as.numeric()
d$HBD = d$Genetic_variation_contributes_substantially_to_psychological_and_socioeconomic_variation_between_human_populations_clusters_races_etc_being_ultimately_responsible_for_at_least_50pct_of_observed_differences_How_likely_do_you_think_this_statement_is_true %>% as.numeric()
d$income_tax = d$What_is_your_ideal_rate_of_personal_income_taxation %>% as.numeric()
d$personal_freedom = d$How_many_of_the_following_do_you_want_to_legalise_or_keep_legal_Sex_work_prostitution_strip_clubs_casinos_gambling_cannabis_amphetamine_methamphetamine_LSD_driving_without_seatbelt_selling_human_organs_selling_human_semen_eggs_homosexual_sex_Input_a_number_min_0_max_11 %>% as.numeric()
d$media_coverage_accurate = d$How_accurate_do_you_think_the_mainstream_media_has_presented_information_about_Coronavirus %>% as.numeric()
d$self_score_estimate = d$The_previous_pages_featured_30_questions_about_science_and_about_coronaviruses_How_many_answers_do_you_think_you_got_correct %>% as.numeric()
d$self_forecast_accuracy = d$How_good_do_you_think_your_forecasts_will_be_relative_to_other_people_taking_this_survey %>% as.numeric()
d$fox_hedgehog = d$In_a_famous_essay_Isaiah_Berlin_classified_thinkers_as_hedgehogs_and_foxes_The_hedgehog_knows_one_big_thing_and_tries_to_explain_as_much_as_possible_using_that_theory_or_framework_The_fox_knows_many_small_things_and_is_content_to_improvise_explanations_on_a_case_by_case_basis_When_it_comes_to_making_predictions_would_you_describe_yourself_as_more_of_a_hedgehog_or_more_of_a_fox %>% ordered(c("Very much more fox-like", "Somewhat more fox-like", "Neither more fox-like nor more hedgehog-like", "Somewhat more hedgehog-like", "Very much more hedgehog-like"))
d$fox_hedgehog_num = d$fox_hedgehog %>% as.numeric()
d$follow_news_closely = d$On_a_scale_of_0_to_100_how_closely_have_you_been_following_news_related_to_the_the_new_coronavirus %>% as.numeric()
d$national_security = d$The_new_coronavirus_is_a_threat_to_the_national_security_of_the_country_in_which_I_currently_live %>% ordered(c("Strongly disagree", "Somewhat disagree", "Neither disagree nor agree", "Somewhat agree", "Strongly agree"))
d$italy_govt = d$The_Italian_authorities_initially_did_not_do_enough_to_combat_the_spread_of_the_new_coronavirus %>% ordered(c("Strongly disagree", "Somewhat disagree", "Neither disagree nor agree", "Somewhat agree", "Strongly agree"))
d$close_airports = d$Authorities_across_the_world_should_completely_close_airports_in_order_to_prevent_the_spread_of_the_new_coronavirus %>% ordered(c("Strongly disagree", "Somewhat disagree", "Neither disagree nor agree", "Somewhat agree", "Strongly agree"))
#countries
d$from_USA = d$Country == "United States"
d$from_UK = d$Country == "United Kingdom"
d$from_France = d$Country == "France"
Descriptive statistics for the basic variables.
#time to complete (difftime, units auto-chosen; here minutes)
d$survey_duration %>% as.numeric() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 28.1 34.3 21.1 22.7 9.49 7.57 497 490 9.12 110 1.88
#countries
d$Country %>% table2()
#sex
d$sex %>% table2()
#education
d$education %>% table2()
#academic class
d$academic_class %>% table2()
#GJP participation
d$GJP %>% table2()
#fox hedgehog
d$fox_hedgehog %>% as.numeric() %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 2.37 1.09 2 2.28 1.48 1 5 4 0.734 -0.224 0.0596
ggplot(d, aes(fox_hedgehog)) +
geom_bar() +
theme(axis.text.x = element_text(angle = -10)) +
ggtitle("Foxes and hedgehogs")
GG_save("figs/fox_dist.png")
#age
d$age %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 32.8 11.5 30 31.3 8.9 18 68 50 1.15 0.675 0.629
GG_denhist(d, "age", group = "sex") +
# ggtitle("Age distribution by sex", "Red line = mean") +
scale_x_continuous(breaks = seq(0, 200, by = 10))
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/age_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#rationalist self-rating
d$rationalist %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 58.2 25.7 65 60.2 22.2 0 100 100 -0.691 -0.386 1.41
GG_denhist(d, "rationalist", group = "sex") +
ggtitle("Rationalist self-identification distribution by sex", "Red line = mean")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/rationalist_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#HBD endorsement
d$HBD %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 76 29.7 89 81.4 16.3 0 100 100 -1.29 0.415 1.63
d$HBD %>% describeBy(d$sex)
##
## Descriptive statistics by group
## group: Female
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 32 62.4 34.6 74 65 37.1 0 100 100 -0.47 -1.27 6.12
## ------------------------------------------------------------
## group: Male
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 301 77.4 28.8 90 83 14.8 0 100 100 -1.41 0.78 1.66
GG_denhist(d, "HBD", group = "sex") +
ggtitle("Probability that population differences at least 50% genetic in origin", "Red line = mean")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/HBD_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#numerics
d$total_score %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 16.9 4.12 17 16.9 4.45 5 27 22 -0.0837 -0.28 0.226
#distribution to centile and IQ
total_score_cdf = ecdf(d$total_score)
#centiles
total_score_cdf(0:30) %>% set_names(0:30)
## 0 1 2 3 4 5 6 7 8 9
## 0.00000 0.00000 0.00000 0.00000 0.00000 0.00300 0.00901 0.01201 0.02402 0.03904
## 10 11 12 13 14 15 16 17 18 19
## 0.05105 0.09610 0.14414 0.21922 0.27628 0.37538 0.46547 0.55856 0.64264 0.73273
## 20 21 22 23 24 25 26 27 28 29
## 0.79880 0.86186 0.90691 0.94294 0.97598 0.98799 0.99399 1.00000 1.00000 1.00000
## 30
## 1.00000
#IQ scale
total_score_cdf(0:30) %>% qnorm(mean = 100, sd = 15) %>% set_names(0:30)
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## -Inf -Inf -Inf -Inf -Inf 58.8 64.5 66.1 70.3 73.6 75.5 80.4 84.1
## 13 14 15 16 17 18 19 20 21 22 23 24 25
## 88.4 91.1 95.2 98.7 102.2 105.5 109.3 112.6 116.3 119.8 123.7 129.7 133.9
## 26 27 28 29 30
## 137.7 Inf Inf Inf Inf
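The ±Inf values occur because ecdf() returns exactly 0 below the observed minimum and exactly 1 at the maximum, which qnorm() maps to -Inf and +Inf. A minimal sketch of a Laplace-style continuity correction (my addition, not part of the scoring above) that keeps the extremes finite:
#clip percentiles away from 0 and 1 before the normal quantile transform (sketch)
n = length(d$total_score)
total_score_cdf(0:30) %>%
pmax(0.5 / n) %>%
pmin(1 - 0.5 / n) %>%
qnorm(mean = 100, sd = 15) %>%
set_names(0:30)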
#plots
GG_denhist(d, "total_score")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/score_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_denhist(d, "total_score", "sex")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/score_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#cohen d
SMD_matrix(d$total_score, d$sex, extended_output = T)
## $d
## Female Male
## Female NA 0.09
## Male 0.09 NA
##
## $d_string
## Female Male
## Female NA "0.09 [-0.27 0.45]"
## Male "0.09 [-0.27 0.45]" NA
##
## $CI_lower
## Female Male
## Female NA -0.274
## Male -0.274 NA
##
## $CI_upper
## Female Male
## Female NA 0.455
## Male 0.455 NA
##
## $se_z
## [1] 1.96
##
## $se
## Female Male
## Female NA 0.186
## Male 0.186 NA
##
## $p
## Female Male
## Female NA 0.628
## Male 0.628 NA
##
## $pairwise_n
## Female Male
## Female NA 333
## Male 333 NA
#age
ggplot(d, aes(total_score, age_bin, fill = age_bin)) +
geom_boxplot() +
coord_flip() +
labs(x = "Science knowledge score", y = "Age group")
GG_save("figs/sumscore_dist_by_age.png")
#cor
cor.test(d$total_score, d$age)
##
## Pearson's product-moment correlation
##
## data: d$total_score and d$age
## t = 4, df = 331, p-value = 0.0002
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0951 0.3015
## sample estimates:
## cor
## 0.201
### Item response theory
#item data
test_items = d %>%
#select items
select(What_is_COVID_19_COVID_19_is_a:In_the_past_growth_of_moulds_on_stored_food_and_drink_was_a_persistent_problem_Who_dispelled_the_general_belief_of_spontaneous_generation_that_is_the_supposed_production_of_living_organisms_from_non_living_matter_as_inferred_from_the_apparent_appearance_of_life_in_some_supposedly_sterile_environments) %>%
#remove whitespace
map_df(str_trim)
test_items_long_names = names(test_items)
#note: "+" on strings here is kirkegaard's string concatenation operator
names(test_items) = "item_" + 1:ncol(test_items)
#score
test_items_scored = score_items(test_items, key = test_scoring_key$correct)
#reliability
alpha(test_items_scored)
##
## Reliability analysis
## Call: alpha(x = test_items_scored)
##
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd median_r
## 0.69 0.7 0.72 0.071 2.3 0.024 0.56 0.14 0.067
##
## lower alpha upper 95% confidence boundaries
## 0.64 0.69 0.74
##
## Reliability if an item is dropped:
## raw_alpha std.alpha G6(smc) average_r S/N alpha se var.r med.r
## item_1 0.69 0.69 0.72 0.072 2.2 0.024 0.0049 0.067
## item_2 0.69 0.69 0.72 0.072 2.2 0.024 0.0048 0.068
## item_3 0.69 0.69 0.72 0.072 2.3 0.024 0.0048 0.067
## item_4 0.68 0.69 0.71 0.071 2.2 0.025 0.0048 0.067
## item_5 0.70 0.70 0.72 0.074 2.3 0.024 0.0047 0.068
## item_6 0.69 0.70 0.72 0.074 2.3 0.024 0.0049 0.068
## item_7 0.70 0.70 0.73 0.074 2.3 0.024 0.0047 0.068
## item_8 0.69 0.70 0.72 0.073 2.3 0.024 0.0049 0.068
## item_9 0.68 0.69 0.71 0.070 2.2 0.025 0.0049 0.066
## item_10 0.68 0.68 0.71 0.069 2.2 0.025 0.0048 0.065
## item_11 0.69 0.69 0.72 0.072 2.3 0.024 0.0049 0.068
## item_12 0.69 0.69 0.72 0.071 2.2 0.025 0.0049 0.067
## item_13 0.69 0.69 0.72 0.071 2.2 0.025 0.0049 0.067
## item_14 0.68 0.69 0.71 0.070 2.2 0.025 0.0048 0.065
## item_15 0.69 0.69 0.72 0.072 2.3 0.024 0.0049 0.068
## item_16 0.67 0.67 0.70 0.067 2.1 0.026 0.0046 0.064
## item_17 0.68 0.69 0.71 0.070 2.2 0.025 0.0048 0.067
## item_18 0.69 0.69 0.72 0.073 2.3 0.024 0.0049 0.068
## item_19 0.67 0.68 0.71 0.068 2.1 0.026 0.0046 0.065
## item_20 0.69 0.69 0.72 0.072 2.2 0.024 0.0047 0.067
## item_21 0.69 0.69 0.71 0.071 2.2 0.025 0.0048 0.067
## item_22 0.68 0.68 0.71 0.069 2.1 0.025 0.0048 0.065
## item_23 0.67 0.68 0.71 0.069 2.1 0.025 0.0046 0.066
## item_24 0.69 0.69 0.72 0.072 2.3 0.024 0.0048 0.068
## item_25 0.68 0.68 0.71 0.068 2.1 0.025 0.0046 0.065
## item_26 0.68 0.68 0.71 0.070 2.2 0.025 0.0048 0.066
## item_27 0.68 0.69 0.71 0.070 2.2 0.025 0.0048 0.066
## item_28 0.67 0.68 0.70 0.067 2.1 0.026 0.0044 0.065
## item_29 0.68 0.68 0.71 0.070 2.2 0.025 0.0048 0.065
## item_30 0.69 0.70 0.72 0.073 2.3 0.024 0.0049 0.068
##
## Item statistics
## n raw.r std.r r.cor r.drop mean sd
## item_1 333 0.27 0.28 0.221 0.164 0.29 0.46
## item_2 333 0.26 0.28 0.217 0.173 0.84 0.37
## item_3 333 0.27 0.26 0.202 0.164 0.71 0.45
## item_4 333 0.30 0.33 0.285 0.219 0.85 0.36
## item_5 333 0.19 0.17 0.096 0.068 0.42 0.49
## item_6 333 0.18 0.20 0.121 0.092 0.14 0.35
## item_7 333 0.18 0.17 0.091 0.060 0.62 0.49
## item_8 333 0.23 0.23 0.161 0.123 0.25 0.43
## item_9 333 0.36 0.36 0.313 0.257 0.35 0.48
## item_10 333 0.38 0.39 0.364 0.304 0.14 0.34
## item_11 333 0.24 0.26 0.201 0.170 0.91 0.29
## item_12 333 0.30 0.31 0.257 0.204 0.24 0.43
## item_13 333 0.32 0.31 0.251 0.202 0.54 0.50
## item_14 333 0.32 0.35 0.306 0.242 0.85 0.36
## item_15 333 0.29 0.27 0.204 0.171 0.59 0.49
## item_16 333 0.49 0.50 0.496 0.409 0.78 0.41
## item_17 333 0.34 0.34 0.301 0.248 0.78 0.42
## item_18 333 0.23 0.24 0.170 0.136 0.80 0.40
## item_19 333 0.45 0.44 0.424 0.348 0.63 0.48
## item_20 333 0.30 0.28 0.221 0.185 0.40 0.49
## item_21 333 0.29 0.33 0.285 0.237 0.95 0.22
## item_22 333 0.42 0.41 0.384 0.316 0.68 0.47
## item_23 333 0.44 0.42 0.394 0.336 0.33 0.47
## item_24 333 0.26 0.25 0.191 0.154 0.26 0.44
## item_25 333 0.43 0.42 0.406 0.340 0.73 0.44
## item_26 333 0.38 0.37 0.330 0.275 0.70 0.46
## item_27 333 0.34 0.34 0.294 0.243 0.28 0.45
## item_28 333 0.49 0.47 0.463 0.394 0.41 0.49
## item_29 333 0.38 0.37 0.331 0.272 0.62 0.49
## item_30 333 0.24 0.23 0.162 0.129 0.71 0.46
##
## Non missing response frequency for each item
## 0 1 miss
## item_1 0.71 0.29 0
## item_2 0.16 0.84 0
## item_3 0.29 0.71 0
## item_4 0.15 0.85 0
## item_5 0.58 0.42 0
## item_6 0.86 0.14 0
## item_7 0.38 0.62 0
## item_8 0.75 0.25 0
## item_9 0.65 0.35 0
## item_10 0.86 0.14 0
## item_11 0.09 0.91 0
## item_12 0.76 0.24 0
## item_13 0.46 0.54 0
## item_14 0.15 0.85 0
## item_15 0.41 0.59 0
## item_16 0.22 0.78 0
## item_17 0.22 0.78 0
## item_18 0.20 0.80 0
## item_19 0.37 0.63 0
## item_20 0.60 0.40 0
## item_21 0.05 0.95 0
## item_22 0.32 0.68 0
## item_23 0.67 0.33 0
## item_24 0.74 0.26 0
## item_25 0.27 0.73 0
## item_26 0.30 0.70 0
## item_27 0.72 0.28 0
## item_28 0.59 0.41 0
## item_29 0.38 0.62 0
## item_30 0.29 0.71 0
#item data
item_stats = tibble(
item = names(test_items),
# question = test_items_long_names %>% str_clean(),
pass_rate = colMeans(test_items_scored)
)
#IRT fit
irt_fit = irt.fa(test_items_scored)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## In smc, smcs < 0 were set to .0
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
item_stats$loading = irt_fit$fa$loadings[, 1]
item_stats$discrimination = irt_fit$irt$discrimination[, 1]
#mirt too
#we need it for reliability
irt_fit_mirt = mirt(test_items_scored, model = 1)
##
Iteration: 1, Log-Lik: -5421.032, Max-Change: 0.57422
Iteration: 2, Log-Lik: -5358.640, Max-Change: 0.16266
Iteration: 3, Log-Lik: -5352.402, Max-Change: 0.07363
Iteration: 4, Log-Lik: -5351.539, Max-Change: 0.02506
Iteration: 5, Log-Lik: -5351.375, Max-Change: 0.01238
Iteration: 6, Log-Lik: -5351.334, Max-Change: 0.00783
Iteration: 7, Log-Lik: -5351.319, Max-Change: 0.00515
Iteration: 8, Log-Lik: -5351.316, Max-Change: 0.00285
Iteration: 9, Log-Lik: -5351.315, Max-Change: 0.00124
Iteration: 10, Log-Lik: -5351.314, Max-Change: 0.00103
Iteration: 11, Log-Lik: -5351.314, Max-Change: 0.00024
Iteration: 12, Log-Lik: -5351.314, Max-Change: 0.00021
Iteration: 13, Log-Lik: -5351.314, Max-Change: 0.00015
Iteration: 14, Log-Lik: -5351.314, Max-Change: 0.00013
Iteration: 15, Log-Lik: -5351.314, Max-Change: 0.00012
Iteration: 16, Log-Lik: -5351.314, Max-Change: 0.00028
Iteration: 17, Log-Lik: -5351.314, Max-Change: 0.00018
Iteration: 18, Log-Lik: -5351.314, Max-Change: 0.00005
irt_fit_mirt_scores = fscores(irt_fit_mirt, full.scores = T, full.scores.SE = T)
empirical_rxx(irt_fit_mirt_scores)
## F1
## 0.745
marginal_rxx(irt_fit_mirt)
## [1] 0.744
#summary
item_stats$pass_rate %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 30 0.56 0.246 0.619 0.566 0.312 0.135 0.949 0.814 -0.199 -1.38
## se
## X1 0.0448
item_stats$loading %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 30 0.369 0.141 0.359 0.369 0.166 0.125 0.647 0.522 0.0897 -1.12
## se
## X1 0.0258
#score
irt_fit_scored = scoreIrt(irt_fit, items = test_items_scored)
d$irt_score = irt_fit_scored$theta1 %>% standardize()
#bins
d$irt_score_bin = d$irt_score %>% discretize(breaks = 3, equal_range = F, labels = "integer")
d$irt_score_bin %>% table2()
describeBy(d$irt_score, d$irt_score_bin, mat = T)
#compare
GG_scatter(d, "total_score", "irt_score")
## `geom_smooth()` using formula 'y ~ x'
GG_save("figs/IRT_score_total_score.png")
## `geom_smooth()` using formula 'y ~ x'
#plots
GG_denhist(d, "irt_score", "sex") +
scale_x_continuous("Science knowledge score")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/IRT_score_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#cohen d
SMD_matrix(d$irt_score, d$sex, extended_output = T)
## $d
## Female Male
## Female NA 0.0273
## Male 0.0273 NA
##
## $d_string
## Female Male
## Female NA "0.027 [-0.34 0.39]"
## Male "0.027 [-0.34 0.39]" NA
##
## $CI_lower
## Female Male
## Female NA -0.337
## Male -0.337 NA
##
## $CI_upper
## Female Male
## Female NA 0.392
## Male 0.392 NA
##
## $se_z
## [1] 1.96
##
## $se
## Female Male
## Female NA 0.186
## Male 0.186 NA
##
## $p
## Female Male
## Female NA 0.883
## Male 0.883 NA
##
## $pairwise_n
## Female Male
## Female NA 333
## Male 333 NA
#age
ggplot(d, aes(irt_score, age_bin, fill = age_bin)) +
geom_boxplot() +
coord_flip() +
labs(x = "Science knowledge score", y = "Age group")
GG_save("figs/IRT_score_dist_by_age.png")
#cor
cor.test(d$irt_score, d$age)
##
## Pearson's product-moment correlation
##
## data: d$irt_score and d$age
## t = 3, df = 331, p-value = 0.004
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.0493 0.2591
## sample estimates:
## cor
## 0.156
ggplot(d %>% filter(age >= 30), aes(irt_score, education, fill = education)) +
geom_boxplot() +
coord_flip() +
labs(x = "Science knowledge score", y = "Highest education attainted")
GG_save("figs/IRT_score_dist_by_education.png")
#cor
cor.test(d$irt_score, d$education_num)
##
## Pearson's product-moment correlation
##
## data: d$irt_score and d$education_num
## t = 4, df = 331, p-value = 0.00001
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.131 0.334
## sample estimates:
## cor
## 0.235
#model
ols(irt_score ~ sex + age + education_num, data = d %>% df_standardize(messages = F))
## Linear Regression Model
##
## ols(formula = irt_score ~ sex + age + education_num, data = d %>%
## df_standardize(messages = F))
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 333 LR chi2 19.97 R2 0.058
## sigma0.9749 d.f. 3 R2 adj 0.050
## d.f. 329 Pr(> chi2) 0.0002 g 0.273
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.28570 -0.72773 0.09568 0.71296 2.21863
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 0.0056 0.1725 0.03 0.9742
## sex=Male -0.0062 0.1814 -0.03 0.9728
## age 0.0617 0.0602 1.03 0.3056
## education_num 0.2067 0.0601 3.44 0.0007
##
#self estimate
ols(total_score ~ self_score_estimate, data = d)
## Linear Regression Model
##
## ols(formula = total_score ~ self_score_estimate, data = d)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 333 LR chi2 100.85 R2 0.261
## sigma3.5480 d.f. 1 R2 adj 0.259
## d.f. 331 Pr(> chi2) 0.0000 g 2.393
##
## Residuals
##
## Min 1Q Median 3Q Max
## -11.52278 -2.51051 0.06741 2.47722 8.46496
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 11.5596 0.5290 21.85 <0.0001
## self_score_estimate 0.3975 0.0367 10.82 <0.0001
##
cor.test(d$total_score, d$self_score_estimate)
##
## Pearson's product-moment correlation
##
## data: d$total_score and d$self_score_estimate
## t = 11, df = 331, p-value <0.0000000000000002
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.427 0.586
## sample estimates:
## cor
## 0.511
GG_scatter(d, "self_score_estimate", "total_score") +
geom_count() +
geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
GG_save("figs/self_rate_score_and_score.png")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#model
ols(irt_score ~ sex + age + education_num + self_score_estimate, data = d %>% df_standardize(messages = F))
## Linear Regression Model
##
## ols(formula = irt_score ~ sex + age + education_num + self_score_estimate,
## data = d %>% df_standardize(messages = F))
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 333 LR chi2 111.24 R2 0.284
## sigma0.8513 d.f. 4 R2 adj 0.275
## d.f. 328 Pr(> chi2) 0.0000 g 0.604
##
## Residuals
##
## Min 1Q Median 3Q Max
## -2.38044 -0.61784 0.04835 0.59185 2.40931
##
##
## Coef S.E. t Pr(>|t|)
## Intercept -0.1241 0.1512 -0.82 0.4124
## sex=Male 0.1373 0.1591 0.86 0.3889
## age -0.0202 0.0531 -0.38 0.7041
## education_num 0.1329 0.0530 2.51 0.0126
## self_score_estimate 0.4957 0.0487 10.17 <0.0001
##
#books
books = d %>%
select(Superforecasting_The_Art_and_Science_of_Prediction_by_Philip_E_Tetlock_and_Dan_Gardner_Which_of_the_following_books_have_you_read_Check_all_that_apply:Any_university_level_textbook_introduction_to_medicine_Which_of_the_following_books_have_you_read_Check_all_that_apply) %>%
names()
#book question fluff
book_fluff = "_Which_of_the_following_books_have_you_read_Check_all_that_apply"
#books endorsed
books_data = d %>%
select(!!books) %>%
map_df(~!is.na(.))
names(books_data) = names(books_data) %>% str_replace(book_fluff, "")
#table of books
books_data %>%
map_df(~mean(.)) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("book") %>%
rename(read_pct = V1) %>%
mutate(book = book %>% str_clean()) %>%
arrange(-read_pct) %>%
df_round(2) %>%
datatable()
#save index score
d$book_index = books_data %>%
rowSums()
We will want to cluster the political labels later on.
#names of labels
pol_labels = d %>%
select(Neoreactionary_nrx_Check_any_label_you_identify_with_politically:N_A_Check_any_label_you_identify_with_politically) %>%
names()
#data
pol_data = d %>% select(!!pol_labels) %>% map_df(~!is.na(.))
#nice names
names(pol_data) = pol_labels %>% str_replace("_Check_any_label_you_identify_with_politically", "")
#endorsement by label
pol_data %>% colMeans()
## Neoreactionary_nrx alt_right ethnonationalist communist
## 0.2192 0.1441 0.2012 0.0180
## socialist liberal classical_liberal leftist
## 0.0751 0.1261 0.2492 0.0691
## conservative libertarian anarcholibertarian radical_centrist
## 0.2613 0.2282 0.0450 0.0871
## plain_old_centrist YIMBY NIMBY moderate
## 0.0300 0.1622 0.0210 0.1231
## center_right center_left apolitical N_A
## 0.1502 0.1051 0.0901 0.0961
#labels heatmap
pol_data %>%
map_df(as.numeric) %>%
mixedCor() %>%
.$rho %>%
GG_heatmap(font_size = 3.2, color_label = "Latent correlations")
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
GG_save("figs/political_labels.png")
#summarize politics into fewer dimensions
#by trial and error, got 3 factor solution
pol_fa = irt.fa(pol_data %>% map_df(as.numeric), nfactors = 3)
## Warning in cor.smooth(mat): Matrix was not positive definite, smoothing was done
## Warning in fa.stats(r = r, f = f, phi = phi, n.obs = n.obs, np.obs = np.obs, :
## The estimated weights for the factor scores are probably incorrect. Try a
## different factor score estimation method.
pol_fa$fa
## Factor Analysis using method = minres
## Call: fa(r = r, nfactors = nfactors, n.obs = n.obs, rotate = rotate,
## fm = fm)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 MR2 MR3 h2 u2 com
## Neoreactionary_nrx 0.72 -0.11 -0.02 0.528 0.47 1.1
## alt_right 0.77 0.08 0.03 0.595 0.41 1.0
## ethnonationalist 0.66 -0.21 -0.05 0.491 0.51 1.2
## communist 0.27 0.07 0.71 0.563 0.44 1.3
## socialist 0.13 -0.04 0.85 0.712 0.29 1.0
## liberal -0.33 0.60 0.34 0.626 0.37 2.2
## classical_liberal 0.04 0.71 -0.33 0.583 0.42 1.4
## leftist -0.20 -0.06 0.86 0.792 0.21 1.1
## conservative 0.40 0.20 -0.32 0.312 0.69 2.5
## libertarian 0.23 0.65 -0.04 0.472 0.53 1.3
## anarcholibertarian 0.38 0.26 0.17 0.243 0.76 2.2
## radical_centrist 0.19 0.26 0.12 0.119 0.88 2.2
## plain_old_centrist 0.00 0.47 0.16 0.260 0.74 1.2
## YIMBY -0.12 0.54 0.07 0.321 0.68 1.1
## NIMBY 0.67 0.24 0.32 0.592 0.41 1.7
## moderate -0.22 0.56 -0.03 0.358 0.64 1.3
## center_right -0.04 0.38 -0.39 0.274 0.73 2.0
## center_left -0.53 0.19 0.36 0.489 0.51 2.1
## apolitical -0.03 0.02 0.12 0.017 0.98 1.2
## N_A -0.26 -0.52 0.02 0.336 0.66 1.5
##
## MR1 MR2 MR3
## SS loadings 3.04 2.88 2.76
## Proportion Var 0.15 0.14 0.14
## Cumulative Var 0.15 0.30 0.43
## Proportion Explained 0.35 0.33 0.32
## Cumulative Proportion 0.35 0.68 1.00
##
## With factor correlations of
## MR1 MR2 MR3
## MR1 1.00 0.01 -0.07
## MR2 0.01 1.00 0.08
## MR3 -0.07 0.08 1.00
##
## Mean item complexity = 1.5
## Test of the hypothesis that 3 factors are sufficient.
##
## The degrees of freedom for the null model are 190 and the objective function was 78.7 with Chi Square of 25526
## The degrees of freedom for the model are 133 and the objective function was 72
##
## The root mean square of the residuals (RMSR) is 0.12
## The df corrected root mean square of the residuals is 0.14
##
## The harmonic number of observations is 333 with the empirical chi square 1851 with prob < 0.0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000043
## The total number of observations was 333 with Likelihood Chi Square = 23219 with prob < 0
##
## Tucker Lewis Index of factoring reliability = -0.31
## RMSEA index = 0.722 and the 90 % confidence intervals are 0.715 0.731
## BIC = 22446
## Fit based upon off diagonal values = 0.79
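Since the 3-factor solution was picked by trial and error, one quick way to sanity-check the factor count would be parallel analysis on the tetrachoric correlations (a sketch, not run as part of the original analysis):
#parallel analysis on tetrachoric correlations as a factor-count check (sketch)
fa.parallel(pol_data %>% map_df(as.numeric), cor = "tet", fa = "fa")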
#score
#error when scoring using IRT?
pol_fa_scores = scoreIrt(stats = pol_fa$fa, items = pol_data %>% map_df(as.numeric))
d$pol_factor_libertarianish = pol_fa_scores$theta1 %>% standardize()
d$pol_factor_altrightish = pol_fa_scores$theta2 %>% standardize()
d$pol_factor_leftliberalish = pol_fa_scores$theta3 %>% standardize()
#income tax
d$income_tax %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 21.7 14.5 20 20.9 14.8 0 100 100 0.821 2.11 0.793
GG_denhist(d, "income_tax", group = "sex") +
scale_x_continuous(limits = c(0, 100)) +
ggtitle("Income tax preference distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
GG_save("figs/income_tax_pref_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#personal freedom
d$personal_freedom %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 6.05 3.32 6 6.13 4.45 0 11 11 -0.175 -1.15 0.182
GG_denhist(d, "personal_freedom", group = "sex") +
scale_x_continuous(limits = c(0, 11), breaks = 0:100) +
ggtitle("Personal freedom items endorsed, distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
GG_save("figs/pers_freedom_dist_by_sex.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#national security
ggplot(d, aes(national_security, fill = sex)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle = -10)) +
ggtitle("The new coronavirus is a threat to the national security of the country in which I currently live.")
GG_save("figs/CV_nat_sec_dist_by_sex.png")
#italy govt
ggplot(d, aes(italy_govt, fill = sex)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle = -10)) +
ggtitle("The Italian authorities initially did not do enough to combat the spread of the new coronavirus.")
GG_save("figs/italy_govt_dist_by_sex.png")
#close airports
ggplot(d, aes(close_airports, fill = sex)) +
geom_bar(position = "fill") +
theme(axis.text.x = element_text(angle = -10)) +
ggtitle("Authorities across the world should\ncompletely close airports in order to prevent the spread of the new coronavirus.")
GG_save("figs/close_airports_dist_by_sex.png")
#how closely following
d$follow_news_closely %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 62.8 23.2 69 64.2 22.2 1 100 99 -0.545 -0.59 1.27
GG_denhist(d, "follow_news_closely", group = "sex") +
scale_x_continuous(limits = c(0, 100)) +
ggtitle("Following news on Coronavirus, distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#media coverage accuracy
d$media_coverage_accurate %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 40 23.9 36 38.9 23.7 0 100 100 0.384 -0.712 1.31
GG_denhist(d, "media_coverage_accurate", group = "sex") +
scale_x_continuous(limits = c(0, 100)) +
ggtitle("Mainstream media accuracy rating, distribution by sex", "Red line = mean")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#relationships
GG_scatter(d, "follow_news_closely", "media_coverage_accurate") +
geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
GG_scatter(d, "follow_news_closely", "total_score") +
geom_smooth()
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
#safety items
safety_items = d %>%
select(contains("Check_all_that_apply_and_only_check_a_box_when_you_have_done_this_as_a_result_of_the_new_coronavirus")) %>%
names() %>%
.[-c(14:15)]
#data
safety_data = d %>% select(!!safety_items) %>% map_df(~!is.na(.))
#nice names
names(safety_data) = safety_items %>% str_replace("_To_what_extent_have_you_altered_your_behaviour_because_of_the_new_coronavirus_Check_all_that_apply_and_only_check_a_box_when_you_have_done_this_as_a_result_of_the_new_coronavirus", "")
#endorsement by label
safety_data %>% colMeans() %>% sort(decreasing = T)
## I_wash_my_hands_more_often_or_have_started_washing_them
## 0.8258
## I_have_stopped_shaking_people_s_hands_to_greet_them
## 0.5526
## I_built_an_emergency_food_storage
## 0.4925
## I_have_canceled_travel_plans
## 0.3844
## I_have_stopped_going_to_my_place_of_work_or_education
## 0.3093
## I_have_purchased_face_masks_gas_masks_or_other_equipment
## 0.2883
## I_have_ordered_more_products_on_the_internet_than_I_normally_do
## 0.2492
## I_have_stopped_kissing_people_on_the_cheeks_to_greet_them_In_some_cultures_people_are_kissed_on_their_cheek_as_a_greeting
## 0.1892
## I_have_reduced_the_amount_of_times_I_visit_my_place_of_worship_for_example_a_church_mosque_or_synagogue_or_have_stopped_going_completely
## 0.1441
## I_did_not_go_to_a_birthday_party_a_jubilee_party_marriage_ceremony_or_similar_event
## 0.1381
## Other_Write_In
## 0.0751
## I_have_stopped_dating
## 0.0631
## I_have_stopped_touching_my_pet_s_or_have_been_touching_my_pet_s_less
## 0.0240
These forecasts are tricky to summarize, probably because the answers differ by orders of magnitude (log-scale differences). We can make a start.
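To see the problem concretely, compare raw and log-scale summaries for a single item (a sketch of my own; the rank-based approach used further below avoids the issue entirely):
#raw mean vs. geometric mean vs. median for the USA death toll forecast (sketch)
usa_toll = d$What_do_you_estimate_will_be_the_total_death_toll_of_Coronavirus_in_the_USA_in_the_next_2_years_Estimate_death_tolls_of_the_new_coronavirus %>% as.numeric()
mean(usa_toll, na.rm = T)           #dominated by a few huge answers
10^mean(log10(usa_toll), na.rm = T) #geometric mean (assumes positive values)
median(usa_toll, na.rm = T)         #robust alternative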
#forecast variables
forecast_vars = d %>%
select(What_do_you_estimate_will_be_the_total_death_toll_of_Coronavirus_in_the_USA_in_the_next_2_years_Estimate_death_tolls_of_the_new_coronavirus:What_is_the_probability_that_anyone_in_your_immediate_genetic_family_will_be_infected_by_the_coronavirus_in_the_next_two_years_1st_degree_relatives_i_e_parents_children_siblings) %>% names()
forecast_vars_date = d %>%
select(starts_with("When_do_you_think_the_rate_of_new")) %>% names()
forecast_vars_number = setdiff(forecast_vars, forecast_vars_date)
#descriptive stats for each
map_df(forecast_vars, function(v) {
x = d[[v]]
#is time?
if (d[[v]] %>% is.Date()) return(tibble())
#character? its percent
if (d[[v]] %>% is.character()) x = d[[v]] %>% str_replace("%", "") %>% as.numeric()
tibble(
question = v %>% str_clean()
) %>%
cbind(describe(x))
}) %>%
select(-vars, -n, -se, -range, -skew, -kurtosis) %>%
rename(trimmed_mean_10pct = trimmed) %>%
df_round(2) %>%
datatable()
#plot each of them
for (v in forecast_vars) {
#is percent?
is_percent = d[[v]] %>% str_detect("%") %>% any(na.rm = T)
#clean
local_v = d[[v]]
if (is_percent) local_v = local_v %>% str_replace("%", "") %>% as.numeric() %>% divide_by(100)
#plot
if (is_percent) {
gg = GG_denhist(tibble(x = local_v), "x", vline = "median") +
scale_x_continuous("Probability",
breaks = seq(0, 1, by = .1),
limits = c(0, 1),
labels = scales::percent) +
ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
} else if (is.Date(local_v)) {
gg = ggplot(tibble(x = local_v), aes(x)) +
geom_histogram(binwidth = 2.5) +
geom_density(aes(y=2.5 * ..count..)) +
scale_x_date("Date", date_breaks = "1 month") +
theme(axis.text.x = element_text(angle = -20)) +
geom_vline(data = tibble(x = median(local_v, na.rm = T)), mapping = aes(xintercept = x), color = "red", linetype = "dashed", size = 1) +
ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
} else {
gg = GG_denhist(tibble(x = local_v), "x", vline = "median") +
scale_x_log10("Number", breaks = 10^(0:20)) +
theme(
# axis.title.y = element_blank()
axis.text.y = element_blank(),
# axis.ticks.y = element_blank()
) +
ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
}
print(gg)
GG_save(plot = gg, str_glue("figs/forecast_{which(v == forecast_vars)}.png"))
}
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
#all in one plot?
#not possible because some are dates
#but we can do 2 subsets
d %>%
#gather to long form
gather(key = question, value = value, !!forecast_vars_date) %>%
mutate(question = mapvalues(question, from = forecast_vars_date, to = c("USA", "Germany", "Europe", "United Kingdom")),
date = as.Date(value, origin = "1970-01-01")) %>%
ggplot(aes(date)) +
geom_histogram() +
scale_x_date(limits = c(as.Date("2020-01-01", "%Y-%m-%d"), as.Date("2022-01-01", "%Y-%m-%d")),
date_breaks = "3 months", guide = guide_axis(n.dodge = 2)) +
facet_wrap("question", nrow = 2)
## Warning: attributes are not identical across measure variables;
## they will be dropped
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_bar).
GG_save("figs/forecast_distributions_dates.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3 rows containing non-finite values (stat_bin).
## Warning: Removed 8 rows containing missing values (geom_bar).
#summary
d %>%
select(!!forecast_vars_date) %>%
map_df(function(x) {
tibble(
mean = mean(x),
median = median(x)
)
})
#forecast general index
#convert to rank format
forecasts_numeric_ranked = d[forecast_vars %>% str_subset("When_do_you_thi", negate = T)] %>% map_df(function(x) {
if (!is.numeric(x)) x = x %>% str_replace("%", "") %>% as.numeric()
rank(x)
})
#better names
names(forecasts_numeric_ranked) = c(
"death_toll_" + c("USA", "Europe", "UK", "Poland"),
"prevalence_" + c("France", "Italy", "Spain", "USA"),
"emergency_declared_" + c("USA", "Germany", "UK", "Sweden"),
"risk_" + c("self", "family")
)
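Because rank() is monotone, Pearson correlations among these ranked variables are Spearman correlations of the raw forecasts (rank() averages ties, matching Spearman), so the factor analysis below is insulated from the orders-of-magnitude skew noted earlier. A quick check (sketch):
#Pearson correlation on ranks = Spearman correlation on the raw values
cor(forecasts_numeric_ranked$death_toll_USA, forecasts_numeric_ranked$death_toll_UK)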
#heatmap
forecasts_numeric_ranked %>%
GG_heatmap()
GG_save("figs/forecasts_heatmap.png")
#factor analyze
forecasts_fa = fa(forecasts_numeric_ranked)
forecasts_fa
## Factor Analysis using method = minres
## Call: fa(r = forecasts_numeric_ranked)
## Standardized loadings (pattern matrix) based upon correlation matrix
## MR1 h2 u2 com
## death_toll_USA 0.65 0.420 0.58 1
## death_toll_Europe 0.67 0.449 0.55 1
## death_toll_UK 0.68 0.466 0.53 1
## death_toll_Poland 0.66 0.435 0.57 1
## prevalence_France 0.85 0.718 0.28 1
## prevalence_Italy 0.79 0.622 0.38 1
## prevalence_Spain 0.84 0.700 0.30 1
## prevalence_USA 0.86 0.734 0.27 1
## emergency_declared_USA 0.30 0.091 0.91 1
## emergency_declared_Germany 0.34 0.117 0.88 1
## emergency_declared_UK 0.33 0.112 0.89 1
## emergency_declared_Sweden 0.34 0.118 0.88 1
## risk_self 0.64 0.412 0.59 1
## risk_family 0.60 0.363 0.64 1
##
## MR1
## SS loadings 5.76
## Proportion Var 0.41
##
## Mean item complexity = 1
## Test of the hypothesis that 1 factor is sufficient.
##
## The degrees of freedom for the null model are 91 and the objective function was 17.5 with Chi Square of 5706
## The degrees of freedom for the model are 77 and the objective function was 11.3
##
## The root mean square of the residuals (RMSR) is 0.25
## The df corrected root mean square of the residuals is 0.27
##
## The harmonic number of observations is 333 with the empirical chi square 3718 with prob < 0
## The total number of observations was 333 with Likelihood Chi Square = 3680 with prob < 0
##
## Tucker Lewis Index of factoring reliability = 0.24
## RMSEA index = 0.375 and the 90 % confidence intervals are 0.365 0.386
## BIC = 3233
## Fit based upon off diagonal values = 0.73
## Measures of factor score adequacy
## MR1
## Correlation of (regression) scores with factors 0.96
## Multiple R square of scores with factors 0.93
## Minimum correlation of possible factor scores 0.86
forecasts_fa$loadings[, 1] %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 14 0.611 0.202 0.654 0.616 0.236 0.302 0.857 0.555 -0.38 -1.45
## se
## X1 0.0539
fa_plot_loadings(forecasts_fa)
GG_save("figs/forecasts_loadings.png")
#save scores
d$general_forecast = forecasts_fa$scores[, 1] %>% standardize()
d$general_forecast %>% describe()
## vars n mean sd median trimmed mad min max range
## X1 1 333 0.0000000000000000132 1 -0.0242 0.00627 1.11 -2.16 2.11 4.27
## skew kurtosis se
## X1 -0.0454 -0.899 0.0548
#plot
GG_denhist(d, "general_forecast")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
GG_save("figs/general_forecast_dist.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#expertise split
#plot each of them
for (v in forecast_vars) {
#is percent?
is_percent = d[[v]] %>% str_detect("%") %>% any(na.rm = T)
#clean
local_v = d[[v]]
if (is_percent) local_v = local_v %>% str_replace("%", "") %>% as.numeric() %>% divide_by(100)
#plot
if (is_percent) {
gg = GG_denhist(tibble(x = local_v, irt_score_bin = d$irt_score_bin), "x", group = "irt_score_bin", vline = "median") +
scale_x_continuous("Probability",
breaks = seq(0, 1, by = .1),
limits = c(0, 1),
labels = scales::percent) +
ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
} else if (is.Date(local_v)) {
#remove bad dates
local_v[local_v < ymd("2020-01-01")] = NA
local_v[local_v > ymd("2023-01-01")] = NA
#plot
gg = ggplot(tibble(x = local_v, irt_score_bin = d$irt_score_bin), aes(x, fill = factor(irt_score_bin))) +
geom_histogram(binwidth = 2.5, position = "dodge") +
geom_density(aes(y = 2.5 * ..count..), alpha = .2) +
scale_x_date("Date", date_breaks = "1 month") +
theme(axis.text.x = element_text(angle = -20)) +
geom_vline(data = tibble(x = describeBy(local_v %>% as.numeric(), d$irt_score_bin, mat = T)$median,
irt_score_bin = factor(1:3)), mapping = aes(xintercept = x, color = irt_score_bin), linetype = "dashed", size = 1) +
ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90)) +
guides(color = F)
} else {
gg = GG_denhist(tibble(x = local_v, irt_score_bin = d$irt_score_bin), group = "irt_score_bin", "x", vline = "median") +
scale_x_log10("Number", breaks = 10^(0:20)) +
theme(
# axis.title.y = element_blank()
axis.text.y = element_blank(),
# axis.ticks.y = element_blank()
) +
ggtitle(v %>% str_remove_fluff() %>% add_newlines(line_length = 90))
}
print(gg)
GG_save(plot = gg, str_glue("figs/forecast_{which(v == forecast_vars)}_science_bins.png"))
}
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing non-finite values (stat_density).
## Warning: Removed 2 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing non-finite values (stat_density).
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values (geom_bar).
#median and 10% trimmed mean for each forecast question, split by science knowledge (expertise) bin
map_df(forecast_vars, function(v) {
  #is this a percent-formatted question?
  is_percent = d[[v]] %>% str_detect("%") %>% any(na.rm = T)
  #clean: strip "%" and rescale to 0-1 if so
  local_v = d[[v]]
  if (is_percent) local_v = local_v %>% str_replace("%", "") %>% as.numeric() %>% divide_by(100)
  #compute cell statistics manually
  tibble(
    question = v,
    median_low = median(local_v[d$irt_score_bin == 1], na.rm = T),
    median_medium = median(local_v[d$irt_score_bin == 2], na.rm = T),
    median_high = median(local_v[d$irt_score_bin == 3], na.rm = T),
    tmean_low = mean(local_v[d$irt_score_bin == 1], na.rm = T, trim = .1),
    tmean_medium = mean(local_v[d$irt_score_bin == 2], na.rm = T, trim = .1),
    tmean_high = mean(local_v[d$irt_score_bin == 3], na.rm = T, trim = .1)
  ) %>%
    #convert every cell to character so rows from differently-typed questions can be bound together
    map_df(function(x) {
      #already character (the question text): strip the boilerplate fluff
      if (is.character(x)) return(x %>% str_remove_fluff())
      #dates to character
      if (is.Date(x)) return(as.character(x))
      #guard: a bin with no data yields NA
      if (is.na(x)) return(NA_character_)
      #proportions (0-1): show 2 digits
      if (x < 1) return(format_digits(x, digits = 2))
      #otherwise round to whole numbers
      round(x) %>% as.character()
    })
}) ->
forecast_MM_by_sciknow
forecast_MM_by_sciknow
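A quick aside on the trimmed means above: trim = .1 discards the lowest and highest 10% of values before averaging, so a handful of extreme forecasts cannot dominate a cell. A toy illustration with made-up numbers (not study data):
x = c(1:9, 1000)
mean(x)            #104.5, dominated by the single outlier
mean(x, trim = .1) #5.5, after dropping one value from each tail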
#individual variables
#models: a ladder of increasingly adjusted specifications
list(
  ols(general_forecast ~ irt_score, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD + pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax, data = d %>% df_standardize(messages = F)),
  ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD + pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax + from_USA + from_UK + from_France, data = d %>% df_standardize(messages = F))
) %>%
  summarize_models()
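summarize_models() is a helper from the kirkegaard package. For readers without that stack, the incremental-validity logic can be sketched with base R's lm() and anova(); this is a minimal sketch reusing the variable names above, not the study's actual output:
#fit the first two rungs of the ladder on the same complete cases so they are comparable
d_cc = d %>% df_standardize(messages = F) %>%
  select(general_forecast, irt_score, age, sex) %>%
  na.omit()
m1 = lm(general_forecast ~ irt_score, data = d_cc)
m2 = lm(general_forecast ~ irt_score + age + sex, data = d_cc)
#does adding demographics improve on science knowledge alone?
anova(m1, m2)
summary(m2)$r.squared - summary(m1)$r.squared #incremental R^2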
#refit the same model ladder for each forecast outcome individually
map_df(names(forecasts_numeric_ranked), function(v) {
  #standardize a local copy of the data
  d_local = d %>%
    df_standardize(messages = F)
  #swap in the specific outcome, standardized
  d_local$general_forecast = forecasts_numeric_ranked[[v]] %>% standardize()
  #regress
  list(
    ols(general_forecast ~ irt_score, data = d_local),
    ols(general_forecast ~ irt_score + age + sex, data = d_local),
    ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD, data = d_local),
    ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD + pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax, data = d_local),
    ols(general_forecast ~ irt_score + age + sex + education_num + book_index + fox_hedgehog_num + rationalist + HBD + pol_factor_libertarianish + pol_factor_altrightish + pol_factor_leftliberalish + income_tax + from_USA + from_UK + from_France, data = d_local)
  ) %>%
    summarize_models() %>%
    mutate(outcome = v)
}) ->
many_models
#print
many_models
#write data out for public release
data_out = d %>%
  #remove variables without any data
  {
    .[setdiff(names(.), d_vars %>% filter(miss_prop == 1) %>% pull(var_name))] %>%
      #privacy: drop identifying columns
      select(-IP_Address, -User_Agent, -SessionID, -Leave_your_email_address_here_if_we_may_send_you_follow_up_surveys_and_inform_you_of_your_prediction_accuracy_later_Note_This_question_is_NOT_required_and_your_e_mail_will_NOT_be_shared_publicly)
  }
#write
write_rds(test_items_scored, "data/out/scored_items.rds")
write_rds(data_out, "data/out/main_data.rds")
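A usage note, not part of the original pipeline: the saved objects can be restored in a later session with readr's read_rds().
scored_items = read_rds("data/out/scored_items.rds")
main_data = read_rds("data/out/main_data.rds")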
#versions
write_sessioninfo()
## R version 3.6.3 (2020-02-29)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 19.3
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_DE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_DE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] mirt_1.31 osfr_0.2.8 rms_5.1-4 SparseM_1.78
## [5] DT_0.13 lubridate_1.7.4 readxl_1.3.1 kirkegaard_2018.05
## [9] metafor_2.4-0 Matrix_1.2-18 psych_1.9.12.31 magrittr_1.5
## [13] assertthat_0.2.1 weights_1.0.1 mice_3.8.0 gdata_2.18.0
## [17] Hmisc_4.4-0 Formula_1.2-3 survival_3.1-11 lattice_0.20-40
## [21] forcats_0.5.0 stringr_1.4.0 dplyr_0.8.5 purrr_0.3.3
## [25] readr_1.3.1 tidyr_1.0.2 tibble_3.0.0 ggplot2_3.3.0
## [29] tidyverse_1.3.0 pacman_0.5.1
##
## loaded via a namespace (and not attached):
## [1] TH.data_1.0-10 colorspace_1.4-1 ellipsis_0.3.0
## [4] htmlTable_1.13.3 base64enc_0.1-3 fs_1.4.1
## [7] rstudioapi_0.11 httpcode_0.2.0 farver_2.0.3
## [10] Deriv_4.0 MatrixModels_0.4-1 fansi_0.4.1
## [13] mvtnorm_1.1-0 xml2_1.3.1 codetools_0.2-16
## [16] splines_3.6.3 mnormt_1.5-6 knitr_1.28
## [19] jsonlite_1.6.1 broom_0.5.5 cluster_2.1.0
## [22] dbplyr_1.4.2 png_0.1-7 compiler_3.6.3
## [25] httr_1.4.1 backports_1.1.5 cli_2.0.2
## [28] acepack_1.4.1 htmltools_0.4.0 quantreg_5.55
## [31] tools_3.6.3 gtable_0.3.0 glue_1.4.0
## [34] Rcpp_1.0.4.6 cellranger_1.1.0 vctrs_0.2.4
## [37] crul_0.9.0 nlme_3.1-145 crosstalk_1.1.0.1
## [40] multilevel_2.6 xfun_0.12 rvest_0.3.5
## [43] lifecycle_0.2.0 dcurver_0.9.1 gtools_3.8.2
## [46] polspline_1.1.17 MASS_7.3-51.5 zoo_1.8-7
## [49] scales_1.1.0 hms_0.5.3 parallel_3.6.3
## [52] sandwich_2.5-1 RColorBrewer_1.1-2 psychometric_2.2
## [55] yaml_2.2.1 curl_4.3 memoise_1.1.0
## [58] gridExtra_2.3 rpart_4.1-15 latticeExtra_0.6-29
## [61] stringi_1.4.6 permute_0.9-5 checkmate_2.0.0
## [64] rlang_0.4.5 pkgconfig_2.0.3 evaluate_0.14
## [67] labeling_0.3 htmlwidgets_1.5.1 tidyselect_1.0.0
## [70] plyr_1.8.6 R6_2.4.1 generics_0.0.2
## [73] multcomp_1.4-12 DBI_1.1.0 mgcv_1.8-31
## [76] pillar_1.4.3 haven_2.2.0 foreign_0.8-76
## [79] withr_2.1.2 nnet_7.3-13 modelr_0.1.6
## [82] crayon_1.3.4 rmarkdown_2.1 jpeg_0.1-8.1
## [85] grid_3.6.3 data.table_1.12.8 vegan_2.5-6
## [88] reprex_0.3.0 digest_0.6.25 GPArotation_2014.11-1
## [91] munsell_0.5.0 viridisLite_0.3.0
#update OSF
#wrapped in if (F) so this never runs when knitting; execute the body manually
#tutorial: https://rpubs.com/EmilOWK/osfr_demo
if (F) {
  #authenticate and fetch the project node
  osf_auth(read_lines("~/.config/osf_token"))
  osf_proj = osf_retrieve_node("https://osf.io/9ecwv/")
  #files to upload, overwriting any existing copies
  osf_upload(osf_proj,
             path = c("notebook.html", "notebook.Rmd", "figs", "data/out", "data/test_scoring_key.xlsx"),
             conflicts = "overwrite")
}
Unrelated to the study itself: norms for the 20-item version of the science knowledge test.
#pick the 20 best items by factor loading
best20_items_scored = test_items_scored[item_stats %>% arrange(-loading) %>% .[1:20, ] %>% pull(item)]
colnames(best20_items_scored)
## [1] "item_16" "item_10" "item_28" "item_25" "item_21" "item_19" "item_23"
## [8] "item_22" "item_29" "item_26" "item_14" "item_17" "item_9" "item_4"
## [15] "item_12" "item_27" "item_1" "item_13" "item_2" "item_11"
#refit the IRT model on the 20-item subset
best20_items_irt = mirt(best20_items_scored, model = 1, verbose = F)
best20_items_irt_scored = fscores(best20_items_irt, full.scores = T, full.scores.SE = T)
#reliabilities
empirical_rxx(best20_items_irt_scored)
## F1
## 0.721
marginal_rxx(best20_items_irt)
## [1] 0.719
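Both estimates land at about .72. For context (a gloss, not from the original text): empirical_rxx() estimates reliability from the fitted scores themselves, comparing the variance of the EAP estimates to their standard errors, while marginal_rxx() integrates the model's test information over the assumed latent trait distribution, so their close agreement here is reassuring for a 20-item test.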
#raw sum scores and their distribution
d$total_score20 = rowSums(best20_items_scored)
d$total_score20 %>% describe()
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 333 11.9 3.28 12 12 2.97 4 19 15 -0.131 -0.621 0.179
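A mean of 11.9 of 20 items (about 60% correct) with only mild skew suggests the shortened test is neither too easy nor too hard for this sample.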
#distribution to centile and IQ
total_score_cdf20 = ecdf(d$total_score20)
#centiles
total_score_cdf20(0:20) %>% set_names(0:20)
## 0 1 2 3 4 5 6 7 8 9
## 0.00000 0.00000 0.00000 0.00000 0.00901 0.02703 0.05405 0.09009 0.16817 0.24925
## 10 11 12 13 14 15 16 17 18 19
## 0.35736 0.44745 0.53153 0.64565 0.76877 0.86787 0.92492 0.95796 0.99099 1.00000
## 20
## 1.00000
#IQ scale
total_score_cdf20(0:20) %>% qnorm(mean = 100, sd = 15) %>% set_names(0:20)
## 0 1 2 3 4 5 6 7 8 9 10 11 12
## -Inf -Inf -Inf -Inf 64.5 71.1 75.9 79.9 85.6 89.8 94.5 98.0 101.2
## 13 14 15 16 17 18 19 20
## 105.6 111.0 116.7 121.6 125.9 135.5 Inf Inf
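Reading the two tables together: a raw score of 12 sits at the 53rd centile, i.e. qnorm(.532, 100, 15) ≈ 101 IQ. The plain ecdf() assigns the top observed score a cumulative proportion of 1 and scores below the observed minimum a proportion of 0, hence the ±Inf entries at the tails. One common workaround, shown here only as a sketch and not used for the study norms, is a continuity-corrected centile:
#sketch: a continuity correction keeps the tail centiles strictly inside (0, 1)
n = sum(!is.na(d$total_score20))
cdf_cc = function(x) (total_score_cdf20(x) * n + 0.5) / (n + 1)
cdf_cc(0:20) %>% qnorm(mean = 100, sd = 15) %>% set_names(0:20)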
#plots
GG_denhist(d, "total_score20", vline = F) +
scale_x_continuous("Score", limits = c(0, 20)) +
ggtitle("Distribution of 20 item scores", "Data based on pilot study with n=333 persons")
## Scale for 'x' is already present. Adding another scale for 'x', which will
## replace the existing scale.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
GG_save("figs/score_dist20.png")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).