library(readr)
library(tidyverse)

## -- Attaching packages ------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.7
## v tidyr   0.8.2     v stringr 1.3.1
## v ggplot2 3.1.0     v forcats 0.3.0

## -- Conflicts ---------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

library(ggplot2)
library(FactoMineR)

## Warning: package 'FactoMineR' was built under R version 3.5.3

library(lemon)

## Warning: package 'lemon' was built under R version 3.5.3

## 
## Attaching package: 'lemon'

## The following object is masked from 'package:purrr':
## 
##     %||%

library(knitr)

df.ifps = read_csv("~/Uni/Psych252/finalproject/final-project-goodjudgmentproject/data/ifps.csv")

## Parsed with column specification:
## cols(
##   ifp_id = col_character(),
##   q_type = col_integer(),
##   q_text = col_character(),
##   q_desc = col_character(),
##   q_status = col_character(),
##   date_start = col_character(),
##   date_suspend = col_character(),
##   date_to_close = col_character(),
##   date_closed = col_character(),
##   outcome = col_character(),
##   short_title = col_character(),
##   days_open = col_integer(),
##   n_opts = col_integer(),
##   options = col_character()
## )

df.fcasts1 = read_csv("~/Uni/Psych252/finalproject/final-project-goodjudgmentproject/data/survey_fcasts.yr1.csv")

## Parsed with column specification:
## cols(
##   ifp_id = col_character(),
##   ctt = col_character(),
##   cond = col_integer(),
##   training = col_character(),
##   team = col_character(),
##   user_id = col_integer(),
##   forecast_id = col_integer(),
##   fcast_type = col_integer(),
##   answer_option = col_character(),
##   value = col_double(),
##   fcast_date = col_date(format = ""),
##   expertise = col_integer(),
##   q_status = col_character(),
##   viewtime = col_character(),
##   year = col_integer(),
##   timestamp = col_datetime(format = "")
## )

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 45 parsing failures.
## row # A tibble: 5 x 5 col     row col     expected  actual file                                      expected   <int> <chr>   <chr>     <chr>  <chr>                                     actual 1  8115 user_id an integ~ NULL   '~/Uni/Psych252/finalproject/final-proje~ file 2  8116 user_id an integ~ NULL   '~/Uni/Psych252/finalproject/final-proje~ row 3  8121 user_id an integ~ NULL   '~/Uni/Psych252/finalproject/final-proje~ col 4  8122 user_id an integ~ NULL   '~/Uni/Psych252/finalproject/final-proje~ expected 5  8131 user_id an integ~ NULL   '~/Uni/Psych252/finalproject/final-proje~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.

df.fcasts2 <- read_csv("~/Uni/Psych252/finalproject/final-project-goodjudgmentproject/data/survey_fcasts.yr2.csv")

## Parsed with column specification:
## cols(
##   ifp_id = col_character(),
##   ctt = col_character(),
##   cond = col_integer(),
##   training = col_character(),
##   team = col_integer(),
##   user_id = col_integer(),
##   forecast_id = col_integer(),
##   fcast_type = col_integer(),
##   answer_option = col_character(),
##   value = col_double(),
##   fcast_date = col_date(format = ""),
##   expertise = col_integer(),
##   q_status = col_character(),
##   viewtime = col_character(),
##   year = col_integer(),
##   timestamp = col_datetime(format = "")
## )

## Warning: 2 parsing failures.
## row # A tibble: 2 x 5 col      row col      expected       actual file                               expected    <int> <chr>    <chr>          <chr>  <chr>                              actual 1 226542 forecas~ no trailing c~ e+05   '~/Uni/Psych252/finalproject/fina~ file 2 226543 forecas~ no trailing c~ e+05   '~/Uni/Psych252/finalproject/fina~

df.differences <- read_csv("~/Uni/Psych252/finalproject/final-project-goodjudgmentproject/data/all_individual_differences.csv")

## Parsed with column specification:
## cols(
##   .default = col_integer(),
##   newshrs_yr3 = col_double(),
##   newshrs_yr4 = col_double(),
##   raven_adjusted_score = col_double(),
##   raven_1_time_yr3 = col_double(),
##   raven_2_time_yr3 = col_double(),
##   raven_3_time_yr3 = col_double(),
##   raven_4_time_yr3 = col_double(),
##   raven_5_time_yr3 = col_double(),
##   raven_6_time_yr3 = col_double(),
##   raven_7_time_yr3 = col_double(),
##   raven_8_time_yr3 = col_double(),
##   raven_9_time_yr3 = col_double(),
##   raven_10_time_yr3 = col_double(),
##   raven_11_time_yr3 = col_double(),
##   raven_12_time_yr3 = col_double(),
##   raven_1_yr4 = col_character(),
##   raven_2_yr4 = col_character(),
##   raven_3_yr4 = col_character(),
##   raven_4_yr4 = col_character(),
##   raven_5_yr4 = col_character()
##   # ... with 670 more columns
## )

## See spec(...) for full column specifications.

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 89 parsing failures.
## row # A tibble: 5 x 5 col     row col       expected       actual file                               expected   <int> <chr>     <chr>          <chr>  <chr>                              actual 1  1227 crt_17_y~ no trailing c~ ,25    '~/Uni/Psych252/finalproject/fina~ file 2  1227 numeracy~ no trailing c~ ,25    '~/Uni/Psych252/finalproject/fina~ row 3  2792 crt_17_y~ no trailing c~ ,25    '~/Uni/Psych252/finalproject/fina~ col 4  2792 numeracy~ no trailing c~ ,25    '~/Uni/Psych252/finalproject/fina~ expected 5  3078 crt_17_y~ no trailing c~ ,25    '~/Uni/Psych252/finalproject/fina~
## ... ................. ... .......................................................................... ........ .......................................................................... ...... .......................................................................... .... .......................................................................... ... .......................................................................... ... .......................................................................... ........ ..........................................................................
## See problems(...) for more details.

Abstract

Mellers et al. (2015) describe results from a research program that investigates the accuracy of predictions–or so-called “forecasts”–about future events. This report describes a project which re-analyzed their data and had two main aims in doing so. The first is to reproduce three key statistics from Mellers et al. (2015). The second is to conduct some exploratory analysis to reveal new insights from the data. Regarding the first aim, one difficulty was that the project was unable to exactly reproduce the analysis for these statistics. However, the re-analysis was nevertheless successful insofar as it did support the same general conclusions that were derived from each of them: 1) that forecasting accuracy is (close to) normally distributed among participants, 2) that high forecasting accuracy is unlikely to be due to chance and 3) that a range of variables can account for variation in forecasting accuracy. There are various potential explanations of why the re-analysis failed to exactly reproduce the results, including the possibility that the original study used unspecified exclusion criteria and the fact that some key variables in the original study’s dataset were not available for this re-analysis. Regarding the second aim, the exploratory analysis revealed a potential methodological improvement compared to the original analysis procedure. In the original study, the estimates of partcipants’ accuracy (in the form of mean Brier scores) were perhaps unduly affected by variation in whether some participants made predictions at earlier times when the available evidence told a different story, so to speak. There are theoretical grounds to think that mean Brier scores would ideally have been calculated solely on the basis of the final forecasts for a given question. When this methodological tweak is implemented, it paints a more optimistic picture of the data: participants are more accurate than it previously appeared, and particular variables are better predictors than was previously reported.

Introduction

Everyday, we make decisions, but if these decisions are based on inaccurate beliefs, the consequences can be devastating. Some think that the Iraq war is a prime example of this, as support for the disastrous war–at least in some circles–was fueled by the inaccurate belief that Iraq was probably developing weapons of mass destruction. Examples like this showcase that inaccurate beliefs can result in terrible consequences. For reasons like this, we may be interested in the question of how to form accurate beliefs about the world.

A study from Mellers et al. (2015) goes to the heart of this question. They describe the results of a forecasting program initiated in 2011. The program saw thousands of people–including lay-persons, academics and others–make predictions or “forecasts” about geopolitical questions. Such questions could be about the outcome of an election, about the probability of civil unrest in a country or about the result of negotiations about nuclear weapons among some countries–to give but a few examples. Their study then uses statistics to extract some insights from the program, including the following:

Forecasting accuracy was approximately normally distributed with a mean forecasting ability above chance guessing but below perfect accuracy (Mellers et al. (2015), p. 6, figure 1)
Highly accurate forecasters were not accurate solely due to chance, and they maintained their lead over other forecasters throughout the first two years of the program (Mellers et al. (2015), p. 7, figure 3)
A range of dispositional, situational and behavioral variables predict forecasting accuracy (Mellers et al. (2015), p. 11, table 5)

While their study concerns predictions about specifically geopolitical topics, there are good reasons to think that these key insights are generalizable to other contexts too. The first of these is that what made the forecasters good are things that seem to conduce to accurate beliefs in general. For example, the avoidance of cognitive biases, the acquisition of subject specific-knowledge and the exchange of insights are good in most–if not all–contexts. Second, the prediction of geopolitical events requires reasoning about features of human life that are present in many non-geopolitical contexts–features such as what motivates individuals, what their institutional and societal constraints are, and how they are likely to interact with the interests of others in their environment. Such features are pervasive in society, relating for instance to the behavior of employees in organizations, to partners in relationships and to other aspects of human life.

Method and Results

Original dataset and procedure

In the research program, participants were recruited via various means, including word of mouth, approaching professional societies and the like. Participants were required to have a Bachelor’s degree or higher.

The first year of the program started with 1,593 participants.

However, the dataset for analysis in the original study includes only the 793 of these participants who made at least 30 predictions and who participated in the first two years of the program. The rationale was that these exclusion criteria would enable te researchers to study only those forecasters for whom they could “get stable estimates of forecasting ability” (Mellers et al. 2015, p. 6).

Forecasting accuracy was measured with Brier scores. Brier scores are the sums of squared deviations from probability forecasts on the one hand and the actual outcome on the other. For example, if a forecaster assigns a probability of 75% to an event which turns out to be the actual outcome, then the Brier score for that outcome is specified by the below equation:

\((.75 - 1)^2 + (.25 - 0)^2 = .125\)

Data for the re-analysis project

Data for this study were obtained from the Dataverse: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/BPCDH5.

Either 1) there are some differences between this dataset and the dataset used in the original study or alternatively, 2) additional exclusion criteria were applied to the original dataset beyond what is specified in the original paper.

This is because the online data appear to include more participants. In particular, the dataverse dataset has 1,973 distinct user IDs associated with the forecasts made in the first year–significantly more than the figure of 1,593 that is reported in the original study.

Additionally, for this re-analysis project, more participants (799 in total) satisfied the inclusion criteria of having made at least 30 predictions and having participated in both years. Other inclusion and exclusion criteria were tested to try obtain exactly 743 participants, but none of the sensible candidates yielded this result.

Consequently, the results are likely to be different from what was reported in the original study.

Other discrepancies between the data for the original study and for the re-analysis will be described below.

We will now outline the re-analysis of the key statistics, starting with the distribution of forecasting accuracy.

Statistic # 1: Distribution of forecasting accuracy

## Joining the forecasting datasets to include forecasts for both years 1 and 2

df.fcasts1mod = df.fcasts1 %>% 
  mutate(team = as.numeric(team))

df.fcasts = union(df.fcasts1mod, df.fcasts2)

## Creating a dataset that includes Brier scores for each forecast

# Create a dataset containing only the outcomes for each question

df.outcomes = df.ifps %>% select(outcome, ifp_id)

# Add outcomes for each question to the dataframe containing for forecasts

df.merged = left_join(df.fcasts, df.outcomes)

## Joining, by = "ifp_id"

# Calculating Brier scores for probabilities assigned to the true outcomes (so that the Brier score is only calculated relative to the probability for the true outcome)

df.scores = df.merged %>% 
  mutate(true_outcome = ifelse(answer_option == outcome, 1, 0)) %>% 
  filter(true_outcome == 1) %>% 
  mutate(brier = ((value - 1)^2) + (((1-value) - 0)^2))

## Second step: Filtering out participants who made less than thirty predictions in both years

# Below, I test different exclusion criteria which have been commented out because they do not yield better results

# 30 predictions in year 1 and 30 in year 2
# df.users = df.scores %>% 
#   group_by(user_id, year) %>% 
#   mutate(no_user_fcasts = n()) %>% 
#   filter(no_user_fcasts >= 30) %>% 
#   mutate(year_1 = ifelse(year == 1, 1, 0),
#          year_2 = ifelse(year == 2, 1, 0)) %>%
#   ungroup() %>% 
#   group_by(user_id) %>% 
#   mutate(in_year_1 = ifelse(mean(year_1)>0, 1,0),
#          in_year_2 = ifelse(mean(year_2)>0, 1,0),
#          years12 = ifelse(in_year_1*in_year_2 >0, 1, 0)) %>% 
#   filter(years12 == 1)

# Predictions on 30 questions in year 1 and 30 in year 2
# df.users = df.scores %>%
#   group_by(user_id) %>%
#   unique(ifp_id)
#   filter(n() >= 30) %>%
#   mutate(year_1 = ifelse(year == 1, 1, 0),
#          year_2 = ifelse(year == 2, 1, 0)) %>%
#   ungroup() %>%
#   group_by(user_id) %>%
#   mutate(in_year_1 = ifelse(mean(year_1)>0, 1,0),
#          in_year_2 = ifelse(mean(year_2)>0, 1,0),
#          years12 = ifelse(in_year_1*in_year_2 >0, 1, 0)) %>%
#   filter(years12 == 1)

# Last predictions. There's a problem here. Accuracy goes up drastically if only the last predictions count.
# df.scores = df.scores %>%
#   group_by(ifp_id, user_id) %>%
#   arrange(desc(fcast_date)) %>%
#   filter(row_number()==1) %>%
#   ungroup()

# Filtering out participants who have made less than 30 predictions across both year 1 and 2

df.users = df.scores %>%
  group_by(user_id) %>%
  mutate(no_user_fcasts = n()) %>%
  filter(no_user_fcasts >= 30) %>%
  mutate(year_1 = ifelse(year == 1, 1, 0),
         year_2 = ifelse(year == 2, 1, 0)) %>%
  ungroup() %>%
  group_by(user_id) %>%
  mutate(in_year_1 = ifelse(mean(year_1)>0, 1,0),
         in_year_2 = ifelse(mean(year_2)>0, 1,0),
         years12 = ifelse(in_year_1*in_year_2 >0, 1, 0)) %>%
  filter(years12 == 1)

# Calculating Brier scores per participant

df.briers = df.users %>% 
  group_by(user_id) %>% 
  summarise(mean_brier = mean(brier))

# Binning mean Brier scores

df.briers$binned_brier = car::recode(df.briers$mean_brier, "0:.14999=1; 0.15:.19999=2; 0.2:.24999=3; 0.25:.29999=4; 0.3:.349999=5; 0.35:.39999=6; 0.4:0.44999=7; 0.45:0.49999=7; 0.5:0.54999=8; 0.55:0.5999=9; 0.6:0.64999=10; 0.65:2=11") 

# Visualiing Brier scores

df.plot = df.briers %>%
  select(binned_brier) %>% 
  table() %>% 
  as.data.frame() %>% 
  mutate(relfreq = (Freq/sum(Freq)),
         thing = 1:11)

firstplot = ggplot(df.plot, aes(x = factor(thing), y = relfreq)) +
  geom_bar(stat="identity", aes(y = relfreq), width = 0.4, fill = "red") +
  labs(title = "Figure 2: Re-Analysis of Brier Score Accuracy", x = "Brier score category", y = "Proportion of participants") + ylim(0, .25) +
   scale_x_discrete(breaks=c(1:11), labels=c("0.15", "0.2", "0.25", "0.3", "0.35", "0.4", "0.45", "0.5", "0.55", "0.6", "0.65")) + theme_classic()

# Calculating mean of the mean brier scores for the participants

mean = mean(df.briers$mean_brier)

The original graph:

The original paper featured the following bar chart depicting a close to normal distribution of the mean Brier scores among participants (Mellers et al. (2015), p. 6, figure 1).

The re-analysis graph:

This statistical graph was partially reproduced in the re-analysis provided in figure 2 below.

The differences between the two graphs can be seen in figure 3 below. The reproduced graph is transparent in red and has been imposed over the original graph which is grey in the background.

Figure 3

Overall, the results support the main insight of the original graph: average forecasting accuracy is normally distributed among the participants, with a fairly similar means of .30 and .35 in the original analysis and the re-analysis respectively. However, the means and the shapes of the graphs are not exactly the same, and this may be due to the fact that the data meeting the inclusion criteria for re-analysis differs from the original dataset for reasons discussed earlier.

Statistic # 2: Chance and forecasting accuracy

# Identify first 25 questions

df.ifpsdated = df.ifps %>% 
  mutate(date_closed = as.Date(date_closed, format = "%m/%d/%y")) %>% 
  filter(!is.na(date_closed)) %>% 
  arrange(date_closed) 

df.first25 = df.ifpsdated %>% 
  head(25)

# Remove all other questions from the dataset for the forecasts dataset (merged with the variable for the actual outcomes)

df.merged25 = df.merged %>% 
  subset(ifp_id %in% df.first25$ifp_id) %>% 
  subset(user_id %in% df.briers$user_id)


df.scores25 = df.merged25 %>% 
  mutate(true_outcome = ifelse(answer_option == outcome, 1, 0)) %>% 
  filter(true_outcome == 1) %>% 
  mutate(brier = ((value - 1)^2) + (((1-value) - 0)^2)) %>% 
  group_by(ifp_id) %>% 
  mutate(zbrier = scale(brier)) %>% 
  ungroup()


# Identify the top 100 and bottom 100 forecasters for the first 25 questions to close

df.zbriers25 = df.scores25 %>% 
  group_by(user_id) %>% 
  summarise(mean_zbrier = mean(zbrier)) 
  
top100 = df.zbriers25 %>% 
  arrange(mean_zbrier)%>% 
  head(100)

bottom100 = df.zbriers25 %>% 
  arrange(desc(mean_zbrier)) %>% 
  head(100)


# Create two functions that take as an input the order of a question (e.g. 'the 32nd question to close' ) and then returns the mean standardized Brier score of the top 100 or the bottom 100 users -- the code below could be more elegant and simplified, but it does the job

top = function(x){ #A function with the order of a question as an input
  
question = df.ifpsdated$ifp_id[x] #Assign the question to a variable
  
df.mergedx = df.merged %>% #Subset the forecasts for the users that satisfied the inclusion criteria and for the question with the order specified in the function
  filter(ifp_id == question) %>% 
  subset(user_id %in% df.briers$user_id)

df.scoresx = df.mergedx %>%  #Assign brier scores to the forecasts
  mutate(true_outcome = ifelse(answer_option == outcome, 1, 0)) %>% 
  filter(true_outcome == 1) %>% 
  mutate(brier = ((value - 1)^2) + (((1-value) - 0)^2))

df.briersx = df.scoresx %>% #Standardize the brier scores for each user
  group_by(user_id) %>% 
  summarise(mean_brier = mean(brier)) %>% 
  mutate(z.score = scale(mean_brier))

df.briersxselect = df.briersx %>% #Filter in only the top 100 users Brier scores for that question
  subset(user_id %in% top100$user_id)

return(mean(df.briersxselect$z.score)) #Return the mean of these scores
}

bottom = function(x){ #Same function as above, but for the worst 100 users
  
question = df.ifpsdated$ifp_id[x]
  
df.mergedx = df.merged %>% 
  filter(ifp_id == question) %>% 
  subset(user_id %in% df.briers$user_id)

df.scoresx = df.mergedx %>% 
  mutate(true_outcome = ifelse(answer_option == outcome, 1, 0)) %>% 
  filter(true_outcome == 1) %>% 
  mutate(brier = ((value - 1)^2) + (((1-value) - 0)^2))

df.briersx = df.scoresx %>% 
  group_by(user_id) %>% 
  summarise(mean_brier = mean(brier)) %>% 
  mutate(z.score = scale(mean_brier))

df.briersxselect = df.briersx %>% 
  subset(user_id %in% bottom100$user_id)

return(mean(df.briersxselect$z.score))
}


df.dudettes = data_frame(
  number = c(25:498)) 
      

for(i in 25:498){
  df.dudettes$topz[i-24] <- top(i)
  df.dudettes$bottomz[i-24] <- bottom(i)}

## Warning: Unknown or uninitialised column: 'topz'.

## Warning: Unknown or uninitialised column: 'bottomz'.

# It looks like some of the questions in df.ifps aren't in any of the fcast data

# Remove NA

df.zs = df.dudettes %>% 
  filter(!is.na(topz))

df.zs$x= c(1:188) 

df.zs = df.zs %>% filter(x < 176)

secondplot = ggplot(df.zs, aes(x = x, y = topz))+
  geom_line(aes(x = x, y = topz), color = "red4", alpha = 0.7, size =1.2)+
  geom_line(aes(x = x, y = bottomz), color ="pink2", alpha = 0.7, , size =1.2)+
  theme_classic()+
  labs(title = "Figure 5: Reproduced results", x = "Question closing order: 25th to 199th", y = "Average Standardized Brier Score") + 
   scale_y_continuous(breaks=c(-1, -.8, -.6, -.4, -.2, 0, .2, .4, .6, .8, 1), labels=c("-1", "-.8", "-.6", "-.4", "-.2", "0", ".2", ".4", ".6", ".8", "1"), limits=c(-1,1), expand = c(0,0))+
  scale_x_continuous(expand = c(0,0)) +
  geom_hline(yintercept=0, linetype="dashed", color = "black")

The original statistic:

The authors of the original study aimed to answer the question of whether “accuracy [was] largely attributable to luck” (p. 6). One of the ways they did this was to track the accuracy of participants over time. More specifically, they standardized the Brier scores for each question relative to the other forecasts for that same question. Then, they identified the forecasters who were in the top 100 or the bottom 100 after the 25th question to close. They further state that the groups were constructed so as to consist of “forecasters [who] had attempted an average of 15 questions” (Meller et al., 2015, p. 6).

They then tracked the average accuracy of participants in these groups over the first 199 questions to close.

Their results are depicted in figure 4 below (Mellers et al. (2015), p. 7, figure 3 in the original).

Figure 4

If forecasting accuracy was largely due to chance, one would not have expected the highly accurate forecasters to maintain their lead. Consequently, the above graph can be regarded as evidence that accuracy is not largely due to chance, since it shows that the accurate forecasters did maintain their lead.

The re-analysis statistic:

Figure 5 below attempts to reproduce this graph, utilizing the same criteria and procedure. However, their paper did not specify precisely how they obtained participants who had attempted an average of 15 questions. Consequently, in the re-analysis, I just employed their explicit inclusion criteria: participation in first two years of the program and making at least 30 predictions.

The results showed that, for the most part, the highly accurate forecasters did maintain their lead. This then supports the general conclusion that forecasting accuracy is not largely due to chance.

However, the graphs are not identical, as can be seen from the juxtaposition of the graphs in figure 6 below. This may again be because of implicit exclusion criteria, including criteria about having attempted 15 questions on average.

Figure 6

Statistic #3: Predictors of forecasting accuracy

# Exclude irrelevant case from the data

df.participant = df.differences %>% 
  subset(user_id %in% df.briers$user_id) %>% 
  left_join(df.briers)

## Joining, by = "user_id"

# Make dataset with dispositional variables

df.dis = df.participant[, c("raven_score_yr123", "crt_score_yr12", "pk_score_yr1", "pk_score_yr2", "numeracy_score_yr12", "aomt_score_yr123")]

# Doing principal components analysis. The analysis cannot run with missing values, so I am replacing them with the mean (in the absence of any other strategy for dealing with them that is specified in the paper).

df.dis = df.dis %>% 
  mutate(
    raven_score_yr123 = 
      ifelse(is.na(raven_score_yr123), 
             mean(raven_score_yr123, na.rm=TRUE), 
             raven_score_yr123),
crt_score_yr12 = 
      ifelse(is.na(crt_score_yr12), 
             mean(crt_score_yr12, na.rm=TRUE), 
             crt_score_yr12),
pk_score_yr1 = 
      ifelse(is.na(pk_score_yr1), 
             mean(pk_score_yr1, na.rm=TRUE), 
             pk_score_yr1),
pk_score_yr2 = 
      ifelse(is.na(pk_score_yr2), 
             mean(pk_score_yr2, na.rm=TRUE), 
             pk_score_yr2),
numeracy_score_yr12 = 
      ifelse(is.na(numeracy_score_yr12), 
             mean(numeracy_score_yr12, na.rm=TRUE), 
             numeracy_score_yr12),
aomt_score_yr123 = 
      ifelse(is.na(aomt_score_yr123), 
             mean(aomt_score_yr123, na.rm=TRUE), 
             aomt_score_yr123)
)

# Alternative approach to dealing with missing values: deleting rows with missing values. This didn't yield better results.
# df.dis = df.dis[complete.cases(df.dis), ]


# Make PCA scores

pca = prcomp(df.dis, scale. = TRUE)

pca.data = as.data.frame(cbind(pca$x))

df.briers = df.briers %>% 
  mutate(n = 1:n())

pca.data = pca.data %>% 
  mutate(n = 1:n())

pca.data = left_join(df.briers, pca.data)

## Joining, by = "n"

# Note that I checked to make sure that the right scores were assigned to the right participants, but I haven't included the code for that here.

# Add training and teaming variables to the data

df.teams = df.fcasts %>% 
  mutate(team_dummy = ifelse(is.na(team), 0, 1),
         trainingnum = ifelse(training == "a", 0, 1)) %>% 
  group_by(user_id) %>% 
  select(user_id, trainingnum, team_dummy) %>% 
  summarise(training = round(mean(trainingnum)),
            team = round(mean(team_dummy))) %>% 
  subset(user_id %in% df.briers$user_id)

# The values of training and teaming can differ for the same participant: for example, they may have a value suggesting that they are in one team at one moment, but then they will have a different value at another. The result is that it is not clear which value of training and teaming a user should have for the sake of the regression. No strategy for dealing with this was reported in the paper, and I could not think of a strategy with a clear advantage over another. Hence, I just took the mean of the these variables and then rounded them as a way of assigning one value to each participant. 


#  For the traning variable, "a" looks like the no training condition, with "b" being the scenario training condition and  "c" being the probability training condition. It wasn't clear how they coded these categories for the "training" variable in the original article.


# Add a variable specifying a users average forecasts per question

df.fperq = df.fcasts %>% 
  group_by(user_id, ifp_id) %>% 
  mutate(FperQ = n()) %>% 
  summarise(FperQ = mean(FperQ)) %>% 
  group_by(user_id) %>% 
  summarise(mean_FperQ = mean(FperQ)) %>% 
  subset(user_id %in% df.briers$user_id)


# Make final dataset

df.prefinal = left_join(pca.data, df.teams, by = "user_id")

df.final = left_join(df.prefinal, df.fperq, by = "user_id") %>% 
  select(-c(n, binned_brier))


# Make models

lm.dis = lm(mean_brier ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, df.final)

lm.sit = lm(mean_brier ~ training + team, df.final)

lm.beh = lm(mean_brier ~ mean_FperQ, df.final)

lm.all = lm(mean_brier ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + training + team + mean_FperQ, df.final)

multiple.r = sqrt(summary(lm.all)$r.squared)

multiple.r

## [1] 0.4150737

a = c("Dispositional", "Situational", "Behavioral", "All 3 (Dis + Sit + Beh)")
b = c(.31, .34, .54, .64)
c = c(.26, .31, .23, .41)

table = data.frame(a, b, c)

table =table %>% mutate(Difference = c-b)

colnames(table) = c("Type of variables","Original r", "Re-analysis r", "Difference")

kable1 = kable(table, caption = "Multiple r comparisons")

The third statistic aims to address the question of what predicts forecasting accuracy.

The original study analyses three categories of predictor variables.

The first category are so-called “dispositional variables”. While they do not explicitly define what a “dispositional” variable is, they do include five variables under this category in their analysis. These each relate to performance on five tests:

Cognitive reflection test: test of cognitive reflection aim to measure cognitive control. Mellers et al. (2015) state that individuals with cognitive control possess “the ability to override seemingly obvious but incorrect responses and engage in more prolonged and deeper thought” (Meller et al., 2015, p. 2). An example of a question from this test is the following: “All flowers have petals. Roses have petals. If these two statements are true can we conclude that roses are flowers?”
Extended cognitive reflection test: this is a longer test that also measures cognitive control.
Numeracy test: this measure mathematical reasoning capabilities.
Active open-minded thinking test: This test aims to measure the extent to which one is an active open-minded thinker. Such thinkers have “the tendency to evaluate arguments and evidence without undue bias from one’s own prior beliefs–and with recognition of the fallibility of one’s judgment” (Mellers et al., 2015, p. 3).
Political knowledge test: this straightforwardly aims to measure one’s knowledge about political topics.

The second category of variables are “situational variables”. In particular, such variables consist of 1) whether a participant was placed in a forecasting team when making forecasts and 2) whether a participant received forecasting training.

The third and final category of variables are behavioral variables. This consists of 1) the average number of forecasts which a participant would make per question and 2) their “deliberation time”–that is, the average time they spent viewing a question before making a prediction.

Some of these variables are not apparent in the dataset used for re-analysis. In particular, there appears to be no data indicating the time that was spent viewing a question, and it is also not clear as to which variables qualify as the extended cognitive reflection test, if any.

The original paper states that “principle factor scores” were used for the results of the tests concerning the dispositional variables (Mellers et al., 2015, p.11). It is unclear what they meant by this, but it probably is a typo and should instead read as principal component scores. Such scores are a common way of creating new variables which condense the variation from a dataset with many variables where the original variables may correlated with each other.

They then report the multiple correlation “r” statistic for various models with different sets of predictors. Not all of these are reported here, but some salient results are reported below. The multiple correlation from the original study is displayed below, alongside the multiple correlation from the re-analysis and the difference between the two statistics. Note that each of these results are statistically significantly (p<0.01):

Multiple r comparisons
Type of variables	Original r	Re-analysis r	Difference
Dispositional	0.31	0.26	-0.05
Situational	0.34	0.31	-0.03
Behavioral	0.54	0.23	-0.31
All 3 (Dis + Sit + Beh)	0.64	0.41	-0.23

The results again differ from the original, but they again also support the same general conclusion: that a range of specific variables predict forecasting accuracy.

There are several reasons why the results might differ in this case. Two have already been alluded to: 1) the use of implicit exclusion criteria in the original study and 2) variables missing from the re-analysis dataset, specifically deliberation time and the extended cognitive reflection test. Such missing variables would in particular explain why the behavioral variable category has the biggest difference between the original analysis and the re-analysis: the important variable of deliberation time is missing in the re-analysis. Another reason is that it is not clear how the principal components analysis was carried out. There are a range of variables which could have been included or excluded from the analysis: for example, the percentage of correct scores, the overall correct scores, or answers on singular questions. It is not clear how these variables were (or were not) used to obtain the principal component scores.

Exploratory analysis

# Filtering the dataset to only include people meet the criteria and the final forecasts made for a question
df.finalpreds = df.scores %>%
  group_by(user_id) %>%
  mutate(no_user_fcasts = n()) %>%
  filter(no_user_fcasts >= 30) %>%
  mutate(year_1 = ifelse(year == 1, 1, 0),
         year_2 = ifelse(year == 2, 1, 0),
         in_year_1 = ifelse(mean(year_1)>0, 1,0),
         in_year_2 = ifelse(mean(year_2)>0, 1,0),
         years12 = ifelse(in_year_1*in_year_2 >0, 1, 0)) %>% 
  filter(years12 == 1) %>%  
  ungroup() %>% 
  group_by(ifp_id, user_id) %>%
  arrange(desc(fcast_date)) %>%
  filter(row_number()==1) %>%
  ungroup()


# Final step: Calculating Brier scores per participant

df.briers = df.finalpreds %>% 
  group_by(user_id) %>% 
  summarise(mean_brier = mean(brier))

# Mean of the average Brier scores new dataset

newmean = mean(df.briers$mean_brier)

# Binning mean Brier scores

df.briers$binned_brier = car::recode(df.briers$mean_brier, "0:.14999=1; 0.15:.19999=2; 0.2:.24999=3; 0.25:.29999=4; 0.3:.349999=5; 0.35:.39999=6; 0.4:0.44999=7; 0.45:0.49999=7; 0.5:0.54999=8; 0.55:0.5999=9; 0.6:0.64999=10; 0.65:2=11") 

# Visualiing Brier scores

df.plot = df.briers %>%
  select(binned_brier) %>% 
  table() %>% 
  as.data.frame() %>% 
  mutate(relfreq = (Freq/sum(Freq)),
         thing = 1:11)

plot3 = ggplot(df.plot, aes(x = factor(thing), y = relfreq)) +
  geom_bar(stat="identity", aes(y = relfreq), width = 0.4, fill = "red") +
  labs(title = "Figure 8: Disribution of adjusted Brier scores", x = "Brier score", y = "Proportion of participants") + ylim(0, .25) +
   scale_x_discrete(breaks=c(1:11), labels=c("0.15", "0.2", "0.25", "0.3", "0.35", "0.4", "0.45", "0.5", "0.55", "0.6", "0.65")) + theme_classic()


# Re analysing predictor variables with adjusted mean Brier scores

pca.data = left_join(df.briers, pca.data)

## Joining, by = c("user_id", "mean_brier", "binned_brier")

# Make final dataset

df.prefinal = left_join(pca.data, df.teams, by = "user_id")

df.final = left_join(df.prefinal, df.fperq, by = "user_id") %>% 
  select(-c(n, binned_brier))


# Make models

lm.dis = lm(mean_brier ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6, df.final)

lm.sit = lm(mean_brier ~ training + team, df.final)

lm.beh = lm(mean_brier ~ mean_FperQ, df.final)

lm.all = lm(mean_brier ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + training + team + mean_FperQ, df.final)

multiple.r = sqrt(summary(lm.all)$r.squared)

multiple.r

## [1] 0.8726703

# Make table to display results

a = c("Dispositional", "Situational", "Behavioral", "All 3 (Dis + Sit + Beh)")
b = c(.31, .34, .54, .64)
c = c(.26, .31, .23, .41)
d = c(.46, .38, .53, .87)


table = data.frame(a, b, c, d)

table =table %>% mutate(Difference = d-b)

colnames(table) = c("Type of variables","Original r", "Re-analysis r", "Adjusted re-analysis r", "Difference between original and adjusted analysis")

kable2 = kable(table, caption = "Multiple r comparisons")

This re-analysis project also undertook some exploratory analysis for the sake of uncovering other potential insights in the data. One potential insight is that the forecasting accuracy of participants could be measured differently, taking into account only their final predictions on a question.

To obtain the graph of Brier scores for statistic one (see figure 1 above), it is likely that the accuracy of all of a participant’s forecasts was averaged, including forecasts made on a question at an earlier time when the available evidence about the outcome for that question may have been different. With such forecasts, the individual may have changed their estimate later as new evidence accumulated. The original paper does not explicitly state that it followed this procedure, but it is likely that it did so for a few reasons. For one, Mellers et al. (2015) state that “the average raw Brier score of our participants was 0.30”, but they do not qualify this statement by specifying that it is the average of the Brier score for only the final predictions made on a question. For another, re-analysing the data with this procedure gets results that are the closest to what is reported in the original study, with the distributions shaped similarly in both the original analysis and the re-analysis. As we shall see, re-analysing the data with only the final forecasts yields a different result.

However, averaging all forecasts may paint a somewhat noisey picture of an individual’s forecasting accuracy and merit. To see how this is so, consider this (somewhat trivial) example. Suppose Alice and Brendon have to reason about whether a particular politician, call her Caroline, has survived a plane crash. On Monday, the only available evidence is that 99% of the passengers on board the plane died in the crash. Alison considers the question on Monday and makes a probability assignment of 99% that Caroline is dead, and this is perfectly rational given the then available evidence. Suppose also that neither Alice nor anyone else can gather more evidence at this point. Then, on the following Wednesday, news breaks that one of the survivors claims to be named “Caroline”, but it is not yet clear whether or not this “Caroline” is the politician. Alice updates on this evidence and assigns a 50% probability to Caroline being alive. Up until Wednesday, Brendon was busy with other things, but now Brendon can consider the question for the first time. He then assigns a probability of 50% too. It then turns out that Caroline the politician did survive, and she in fact was the “Caroline” alluded to on Wednesday. Now, Alice would have a worse mean Brier score than Brendon using the original analysis procedure; the original procedure takes into account Alice’s earlier probability assignment on Monday, and this assignment was further from the truth (although we can suppose this was completely justifiable given the then available evidence). Yet Alice would not necessarily be a worse forecaster: if Brendon was able to consider the question earlier, he would have assigned the same probability on Monday as Alice did given the available evidence. What differs between Alice and Brendon is not their forecasting merit, but simply the times at which they made their forecasts and the evidence that was available at those times. Brendon just happened to be unable to consider the question earlier when the evidence was less informative. For this reason, the orginal procedure does not evaluate participants on a level playing field, considering their forecasts at similar times when similar evidence was potentially accessible to all of them. This then neglects the fact that some forecasters will make rational but less accurate probability assignments at times when the evidence is less informative. Ultimately, then, this reflects the idea that even the best available evidence can sometimes be misleading or unhelpful in other ways, and sometimes even the best of truth-seekers can do nothing about this given human limitations.

One solution to this problem is to only consider a participant’s final forecast for a question, and to average these forecasts to obtain their average forecasting accuracy. The final forecasts were made at times when the participants are more likely to have had the same available evidence.

However, when this is done, it generates a very different distribution of Brier scores:

plot3

We can justapose this graph over the one originally reported by Mellers et al. (2015):

Figure 8

As we can see, the re-analysis paints quite a different picture. For one, the distribution is now closer to a uniform distribution than it was before. For another, it is more optimistic about people’s forecasting abilities: it shifts the distribution more to the left, showing that participants were considerably more accurate on average when only their final forecasts were taken into account.

The picture is even more dramatically different when we re-analyse the multiple correlation data:

Multiple r comparisons
Type of variables	Original r	Re-analysis r	Adjusted re-analysis r	Difference between original and adjusted analysis
Dispositional	0.31	0.26	0.46	0.15
Situational	0.34	0.31	0.38	0.04
Behavioral	0.54	0.23	0.53	-0.01
All 3 (Dis + Sit + Beh)	0.64	0.41	0.87	0.23

Here, we can see that the adjusted procedure enables the predictor variables to account for considerably more of the data. Unlike with the previous table where all of the effect sizes in the re-analysis were weaker, the adjusted re-analysis effect sizes in this table are stronger than the original effects. The one exception is the effect for behavioral variables, but this is unsurprising given that one of the important variables is missing from the analysis–namely, deliberation time. The effect size would have in all probability been greater had this variable been included.

One potential explanation for why the effect sizes are stronger is that a significant amount of noise may be reduced in the analysis. On the analysis procedure in the original paper, variation in mean Brier scores could be partly attributed to whether some participants also made additional forecasts earlier than others, since these could lower the scores if they made forecasts at an earlier time when evidence looked different and was less informative about–or faithful to–what the actual outcome was. With the adjusted procedure, this source of noise is reduced, and forecasting abilities among participants are compared on a more level playing field–that is, at times when the evidence they had (or could have had) is more likely to be similar among the participants. Such a comparison may then more clearly reflect how participants differ with respect to their forecasting abilities and how they collect or analyze evidence in order to make forecasts.

Consequently, it may be advantageous to re-analyse the these data with only the final predictions.

Discussion

This re-analysis project had two aims: to reproduce three key statistics from the Mellers et al. (2015) and to conduct some exploratory analysis.

Regarding the first aim, the re-analysis was unable to exactly reproduce the statistics. This may be because of several reasons. One is that implicit inclusion criteria may have been used in the original study. Another is that some key variables were not apparent in the dataset used for re-analysis.

Despite this, the statistics were reproduced with sufficiently similarity to the original statistics so that the same conclusions are warranted:

Forecasting accuracy is approximately normally distributed with a mean forecasting ability above chance guessing but below perfect accuracy (Mellers et al. (2015), p. 6, figure 1)
Highly accurate forecasters were not accurate solely due to chance, and they maintained their lead over other forecasters throughout the first two years of the program (Mellers et al. (2015), p. 7, figure 3)
A range of dispositional, situational and behavioral variables predict forecasting accuracy (Mellers et al. (2015), p. 11, table 5)

Regarding the second aim of this project, the exploratory analysis revealed a potential way of improving the analysis of the data compared to the original procedure. In particular, when participants mean Brier scores are obtained using only their final predictions on a question, the results may 1) reflect their forecasting abilities both more faithfully and optimistically and 2) reveal stronger relationships between predictor variables and forecasting accuracy.

So while this project did not reproduce the letter of the original results, so to speak, it did reproduce the spirit of the original results. But more than that: it showed how a methdological tweak may reinforce previous conclusions, strengthening the relationships which the original results aimed to investigate.

Reference:

Mellers, Barbara, Eric Stone, Pavel Atanasov, Nick Rohrbaugh, S. Emlen Metz, Lyle Ungar, Michael M. Bishop, Michael Horowitz, Ed Merkle, and Philip Tetlock. “The psychology of intelligence analysis: Drivers of prediction accuracy in world politics.” Journal of experimental psychology: applied 21, no. 1 (2015): 1-14.

A Re-Analysis of the Good Judgment Project Data: Successes, Difficulties and Potential Methodological Improvements

John Wilcox, Stanford University

March 22, 2019