Replication in the Reproducibility Project: Psychology and citations

# based on CHJ Hartgerink's script
# NOTE(review): presumably this makes read.csv() below return text columns as
# factors on older R versions; recent R releases deprecate setting this option
# to TRUE -- TODO confirm against the R version in use.
options(stringsAsFactors = TRUE)

library(httr)
library(dplyr)
library(ggplot2)
library(ggbeeswarm)
# Read in Tilburg data
# Download the RPP data file from the OSF to rpp_data.csv.
info <- GET(
    "https://osf.io/fgjvw/?action=download",
    write_disk("rpp_data.csv", overwrite = TRUE)
)
# Abort on an HTTP error so that an error response written to rpp_data.csv
# is not parsed as if it were the data set.
stop_for_status(info)
MASTER <- read.csv("rpp_data.csv")[1:167, ] # keep only the first 167 rows
colnames(MASTER)[1] <- "ID" # Change first column name to ID to be able to load .csv file

Get the DOIs, which are missing from the RPP data, by searching CrossRef on title, authors, and publication date (I didn’t check that the matches were correct).
Then get up-to-date citation counts from CrossRef.

# Attach CrossRef DOIs and current citation counts to MASTER.
# If osfdata_with_dois.rdata exists it is loaded instead of querying CrossRef
# (presumably it contains a MASTER that already has these columns -- confirm).
# NOTE(review): nothing visible here writes osfdata_with_dois.rdata, so the
# cache has to be created elsewhere -- confirm.
if (file.exists("osfdata_with_dois.rdata")) {
    load("osfdata_with_dois.rdata")
} else {
    library(rcrossref)

    # Look up one DOI per study: best CrossRef match on title and authors,
    # restricted to publication dates 2007-2009.
    MASTER$DOI <- NA_character_
    for (i in seq_len(nrow(MASTER))) {
        tryCatch({
            # Convert to character first: with stringsAsFactors = TRUE these
            # columns may be factors, and c() on factors can yield integer
            # level codes instead of the title/author text.
            MASTER$DOI[i] <- rcrossref::cr_works(
                flq = c(
                    query.title = as.character(MASTER$Study.Title..O.[i]),
                    query.author = as.character(MASTER$Authors..O.[i])
                ),
                filter = c(from_pub_date = 2007, until_pub_date = 2009),
                sort = "relevance",
                limit = 1
            )$data$DOI
        }, error = function(e) warning(e)) # a failed lookup leaves the DOI as NA
    }

    # Fetch the citation count for each DOI; failures leave the count as NA.
    MASTER$citation_count_2018 <- NA_real_
    for (i in seq_len(nrow(MASTER))) {
        tryCatch({
            MASTER$citation_count_2018[i] <- rcrossref::cr_citation_count(MASTER$DOI[i])
        }, error = function(e) warning(e))
    }
}

```

Does replication in the RPP predict how often a paper is cited?

No, not for the citation count recorded in the RPP.

# Citation count recorded in the RPP, split by whether T_pval_USE..R. is
# below .05; the blue point-ranges are the per-group mean_se summaries.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    ggplot(aes(x = T_pval_USE..R.< .05, y = Citation.count..paper..O.)) +
    geom_beeswarm() +
    geom_pointrange(stat = "summary", fun.data = "mean_se", colour = "blue")

# Quasi-Poisson regression of the RPP citation count on the same indicator.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    glm(
        formula = Citation.count..paper..O. ~ T_pval_USE..R.< .05,
        family = quasipoisson(),
        data = .
    ) %>%
    summary()

## 
## Call:
## glm(formula = Citation.count..paper..O. ~ T_pval_USE..R. < 0.05, 
##     family = quasipoisson(), data = .)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -11.43   -6.24   -3.23    4.00   20.52  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.4709     0.1035   43.19   <2e-16 ***
## T_pval_USE..R. < 0.05TRUE  -0.0853     0.1790   -0.48     0.63    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 60)
## 
##     Null deviance: 5247.2  on 98  degrees of freedom
## Residual deviance: 5233.5  on 97  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

Does replication in the RPP predict citations after the publication of the RPP?

The citation count in the RPP probably includes more sources, but these aren’t systematically different.

# Scatter plot of the RPP-recorded citation count against the CrossRef count.
# Written with ggplot() + geom_point() because qplot() is deprecated in
# current ggplot2 releases.
ggplot(MASTER, aes(x = Citation.count..paper..O., y = citation_count_2018)) +
    geom_point() +
    ggtitle("Strong correspondence")

# Pearson correlation between the CrossRef count and the RPP-recorded count.
cor.test(
    x = MASTER$citation_count_2018,
    y = MASTER$Citation.count..paper..O.
)

## 
##  Pearson's product-moment correlation
## 
## data:  MASTER$citation_count_2018 and MASTER$Citation.count..paper..O.
## t = 50, df = 200, p-value <2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.96 0.98
## sample estimates:
##  cor 
## 0.97

# Linear fit of the CrossRef count on the RPP-recorded count.
lm(
    formula = MASTER$citation_count_2018 ~ MASTER$Citation.count..paper..O.
)

## 
## Call:
## lm(formula = MASTER$citation_count_2018 ~ MASTER$Citation.count..paper..O.)
## 
## Coefficients:
##                      (Intercept)  MASTER$Citation.count..paper..O.  
##                           -6.921                             0.824

# However, the CrossRef citation counts are much lower, for reasons unknown.
# Mean CrossRef citation count:
with(MASTER, mean(citation_count_2018))

## [1] 67

# Mean citation count as recorded in the RPP data.
with(MASTER, mean(Citation.count..paper..O.))

## [1] 90

Does replication predict 2018 citation counts?

Again, no association.

# CrossRef citation count, split by whether T_pval_USE..R. is below .05;
# the blue point-ranges are the per-group mean_se summaries.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    ggplot(aes(x = T_pval_USE..R.< .05, y = citation_count_2018)) +
    geom_beeswarm() +
    geom_pointrange(stat = "summary", fun.data = "mean_se", colour = "blue")

# Quasi-Poisson regression of the CrossRef count on the same indicator.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    glm(
        formula = citation_count_2018 ~ T_pval_USE..R.< .05,
        family = quasipoisson(),
        data = .
    ) %>%
    summary()

## 
## Call:
## glm(formula = citation_count_2018 ~ T_pval_USE..R. < 0.05, family = quasipoisson(), 
##     data = .)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -11.29   -6.37   -3.79    3.65   18.04  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.1545     0.1177   35.30   <2e-16 ***
## T_pval_USE..R. < 0.05TRUE  -0.0687     0.2024   -0.34     0.73    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 56)
## 
##     Null deviance: 4897.0  on 98  degrees of freedom
## Residual deviance: 4890.4  on 97  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 5

Does replication predict subsequent citation counts (i.e., 2015–2018)?

This is pretty dirty now, because I’m subtracting the citation counts of one source from those of another, so most papers appear to be cited less in 2018 than in 2015. But I haven’t found a quick way to get the 2015 citation counts from rcrossref.

Again, no association. So, assuming the dirtiness of the analysis doesn’t matter (strong rank order correspondence in the citation counts), the literature hasn’t reacted at all to the presumably important bit of information that a study doesn’t replicate.

# Difference between the CrossRef count and 0.8 times the RPP-recorded count,
# split by whether T_pval_USE..R. is below .05.
# NOTE(review): 0.8 presumably approximates the 0.824 slope from the lm() of
# the CrossRef count on the RPP count -- confirm.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    ggplot(aes(
        x = T_pval_USE..R.< .05,
        y = citation_count_2018 - Citation.count..paper..O. * 0.8
    )) +
    geom_beeswarm() +
    geom_pointrange(stat = "summary", fun.data = "mean_se", colour = "blue")

# Quasi-Poisson regression of the same difference, offset by 90.
# NOTE(review): the + 90 presumably shifts the response to be non-negative
# for the quasipoisson family -- confirm.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    glm(
        formula = 90 + citation_count_2018 - Citation.count..paper..O. * 0.8 ~ T_pval_USE..R.< .05,
        family = "quasipoisson",
        data = .
    ) %>%
    summary()

## 
## Call:
## glm(formula = 90 + citation_count_2018 - Citation.count..paper..O. * 
##     0.8 ~ T_pval_USE..R. < 0.05, family = "quasipoisson", data = .)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -11.954   -0.840    0.199    0.705    5.201  
## 
## Coefficients:
##                           Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.4281     0.0249  177.91   <2e-16 ***
## T_pval_USE..R. < 0.05TRUE   0.0176     0.0416    0.42     0.67    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 3.3)
## 
##     Null deviance: 385.73  on 98  degrees of freedom
## Residual deviance: 385.13  on 97  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 4

A slightly different way of looking at it:

# Scatter plots with per-group linear fits, coloured by whether
# T_pval_USE..R. is below .05. Written with ggplot() + geom_point() because
# qplot() is deprecated in current ggplot2 releases.

# CrossRef count against the RPP-recorded count.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    ggplot(aes(
        x = Citation.count..paper..O.,
        y = citation_count_2018,
        colour = T_pval_USE..R.< .05
    )) +
    geom_point() +
    geom_smooth(method = "lm")

# CrossRef count minus 0.8 times the RPP-recorded count, against the
# RPP-recorded count.
MASTER %>%
    filter(!is.na(T_pval_USE..R.)) %>%
    ggplot(aes(
        x = Citation.count..paper..O.,
        y = citation_count_2018 - Citation.count..paper..O. * 0.8,
        colour = T_pval_USE..R.< .05
    )) +
    geom_point() +
    geom_smooth(method = "lm")