Data + recode

# One-off import, kept for reference: the table was read from the clipboard
# and cached as an RDS file.
# d <- read.table("clipboard", sep = "\t", header = TRUE, quote = "",
#                 stringsAsFactors = FALSE)
# write_rds(d, "data/data.rds")

# Load the cached copy of the data.
d <- read_rds("data/data.rds")

#' Recode plus-sign marks into integer scores
#'
#' Trims surrounding whitespace, then maps "" -> 0, "+" -> 1 and "++" -> 2.
#' Values that are not one of these marks are handed to `as.integer()`
#' unchanged, so input that is already numeric passes through and anything
#' else becomes `NA` with the usual coercion warning.
#'
#' @param x Vector of marks (character, or anything `as.character()` accepts).
#' @return An integer vector the same length as `x`; `NA` stays `NA`.
recode_this <- function(x) {
  marks <- c("", "+", "++")
  scores <- c("0", "1", "2")

  # "[\\h\\v]" strips every horizontal/vertical whitespace character, not just
  # the space/tab/newline set that trimws() removes by default.
  trimmed <- trimws(as.character(x), whitespace = "[\\h\\v]")

  # Look each value up in the mark table; values without a match (NA index)
  # are left as they are.
  idx <- match(trimmed, marks)
  matched <- !is.na(idx)
  trimmed[matched] <- scores[idx[matched]]

  as.integer(trimmed)
}

# Convert the four issue columns from plus-sign marks to integer scores and
# derive publication year and paper age.
d <- dplyr::mutate(
  d,
  self_plagiarism = recode_this(self_plagiarism),
  data_duplication = recode_this(data_duplication),
  data_issues = recode_this(data_issues),
  statistical_issues = recode_this(statistical_issues),
  # the first run of four digits in the publication string is used as the year
  year = as.integer(as.vector(str_match(publication, pattern = "\\d{4}"))),
  # age is measured relative to 2017
  age = 2017 - year
)

# Sum the four issue scores per row into a single total.
issue_types <- c(
  "self_plagiarism",
  "data_duplication",
  "data_issues",
  "statistical_issues"
)
d[["total_issues"]] <- rowSums(d[issue_types])

Analyses

# Correlation matrix of all numeric columns.
wtd.cors(d[map_lgl(d, is.numeric)])

##                    citations self_plagiarism data_duplication data_issues
## citations              1.000          -0.074            -0.19      -0.135
## self_plagiarism       -0.074           1.000             0.19      -0.562
## data_duplication      -0.191           0.190             1.00       0.223
## data_issues           -0.135          -0.562             0.22       1.000
## statistical_issues    -0.210          -0.565             0.20       0.829
## year                  -0.247          -0.179            -0.17       0.071
## age                    0.247           0.179             0.17      -0.071
## total_issues          -0.280          -0.055             0.71       0.746
##                    statistical_issues   year    age total_issues
## citations                       -0.21 -0.247  0.247       -0.280
## self_plagiarism                 -0.56 -0.179  0.179       -0.055
## data_duplication                 0.20 -0.172  0.172        0.711
## data_issues                      0.83  0.071 -0.071        0.746
## statistical_issues               1.00  0.109 -0.109        0.739
## year                             0.11  1.000 -1.000       -0.060
## age                             -0.11 -1.000  1.000        0.060
## total_issues                     0.74 -0.060  0.060        1.000

# Baseline model: citations predicted by paper age alone.
summary(lm(citations ~ age, data = d))

## 
## Call:
## lm(formula = citations ~ age, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -125.9  -71.7  -38.3   -6.1  774.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)    31.84      42.81    0.74     0.46
## age             6.50       4.04    1.61     0.12
## 
## Residual standard error: 150 on 40 degrees of freedom
## Multiple R-squared:  0.0608, Adjusted R-squared:  0.0373 
## F-statistic: 2.59 on 1 and 40 DF,  p-value: 0.115

# Citations predicted by paper age plus the total issue score.
summary(lm(citations ~ age + total_issues, data = d))

## 
## Call:
## lm(formula = citations ~ age + total_issues, data = d)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -171.8  -82.8  -31.2   30.4  726.1 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)  
## (Intercept)     97.11      52.69    1.84    0.073 .
## age              6.97       3.90    1.78    0.082 .
## total_issues   -22.77      11.42   -1.99    0.053 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 145 on 39 degrees of freedom
## Multiple R-squared:  0.148,  Adjusted R-squared:  0.104 
## F-statistic: 3.38 on 2 and 39 DF,  p-value: 0.0443

By traditional NHST standards, we almost found evidence that science does not reward QRPs: the beta for total issues is negative and almost significant (p = .053). So just maybe readers did notice some of the problems and decided not to cite the papers in return? Or, more cynically, maybe Wansink is simply avoiding citing his own poor work so as not to draw attention to it. One would have to separate the citations into self-citations and citations by others to tell.

Of course, the sample size is too small (42 papers); we could not even find a reliable effect of publication age. But Wansink has a lot of papers, so we (that means not me) can expand the dataset by analyzing more of his studies.

Are questionable research practices rewarded with citations? An analysis of publications by Brian Wansink

Emil O. W. Kirkegaard

Introduction

Init

Data + recode

Analyses