insight.R

richard — Jan 9, 2014, 8:41 AM

## This script: http://rpubs.com/gill1109/11755

## Reference: "Empirical Patterns in Google Scholar Citation Counts"
## Peter T. Breuer and Jonathan P. Bowen
## Preprint available from Peter Breuer (University of Birmingham)
## through ResearchGate.net and/or Academia.edu

## Lognormal model
## Number of citations of a random paper selected from my oeuvre 
## is distributed as floor(exp(N(mu,sigma)))

## My data can be downloaded from http://www.math.leidenuniv.nl/~gill/citations.txt

## Sequel to this script: http://rpubs.com/gill1109/11774

mu <- 2
sigma <- 2

##Compute probabilities of 0, 1, 2, ... citations according to log normal model

x <- 0:5000
probs <- pnorm(log(x+1), mu, sigma) - pnorm(log(x), mu, sigma)
sum(probs) ## These probabilities don't quite add to one
[1] 0.9994

plot(x, probs,
     main = "Probability distribution of citation count of random article",
     xlab = "Number of citations",
     ylab = "Probabilty")

plot of chunk unnamed-chunk-1


plot(y=log(-log(probs)), x=log(0:5000),
     main = "log -log probs versus log citation count",
     sub = "Zero count ommitted; fitted least squares straight line")
coefs <- lm(log(-log(probs))[-1] ~ log(0:5000)[-1])
abline(coefs)   ## coefficients of straight line fit

plot of chunk unnamed-chunk-1

coefs  

Call:
lm(formula = log(-log(probs))[-1] ~ log(0:5000)[-1])

Coefficients:
    (Intercept)  log(0:5000)[-1]  
          1.054            0.199  

## Sample N publications according to this model and draw Breuer's plot
set.seed(1234)
N <- 150  ## 150 papers
sim <- sort(floor(exp(rnorm(N, 2, 2))), decreasing = TRUE)
plot(sim,
     main = "Simulated citation counts of 150 papers")

plot of chunk unnamed-chunk-1


coef2 <- lm(log(sim+1) ~ sqrt(0:(N-1)))
plot(y=log(sim+1), x=sqrt(0:(N-1)),
     main = "Log citation count +1 versus square root reverse rank",
     xlab = "Square root reverse rank (most cited has r.r. zero)",
     sub = "Fitted least squares straight line",
     ylab = "Log citation count plus 1")
abline(coef2)

plot of chunk unnamed-chunk-1

coef2

Call:
lm(formula = log(sim + 1) ~ sqrt(0:(N - 1)))

Coefficients:
    (Intercept)  sqrt(0:(N - 1))  
          6.670           -0.572  

## Now some real data
# Source of data = Google Scholar
# Scholar = R D Gill

data <- read.csv("citations.txt", 
                 sep = "$", 
                 stringsAsFactors = FALSE, 
                 header = FALSE, 
                 blank.lines.skip = FALSE)
V1 <- data$V1
counts <- as.numeric(V1[seq(from = 4, by = 7, to = 1100)])
counts[is.na(counts)] <- 0
length(counts)
[1] 157
plot(counts,
     main="Raw data")

plot of chunk unnamed-chunk-1


fit <- lm(log(counts)[counts > 0] ~ sqrt(0:156)[counts > 0])


plot(sqrt(0:156), log(counts), 
     main = "Breuer plot of my citation data", 
     sub =" Fit: P=0.668")
abline(fit$coef)

plot of chunk unnamed-chunk-1




qqnorm(log(counts[counts>0]), 
       main = "Normal probability plot of my citation data", 
       ylab = "log positive citation counts", 
       sub = "Straight line: y = mu + sigma x with mu=sigma=2")
abline(2,2)

plot of chunk unnamed-chunk-1


## Compact summary of data

## 30 papers cited 0 times
## 17 cited 1 time
## 12 cited 2 times  etc.


table(counts)
counts
   0    1    2    3    4    5    6    7    8    9   10   11   12   13   14 
  30   17   12    7    6    4    2    6    1    4    1    3    4    3    2 
  15   17   18   19   21   22   23   24   27   28   29   30   31   38   41 
   2    2    1    1    2    2    2    1    2    2    1    2    1    1    1 
  42   47   49   52   57   60   61   65   73   76   86   87  105  113  114 
   2    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
 117  131  147  156  157  170  182  184  187  357  378  387  491  712 2300 
   1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
3234 
   1 



## The data again; different layout

cnt.numbers <- as.vector(table(counts))
cnt.names <- as.vector(names(table(counts)))

data.frame(counts=cnt.names,replications=cnt.numbers)
   counts replications
1       0           30
2       1           17
3       2           12
4       3            7
5       4            6
6       5            4
7       6            2
8       7            6
9       8            1
10      9            4
11     10            1
12     11            3
13     12            4
14     13            3
15     14            2
16     15            2
17     17            2
18     18            1
19     19            1
20     21            2
21     22            2
22     23            2
23     24            1
24     27            2
25     28            2
26     29            1
27     30            2
28     31            1
29     38            1
30     41            1
31     42            2
32     47            1
33     49            1
34     52            1
35     57            1
36     60            1
37     61            1
38     65            1
39     73            1
40     76            1
41     86            1
42     87            1
43    105            1
44    113            1
45    114            1
46    117            1
47    131            1
48    147            1
49    156            1
50    157            1
51    170            1
52    182            1
53    184            1
54    187            1
55    357            1
56    378            1
57    387            1
58    491            1
59    712            1
60   2300            1
61   3234            1

## Interlude - another simulation
set.seed(1234)
sim <- sort(floor(exp(rnorm(157, 2, 2))), decreasing = TRUE)
plot(sqrt(0:156), log(sim),
     main = "Another simulation")
coefs <- lm(log(sim)[sim>0] ~ sqrt(0:156)[sim>0])
abline(coefs)

plot of chunk unnamed-chunk-1

coefs

Call:
lm(formula = log(sim)[sim > 0] ~ sqrt(0:156)[sim > 0])

Coefficients:
         (Intercept)  sqrt(0:156)[sim > 0]  
               7.027                -0.632  

## Taking our cue from the plot of log(-log(prob k citations)) against log k
## which was roughly (k>0) the line y = a + bx with, approx, a = 1 and b = 1/5,
## we come up with a new model
## p(k) = exp(-exp(a + b log k)) for k > 0
## Define p(0) = 1 - sum_{k > 0} p(k)
## Now we simulate from that model, and draw the Breuer plot 
## and also the normal probability plot of the log data

## Fitting that model by maximum likelihood to my own data 
## (not reproduced here) gave, approx
## a = 0.90, b = 0.225

a <- 0.90
b <- 0.225

k <- 0:5000
pk <- exp(-exp(a)*k^b)
pk[1] <- 1 - sum(pk[-1])
plot(log(k), log(-log(pk)), 
     main = "log(-log(Prob(k citations))) versus log(k)",
     xlab = "log(k)",
     ylab = "log(-log(Prob k citations)), k>0")
abline(a, b)

plot of chunk unnamed-chunk-1


set.seed(1234)
N <- 157
sim <- sort(sample(k, N, replace = TRUE, prob = pk), decreasing = TRUE)
Warning: Walker's alias method used: results are different from R < 2.2.0
data <- sim

N <- length(data)
plot(sqrt(0:(N-1)), log(data),
     main = paste("Breuer plot, simulated data, N = ", N, " papers"),
     xlab = "root reverse rank (most cited = 0)",
     ylab = "log(citation count")
coefs <- lm(log(data)[data>0]~sqrt(0:(N-1))[data>0])
coefs

Call:
lm(formula = log(data)[data > 0] ~ sqrt(0:(N - 1))[data > 0])

Coefficients:
              (Intercept)  sqrt(0:(N - 1))[data > 0]  
                    6.867                     -0.543  
abline(coefs)

plot of chunk unnamed-chunk-1


qqnorm(log(data)[data>0], main = "Normal probability plot of log simulated data")
quartiles <- quantile(log(data)[data>0], probs = c(0.25, 0.75))
Nquartiles <- qnorm(c(0.25, 0.75))
coefs <- lm(quartiles ~ Nquartiles)$coef
coefs
(Intercept)  Nquartiles 
      2.890       1.629 
abline(coefs)

plot of chunk unnamed-chunk-1



## Now the same plots for the actual data
data <- counts

N <- length(data)
plot(sqrt(0:(N-1)), log(data),
     main = paste("Breuer plot, real data, N = ", N, " papers"),
     xlab = "root reverse rank (most cited = 0)",
     ylab = "log(citation count")
coefs <- lm(log(data)[data>0]~sqrt(0:(N-1))[data>0])
coefs

Call:
lm(formula = log(data)[data > 0] ~ sqrt(0:(N - 1))[data > 0])

Coefficients:
              (Intercept)  sqrt(0:(N - 1))[data > 0]  
                    7.518                     -0.668  
abline(coefs)

plot of chunk unnamed-chunk-1


qqnorm(log(data)[data>0], main = "Normal probability plot of log real data")
quartiles <- quantile(log(data)[data>0], probs = c(0.25, 0.75))
Nquartiles <- qnorm(c(0.25, 0.75))
coefs <- lm(quartiles ~ Nquartiles)$coef
coefs
(Intercept)  Nquartiles 
      2.412       1.947 
abline(coefs)

plot of chunk unnamed-chunk-1


## Conclusion: the new model has only two parameters and 
## connects the probability of zero citations to the 
## probabilities of large numbers. 
## We need a bit more flexibility. 
## Three parameters are needed to wag the tail of the dog!
## There are several obvious (and easy) options...