richard — Jan 9, 2014, 8:41 AM
## This script: http://rpubs.com/gill1109/11755
## Reference: "Empirical Patterns in Google Scholar Citation Counts"
## Peter T. Breuer and Jonathan P. Bowen
## Preprint available from Peter Breuer (University of Birmingham)
## through ResearchGate.net and/or Academia.edu
## Lognormal model
## Number of citations of a random paper selected from my oeuvre
## is distributed as floor(exp(N(mu,sigma)))
## My data can be downloaded from http://www.math.leidenuniv.nl/~gill/citations.txt
## Sequel to this script: http://rpubs.com/gill1109/11774
mu <- 2
sigma <- 2
## Compute probabilities of 0, 1, 2, ... citations according to the lognormal model
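## Under the model a paper gets k citations exactly when log(k) <= N(mu, sigma) < log(k+1),
## so P(k citations) = pnorm(log(k+1), mu, sigma) - pnorm(log(k), mu, sigma), which is what
## 'probs' below computes (for k = 0, log(0) = -Inf and pnorm(-Inf) = 0).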
x <- 0:5000
probs <- pnorm(log(x+1), mu, sigma) - pnorm(log(x), mu, sigma)
sum(probs) ## These probabilities don't quite add to one, because the support is truncated at 5000 citations
[1] 0.9994
plot(x, probs,
main = "Probability distribution of citation count of random article",
xlab = "Number of citations",
ylab = "Probabilty")
plot(y=log(-log(probs)), x=log(0:5000),
main = "log -log probs versus log citation count",
sub = "Zero count ommitted; fitted least squares straight line")
coefs <- lm(log(-log(probs))[-1] ~ log(0:5000)[-1])
abline(coefs)
coefs ## coefficients of the straight-line fit
Call:
lm(formula = log(-log(probs))[-1] ~ log(0:5000)[-1])
Coefficients:
(Intercept) log(0:5000)[-1]
1.054 0.199
## Sample N publications according to this model and draw Breuer's plot
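## (Breuer's plot: log(citation count + 1) against the square root of reverse rank,
## rank 0 being the most cited paper; the empirical pattern reported by Breuer and
## Bowen is, roughly, a straight line on this plot.)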
set.seed(1234)
N <- 150 ## 150 papers
sim <- sort(floor(exp(rnorm(N, 2, 2))), decreasing = TRUE)
plot(sim,
main = "Simulated citation counts of 150 papers")
coef2 <- lm(log(sim+1) ~ sqrt(0:(N-1)))
plot(y=log(sim+1), x=sqrt(0:(N-1)),
main = "Log citation count +1 versus square root reverse rank",
xlab = "Square root reverse rank (most cited has r.r. zero)",
sub = "Fitted least squares straight line",
ylab = "Log citation count plus 1")
abline(coef2)
coef2
Call:
lm(formula = log(sim + 1) ~ sqrt(0:(N - 1)))
Coefficients:
(Intercept) sqrt(0:(N - 1))
6.670 -0.572
## Now some real data
## Source of data: Google Scholar
## Scholar: R D Gill
data <- read.csv("citations.txt",
sep = "$",
stringsAsFactors = FALSE,
header = FALSE,
blank.lines.skip = FALSE)
V1 <- data$V1
## The citation count sits on every 7th line of the file, starting at line 4
counts <- as.numeric(V1[seq(from = 4, by = 7, to = 1100)])
counts[is.na(counts)] <- 0 ## non-numeric (blank) entries count as zero citations
length(counts)
[1] 157
plot(counts,
main="Raw data")
fit <- lm(log(counts)[counts > 0] ~ sqrt(0:156)[counts > 0])
plot(sqrt(0:156), log(counts),
main = "Breuer plot of my citation data",
sub =" Fit: P=0.668")
abline(fit$coef)
qqnorm(log(counts[counts>0]),
main = "Normal probability plot of my citation data",
ylab = "log positive citation counts",
sub = "Straight line: y = mu + sigma x with mu=sigma=2")
abline(2,2)
## Compact summary of data
## 30 papers cited 0 times
## 17 cited 1 time
## 12 cited 2 times etc.
table(counts)
counts
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
30 17 12 7 6 4 2 6 1 4 1 3 4 3 2
15 17 18 19 21 22 23 24 27 28 29 30 31 38 41
2 2 1 1 2 2 2 1 2 2 1 2 1 1 1
42 47 49 52 57 60 61 65 73 76 86 87 105 113 114
2 1 1 1 1 1 1 1 1 1 1 1 1 1 1
117 131 147 156 157 170 182 184 187 357 378 387 491 712 2300
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
3234
1
## The data again; different layout
cnt.numbers <- as.vector(table(counts))
cnt.names <- as.vector(names(table(counts)))
data.frame(counts=cnt.names,replications=cnt.numbers)
counts replications
1 0 30
2 1 17
3 2 12
4 3 7
5 4 6
6 5 4
7 6 2
8 7 6
9 8 1
10 9 4
11 10 1
12 11 3
13 12 4
14 13 3
15 14 2
16 15 2
17 17 2
18 18 1
19 19 1
20 21 2
21 22 2
22 23 2
23 24 1
24 27 2
25 28 2
26 29 1
27 30 2
28 31 1
29 38 1
30 41 1
31 42 2
32 47 1
33 49 1
34 52 1
35 57 1
36 60 1
37 61 1
38 65 1
39 73 1
40 76 1
41 86 1
42 87 1
43 105 1
44 113 1
45 114 1
46 117 1
47 131 1
48 147 1
49 156 1
50 157 1
51 170 1
52 182 1
53 184 1
54 187 1
55 357 1
56 378 1
57 387 1
58 491 1
59 712 1
60 2300 1
61 3234 1
## Interlude - another simulation
set.seed(1234)
sim <- sort(floor(exp(rnorm(157, 2, 2))), decreasing = TRUE)
plot(sqrt(0:156), log(sim),
main = "Another simulation")
coefs <- lm(log(sim)[sim>0] ~ sqrt(0:156)[sim>0])
abline(coefs)
coefs
Call:
lm(formula = log(sim)[sim > 0] ~ sqrt(0:156)[sim > 0])
Coefficients:
(Intercept) sqrt(0:156)[sim > 0]
7.027 -0.632
## Taking our cue from the plot of log(-log(prob of k citations)) against log k,
## which for k > 0 was roughly the line y = a + b*x with approximately a = 1 and b = 1/5,
## we come up with a new model:
## p(k) = exp(-exp(a + b log k)) for k > 0
## Define p(0) = 1 - sum_{k > 0} p(k)
## Now we simulate from that model, and draw the Breuer plot
## and also the normal probability plot of the log data
## Fitting that model by maximum likelihood to my own data
## (not reproduced here) gave, approx
## a = 0.90, b = 0.225
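## (The fit is not reproduced in the original; the following is a minimal sketch of how
## such a maximum likelihood fit could be done with optim(), assuming the 'counts' vector
## read in above. The function negloglik and the truncation point kmax are illustrative
## choices, not the author's code. Note that exp(-exp(a + b*log(k))) = exp(-exp(a)*k^b),
## which is the form used below and in the script.)
negloglik <- function(par, counts, kmax = 5000) {
  a <- par[1]
  b <- par[2]
  k <- 1:kmax
  pk <- exp(-exp(a) * k^b)  ## p(k) for k = 1, ..., kmax
  p0 <- 1 - sum(pk)         ## p(0) defined by complementation
  if (p0 <= 0) return(Inf)  ## reject parameter values that give an improper distribution
  logp <- numeric(length(counts))
  logp[counts == 0] <- log(p0)
  logp[counts > 0] <- log(pk[counts[counts > 0]])
  -sum(logp)
}
## e.g. optim(c(1, 0.2), negloglik, counts = counts)$par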
a <- 0.90
b <- 0.225
k <- 0:5000
pk <- exp(-exp(a)*k^b)
pk[1] <- 1 - sum(pk[-1])
plot(log(k), log(-log(pk)),
main = "log(-log(Prob(k citations))) versus log(k)",
xlab = "log(k)",
ylab = "log(-log(Prob k citations)), k>0")
abline(a, b)
set.seed(1234)
N <- 157
sim <- sort(sample(k, N, replace = TRUE, prob = pk), decreasing = TRUE)
Warning: Walker's alias method used: results are different from R < 2.2.0
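## (This warning is harmless: with a long vector of probabilities, sample() switches to
## Walker's alias method for weighted sampling with replacement.)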
data <- sim
N <- length(data)
plot(sqrt(0:(N-1)), log(data),
main = paste("Breuer plot, simulated data, N = ", N, " papers"),
xlab = "root reverse rank (most cited = 0)",
ylab = "log(citation count")
coefs <- lm(log(data)[data>0]~sqrt(0:(N-1))[data>0])
coefs
Call:
lm(formula = log(data)[data > 0] ~ sqrt(0:(N - 1))[data > 0])
Coefficients:
(Intercept) sqrt(0:(N - 1))[data > 0]
6.867 -0.543
abline(coefs)
qqnorm(log(data)[data>0], main = "Normal probability plot of log simulated data")
quartiles <- quantile(log(data)[data>0], probs = c(0.25, 0.75))
Nquartiles <- qnorm(c(0.25, 0.75))
coefs <- lm(quartiles ~ Nquartiles)$coef
coefs
(Intercept) Nquartiles
2.890 1.629
abline(coefs)
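## (Note: qqline(log(data)[data > 0]) would draw the same reference line, here and for
## the real data below, since qqline also passes a line through the first and third quartiles.)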
## Now the same plots for the actual data
data <- counts
N <- length(data)
plot(sqrt(0:(N-1)), log(data),
main = paste("Breuer plot, real data, N = ", N, " papers"),
xlab = "root reverse rank (most cited = 0)",
ylab = "log(citation count")
coefs <- lm(log(data)[data>0]~sqrt(0:(N-1))[data>0])
coefs
Call:
lm(formula = log(data)[data > 0] ~ sqrt(0:(N - 1))[data > 0])
Coefficients:
(Intercept) sqrt(0:(N - 1))[data > 0]
7.518 -0.668
abline(coefs)
qqnorm(log(data)[data>0], main = "Normal probability plot of log real data")
quartiles <- quantile(log(data)[data>0], probs = c(0.25, 0.75))
Nquartiles <- qnorm(c(0.25, 0.75))
coefs <- lm(quartiles ~ Nquartiles)$coef
coefs
(Intercept) Nquartiles
2.412 1.947
abline(coefs)
## Conclusion: the new model has only two parameters and
## connects the probability of zero citations to the
## probabilities of large citation counts.
## We need a bit more flexibility.
## Three parameters are needed to wag the tail of the dog!
## There are several obvious (and easy) options...