breuer.R

richard — Jan 9, 2014, 8:41 AM

## This script: http://rpubs.com/gill1109/11774

## Reference: "Empirical Patterns in Google Scholar Citation Counts"
## Peter T. Breuer and Jonathan P. Bowen
## Preprint available from Peter Breuer (University of Birmingham)
## through ResearchGate.net and/or Academia.edu

## Breuer plot: log positive citation count against square root of reverse rank number
## (most cited has reverse rank 0, next most cited 1, ...)
## Sometimes need a power different from 1/2

## My data (extracted from Google Scholar) 
## http://www.math.leidenuniv.nl/~gill/citations.txt

## Previous analyses: http://rpubs.com/gill1109/11755

## The Breuer plot corresponds to the following model.
## Suppose U ~ Unif(0,1). Define X = c(1 - U^a), in my case, c = 7.5, a = 1/2.
## Then my positive citation counts are distributed as a sample of size N = 127
## from the distribution of exp(X), rounded up to positive whole numbers.

## After fitting this straight line to my data I simulate 9 times from the estimated model
## and show 9 Breuer plots (which of course give beautiful straight lines)
## and 9 Normal probability plots (which show a good fit to the log normal).

data <- read.csv("http://www.math.leidenuniv.nl/~gill/citations.txt", 
                 sep = "$", 
                 stringsAsFactors = FALSE, 
                 header = FALSE, 
                 blank.lines.skip = FALSE)
V1 <- data$V1
counts <- as.numeric(V1[seq(from = 4, by = 7, to = 1100)])
counts[is.na(counts)] <- 0


## log positive citation counts, decreasing order

lpcounts <- sort(log(counts[counts>0]), decreasing = TRUE)

## create Breuer plot

plot(sqrt(ppoints(lpcounts)), lpcounts,
    main = "Breuer plot, my data",
    xlab = "sqrt(1-r/(n+1))",
    ylab = "log positive counts",
    sub = "r  runs from 1 to n (reverse rank)",
    xlim = c(0, 1))

## Fit a straight line to body of data - determined by first, third quartile

## I'd like the straight line to go through the point (1, 0)
## So if it's y = a + bx I want a = -b
## Adjusting the quartiles line a bit to get equality I settle on a = 7.5

quartiles <- quantile(lpcounts, probs = c(0.75, 0.25))
quartiles
  75%   25% 
3.726 1.099 

Uquartiles <- sqrt(c(0.25,0.75))
Uquartiles
[1] 0.500 0.866

coefs <- lm(quartiles ~ Uquartiles)$coef
coefs
(Intercept)  Uquartiles 
      7.314      -7.177 

abline(coefs, col = "blue")
abline(7.5, -7.5, col = "red")
abline(7.2, -7.2, col = "green")

legend(0.1, 2, 
    c("quartile fit", "both coeffs 7.2", "both coeffs 7.5"), 
    col = c("blue", "green", "red"), 
    lty = 1)

plot of chunk unnamed-chunk-1


qqnorm(lpcounts,
    main = "Normal probability plot, my data (log positive counts)",
    xlab = "standard normal quantiles",
    ylab = "log positive counts")
qqline(lpcounts)

plot of chunk unnamed-chunk-1


## Now simulate (9 times) from the estimated model according to the Breuer plot:
## log positive count = lpc = 7.5 ( 1 - sqrt U) where U ~ Unif(0,1)
## Equivalently: (1 - (log pos cnt)/7.5)^2 ~ Unif(0,1)
## However to add realism at low counts I'll transform the continuous lpc to discrete
## by the transformation discrete lpc = log(ceiling(exp(continuous lpc)))

set.seed(1234)
N <- length(lpcounts)

par(mfrow = c(3, 3))
for (i in 1: 9) {
    sim <- log(ceiling(exp(7.5 * (1 - sqrt(sort(runif(N)))))))
    plot(sqrt(ppoints(sim)), sim,
        main = "Breuer plot, simulated data",
        xlab = "sqrt(1-r/(n+1))",
        ylab = "log positive counts",
        sub = "r  runs from 1 to n (reverse rank)",
        xlim = c(0, 1))
    abline(7.5, -7.5)
}

plot of chunk unnamed-chunk-1


par(mfrow = c(3, 3))
set.seed(1234)
for (i in 1: 9) {
    sim <- log(ceiling(exp(7.5 * (1 - sqrt(sort(runif(N)))))))
    qqnorm(sim)
    qqline(sim)
}

plot of chunk unnamed-chunk-1


par(mfrow = c(1, 1))

## We noticed that the mean and standard devation of the normal seemed about equal.
## Is this a coincidence?
## Below I plot the QQ plot of the Breuer distribution (y-axis)
## against the standard normal (x-axis).
## The straight line is the tangent line at the medians.
## It is approximately y = 2.2 + 2.1 x
## So with this normal approximation, mu = 2.2 and sigma = 2.1
## Changing the power "1/2" a little, I could get them equal
## but changing the upper bound "7.5" does not help.


qBreuer <- 7.5 * (1 - sqrt(1-ppoints(100)))

qNorm <- qnorm(ppoints(100))


plot(qNorm, qBreuer, type="b", pch="*",
    main = "QQ plot of log-Breuer against log-normal",
    xlab = "standard normal quantiles",
    ylab = "Breuer(7.5, 1/2) quantiles",
    sub = "straight line = tangent at median: y = 2.2 + 2.1 x")

points(qNorm[50], qBreuer[50], col="red")

a <- 7.5*(1-sqrt(1/2))
b <- 7.5/(2*sqrt(pi))

a
[1] 2.197
b
[1] 2.116
abline(a, b)

plot of chunk unnamed-chunk-1