#first close any project you're in
load("shared/firstdigitlab/
firstdigitstuff.RData")
library(MASS)
#Note: to load the library MASS you may need to update your libpaths
.libPaths( c( .libPaths(), "/usr/lib/rstudio-server/R/library/statsclass") )
first.digit.display
function(l){
dig1 <- function(k){
as.numeric(head(strsplit(as.character(k),'')[[1]],n=1))
}
dig1.list <- function(l) {sapply(l,dig1)}
first.digit <- dig1.list(l)
truehist(first.digit, nbins=10, main=paste(length(first.digit), "data points"))
print(table(first.digit)/length(first.digit))
}
first.digit.display(pop$pop2012)
first.digit.display(pres$dem_votes)
first.digit.display(pres$rep_votes)
first.digit.display(mov$Opening)
first.digit.display(mm$mm)
pop - the population of every U.S. county
pres - the number of votes for dem/rep presidential candidates in 2012
mov - the Opening weekend grosses of movies
mm - the molar masses of common molecules
powers_of_2 <- 2^(1:1000)
powers_of_2[1:7] #the first 7 of 1000
[1] 2 4 8 16 32 64 128
first.digit.display(powers_of_2)
first.digit
1 2 3 4 5 6 7 8 9
0.301 0.176 0.125 0.097 0.079 0.069 0.056 0.052 0.045
Try multiplying everything by different factors:
first.digit.display(10*powers_of_2)
first.digit.display(3*powers_of_2)
first.digit.display(5*powers_of_2)
How does the distribution of first digits change?
Try powers of 3:
powers_of_3 <- 3^(1:500)
factorials <- factorial(1:150)
first.digit.display(factorials)
Factorials get big fast so R can only generate the first 150 or so (you'll get an error message if you go much larger).
How does the distribution of first digits of factorials compare to the first digits of powers of 2 and 3?
For more information: Fibonacci numbers
bunny <- vector()
bunny[1] <- 1
bunny[2] <- 1
for (i in 3:300){
bunny[i] <- bunny[i-1]+bunny[i-2]
}
bunny[1:7]
first.digit.display(bunny)
log10(2)
[1] 0.301
log10(3)-log10(2)
[1] 0.1761
Do these numbers look familiar?
The probability of a first digit d is: \[ P(d) = log_{10}(d+1) - log_{10}(d) \]
This can be expanded to first two digits. The probability of the first two digits (dd) is:
\[ P(dd) = log_{100}(dd+1) - log_{100}(dd) \]
x <- vector()
for (i in 1:9){
x[i] <- log10(i+1)-log10(i)
}
plot(1:9, x, type="h")
text(1:9, x, round(x,3))
Which distributions followed Benford's Law most closely?
Try generating other sequences of numbers and see when Benford's Law holds.