berkson <- read.table("berkson.dat")
berkson
##     V1  V2    V3
## 1   2-  18  12.2
## 2    3  28  27.0
## 3    4  56  56.5
## 4    5 105  94.9
## 5    6 126 132.7
## 6    7 146 159.1
## 7    8 164 166.9
## 8    9 161 155.6
## 9   10 123 130.6
## 10  11 101  99.7
## 11  12  74  69.7
## 12  13  53  45.0
## 13  14  23  27.0
## 14  15  15  15.1
## 15  16   9   7.9
## 16 17+   5   7.1
berkson$V2
## [1] 18 28 56 105 126 146 164 161 123 101 74 53 23 15 9 5
N <- sum(berkson$V2)   # total number of observations
lambdahat <- sum((2:17)*berkson$V2)/sum(berkson$V2)  # sample mean, treating "2-" as 2 and "17+" as 17
plot(2:17, berkson$V2)                 # observed frequencies
lines(2:17, N*dpois(2:17, lambdahat))  # expected frequencies under the fitted Poisson
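The third column of the file looks like it holds Rice's expected Poisson frequencies (a guess on my part - the file has no header). If so, the interior categories should be close to N times the Poisson probabilities, while the two end categories "2-" and "17+" would use tail probabilities. A quick check:
## compare observed counts with expected Poisson frequencies,
## using tail probabilities for the "2-" and "17+" categories
expected <- N * c(ppois(2, lambdahat),
                  dpois(3:16, lambdahat),
                  ppois(16, lambdahat, lower.tail = FALSE))
round(cbind(observed = berkson$V2, expected), 1)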
Rice shows us what he calls a “smoothed histogram” of data from experiments on ion channels. It looks very much as though it is standard normally distributed - presumably he subtracted the sample mean and divided by the sample standard deviation in order to get observations whose mean is zero and standard deviation is one!
He calls it a “smoothed histogram”, but it seems to me that it is simply what you get from a histogram by joining the mid-points of the tops of adjacent bars with straight line segments.
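As for the standardization just mentioned, it amounts to nothing more than this (my own illustration, not Rice's code):
x <- rnorm(100, mean = 5, sd = 2)  # any sample would do
z <- (x - mean(x))/sd(x)           # subtract the sample mean, divide by the sample sd
c(mean(z), sd(z))                  # essentially 0 and exactly 1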
I simulate data from a standard normal and create the corresponding plot.
He says that his graph suggests a small amount of skewness in the data, in other words, some departure from the normal model. However, if we do the same simulation a number of times, we can see that this amount of deviation is actually quite typical.
set.seed(123)
current <- rnorm(50000)
hist(current)
current.hist <- hist(current)
names(current.hist)
## [1] "breaks" "counts" "density" "mids" "xname" "equidist"
breaks # doesn't exist, so error
## Error: object 'breaks' not found
current.hist$breaks
## [1] -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0
## [15] 2.5 3.0 3.5 4.0 4.5
current.hist
## $breaks
## [1] -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0
## [15] 2.5 3.0 3.5 4.0 4.5
##
## $counts
## [1] 1 4 49 245 835 2258 4641 7378 9649 9521 7484 4598 2216 819
## [15] 237 53 11 1
##
## $density
## [1] 0.00004 0.00016 0.00196 0.00980 0.03340 0.09032 0.18564 0.29512
## [9] 0.38596 0.38084 0.29936 0.18392 0.08864 0.03276 0.00948 0.00212
## [17] 0.00044 0.00004
##
## $mids
## [1] -4.25 -3.75 -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25 0.25 0.75
## [12] 1.25 1.75 2.25 2.75 3.25 3.75 4.25
##
## $xname
## [1] "current"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
lines(current.hist$mids, current.hist$counts)
current.hist <- hist(current, freq = FALSE)
current.hist
## $breaks
## [1] -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0
## [15] 2.5 3.0 3.5 4.0 4.5
##
## $counts
## [1] 1 4 49 245 835 2258 4641 7378 9649 9521 7484 4598 2216 819
## [15] 237 53 11 1
##
## $density
## [1] 0.00004 0.00016 0.00196 0.00980 0.03340 0.09032 0.18564 0.29512
## [9] 0.38596 0.38084 0.29936 0.18392 0.08864 0.03276 0.00948 0.00212
## [17] 0.00044 0.00004
##
## $mids
## [1] -4.25 -3.75 -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25 0.25 0.75
## [12] 1.25 1.75 2.25 2.75 3.25 3.75 4.25
##
## $xname
## [1] "current"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
lines(current.hist$mids, current.hist$density)
plot(current.hist$mids, current.hist$density, type="l")
x <- seq(from = -4, to = 4, length = 100)
lines(x, dnorm(x))
par(mfrow = c(3, 4)) # 12 plots per graphic, arranged in 3 rows, 4 columns
for (i in (1:12)) {
  current <- rnorm(50000)
  current.hist <- hist(current, plot = FALSE)
  plot(current.hist$mids, current.hist$density,
       type = "l", xlim = c(-4, 4))
  lines(x, dnorm(x), col = "red")
}
## I do it again with smaller sample size so you can better see the deviations
for (i in (1:12)) {
  current <- rnorm(500)
  current.hist <- hist(current, plot = FALSE)
  plot(current.hist$mids, current.hist$density,
       type = "l", xlim = c(-3, 3), ylim = c(0, 0.5))
  lines(x, dnorm(x), col = "red")
}
par(mfrow = c(1, 1)) # Back to one plot per graphic
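A slightly more quantitative version of the same point (my own addition): the sample skewness of normal samples of this size bounces around quite a bit from replication to replication, so a little apparent skewness is no evidence against normality.
skew <- function(x) mean(((x - mean(x))/sd(x))^3)  # sample skewness
replicate(12, skew(rnorm(500)))                    # one value per simulated sample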
Next come some examples of the method of moments, the gamma distribution, and the parametric bootstrap.
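For the gamma distribution with shape alpha and rate lambda, the mean is alpha/lambda and the variance is alpha/lambda^2; matching these to the sample mean xbar and the sample variance s^2 gives the moment estimators lambdahat = xbar/s^2 and alphahat = xbar^2/s^2. Those are the formulas used with the storm data below.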
## First just a quick look at simulated data
trueShape <- 0.8
trueRate <- 1/40
data <- rgamma(200, shape = trueShape, rate = trueRate)
hist(data)
## I prefer my histogram scaled to have area under the curve = one
hist(data, freq = FALSE)
x <- seq(from = 0, to = 200, length = 200)
lines(x, dgamma(x, trueShape, trueRate))
hist(data, freq = FALSE, breaks = 20) ## "hist" tends to give too few bars
x <- seq(from = 0, to = 250, length = 200)
lines(x, dgamma(x, shape = trueShape, rate = trueRate))
library(MASS) # this package has a better "histogram" function
truehist(data, nbins = 20)
lines(x, dgamma(x, shape = trueShape, rate = trueRate))
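Just to see the moment formulas in action before turning to real data, here they are applied to the simulated sample (my own addition):
mean(data)/var(data)        # moment estimate of the rate; true value is 1/40 = 0.025
(mean(data))^2/var(data)    # moment estimate of the shape; true value is 0.8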
# Illinois storm data
ill60 <- scan("ill60.c10")
ill61 <- scan("ill61.c10")
ill62 <- scan("ill62.c10")
ill63 <- scan("ill63.c10")
ill64 <- scan("ill64.c10")
ill <- c(ill60, ill61, ill62, ill63, ill64)
summary(ill)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001 0.010 0.070 0.224 0.270 2.130
ill
## [1] 0.020 0.001 0.001 0.120 0.080 0.420 1.720 0.050 0.010 0.010 0.003
## [12] 0.001 0.003 0.270 0.001 0.060 0.050 2.130 0.040 1.100 0.020 0.001
## [23] 0.140 0.080 0.210 0.070 0.320 0.240 0.290 0.001 0.290 1.130 0.003
## [34] 0.010 0.190 0.002 0.010 0.040 0.002 0.070 0.450 0.010 0.180 0.670
## [45] 0.003 0.010 0.040 0.002 0.490 0.020 0.020 0.340 0.140 0.370 0.330
## [56] 0.330 0.350 0.010 0.500 0.760 1.060 0.002 0.060 0.160 0.270 0.250
## [67] 0.290 0.020 0.050 0.460 0.070 0.410 0.020 0.080 0.210 0.010 0.440
## [78] 0.020 0.050 0.110 1.500 0.003 0.180 0.010 0.002 0.240 0.010 0.750
## [89] 0.010 0.140 0.130 0.010 0.010 0.270 0.450 1.780 0.250 0.240 0.004
## [100] 0.210 0.170 0.830 0.150 0.030 0.030 0.500 0.040 0.090 0.040 0.060
## [111] 0.060 0.120 0.003 0.003 0.400 0.020 0.510 0.003 0.020 0.020 0.020
## [122] 0.010 0.001 0.140 0.001 0.100 0.010 1.090 0.010 0.002 0.001 0.840
## [133] 0.030 0.350 0.070 0.001 0.002 0.002 0.200 0.060 0.140 0.010 0.020
## [144] 0.020 0.002 0.001 0.550 0.130 0.190 2.100 0.090 0.350 0.790 0.320
## [155] 1.350 0.170 0.020 0.002 0.010 0.250 0.230 0.170 0.010 0.020 0.001
## [166] 0.010 0.020 0.110 0.210 1.260 0.010 0.730 0.100 0.090 0.007 0.360
## [177] 0.770 0.210 1.270 0.070 0.080 0.160 0.260 0.010 0.230 0.080 0.020
## [188] 0.010 0.290 0.010 0.010 0.070 0.400 0.002 0.003 0.010 0.090 0.160
## [199] 0.040 0.270 0.730 0.410 0.030 0.120 0.030 1.040 0.060 0.090 0.730
## [210] 0.040 0.160 0.590 0.003 0.002 0.020 0.004 0.010 0.001 0.060 0.620
## [221] 0.010 0.520 0.110 0.003 0.600 0.002 0.050
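Altogether there are 227 storms; this is the sample size that reappears in the bootstrap simulations below.
length(ill)
## [1] 227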
truehist(ill, nbins = 20)
mean(ill)
## [1] 0.2244
sdev(ill) # no function called sdev! error!
## Error: could not find function "sdev"
sd(ill)
## [1] 0.3658
sqrt(var(ill))
## [1] 0.3658
# Moment estimators (well ... almost: var() uses the n - 1 normalization
# for the sample variance rather than n).
# So what? This is the estimator I'll study:
# let's call it the "almost moment estimator".
lambdahat <- mean(ill)/var(ill)
alphahat <- (mean(ill))^2/var(ill)
lambdahat # estimated rate parameter
## [1] 1.677
alphahat # estimated shape parameter
## [1] 0.3763
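For comparison, the exact moment estimators would use the variance with an n denominator instead of var()'s n - 1; with 227 observations the difference is negligible. A quick sketch (my own addition):
varn <- function(x) mean((x - mean(x))^2)  # sample variance with the n denominator
mean(ill)/varn(ill)                        # exact moment estimate of the rate
(mean(ill))^2/varn(ill)                    # exact moment estimate of the shape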
truehist(ill, nbins = 20)
xp <- seq(0, 2.2, length = 100)
lines(xp, dgamma(xp, shape = alphahat, rate = lambdahat))
# parametric bootstrap of (almost) moment estimators
alphahatsim <- numeric(1000)
lambdahatsim <- numeric(1000)
for (i in 1:1000) {
  illsim <- rgamma(227, shape = alphahat, rate = lambdahat)  # 227 = length(ill)
  alphahatsim[i] <- (mean(illsim))^2/var(illsim)
  lambdahatsim[i] <- mean(illsim)/var(illsim)
}
## Histograms of bootstrap estimates
truehist(alphahatsim, nbins = 20)
abline(v = alphahat, col="blue")
truehist(lambdahatsim, nbins = 20)
abline(v = lambdahat, col = "blue")
## And here are the bootstrap standard error, bias, and root mean squared error
sd(lambdahatsim) ## bootstrap estimated s.e. of the rate estimate (lambda)
## [1] 0.3543
mean(lambdahatsim) - lambdahat ## bootstrap estimated bias for rate
## [1] 0.09894
sqrt(mean((lambdahatsim - lambdahat)^2)) ## bootstrap estimated RMSE
## [1] 0.3677
sd(alphahatsim) ## bootstrap estimated s.e. of the shape estimate (alpha)
## [1] 0.06591
mean(alphahatsim) - alphahat ## bootstrap estimated bias for shape
## [1] 0.01738
sqrt(mean((alphahatsim - alphahat)^2)) ## bootstrap estimated RMSE
## [1] 0.06813
Now the same, with maximum likelihood and the parametric bootstrap.
# Now the MLE. I'll be lazy and use an existing R package
library(MASS)
# help(fitdistr)   # do "help" to find out how to use fitdistr!
fitdistr(ill, "gamma")
## shape rate
## 0.44080 1.96476
## (0.03376) (0.24744)
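Out of curiosity, here is a minimal hand-rolled version of the same fit, minimizing the negative log-likelihood with optim() on the log scale so that the parameters stay positive. This is only a sketch for illustration - fitdistr() is more careful and also reports standard errors - but its estimates should agree closely with the ones just printed.
## negative gamma log-likelihood, parametrized on the log scale
negloglik <- function(logpar, x) {
  -sum(dgamma(x, shape = exp(logpar[1]), rate = exp(logpar[2]), log = TRUE))
}
fit <- optim(log(c(alphahat, lambdahat)), negloglik, x = ill)  # start at the moment estimates
exp(fit$par)  # should be close to the fitdistr() estimates above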
result <- fitdistr(ill, "gamma")
str(result)
## List of 5
## $ estimate: Named num [1:2] 0.441 1.965
## ..- attr(*, "names")= chr [1:2] "shape" "rate"
## $ sd : Named num [1:2] 0.0338 0.2474
## ..- attr(*, "names")= chr [1:2] "shape" "rate"
## $ vcov : num [1:2, 1:2] 0.00114 0.00508 0.00508 0.06123
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "shape" "rate"
## .. ..$ : chr [1:2] "shape" "rate"
## $ loglik : num 185
## $ n : int 227
## - attr(*, "class")= chr "fitdistr"
typeof(result)
## [1] "list"
class(result)
## [1] "fitdistr"
names(result)
## [1] "estimate" "sd" "vcov" "loglik" "n"
ests <- result$estimate
ests
## shape rate
## 0.4408 1.9648
typeof(ests)
## [1] "double"
class(ests)
## [1] "numeric"
str(ests)
## Named num [1:2] 0.441 1.965
## - attr(*, "names")= chr [1:2] "shape" "rate"
ests["shape"]
## shape
## 0.4408
ests[1]
## shape
## 0.4408
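The standard errors shown in parentheses in the fitdistr() printout can be pulled out in the same way:
result$sd   # approximate standard errors of the shape and rate estimates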
alphahatMMsim <- numeric(1000)
lambdahatMMsim <- numeric(1000)
alphahatMLEsim <- numeric(1000)
lambdahatMLEsim <- numeric(1000)
ests <- fitdistr(ill,"gamma")$estimate
ests
## shape rate
## 0.4408 1.9648
lambdahat <- ests["rate"]
alphahat <- ests["shape"]
set.seed(170913)
for (i in 1:1000) {
  illsim <- rgamma(227, shape = alphahat, rate = lambdahat)
  alphahatMMsim[i] <- (mean(illsim))^2/var(illsim)
  lambdahatMMsim[i] <- mean(illsim)/var(illsim)
  estsMLEsim <- fitdistr(illsim, "gamma")$estimate
  alphahatMLEsim[i] <- estsMLEsim["shape"]
  lambdahatMLEsim[i] <- estsMLEsim["rate"]
}
warnings()
## NULL
par(mfrow = c(2,2))
truehist(alphahatMMsim)
abline(v = alphahat, col = "blue")
truehist(lambdahatMMsim)
abline(v = lambdahat, col = "blue")
truehist(alphahatMLEsim)
abline(v = alphahat, col = "blue")
truehist(lambdahatMLEsim)
abline(v = lambdahat, col = "blue")
truehist(alphahatMMsim, xlim=c(0.25,0.75), ylim=c(0,14),
breaks=seq(0.25,0.75,length=50))
abline(v = alphahat, col = "blue")
truehist(lambdahatMMsim, xlim=c(1,3.6), ylim=c(0,1.8),
breaks=seq(1,3.6,length=50))
abline(v = lambdahat, col = "blue")
truehist(alphahatMLEsim, xlim=c(0.25,0.75), ylim=c(0,14),
breaks=seq(0.25,0.75,length=50))
abline(v = alphahat, col = "blue")
truehist(lambdahatMLEsim, xlim=c(1,3.6), ylim=c(0,1.8),
breaks=seq(1,3.6,length=50))
abline(v = lambdahat, col = "blue")
rmse <- function(estimate, truth) sqrt(mean((estimate - truth)^2))
alphahat
## shape
## 0.4408
rmse(alphahatMMsim, alphahat)
## [1] 0.07215
rmse(alphahatMLEsim, alphahat)
## [1] 0.03458
lambdahat
## rate
## 1.965
rmse(lambdahatMMsim, lambdahat)
## [1] 0.3949
rmse(lambdahatMLEsim, lambdahat)
## [1] 0.2625
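So, judged by these bootstrap RMSEs, the MLE is roughly twice as precise as the moment estimator for the shape (0.07215/0.03458 is about 2.1) and about one and a half times as precise for the rate (0.3949/0.2625 is about 1.5). The ratios can be computed directly; they will of course vary a little with the bootstrap draws.
rmse(alphahatMMsim, alphahat)/rmse(alphahatMLEsim, alphahat)      # shape
rmse(lambdahatMMsim, lambdahat)/rmse(lambdahatMLEsim, lambdahat)  # rate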
par(mfrow = c(1,1))