Introduction to Statistics:

Some R illustrations from the first sections of Chapter 8 of Rice

1. The Berkson data on alpha-particles

berkson <- read.table("berkson.dat")
berkson
##     V1  V2    V3
## 1   2-  18  12.2
## 2    3  28  27.0
## 3    4  56  56.5
## 4    5 105  94.9
## 5    6 126 132.7
## 6    7 146 159.1
## 7    8 164 166.9
## 8    9 161 155.6
## 9   10 123 130.6
## 10  11 101  99.7
## 11  12  74  69.7
## 12  13  53  45.0
## 13  14  23  27.0
## 14  15  15  15.1
## 15  16   9   7.9
## 16 17+   5   7.1
berkson$V2
##  [1]  18  28  56 105 126 146 164 161 123 101  74  53  23  15   9   5
N <- sum(berkson$V2)
lambdahat <- sum((2:17)*berkson$V2)/sum(berkson$V2)
plot(2:17, berkson$V2)
lines(2:17, N*dpois(2:17, lambdahat))

plot of chunk unnamed-chunk-1

2. Illustration of ion channel data

Rice shows us what he calls a “smoothed histogram” of data from experiments on ion channels. It looks very much as though it is standard normally distributed - presumably he subtracted the sample mean and divided by the sample standard deviation in order to get observations whose mean is zero and standard deviation is zero!

He shows us what he calls a “smoothed histogram” but it seems to me that it is a what you get from a histogram by joining the mid-points of the tops of adjacent bars by straight line segments.

I simulate data from a standard normal and create the corresponding plot.

He says that his graph suggests some small amount of skewness in the data, in other words, some departure from the model of a normal distribution. However if we do the same simulation a number of times we can see that this amount of deviation is actually quite typical.

set.seed(123)
current <- rnorm(50000)
hist(current)
current.hist <- hist(current)
names(current.hist)
## [1] "breaks"   "counts"   "density"  "mids"     "xname"    "equidist"
breaks # doesn't exist, so error
## Error: object 'breaks' not found
current.hist$breaks
##  [1] -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5  2.0
## [15]  2.5  3.0  3.5  4.0  4.5
current.hist
## $breaks
##  [1] -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5  2.0
## [15]  2.5  3.0  3.5  4.0  4.5
## 
## $counts
##  [1]    1    4   49  245  835 2258 4641 7378 9649 9521 7484 4598 2216  819
## [15]  237   53   11    1
## 
## $density
##  [1] 0.00004 0.00016 0.00196 0.00980 0.03340 0.09032 0.18564 0.29512
##  [9] 0.38596 0.38084 0.29936 0.18392 0.08864 0.03276 0.00948 0.00212
## [17] 0.00044 0.00004
## 
## $mids
##  [1] -4.25 -3.75 -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25  0.25  0.75
## [12]  1.25  1.75  2.25  2.75  3.25  3.75  4.25
## 
## $xname
## [1] "current"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
lines(current.hist$mids, current.hist$counts)

plot of chunk unnamed-chunk-2

current.hist <- hist(current, freq = FALSE)
current.hist
## $breaks
##  [1] -4.5 -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5  2.0
## [15]  2.5  3.0  3.5  4.0  4.5
## 
## $counts
##  [1]    1    4   49  245  835 2258 4641 7378 9649 9521 7484 4598 2216  819
## [15]  237   53   11    1
## 
## $density
##  [1] 0.00004 0.00016 0.00196 0.00980 0.03340 0.09032 0.18564 0.29512
##  [9] 0.38596 0.38084 0.29936 0.18392 0.08864 0.03276 0.00948 0.00212
## [17] 0.00044 0.00004
## 
## $mids
##  [1] -4.25 -3.75 -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25  0.25  0.75
## [12]  1.25  1.75  2.25  2.75  3.25  3.75  4.25
## 
## $xname
## [1] "current"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"
lines(current.hist$mids, current.hist$density)

plot of chunk unnamed-chunk-2

plot(current.hist$mids, current.hist$density, type="l")
x <- seq(from = -4, to = 4, length = 100)
lines(x, dnorm(x))

plot of chunk unnamed-chunk-2

par(mfrow = c(3, 4)) # 12 plots per graphic, arranged in 3 rows, 4 columns
for (i in (1:12)) {
    current <- rnorm(50000)
    current.hist <- hist(current, plot=F)
    plot(current.hist$mids, current.hist$density, 
         type = "l", xlim = c(-4, 4))
    lines(x, dnorm(x), col = "red")
}

plot of chunk unnamed-chunk-2

## I do it again with smaller sample size so you can better see the deviations

for (i in (1:12)) {
    current <- rnorm(500)
    current.hist <- hist(current, plot = FALSE)
    plot(current.hist$mids, current.hist$density, 
         type="l", xlim = c(-3, 3), ylim = c(0, 0.5))
    lines(x, dnorm(x), col = "red")
}

plot of chunk unnamed-chunk-2

par(mfrow = c(1, 1)) # Back to one plot per graphic

3. Method of moments for gamma distribution

) Next come some examples of the method of moments, the gamma distribution, and the parametric bootstrap.

## First just a quick look at simulated data

trueShape <- 0.8
trueRate <- 1/40
data <- rgamma(200, shape = trueShape, rate = trueRate)
hist(data)

plot of chunk unnamed-chunk-3

## I prefer my histogram scaled to have area under the curve = one
hist(data, freq = FALSE)
x <- seq(from = 0, to = 200, length = 200)
lines(x, dgamma(x, trueShape, trueRate))

plot of chunk unnamed-chunk-3

hist(data, freq = FALSE, breaks = 20) ## "hist" tends to give too few bars
x <- seq(from = 0, to = 250, length = 200)
lines(x, dgamma(x, shape = trueShape, rate = trueRate))

plot of chunk unnamed-chunk-3

library(MASS) # this package has a better "histogram" function

truehist(data, nbins = 20)
lines(x, dgamma(x, shape = trueShape, rate = trueRate))

plot of chunk unnamed-chunk-3

# Illinois storm data

ill60 <- scan("ill60.c10")
ill61 <- scan("ill61.c10")
ill62 <- scan("ill62.c10")
ill63 <- scan("ill63.c10")
ill64 <- scan("ill64.c10")
ill <- c(ill60,ill61,ill62,ill63,ill64)
summary(ill)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.001   0.010   0.070   0.224   0.270   2.130
ill
##   [1] 0.020 0.001 0.001 0.120 0.080 0.420 1.720 0.050 0.010 0.010 0.003
##  [12] 0.001 0.003 0.270 0.001 0.060 0.050 2.130 0.040 1.100 0.020 0.001
##  [23] 0.140 0.080 0.210 0.070 0.320 0.240 0.290 0.001 0.290 1.130 0.003
##  [34] 0.010 0.190 0.002 0.010 0.040 0.002 0.070 0.450 0.010 0.180 0.670
##  [45] 0.003 0.010 0.040 0.002 0.490 0.020 0.020 0.340 0.140 0.370 0.330
##  [56] 0.330 0.350 0.010 0.500 0.760 1.060 0.002 0.060 0.160 0.270 0.250
##  [67] 0.290 0.020 0.050 0.460 0.070 0.410 0.020 0.080 0.210 0.010 0.440
##  [78] 0.020 0.050 0.110 1.500 0.003 0.180 0.010 0.002 0.240 0.010 0.750
##  [89] 0.010 0.140 0.130 0.010 0.010 0.270 0.450 1.780 0.250 0.240 0.004
## [100] 0.210 0.170 0.830 0.150 0.030 0.030 0.500 0.040 0.090 0.040 0.060
## [111] 0.060 0.120 0.003 0.003 0.400 0.020 0.510 0.003 0.020 0.020 0.020
## [122] 0.010 0.001 0.140 0.001 0.100 0.010 1.090 0.010 0.002 0.001 0.840
## [133] 0.030 0.350 0.070 0.001 0.002 0.002 0.200 0.060 0.140 0.010 0.020
## [144] 0.020 0.002 0.001 0.550 0.130 0.190 2.100 0.090 0.350 0.790 0.320
## [155] 1.350 0.170 0.020 0.002 0.010 0.250 0.230 0.170 0.010 0.020 0.001
## [166] 0.010 0.020 0.110 0.210 1.260 0.010 0.730 0.100 0.090 0.007 0.360
## [177] 0.770 0.210 1.270 0.070 0.080 0.160 0.260 0.010 0.230 0.080 0.020
## [188] 0.010 0.290 0.010 0.010 0.070 0.400 0.002 0.003 0.010 0.090 0.160
## [199] 0.040 0.270 0.730 0.410 0.030 0.120 0.030 1.040 0.060 0.090 0.730
## [210] 0.040 0.160 0.590 0.003 0.002 0.020 0.004 0.010 0.001 0.060 0.620
## [221] 0.010 0.520 0.110 0.003 0.600 0.002 0.050
truehist(ill, nbins = 20)
mean(ill)
## [1] 0.2244
sdev(ill)   # no function called sdev! error!
## Error: could not find function "sdev"
sd(ill)
## [1] 0.3658
sqrt(var(ill))
## [1] 0.3658
# Moment estimators (well ... almost ...
#  ... "wrong" normalization for sample variance)
# So what? This is the estimator I'll study:
# Let's call it the "almost moment estimator"

lambdahat <- mean(ill)/var(ill)
alphahat <- (mean(ill))^2/var(ill)
lambdahat # estimated rate parameter
## [1] 1.677
alphahat # estimated shape parameter
## [1] 0.3763
truehist(ill, nbins = 20)
xp <- seq(0, 2.2, length = 100)
lines(xp, dgamma(xp, shape = alphahat, rate = lambdahat))

plot of chunk unnamed-chunk-3

# parametric bootstrap of (almost) moment estimators

alphahatsim <- numeric(1000)
lambdahatsim <- numeric(1000)

for (i in 1:1000) {
    illsim <- rgamma(227,alphahat,lambdahat)
    alphahatsim[i] <- (mean(illsim))^2/var(illsim)
    lambdahatsim[i] <- mean(illsim)/var(illsim)
}

## Histograms of boostrap estimates

truehist(alphahatsim, nbins = 20)

abline(v = alphahat, col="blue")

plot of chunk unnamed-chunk-3

truehist(lambdahatsim, nbins = 20)

abline(v = lambdahat, col = "blue")

plot of chunk unnamed-chunk-3

## And here are bootstrap standard error, bias, root mean square error

sd(lambdahatsim) ## bootstrap estimated s.e. rate estimate (lambda)
## [1] 0.3543
mean(lambdahatsim) - lambdahat  ## bootstrap estimated bias for rate
## [1] 0.09894
sqrt(mean((lambdahatsim - lambdahat)^2)) ## bootstrap estimated RMSE
## [1] 0.3677
sd(alphahatsim)  ## bootstrapsd estimated s.e. shape estimate (alpha)
## [1] 0.06591
mean(alphahatsim) - alphahat  ## bootstrap estimated bias for shape
## [1] 0.01738
sqrt(mean((alphahatsim - alphahat)^2)) ## bootstrap estimated RMSE
## [1] 0.06813

4. Maximum likelihood for gamma distribution

Now the same, with maximum likelihood and the parametric bootstrap.

# Now the MLE. I'll be lazy and use an existing R package

library(MASS)
#  help(fitdistr)  Do "help" to find out how to use fitdistr!
fitdistr(ill, "gamma")
##     shape     rate  
##   0.44080   1.96476 
##  (0.03376) (0.24744)
result <- fitdistr(ill, "gamma")
str(result)
## List of 5
##  $ estimate: Named num [1:2] 0.441 1.965
##   ..- attr(*, "names")= chr [1:2] "shape" "rate"
##  $ sd      : Named num [1:2] 0.0338 0.2474
##   ..- attr(*, "names")= chr [1:2] "shape" "rate"
##  $ vcov    : num [1:2, 1:2] 0.00114 0.00508 0.00508 0.06123
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "shape" "rate"
##   .. ..$ : chr [1:2] "shape" "rate"
##  $ loglik  : num 185
##  $ n       : int 227
##  - attr(*, "class")= chr "fitdistr"
typeof(result)
## [1] "list"
class(result)
## [1] "fitdistr"
names(result)
## [1] "estimate" "sd"       "vcov"     "loglik"   "n"
ests <- result$estimate

ests
##  shape   rate 
## 0.4408 1.9648
typeof(ests)
## [1] "double"
class(ests)
## [1] "numeric"
str(ests)
##  Named num [1:2] 0.441 1.965
##  - attr(*, "names")= chr [1:2] "shape" "rate"
ests["shape"]
##  shape 
## 0.4408
ests[1]
##  shape 
## 0.4408
alphahatMMsim <- numeric(1000)
lambdahatMMsim <- numeric(1000)
alphahatMLEsim <- numeric(1000)
lambdahatMLEsim <- numeric(1000)

ests <- fitdistr(ill,"gamma")$estimate
ests
##  shape   rate 
## 0.4408 1.9648
lambdahat <- ests["rate"]
alphahat <- ests["shape"]

set.seed(170913)
for (i in 1:1000) {
    illsim <- rgamma(227, alphahat, lambdahat)
    alphahatMMsim[i] <- (mean(illsim))^2/var(illsim)
    lambdahatMMsim[i] <- mean(illsim)/var(illsim)
    estsMLEsim <- fitdistr(illsim,"gamma")$estimate
    alphahatMLEsim[i] <- estsMLEsim["shape"]
    lambdahatMLEsim[i] <- estsMLEsim["rate"]
}

warnings()
## NULL
par(mfrow = c(2,2))
truehist(alphahatMMsim)
abline(v = alphahat, col = "blue")
truehist(lambdahatMMsim)
abline(v = lambdahat, col = "blue")
truehist(alphahatMLEsim)
abline(v = alphahat, col = "blue")
truehist(lambdahatMLEsim)
abline(v = lambdahat, col = "blue")

plot of chunk unnamed-chunk-4

truehist(alphahatMMsim, xlim=c(0.25,0.75), ylim=c(0,14),
    breaks=seq(0.25,0.75,length=50))
abline(v = alphahat, col = "blue")
truehist(lambdahatMMsim, xlim=c(1,3.6), ylim=c(0,1.8),
    breaks=seq(1,3.6,length=50))
abline(v = lambdahat, col = "blue")
truehist(alphahatMLEsim, xlim=c(0.25,0.75), ylim=c(0,14),
    breaks=seq(0.25,0.75,length=50))
abline(v = alphahat, col = "blue")
truehist(lambdahatMLEsim, xlim=c(1,3.6), ylim=c(0,1.8),
    breaks=seq(1,3.6,length=50))
abline(v = lambdahat, col = "blue")

plot of chunk unnamed-chunk-4

rmse <- function(estimate, truth) sqrt(mean((estimate - truth)^2))

alphahat
##  shape 
## 0.4408
rmse(alphahatMMsim, alphahat)
## [1] 0.07215
rmse(alphahatMLEsim, alphahat)
## [1] 0.03458
lambdahat
##  rate 
## 1.965
rmse(lambdahatMMsim, lambdahat)
## [1] 0.3949
rmse(lambdahatMLEsim, lambdahat)
## [1] 0.2625
par(mfrow = c(1,1))