DATA 605 Week 9 Discussion

Chapter 9 Section 3 Exercise 9

How large must \(n\) be before \(S_n = X_1+X_2+...+X_n\) is approximately normal? This number is often surprisingly small. Let us explore this question with a computer simulation. Choose \(n\) numbers from \([0, 1]\) with probability density \(f(x)\), where \(n = 3\), \(6\), \(12\), \(20\), and \(f(x)\) is each of the densities in Exercise 7. Compute their sum \(S_n\), repeat this experiment 1000 times, and make up a bar graph of 20 bars of the results. How large must \(n\) be before you get a good fit?

Probability density functions from Exercise 7:

\(f(x) = 1\)
\(f(x) = 2x\)
\(f(x) = 3x^2\)

Solution

Per note on page 361, \(X\) can be simulated using \(X=F^{-1}(rnd)\).

For \(x \in [0,1]\),

\(f(t) = 1\), \(F(X)=\int_{0}^{x} 1 dt = x\), \(F^{-1}(x)=x\)
\(f(t) = 2t\), \(F(X)=\int_{0}^{x} 2t dt = x^2\), \(F^{-1}(x)=\sqrt{x}\)
\(f(t) = 3t^2\), \(F(X)=\int_{0}^{x} 3t^2 dt = x^3\), \(F^{-1}(x)=\sqrt[3]{x}\)

Plot experiments with various values for \(n\) for each density function.

Density Function \(f(x) = 1\) (Uniform Distribution)

trials <- 1000

getsum <- function(n, trials) {
  sum <- rep(0, trials)
  for (i in 1:trials) {
    x <- runif(n,0,1)
    sum[i] <- sum(x)
  }
  return(sum)
}

par(mfrow=c(3,2))

n <- 1
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 1")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 2
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 2")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 3
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 3")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 6
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 6")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 12
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 12")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 20
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 20")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

Density Function \(f(x) = 2x\)

trials <- 1000

getsum <- function(n, trials) {
  sum <- rep(0, trials)
  for (i in 1:trials) {
    x <- runif(n,0,1)
    x <- sqrt(x)
    sum[i] <- sum(x)
  }
  return(sum)
}

par(mfrow=c(3,2))

n <- 1
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 1")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 2
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 2")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 3
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 3")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 6
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 6")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 12
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 12")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 20
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 20")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

Density Function \(f(x) = 3x^2\)

trials <- 1000

getsum <- function(n, trials) {
  sum <- rep(0, trials)
  for (i in 1:trials) {
    x <- runif(n,0,1)
    x <- x^(1/3)
    sum[i] <- sum(x)
  }
  return(sum)
}

par(mfrow=c(3,2))

n <- 1
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 1")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 2
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 2")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 3
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 3")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 6
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 6")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 12
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 12")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

n <- 20
sum <- getsum(n, trials)
hist(sum, breaks=50, xlim=c(mean(sum)-3*sd(sum),mean(sum)+3*sd(sum)), 
     prob=TRUE, xlab="", ylab="", main="n = 20")
curve(dnorm(x, mean=mean(sum), sd=sd(sum)),add=TRUE)

Notes

I have added \(n=1\) and \(n=2\) to better illustrate the experiment. For \(n=1\), since there is only one element, we essentially get probability distribution described by the density function. It is clearly a bad fit for the normal curve. With \(n=2\), we are starting to see that we are moving towards desired goal, but the fit is still not good. With \(n=3\), the fit is not ideal, but it is much better. There are some variations based on density function. For example, uniform distribution starts fitting the normal density curve earlier than other distributions. I would say that by \(n=6\), the fit is pretty good for all functions in this problem.