Setting up WD

rm(list = ls()) # Clear environment
gc()            # Clear unused memory
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 533803 28.6    1190437 63.6         NA   669428 35.8
## Vcells 986611  7.6    8388608 64.0      16384  1851749 14.2
cat("\f")       # Clear the console
# dev.off()     # Clear par(mfrow=c(3,2)) type commands 

Installing Packages

library("psych")

Part I: Application of the CLT - Sample Mean of Binomial

Creating Random Variables

##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis  se
## X1    1 150 39.4 4.91     39   39.51 4.45  27  53    26 -0.08     0.01 0.4

The rbinom function was used to generate a random sample from a given binomial distribution. The distribution is governed by the parameters of sample size (n=150) and probability (p=.4). The size corresponds to the number of trials (100).

The mean of the binomial distribution is 39.4, with a standard deviation of 4.91 (measure of variation around the mean).

Storing mean (mu) and standard deviation (sigma) in new vectors

mu <- mean(random.binomial)
sigma <- sd(random.binomial)

Plotting Binomial Distribution

hist(x = random.binomial,
     main = "Binomial Distribution (n = 150, p = .4)",
     xlab = ""
     )

Creating empty matrix to store means

#creating emty matrix to store means generated from a random sample

z <- matrix(data = rep(x  = 0, times = 10000), 
            nrow = 10000, 
            ncol = 1)

describe(z)
##    vars     n mean sd median trimmed mad min max range skew kurtosis se
## X1    1 10000    0  0      0       0   0   0   0     0  NaN      NaN  0

Taking a random sample of 1,000 observations from the binomial distribution

for (i in 1:10000){ 
    z[i,] <- mean(sample( x = random.binomial, #random.binomial vector used to generate distribution
                          size = 1000,    #taking a random sample of 1000 observations
                          replace = TRUE  #binomial sampling must be with replacement
                          ))}
                              
describe(z)
##    vars     n mean   sd median trimmed  mad   min   max range  skew kurtosis se
## X1    1 10000 39.4 0.16   39.4    39.4 0.16 38.81 39.94  1.13 -0.03    -0.03  0

Plotting histogram of sample mean

hist(z, xlab = "", main = "Histogram of Sample Mean (n = 1000)")

Testing the CLT using increasing sample sizes

#expanding matrix
z.1 <- matrix(data = rep(x = 0, times = 40000), 
                       nrow = 10000, 
                       ncol = 4)

n <- c(50, 100, 1000, 5000) #sample size

for (j in 1:4){  # indexing columns of matrix, each corresponds to a different sample size
 for (i in 1:10000){  # indexing rows of matrix
      z.1[i,j] <- mean(sample( x = random.binomial,  #computing mean/assigning to row
                              size    = n[j], 
                              replace = TRUE
      ))}}

colnames(z.1) <- c("n=50","n=100","n=1000","n=5000")
summary(z.1)
##       n=50           n=100           n=1000          n=5000     
##  Min.   :36.52   Min.   :37.52   Min.   :38.85   Min.   :39.18  
##  1st Qu.:38.92   1st Qu.:39.07   1st Qu.:39.30   1st Qu.:39.35  
##  Median :39.40   Median :39.40   Median :39.40   Median :39.40  
##  Mean   :39.40   Mean   :39.40   Mean   :39.40   Mean   :39.40  
##  3rd Qu.:39.88   3rd Qu.:39.73   3rd Qu.:39.51   3rd Qu.:39.45  
##  Max.   :42.38   Max.   :41.32   Max.   :39.97   Max.   :39.66

Plotting distribution

par(mfrow=c(3,2))

length(random.binomial)
## [1] 150
hist(x    = random.binomial, 
     main = "Histogram of Uniform Distribution, N=10,000",
     xlab = ""
     )

   for (k in 1:4){
    hist(x    = z.1[,k],     # matrix column 1-4 which contain sample means of randomly chosen samples from uniform dist
         main = "Histogram of Sample Mean of Uniform Distribution", 
         xlim = c(36, 42), # plot the domain of X axis from x=36 to x=42 only
         xlab = paste0("Sample Size ", n[k], " (Column ", k, " from matrix)") 
         )}  

Part II: Application of the CLT - Sample Median of Binomial

Creating empty matrix to store medians

#creating emty matrix to store means generated from a random sample

z.2 <- matrix(data = rep(x  = 0, times = 10000), 
            nrow = 10000, 
            ncol = 1)

describe(z.2)
##    vars     n mean sd median trimmed mad min max range skew kurtosis se
## X1    1 10000    0  0      0       0   0   0   0     0  NaN      NaN  0

Taking a random sample of 1,000 observations from the binomial distribution

for (i in 1:10000) {
  z.2[i,] <- median(sample(x = random.binomial, size = 1000, replace = TRUE))
}
                              
describe(z.2)
##    vars     n  mean  sd median trimmed mad min max range skew kurtosis se
## X1    1 10000 39.21 0.4     39   39.14   0  39  40     1 1.44     0.11  0
hist(z.2, xlab = "", main = "Histogram of Sample Median (n = 1000)")

z.3 <- matrix(data = rep(x = 0, times = 40000), 
                       nrow = 10000, 
                       ncol = 4)

n <- c(50, 100, 1000, 5000) #sample size

for (j in 1:4){  # indexing columns of matrix, each corresponds to a different sample size
 for (i in 1:10000){  # indexing rows of matrix
      z.3[i,j] <- median(sample( x = random.binomial,  #computing mean/assigning to row
                              size    = n[j], 
                              replace = TRUE
      ))}}

colnames(z.3) <- c("n=50","n=100","n=1000","n=5000")
summary(z.3)
##       n=50          n=100           n=1000          n=5000     
##  Min.   :36.5   Min.   :37.00   Min.   :39.00   Min.   :39.00  
##  1st Qu.:39.0   1st Qu.:39.00   1st Qu.:39.00   1st Qu.:39.00  
##  Median :39.0   Median :39.00   Median :39.00   Median :39.00  
##  Mean   :39.4   Mean   :39.37   Mean   :39.21   Mean   :39.03  
##  3rd Qu.:40.0   3rd Qu.:40.00   3rd Qu.:39.00   3rd Qu.:39.00  
##  Max.   :43.0   Max.   :42.00   Max.   :40.00   Max.   :40.00
par(mfrow=c(3,2))

length(random.binomial)
## [1] 150
hist(x    = random.binomial, 
     main = "Histogram of Binomial Distribution, N=10,000",
     xlab = ""
     )

   for (k in 1:4){
    hist(x    = z.3[,k],     # matrix column 1-4 which contain sample means of randomly chosen samples from uniform dist
         main = "Histogram of Sample Median of Binomial Distribution", 
         xlim = c(36, 42), # plot the domain of X axis from x=38 to x=42 only
         xlab = paste0("Sample Size ", n[k], " (Column ", k, " from matrix)") 
         )}