#title: "Simulation Exercise"
#author: "vinit pandey"
#date: "July 15, 2018"
#output: pdf_document


#Overview
#The purpose of this data analysis is to investigate the exponential distribution and compare it to the Central Limit Theorem. For this analysis, the lambda will be set to 0.2 for all of the simulations. 
#This investigation will compare the distribution of averages of 40 exponentials over 1000 simulations.

##Simulations

#We will compute 1000 averages of 40 exponentials
mns = NULL
lambda <- 0.2
n <- 1000
for (i in 1 : n) mns = c(mns, mean(rexp(40, lambda)))

##Sample Mean versus Theoretical Mean

#What is the sample mean of the distribution?
  

sample.mean <- mean(mns)
sample.mean
## [1] 4.98426
#The theoretical mean of the exponential distribution is 1/lambda:
  

theoretical.mean <- 1/lambda
theoretical.mean
## [1] 5
#They differ by less than 1 percent
  
  
abs(sample.mean - theoretical.mean) / theoretical.mean
## [1] 0.00314802
#Graph for this, blue for the statistical mean and red for the theoretical mean.
hist(mns)
abline(v = sample.mean, col = "blue", lwd = 2)
abline(v = theoretical.mean, col = "red", lwd = 2)

#Sample and theoretical variances
#What is the sample variance of the distribution?
  

vars = NULL
for (i in 1 : 1000) vars = c(vars, var(rexp(40, lambda)))
sample.var <- mean(vars)
sample.var
## [1] 24.74762
#The theoretical standard deviation of the exponential distribution is 1/lambda. So the variance is the square of this:
  

theoretical.var <- (1/lambda)^2
theoretical.var
## [1] 25
#They also differ by less than 1%:
  

abs(sample.var - theoretical.var) / theoretical.var
## [1] 0.01009513
#Let's graph this using red for the sample standard deviation and red for the theoretical standard deviation.
hist(vars)
abline(v = sample.var, col = "blue", lwd = 2)
abline(v = theoretical.var, col = "red", lwd = 2)




#Distribution
# We now illustrate that the distribution is approximately standard normal.
library("ggplot2")

nosim <- 1000
cfunc <- function(x, n) sqrt(n) * (mean(x) - theoretical.mean) / sqrt(theoretical.var) 
dat <- data.frame(
x = c(apply(matrix(rexp(nosim * 10, lambda), nosim), 1, cfunc, 10),
apply(matrix(rexp(nosim * 20, lambda), nosim), 1, cfunc, 20),
apply(matrix(rexp(nosim * 40, lambda), nosim), 1, cfunc, 40)),
size = factor(rep(c(10, 20, 40), rep(nosim, 3))))
g <- ggplot(dat, aes(x = x, fill = size)) + 
geom_histogram(binwidth=.3, colour = "black", aes(y = ..density..)) 
g <- g + stat_function(fun = dnorm, size = 2)
g + facet_grid(. ~ size)