Investigation of the exponential distribution in R and comparison with the Central Limit Theorem. The investigation is done with 1000 sample of exponentiel distribution. Each sample contains 40 observations.
knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
my_plot_fun <- function(my_data) {
my_data <- as.data.frame(my_data)
ggplot(my_data,aes(x = my_data)) + theme_classic() +
theme(plot.title = element_text(color="#ADA717", size=18, face="bold.italic",hjust=0.5),
axis.title.x = element_text(color="#993333", size=18, face="bold"),
axis.text.x = element_text(color="#993333", size=14,vjust = 0),
axis.title.y = element_text(color="darkgreen", size=18, face="bold"),
axis.text.y = element_text(face="bold", color="darkgreen", size=14),
legend.text = element_text(size=12),
legend.title = element_text(size=20))}
set.seed(10)
# Set lambda = 0.2 for all of the simulations.
lambda <- .2
# Number of total simulations/samples
N <- 1000
# Number of observations in each sample
numb <- 40
my_means = NULL
# By a for loop, add the mean of exponential distribution of 40 observations and a rate of .2
for (i in 1 : N) {
my_means <- c(my_means, mean(rexp(n = numb,rate = lambda)))
}
# Calculation of the average of all the thousand means
mean_exp <- mean(my_means)
print(paste("The maximum is : ",round(max(my_means),4),". The minimum is : ",
round(min(my_means),4),". The mean is : ",round(mean(my_means),4),sep = ""))
## [1] "The maximum is : 8.1401. The minimum is : 3.1262. The mean is : 5.0451"
count <- sum(my_means-qnorm(p = .975,mean = 5,sd = (1/lambda)/sqrt(numb))>0)
count <- count + sum(my_means-qnorm(p = .025,mean = 5,sd = (1/lambda)/sqrt(numb))<0)
print(count/1000*100)
## [1] 4.5
mpt <- my_plot_fun(my_means) +
scale_x_continuous(breaks=seq(round(min(my_means))-1, round(max(my_means))+1, by = .5),
limits =c(2.5,9)) +
geom_histogram(alpha=0.6,binwidth=.15,colour="black", fill="#935430") +
geom_vline(aes(xintercept=mean_exp, colour = "Simulation mean",
linetype="Simulation mean"), size=1) +
geom_vline(aes(xintercept=5, colour="Normal mean", linetype="Normal mean"), size=1)
## Warning: Removed 2 rows containing missing values (geom_bar).
set.seed(10)
my_variances = NULL
# By a for loop, add the variance of exponential distribution of 40 observations and a rate of .2
for (i in 1 : N) {
my_variances <- c(my_variances, var(rexp(n = numb,rate = lambda)))
}
mpt2 <- my_plot_fun(my_variances) +
xlim(0,100)+
geom_histogram(alpha=0.6,binwidth=2,colour="black", fill="#235440") +
geom_vline(aes(xintercept=25, colour = "Theoretical variance", linetype="Theoretical variance"),
size=1)
## Warning: Removed 2 rows containing missing values (geom_bar).
var_exp <- var(my_means)
The theoretical variance of one exponential distribution is
(1/lambda)^2=25. And the theoretical variance of the
averages is (1/lambda)^2/numb=0.625.
print(paste("The theoretical variance for averages is :",(1/lambda)^2/numb,", against
the simulation variance : ",var_exp,"."))
## [1] "The theoretical variance for averages is : 0.625 , against \n the simulation variance : 0.637254390249431 ."
The values are very close, this confirm the CLT. mpt3 <- my_plot_fun(my_means) +
scale_x_continuous(breaks=seq(round(min(my_means))-1,round(max(my_means)) +1, by = .5),
limits =c(1,9) ) +
geom_histogram(aes(y=..density..),alpha=0.6,binwidth=.15,colour="black")+
stat_function(fun = dnorm,n = 100, args = list(mean = 5, sd = (1/lambda)/sqrt(numb)),
geom = 'area', alpha = 0.5,
aes(colour = "Normal distribution",fill="Normal distribution"))+
geom_density(alpha=.5,aes(colour = "Simulated data",fill="Simulated data"))
## Warning: Removed 2 rows containing missing values (geom_bar).