The report is written as Peer Assignment in the course “Statistical Inference” at Johns Hopkins University“. The report investigate the exponential distribution and illustrate via simulation the properties of the distribution of the mean and variance.
* the sample mean and compare it to the theoretical mean of exponential
* the sample variance and compare it to the theoretical variance
* the distribution of mean is approximately normal
The exponential distribution is used to indicate the survival probability of products. Look at Appendix A).
my_lambda <- 1/5; my_size <- 40; my_n <- c( 100, 1000, 5000 ); my_init_seed <- 111124
Estimator of Mean: mean(X) := 1/n * summe( X(i) ) Note: mean(X) is gamma distributed. Look at Appendix B).
Look at Appendix E) call R Function simulate_means()
print( summary_samples, format="html", digits=3 )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## s_sam_100 3.47 4.41 4.91 5.02 5.55 8.24
## s_sam_1000 2.76 4.38 4.91 4.97 5.46 8.24
## s_sam_5000 2.68 4.43 4.95 4.99 5.49 8.24
multiplot( p1, p2, p3, cols=3 )
Look at Appendix E) call R Function simulate_variances()
print( summary_var_samples, format="html", digits=3 )
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## svs_100 7.81 17.2 22.2 24.7 29.2 66.6
## svs_1000 6.96 17.1 22.4 24.8 30.0 134.2
## svs_5000 4.64 17.0 22.6 24.9 30.0 134.2
multiplot( v1, v2, v3, cols=3 )
The central limit theorem states that the distribution of the sum (or average) of a large number of independent, identically distributed variables will be approximately normal.
Suppose X1, X2, … , Xn i.i.d exponential. The distributation of the random variable Zn := ( lambda * n * (X1 + …+ Xn)/n - n ) / sqrt(n) converges to the standard normal distribution as n converges to infinity.
Look at Appendix C) Q-Q-plot and Histogram of Sample Mean and Shapiro-Wilk normality test.
H0: mean is equal to 0
H1: mean is not equal to 0
z_mns <- (mns2$mns - 1/my_lambda) / ( 1/my_lambda * sqrt(1000))
sample_1000 <- t.test(z_mns, mu=0, alt="two.sided")
## print table with confidence intervals : print( sample_1000, format="html", digits=3 )
mean of x: -0.00019 in confidence interval: [ -0.000503, 0.000123 ] zum level alpha equal 0.95
The mean of exponential distribution is 1/lambda, the standard deviation is also 1/lambda and the variance is 1/(lambda*lambda).
X ~ Exp(lambda) with desity function = lambda * exp{ - lambda * x }
Hypothese Sample mean (size 1000) of exponential is approximately normality distribute.
## check normal of mns2$mns
par(mfrow=c(1,2), mex=0.8)
qqnorm( mns2$mns )
hist( mns2$mns, breaks=20, main="Histogram (size = 1000)" )
shapiro_t_1000 <- shapiro.test(mns2$mns)
## print table with confidence intervals
print( shapiro_t_1000, format="html", digits=3 )
##
## Shapiro-Wilk normality test
##
## data: mns2$mns
## W = 1, p-value = 2e-08
sample_100 <- t.test(mns1$mns, mu=1/my_lambda, alt="two.sided")
sample_1000 <- t.test(mns2$mns, mu=1/my_lambda, alt="two.sided")
sample_5000 <- t.test(mns3$mns, mu=1/my_lambda, alt="two.sided")
samples_conf.int <- as.data.frame( rbind( sample_100, sample_1000, sample_5000 ) )
#### print table with confidence intervals
print( samples_conf.int, format="html", digits=3 )
## statistic parameter p.value conf.int estimate null.value
## sample_100 0.212 99 0.832 4.85, 5.19 5.02 5
## sample_1000 -1.19 999 0.233 4.92, 5.02 4.97 5
## sample_5000 -1.18 4999 0.237 4.97, 5.01 4.99 5
## alternative method data.name
## sample_100 two.sided One Sample t-test mns1$mns
## sample_1000 two.sided One Sample t-test mns2$mns
## sample_5000 two.sided One Sample t-test mns3$mns
simulate_variances <- function() {
for ( idy in 1:3 ) {
set.seed(my_init_seed)
mns <- NULL; pl <- NULL
for ( i in 1 : my_n[idy] ) { mns = c( mns, mean(rexp(n=my_size, rate=my_lambda )) ) }
## histogram
mns<-data.frame( mns )
pl<-ggplot( mns, aes( x=mns ) )
pl<-pl+geom_histogram(aes(y=..density..),binwidth=0.2,col="black",fill="white",na.rm=T)
pl<-pl+geom_density( alpha=.2, fill="#FF6666" )
pl<-pl+geom_vline( xintercept = 1/my_lambda, colour="blue" )
pl<-pl+xlim(1, 9) + ylim(0, 0.6)
pl<-pl+ggtitle( paste( "Mean of", my_n[idy], "Sample" ) )
pl<-pl+xlab( "Mean" ) + ylab( "Frequency / Density" )
if ( idy == 1 ) { p1 <- pl; mns1 <- mns } else { if ( idy == 2 ) {
p2 <- pl; mns2 <- mns } else { p3 <- pl; mns3 <- mns } }
}
s_sam_100<-summary(mns1$mns); s_sam_1000<-summary(mns2$mns); s_sam_5000<-summary(mns3$mns);
summary_samples <- as.data.frame( rbind( s_sam_100, s_sam_1000, s_sam_5000 ) )
}
simulate_variances <- function() {
for ( idy in 1:3 ) {
set.seed( my_init_seed ); vari <- NULL; vl <- NULL
for ( i in 1 : my_n[idy] ) { vari=c(vari,sd(rexp(n=my_size,rate=my_lambda))^2) }
## histogram
vari <- data.frame( vari )
vl<-ggplot( vari, aes( x=vari ) )
vl<-vl+geom_histogram(aes(y=..density..),binwidth=2,col="black",fill="white",na.rm=T)
vl<-vl+geom_density( alpha=.2, fill="#FF6666", na.rm=TRUE )
vl<-vl+geom_vline( xintercept = 1/my_lambda^2, colour="blue" )
vl<-vl+xlim(0, 100) + ylim(0, 0.07)
vl<-vl+ggtitle( paste( "Variance of", my_n[idy], "Sample" ) )
vl<-vl+xlab( "Variance" ) + ylab( "Frequency / Density" )
if ( idy == 1 ) { v1 <- vl; vari1 <- vari } else { if ( idy == 2 ) {
v2 <- vl; vari2 <-vari } else { v3 <- vl; vari3 <- vari } }
}
svs_100<-summary(vari1$vari); svs_1000<-summary(vari2$vari); svs_5000<-summary(vari3$vari);
summary_var_samples <- as.data.frame( rbind( svs_100, svs_1000, svs_5000 ) )
}