Course Project 1

1. Title

Exponential distribution approaches to normal form with increasing sample sizes

(By Jihan Yu)

2. Overview

Central Limit Theorem means that the distribution of averages of iid(identically ideally distributed) variables becomes that of a standard normal as the sample size increases. In the exponential distribution, the sample means & variances approach those of normal one with increasing the number of samples. And the distribution of means or variances become similar to normal one.

3. Simulations and Results - Means : Sample vs Theoretical

Prepare data from exponential distribution

declare required library & assign the parameters
define the function which calculates the means and variances according to the number of samples

workingpath <- "C:\\Users\\MED1\\Stat_Inf"
setwd(workingpath)
library(nortest)

lambda <- 0.2;  n <- 40
set.seed(2020)

mean.var <- function(lambda, n, iter){
    exp.vector <- NULL
    mns <- NULL;  vrs <- NULL;
    for(i in c(1:iter)){
        exp.vector <- rexp(n, rate=lambda)
        mns <- c(mns, mean(exp.vector))
        vrs <- c(vrs, var(exp.vector))
    }
    return(data.frame(mns, vrs))
}

res.10 <- mean.var(lambda, n, 10)
res.100 <- mean.var(lambda, n, 100)
res.1000 <- mean.var(lambda, n, 1000)
res.10000 <- mean.var(lambda, n, 10000)

Sample means versus theoretical means

compare the sample menans versus theoretical means as the sample numbers (10, 100, 1000, 10000)

mean.theo <- 1/lambda
mean.10 <- mean(res.10$mns);       mean.100 <- mean(res.100$mns);
mean.1000 <- mean(res.1000$mns);   mean.10000 <- mean(res.10000$mns)

mean.total <- c(mean.theo, mean.10, mean.100, mean.1000, mean.10000)
names(mean.total) <- c("theoretical", "n=10", "n=100", "n=1000", "n=10000")
print(mean.total)

## theoretical        n=10       n=100      n=1000     n=10000 
##    5.000000    5.197750    5.136882    5.023983    4.993842

par(mfrow=c(2,2))
hist(res.10$mns, breaks=seq(from=0, to=10, by=0.1), prob=TRUE, main="n=10", xlab="means")
lines(density(res.10$mns))
abline(v=c(mean.theo, mean.10), col=c("red", "blue"), lwd=3)

hist(res.100$mns, breaks=seq(from=0, to=10, by=0.1), prob=TRUE, main="n=100", xlab="means")
lines(density(res.100$mns))
abline(v=c(mean.theo, mean.100), col=c("red", "blue"), lwd=3)

hist(res.1000$mns, breaks=seq(from=0, to=10, by=0.1), prob=TRUE, main="n=1000", xlab="means")
lines(density(res.1000$mns))
abline(v=c(mean.theo, mean.1000), col=c("red", "blue"), lwd=3)

hist(res.10000$mns, breaks=seq(from=0, to=10, by=0.1), prob=TRUE, main="n=10000", xlab="means")
lines(density(res.10000$mns))
abline(v=c(mean.theo, mean.10000), col=c("red", "blue"), lwd=3)

par(mfrow=c(1,1))

With increasing the sample sizes, the sample means approach the theoretical mean.
The distributions of sample mean become similar to the normal bell shapes.

4. Simulations and Results - Variances : Sample vs Theoretical

Sample variances versus theoretical variances

compare the sample variances versus theoretical variances as the sample numbers (10, 100, 1000, 10000)

var.theo <- (1/lambda)^2
var.10 <- mean(res.10$vrs);     var.100 <- mean(res.100$vrs);
var.1000 <- mean(res.1000$vrs); var.10000 <- mean(res.10000$vrs);

var.total <- c(var.theo, var.10, var.100, var.1000, var.10000)
names(var.total) <- c("theoretical", "n=10", "n=100", "n=1000", "n=10000")
print(var.total)

## theoretical        n=10       n=100      n=1000     n=10000 
##    25.00000    32.21623    25.60763    25.26288    24.77814

par(mfrow=c(2,2))
hist(res.10$vrs, breaks=seq(from=0, to=150, by=1), prob=TRUE, main="n=10", xlab="variances")
lines(density(res.10$vrs))
abline(v=c(var.theo, var.10), col=c("red", "blue"), lwd=3)

hist(res.100$vrs, breaks=seq(from=0, to=150, by=1), prob=TRUE, main="n=100", xlab="variances")
lines(density(res.100$vrs))
abline(v=c(var.theo, var.100), col=c("red", "blue"), lwd=3)

hist(res.1000$vrs, breaks=seq(from=0, to=150, by=1), prob=TRUE, main="n=1000", xlab="variances")
lines(density(res.1000$vrs))
abline(v=c(var.theo, var.1000), col=c("red", "blue"), lwd=3)

hist(res.10000$vrs, breaks=seq(from=0, to=150, by=1), prob=TRUE, main="n=10000", xlab="variances")
lines(density(res.10000$vrs))
abline(v=c(var.theo, var.10000), col=c("red", "blue"), lwd=3)

par(mfrow=c(1,1))

As the cases of exponential means, the sample variances approach the theoretical one with increasing the sample sizes,
The distributions of sample mean become bell shapes with the sample sizes, but they are slightly left skewed rather than exact symmetric form.

5. Simulations and Results - Distribution

Distribution

To test normality, Kolmogorov-Smirnov tests are performed with the means & variances results as the number of samples
And Q-Q plots are also examined for the normality

mns.pvalue <- c(lillie.test(res.10$mns)$p.value, lillie.test(res.100$mns)$p.value,
                lillie.test(res.1000$mns)$p.value, lillie.test(res.10000$mns)$p.value)
names(mns.pvalue) <- c("n=10", "n=100", "n=1000", "n=10000")
print(mns.pvalue)

##         n=10        n=100       n=1000      n=10000 
## 7.980189e-01 4.197747e-01 9.939683e-02 4.426317e-19

par(mfrow=c(2,2))
qqnorm(res.10$mns);     qqline(res.10$mns, col=2);  
qqnorm(res.100$mns);    qqline(res.100$mns, col=2);  
qqnorm(res.1000$mns);   qqline(res.1000$mns, col=2);  
qqnorm(res.10000$mns);  qqline(res.10000$mns, col=2);

par(mfrow=c(1,1))

vrs.pvalue <- c(lillie.test(res.10$vrs)$p.value, lillie.test(res.100$vrs)$p.value,
                lillie.test(res.1000$vrs)$p.value, lillie.test(res.10000$vrs)$p.value)
names(vrs.pvalue) <- c("n=10", "n=100", "n=1000", "n=10000")
print(vrs.pvalue)

##          n=10         n=100        n=1000       n=10000 
##  1.152828e-01  7.603439e-02  4.333480e-23 1.289085e-222

par(mfrow=c(2,2))
qqnorm(res.10$vrs);     qqline(res.10$vrs, col=2);  
qqnorm(res.100$vrs);    qqline(res.100$vrs, col=2);  
qqnorm(res.1000$vrs);   qqline(res.1000$vrs, col=2);  
qqnorm(res.10000$vrs);  qqline(res.10000$vrs, col=2);

par(mfrow=c(1,1))

In the case of means, Q-Q plots reveal that there are almost dots on the Q-Q lines. But with increasing the sample sizes, some dots are deviated from the diagonal line at the left & right far zone.
As the results of Komogorov-Smirnov tests, the p-values are higher than significant level (alpha = 0.05) when the sample numbers are 10, 100, and 100. When the sample number is 10000, the p-value is less than significant level, so we cannot reject the null hypothesis which the sample follows the normal distribution.
In the case of variances, Q-Q plots show that many dots are on the Q-Q lines, as the mean. But with increasing the sample sizes, more dots are deviated from the diagnoal lines.
In the results of normality test, the p-values are higher than significant level when the sample size are 10, 100. But the sample numbers are 1000 or 10000, the p-values are less than significant level, so we cannot reject the null hypothesis. (The variances have normal distribution.)