## n = 15
## Creating 1000 test samples of a Binomial distribution with each of a size of n = 15
test_da <- matrix(0,nrow = 1000, ncol = 15)
for (i in 1:1000)
{
test_da[i,] <- rbinom(15,10,0.35)}
## Creating the mean for each sample
mean_data15 <- cbind(rowMeans(test_da))
## Ploting the means of all samples
hist(mean_data15,main="Sample-size of 15")
## n = 50
## Creating 1000 test samples of a Binomial distribution with each of a size of n = 50
test_da50 <- matrix(0,nrow = 1000, ncol = 50)
for (i in 1:1000)
{
test_da50[i,] <- rbinom(50,10,0.35)}
## Creating the mean for each sample
mean_data50 <- cbind(rowMeans(test_da50))
## Ploting the means of all samples
hist(mean_data50,main="Sample-Size of 50")
Observed: From the two histograms we can observe that with a bigger sample size the variance is getting smaller and the distribution is shaped more like a Gaussian, which demonstates the Central Limit theorem
## n = 15
## Creating 1000 test samples of a Poison distribution with each of a size of n = 15
test_da_poi10 <- matrix(0,nrow = 1000, ncol = 15)
for (i in 1:1000)
{
test_da_poi10[i,] <- rpois(15,6)}
mean_data_poi10 <- cbind(rowMeans(test_da_poi10))
## Ploting the means of all samples in a Histogram
hist(mean_data_poi10,main="Sample-size of 15 of Poisson")
## n = 50
## Creating 1000 test samples of a Poison distribution with each of a size of n = 50
test_da_poi50 <- matrix(0,nrow = 1000, ncol = 50)
for (i in 1:1000)
{
test_da_poi50[i,] <- rpois(50,6)}
mean_data_poi50 <- cbind(rowMeans(test_da_poi50))
## Ploting the means of all samples in a Histogram
hist(mean_data_poi50,main="Sample-size of 50 of Poisson")
Observed: With the increasing number of samples once again it can be observed that the plot starts to be shaped like a Normal distribution, with mean in \(\lambda=6\) as expected. That also proves the Central Limit Theorem.
Objective: Check that in order that the variance estimator be an unbiased estimator of the theoretical variance, it is necessary to divide by \(n-1\) instead of \(n\).
\[S_1^2 = \frac{1}{n-1}\sum_i{(x_i- \bar{x})^2}\]
and
\[S_2^2 = \frac{1}{n}\sum_i{(x_i- \bar{x})^2}\]
Compute the sample mean of all the \(S_1^2\) and \(S_2^2\) values and plot its histogram.
## Creating 1000 samples of size n = 10 of a Normal distribution
three_data <- matrix(0,nrow=1000,ncol=10)
for(i in 1:1000){
three_data[i,] <- rnorm(10,30,sqrt(1.5))
}
three_data_mean <- matrix(0,nrow=1000,ncol=1)
three_data_mean <-cbind(rowMeans(three_data))
## Creating the variance for each sample by using the unbiased estimator of n-1
three_data_var1 <- matrix(0,nrow=1000,ncol=1)
for (j in 1:1000){
three_data_var1[j,] <- ((sum((three_data[j,]-three_data_mean[j,1])^2))/((length(three_data[j,])-1)))
}
## Creating the variance for each sample by using the biased estimator of n
three_data_var2 <- matrix(0,nrow=1000,ncol=1)
for (j in 1:1000){
three_data_var2[j,] <- ((sum((three_data[j,]-three_data_mean[j,1])^2))/(length(three_data[j,])))
}
## Ploting the variances and calculating the means of the variances for each, the unbiased and biased calculations
hist(three_data_var1,main="unbiased")
mean(three_data_var1)
## [1] 1.478994
Observed: In the case of unbiased estimator, the estimated \(S_1^2\) is really close to the real value.
hist(three_data_var2,main="biased")
mean(three_data_var2)
## [1] 1.331095
Observed: As it can be clearly seen, biased estimator performs considerably worse.
## Creating 100 samples of size n = 10 of a Normal distribution
four_data <- matrix(0,nrow=100,ncol=10)
for(i in 1:100){
four_data[i,] <- rnorm(10,12,sqrt(3))
}
## Calculating the mean for each sample
four_data_mean <-rowMeans(four_data)
## Calculating the mean of all sample means
formula_mean <- mean(four_data_mean)
## Calculating lower and upper Confidence Intervals
lower_confidence <- formula_mean - qnorm(1-0.05/2)*sqrt(3/10)
upper_confidence <- formula_mean + qnorm(1-0.05/2)*sqrt(3/10)
## Counting how many means lie within the boundaries of the lower and upper confidence interval
countIn <- sum(four_data_mean >= lower_confidence & four_data_mean <= upper_confidence)
## Ploting the sample means and highlight the upper and lower confidence interval
plot(four_data_mean)
abline(h=lower_confidence, col='red')
abline(h=upper_confidence, col='red')
## Counting how many means fit into the upper and lower confidence intervals
countIn
## [1] 92
Observed: With the increase in sample size, we can see that in fact 95% or more of the means are laying within our confidence interval, as expected. But in contrary, with small sample size it is not always clearly observed.
n <- 100
a <- 0.05
## Creating 100 samples of the size n = 10 of a normal distribution
mat <- matrix (0,nrow = 100, ncol = n)
for (i in 1:100) {
mat[i,] <- rnorm(n,12,sqrt(3))
}
## Calculating the variance of each sample
row_var <- matrix (0,nrow = 100, ncol = 1)
for (i in 1:100) {
row_var[i] <- var(mat[i,])
}
## Calculating the mean of all the sample variances
sample_var <- mean(row_var)
## Calculating the lower and upper confidence intervals by using the chi square method
upper_int <- ((n-1)*sample_var)/qchisq(a/2, n-1)
lower_int <- ((n-1)*sample_var)/qchisq(1-a/2, n-1)
plot(row_var)
abline(h=lower_int, col='red')
abline(h=upper_int, col='red')
## Counting how many variances fit into the upper and lower confidence intervals
countIn <- sum(row_var >= lower_int & row_var <= upper_int)
countIn
## [1] 98
Observed: Again, with the increase in sample size, we can see that \(>=95%\) of the variances are laying within our confidence interval, as expected.