Exercise:
# This is a small demonstartion of the command replicate(). It replicates drawing a sample of size 15, 10 times
# from a normal distribution with mean 10 and variance 1. As you can see from the output, the samples appear along
# the columns as the repetitions appear along the columns.
#set.seed(1)
#samples <- replicate(10, rnorm(15, 10, 1))
#samples
# We can cycle through the columns and calculate the average of each column as follows by looping through
# with a for loop.
#for(i in 1:10) {
# cat(mean(samples[,i]), "is the average for column", i, "\n")
#}
# As an exercise read the documentation for the functions apply, sapply etc and see if you can rewrite
# the code to avoid the use of the for loop.
set.seed(98)
samples <- replicate(1000, rnorm(40, 10, 1)) # simulating our experiment.
sample_mean <- 1:1000
for(i in 1:1000) { # This part of the code computes the sample mean for all samples via a for loop.
sample_mean[i] <- mean(samples[,i])
}
#Empirical verification of unbiasedness
mean(sample_mean) # printing the sample mean.
#Normality plot
library(ggplot2)
# Create the normal probability plot (with the help of the inbuilt qq-plot package features).
ggplot(data.frame(sample_mean), aes(sample = sample_mean)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normal Probability Plot")
Question 1:
# This is a small demonstartion of the command replicate(). It replicates drawing a sample of size 15, 10 times
# from a normal distribution with mean 10 and variance 1. As you can see from the output, the samples appear along
# the columns as the repetitions appear along the columns.
set.seed(98)
uniform_samples <- replicate(1000, runif(40, 0, 1))
uniform_samples_mean <- 1:1000
for(i in 1:1000) {
uniform_samples_mean[i] <- mean(uniform_samples[,i])
}
library(ggplot2)
ggplot(data.frame(uniform_samples_mean), aes(sample = uniform_samples_mean)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normality Plot")

# We can cycle through the columns and calculate the average of each column as follows by looping through
# with a for loop.
Based on the normality plot generated according to the sample
averages, the points closely align with the line and there are no
significant deviations. Therefore, it is reasonable to say that the
averages are (approximately) normally distributed.
We will repeat this experiment now setting the sample size to 2 for
the uniform distrubution
set.seed(98)
uniform_samples <- replicate(1000, runif(2, 0, 1))
uniform_samples_mean <- 1:1000
for(i in 1:1000) {
uniform_samples_mean[i] <- mean(uniform_samples[,i])
}
library(ggplot2)
ggplot(data.frame(uniform_samples_mean), aes(sample = uniform_samples_mean)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normality Plot")

This time, based on the normality plot generated according to the
sample averages, the points are highly deviated from the line.
Therefore, it is not reasonable to conclude that the sample averages
here are normally distributed.
Both the first experiment and the second experiment are also
supported with the Central Limit Theorem (ie. first experiment sample
size greater than 30, second experiment sample size less than 30),
therefore our conclusions empirically verify the Central Limit Theorem
for Uniform Distributions.
Question 2:
The code below repeats the outcome/purpose of exercise 1 but does not
use a built-in function that directly draws a normality plot and instead
uses a function that I have written.
set.seed(98)
samples <- replicate(1000, rnorm(40, 10, 1))
sample_mean <- 1:1000
for (i in 1:1000) {
sample_mean[i] <- mean(samples[, i])
}
mean_sample_mean <- mean(sample_mean)
# Calculate observed and theoretical values via textbook method:
calculate_normality_values <- function(sample_mean) {
n <- length(sample_mean)
p <- (1:n - 0.5) / n
theoretical_quantiles <- qnorm(p)
return(theoretical_quantiles)
}
sorted_sample_mean <- sort(sample_mean) # Using order instead of sort
theoretical_quantiles <- calculate_normality_values(sample_mean)
# Plotting the normality distribution from scratch:
plot(theoretical_quantiles, sorted_sample_mean,
xlab = "Theoretical",
ylab = "Observed",
main = "Normality Plot",
pch = 16, col = "blue")
# For reference purposes:
abline(a = mean_sample_mean, b = sd(sample_mean), col = "orange")

Question 3:
Part I:


Part II:
The following code calculates the appropriate Maximum Likelihood
Estimators for μ and σ^2:
set.seed(123) # Setting seed for reproducibility
# Generate a sample of size 3000 from a normal distribution
sample_data <- rnorm(3000, mean = 2, sd = sqrt(3))
# Compute maximum likelihood estimates
mle_mean <- mean(sample_data)
mle_variance <- var(sample_data)
# Print the results
cat("Maximum Likelihood Estimate for μ:", mle_mean, "\n")
Maximum Likelihood Estimate for μ: 2.022217
cat("Maximum Likelihood Estimate for σ^2:", mle_variance, "\n")
Maximum Likelihood Estimate for σ^2: 2.960085
Question 4:
set.seed(98)
lambda <- 4
sample_size <- 40
num_samples <- 10000
confidence_level <- 0.95
# Function used to calculate the confidence interval for the mean:
calculate_confidence_interval <- function(sample, confidence_level) {
sample_mean <- mean(sample)
standard_error <- sd(sample) / sqrt(length(sample))
z_value <- qnorm((confidence_level + 1) / 2)
margin_of_error <- z_value * standard_error
lower_bound <- sample_mean - margin_of_error
upper_bound <- sample_mean + margin_of_error
return(c(lower_bound, upper_bound))
}
# Simulate drawing 10000 samples and check inclusion of theoretical mean
proportion_containing_theoretical_mean <- 0
for (i in 1:num_samples) {
# Drawing sample for Distribution
sample_data <- rexp(sample_size, rate = lambda)
# Calculating confidence interval for the mean
confidence_interval <- calculate_confidence_interval(sample_data, confidence_level)
# Checking if the theoretical mean is in the confidence interval
theoretical_mean <- 1/lambda
if (theoretical_mean >= confidence_interval[1] && theoretical_mean <= confidence_interval[2]) {
proportion_containing_theoretical_mean <- proportion_containing_theoretical_mean + 1
}
}
# Calculating the proportion
proportion_containing_theoretical_mean <- proportion_containing_theoretical_mean / num_samples
cat("Proportion of Confidence Intervals containing theoretical mean:", proportion_containing_theoretical_mean, "\n")
Proportion of Confidence Intervals containing theoretical mean: 0.9267
We can see that this proportion of Confidence Intervals containing
the theoretical mean is roughly 0.93. This aligns with my understand of
what a Confidence Interval is because a 95% confidence interval means
that if we conduct the same sampling procedure many times and construct
confidence intervals from those samples, we would expect about 95% of
those intervals to contain the true population parameter (theoretical
mean in this case). The value we obtained is very close to this, and
with more samples it could only get closer.
Question 5:
To see if we can use the Central Limit Theorem, we must verify the
respective three specicies’ sample size is large enough (greater than
30):
cat("Sample Size (setosa):", length(setosa_data), "\n")
Sample Size (setosa): 50
cat("Sample Size (versicolor):", length(versicolor_data), "\n")
Sample Size (versicolor): 50
cat("Sample Size (virginica):", length(virginica_data), "\n")
Sample Size (virginica): 50
Yes, we can use the Central Limit Theorem since the sample sizes are
large enough and proceed:
# Load the iris dataset
data(iris)
# Split the data based on Species
setosa_data <- iris$Petal.Length[iris$Species == "setosa"]
versicolor_data <- iris$Petal.Length[iris$Species == "versicolor"]
virginica_data <- iris$Petal.Length[iris$Species == "virginica"]
# Confidence interval function using zconfint
calculate_confidence_interval <- function(data, confidence_level = 0.95) {
mean_val <- mean(data)
standard_error <- sd(data) / sqrt(length(data))
z_value <- qnorm((confidence_level + 1) / 2)
margin_of_error <- z_value * standard_error
lower_bound <- mean_val - margin_of_error
upper_bound <- mean_val + margin_of_error
return(c(lower_bound, upper_bound))
}
# Calculate confidence intervals for Petal.Length for each species
confidence_interval_setosa <- calculate_confidence_interval(setosa_data)
confidence_interval_versicolor <- calculate_confidence_interval(versicolor_data)
confidence_interval_virginica <- calculate_confidence_interval(virginica_data)
# Print the confidence intervals
cat("Confidence Interval (setosa):", confidence_interval_setosa, "\n")
Confidence Interval (setosa): 1.413864 1.510136
cat("Confidence Interval (versicolor):", confidence_interval_versicolor, "\n")
Confidence Interval (versicolor): 4.12975 4.39025
cat("Confidence Interval (virginica):", confidence_interval_virginica, "\n")
Confidence Interval (virginica): 5.399025 5.704975
---
title: "Project #3 - Faiz Shaikh"
output: html_notebook
---

# Exercise:

```{r}

# This is a small demonstartion of the command replicate(). It replicates drawing a sample of size 15, 10 times
# from a normal distribution with mean 10 and variance 1. As you can see from the output, the samples appear along
# the columns as the repetitions appear along the columns.

#set.seed(1)
#samples <- replicate(10, rnorm(15, 10, 1))
#samples
	
# We can cycle through the columns and calculate the average of each column as follows by looping through 
# with a for loop. 


#for(i in 1:10) {  
#    cat(mean(samples[,i]), "is the average for column", i, "\n")
#}

# As an exercise read the documentation for the functions apply, sapply etc and see if you can rewrite 
# the code to avoid the use of the for loop.

set.seed(98)
samples <- replicate(1000, rnorm(40, 10, 1)) # simulating our experiment.
sample_mean <- 1:1000


for(i in 1:1000) {  # This part of the code computes the sample mean for all samples via a for loop.
    sample_mean[i] <- mean(samples[,i])
}

#Empirical verification of unbiasedness
mean(sample_mean) # printing the sample mean.

#Normality plot

library(ggplot2)

# Create the normal probability plot (with the help of the inbuilt qq-plot package features).
ggplot(data.frame(sample_mean), aes(sample = sample_mean)) +
stat_qq() +
stat_qq_line() +
labs(title = "Normal Probability Plot")
```



# Question 1:

```{r}
set.seed(98)
uniform_samples <- replicate(1000, runif(40, 0, 1))

uniform_samples_mean <- 1:1000
for(i in 1:1000) {  
    uniform_samples_mean[i] <- mean(uniform_samples[,i])
}

library(ggplot2)
ggplot(data.frame(uniform_samples_mean), aes(sample = uniform_samples_mean)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Normality Plot")
```
Based on the normality plot generated according to the sample averages, the points closely align with the line and there are no significant deviations. Therefore, it is reasonable to say that the averages are (approximately) normally distributed.

We will repeat this experiment now setting the sample size to 2 for the uniform distrubution
```{r}
set.seed(98)
uniform_samples <- replicate(1000, runif(2, 0, 1))

uniform_samples_mean <- 1:1000
for(i in 1:1000) {  
    uniform_samples_mean[i] <- mean(uniform_samples[,i])
}

library(ggplot2)
ggplot(data.frame(uniform_samples_mean), aes(sample = uniform_samples_mean)) +
  stat_qq() +
  stat_qq_line() +
  labs(title = "Normality Plot")
```
This time, based on the normality plot generated according to the sample averages, the points are highly deviated from the line. Therefore, it is not reasonable to conclude that the sample averages here are normally distributed.

Both the first experiment and the second experiment are also supported with the Central Limit Theorem (ie. first experiment sample size greater than 30, second experiment sample size less than 30), therefore our conclusions empirically verify the Central Limit Theorem for Uniform Distributions.



# Question 2:

The code below repeats the outcome/purpose of exercise 1 but does not use a built-in function that directly draws a normality plot and instead uses a function that I have written.

```{r}
set.seed(98)
samples <- replicate(1000, rnorm(40, 10, 1))
sample_mean <- 1:1000

for (i in 1:1000) {
  sample_mean[i] <- mean(samples[, i])
}

mean_sample_mean <- mean(sample_mean)

# Calculate observed and theoretical values via textbook method:
  calculate_normality_values <- function(sample_mean) {
    n <- length(sample_mean)
    p <- (1:n - 0.5) / n
    theoretical_quantiles <- qnorm(p)
    return(theoretical_quantiles)
  }

  sorted_sample_mean <- sort(sample_mean)  # Using order instead of sort
  theoretical_quantiles <- calculate_normality_values(sample_mean)
  
  # Plotting the normality distribution from scratch:
  plot(theoretical_quantiles, sorted_sample_mean,
       xlab = "Theoretical",
       ylab = "Observed",
       main = "Normality Plot",
       pch = 16, col = "blue")
  
  # For reference purposes:
  abline(a = mean_sample_mean, b = sd(sample_mean), col = "orange")
```

# Question 3:

# Part I:

  ![](/Users/faizshaikh/Downloads/IMG_0234.jpg)

  ![](/Users/faizshaikh/Downloads/IMG_0235.jpg)


# Part II:

The following code calculates the appropriate Maximum Likelihood Estimators for μ and σ^2:
```{r}
set.seed(98)  # Setting seed for reproducibility

# Generate a sample of size 3000 from a normal distribution
sample_data <- rnorm(3000, mean = 2, sd = sqrt(3))

# Compute maximum likelihood estimates
mle_mean <- mean(sample_data)
mle_variance <- var(sample_data)

# Print the results
cat("Maximum Likelihood Estimate for μ:", mle_mean, "\n")
cat("Maximum Likelihood Estimate for σ^2:", mle_variance, "\n")
```


# Question 4:

```{r}
set.seed(98)

lambda <- 4
sample_size <- 40
num_samples <- 10000
confidence_level <- 0.95

# Function used to calculate the confidence interval for the mean:
calculate_confidence_interval <- function(sample, confidence_level) {
  sample_mean <- mean(sample)
  standard_error <- sd(sample) / sqrt(length(sample))
  z_value <- qnorm((confidence_level + 1) / 2) # Using the leverage of the Q-Norm function.
  
  margin_of_error <- z_value * standard_error
  
  lower_bound <- sample_mean - margin_of_error
  upper_bound <- sample_mean + margin_of_error
  
  return(c(lower_bound, upper_bound))
}

# Simulate drawing 10000 samples and check inclusion of theoretical mean
proportion_containing_theoretical_mean <- 0

for (i in 1:num_samples) {
  # Drawing sample for Distribution
  sample_data <- rexp(sample_size, rate = lambda)
  
  # Calculating confidence interval for the mean
  confidence_interval <- calculate_confidence_interval(sample_data, confidence_level)
  
  # Checking if the theoretical mean is in the confidence interval
  theoretical_mean <- 1/lambda
  if (theoretical_mean >= confidence_interval[1] && theoretical_mean <= confidence_interval[2]) {
    proportion_containing_theoretical_mean <- proportion_containing_theoretical_mean + 1
  }
}

# Calculating the proportion
proportion_containing_theoretical_mean <- proportion_containing_theoretical_mean / num_samples
cat("Proportion of Confidence Intervals containing Theoretical Mean:", proportion_containing_theoretical_mean, "\n")
```
We can see that this proportion of Confidence Intervals containing the theoretical mean is roughly 0.93. This aligns with my understand of what a Confidence Interval is because a 95% confidence interval means that if we conduct the same sampling procedure many times and construct confidence intervals from those samples, we would expect about 95% of those intervals to contain the true population parameter (theoretical mean in this case). The value we obtained is very close to this, and with more samples it could only get closer.



# Question 5:

To see if we can use the Central Limit Theorem, we must verify the respective three specicies' sample size is large enough (greater than 30):
```{r}
cat("Sample Size (setosa):", length(setosa_data), "\n")
cat("Sample Size (versicolor):", length(versicolor_data), "\n")
cat("Sample Size (virginica):", length(virginica_data), "\n")
```

Yes, we can use the Central Limit Theorem since the sample sizes are large enough and proceed:
```{r}

set.seed(98)

# Loading the iris data set
data(iris)

# Splitting the data based on Species
setosa_data <- iris$Petal.Length[iris$Species == "setosa"]
versicolor_data <- iris$Petal.Length[iris$Species == "versicolor"]
virginica_data <- iris$Petal.Length[iris$Species == "virginica"]

# Confidence interval function
calculate_confidence_interval <- function(data, confidence_level = 0.95) {
  mean_val <- mean(data)
  standard_error <- sd(data) / sqrt(length(data))
  z_value <- qnorm((confidence_level + 1) / 2) # Using the Q-Norm function again
  
  margin_of_error <- z_value * standard_error
  
  lower_bound <- mean_val - margin_of_error
  upper_bound <- mean_val + margin_of_error
  
  return(c(lower_bound, upper_bound))
}

# Calculating the confidence intervals for Petal.Length for each species
confidence_interval_setosa <- calculate_confidence_interval(setosa_data)
confidence_interval_versicolor <- calculate_confidence_interval(versicolor_data)
confidence_interval_virginica <- calculate_confidence_interval(virginica_data)

# Printing the respective confidence intervals
cat("Confidence Interval (setosa):", confidence_interval_setosa, "\n")
cat("Confidence Interval (versicolor):", confidence_interval_versicolor, "\n")
cat("Confidence Interval (virginica):", confidence_interval_virginica, "\n")
```