# Reset the R session before running the analysis
rm(list = ls())  # Remove all objects from the current environment
gc()             # Run garbage collection to release unused memory
# Output printed by gc() on a previous run:
##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 526428 28.2    1169365 62.5         NA   669428 35.8
## Vcells 970675  7.5    8388608 64.0      16384  1851821 14.2
cat("\f")        # Send a form feed to clear the console
graphics.off()   # Close all open graphics devices

Part 1

The Student's t distribution converges to the normal distribution as the degrees of freedom increase (beyond about 120 the two are practically indistinguishable). Please plot a normal distribution and several t distributions on the same chart, using 2, 5, 15, 30, and 120 degrees of freedom.

library(ggplot2)

# Set seed for reproducibility.
# Note: dnorm() and dt() are deterministic density functions, so nothing in
# this part actually depends on the seed; it is kept so the RNG state of the
# script is unchanged.
set.seed(123)

# Evaluation grid: 200 evenly spaced points from -4 to 4
n_data <- seq(-4, 4, length.out = 200)

# Data frame holding the standard normal density on the grid
normal_data <- data.frame(
  x = n_data,
  y = dnorm(n_data),
  distribution = "Normal"
)

# One data frame per t distribution, each with a different degrees of freedom
t_data_2 <- data.frame(
  x = n_data, y = dt(n_data, df = 2), distribution = "t (df=2)"
)
t_data_5 <- data.frame(
  x = n_data, y = dt(n_data, df = 5), distribution = "t (df=5)"
)
t_data_15 <- data.frame(
  x = n_data, y = dt(n_data, df = 15), distribution = "t (df=15)"
)
t_data_30 <- data.frame(
  x = n_data, y = dt(n_data, df = 30), distribution = "t (df=30)"
)
t_data_120 <- data.frame(
  x = n_data, y = dt(n_data, df = 120), distribution = "t (df=120)"
)

# Combine everything into one long data frame (one row per grid point and
# distribution), stacked in order of increasing degrees of freedom
t_data_all <- rbind(
  normal_data, t_data_2, t_data_5, t_data_15, t_data_30, t_data_120
)

# Fix the legend order. A character column is sorted alphabetically when
# plotted ("t (df=120)" would come before "t (df=2)"), so convert it to a
# factor whose levels follow the order of first appearance above:
# Normal, df = 2, 5, 15, 30, 120.
t_data_all$distribution <- factor(
  t_data_all$distribution,
  levels = unique(t_data_all$distribution)
)

# Draw one density curve per distribution, coloured by distribution label
ggplot(data = t_data_all, mapping = aes(x = x, y = y, colour = distribution)) +
  geom_line() +
  labs(
    title = "Normal and t Distributions",
    x = "Value",
    y = "Density"
  ) +
  theme_minimal()

Part 2

# Fix the random number generator state so the simulated sample is repeatable
set.seed(123)

# Parameters of the normal distribution the sample is drawn from
mu <- 108     # mean
sigma <- 7.2  # standard deviation

# Simulate 1000 observations from a normal distribution with the above
# mean and standard deviation
data_values <- rnorm(1000, mean = mu, sd = sigma)

# Standardise each observation: subtract the mean, divide by the
# standard deviation
z_scores <- (data_values - mu) / sigma

# Histogram of the simulated values on their original scale
hist(
  data_values,
  col = "orange", border = "orange",
  main = "Original Normal Data",
  xlab = "Value", ylab = "Frequency"
)

# Histogram of the same values after standardisation to z-scores
hist(
  z_scores,
  col = "lightgreen", border = "lightgreen",
  main = "Z-Score Distribution",
  xlab = "Z-Score", ylab = "Frequency"
)

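The overall shape of the distribution is the same in both graphs. Converting to z-scores only shifts and rescales the data, so the z-score distribution is still normal, but it is now centered at 0 with a standard deviation of 1 (instead of a mean of 108 and a standard deviation of 7.2).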

Part 3

In your own words, please explain what a p-value is.

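The p-value quantifies the strength of evidence against the null hypothesis. It is the probability of observing results at least as extreme as those actually observed, assuming the null hypothesis is true; the smaller the p-value, the stronger the evidence against the null hypothesis.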

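For example, a p-value of 0.05 means that, if the null hypothesis were true, results at least as extreme as those observed would be expected about 5% of the time. Using the conventional significance level of 0.05, a p-value at or below this threshold is considered statistically significant and is taken as evidence against the null hypothesis.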