Clear R

# Reset the session before the analysis starts.
# Drop every object currently visible in the calling environment.
remove(list = ls())
# Run a garbage collection to release unused memory; at top level the
# returned Ncells/Vcells usage table is printed.
gc()
# Write a form-feed character, used here to clear the console.
cat("\f")
#Import Packages
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Part 1

# Shared x-axis grid: from -5 to 5 in steps of 0.01.
x_value <- seq(from = -5, to = 5, by = 0.01)

# Pair the shared grid with a vector of density values evaluated on it.
density_frame <- function(density) {
  data.frame(x = x_value, y = density)
}

# Standard normal density, then Student's t densities for
# 2, 5, 15, 30 and 120 degrees of freedom.
normal_dist <- density_frame(dnorm(x_value))
t_dist_2 <- density_frame(dt(x_value, df = 2))
t_dist_5 <- density_frame(dt(x_value, df = 5))
t_dist_15 <- density_frame(dt(x_value, df = 15))
t_dist_30 <- density_frame(dt(x_value, df = 30))
t_dist_120 <- density_frame(dt(x_value, df = 120))
# Stack the six density tables into one long data frame. The list names
# are written to the leading `distribution` column of each table's rows.
combined_data <- bind_rows(
  list(
    "Normal" = normal_dist,
    "t (df = 2)" = t_dist_2,
    "t (df = 5)" = t_dist_5,
    "t (df = 15)" = t_dist_15,
    "t (df = 30)" = t_dist_30,
    "t (df = 120)" = t_dist_120
  ),
  .id = "distribution"
)
# Overlay the normal and t densities, one coloured line per distribution.
ggplot(
  data = combined_data,
  mapping = aes(x = x, y = y, color = distribution)
) +
  geom_line(
    # `linewidth` replaces `size` for line thickness (ggplot2 >= 3.4.0);
    # using it avoids the deprecation warning raised by `size`.
    linewidth = 1,
    # One linetype per row of combined_data: solid for the normal curve,
    # dashed for every t curve.
    linetype = ifelse(
      test = combined_data$distribution == "Normal",
      yes = "solid",
      no = "dashed"
    )
  ) +
  labs(
    title = "Normal and T-Distributions",
    x = "Value",
    y = "Density"
  ) +
  theme_minimal()

Part 2

# Population parameters for the simulated sample.
mu <- 108
sigma <- 7.2

# Fix the RNG seed so the 1000 normal draws are reproducible.
set.seed(123)
data_values <- rnorm(1000, mean = mu, sd = sigma)

# Single-column data frame (x) holding the draws, for plotting.
normal_data <- data.frame(x = data_values)
# Histogram of the simulated normal data.
# Every layer is chained with `+`. Without the `+` after geom_histogram(),
# labs() + theme_minimal() is evaluated as a separate expression, so the
# title, axis labels and theme never reach the plot.
ggplot(data = normal_data, mapping = aes(x = x)) +
  # bins = 30 is the stat_bin() default; stating it explicitly silences the
  # "Pick better value with `binwidth`" message without changing the bars.
  geom_histogram(bins = 30) +
  labs(
    title = "Normally Distributed Data",
    x = "x-values",
    y = "y-values"
  ) +
  theme_minimal()
# Standardize the simulated values: distance from mu in units of sigma.
z_dist <- (data_values - mu) / sigma
z_dist1 <- data.frame(x = z_dist)

# Histogram of the z-scores.
# Every layer is chained with `+`. Without the `+` after geom_histogram(),
# labs() + theme_minimal() is evaluated as a separate expression, so the
# title, axis labels and theme never reach the plot.
ggplot(data = z_dist1, mapping = aes(x = x)) +
  # bins = 30 is the stat_bin() default; stating it explicitly silences the
  # "Pick better value with `binwidth`" message without changing the bars.
  geom_histogram(bins = 30) +
  labs(
    title = "Z-scores of Normal Distributed Data",
    x = "x-values",
    y = "y-values"
  ) +
  theme_minimal()
# Density (PDF) values of the simulated data under a normal distribution
# with mean mu and standard deviation sigma. dnorm() returns densities, not
# cumulative probabilities; the `_cdf` object names are kept only because
# the plotting code refers to them.
normal_data_cdf <- data.frame(
  x = data_values,
  y = dnorm(data_values, mean = mu, sd = sigma)
)

# Standard normal density (PDF) values of the z-scores of the same data.
z_dist_cdf <- data.frame(x = z_dist, y = dnorm(z_dist, mean = 0, sd = 1))

# Both density tables stacked into one data frame, labelled by source.
combined_data_2 <- bind_rows(
  data.frame(distribution = "Normal Random Values", normal_data_cdf),
  data.frame(distribution = "z-scores", z_dist_cdf)
)
# Line plot of the density values computed for the simulated normal data.
ggplot(normal_data_cdf, aes(x = x, y = y)) +
  geom_line() +
  ggtitle("Normal Random Generated Dist.") +
  xlab("Value") +
  ylab("Density")

# Line plot of the density values computed for the z-scores of the
# simulated data.
ggplot(z_dist_cdf, aes(x = x, y = y)) +
  geom_line() +
  ggtitle("Z-score of Random Generated") +
  xlab("Value") +
  ylab("Density")

Yes, the two charts have the same distributional shape. This is because they are the same distribution (normal). The randomly generated data was drawn from a normal random number generator. By definition, the z-distribution is the standardized normal distribution (mean = 0, sd = 1). To create the data/graph for the z-scores of the randomly generated numbers, all the data points were standardized to show how many standard deviations away from the mean they are.

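In a normally distributed data set (like here), no information is lost between the raw data set and the z-scores. Standardizing is a linear transformation (subtract the mean, then divide by the standard deviation), so only the center and scale change: the mean moves to 0 and the standard deviation becomes 1, which proportionately rescales each raw value into its z-score.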

Part 3

P-Value - A p-value, in statistical testing, is the probability of obtaining a result at least as extreme as the one observed in the sample, assuming the null hypothesis is true.

The p-value is used to judge how compatible the sample data are with the null hypothesis at the point you are testing; it is not the probability that the null hypothesis itself is true.

You must pick a significance level before the test (commonly .05) and compare the p-value to the significance level. If the p-value is greater than the significance level, we fail to reject the null (the status quo). If the p-value is less than the significance level, we can reject the null.

For example, we believe the mean of the population is 10 with a standard deviation of X. We take a sample and find the mean is 7. We test to see if the true mean is less than 10.

Our p-value would give us the probability of getting a sample mean of 7 or less if the true mean really were 10. If that probability is less than our significance level, we can reject the null and conclude that the true mean is lower than 10.