Week 7 Assignment - Karthik B

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

datas <- read.csv("C:\\Users\\karth\\Downloads\\Child Growth and Malnutrition.csv")

view(datas)

datas$Wasting = as.numeric(datas$Wasting)

## Warning: NAs introduced by coercion

datas$Overweight = as.numeric(datas$Overweight)

## Warning: NAs introduced by coercion

datas$Severe.wasting = as.numeric(datas$Severe.wasting)

## Warning: NAs introduced by coercion

1st Null Hypothesis - The value of Severe Wasting is always more than the corresponding value of Wasting

datas$difference <- datas$Wasting - datas$Severe.wasting
a <- mean(datas$Wasting, na.rm = TRUE)
b <- mean(datas$Severe.wasting, na.rm = TRUE)
c <- sd(datas$Wasting, na.rm = TRUE)
d <- sd(datas$Severe.wasting, na.rm = TRUE)
a - b

## [1] 4.881972

Choosing an alpha-value of 0.10 Choosing power-level as 0.95

x <- length(datas$Wasting)
y <- length(datas$Severe.wasting)
s_pooled <- sqrt(
  (((x-1)*c^2) + ((y-1)*d^2))/
    ((x-1)+(y-1)))
cohen_d <- (a - b)/s_pooled
cohen_d

## [1] 1.099829

The above value represents the minimum effect size for the chosen Null Hypothesis

observed_diff <- a - b
paste("Observed Difference: ", observed_diff)

## [1] "Observed Difference:  4.88197248917647"

# the same bootstrapping function from lab_06
bootstrap <- function (x, func=mean, n_iter=10^4) {
  # empty vector to be filled with values from each iteration
  func_values <- c(NULL)
  
  # we simulate sampling `n_iter` times
  for (i in 1:n_iter) {
    # pull the sample (e.g., a vector or data frame)
    x_sample <- sample_n(x, size = length(x), replace = TRUE)
    
    # add on this iteration's value to the collection
    func_values <- c(func_values, func(x_sample))
  }
  
  return(func_values)
}

diff_in_avg <- function (x_data) {
  diff <- mean(x_data$Wasting, na.rm = TRUE) - mean(x_data$Severe.wasting, na.rm = TRUE)
  
  return(diff)
}

diffs_in_avgs <- bootstrap(datas, diff_in_avg, n_iter = 100)

ggplot() +
  geom_function(xlim = c(-5, 5), 
                fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avgs))) +
  geom_vline(mapping = aes(xintercept = a - b,
                           color = paste("observed: ",
                                         round(a - b)))) +
  labs(title = "Bootstrapped Sampling Distribution of Value Differences",
       x = "Difference in Value",
       y = "Probability Density",
       color = "") +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  theme_minimal()

Alternative Hypothesis - The mean of Wasting > The mean of Severe.wasting

critical_value <- 2.5
delta <- 2

f_0 <- function(x) dnorm(x, mean = 0)
f_a <- function(x) dnorm(x, mean = delta)

ggplot() +
  stat_function(mapping = aes(fill = 'power'),
                fun = f_a, 
                xlim = c(critical_value, 4),
                geom = "area") +
    stat_function(mapping = aes(fill = 'alpha'),
                fun = f_0, 
                xlim = c(critical_value, 4),
                geom = "area") +
  geom_function(mapping = aes(color = 'Null Hypothesis'),
                xlim = c(-4, 4), fun = f_0) +
  geom_function(mapping = aes(color = 'Alternative Hypothesis'),
                xlim = c(-4, 4), fun = f_a) +
  geom_vline(mapping = aes(xintercept = critical_value,
                           color = "Critical Value")) +
  geom_vline(mapping = aes(xintercept = delta,
                           color = "Delta")) +
  geom_vline(mapping = aes(xintercept = 0),
             color = 'gray', linetype=2) +
  labs(title = "One-Tailed Test",
       x = "Mean Value",
       y = "Probability Density",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = seq(-4, 4, 1)) +
  scale_fill_manual(values = c('lightblue', 'pink')) +
  scale_color_manual(values = c('darkred', 'darkorange', 'darkblue', 
                                'darkgreen')) +
  theme_minimal()

f_sampling <- function(x) dnorm(x, mean = 0, 
                                sd = sd(diffs_in_avgs))

ggplot() +
  stat_function(mapping = aes(fill = 'more extreme samples'),
                fun = f_sampling, 
                xlim = c(observed_diff, 5),
                geom = "area") +
  geom_function(xlim = c(-5, 5), 
                fun = f_sampling) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff, 1)))) +
  labs(title = "Bootstrapped Sampling Distribution of Value Difference",
       x = "Difference in Value",
       y = "Probability Density",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = seq(-5, 5, 1)) +
  scale_fill_manual(values = 'lightblue') +
  theme_minimal()

diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)

paste("p-value ", 
      sum(observed_diff < diffs_in_avgs_d) /
        length(diffs_in_avgs_d))

## [1] "p-value  0"

The above graph and chunk shows the p-value for the Fisher’s test for the given hypothesis

2nd Null Hypothesis - Underweight is always greater than Stunting

datas$Ratio <- datas$Underweight/datas$Stunting
e <- mean(datas$Underweight, na.rm = TRUE)
f <- mean(datas$Stunting, na.rm = TRUE)
g <- sd(datas$Underweight, na.rm = TRUE)
h <- sd(datas$Stunting, na.rm = TRUE)
f/e

## [1] 1.759338

Choosing an alpha-value of 0.05 Choosing power-level as 0.90

i <- length(datas$Underweight)
j <- length(datas$Stunting)
s_pooled <- sqrt(
  (((i-1)*g^2) + ((j-1)*h^2))/
    ((i-1)+(j-1)))
cohen_d_1 <- (f - e)/s_pooled
cohen_d_1

## [1] 0.8132272

The minimum effect size is 0.813

observed_diff <- f - e
paste("Observed Difference: ", observed_diff)

## [1] "Observed Difference:  12.0064779902036"

# the same bootstrapping function from lab_06
bootstrap <- function (x, func=mean, n_iter=10^4) {
  # empty vector to be filled with values from each iteration
  func_values <- c(NULL)
  
  # we simulate sampling `n_iter` times
  for (i in 1:n_iter) {
    # pull the sample (e.g., a vector or data frame)
    x_sample <- sample_n(x, size = length(x), replace = TRUE)
    
    # add on this iteration's value to the collection
    func_values <- c(func_values, func(x_sample))
  }
  
  return(func_values)
}

diff_in_avg <- function (x_data) {
  diff <- mean(x_data$Stunting, na.rm = TRUE) - mean(x_data$Underweight, na.rm = TRUE)
  
  return(diff)
}

diffs_in_avgs <- bootstrap(datas, diff_in_avg, n_iter = 100)

ggplot() +
  geom_function(xlim = c(-10, 10), 
                fun = function(x) dnorm(x, mean = 0, 
                                        sd = sd(diffs_in_avgs))) +
  geom_vline(mapping = aes(xintercept = f - e,
                           color = paste("observed: ",
                                         round(f - e)))) +
  labs(title = "Bootstrapped Sampling Distribution of Body Parameter Differences",
       x = "Difference in Value",
       y = "Probability Density",
       color = "") +
  scale_x_continuous(breaks = seq(-10, 10, 1)) +
  theme_minimal()

Alternative Hypothesis - The mean of Stunting > The mean of Underweight

critical_value <- 2
delta <- 1.5

f_0 <- function(x) dnorm(x, mean = 0)
f_a <- function(x) dnorm(x, mean = delta)

ggplot() +
  stat_function(mapping = aes(fill = 'power'),
                fun = f_a, 
                xlim = c(critical_value, 4),
                geom = "area") +
    stat_function(mapping = aes(fill = 'alpha'),
                fun = f_0, 
                xlim = c(critical_value, 4),
                geom = "area") +
  geom_function(mapping = aes(color = 'Null Hypothesis'),
                xlim = c(-4, 4), fun = f_0) +
  geom_function(mapping = aes(color = 'Alternative Hypothesis'),
                xlim = c(-4, 4), fun = f_a) +
  geom_vline(mapping = aes(xintercept = critical_value,
                           color = "Critical Value")) +
  geom_vline(mapping = aes(xintercept = delta,
                           color = "Delta")) +
  geom_vline(mapping = aes(xintercept = 0),
             color = 'gray', linetype=2) +
  labs(title = "One-Tailed Test",
       x = "Mean Value",
       y = "Probability Density",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = seq(-4, 4, 1)) +
  scale_fill_manual(values = c('lightblue', 'pink')) +
  scale_color_manual(values = c('darkred', 'darkorange', 'darkblue', 
                                'darkgreen')) +
  theme_minimal()

f_sampling <- function(x) dnorm(x, mean = 0, 
                                sd = sd(diffs_in_avgs))

ggplot() +
  stat_function(mapping = aes(fill = 'more extreme samples'),
                fun = f_sampling, 
                xlim = c(observed_diff, 5),
                geom = "area") +
  geom_function(xlim = c(-15, 15), 
                fun = f_sampling) +
  geom_vline(mapping = aes(xintercept = observed_diff,
                           color = paste("observed: ",
                                         round(observed_diff, 1)))) +
  labs(title = "Bootstrapped Sampling Distribution of Body difference",
       x = "Difference in Value",
       y = "Probability Density",
       color = "",
       fill = "") +
  scale_x_continuous(breaks = seq(-15, 15, 1)) +
  scale_fill_manual(values = 'lightblue') +
  theme_minimal()

diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)

paste("p-value ", 
      sum(observed_diff < diffs_in_avgs_d) /
        length(diffs_in_avgs_d))

## [1] "p-value  0"

The above graph and chunk shows the p-value for the Fisher’s test for the given hypothesis

Week 7 Assignment - Karthik B

2023-10-08

R Markdown