This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
datas <- read.csv("C:\\Users\\karth\\Downloads\\Child Growth and Malnutrition.csv")
view(datas)
datas$Wasting = as.numeric(datas$Wasting)
## Warning: NAs introduced by coercion
datas$Overweight = as.numeric(datas$Overweight)
## Warning: NAs introduced by coercion
datas$Severe.wasting = as.numeric(datas$Severe.wasting)
## Warning: NAs introduced by coercion
1st Null Hypothesis - The value of Severe Wasting is always more than the corresponding value of Wasting
datas$difference <- datas$Wasting - datas$Severe.wasting
a <- mean(datas$Wasting, na.rm = TRUE)
b <- mean(datas$Severe.wasting, na.rm = TRUE)
c <- sd(datas$Wasting, na.rm = TRUE)
d <- sd(datas$Severe.wasting, na.rm = TRUE)
a - b
## [1] 4.881972
Choosing an alpha-value of 0.10 Choosing power-level as 0.95
x <- length(datas$Wasting)
y <- length(datas$Severe.wasting)
s_pooled <- sqrt(
(((x-1)*c^2) + ((y-1)*d^2))/
((x-1)+(y-1)))
cohen_d <- (a - b)/s_pooled
cohen_d
## [1] 1.099829
The above value represents the minimum effect size for the chosen Null Hypothesis
observed_diff <- a - b
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 4.88197248917647"
# the same bootstrapping function from lab_06
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (e.g., a vector or data frame)
x_sample <- sample_n(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
diff_in_avg <- function (x_data) {
diff <- mean(x_data$Wasting, na.rm = TRUE) - mean(x_data$Severe.wasting, na.rm = TRUE)
return(diff)
}
diffs_in_avgs <- bootstrap(datas, diff_in_avg, n_iter = 100)
ggplot() +
geom_function(xlim = c(-5, 5),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = a - b,
color = paste("observed: ",
round(a - b)))) +
labs(title = "Bootstrapped Sampling Distribution of Value Differences",
x = "Difference in Value",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-5, 5, 1)) +
theme_minimal()
Alternative Hypothesis - The mean of Wasting > The mean of Severe.wasting
critical_value <- 2.5
delta <- 2
f_0 <- function(x) dnorm(x, mean = 0)
f_a <- function(x) dnorm(x, mean = delta)
ggplot() +
stat_function(mapping = aes(fill = 'power'),
fun = f_a,
xlim = c(critical_value, 4),
geom = "area") +
stat_function(mapping = aes(fill = 'alpha'),
fun = f_0,
xlim = c(critical_value, 4),
geom = "area") +
geom_function(mapping = aes(color = 'Null Hypothesis'),
xlim = c(-4, 4), fun = f_0) +
geom_function(mapping = aes(color = 'Alternative Hypothesis'),
xlim = c(-4, 4), fun = f_a) +
geom_vline(mapping = aes(xintercept = critical_value,
color = "Critical Value")) +
geom_vline(mapping = aes(xintercept = delta,
color = "Delta")) +
geom_vline(mapping = aes(xintercept = 0),
color = 'gray', linetype=2) +
labs(title = "One-Tailed Test",
x = "Mean Value",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-4, 4, 1)) +
scale_fill_manual(values = c('lightblue', 'pink')) +
scale_color_manual(values = c('darkred', 'darkorange', 'darkblue',
'darkgreen')) +
theme_minimal()
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_diff, 5),
geom = "area") +
geom_function(xlim = c(-5, 5),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Value Difference",
x = "Difference in Value",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-5, 5, 1)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)
paste("p-value ",
sum(observed_diff < diffs_in_avgs_d) /
length(diffs_in_avgs_d))
## [1] "p-value 0"
The above graph and chunk shows the p-value for the Fisher’s test for the given hypothesis
2nd Null Hypothesis - Underweight is always greater than Stunting
datas$Ratio <- datas$Underweight/datas$Stunting
e <- mean(datas$Underweight, na.rm = TRUE)
f <- mean(datas$Stunting, na.rm = TRUE)
g <- sd(datas$Underweight, na.rm = TRUE)
h <- sd(datas$Stunting, na.rm = TRUE)
f/e
## [1] 1.759338
Choosing an alpha-value of 0.05 Choosing power-level as 0.90
i <- length(datas$Underweight)
j <- length(datas$Stunting)
s_pooled <- sqrt(
(((i-1)*g^2) + ((j-1)*h^2))/
((i-1)+(j-1)))
cohen_d_1 <- (f - e)/s_pooled
cohen_d_1
## [1] 0.8132272
The minimum effect size is 0.813
observed_diff <- f - e
paste("Observed Difference: ", observed_diff)
## [1] "Observed Difference: 12.0064779902036"
# the same bootstrapping function from lab_06
bootstrap <- function (x, func=mean, n_iter=10^4) {
# empty vector to be filled with values from each iteration
func_values <- c(NULL)
# we simulate sampling `n_iter` times
for (i in 1:n_iter) {
# pull the sample (e.g., a vector or data frame)
x_sample <- sample_n(x, size = length(x), replace = TRUE)
# add on this iteration's value to the collection
func_values <- c(func_values, func(x_sample))
}
return(func_values)
}
diff_in_avg <- function (x_data) {
diff <- mean(x_data$Stunting, na.rm = TRUE) - mean(x_data$Underweight, na.rm = TRUE)
return(diff)
}
diffs_in_avgs <- bootstrap(datas, diff_in_avg, n_iter = 100)
ggplot() +
geom_function(xlim = c(-10, 10),
fun = function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))) +
geom_vline(mapping = aes(xintercept = f - e,
color = paste("observed: ",
round(f - e)))) +
labs(title = "Bootstrapped Sampling Distribution of Body Parameter Differences",
x = "Difference in Value",
y = "Probability Density",
color = "") +
scale_x_continuous(breaks = seq(-10, 10, 1)) +
theme_minimal()
Alternative Hypothesis - The mean of Stunting > The mean of
Underweight
critical_value <- 2
delta <- 1.5
f_0 <- function(x) dnorm(x, mean = 0)
f_a <- function(x) dnorm(x, mean = delta)
ggplot() +
stat_function(mapping = aes(fill = 'power'),
fun = f_a,
xlim = c(critical_value, 4),
geom = "area") +
stat_function(mapping = aes(fill = 'alpha'),
fun = f_0,
xlim = c(critical_value, 4),
geom = "area") +
geom_function(mapping = aes(color = 'Null Hypothesis'),
xlim = c(-4, 4), fun = f_0) +
geom_function(mapping = aes(color = 'Alternative Hypothesis'),
xlim = c(-4, 4), fun = f_a) +
geom_vline(mapping = aes(xintercept = critical_value,
color = "Critical Value")) +
geom_vline(mapping = aes(xintercept = delta,
color = "Delta")) +
geom_vline(mapping = aes(xintercept = 0),
color = 'gray', linetype=2) +
labs(title = "One-Tailed Test",
x = "Mean Value",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-4, 4, 1)) +
scale_fill_manual(values = c('lightblue', 'pink')) +
scale_color_manual(values = c('darkred', 'darkorange', 'darkblue',
'darkgreen')) +
theme_minimal()
f_sampling <- function(x) dnorm(x, mean = 0,
sd = sd(diffs_in_avgs))
ggplot() +
stat_function(mapping = aes(fill = 'more extreme samples'),
fun = f_sampling,
xlim = c(observed_diff, 5),
geom = "area") +
geom_function(xlim = c(-15, 15),
fun = f_sampling) +
geom_vline(mapping = aes(xintercept = observed_diff,
color = paste("observed: ",
round(observed_diff, 1)))) +
labs(title = "Bootstrapped Sampling Distribution of Body difference",
x = "Difference in Value",
y = "Probability Density",
color = "",
fill = "") +
scale_x_continuous(breaks = seq(-15, 15, 1)) +
scale_fill_manual(values = 'lightblue') +
theme_minimal()
diffs_in_avgs_d <- diffs_in_avgs - mean(diffs_in_avgs)
paste("p-value ",
sum(observed_diff < diffs_in_avgs_d) /
length(diffs_in_avgs_d))
## [1] "p-value 0"
The above graph and chunk shows the p-value for the Fisher’s test for the given hypothesis