library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(boot)
library(purrr)
df <- read.csv("~/Downloads/ObesityDataSet_raw_and_data_sinthetic.csv", header=TRUE)
df['BMI'] <- df['Weight']/(df['Height']*df['Height'])
uniq_vales_count <- lapply(df,table)
uniq_vales_count[c(1,5)]
## $Gender
##
## Female Male
## 1043 1068
##
## $family_history_with_overweight
##
## no yes
## 385 1726
#Bootstrap function with 1000 iterations
bootstrap <- function(x,size = lenght(x),func = mean, n_iter = 1000){
func_values <- c(NULL)
for (i in 1:n_iter){
x_sample <- sample(x,size = length(x), replace = TRUE)
func_values <- c(func_values, func(x_sample))
}
return (func_values)
}
Null Hypothesis 1: ‘The difference in the mean BMI between male and female is zero’
mean_BMI <- df|> group_by(Gender) |> summarise(count = n(),mean_bmi = mean(BMI), sd_bmi = sd(BMI)) |> arrange(Gender)
mean_BMI
## # A tibble: 2 × 4
## Gender count mean_bmi sd_bmi
## <chr> <int> <dbl> <dbl>
## 1 Female 1043 30.1 9.40
## 2 Male 1068 29.3 6.35
alpha level = probability of True negative,[ i.e., probability that mean_BMI of male and mean_BMI of female is same(hypothesis is true for population) but our results conveys to reject the null hypothesis.] 0.1 is chose for sig.level because it allows for more sensitivity and flexibility in detecting difference in BMI of female and male.
Delta(or effect size) can be adjusted to ‘1’.Since we are dealing with continous numeric means of BMI, which are can be classified into different categories based on rounded numeric values.
Power refers to the probability of correctly rejecting the null hypothesis when a true difference exists. A power of 0.85 means you have an 85% chance of detecting a true difference between male and female BMI if such a difference exists.
test <- power.t.test(delta = 1, #smallest detectable difference
sd = sd(pluck(df,'BMI')), #considered the sd of whole dataset
power = 0.85,
sig.level = 0.1, #alpha value = 0.1 #Type-I error probability
alternative = 'two.sided')
test
##
## Two-sample t test power calculation
##
## n = 923.5175
## delta = 1
## sd = 8.011337
## sig.level = 0.1
## power = 0.85
## alternative = two.sided
##
## NOTE: n is number in *each* group
Therefore,
mean_BMI <- df|> group_by(Gender) |> summarise(count= n(),mean_bmi = mean(BMI), sd_bmi = sd(BMI)) |> arrange(Gender)
mean_BMI
## # A tibble: 2 × 4
## Gender count mean_bmi sd_bmi
## <chr> <int> <dbl> <dbl>
## 1 Female 1043 30.1 9.40
## 2 Male 1068 29.3 6.35
observed_diff = mean_BMI$mean_bmi[1]-mean_BMI$mean_bmi[2]
observed_diff
## [1] 0.8496246
mean_male <- df |> filter(Gender == 'Male') |> pluck('BMI') |> bootstrap()
mean_female <- df |> filter(Gender == 'Female') |> pluck('BMI') |> bootstrap()
diffs_mal_fema <- mean_male - mean_female
ggplot() +
geom_function(xlim = c(-2,2),
fun = function(x) dnorm(x, mean = 0, sd = sd(diffs_mal_fema)))+
geom_vline(mapping = aes(xintercept = observed_diff, color = paste('observed:',observed_diff)))+
labs(title = 'Bootstrapped Sampling Distribution of diff in BMI',
y = 'Probability Density',
x = 'Difference in BMI')+
scale_x_continuous(breaks = seq(-3,3,1)) +
theme_minimal()
Null Hypothesis 2: ‘The difference in the mean BMI between individuals with history of overweight and no history of overweight is zero’
mean_family_history <- df|>group_by(family_history_with_overweight)|>summarise(count = n(), mean_BMI = mean(BMI), sd_BMI = sd(BMI))
mean_family_history
## # A tibble: 2 × 4
## family_history_with_overweight count mean_BMI sd_BMI
## <chr> <int> <dbl> <dbl>
## 1 no 385 21.5 4.21
## 2 yes 1726 31.5 7.50
observed_diff_family <- mean_family_history$mean_BMI[2]-mean_family_history$mean_BMI[1]
observed_diff_family
## [1] 10.02868
mean_yes <- df |> filter(family_history_with_overweight == 'yes') |> pluck('BMI') |> bootstrap(size = 385,n_iter = 10000) #size is same as number of 'no' instances in family_history_with_overweight
mean_no <- df |> filter(family_history_with_overweight == 'no') |> pluck('BMI') |> bootstrap(n_iter = 10000)
diffs_yes_no <- mean_yes - mean_no
diffs_yes_no_d <- diffs_yes_no - mean(diffs_yes_no)
paste('p-value',
sum(abs(observed_diff_family)< abs(diffs_yes_no_d))/length(diffs_yes_no_d))
## [1] "p-value 0"
Conclusion: The null hypothesis can be rejected as p-value is 0 (or negligable). Therefore, we can confidently say that there is some dependence between BMI and having history of overweight in their family.
ggplot() +
geom_function(xlim = c(-14,14),
fun = function(x) dnorm(x, mean = 0, sd = sd(diffs_yes_no_d)))+
geom_vline(mapping = aes(xintercept = observed_diff_family, color = paste('observed:',observed_diff_family)))+
labs(title = 'Bootstrapped Sampling Distribution of diff in BMI',
y = 'Probability Density',
x = 'Difference in BMI')+
scale_x_continuous(breaks = seq(-20,20,10)) +
theme_minimal()
delta = 1
ggplot() +
geom_function(xlim = c(-2,2),
fun = function(x) dnorm(x, mean = 0, sd = sd(diffs_mal_fema)))+
geom_vline(mapping = aes(xintercept = observed_diff, color = paste('observed:',observed_diff)))+
geom_vline(mapping = aes(xintercept = delta , color = paste('delta from 0')))+
geom_vline(mapping = aes(xintercept = - delta , color = paste('delta from 0')))+
labs(title = 'Bootstrapped Sampling Distribution of diff in BMI',
y = 'Probability Density',
x = 'Difference in BMI')+
scale_x_continuous(breaks = seq(-3,3,1)) +
theme_minimal()
ggplot() +
geom_function(xlim = c(-14,14),
fun = function(x) dnorm(x, mean = 0, sd = sd(diffs_yes_no_d)))+
geom_vline(mapping = aes(xintercept = observed_diff_family, color = paste('observed:',observed_diff_family)))+
labs(title = 'Bootstrapped Sampling Distribution of diff in BMI',
y = 'Probability Density',
x = 'Difference in BMI')+
scale_x_continuous(breaks = seq(-20,20,10)) +
theme_minimal()