COD_week5_1_MGK_BTE3207

Minsik Kim

2024-09-30

Before begin..

Let’s load the SBP dataset.

dataset_sbp <- read.csv(file = "Inha/5_Lectures/2024/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")

head(dataset_sbp)
##   SEX BTH_G SBP DBP FBS DIS  BMI
## 1   1     1 116  78  94   4 16.6
## 2   1     1 100  60  79   4 22.3
## 3   1     1 100  60  87   4 21.9
## 4   1     1 111  70  72   4 20.2
## 5   1     1 120  80  98   4 20.0
## 6   1     1 115  79  95   4 23.1

Degree of freedom

With smaller sampling number, the variance gets smaller

set.seed(1)
replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
        data.frame() %>% 
        colMeans() %>%
        hist(100,
             main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(dataset_sbp$SBP),
                                  sd = sd(dataset_sbp$SBP)))

## NULL
dataset_sbp$SBP %>%
        data.frame() %>% 
        colMeans() %>%
        hist(100,
             main = "Distribution of total sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(dataset_sbp$SBP),
                                  sd = sd(dataset_sbp$SBP)))

## NULL
set.seed(1)
ggplot(data = dataset_sbp) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of 1M Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(50)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 50 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(30)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 30 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(20)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 20 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(10)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 10 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(5)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 5 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(2)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 2 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

Comparing data

To compare two different sample groups, we can caculate the difference between samples and look at CIs of difference between them. The schematic diagram of how this process works is listed in below figures. Note: this is not a exact statistical test but drwan to help understanding the basic concept of it

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80,
                                   1,
                                   0)
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.8) +
        ggtitle("Histogram of SBP") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)")

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..,
                           fill=factor(hypertension)),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.4) +
        ggtitle("Histogram of SBP by two group") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..,
                           fill=factor(hypertension)),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP by two group with normal curve") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                          args = list(mean = mean(subset(dataset_sbp, hypertension == 1)$SBP),
                                      sd = sd(subset(dataset_sbp, hypertension == 1)$SBP)),
                      col = "blue") +
                stat_function(fun = dnorm,
                          args = list(mean = mean(subset(dataset_sbp, hypertension == 0)$SBP),
                                      sd = sd(subset(dataset_sbp, hypertension == 0)$SBP)),
                          col = "red")

set.seed(1)
high <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 1) %>% sample_n(100)
set.seed(1)
low <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 0) %>% sample_n(100)

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)")

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(high$SBP-low$SBP),
                                      sd = sd(high$SBP-low$SBP))) 

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(high$SBP-low$SBP),
                                      sd = sd(high$SBP-low$SBP))) +
        geom_vline(aes(xintercept = 0), color = "red", linetype = "dashed", size = 1)

t.test(high$SBP, low$SBP, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  high$SBP and low$SBP
## t = 14.769, df = 195.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  18.67229 24.42771
## sample estimates:
## mean of x mean of y 
##    136.81    115.26

CIs of sample proportion

The Confidence interval of sample proportion need to employ exact test.

Here is an example how to calculate the CIs for p-hat in R.

install.packages("interpretCI")
## 
## The downloaded binary packages are in
##  /var/folders/qj/y615pkf53ms_hcxg1f9zjkyc0000gp/T//Rtmp6s8NbG/downloaded_packages
library(interpretCI)

#Normal approximation (2 sd)
x=propCI(n = 20, p = 0.15, alpha = 0.05)
x
## $data
## # A tibble: 1 × 1
##   value
##   <lgl>
## 1 NA   
## 
## $result
##   alpha  n df    p P        se critical        ME        lower     upper
## 1  0.05 20 19 0.15 0 0.0798436 1.959964 0.1564906 -0.006490575 0.3064906
##                        CI        z     pvalue alternative
## 1 0.15 [95CI -0.01; 0.31] 1.878673 0.06028917   two.sided
## 
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05)
## 
## attr(,"measure")
## [1] "prop"
#Normal approximation 
x=propCI(n = 20, p = 0.15, alpha = 0.05, alternative =  "wald")
x
## $data
## # A tibble: 1 × 1
##   value
##   <lgl>
## 1 NA   
## 
## $result
##   alpha  n df    p P        se critical       ME      lower    upper
## 1  0.05 20 19 0.15 0 0.0798436 1.644854 0.131331 0.01866897 0.281331
##                       CI        z     pvalue alternative
## 1 0.15 [95CI 0.02; 0.28] 1.878673 0.03014459        wald
## 
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05, alternative = "wald")
## 
## attr(,"measure")
## [1] "prop"
#exact method - Clopper-Pearson interval.
binom.test(x = 20 * 0.15, n = 20, conf.level = 0.95)
## 
##  Exact binomial test
## 
## data:  20 * 0.15 and 20
## number of successes = 3, number of trials = 20, p-value = 0.002577
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.03207094 0.37892683
## sample estimates:
## probability of success 
##                   0.15

Bibliography

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.9, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman