Before begin..

Let’s load the SBP dataset.

dataset_sbp <- read.csv(file = "/Users/minsikkim/Dropbox (Personal)/Inha/5_Lectures/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")

head(dataset_sbp)
##   SEX BTH_G SBP DBP FBS DIS  BMI
## 1   1     1 116  78  94   4 16.6
## 2   1     1 100  60  79   4 22.3
## 3   1     1 100  60  87   4 21.9
## 4   1     1 111  70  72   4 20.2
## 5   1     1 120  80  98   4 20.0
## 6   1     1 115  79  95   4 23.1

Degree of freedom

With smaller sampling number, the variance gets smaller

set.seed(1)
replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
        data.frame() %>% 
        colMeans() %>%
        hist(100,
             main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(dataset_sbp$SBP),
                                  sd = sd(dataset_sbp$SBP)))

## NULL
dataset_sbp$SBP %>%
        data.frame() %>% 
        colMeans() %>%
        hist(100,
             main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(dataset_sbp$SBP),
                                  sd = sd(dataset_sbp$SBP)))

## NULL
set.seed(1)
ggplot(data = dataset_sbp) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of 1M Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(50)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 50 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(30)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 30 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(20)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 20 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(10)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 10 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(5)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 5 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(2)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 2 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))

Comparing data

To compare two different sample groups, we can caculate the difference between samples and look at CIs of difference between them. The schematic diagram of how this process works is listed in below figures. Note: this is not a exact statistical test but drwan to help understanding the basic concept of it

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80,
                                   1,
                                   0)
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.8) +
        ggtitle("Histogram of SBP") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)")

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..,
                           fill=factor(hypertension)),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.4) +
        ggtitle("Histogram of SBP by two group") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..,
                           fill=factor(hypertension)),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP by two group with normal curve") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                          args = list(mean = mean(subset(dataset_sbp, hypertension == 1)$SBP),
                                      sd = sd(subset(dataset_sbp, hypertension == 1)$SBP)),
                      col = "blue") +
                stat_function(fun = dnorm,
                          args = list(mean = mean(subset(dataset_sbp, hypertension == 0)$SBP),
                                      sd = sd(subset(dataset_sbp, hypertension == 0)$SBP)),
                          col = "red")

set.seed(1)
high <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 1) %>% sample_n(100)
set.seed(1)
low <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 0) %>% sample_n(100)

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)")

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(high$SBP-low$SBP),
                                      sd = sd(high$SBP-low$SBP))) 

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(high$SBP-low$SBP),
                                      sd = sd(high$SBP-low$SBP))) +
        geom_vline(aes(xintercept = 0), color = "red", linetype = "dashed", size = 1)

t.test(high$SBP, low$SBP, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  high$SBP and low$SBP
## t = 14.769, df = 195.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  18.67229 24.42771
## sample estimates:
## mean of x mean of y 
##    136.81    115.26

CIs of sample proportion

The Confidence interval of sample proportion need to employ exact test.

Here is an example how to calculate the CIs for p-hat in R.

install.packages("interpretCI")
## 
## The downloaded binary packages are in
##  /var/folders/qj/y615pkf53ms_hcxg1f9zjkyc0000gp/T//RtmpFqlcyI/downloaded_packages
library(interpretCI)
x=propCI(n = 20, p = 0.15, alpha = 0.05)
x
## $data
## # A tibble: 1 × 1
##   value
##   <lgl>
## 1 NA   
## 
## $result
##   alpha  n df    p P        se critical        ME        lower     upper
## 1  0.05 20 19 0.15 0 0.0798436 1.959964 0.1564906 -0.006490575 0.3064906
##                        CI        z     pvalue alternative
## 1 0.15 [95CI -0.01; 0.31] 1.878673 0.06028917   two.sided
## 
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05)
## 
## attr(,"measure")
## [1] "prop"

Bibliography

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN (2023). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.7, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Springer-Verlag New York, 2016.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. R package version 2.4.5, <https://CRAN.R-project.org/package=swirl>.