Let’s load the SBP dataset.
dataset_sbp <- read.csv(file = "/Users/minsikkim/Dropbox (Personal)/Inha/5_Lectures/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")
head(dataset_sbp)
## SEX BTH_G SBP DBP FBS DIS BMI
## 1 1 1 116 78 94 4 16.6
## 2 1 1 100 60 79 4 22.3
## 3 1 1 100 60 87 4 21.9
## 4 1 1 111 70 72 4 20.2
## 5 1 1 120 80 98 4 20.0
## 6 1 1 115 79 95 4 23.1
With smaller sampling number, the variance gets smaller
set.seed(1)
replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
data.frame() %>%
colMeans() %>%
hist(100,
main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
stat_function(fun = dnorm,
args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## NULL
dataset_sbp$SBP %>%
data.frame() %>%
colMeans() %>%
hist(100,
main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
stat_function(fun = dnorm,
args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## NULL
set.seed(1)
ggplot(data = dataset_sbp) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of 1M Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(50)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 50 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(30)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 30 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(20)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 20 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(10)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 10 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(5)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 5 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(2)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 2 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
To compare two different sample groups, we can caculate the difference between samples and look at CIs of difference between them. The schematic diagram of how this process works is listed in below figures. Note: this is not a exact statistical test but drwan to help understanding the basic concept of it
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80,
1,
0)
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.8) +
ggtitle("Histogram of SBP") +
ylab("Proportion") +
xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)")
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
geom_histogram(aes(x = SBP,
y = ..density..,
fill=factor(hypertension)),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.4) +
ggtitle("Histogram of SBP by two group") +
ylab("Proportion") +
xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
geom_histogram(aes(x = SBP,
y = ..density..,
fill=factor(hypertension)),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP by two group with normal curve") +
ylab("Proportion") +
xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
stat_function(fun = dnorm,
args = list(mean = mean(subset(dataset_sbp, hypertension == 1)$SBP),
sd = sd(subset(dataset_sbp, hypertension == 1)$SBP)),
col = "blue") +
stat_function(fun = dnorm,
args = list(mean = mean(subset(dataset_sbp, hypertension == 0)$SBP),
sd = sd(subset(dataset_sbp, hypertension == 0)$SBP)),
col = "red")
set.seed(1)
high <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 1) %>% sample_n(100)
set.seed(1)
low <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 0) %>% sample_n(100)
ggplot() +
geom_histogram(aes(x = high$SBP-low$SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP **difference** by two group with normal curve") +
ylab("Proportion") +
#xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
theme(plot.title = ggtext::element_markdown()) +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)")
ggplot() +
geom_histogram(aes(x = high$SBP-low$SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP **difference** by two group with normal curve") +
ylab("Proportion") +
#xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
theme(plot.title = ggtext::element_markdown()) +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
stat_function(fun = dnorm,
args = list(mean = mean(high$SBP-low$SBP),
sd = sd(high$SBP-low$SBP)))
ggplot() +
geom_histogram(aes(x = high$SBP-low$SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP **difference** by two group with normal curve") +
ylab("Proportion") +
#xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
theme(plot.title = ggtext::element_markdown()) +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
stat_function(fun = dnorm,
args = list(mean = mean(high$SBP-low$SBP),
sd = sd(high$SBP-low$SBP))) +
geom_vline(aes(xintercept = 0), color = "red", linetype = "dashed", size = 1)
t.test(high$SBP, low$SBP, paired = F)
##
## Welch Two Sample t-test
##
## data: high$SBP and low$SBP
## t = 14.769, df = 195.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 18.67229 24.42771
## sample estimates:
## mean of x mean of y
## 136.81 115.26
The Confidence interval of sample proportion need to employ exact test.
Here is an example how to calculate the CIs for p-hat in R.
install.packages("interpretCI")
##
## The downloaded binary packages are in
## /var/folders/qj/y615pkf53ms_hcxg1f9zjkyc0000gp/T//RtmpFqlcyI/downloaded_packages
library(interpretCI)
x=propCI(n = 20, p = 0.15, alpha = 0.05)
x
## $data
## # A tibble: 1 × 1
## value
## <lgl>
## 1 NA
##
## $result
## alpha n df p P se critical ME lower upper
## 1 0.05 20 19 0.15 0 0.0798436 1.959964 0.1564906 -0.006490575 0.3064906
## CI z pvalue alternative
## 1 0.15 [95CI -0.01; 0.31] 1.878673 0.06028917 two.sided
##
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05)
##
## attr(,"measure")
## [1] "prop"
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN (2023). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.7, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## Springer-Verlag New York, 2016.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. R package version 2.4.5, <https://CRAN.R-project.org/package=swirl>.