Loading packages
#===============================================================================
#BTC.LineZero.Header.1.1.0
#===============================================================================
#R Markdown environment setup and reporting utility.
#===============================================================================
#RLB.Dependencies:
# knitr, magrittr, pacman, rio, rmarkdown, rmdformats, tibble, yaml
#===============================================================================
#Input for document parameters, libraries, file paths, and options.
#=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=
# --- working dir:
host <- Sys.info()[["nodename"]]
paths <- list(
"bagel-S2600CO" = list(#Minsik's lab server
work = "/mnt/4T_samsung/Dropbox/",
lib = "/home/bagel/R_lib/"
),
"Minsiks-Mac-mini-2.local" = list(#Minsik's mac mini
work = "/Volumes/macdrive/Dropbox/",
lib = "/Library/Frameworks/R.framework/Resources/library/"
),
"Minsiks-MacBook-Pro.local|Minsiks-MBP.ht.home" = list(#Minsik's macbook pro
work = "/Users/minsikkim/Dropbox (Personal)/",
lib = "/Library/Frameworks/R.framework/Resources/library/"
),
"MGB030975" = list(#LaiLab mac mini
work = "/Volumes/Kingston/Dropbox/",
lib = "/Library/Frameworks/R.framework/Resources/library/"
)
)
pick_cfg <- function(host, map) {
if (!is.null(map[[host]])) return(map[[host]])
for (k in names(map)) if (grepl(k, host, ignore.case = TRUE)) return(map[[k]])
NULL
}
cfg <- pick_cfg(host, paths)
if (is.null(cfg)) stop("No path for the host: ", host)
path_working <- cfg$work
path_library <- cfg$lib
# Adding libraries
knitr::opts_knit$set(root.dir = path_working)
str_libraries <- c("yaml", "pacman")
YAML_header <-
'---
title: "COD 2025 BTE3207 Week 7-1"
author: "Minsik Kim"
date: "2025.10.13"
output:
rmdformats::downcute:
downcute_theme: "chaos"
code_folding: hide
fig_width: 8
fig_height: 8
---'
seed <- "20251013"
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Loads libraries, file paths, and other document options.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
FUN.LineZero.Boot <- function() {
setwd(path_working)
.libPaths(c(path_library, setdiff(.libPaths(), path_library)))
require(pacman)
pacman::p_load(c("knitr", "rmarkdown", "rmdformats", "yaml"))
knitr::opts_knit$set(root.dir = path_working)
str_libraries <<- unique(sort(str_libraries))
pacman::p_load(char = str_libraries)
set.seed(as.integer(seed))
}
FUN.LineZero.Boot()
## Loading required package: pacman
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Outputs R environment report.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
FUN.LineZero.Report <- function() {
cat("Line Zero Environment:\n\n")
paste("R:", pacman::p_version(), "\n") |> cat()
cat("Libraries:\n")
for (str_libraries in str_libraries) {
paste(
" ", str_libraries, ": ", pacman::p_version(package = str_libraries),
"\n", sep = ""
) |> cat()
}
paste("\nOperating System:", pacman::p_detectOS(), "\n") |> cat()
paste(" Library Path:", path_library, "\n") |> cat()
paste(" Working Path:", path_working, "\n") |> cat()
paste("Seed:", seed, "\n\n") |> cat()
cat("YAML Header:\n")
cat(YAML_header)
}
FUN.LineZero.Report()
## Line Zero Environment:
##
## R: 4.4.2
## Libraries:
## pacman: 0.5.1
## yaml: 2.3.10
##
## Operating System: Darwin
## Library Path: /Library/Frameworks/R.framework/Resources/library/
## Working Path: /Volumes/macdrive/Dropbox/
## Seed: 20251013
##
## YAML Header:
## ---
## title: "COD 2025 BTE3207 Week 7-1"
## author: "Minsik Kim"
## date: "2025.10.13"
## output:
## rmdformats::downcute:
## downcute_theme: "chaos"
## code_folding: hide
## fig_width: 8
## fig_height: 8
## ---
Before begin..
Let’s load the SBP dataset.
# download.file("https://raw.githubusercontent.com/minsiksudo/BTE3207_Advanced_Biostatistics/refs/heads/main/dataset/sbp_dataset_korea_2013-2014.csv", destfile = "sbp_dataset.csv")
dataset_sbp <- read.csv(file = "Inha/5_Lectures/1_BTE3207_Biostats/2025F/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")
head(dataset_sbp)
The normal distribution
set.seed(1)
sim_100_5 <- rnorm(1000, mean = 100, sd = 5) #makes a data with 1000 elements having mean of 100 and sd of 5
hist(sim_100_5, 100, probability = T) #how does it look like?
curve(dnorm(x, mean=100, sd=5),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
mean(sim_100_5)
## [1] 99.94176
sd(sim_100_5)
## [1] 5.174579
normal distribution
Let’s compare some data collected from real people (random 100 Koreans) with the normal distribution
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(1)
sbp_10000 <- sample(dataset_sbp$SBP, 10000) %>%
data.frame()
hist_sbp_10000 <- ggplot(data = sbp_10000) +
geom_histogram(aes(x = .,
y =..density..),
bins = 15,
fill = "white",
col = "black") +
ggtitle("Histogram of SBP of 10000 Koreans") +
ylab("Probability") +
xlab("SBP (mmHg)")
hist_sbp_10000
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
hist_sbp_10000 +
stat_function(fun = dnorm, args = list(mean = mean(sbp_10000$.), sd = sd(sbp_10000$.)))
sbp_10000$. %>% mean
## [1] 121.7429
sbp_10000$. %>% sd
## [1] 14.68246
sbp_10000$. %>% median
## [1] 120
Histogram of sample means
set.seed(1)
sbp_5000 <- replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
data.frame() %>%
colMeans() %>%
as.numeric()
sbp_5000 %>%
#colMeans() %>%
hist(main = "Distribution of 5000 sample mean SBP from 1M subjects.\nEach sample N = 150",
probability = T) +
stat_function(fun = dnorm, args = list(mean = mean(sbp_5000), sd = sd(sbp_5000)))
## NULL
curve(dnorm(x, mean = mean(sbp_5000), sd = sd(sbp_5000)),
col="darkblue", lwd=2, add=TRUE, yaxt="n")
sbp_5000 %>% mean
## [1] 121.8551
sbp_5000 %>% sd
## [1] 1.168471
sbp_5000 %>% median
## [1] 121.8533
z-score
How can we say the location of a person with 130 mmHg of SBP, in terms of distribution? We usually calculate z-score, to understand the location of one sample more easilty.
z_sbp_10000 <- (131-mean(sbp_10000$.))/sd(sbp_10000$.)
z_sbp_10000
## [1] 0.6304871
Here, the z-score of a person with SBP of 131 is 0.63.
Using this information, we cancalculate the percentile of him (assuming the SBP dataset is following the normal distribution)
pnorm(z_sbp_10000)
## [1] 0.735812
It’s percentile is 73.6%.
In other words,
29% of samples are having greater value than 130
71% of samples are having smaller SBP value than 130
Meanwhile, the actural 73.6th percentile of our dataset (sbp_10000) was 130. Why?
quantile(sbp_10000$., 0.736)
## 73.6%
## 130
Normalization
min-max normalization makes minimum vale to the 0 and maximum value to 1.
dataset_sbp$norm_SBP <- (dataset_sbp$SBP - min(dataset_sbp$SBP))/
(max(dataset_sbp$SBP) - min(dataset_sbp$SBP))
ggplot(data = dataset_sbp) +
geom_histogram(aes(x = SBP,
y = ..count..),
binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of SBP of 10000 Koreans") +
ylab("Count") +
theme_classic(base_size = 20, base_family = "serif") +
xlab("SBP (mmHg)")
ggplot(data = dataset_sbp) +
geom_histogram(aes(x = norm_SBP,
y = ..count..),
binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of Min-Max Normalized\nSBP of 10000 Koreans") +
ylab("Count") +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)")
# stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$norm_SBP),
# sd = sd(dataset_sbp$norm_SBP)))
Standardization
Standardization makes mean to 0 and SD into 1.
dataset_sbp$std_SBP <- (dataset_sbp$SBP - mean(dataset_sbp$SBP))/
sd(dataset_sbp$SBP)
ggplot(data = dataset_sbp) +
geom_histogram(aes(x = std_SBP,
y = ..count..),
binwidth = 0.3,
fill = "white",
col = "black") +
ggtitle("Histogram of standardized\nSBP of 10000 Koreans") +
ylab("Probability") +
xlab("Standardized SBP (mmHg)") +
theme_classic(base_family = "serif", base_size = 20)
#stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$std_SBP), sd = sd(dataset_sbp$std_SBP)))
Skewed data and z-score
We can calculate z-score of skewed data, but their calculation does not match with our assumption “the data is approximately normal”, therefore it results in wrong predictions.
dataset_sbp$FBS %>%
hist(probability = T, main = "Histogram of blood sugar level (FBS)", 15)
dataset_sbp$FBS %>% mean
## [1] 98.86443
dataset_sbp$FBS %>% sd
## [1] 22.9813
dataset_sbp$FBS %>% median
## [1] 94
So, if we use z-score to guess the 2.5% percentile’s FBS level,
(dataset_sbp$FBS %>% mean) - 2 * (dataset_sbp$FBS %>% sd)
## [1] 52.90183
However, in actual database,
quantile(dataset_sbp$FBS, 0.025)
## 2.5%
## 74
Thee z-score cannot represent the actual dataset.
CIs of sample proportion
The Confidence interval of sample proportion: employ exact test.
Here is an example how to calculate the CIs for p-hat in R.
#install.packages("interpretCI")
library(interpretCI)
#Normal approximation (2 sd)
x=propCI(n = 20, p = 0.15, alpha = 0.05)
x
## $data
## # A tibble: 1 × 1
## value
## <lgl>
## 1 NA
##
## $result
## alpha n df p P se critical ME lower upper
## 1 0.05 20 19 0.15 0 0.0798436 1.959964 0.1564906 -0.006490575 0.3064906
## CI z pvalue alternative
## 1 0.15 [95CI -0.01; 0.31] 1.878673 0.06028917 two.sided
##
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05)
##
## attr(,"measure")
## [1] "prop"
#Normal approximation
x=propCI(n = 20, p = 0.15, alpha = 0.05, alternative = "wald")
x
## $data
## # A tibble: 1 × 1
## value
## <lgl>
## 1 NA
##
## $result
## alpha n df p P se critical ME lower upper
## 1 0.05 20 19 0.15 0 0.0798436 1.644854 0.131331 0.01866897 0.281331
## CI z pvalue alternative
## 1 0.15 [95CI 0.02; 0.28] 1.878673 0.03014459 wald
##
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05, alternative = "wald")
##
## attr(,"measure")
## [1] "prop"
#exact method - Clopper-Pearson interval.
binom.test(x = 20 * 0.15, n = 20, conf.level = 0.95)
##
## Exact binomial test
##
## data: 20 * 0.15 and 20
## number of successes = 3, number of trials = 20, p-value = 0.002577
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.03207094 0.37892683
## sample estimates:
## probability of success
## 0.15
Creating a New Variable: Hypertension
Logical Variables in R:
Logical variables store TRUE or FALSE values. These variables are useful in decision-making processes (like identifying conditions where blood pressure exceeds a threshold). In R, the result of comparisons such as > or == is always logical (TRUE or FALSE).
Using ifelse() to Create Logical Variables:
The ifelse() function is a vectorized conditional function in R that checks a condition for each element in a vector. It returns one value if the condition is TRUE and another value if it is FALSE.
Syntax: ifelse(condition, value_if_TRUE, value_if_FALSE)
Below, we use ifelse() to create a logical variable called hypertension, which will be TRUE if either the systolic blood pressure (SBP) is greater than 130 or the diastolic blood pressure (DBP) is greater than 80, and FALSE otherwise.
Use of str() to validate the datatype
From the last lecture (weeke 6-2), we’ve leared how to check type of a data in R.
a <- 1
a
## [1] 1
str(a)
## num 1
b <- c(1, 2, 3)
b <- c(1:3)
str(b)
## int [1:3] 1 2 3
a
## [1] 1
b
## [1] 1 2 3
c <- c(a, b)
c
## [1] 1 1 2 3
c <- c("a", "b", "c")
str(c)
## chr [1:3] "a" "b" "c"
Logical values
From (Homework - reading) Week 3-2 markdown URL, we’ve learned what is logical variables.
Logical values…having logical values.
a <- 1
a = 1
a == 1
## [1] TRUE
a
## [1] 1
= does the same thing as <-.
To test if the thing are same, R uses ==.
a
## [1] 1
1
## [1] 1
a == 1
## [1] TRUE
as we inserted a <- 1 in the previous code chunk,
this test results in TRUE
"a"
## [1] "a"
1
## [1] 1
"a" > 1
## [1] TRUE
"a" < 1
## [1] FALSE
"a" == 1
## [1] FALSE
"a" != 1
## [1] TRUE
!("a" == 1)
## [1] TRUE
This test will test whether a character, "a", is the
same with a numeric value 1. As they are not the same, it
returns FALSE
Here, TRUE and FALSE results are , and they
are one type of binary variable. It works for longer
vectors or variables as well.
c(1, 2, 3, 4, 5) == c(1, 2, 2, 4, 5)
## [1] TRUE TRUE FALSE TRUE TRUE
c(1, 2, 3, 4, 5) == 1
## [1] TRUE FALSE FALSE FALSE FALSE
And it results in a vector of all the logical tests. Using this, we can filter data easily!
T/F often converted to binary variable.
Therfore R recognizes T as 1 and F as 0 as well.
str(c(1,2,3))
## num [1:3] 1 2 3
str(c("one","two","three"))
## chr [1:3] "one" "two" "three"
str(c("1","2","3"))
## chr [1:3] "1" "2" "3"
as.numeric(c("1","2","3"))
## [1] 1 2 3
str(as.numeric(c("1","2","3")))
## num [1:3] 1 2 3
1 == 1
## [1] TRUE
TRUE == "1"
## [1] FALSE
TRUE == 1
## [1] TRUE
ifelse() does the same as if from excel.
ifelse(TRUE, "Right", "Wrong")
## [1] "Right"
ifelse(FALSE, "Right", "Wrong")
## [1] "Wrong"
ifelse(1 > 0, 1, 11)
## [1] 1
ifelse(1 < 0, 1, 11)
## [1] 11
ifelse(13 > 20, "Right", "Wrong")
## [1] "Wrong"
ifelse(13 == 20 | 1 > 0, "Right", "Wrong")
## [1] "Right"
AND and OR
In R, | is “OR” and & is “AND”.
• If either condition is true, the result will be TRUE.
• If both conditions are false, the result will be FALSE.
#AND
TRUE & TRUE
## [1] TRUE
TRUE & FALSE
## [1] FALSE
FALSE & FALSE
## [1] FALSE
#OR
TRUE | TRUE
## [1] TRUE
FALSE | TRUE
## [1] TRUE
FALSE | FALSE
## [1] FALSE
#AND
(1 == 1) & (2 < 3)
## [1] TRUE
(1 == 1) & (2 > 3)
## [1] FALSE
#OR
(1 == 1) | (2 < 3)
## [1] TRUE
(1 == 1) | (2 > 3)
## [1] TRUE
(1 == 2) | (2 > 3)
## [1] FALSE
Using ifelse(), &, and |,
we can create a new variable called hypertension
# Create 'hypertension' variable based on SBP and DBP
dataset_sbp$SBP %>% head
## [1] 116 100 100 111 120 115
dataset_sbp$DBP %>% head
## [1] 78 60 60 70 80 79
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
dataset_sbp$DBP > 80,
T,
F)
In the code, | is the logical OR operator. The
condition checks if either SBP > 130 or DBP > 80 is
true.
This new logical variable can now be used to classify individuals as hypertensive or non-hypertensive.
Understanding Normal Distribution Functions
- rnorm() – Random Sampling from a Normal Distribution
This function generates random numbers following a normal distribution.
# Generate 5 random numbers from a standard normal distribution (mean = 0, sd = 1)
rnorm(5)
## [1] 0.23757562 0.55610458 0.04469922 1.00026278 0.75550442
# Example with custom mean and standard deviation
# rnorm(100, mean = 100, sd = 1)
- pnorm() – Cumulative Probability for Z-scores (z-score table results)
pnorm() gives the cumulative probability for a given z-score (i.e., the area under the curve up to that point).
# Cumulative probabilities for different z-scores
pnorm(2) # ~97.72% of the data lies below a z-score of 2
## [1] 0.9772499
pnorm(0) # 50% of the data lies below the mean
## [1] 0.5
pnorm(1) # ~84.13% of the data is below a z-score of 1
## [1] 0.8413447
pnorm(3) # ~99.87% of the data is below a z-score of 3
## [1] 0.9986501
- qnorm() – Find Z-scores from Cumulative Probability
qnorm() returns the z-score corresponding to a given cumulative probability.
# Z-scores for given probabilities
qnorm(0.97724985) # Returns ~2, the z-score for 97.72%
## [1] 2
qnorm(0.5) # Returns 0, the z-score for 50%
## [1] 0
Note: pnorm() and qnorm() are
reverse operations of each other.
# Calculating density at different points
dnorm(0) # Height at the mean
## [1] 0.3989423
dnorm(1) # Height at z = 1
## [1] 0.2419707
dnorm(-1) # Height at z = -1
## [1] 0.2419707
dnorm() - probability density function, the
height of normal curve.
dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(-1)
## [1] 0.2419707
dnorm(10000000000000000000000)
## [1] 0
Plotting the Normal Distribution
Plotting Discrete Values from dnorm().
Using sequences, add dnorm, the calculated height of the normal distribution function
seq(-4, 4, by = 0.5) # generates seqeunces
## [1] -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0
## [16] 3.5 4.0
dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(-1)
## [1] 0.2419707
dnorm(c(0, 0, 0, 1))
## [1] 0.3989423 0.3989423 0.3989423 0.2419707
dnorm(seq(-4, 4, by = 0.5))# Height of normal distribution function of the sequences as x values
## [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
## [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
# Generate sequence and plot dnorm results
x_vals <- seq(-4, 4, by = 0.5) #storing the data
dnorm(seq(-4, 4, by = 0.5))# Height of normal distribution function of the sequences as x values
## [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
## [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
dnorm(x_vals)
## [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
## [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
x_vals
## [1] -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0
## [16] 3.5 4.0
plot(x_vals ~ x_vals)
## Warning in plot.formula(x_vals ~ x_vals): the formula 'x_vals ~ x_vals' is
## treated as 'x_vals ~ 1'
dnorm(x_vals)
## [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
## [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
plot(dnorm(x_vals) ~ x_vals)
plot(dnorm(x_vals) ~ x_vals,
ylab = "dnorm() result",
xlab = "dnorm() input")
Ploting of a smooth normal curve
# Create a sequence of values and calculate their densities
x <- seq(-4, 4, length = 100)
y <- dnorm(x)
# Plot the normal curve with labeled x-axis
plot(x, y, type = "l", lwd = 2, axes = FALSE, xlab = "", ylab = "") # here, type l means line.
axis(1, at = -3:3, labels = c("-3σ", "-2σ", "-1σ", "mean", "1σ", "2σ", "3σ")) # adding axes texts
abline(v = 2, col = "red") # Add a vertical line at z = 2
abline(v = -2, col = "red") # Add a vertical line at z = -2
Logical Variables and Statistical Tests
Now that we’ve created the hypertension variable as a logical variable, we can use it in statistical tests.
Single-sample t-test (getting confidence interval of some data)
This tests if the mean of a sample is significantly different from a known value.
c(1, 2, 3)
## [1] 1 2 3
t.test(c(1, 2, 3))
##
## One Sample t-test
##
## data: c(1, 2, 3)
## t = 3.4641, df = 2, p-value = 0.07418
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4841377 4.4841377
## sample estimates:
## mean of x
## 2
result <- t.test(c(1, 2, 3))
result$p.value
## [1] 0.0741799
result$conf.int
## [1] -0.4841377 4.4841377
## attr(,"conf.level")
## [1] 0.95
result #a list of statistical test. Use $ to explore more!
##
## One Sample t-test
##
## data: c(1, 2, 3)
## t = 3.4641, df = 2, p-value = 0.07418
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.4841377 4.4841377
## sample estimates:
## mean of x
## 2
result <- t.test(dataset_sbp$SBP) # Extract the confidence interval
result$p.value
## [1] 0
result$conf.int
## [1] 121.8432 121.9003
## attr(,"conf.level")
## [1] 0.95
Two-Sample t-test:
This tests if two groups have significantly different means.
Now that we’ve created the hypertension variable as a logical variable, we can use it in statistical tests.
Two-Sample t-Test Example:
#t.test(dataset_sbp$SBP ~ dataset_sbp$SEX)
result <- t.test(SBP ~ SEX, # y ~ x
data = dataset_sbp) # Extract the confidence interval
result
##
## Welch Two Sample t-test
##
## data: SBP by SEX
## t = 170.86, df = 977496, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## 4.860551 4.973355
## sample estimates:
## mean in group 1 mean in group 2
## 124.280 119.363
result <- t.test(SBP ~ hypertension, data = dataset_sbp) # Extract the confidence interval
result
##
## Welch Two Sample t-test
##
## data: SBP by hypertension
## t = -928.38, df = 551223, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
## -21.87828 -21.78610
## sample estimates:
## mean in group FALSE mean in group TRUE
## 114.9553 136.7875
Z-Test for Proportions
one-sample z-test using z.test()
Test if the proportion of a sample is different from a given proportion.
# Install and load the BSDA package
#install.packages("BSDA")
library(BSDA)
## Loading required package: lattice
##
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
str(dataset_sbp$hypertension)
## logi [1:1000000] FALSE FALSE FALSE FALSE FALSE FALSE ...
mean(dataset_sbp$hypertension)
## [1] 0.316803
# Perform a one-sample z-test
result <- z.test(dataset_sbp$hypertension,
sigma.x = mean(dataset_sbp$hypertension) *
(1-mean(dataset_sbp$hypertension))
)
result
##
## One-sample z-Test
##
## data: dataset_sbp$hypertension
## z = 1463.7, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.3163788 0.3172272
## sample estimates:
## mean of x
## 0.316803
Hypertention table
dataset_sbp$BTH_G %>% head
## [1] 1 1 1 1 1 1
dataset_sbp$SEX %>% head
## [1] 1 1 1 1 1 1
dataset_sbp$DIS %>% head
## [1] 4 4 4 4 4 4
table(dataset_sbp$SEX)
##
## 1 2
## 510227 489773
table(dataset_sbp$DIS)
##
## 1 2 3 4
## 53398 162826 43114 740662
table(dataset_sbp$BTH_G)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## 26699 22398 26925 30595 34984 33797 29666 30074 53811 50787 50881 49544 46303
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## 46373 52352 53382 47760 45048 37174 33846 30824 32253 22906 22720 24530 18463
## 27
## 45905
table(dataset_sbp$SEX, dataset_sbp$hypertension)
##
## FALSE TRUE
## 1 322199 188028
## 2 360998 128775
Two-Sample Z-Test:
two-sample z-test
# Subset the data by sex
library(tidyverse)
dataset_male <- dataset_sbp %>%
subset(., .$SEX == 1)
dataset_male <- dataset_sbp %>%
filter(SEX == 1)
dataset_female <- dataset_sbp %>% subset(., .$SEX == 2)
# Perform two-sample z-test
result <- z.test(x = dataset_male$hypertension,
y = dataset_female$hypertension,
sigma.x = mean(dataset_male$hypertension) * (1-mean(dataset_male$hypertension)),
sigma.y = mean(dataset_female$hypertension) * (1-mean(dataset_female$hypertension))
)
result
##
## Two-sample z-Test
##
## data: dataset_male$hypertension and dataset_female$hypertension
## z = 246.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1047524 0.1064284
## sample estimates:
## mean of x mean of y
## 0.3685183 0.2629279
Chi-square test for independence
Two-group Chi-square test
Check if two categorical variables are independent.
# Chi-square test between hypertension and sex
result <- chisq.test(y = dataset_sbp$hypertension,
x = dataset_sbp$SEX)
result$statistic
## X-squared
## 12872.28
result$p.value
## [1] 0
Note: p-value is the same as z-score test
Chi-square test for multiple groups
Check relationships between multiple categorical variables.
#https://www.data.go.kr/data/15095105/fileData.do?recommendDataYn=Y
# - DIS : 고혈압/당뇨병 진료여부
# 고혈압/당뇨병 진료내역 있음: 1
# 고혈압 진료내역 있음: 2
# 당뇨병 진료내역 있음: 3
# 고혈압/당뇨병 진료내역 없음: 4
table(dataset_sbp$SEX, dataset_sbp$DIS)
##
## 1 2 3 4
## 1 27979 79892 23900 378456
## 2 25419 82934 19214 362206
# Chi-square test between sex and disease status
result <- chisq.test(x = dataset_sbp$DIS,
y = dataset_sbp$SEX)
result$p.value
## [1] 1.217887e-135
result$statistic
## X-squared
## 627.2968
result$expected
## dataset_sbp$SEX
## dataset_sbp$DIS 1 2
## 1 27245.10 26152.90
## 2 83078.22 79747.78
## 3 21997.93 21116.07
## 4 377905.75 362756.25
result$stdres
## dataset_sbp$SEX
## dataset_sbp$DIS 1 2
## 1 6.529963 -6.529963
## 2 -17.263430 17.263430
## 3 18.733064 -18.733064
## 4 2.511525 -2.511525
Fisher’s exact test
Fisher’s exact test is used when sample sizes are small.
#Author DataFlair
data_frame <- read.csv("https://goo.gl/j6lRXD")
data_frame %>% head
#Reading CSV
data <- table(data_frame$treatment, data_frame$improvement)
fisher.test(data)
##
## Fisher's Exact Test for Count Data
##
## data: data
## p-value = 0.02889
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.1582959 0.9212947
## sample estimates:
## odds ratio
## 0.3878601
# or
# Perform Fisher's exact test on the hypertension and sex table
#table(dataset_sbp$SEX, dataset_sbp$hypertension)
fisher.test(table(dataset_sbp$SEX, dataset_sbp$hypertension))
##
## Fisher's Exact Test for Count Data
##
## data: table(dataset_sbp$SEX, dataset_sbp$hypertension)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6060831 0.6165366
## sample estimates:
## odds ratio
## 0.611277
#Author DataFlair
table(dataset_sbp$SEX,
dataset_sbp$hypertension)
##
## FALSE TRUE
## 1 322199 188028
## 2 360998 128775
fisher.test(table(dataset_sbp$SEX,
dataset_sbp$hypertension))
##
## Fisher's Exact Test for Count Data
##
## data: table(dataset_sbp$SEX, dataset_sbp$hypertension)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6060831 0.6165366
## sample estimates:
## odds ratio
## 0.611277
table(dataset_sbp$SEX,
dataset_sbp$hypertension) %>%
fisher.test()
##
## Fisher's Exact Test for Count Data
##
## data: .
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6060831 0.6165366
## sample estimates:
## odds ratio
## 0.611277
As sample number is high, Fisher’s exact test results are having the same result as Chi-square / z-test.
Confidence intervals and statistical tests using epitools
epitools pacakge automates all the calculations and
tests! Especially for risk ratios and odds ratios.
# Install and load epitools
#install.packages("epitools")
library(epitools)
data
##
## improved not-improved
## not-treated 26 29
## treated 35 15
# Calculate risk ratio with confidence interval
riskratio.wald(data)
## $data
##
## improved not-improved Total
## not-treated 26 29 55
## treated 35 15 50
## Total 61 44 105
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## not-treated 1.0000000 NA NA
## treated 0.5689655 0.3479293 0.930424
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## not-treated NA NA NA
## treated 0.02013718 0.02888541 0.01840777
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
# Calculate odds ratio with confidence interval
oddsratio.wald(data)
## $data
##
## improved not-improved Total
## not-treated 26 29 55
## treated 35 15 50
## Total 61 44 105
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## not-treated 1.0000000 NA NA
## treated 0.3842365 0.1719968 0.8583743
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## not-treated NA NA NA
## treated 0.02013718 0.02888541 0.01840777
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
(Extra)
Degree of freedom
Simulation of repeated sampling & mean values
With smaller sampling number, the variance gets smaller
set.seed(1)
replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
data.frame() %>%
colMeans() %>%
hist(100,
main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
stat_function(fun = dnorm,
args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## NULL
Distribution of 1 M
set.seed(1)
ggplot(data = dataset_sbp) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of 1M Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 757 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Distribution of 50
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(50)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 50 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
### Distribution of 30
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(30)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 30 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Distribution of 20
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(20)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 20 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Distribution of 10
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(10)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 10 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Distribution of 5
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(5)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 5 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Distribution of 2
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(2)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
fill = "white",
col = "black") +
ggtitle("Histogram of \nSBP of random 2 Koreans") +
ylab("Count") +
xlim(c(75, 180)) +
theme_classic(base_size = 20, base_family = "serif") +
xlab("Normalized SBP (mmHg)") +
stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Comparing data
To compare two different sample groups, we can caculate the difference between samples and look at CIs of difference between them. The schematic diagram of how this process works is listed in below figures. Note: this is not a exact statistical test but drwan to help understanding the basic concept of it
Creating hypertension variable
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80,
1,
0)
Whole histogram
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
geom_histogram(aes(x = SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.8) +
ggtitle("Histogram of SBP") +
ylab("Proportion") +
xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).
color coded histogram
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
geom_histogram(aes(x = SBP,
y = ..density..,
fill=factor(hypertension)),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.4) +
ggtitle("Histogram of SBP by two group") +
ylab("Proportion") +
xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Color coded histogram with normal distribution
set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
geom_histogram(aes(x = SBP,
y = ..density..,
fill=factor(hypertension)),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP by two group with normal curve") +
ylab("Proportion") +
xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
stat_function(fun = dnorm,
args = list(mean = mean(subset(dataset_sbp, hypertension == 1)$SBP),
sd = sd(subset(dataset_sbp, hypertension == 1)$SBP)),
col = "blue") +
stat_function(fun = dnorm,
args = list(mean = mean(subset(dataset_sbp, hypertension == 0)$SBP),
sd = sd(subset(dataset_sbp, hypertension == 0)$SBP)),
col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).
Histogram of differences
set.seed(1)
high <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 1) %>% sample_n(100)
set.seed(1)
low <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 0) %>% sample_n(100)
ggplot() +
geom_histogram(aes(x = high$SBP-low$SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP **difference** by two group with normal curve") +
ylab("Proportion") +
#xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
theme(plot.title = ggtext::element_markdown()) +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Distribution of differences
ggplot() +
geom_histogram(aes(x = high$SBP-low$SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP **difference** by two group with normal curve") +
ylab("Proportion") +
#xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
theme(plot.title = ggtext::element_markdown()) +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
stat_function(fun = dnorm,
args = list(mean = mean(high$SBP-low$SBP),
sd = sd(high$SBP-low$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot() +
geom_histogram(aes(x = high$SBP-low$SBP,
y = ..density..),
#binwidth = 0.01,
#fill = "white",
#col = "black",
alpha = 0.2) +
ggtitle("Histogram of SBP **difference** by two group with normal curve") +
ylab("Proportion") +
#xlim(c(75, 180)) +
theme_classic(base_size = 15, base_family = "serif") +
theme(plot.title = ggtext::element_markdown()) +
guides(fill = guide_legend(title = "Hypertension")) +
xlab("SBP (mmHg)") +
stat_function(fun = dnorm,
args = list(mean = mean(high$SBP-low$SBP),
sd = sd(high$SBP-low$SBP))) +
geom_vline(aes(xintercept = 0), color = "red", linetype = "dashed", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
t.test(high$SBP, low$SBP, paired = F)
##
## Welch Two Sample t-test
##
## data: high$SBP and low$SBP
## t = 14.769, df = 195.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 18.67229 24.42771
## sample estimates:
## mean of x mean of y
## 136.81 115.26
Bibliography
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.