COD_2025_week7_1_MGK_BTE3207

Minsik Kim

2025-10-13

Loading packages

#===============================================================================
#BTC.LineZero.Header.1.1.0
#===============================================================================
#R Markdown environment setup and reporting utility.
#===============================================================================
#RLB.Dependencies:
#   knitr, magrittr, pacman, rio, rmarkdown, rmdformats, tibble, yaml
#===============================================================================
#Input for document parameters, libraries, file paths, and options.
#=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=

# --- working dir:
host <- Sys.info()[["nodename"]]

paths <- list(
  "bagel-S2600CO" = list(#Minsik's lab server
    work = "/mnt/4T_samsung/Dropbox/",
    lib  = "/home/bagel/R_lib/"
  ),
  "Minsiks-Mac-mini-2.local" = list(#Minsik's mac mini
    work = "/Volumes/macdrive/Dropbox/",
    lib  = "/Library/Frameworks/R.framework/Resources/library/"
  ),
  "Minsiks-MacBook-Pro.local|Minsiks-MBP.ht.home" = list(#Minsik's macbook pro
    work = "/Users/minsikkim/Dropbox (Personal)/",
    lib  = "/Library/Frameworks/R.framework/Resources/library/"
  ),
  "MGB030975" = list(#LaiLab mac mini
    work = "/Volumes/Kingston/Dropbox/",
    lib  = "/Library/Frameworks/R.framework/Resources/library/"
  )
)

pick_cfg <- function(host, map) {
  if (!is.null(map[[host]])) return(map[[host]])
  for (k in names(map)) if (grepl(k, host, ignore.case = TRUE)) return(map[[k]])
  NULL
}
cfg <- pick_cfg(host, paths)
if (is.null(cfg)) stop("No path for the host: ", host)

path_working <- cfg$work
path_library <- cfg$lib

# Adding libraries
knitr::opts_knit$set(root.dir = path_working)

str_libraries <- c("yaml", "pacman")

        
YAML_header <-
'---
title: "COD 2025 BTE3207 Week 7-1"
author: "Minsik Kim"
date: "2025.10.13"
output:
    rmdformats::downcute:
        downcute_theme: "chaos"
        code_folding: hide
        fig_width: 8
        fig_height: 8
---'

seed <- "20251013"

#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Loads libraries, file paths, and other document options.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
FUN.LineZero.Boot <- function() {
    setwd(path_working)
    .libPaths(c(path_library, setdiff(.libPaths(), path_library)))
    require(pacman)
    pacman::p_load(c("knitr", "rmarkdown", "rmdformats", "yaml"))
    knitr::opts_knit$set(root.dir = path_working)
    str_libraries <<- unique(sort(str_libraries))
    pacman::p_load(char = str_libraries)
    set.seed(as.integer(seed))
}

FUN.LineZero.Boot()
## Loading required package: pacman
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
#Outputs R environment report.
#=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
FUN.LineZero.Report <- function() {
    cat("Line Zero Environment:\n\n")
    paste("R:", pacman::p_version(), "\n") |> cat()
    cat("Libraries:\n")
    for (str_libraries in str_libraries) {
        paste(
            "    ", str_libraries, ": ", pacman::p_version(package = str_libraries),
            "\n", sep = ""
        ) |> cat()
    }
    paste("\nOperating System:", pacman::p_detectOS(), "\n") |> cat()
    paste("    Library Path:", path_library, "\n") |> cat()
    paste("    Working Path:", path_working, "\n") |> cat()
    paste("Seed:", seed, "\n\n") |> cat()
    cat("YAML Header:\n")
    cat(YAML_header)
}
FUN.LineZero.Report()
## Line Zero Environment:
## 
## R: 4.4.2 
## Libraries:
##     pacman: 0.5.1
##     yaml: 2.3.10
## 
## Operating System: Darwin 
##     Library Path: /Library/Frameworks/R.framework/Resources/library/ 
##     Working Path: /Volumes/macdrive/Dropbox/ 
## Seed: 20251013 
## 
## YAML Header:
## ---
## title: "COD 2025 BTE3207 Week 7-1"
## author: "Minsik Kim"
## date: "2025.10.13"
## output:
##     rmdformats::downcute:
##         downcute_theme: "chaos"
##         code_folding: hide
##         fig_width: 8
##         fig_height: 8
## ---

Before begin..

Let’s load the SBP dataset.

# download.file("https://raw.githubusercontent.com/minsiksudo/BTE3207_Advanced_Biostatistics/refs/heads/main/dataset/sbp_dataset_korea_2013-2014.csv", destfile = "sbp_dataset.csv")

dataset_sbp <- read.csv(file = "Inha/5_Lectures/1_BTE3207_Biostats/2025F/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")

head(dataset_sbp)

The normal distribution

set.seed(1)
sim_100_5 <- rnorm(1000, mean = 100, sd = 5) #makes a data with 1000 elements having mean of 100 and sd of 5

hist(sim_100_5, 100, probability = T) #how does it look like?
curve(dnorm(x, mean=100, sd=5), 
      col="darkblue", lwd=2, add=TRUE, yaxt="n")

mean(sim_100_5) 
## [1] 99.94176
sd(sim_100_5)
## [1] 5.174579

normal distribution

Let’s compare some data collected from real people (random 100 Koreans) with the normal distribution

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
set.seed(1)
sbp_10000 <- sample(dataset_sbp$SBP, 10000) %>%
        data.frame()

hist_sbp_10000 <- ggplot(data = sbp_10000) +
        geom_histogram(aes(x = .,
                           y =..density..),
                       bins = 15,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of SBP of 10000 Koreans") +
        ylab("Probability") +
        xlab("SBP (mmHg)")

hist_sbp_10000
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

hist_sbp_10000 + 
        stat_function(fun = dnorm, args = list(mean = mean(sbp_10000$.), sd = sd(sbp_10000$.)))

sbp_10000$. %>% mean
## [1] 121.7429
sbp_10000$. %>% sd
## [1] 14.68246
sbp_10000$. %>% median
## [1] 120

Histogram of sample means

set.seed(1)
        
sbp_5000 <- replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
        data.frame() %>% 
        colMeans() %>% 
        as.numeric()

sbp_5000 %>% 
        #colMeans() %>%
        hist(main = "Distribution of 5000 sample mean SBP from 1M subjects.\nEach sample N = 150",
             probability = T) +
        stat_function(fun = dnorm, args = list(mean = mean(sbp_5000), sd = sd(sbp_5000))) 
## NULL
curve(dnorm(x, mean = mean(sbp_5000), sd = sd(sbp_5000)), 
      col="darkblue", lwd=2, add=TRUE, yaxt="n")

sbp_5000 %>% mean
## [1] 121.8551
sbp_5000 %>% sd
## [1] 1.168471
sbp_5000 %>% median
## [1] 121.8533

z-score

How can we say the location of a person with 130 mmHg of SBP, in terms of distribution? We usually calculate z-score, to understand the location of one sample more easilty.

z_sbp_10000 <- (131-mean(sbp_10000$.))/sd(sbp_10000$.)

z_sbp_10000
## [1] 0.6304871

Here, the z-score of a person with SBP of 131 is 0.63.

Using this information, we cancalculate the percentile of him (assuming the SBP dataset is following the normal distribution)

pnorm(z_sbp_10000)
## [1] 0.735812

It’s percentile is 73.6%.

In other words,

  1. 29% of samples are having greater value than 130

  2. 71% of samples are having smaller SBP value than 130

Meanwhile, the actural 73.6th percentile of our dataset (sbp_10000) was 130. Why?

quantile(sbp_10000$., 0.736)
## 73.6% 
##   130

Normalization

min-max normalization makes minimum vale to the 0 and maximum value to 1.

dataset_sbp$norm_SBP <- (dataset_sbp$SBP - min(dataset_sbp$SBP))/
        (max(dataset_sbp$SBP) - min(dataset_sbp$SBP))


ggplot(data = dataset_sbp) +
        geom_histogram(aes(x = SBP,
                           y = ..count..),
                       binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of SBP of 10000 Koreans") +
        ylab("Count") +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("SBP (mmHg)")

ggplot(data = dataset_sbp) +
        geom_histogram(aes(x = norm_SBP,
                           y = ..count..),
                       binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of Min-Max Normalized\nSBP of 10000 Koreans") +
        ylab("Count") +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)")

#        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$norm_SBP),
#                                               sd = sd(dataset_sbp$norm_SBP)))

Standardization

Standardization makes mean to 0 and SD into 1.

dataset_sbp$std_SBP <- (dataset_sbp$SBP - mean(dataset_sbp$SBP))/
        sd(dataset_sbp$SBP)

ggplot(data = dataset_sbp) +
        geom_histogram(aes(x = std_SBP,
                           y = ..count..),
                       binwidth = 0.3,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of standardized\nSBP of 10000 Koreans") +
        ylab("Probability") +
        xlab("Standardized SBP (mmHg)") +
        theme_classic(base_family = "serif", base_size = 20) 

        #stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$std_SBP), sd = sd(dataset_sbp$std_SBP)))

Skewed data and z-score

We can calculate z-score of skewed data, but their calculation does not match with our assumption “the data is approximately normal”, therefore it results in wrong predictions.

dataset_sbp$FBS %>%
        hist(probability = T, main = "Histogram of blood sugar level (FBS)", 15)

dataset_sbp$FBS %>% mean
## [1] 98.86443
dataset_sbp$FBS %>% sd
## [1] 22.9813
dataset_sbp$FBS %>% median
## [1] 94

So, if we use z-score to guess the 2.5% percentile’s FBS level,

(dataset_sbp$FBS %>% mean) - 2 * (dataset_sbp$FBS %>% sd)
## [1] 52.90183

However, in actual database,

quantile(dataset_sbp$FBS, 0.025)
## 2.5% 
##   74

Thee z-score cannot represent the actual dataset.

CIs of sample proportion

The Confidence interval of sample proportion: employ exact test.

Here is an example how to calculate the CIs for p-hat in R.

#install.packages("interpretCI")
library(interpretCI)

#Normal approximation (2 sd)
x=propCI(n = 20, p = 0.15, alpha = 0.05)

x
## $data
## # A tibble: 1 × 1
##   value
##   <lgl>
## 1 NA   
## 
## $result
##   alpha  n df    p P        se critical        ME        lower     upper
## 1  0.05 20 19 0.15 0 0.0798436 1.959964 0.1564906 -0.006490575 0.3064906
##                        CI        z     pvalue alternative
## 1 0.15 [95CI -0.01; 0.31] 1.878673 0.06028917   two.sided
## 
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05)
## 
## attr(,"measure")
## [1] "prop"
#Normal approximation 
x=propCI(n = 20, p = 0.15, alpha = 0.05, alternative =  "wald")

x
## $data
## # A tibble: 1 × 1
##   value
##   <lgl>
## 1 NA   
## 
## $result
##   alpha  n df    p P        se critical       ME      lower    upper
## 1  0.05 20 19 0.15 0 0.0798436 1.644854 0.131331 0.01866897 0.281331
##                       CI        z     pvalue alternative
## 1 0.15 [95CI 0.02; 0.28] 1.878673 0.03014459        wald
## 
## $call
## propCI(n = 20, p = 0.15, alpha = 0.05, alternative = "wald")
## 
## attr(,"measure")
## [1] "prop"
#exact method - Clopper-Pearson interval.
binom.test(x = 20 * 0.15, n = 20, conf.level = 0.95)
## 
##  Exact binomial test
## 
## data:  20 * 0.15 and 20
## number of successes = 3, number of trials = 20, p-value = 0.002577
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.03207094 0.37892683
## sample estimates:
## probability of success 
##                   0.15

Creating a New Variable: Hypertension

Logical Variables in R:

Logical variables store TRUE or FALSE values. These variables are useful in decision-making processes (like identifying conditions where blood pressure exceeds a threshold). In R, the result of comparisons such as > or == is always logical (TRUE or FALSE).

Using ifelse() to Create Logical Variables:

The ifelse() function is a vectorized conditional function in R that checks a condition for each element in a vector. It returns one value if the condition is TRUE and another value if it is FALSE.

Syntax: ifelse(condition, value_if_TRUE, value_if_FALSE)

Below, we use ifelse() to create a logical variable called hypertension, which will be TRUE if either the systolic blood pressure (SBP) is greater than 130 or the diastolic blood pressure (DBP) is greater than 80, and FALSE otherwise.

Use of str() to validate the datatype

From the last lecture (weeke 6-2), we’ve leared how to check type of a data in R.

a <- 1

a
## [1] 1
str(a)
##  num 1
b <- c(1, 2, 3)
b <- c(1:3)

str(b)
##  int [1:3] 1 2 3
a
## [1] 1
b
## [1] 1 2 3
c <- c(a, b)
c
## [1] 1 1 2 3
c <- c("a", "b", "c")

str(c)
##  chr [1:3] "a" "b" "c"

Logical values

From (Homework - reading) Week 3-2 markdown URL, we’ve learned what is logical variables.

Logical values…having logical values.

a <- 1
a = 1
a == 1
## [1] TRUE
a
## [1] 1

= does the same thing as <-.

To test if the thing are same, R uses ==.

a
## [1] 1
1
## [1] 1
a == 1
## [1] TRUE

as we inserted a <- 1 in the previous code chunk, this test results in TRUE

"a"
## [1] "a"
1
## [1] 1
"a" > 1
## [1] TRUE
"a" < 1
## [1] FALSE
"a" == 1
## [1] FALSE
"a" != 1
## [1] TRUE
!("a" == 1)
## [1] TRUE

This test will test whether a character, "a", is the same with a numeric value 1. As they are not the same, it returns FALSE

Here, TRUE and FALSE results are , and they are one type of binary variable. It works for longer vectors or variables as well.

c(1, 2, 3, 4, 5) == c(1, 2, 2, 4, 5)
## [1]  TRUE  TRUE FALSE  TRUE  TRUE
c(1, 2, 3, 4, 5) == 1
## [1]  TRUE FALSE FALSE FALSE FALSE

And it results in a vector of all the logical tests. Using this, we can filter data easily!

T/F often converted to binary variable. Therfore R recognizes T as 1 and F as 0 as well.

str(c(1,2,3))
##  num [1:3] 1 2 3
str(c("one","two","three"))
##  chr [1:3] "one" "two" "three"
str(c("1","2","3"))
##  chr [1:3] "1" "2" "3"
as.numeric(c("1","2","3"))
## [1] 1 2 3
str(as.numeric(c("1","2","3")))
##  num [1:3] 1 2 3
1 == 1
## [1] TRUE
TRUE == "1"
## [1] FALSE
TRUE == 1
## [1] TRUE

ifelse() does the same as if from excel.

ifelse(TRUE, "Right", "Wrong")
## [1] "Right"
ifelse(FALSE, "Right", "Wrong")
## [1] "Wrong"
ifelse(1 > 0, 1, 11)
## [1] 1
ifelse(1 < 0, 1, 11)
## [1] 11
ifelse(13 > 20, "Right", "Wrong")
## [1] "Wrong"
ifelse(13 == 20 | 1 > 0, "Right", "Wrong")
## [1] "Right"

AND and OR

In R, | is “OR” and & is “AND”.

•   If either condition is true, the result will be TRUE.
•   If both conditions are false, the result will be FALSE.
#AND
TRUE & TRUE
## [1] TRUE
TRUE & FALSE
## [1] FALSE
FALSE & FALSE
## [1] FALSE
#OR

TRUE | TRUE
## [1] TRUE
FALSE | TRUE
## [1] TRUE
FALSE | FALSE
## [1] FALSE
#AND

(1 == 1) & (2 < 3)
## [1] TRUE
(1 == 1) & (2 > 3)
## [1] FALSE
#OR

(1 == 1) | (2 < 3)
## [1] TRUE
(1 == 1) | (2 > 3)
## [1] TRUE
(1 == 2) | (2 > 3)
## [1] FALSE

Using ifelse(), &, and |, we can create a new variable called hypertension

# Create 'hypertension' variable based on SBP and DBP

dataset_sbp$SBP %>% head
## [1] 116 100 100 111 120 115
dataset_sbp$DBP %>% head
## [1] 78 60 60 70 80 79
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
                                           dataset_sbp$DBP > 80,
                                   T,
                                   F)

In the code, | is the logical OR operator. The condition checks if either SBP > 130 or DBP > 80 is true.

This new logical variable can now be used to classify individuals as hypertensive or non-hypertensive.

Understanding Normal Distribution Functions

  1. rnorm() – Random Sampling from a Normal Distribution

This function generates random numbers following a normal distribution.

# Generate 5 random numbers from a standard normal distribution (mean = 0, sd = 1)
rnorm(5)
## [1] 0.23757562 0.55610458 0.04469922 1.00026278 0.75550442
# Example with custom mean and standard deviation
# rnorm(100, mean = 100, sd = 1)
  1. pnorm() – Cumulative Probability for Z-scores (z-score table results)

pnorm() gives the cumulative probability for a given z-score (i.e., the area under the curve up to that point).

# Cumulative probabilities for different z-scores
pnorm(2)   # ~97.72% of the data lies below a z-score of 2
## [1] 0.9772499
pnorm(0)   # 50% of the data lies below the mean
## [1] 0.5
pnorm(1)   # ~84.13% of the data is below a z-score of 1
## [1] 0.8413447
pnorm(3)   # ~99.87% of the data is below a z-score of 3
## [1] 0.9986501
  1. qnorm() – Find Z-scores from Cumulative Probability

qnorm() returns the z-score corresponding to a given cumulative probability.

# Z-scores for given probabilities
qnorm(0.97724985)  # Returns ~2, the z-score for 97.72%
## [1] 2
qnorm(0.5)         # Returns 0, the z-score for 50%
## [1] 0

Note: pnorm() and qnorm() are reverse operations of each other.

# Calculating density at different points
dnorm(0)  # Height at the mean
## [1] 0.3989423
dnorm(1)  # Height at z = 1
## [1] 0.2419707
dnorm(-1) # Height at z = -1
## [1] 0.2419707

dnorm() - probability density function, the height of normal curve.

dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(-1)
## [1] 0.2419707
dnorm(10000000000000000000000)
## [1] 0

Plotting the Normal Distribution

Plotting Discrete Values from dnorm().

Using sequences, add dnorm, the calculated height of the normal distribution function

seq(-4, 4, by = 0.5) # generates seqeunces
##  [1] -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5  2.0  2.5  3.0
## [16]  3.5  4.0
dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(-1)
## [1] 0.2419707
dnorm(c(0, 0, 0, 1))
## [1] 0.3989423 0.3989423 0.3989423 0.2419707
dnorm(seq(-4, 4, by = 0.5))# Height of normal distribution function of the sequences as x values
##  [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
##  [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
# Generate sequence and plot dnorm results
x_vals <- seq(-4, 4, by = 0.5) #storing the data
dnorm(seq(-4, 4, by = 0.5))# Height of normal distribution function of the sequences as x values
##  [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
##  [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
dnorm(x_vals)
##  [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
##  [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
x_vals
##  [1] -4.0 -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5  0.0  0.5  1.0  1.5  2.0  2.5  3.0
## [16]  3.5  4.0
plot(x_vals ~ x_vals)
## Warning in plot.formula(x_vals ~ x_vals): the formula 'x_vals ~ x_vals' is
## treated as 'x_vals ~ 1'

dnorm(x_vals)
##  [1] 0.0001338302 0.0008726827 0.0044318484 0.0175283005 0.0539909665
##  [6] 0.1295175957 0.2419707245 0.3520653268 0.3989422804 0.3520653268
## [11] 0.2419707245 0.1295175957 0.0539909665 0.0175283005 0.0044318484
## [16] 0.0008726827 0.0001338302
plot(dnorm(x_vals) ~ x_vals)

plot(dnorm(x_vals) ~ x_vals,  
     ylab = "dnorm() result", 
     xlab = "dnorm() input")

Ploting of a smooth normal curve

# Create a sequence of values and calculate their densities

x <- seq(-4, 4, length = 100)
y <- dnorm(x)

# Plot the normal curve with labeled x-axis
plot(x, y, type = "l", lwd = 2, axes = FALSE, xlab = "", ylab = "") # here, type l means line.

axis(1, at = -3:3, labels = c("-3σ", "-2σ", "-1σ", "mean", "1σ", "2σ", "3σ")) # adding axes texts
abline(v = 2, col = "red")  # Add a vertical line at z = 2
abline(v = -2, col = "red") # Add a vertical line at z = -2

Logical Variables and Statistical Tests

Now that we’ve created the hypertension variable as a logical variable, we can use it in statistical tests.

Single-sample t-test (getting confidence interval of some data)

This tests if the mean of a sample is significantly different from a known value.

c(1, 2, 3)
## [1] 1 2 3
t.test(c(1, 2, 3))
## 
##  One Sample t-test
## 
## data:  c(1, 2, 3)
## t = 3.4641, df = 2, p-value = 0.07418
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.4841377  4.4841377
## sample estimates:
## mean of x 
##         2
result <- t.test(c(1, 2, 3))
result$p.value
## [1] 0.0741799
result$conf.int
## [1] -0.4841377  4.4841377
## attr(,"conf.level")
## [1] 0.95
result #a list of statistical test. Use $ to explore more!
## 
##  One Sample t-test
## 
## data:  c(1, 2, 3)
## t = 3.4641, df = 2, p-value = 0.07418
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.4841377  4.4841377
## sample estimates:
## mean of x 
##         2
result <- t.test(dataset_sbp$SBP) # Extract the confidence interval 
result$p.value
## [1] 0
result$conf.int
## [1] 121.8432 121.9003
## attr(,"conf.level")
## [1] 0.95

Two-Sample t-test:

This tests if two groups have significantly different means.

Now that we’ve created the hypertension variable as a logical variable, we can use it in statistical tests.

Two-Sample t-Test Example:

#t.test(dataset_sbp$SBP ~ dataset_sbp$SEX) 
result <- t.test(SBP ~ SEX, # y ~ x 
                 data = dataset_sbp) # Extract the confidence interval 
result
## 
##  Welch Two Sample t-test
## 
## data:  SBP by SEX
## t = 170.86, df = 977496, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
##  4.860551 4.973355
## sample estimates:
## mean in group 1 mean in group 2 
##         124.280         119.363
result <- t.test(SBP ~ hypertension, data = dataset_sbp) # Extract the confidence interval 
result
## 
##  Welch Two Sample t-test
## 
## data:  SBP by hypertension
## t = -928.38, df = 551223, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group FALSE and group TRUE is not equal to 0
## 95 percent confidence interval:
##  -21.87828 -21.78610
## sample estimates:
## mean in group FALSE  mean in group TRUE 
##            114.9553            136.7875

Z-Test for Proportions

one-sample z-test using z.test()

Test if the proportion of a sample is different from a given proportion.

# Install and load the BSDA package

#install.packages("BSDA")

library(BSDA)
## Loading required package: lattice
## 
## Attaching package: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange
str(dataset_sbp$hypertension)
##  logi [1:1000000] FALSE FALSE FALSE FALSE FALSE FALSE ...
mean(dataset_sbp$hypertension)
## [1] 0.316803
# Perform a one-sample z-test

result <- z.test(dataset_sbp$hypertension,
                 sigma.x = mean(dataset_sbp$hypertension) *
                         (1-mean(dataset_sbp$hypertension))
                 )


result
## 
##  One-sample z-Test
## 
## data:  dataset_sbp$hypertension
## z = 1463.7, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  0.3163788 0.3172272
## sample estimates:
## mean of x 
##  0.316803

Hypertention table

dataset_sbp$BTH_G %>% head
## [1] 1 1 1 1 1 1
dataset_sbp$SEX %>% head
## [1] 1 1 1 1 1 1
dataset_sbp$DIS %>% head
## [1] 4 4 4 4 4 4
table(dataset_sbp$SEX)
## 
##      1      2 
## 510227 489773
table(dataset_sbp$DIS)
## 
##      1      2      3      4 
##  53398 162826  43114 740662
table(dataset_sbp$BTH_G)
## 
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## 26699 22398 26925 30595 34984 33797 29666 30074 53811 50787 50881 49544 46303 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## 46373 52352 53382 47760 45048 37174 33846 30824 32253 22906 22720 24530 18463 
##    27 
## 45905
table(dataset_sbp$SEX, dataset_sbp$hypertension)
##    
##      FALSE   TRUE
##   1 322199 188028
##   2 360998 128775

Two-Sample Z-Test:

two-sample z-test

# Subset the data by sex


library(tidyverse)

dataset_male <- dataset_sbp %>% 
        subset(., .$SEX == 1)

dataset_male <- dataset_sbp %>% 
        filter(SEX == 1)


dataset_female <-  dataset_sbp %>% subset(., .$SEX == 2)

# Perform two-sample z-test

result <- z.test(x = dataset_male$hypertension,
                 y = dataset_female$hypertension,
                 sigma.x = mean(dataset_male$hypertension) * (1-mean(dataset_male$hypertension)),
                 sigma.y = mean(dataset_female$hypertension) * (1-mean(dataset_female$hypertension))
                 )


result
## 
##  Two-sample z-Test
## 
## data:  dataset_male$hypertension and dataset_female$hypertension
## z = 246.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1047524 0.1064284
## sample estimates:
## mean of x mean of y 
## 0.3685183 0.2629279

Chi-square test for independence

Two-group Chi-square test

Check if two categorical variables are independent.

# Chi-square test between hypertension and sex

result <- chisq.test(y = dataset_sbp$hypertension,
                     x = dataset_sbp$SEX)

result$statistic
## X-squared 
##  12872.28
result$p.value
## [1] 0

Note: p-value is the same as z-score test

Chi-square test for multiple groups

Check relationships between multiple categorical variables.

#https://www.data.go.kr/data/15095105/fileData.do?recommendDataYn=Y
# - DIS : 고혈압/당뇨병 진료여부 
# 고혈압/당뇨병 진료내역 있음: 1 
# 고혈압 진료내역 있음: 2
# 당뇨병 진료내역 있음: 3
# 고혈압/당뇨병 진료내역 없음: 4

table(dataset_sbp$SEX, dataset_sbp$DIS)
##    
##          1      2      3      4
##   1  27979  79892  23900 378456
##   2  25419  82934  19214 362206
# Chi-square test between sex and disease status

result <- chisq.test(x = dataset_sbp$DIS,
           y = dataset_sbp$SEX)

result$p.value
## [1] 1.217887e-135
result$statistic
## X-squared 
##  627.2968
result$expected
##                dataset_sbp$SEX
## dataset_sbp$DIS         1         2
##               1  27245.10  26152.90
##               2  83078.22  79747.78
##               3  21997.93  21116.07
##               4 377905.75 362756.25
result$stdres
##                dataset_sbp$SEX
## dataset_sbp$DIS          1          2
##               1   6.529963  -6.529963
##               2 -17.263430  17.263430
##               3  18.733064 -18.733064
##               4   2.511525  -2.511525

Fisher’s exact test

Fisher’s exact test is used when sample sizes are small.

#Author DataFlair

data_frame <- read.csv("https://goo.gl/j6lRXD")
data_frame %>% head
#Reading CSV
data <- table(data_frame$treatment, data_frame$improvement)
fisher.test(data)
## 
##  Fisher's Exact Test for Count Data
## 
## data:  data
## p-value = 0.02889
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.1582959 0.9212947
## sample estimates:
## odds ratio 
##  0.3878601
# or 

# Perform Fisher's exact test on the hypertension and sex table

#table(dataset_sbp$SEX, dataset_sbp$hypertension)
fisher.test(table(dataset_sbp$SEX, dataset_sbp$hypertension))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(dataset_sbp$SEX, dataset_sbp$hypertension)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.6060831 0.6165366
## sample estimates:
## odds ratio 
##   0.611277
#Author DataFlair
table(dataset_sbp$SEX,
                  dataset_sbp$hypertension)
##    
##      FALSE   TRUE
##   1 322199 188028
##   2 360998 128775
fisher.test(table(dataset_sbp$SEX,
                  dataset_sbp$hypertension))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table(dataset_sbp$SEX, dataset_sbp$hypertension)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.6060831 0.6165366
## sample estimates:
## odds ratio 
##   0.611277
table(dataset_sbp$SEX,
                  dataset_sbp$hypertension) %>%
        fisher.test()
## 
##  Fisher's Exact Test for Count Data
## 
## data:  .
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.6060831 0.6165366
## sample estimates:
## odds ratio 
##   0.611277

As sample number is high, Fisher’s exact test results are having the same result as Chi-square / z-test.

Confidence intervals and statistical tests using epitools

epitools pacakge automates all the calculations and tests! Especially for risk ratios and odds ratios.

# Install and load epitools
#install.packages("epitools")
library(epitools)
data
##              
##               improved not-improved
##   not-treated       26           29
##   treated           35           15
# Calculate risk ratio with confidence interval
riskratio.wald(data)
## $data
##              
##               improved not-improved Total
##   not-treated       26           29    55
##   treated           35           15    50
##   Total             61           44   105
## 
## $measure
##              risk ratio with 95% C.I.
##                estimate     lower    upper
##   not-treated 1.0000000        NA       NA
##   treated     0.5689655 0.3479293 0.930424
## 
## $p.value
##              two-sided
##               midp.exact fisher.exact chi.square
##   not-treated         NA           NA         NA
##   treated     0.02013718   0.02888541 0.01840777
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
# Calculate odds ratio with confidence interval
oddsratio.wald(data)
## $data
##              
##               improved not-improved Total
##   not-treated       26           29    55
##   treated           35           15    50
##   Total             61           44   105
## 
## $measure
##              odds ratio with 95% C.I.
##                estimate     lower     upper
##   not-treated 1.0000000        NA        NA
##   treated     0.3842365 0.1719968 0.8583743
## 
## $p.value
##              two-sided
##               midp.exact fisher.exact chi.square
##   not-treated         NA           NA         NA
##   treated     0.02013718   0.02888541 0.01840777
## 
## $correction
## [1] FALSE
## 
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"

(Extra)

Degree of freedom

Simulation of repeated sampling & mean values

With smaller sampling number, the variance gets smaller

set.seed(1)
replicate(5000, sample(dataset_sbp$SBP, 150)) %>%
        data.frame() %>% 
        colMeans() %>%
        hist(100,
             main = "Distribution of 5000 sample mean p-hat of hypertation from 1M subjects.\nEach sample N = 150") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(dataset_sbp$SBP),
                                  sd = sd(dataset_sbp$SBP)))

## NULL

Distribution of 1 M

set.seed(1)
ggplot(data = dataset_sbp) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of 1M Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 757 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Distribution of 50

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(50)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 50 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

### Distribution of 30

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(30)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 30 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Distribution of 20

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(20)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 20 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Distribution of 10

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(10)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 10 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Distribution of 5

set.seed(1)

ggplot(data = dataset_sbp %>% sample_n(5)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 5 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Distribution of 2

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(2)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       fill = "white",
                       col = "black") +
        ggtitle("Histogram of \nSBP of random 2 Koreans") +
        ylab("Count") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 20, base_family = "serif") +
        xlab("Normalized SBP (mmHg)") +
        stat_function(fun = dnorm, args = list(mean = mean(dataset_sbp$SBP),
                                               sd = sd(dataset_sbp$SBP)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Comparing data

To compare two different sample groups, we can caculate the difference between samples and look at CIs of difference between them. The schematic diagram of how this process works is listed in below figures. Note: this is not a exact statistical test but drwan to help understanding the basic concept of it

Creating hypertension variable

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80,
                                   1,
                                   0)

Whole histogram

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.8) +
        ggtitle("Histogram of SBP") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_bar()`).

color coded histogram

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..,
                           fill=factor(hypertension)),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.4) +
        ggtitle("Histogram of SBP by two group") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Color coded histogram with normal distribution

set.seed(1)
ggplot(data = dataset_sbp %>% sample_n(1000)) +
        geom_histogram(aes(x = SBP,
                           y = ..density..,
                           fill=factor(hypertension)),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP by two group with normal curve") +
        ylab("Proportion") +
        xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                          args = list(mean = mean(subset(dataset_sbp, hypertension == 1)$SBP),
                                      sd = sd(subset(dataset_sbp, hypertension == 1)$SBP)),
                      col = "blue") +
                stat_function(fun = dnorm,
                          args = list(mean = mean(subset(dataset_sbp, hypertension == 0)$SBP),
                                      sd = sd(subset(dataset_sbp, hypertension == 0)$SBP)),
                          col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_bar()`).

Histogram of differences

set.seed(1)
high <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 1) %>% sample_n(100)
set.seed(1)
low <- dataset_sbp %>% sample_n(1000) %>% subset(., hypertension == 0) %>% sample_n(100)

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Distribution of differences

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(high$SBP-low$SBP),
                                      sd = sd(high$SBP-low$SBP))) 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot() +
        geom_histogram(aes(x = high$SBP-low$SBP,
                           y = ..density..),
                       #binwidth = 0.01,
                       #fill = "white",
                       #col = "black",
                       alpha = 0.2) +
        ggtitle("Histogram of SBP **difference** by two group with normal curve") +
        ylab("Proportion") +
        #xlim(c(75, 180)) +
        theme_classic(base_size = 15, base_family = "serif") +
        theme(plot.title = ggtext::element_markdown()) +
        guides(fill = guide_legend(title = "Hypertension")) +
        xlab("SBP (mmHg)") +
        stat_function(fun = dnorm,
                      args = list(mean = mean(high$SBP-low$SBP),
                                      sd = sd(high$SBP-low$SBP))) +
        geom_vline(aes(xintercept = 0), color = "red", linetype = "dashed", size = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

t.test(high$SBP, low$SBP, paired = F)
## 
##  Welch Two Sample t-test
## 
## data:  high$SBP and low$SBP
## t = 14.769, df = 195.57, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  18.67229 24.42771
## sample estimates:
## mean of x mean of y 
##    136.81    115.26

Bibliography

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.