Before begin..
Let’s load the SBP dataset.
dataset_sbp <- read.csv(file = "/Users/minsikkim/Dropbox (Personal)/Inha/5_Lectures/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")
head(dataset_sbp)
## SEX BTH_G SBP DBP FBS DIS BMI
## 1 1 1 116 78 94 4 16.6
## 2 1 1 100 60 79 4 22.3
## 3 1 1 100 60 87 4 21.9
## 4 1 1 111 70 72 4 20.2
## 5 1 1 120 80 98 4 20.0
## 6 1 1 115 79 95 4 23.1
Making a new variable hypertension
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 | dataset_sbp$DBP > 80,
T,
F)
norm functions
rnorm()
- random sampling
rnorm(5)
## [1] 0.96893326 0.79296110 0.49994485 -0.05475673 1.29812150
pnorm()
- probability of given z-score (z-score table
results)
pnorm(2)
## [1] 0.9772499
qnorm()
- gets z-score of given probability
qnorm(0.97724985)
## [1] 2
pnorm()
and qnorm()
are reverse!
pnorm(qnorm(0))
## [1] 0
dnorm()
- probability density function, the
height of normal curve.
dnorm(0)
## [1] 0.3989423
dnorm(1)
## [1] 0.2419707
dnorm(-1)
## [1] 0.2419707
Ploting of dnorm result
plot(dnorm(seq(-4, 4, by = 0.5))~seq(-4, 4, by = 0.5),
ylab = "dnorm() result",
xlab = "dnorm() input",)
Ploting of normal curve
#Create a sequence of 100 equally spaced numbers between -4 and 4
x <- seq(-4, 4, length=100)
#create a vector of values that shows the height of the probability distribution
#for each value in x
y <- dnorm(x)
#plot x and y as a scatterplot with connected lines (type = "l") and add
#an x-axis with custom labels
plot(x,y, type = "l", lwd = 2, axes = FALSE, xlab = "", ylab = "") +
axis(1, at = -3:3, labels = c("-3s", "-2s", "-1s", "mean", "1s", "2s", "3s")) +
abline(v = 2, col = "red") +
abline(v = -2, col = "red")
## numeric(0)
Statistical tests
t-test
getting confidence interval of some data
result <- t.test(dataset_sbp$SBP) # Extract the confidence interval
result$conf.int
## [1] 121.8432 121.9003
## attr(,"conf.level")
## [1] 0.95
comparing groups with t-test
result <- t.test(SBP ~ SEX, data = dataset_sbp) # Extract the confidence interval
result
##
## Welch Two Sample t-test
##
## data: SBP by SEX
## t = 170.86, df = 977496, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 1 and group 2 is not equal to 0
## 95 percent confidence interval:
## 4.860551 4.973355
## sample estimates:
## mean in group 1 mean in group 2
## 124.280 119.363
z.test()
one-sample z-test
install.packages("BSDA")
##
## The downloaded binary packages are in
## /var/folders/qj/y615pkf53ms_hcxg1f9zjkyc0000gp/T//RtmpZUdYnj/downloaded_packages
library(BSDA)
result <- z.test(dataset_sbp$hypertension,
sigma.x = mean(dataset_sbp$hypertension) * (1-mean(dataset_sbp$hypertension)))
result
##
## One-sample z-Test
##
## data: dataset_sbp$hypertension
## z = 1463.7, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## 0.3163788 0.3172272
## sample estimates:
## mean of x
## 0.316803
Hypertention table
table(dataset_sbp$SEX, dataset_sbp$hypertension)
##
## FALSE TRUE
## 1 322199 188028
## 2 360998 128775
two-sample z-test
#Maknig subsets of data
dataset_male <- dataset_sbp %>% subset(., .$SEX == 1)
dataset_female <- dataset_sbp %>% subset(., .$SEX == 2)
result <- z.test(x = dataset_male$hypertension,
y = dataset_female$hypertension,
sigma.x = mean(dataset_male$hypertension) * (1-mean(dataset_male$hypertension)),
sigma.y = mean(dataset_female$hypertension) * (1-mean(dataset_female$hypertension))
)
result
##
## Two-sample z-Test
##
## data: dataset_male$hypertension and dataset_female$hypertension
## z = 246.95, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 0.1047524 0.1064284
## sample estimates:
## mean of x mean of y
## 0.3685183 0.2629279
Chi-square test
Chi-square test for two groups
result <- chisq.test(x = dataset_sbp$hypertension,
y = dataset_sbp$SEX)
result
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: dataset_sbp$hypertension and dataset_sbp$SEX
## X-squared = 12872, df = 1, p-value < 2.2e-16
p-value is the same as z-score test
Chi-square test for multiple groups
table(dataset_sbp$SEX, dataset_sbp$DIS)
##
## 1 2 3 4
## 1 27979 79892 23900 378456
## 2 25419 82934 19214 362206
result <- chisq.test(x = dataset_sbp$DIS,
y = dataset_sbp$SEX)
result
##
## Pearson's Chi-squared test
##
## data: dataset_sbp$DIS and dataset_sbp$SEX
## X-squared = 627.3, df = 3, p-value < 2.2e-16
Exact test
#Author DataFlair
data_frame <- read.csv("https://goo.gl/j6lRXD")
#Reading CSV
data <- table(data_frame$treatment, data_frame$improvement)
fisher.test(data)
##
## Fisher's Exact Test for Count Data
##
## data: data
## p-value = 0.02889
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.1582959 0.9212947
## sample estimates:
## odds ratio
## 0.3878601
#Author DataFlair
fisher.test(table(dataset_sbp$SEX,
dataset_sbp$hypertension))
##
## Fisher's Exact Test for Count Data
##
## data: table(dataset_sbp$SEX, dataset_sbp$hypertension)
## p-value < 2.2e-16
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
## 0.6060831 0.6165366
## sample estimates:
## odds ratio
## 0.611277
As sample number is high, Fisher’s exact test results are having the same result as Chi-square / z-test.
Confidence interval calculations, statistical tests
epitools
pacakge automates all the calculations and
tests!
install.packages("epitools")
##
## The downloaded binary packages are in
## /var/folders/qj/y615pkf53ms_hcxg1f9zjkyc0000gp/T//RtmpZUdYnj/downloaded_packages
library(epitools)
#Confidence interval of RR
riskratio.wald(data)
## $data
##
## improved not-improved Total
## not-treated 26 29 55
## treated 35 15 50
## Total 61 44 105
##
## $measure
## risk ratio with 95% C.I.
## estimate lower upper
## not-treated 1.0000000 NA NA
## treated 0.5689655 0.3479293 0.930424
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## not-treated NA NA NA
## treated 0.02013718 0.02888541 0.01840777
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
#Confidence interval of RR
oddsratio.wald(data)
## $data
##
## improved not-improved Total
## not-treated 26 29 55
## treated 35 15 50
## Total 61 44 105
##
## $measure
## odds ratio with 95% C.I.
## estimate lower upper
## not-treated 1.0000000 NA NA
## treated 0.3842365 0.1719968 0.8583743
##
## $p.value
## two-sided
## midp.exact fisher.exact chi.square
## not-treated NA NA NA
## treated 0.02013718 0.02888541 0.01840777
##
## $correction
## [1] FALSE
##
## attr(,"method")
## [1] "Unconditional MLE & normal approximation (Wald) CI"
Bibliography
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN (2023). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.7, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.