Before begin..
Let’s load the SBP dataset.
dataset_sbp <- read.csv(file = "Inha/5_Lectures/2024/Advanced biostatistics/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")
head(dataset_sbp)
## SEX BTH_G SBP DBP FBS DIS BMI
## 1 1 1 116 78 94 4 16.6
## 2 1 1 100 60 79 4 22.3
## 3 1 1 100 60 87 4 21.9
## 4 1 1 111 70 72 4 20.2
## 5 1 1 120 80 98 4 20.0
## 6 1 1 115 79 95 4 23.1
Making a new variable hypertension
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
dataset_sbp$DBP > 80,
T,
F)
DBP ~ SBP
With a scatter plot, we can see there is a correlation between SBP and DBP.
set.seed(1)
dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))
ggplot(data = dataset_sbp_small, aes(x = SBP, y = DBP)) +
geom_point() +
theme_classic( base_family = "serif", base_size = 20) +
xlab("SBP (mmHg)") +
ylab("DBP (mmHg)")
Data with trendline
With lm()
function, we can calculate the linear model of
DBP~SBP with the least squares algorithm.
#A function that returns equation
lm_eqn <- function(m){
eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2,
list(a = format(unname(coef(m)[1]), digits = 2),
b = format(unname(coef(m)[2]), digits = 2),
r2 = format(summary(m)$r.squared, digits = 3)))
as.character(as.expression(eq));
}
set.seed(1)
dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))
ggplot(data = dataset_sbp_small, aes(x = SBP, y = DBP)) +
geom_point() +
theme_classic(base_family = "serif", base_size = 20) +
xlab("SBP (mmHg)") +
ylab("DBP (mmHg)") +
geom_smooth(method = "lm") +
geom_label(x = 130, y = 115,
label = lm_eqn(lm(data = dataset_sbp_small, DBP ~ SBP)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
#rnorm(100, mean = 100, sd = 1)
Calculation of residual
The result of Model's predicted value - sample
is called
residual. It can be shown as below.
lm_result <-
lm(data = dataset_sbp_small,DBP ~ SBP) #%>% summary
d <- dataset_sbp_small
#Saving predicted values
d$predicted <- lm_result$fitted.values
d$residuals <- residuals(lm_result)
ggplot(d, aes(x = SBP, y = DBP)) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) + # alpha to fade lines
geom_point() +
xlab("SBP (mmHg)") +
ylab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
geom_label(x = 130, y = 115,
label = lm_eqn(lm(data = dataset_sbp_small,DBP ~ SBP)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
# Intercept, slope, and their units
Remember that the size of intercept and slope is DEPENDENT on the units of input or outcome.
dataset_sbp_small$SBP_pa <- 133.322 * dataset_sbp_small$SBP
dataset_sbp_small$DBP_pa <- 133.322 * dataset_sbp_small$DBP
dataset_sbp_small$SBP_psi <- 0.0193368 * dataset_sbp_small$SBP
dataset_sbp_small$DBP_psi <- 0.0193368 * dataset_sbp_small$DBP
If we chnage the units of SBP or DBP,
DBP (pascal) ~ SBP (pascal)
lm(data = dataset_sbp_small, DBP_pa ~ SBP_pa) %>% summary
##
## Call:
## lm(formula = DBP_pa ~ SBP_pa, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4102.1 -611.7 16.4 623.6 3085.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.657e+03 2.277e+02 7.275 6.99e-13 ***
## SBP_pa 5.223e-01 1.398e-02 37.372 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 865 on 998 degrees of freedom
## Multiple R-squared: 0.5832, Adjusted R-squared: 0.5828
## F-statistic: 1397 on 1 and 998 DF, p-value: < 2.2e-16
DBP (mmHg) ~ SBP (pascal)
lm(data = dataset_sbp_small, DBP ~ SBP_pa) %>% summary
##
## Call:
## lm(formula = DBP ~ SBP_pa, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30.7682 -4.5884 0.1233 4.6775 23.1427
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.243e+01 1.708e+00 7.275 6.99e-13 ***
## SBP_pa 3.917e-03 1.048e-04 37.372 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.488 on 998 degrees of freedom
## Multiple R-squared: 0.5832, Adjusted R-squared: 0.5828
## F-statistic: 1397 on 1 and 998 DF, p-value: < 2.2e-16
DBP (psi) ~ SBP (psi)
lm(data = dataset_sbp_small, DBP_psi ~ SBP_psi) %>% summary
##
## Call:
## lm(formula = DBP_psi ~ SBP_psi, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.59496 -0.08873 0.00238 0.09045 0.44751
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.24026 0.03302 7.275 6.99e-13 ***
## SBP_psi 0.52229 0.01398 37.372 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1255 on 998 degrees of freedom
## Multiple R-squared: 0.5832, Adjusted R-squared: 0.5828
## F-statistic: 1397 on 1 and 998 DF, p-value: < 2.2e-16
Distance from the mean value can be drawn as below.
This is the residual from mean of all value - which is X-bar - Xi. Using them we can calcualte SD of mean!
#Saving predicted values
d$mean <- mean(dataset_sbp_small$DBP)
d$predicted <- lm_result$fitted.values
d$residuals <- residuals(lm_result)
ggplot(d, aes(x = SBP, y = DBP)) +
#geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "red") + # alpha to fade lines
geom_point() +
#xlab("SBP (mmHg)") +
ylab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "blue") +
geom_label(x = 100, y = 95,
label = "Mean DBP",
#parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
# Distance from a regression line can be drawn as below
With the same approach as above, we can calculate SD of a linear regression!
lm_result <- dataset_sbp_small %>%
lm(data = .,DBP ~ SBP) #%>% summary
ggplot(d, aes(x = SBP, y = DBP)) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "red") + # alpha to fade
geom_point() +
xlab("SBP (mmHg)") +
ylab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
geom_label(x = 100, y = 90,
label = "Linear model",
#parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue")
Two draw both,
ggplot(d, aes(x = SBP, y = DBP)) +
geom_smooth(method = "lm", se = FALSE, color = "blue") + # Plot regression slope
geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "blue") + # alpha to fade
#geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") + # alpha to fade
geom_point() +
xlab("SBP (mmHg)") +
ylab("DBP (mmHg)") +
theme_classic( base_family = "serif", base_size = 20) +
geom_label(x = 100, y = 90,
label = "Linear model",
#parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "blue") +
geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
geom_label(x = 170, y = 70,
label = "Mean DBP",
#parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "purple")
Example 2 - BMI vs. SBP
R-squared is the measuremnt how the regression line fits well with the data.
With lower signifiacnt associations, R-squared value will be smaller.
lm(data = dataset_sbp_small, BMI ~ SBP) %>% summary
##
## Call:
## lm(formula = BMI ~ SBP, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.4686 -2.3082 -0.2373 1.9334 14.8563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.834783 0.850774 18.612 <2e-16 ***
## SBP 0.065282 0.006962 9.377 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.232 on 998 degrees of freedom
## Multiple R-squared: 0.08097, Adjusted R-squared: 0.08005
## F-statistic: 87.93 on 1 and 998 DF, p-value: < 2.2e-16
BMI_model <- lm(data = dataset_sbp_small, BMI ~ SBP)
d$predicted <- BMI_model$fitted.values
ggplot(d, aes(x = SBP, y = BMI)) +
geom_smooth(method = "lm", se = FALSE, color = "red") + # Plot regression slope
geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "red") + # alpha to fade
#geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") + # alpha to fade
geom_point() +
xlab("SBP (mmHg)") +
ylab("BMI (kg/m2)") +
theme_classic( base_family = "serif", base_size = 20) +
geom_label(x = 170, y = 36,
label = "Linear model",
#parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "red") +
#geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
geom_label(x = 155, y = 16,
label = lm_eqn(lm(data = dataset_sbp_small,BMI ~ SBP)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "red", fill = alpha(c("white"),1))
#ylim(c(11, 30))
Example 3 - Binary variables
Linear model, confidence intervals of each term, p-values, and R-squared value can be calculated for binary input as well.
SBP vs Gender
dataset_sbp_small$Female <- ifelse(dataset_sbp_small$SEX == 1, 0, 1)
lm(data = dataset_sbp_small, DBP ~ Female) %>% summary
##
## Call:
## lm(formula = DBP ~ Female, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.538 -7.538 0.083 6.083 46.083
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.5377 0.4348 178.338 < 2e-16 ***
## Female -3.6205 0.6256 -5.787 9.57e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.886 on 998 degrees of freedom
## Multiple R-squared: 0.03247, Adjusted R-squared: 0.0315
## F-statistic: 33.49 on 1 and 998 DF, p-value: 9.57e-09
gender_model <- lm(data = dataset_sbp_small, DBP ~ Female)
dataset_sbp_small$predicted <- gender_model$fitted.values
ggplot(dataset_sbp_small, aes(x = Female, y = DBP)) +
geom_smooth(method = "lm", se = FALSE, color = "red") + # Plot regression slope
geom_segment(aes(xend = Female, yend = predicted), alpha = 1, linetype = "dashed", color = "red") + # alpha to fade
#geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") + # alpha to fade
geom_point() +
#xlab("") +
scale_x_discrete(name ="Gender (Male = 0, Female = 1)",
limits=c(0,1)) +
#scale_x_discrete(breaks = c(0,1))+
ylab("BMI (kg/m2)") +
theme_classic( base_family = "serif", base_size = 20) +
geom_label(x = 0.5, y = 36,
label = "Linear model",
#parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "red") +
#geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
geom_label(x = 0.5, y = 120,
label = lm_eqn(lm(data = dataset_sbp_small,DBP ~ Female)),
parse = TRUE,
family='serif',
size = 8, label.size = 0, color = "red", fill = alpha(c("white"),1))
#ylim(c(11, 30))
Pearson’s correlation
Pearson’s correlation can be calculated using
stats::cor()
function.
#THis is Pearson's correlation coefficient
cor(dataset_sbp_small$DBP, dataset_sbp_small$SBP)
## [1] 0.7637024
lm(data = dataset_sbp_small, SBP ~ SEX) %>% confint
## 2.5 % 97.5 %
## (Intercept) 126.058909 131.684338
## SEX -6.889471 -3.294782
Where, Pearson’s correlation is just the square root of R-squared!
#It's sqrt() of R-squared.
sqrt(lm(data = dataset_sbp_small, DBP ~ SBP) %>% summary %>% .$r.squared)
## [1] 0.7637024
stats::cor()
function will calculated multiple
correlations at the same time!
cor(dataset_sbp)
## SEX BTH_G SBP DBP FBS
## SEX 1.00000000 0.08522637 -0.1687964 -0.1904205 -0.1011504
## BTH_G 0.08522637 1.00000000 0.2759806 0.1344836 0.2139513
## SBP -0.16879639 0.27598057 1.0000000 0.7430064 0.1865008
## DBP -0.19042046 0.13448362 0.7430064 1.0000000 0.1387174
## FBS -0.10115044 0.21395130 0.1865008 0.1387174 1.0000000
## DIS -0.00479659 -0.47804876 -0.3107574 -0.1935247 -0.3159353
## BMI -0.17083273 0.08808903 0.3043832 0.2754917 0.1736879
## hypertension -0.11345822 0.18508300 0.6975140 0.6676741 0.1307362
## DIS BMI hypertension
## SEX -0.00479659 -0.17083273 -0.1134582
## BTH_G -0.47804876 0.08808903 0.1850830
## SBP -0.31075745 0.30438319 0.6975140
## DBP -0.19352467 0.27549170 0.6676741
## FBS -0.31593533 0.17368791 0.1307362
## DIS 1.00000000 -0.20010681 -0.2226826
## BMI -0.20010681 1.00000000 0.2225391
## hypertension -0.22268256 0.22253908 1.0000000
Other types of correlation - Spearman’s correlation (rank-based)
cor(dataset_sbp,method = "spearman")
## SEX BTH_G SBP DBP FBS
## SEX 1.000000000 0.08885402 -0.1789945 -0.1922290 -0.1282142
## BTH_G 0.088854025 1.00000000 0.2724776 0.1401683 0.2719809
## SBP -0.178994460 0.27247764 1.0000000 0.7287952 0.2348772
## DBP -0.192228956 0.14016825 0.7287952 1.0000000 0.1818949
## FBS -0.128214249 0.27198093 0.2348772 0.1818949 1.0000000
## DIS -0.003510257 -0.48851313 -0.3018710 -0.1906525 -0.3085306
## BMI -0.191643751 0.11242824 0.3159298 0.2762916 0.2292492
## hypertension -0.113458215 0.18404539 0.7143712 0.6864544 0.1657422
## DIS BMI hypertension
## SEX -0.003510257 -0.1916438 -0.1134582
## BTH_G -0.488513132 0.1124282 0.1840454
## SBP -0.301870974 0.3159298 0.7143712
## DBP -0.190652466 0.2762916 0.6864544
## FBS -0.308530592 0.2292492 0.1657422
## DIS 1.000000000 -0.2063562 -0.2228300
## BMI -0.206356163 1.0000000 0.2229989
## hypertension -0.222829982 0.2229989 1.0000000
Bibliography
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.