Analysis of variance
Before begin..
Let’s load the SBP dataset.
dataset_sbp <- read.csv(file = "Inha/5_Lectures/1_BTE3207_Biostats/2025F/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")
reactable::reactable(cbind(order = 1:1000, dataset_sbp[1:1000,]),
filterable = T, sortable = T)
Making a new variable hypertension
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
dataset_sbp$DBP > 80,
T,
F)
dataset_sbp$history_of_hypertension <- ifelse(dataset_sbp$DIS == 1 |
dataset_sbp$DIS == 2,
T,
F)
dataset_sbp$DIS <- factor(dataset_sbp$DIS,
levels = c(1,2,3,4),
labels = c("Hypertension and diabetes",
"Hypertension",
"Diabetes",
"No history"))
set.seed(1)
dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))
DIS groups
ggplot(dataset_sbp, aes(y = SBP, x = DIS)) +
geom_boxplot() +
xlab("History of hypertension or diabetes") +
ylab("SBP (mmHg)") +
theme_classic(base_size = 12)
T-test?
DIS group 1 vs 4
t.test(dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP,
dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP,
var.equal = F)
##
## Welch Two Sample t-test
##
## data: dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP
## t = 169.59, df = 59802, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 11.18067 11.44213
## sample estimates:
## mean of x mean of y
## 130.5640 119.2526
DIS group 2 vs 4
t.test(dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP,
dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP,
var.equal = F)
##
## Welch Two Sample t-test
##
## data: dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP
## t = 282.46, df = 225561, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 11.22032 11.37713
## sample estimates:
## mean of x mean of y
## 130.5513 119.2526
DIS group 3 vs 4
t.test(dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP,
dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP,
var.equal = F)
##
## Welch Two Sample t-test
##
## data: dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP
## t = 60.351, df = 48165, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 3.937403 4.201738
## sample estimates:
## mean of x mean of y
## 123.3221 119.2526
DIS group 1 vs 2
t.test(dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP,
dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP,
var.equal = F)
##
## Welch Two Sample t-test
##
## data: dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP
## t = 0.16998, df = 90286, p-value = 0.865
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1334474 0.1587919
## sample estimates:
## mean of x mean of y
## 130.5640 130.5513
….
Do you think it is reasonable?
Linear model?
lm(data = dataset_sbp, SBP ~ DIS) %>%
anova
## Analysis of Variance Table
##
## Response: SBP
## Df Sum Sq Mean Sq F value Pr(>F)
## DIS 3e+00 21472596 7157532 37558 < 2.2e-16 ***
## Residuals 1e+06 190570477 191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(data = dataset_sbp, SBP ~ DIS) %>% summary
## Df Sum Sq Mean Sq F value Pr(>F)
## DIS 3e+00 21472596 7157532 37558 <2e-16 ***
## Residuals 1e+06 190570477 191
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
It shows how different whether the group difference estimates with beta values (as well as CIs and p-values).
But can we assess wheter the DIS is a significant factor
affecting SBP?
Here comes the analysis of variance.
Analysis of variance
Let’s say we are comparing multipel groups
Making a new dataset
midterm <- data.frame(midterm = c(82, 83, 97, 83, 78, 68, 38, 59, 55),
major = rep(c("BE", "MBE", "CHEM"), times=1, each=3))
mean_sim <- 10
std_sim <- 5
lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)
ggplot(data = data.frame(u = c(lcb, ucb)),
mapping = aes(x = u)) +
stat_function(mapping = aes(colour = "Group 1"),
fun = dnorm,
args = list(mean = 20,
sd = std_sim)) +
stat_function(mapping = aes(colour = "Group 2"),
fun = dnorm,
args = list(mean = 40,
sd = (std_sim))) +
stat_function(mapping = aes(colour = "Group 3"),
fun = dnorm,
args = list(mean = 15,
sd = (std_sim))) +
scale_colour_manual(values = c("red", "blue", "green")) +
labs(x = "Values",
y = "Densities",
col = "Groups") +
theme_classic(base_size = 12)
Variance of group means
mean_sim <- 10
std_sim <- 5
lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)
ggplot(data = data.frame(u = c(lcb, ucb)),
mapping = aes(x = u)) +
stat_function(mapping = aes(colour = "Group 1"),
fun = dnorm,
args = list(mean = 20,
sd = std_sim)) +
stat_function(mapping = aes(colour = "Group 2"),
fun = dnorm,
args = list(mean = 40,
sd = (std_sim))) +
stat_function(mapping = aes(colour = "Group 3"),
fun = dnorm,
args = list(mean = 15,
sd = (std_sim))) +
geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
scale_colour_manual(values = c("grey", "grey", "grey")) +
geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
labs(x = "Values",
y = "Densities",
col = "Groups") +
theme_classic(base_size = 12)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Uncertainty - variance within groups
with smaller variance within group
mean_sim <- 10
std_sim <- 5
lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)
ggplot(data = data.frame(u = c(lcb, ucb)),
mapping = aes(x = u)) +
stat_function(mapping = aes(colour = "Group 1"),
fun = dnorm,
args = list(mean = 20,
sd = 0.4*std_sim)) +
stat_function(mapping = aes(colour = "Group 2"),
fun = dnorm,
args = list(mean = 40,
sd = 0.4*(std_sim))) +
stat_function(mapping = aes(colour = "Group 3"),
fun = dnorm,
args = list(mean = 15,
sd = 0.4*(std_sim))) +
geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
scale_colour_manual(values = c("grey", "grey", "grey")) +
geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
labs(x = "Values",
y = "Densities",
col = "Groups") +
theme_classic(base_size = 12)
Extremely smalle variance
mean_sim <- 10
std_sim <- 5
lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)
ggplot(data = data.frame(u = c(lcb, ucb)),
mapping = aes(x = u)) +
stat_function(mapping = aes(colour = "Group 1"),
fun = dnorm,
args = list(mean = 20,
sd = 0.01*std_sim)) +
stat_function(mapping = aes(colour = "Group 2"),
fun = dnorm,
args = list(mean = 40,
sd = 0.01*(std_sim))) +
stat_function(mapping = aes(colour = "Group 3"),
fun = dnorm,
args = list(mean = 15,
sd = 0.01*(std_sim))) +
geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
scale_colour_manual(values = c("grey", "grey", "grey")) +
geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
labs(x = "Values",
y = "Densities",
col = "Groups") +
theme_classic(base_size = 12)
With extremly high sd
mean_sim <- 10
std_sim <- 5
lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)
ggplot(data = data.frame(u = c(lcb, ucb)),
mapping = aes(x = u)) +
stat_function(mapping = aes(colour = "Group 1"),
fun = dnorm,
args = list(mean = 20,
sd = 10*std_sim)) +
stat_function(mapping = aes(colour = "Group 2"),
fun = dnorm,
args = list(mean = 40,
sd = 10*(std_sim))) +
stat_function(mapping = aes(colour = "Group 3"),
fun = dnorm,
args = list(mean = 15,
sd = 10*(std_sim))) +
geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
scale_colour_manual(values = c("grey", "grey", "grey")) +
geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
ylim(c(0,0.01)) +
labs(x = "Values",
y = "Densities",
col = "Groups") +
theme_classic(base_size = 12)
SBP dataset
In histogram, the data can be shown as
ggplot(data = dataset_sbp) +
stat_function(mapping = aes(colour = "History of hypertension and diabetes"),
fun = dnorm,
args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP %>% mean,
sd = dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP %>% sd)) +
stat_function(mapping = aes(colour = "History of hypertension"),
fun = dnorm,
args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP %>% mean,
sd = dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP %>% sd)) +
stat_function(mapping = aes(colour = "History of diabetes"),
fun = dnorm,
args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP %>% mean,
sd = dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP %>% sd)) +
stat_function(mapping = aes(colour = "No history"),
fun = dnorm,
args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP %>% mean,
sd = dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP %>% sd)) +
scale_color_brewer(type = "qual", palette = 2) +
xlim(c(100,150)) +
labs(x = "SBP (mmHg)",
y = "Densities",
col = "Groups") +
theme_classic(base_size = 12)
## Warning in stat_function(mapping = aes(colour = "History of hypertension and diabetes"), : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in stat_function(mapping = aes(colour = "History of hypertension"), : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in stat_function(mapping = aes(colour = "History of diabetes"), : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## Warning in stat_function(mapping = aes(colour = "No history"), fun = dnorm, : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
ANOVA in R
aov() function can calculate the result of analysis of
variance
aov(midterm ~ major, data = midterm) %>%
summary
## Df Sum Sq Mean Sq F value Pr(>F)
## major 2 2124 1062.1 12.59 0.00712 **
## Residuals 6 506 84.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
aov(SBP ~ DIS, data = dataset_sbp_small) %>%
summary
## Df Sum Sq Mean Sq F value Pr(>F)
## DIS 3 29932 9977 53.54 <2e-16 ***
## Residuals 996 185607 186
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Else, we can implement anova into effect modifications
aov(SBP ~ SEX + BTH_G * BMI + FBS + DIS * DBP, data = dataset_sbp_small) %>%
summary
## Df Sum Sq Mean Sq F value Pr(>F)
## SEX 1 6475 6475 82.630 < 2e-16 ***
## BTH_G 1 23618 23618 301.394 < 2e-16 ***
## BMI 1 10826 10826 138.150 < 2e-16 ***
## FBS 1 2905 2905 37.073 1.63e-09 ***
## DIS 3 8318 2773 35.383 < 2e-16 ***
## DBP 1 85682 85682 1093.431 < 2e-16 ***
## BTH_G:BMI 1 0 0 0.000 0.985
## DIS:DBP 3 374 125 1.593 0.190
## Residuals 987 77342 78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lm(SBP ~ SEX + BTH_G * BMI + FBS + DIS * DBP, data = dataset_sbp_small) %>%
summary
##
## Call:
## lm(formula = SBP ~ SEX + BTH_G * BMI + FBS + DIS * DBP, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.193 -5.601 -0.520 5.510 38.301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.345205 11.941720 4.132 3.90e-05 ***
## SEX -0.976183 0.589594 -1.656 0.0981 .
## BTH_G 0.256344 0.292661 0.876 0.3813
## BMI 0.245572 0.187394 1.310 0.1903
## FBS 0.019089 0.012977 1.471 0.1416
## DISHypertension -4.553308 12.265896 -0.371 0.7106
## DISDiabetes -1.391342 17.500850 -0.080 0.9366
## DISNo history -17.968618 11.307352 -1.589 0.1124
## DBP 0.859173 0.134571 6.385 2.64e-10 ***
## BTH_G:BMI 0.001867 0.012488 0.149 0.8812
## DISHypertension:DBP 0.054381 0.150584 0.361 0.7181
## DISDiabetes:DBP -0.033645 0.224297 -0.150 0.8808
## DISNo history:DBP 0.178389 0.138652 1.287 0.1985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.852 on 987 degrees of freedom
## Multiple R-squared: 0.6412, Adjusted R-squared: 0.6368
## F-statistic: 147 on 12 and 987 DF, p-value: < 2.2e-16
aov(SBP ~ BTH_G * SEX * DIS * DBP, data = dataset_sbp_small) %>%
summary
## Df Sum Sq Mean Sq F value Pr(>F)
## BTH_G 1 22804 22804 293.300 < 2e-16 ***
## SEX 1 7289 7289 93.744 < 2e-16 ***
## DIS 3 11307 3769 48.475 < 2e-16 ***
## DBP 1 95484 95484 1228.103 < 2e-16 ***
## BTH_G:SEX 1 854 854 10.990 0.00095 ***
## BTH_G:DIS 3 700 233 3.001 0.02973 *
## SEX:DIS 3 453 151 1.942 0.12116
## BTH_G:DBP 1 157 157 2.024 0.15514
## SEX:DBP 1 29 29 0.376 0.53987
## DIS:DBP 3 137 46 0.588 0.62266
## BTH_G:SEX:DIS 3 136 45 0.584 0.62536
## BTH_G:SEX:DBP 1 0 0 0.000 0.98243
## BTH_G:DIS:DBP 3 343 114 1.469 0.22155
## SEX:DIS:DBP 3 364 121 1.561 0.19722
## BTH_G:SEX:DIS:DBP 3 220 73 0.945 0.41823
## Residuals 968 75261 78
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova() function accepts linear model outputs
lm(SBP ~ BTH_G * SEX * DIS * DBP, data = dataset_sbp_small) %>%
summary
##
## Call:
## lm(formula = SBP ~ BTH_G * SEX * DIS * DBP, data = dataset_sbp_small)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.013 -5.341 -0.617 5.572 43.134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -201.34069 205.73745 -0.979 0.3280
## BTH_G 14.12147 9.50847 1.485 0.1378
## SEX 90.26545 144.25017 0.626 0.5316
## DISHypertension 288.35627 216.54324 1.332 0.1833
## DISDiabetes 178.64106 332.71491 0.537 0.5914
## DISNo history 229.88453 206.55369 1.113 0.2660
## DBP 4.02042 2.45536 1.637 0.1019
## BTH_G:SEX -5.37119 6.18981 -0.868 0.3857
## BTH_G:DISHypertension -16.87641 10.12132 -1.667 0.0958 .
## BTH_G:DISDiabetes -13.99269 16.88817 -0.829 0.4076
## BTH_G:DISNo history -12.67335 9.61234 -1.318 0.1877
## SEX:DISHypertension -133.96369 152.76266 -0.877 0.3807
## SEX:DISDiabetes -58.21450 205.47704 -0.283 0.7770
## SEX:DISNo history -83.80291 144.69323 -0.579 0.5626
## BTH_G:DBP -0.16922 0.11451 -1.478 0.1398
## SEX:DBP -1.14763 1.74774 -0.657 0.5116
## DISHypertension:DBP -3.59142 2.58840 -1.388 0.1656
## DISDiabetes:DBP -1.93448 4.10562 -0.471 0.6376
## DISNo history:DBP -2.76757 2.46747 -1.122 0.2623
## BTH_G:SEX:DISHypertension 8.28844 6.64491 1.247 0.2126
## BTH_G:SEX:DISDiabetes 6.19111 9.77716 0.633 0.5267
## BTH_G:SEX:DISNo history 4.53109 6.24898 0.725 0.4686
## BTH_G:SEX:DBP 0.06734 0.07496 0.898 0.3692
## BTH_G:DISHypertension:DBP 0.20757 0.12216 1.699 0.0896 .
## BTH_G:DISDiabetes:DBP 0.15526 0.21062 0.737 0.4612
## BTH_G:DISNo history:DBP 0.15002 0.11603 1.293 0.1963
## SEX:DISHypertension:DBP 1.66182 1.85315 0.897 0.3701
## SEX:DISDiabetes:DBP 0.60939 2.55070 0.239 0.8112
## SEX:DISNo history:DBP 0.99992 1.75449 0.570 0.5689
## BTH_G:SEX:DISHypertension:DBP -0.10115 0.08066 -1.254 0.2102
## BTH_G:SEX:DISDiabetes:DBP -0.06943 0.12222 -0.568 0.5701
## BTH_G:SEX:DISNo history:DBP -0.05381 0.07584 -0.709 0.4782
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.818 on 968 degrees of freedom
## Multiple R-squared: 0.6508, Adjusted R-squared: 0.6396
## F-statistic: 58.2 on 31 and 968 DF, p-value: < 2.2e-16
PBD and RSM (statistical optimization)
This lecture will practice running
- Plackett-Burman Design (PBD)
and
- Response surface method (RSM)
Before begin..
Let’s install packages.
# install.packages("daewr")
library(daewr)
PBDes function creates the experimental design of the
PBD. randomize = T will generate a matrix with mixed orders.
daewr::PBDes(nfactors = 11, nruns = 12)
## A B C D E F G H J K L
## 1 1 1 -1 1 1 1 -1 -1 -1 1 -1
## 2 -1 1 1 -1 1 1 1 -1 -1 -1 1
## 3 1 -1 1 1 -1 1 1 1 -1 -1 -1
## 4 -1 1 -1 1 1 -1 1 1 1 -1 -1
## 5 -1 -1 1 -1 1 1 -1 1 1 1 -1
## 6 -1 -1 -1 1 -1 1 1 -1 1 1 1
## 7 1 -1 -1 -1 1 -1 1 1 -1 1 1
## 8 1 1 -1 -1 -1 1 -1 1 1 -1 1
## 9 1 1 1 -1 -1 -1 1 -1 1 1 -1
## 10 -1 1 1 1 -1 -1 -1 1 -1 1 1
## 11 1 -1 1 1 1 -1 -1 -1 1 -1 1
## 12 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
# Error examples
# daewr::PBDes(nfactors = 12, nruns = 12)
daewr::PBDes(nfactors = 10, nruns = 12)
## A B C D E F G H J K
## 1 1 1 -1 1 1 1 -1 -1 -1 1
## 2 -1 1 1 -1 1 1 1 -1 -1 -1
## 3 1 -1 1 1 -1 1 1 1 -1 -1
## 4 -1 1 -1 1 1 -1 1 1 1 -1
## 5 -1 -1 1 -1 1 1 -1 1 1 1
## 6 -1 -1 -1 1 -1 1 1 -1 1 1
## 7 1 -1 -1 -1 1 -1 1 1 -1 1
## 8 1 1 -1 -1 -1 1 -1 1 1 -1
## 9 1 1 1 -1 -1 -1 1 -1 1 1
## 10 -1 1 1 1 -1 -1 -1 1 -1 1
## 11 1 -1 1 1 1 -1 -1 -1 1 -1
## 12 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
#daewr::PBDes(nfactors = 15, nruns = 24)
After setting your higher level and
lower level based on your intuition, you can run a
experiment with 12 experimental groups, and get the result out of the
data.
analyiss of PBD outcome
You can run a miultiple linear regression that is containing all the data!
download.file("https://raw.githubusercontent.com/minsiksudo/BTE3207_Advanced_Biostatistics/refs/heads/main/dataset/PBD_example_data.csv", "PBD_example_data.csv")
pbd_data <- read.csv("PBD_example_data.csv")
lm(Biomass ~ Glucose + YE + KH2PO4 + K2HPO4 + MgSO4 + CaCl2 + FeSO4 + Fructose + NH4Cl + TM + Vitamin,
#<<<<<<< HEAD
data = pbd_data) %>% summary
##
## Call:
## lm(formula = Biomass ~ Glucose + YE + KH2PO4 + K2HPO4 + MgSO4 +
## CaCl2 + FeSO4 + Fructose + NH4Cl + TM + Vitamin, data = pbd_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04 -0.02 0.00 0.02 0.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.437500 0.007022 347.134 < 2e-16 ***
## Glucose -0.012500 0.007022 -1.780 0.100360
## YE 0.845833 0.007022 120.458 < 2e-16 ***
## KH2PO4 0.037500 0.007022 5.341 0.000176 ***
## K2HPO4 0.572500 0.007022 81.532 < 2e-16 ***
## MgSO4 0.032500 0.007022 4.628 0.000582 ***
## CaCl2 -0.054167 0.007022 -7.714 5.45e-06 ***
## FeSO4 0.055833 0.007022 7.951 4.00e-06 ***
## Fructose 0.080833 0.007022 11.512 7.67e-08 ***
## NH4Cl -0.017500 0.007022 -2.492 0.028315 *
## TM 0.015833 0.007022 2.255 0.043611 *
## Vitamin -0.019167 0.007022 -2.730 0.018280 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0344 on 12 degrees of freedom
## Multiple R-squared: 0.9994, Adjusted R-squared: 0.9989
## F-statistic: 1953 on 11 and 12 DF, p-value: < 2.2e-16
This is your Plackett-Burman result.
RSM
# install.packages("rsm")
library(rsm)
using ccd function, you can generate a experimental
design that can be used for RSM experiment and analysis
#2 factor experiment
ccd (2)
## run.order std.order x1.as.is x2.as.is Block
## 1 1 1 -1.000000 -1.000000 1
## 2 2 6 0.000000 0.000000 1
## 3 3 2 1.000000 -1.000000 1
## 4 4 4 1.000000 1.000000 1
## 5 5 8 0.000000 0.000000 1
## 6 6 3 -1.000000 1.000000 1
## 7 7 5 0.000000 0.000000 1
## 8 8 7 0.000000 0.000000 1
## 9 1 7 0.000000 0.000000 2
## 10 2 3 0.000000 -1.414214 2
## 11 3 5 0.000000 0.000000 2
## 12 4 6 0.000000 0.000000 2
## 13 5 4 0.000000 1.414214 2
## 14 6 1 -1.414214 0.000000 2
## 15 7 2 1.414214 0.000000 2
## 16 8 8 0.000000 0.000000 2
##
## Data are stored in coded form using these coding formulas ...
## x1 ~ x1.as.is
## x2 ~ x2.as.is
#3 factor experiment
ccd (3)
## run.order std.order x1.as.is x2.as.is x3.as.is Block
## 1 1 1 -1.000000 -1.000000 -1.000000 1
## 2 2 8 1.000000 1.000000 1.000000 1
## 3 3 12 0.000000 0.000000 0.000000 1
## 4 4 10 0.000000 0.000000 0.000000 1
## 5 5 4 1.000000 1.000000 -1.000000 1
## 6 6 5 -1.000000 -1.000000 1.000000 1
## 7 7 7 -1.000000 1.000000 1.000000 1
## 8 8 2 1.000000 -1.000000 -1.000000 1
## 9 9 6 1.000000 -1.000000 1.000000 1
## 10 10 9 0.000000 0.000000 0.000000 1
## 11 11 3 -1.000000 1.000000 -1.000000 1
## 12 12 11 0.000000 0.000000 0.000000 1
## 13 1 5 0.000000 0.000000 -1.825742 2
## 14 2 6 0.000000 0.000000 1.825742 2
## 15 3 9 0.000000 0.000000 0.000000 2
## 16 4 10 0.000000 0.000000 0.000000 2
## 17 5 2 1.825742 0.000000 0.000000 2
## 18 6 7 0.000000 0.000000 0.000000 2
## 19 7 1 -1.825742 0.000000 0.000000 2
## 20 8 4 0.000000 1.825742 0.000000 2
## 21 9 8 0.000000 0.000000 0.000000 2
## 22 10 3 0.000000 -1.825742 0.000000 2
##
## Data are stored in coded form using these coding formulas ...
## x1 ~ x1.as.is
## x2 ~ x2.as.is
## x3 ~ x3.as.is
SOdes2 <- ccd (2, n0 = c(4,6), alpha = "rotatable", inscribed = F)
SOdes2
## run.order std.order x1.as.is x2.as.is Block
## 1 1 1 -1.000000 -1.000000 1
## 2 2 7 0.000000 0.000000 1
## 3 3 2 1.000000 -1.000000 1
## 4 4 5 0.000000 0.000000 1
## 5 5 8 0.000000 0.000000 1
## 6 6 3 -1.000000 1.000000 1
## 7 7 6 0.000000 0.000000 1
## 8 8 4 1.000000 1.000000 1
## 9 1 7 0.000000 0.000000 2
## 10 2 4 0.000000 1.414214 2
## 11 3 6 0.000000 0.000000 2
## 12 4 8 0.000000 0.000000 2
## 13 5 1 -1.414214 0.000000 2
## 14 6 5 0.000000 0.000000 2
## 15 7 2 1.414214 0.000000 2
## 16 8 3 0.000000 -1.414214 2
## 17 9 10 0.000000 0.000000 2
## 18 10 9 0.000000 0.000000 2
##
## Data are stored in coded form using these coding formulas ...
## x1 ~ x1.as.is
## x2 ~ x2.as.is
With coding argument, you can set the levels of your
data into actual values
SOdes2 <- ccd (3, n0 = c(4,6), alpha = "rotatable", inscribed = F,
coding = list (
x1 ~ (Glucose - 50)/10,
x2 ~ (NaNO3 - 5)/2,
x3 ~ (K2HPO4 - 2)/1)
)
SOdes2
## run.order std.order Glucose NaNO3 K2HPO4 Block
## 1 1 5 40.00000 3.000000 3.0000000 1
## 2 2 7 40.00000 7.000000 3.0000000 1
## 3 3 11 50.00000 5.000000 2.0000000 1
## 4 4 2 60.00000 3.000000 1.0000000 1
## 5 5 6 60.00000 3.000000 3.0000000 1
## 6 6 1 40.00000 3.000000 1.0000000 1
## 7 7 4 60.00000 7.000000 1.0000000 1
## 8 8 10 50.00000 5.000000 2.0000000 1
## 9 9 3 40.00000 7.000000 1.0000000 1
## 10 10 8 60.00000 7.000000 3.0000000 1
## 11 11 12 50.00000 5.000000 2.0000000 1
## 12 12 9 50.00000 5.000000 2.0000000 1
## 13 1 3 50.00000 1.636414 2.0000000 2
## 14 2 5 50.00000 5.000000 0.3182072 2
## 15 3 7 50.00000 5.000000 2.0000000 2
## 16 4 1 33.18207 5.000000 2.0000000 2
## 17 5 6 50.00000 5.000000 3.6817928 2
## 18 6 2 66.81793 5.000000 2.0000000 2
## 19 7 4 50.00000 8.363586 2.0000000 2
## 20 8 9 50.00000 5.000000 2.0000000 2
## 21 9 8 50.00000 5.000000 2.0000000 2
## 22 10 10 50.00000 5.000000 2.0000000 2
## 23 11 12 50.00000 5.000000 2.0000000 2
## 24 12 11 50.00000 5.000000 2.0000000 2
##
## Data are stored in coded form using these coding formulas ...
## x1 ~ (Glucose - 50)/10
## x2 ~ (NaNO3 - 5)/2
## x3 ~ (K2HPO4 - 2)/1
Analysis of RSM
CR <- coded.data(
ChemReact,
x1 ~ (Time - 85) / 5,
x2 ~ (Temp - 175) / 5
)
CR1.rsm <- rsm::rsm(Yield ~ SO(Time, Temp), data = ChemReact1)
## Warning in rsm::rsm(Yield ~ SO(Time, Temp), data = ChemReact1): Some coefficients are aliased - cannot use 'rsm' methods.
## Returning an 'lm' object.
summary(CR1.rsm)
##
## Call:
## rsm(formula = Yield ~ SO(Time, Temp), data = ChemReact1)
##
## Residuals:
## 1 2 3 4 5 6 7
## -5.663e-18 -5.516e-18 -1.410e-18 -5.944e-18 -1.667e-01 2.333e-01 -6.667e-02
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.117e+02 7.717e+01 -6.631 0.02199 *
## FO(Time, Temp)Time 1.420e+01 1.304e+00 10.893 0.00832 **
## FO(Time, Temp)Temp -3.000e-01 3.545e-01 -0.846 0.48651
## TWI(Time, Temp) 5.000e-03 4.163e-03 1.201 0.35270
## PQ(Time, Temp)Time^2 -8.767e-02 6.360e-03 -13.785 0.00522 **
## PQ(Time, Temp)Temp^2 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2082 on 2 degrees of freedom
## Multiple R-squared: 0.9933, Adjusted R-squared: 0.98
## F-statistic: 74.55 on 4 and 2 DF, p-value: 0.01328
Here, besides the interaction term, all the term is significant.
Anova here tests the “lack of fit” which tests the variance not explaned by model but by the error (from the repeated experiments). This term’s p-values should be greater than effect of others!
To draw contour, use contour function
contour(CR1.rsm, ~ Time + Temp, image = TRUE)
To draw 3d plot, use persp function
persp(CR1.rsm,
Time ~ Temp, #plot xs
col=rainbow(50),
xlabs = c(expression("Time (min)", "Temperature (°C)")), # axis labels
theta=30,
phi=30, r = 120, d=120,
border = NULL,
ltheta = 0,
lphi = 0,
shade = 0.75, zlab="Yield", col.axis=37, font.lab=2, col.lab=35,
contour=("colors"))
## Warning in title(sub = dat$labs[5], ...): "ltheta" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "lphi" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "shade" is not a graphical parameter
Changing theta, phi, and r can rotate the graph
persp(CR1.rsm,
Time ~ Temp, #plot xs
col=rainbow(50),
xlabs = c(expression("Time (min)", "Temperature (°C)")), # axis labels
theta=50,
phi=3, r = 180, d=120,
border = NULL,
ltheta = 0,
lphi = 0,
shade = 0.75, zlab="Yield", col.axis=37, font.lab=2, col.lab=35,
contour=("colors"))
## Warning in title(sub = dat$labs[5], ...): "ltheta" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "lphi" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "shade" is not a graphical parameter
Logistic regression
Binary outcome and logistic regression
dataset_sbp$BTH_G %>% hist
Where, each level of BTH_G means 1: 20 to 24 years old 2: 25 to 26 years
old … 26: is 73 to 74 27: 75 years old or older.
Data manipulation
from this dataset, we can calulate
Hypertension
dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
dataset_sbp$DBP > 80,
1,
0) %>%
as.factor
Obesity
dataset_sbp$obesity <- ifelse(dataset_sbp$BMI > 25,
1,
0) %>%
as.factor()
Female (re-level)
dataset_sbp$Female <- ifelse(dataset_sbp$SEX == 2,
1,
0) %>%
as.factor
set.seed(1)
dataset_sbp_small <- subset(dataset_sbp,
row.names(dataset_sbp)%in% sample(x = 1:1000000, size = 1000))
Generating table
table(dataset_sbp$obesity,dataset_sbp$Female)
##
## 0 1
## 0 314235 359858
## 1 195992 129915
ggplot(dataset_sbp_small, aes(x = Female, y = obesity)) +
geom_point() +
ylab("Fasting blood sugar (mg/dL)") +
theme_classic()
ggplot(dataset_sbp_small, aes(x = FBS, y = obesity)) +
geom_point() +
ylab("Fasting blood sugar (mg/dL)") +
theme_classic()
male_oo <- table(dataset_sbp$obesity,dataset_sbp$SEX)[2,1]/table(dataset_sbp$obesity,dataset_sbp$SEX)[1,1]
female_oo <- table(dataset_sbp$obesity,dataset_sbp$SEX)[2,2]/table(dataset_sbp$obesity,dataset_sbp$SEX)[1,2]
odds_ratio_female_to_male <- female_oo/male_oo
odds_ratio_female_to_male
## [1] 0.5788211
Logistic regression with binary input (unadjusted)
glm_obesity_gender <- glm(data = dataset_sbp, obesity ~ Female,family = "binomial")
summary(glm_obesity_gender)
##
## Call:
## glm(formula = obesity ~ Female, family = "binomial", data = dataset_sbp)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.472067 0.002878 -164.0 <2e-16 ***
## Female1 -0.546762 0.004331 -126.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262484 on 999999 degrees of freedom
## Residual deviance: 1246322 on 999998 degrees of freedom
## AIC: 1246326
##
## Number of Fisher Scoring iterations: 4
glm_obesity_gender$coefficients["Female1"][[1]]
## [1] -0.5467618
glm_obesity_gender$coefficients["Female1"][[1]] %>% exp()
## [1] 0.5788211
The odds ratio of obesity from male to female is 0.5788211.
confidence interval
confint(glm_obesity_gender)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -0.4777099 -0.4664271
## Female1 -0.5552524 -0.5382738
After exponemtial transformation…
confint(glm_obesity_gender) %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.6202021 0.6272393
## Female1 0.5739274 0.5837551
With continuous variable
glm_obesity_fbs <- glm(data = dataset_sbp, obesity ~ FBS,family = "binomial")
summary(glm_obesity_fbs)
##
## Call:
## glm(formula = obesity ~ FBS, family = "binomial", data = dataset_sbp)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.962e+00 9.903e-03 -198.1 <2e-16 ***
## FBS 1.242e-02 9.706e-05 127.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262484 on 999999 degrees of freedom
## Residual deviance: 1244426 on 999998 degrees of freedom
## AIC: 1244430
##
## Number of Fisher Scoring iterations: 4
confint(glm_obesity_fbs) %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1378634 0.1433205
## FBS 1.0123035 1.0126887
Multiple logistic regression
In case the Gender and FBS as correlation, they may adjust the effect of obese
Data visualiation
ggplot(dataset_sbp_small, aes(x = Female, y = FBS)) +
geom_boxplot() +
ylab("Fasting blood sugar (mg/dL)") +
theme_classic()
Does the FBS differ by Gender?
lm(dat = dataset_sbp, FBS ~ Female) %>%
summary
##
## Call:
## lm(formula = FBS ~ Female, data = dataset_sbp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.142 -11.492 -4.492 4.508 261.508
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101.14193 0.03201 3159.9 <2e-16 ***
## Female1 -4.65011 0.04574 -101.7 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.86 on 999998 degrees of freedom
## Multiple R-squared: 0.01023, Adjusted R-squared: 0.01023
## F-statistic: 1.034e+04 on 1 and 999998 DF, p-value: < 2.2e-16
It looks like so.
Let’s run multiple logistic regression
glm(data = dataset_sbp, obesity ~ Female,family = "binomial") %>% confint() %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.6202021 0.6272393
## Female1 0.5739274 0.5837551
glm(data = dataset_sbp, obesity ~ FBS,family = "binomial") %>% confint() %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1378634 0.1433205
## FBS 1.0123035 1.0126887
glm_obesity_gender_fbs <- glm(data = dataset_sbp, obesity ~ Female + FBS,family = "binomial")
glm_obesity_gender_fbs %>% confint() %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1919579 0.1998134
## Female1 0.6011418 0.6115459
## FBS 1.0112972 1.0116816
glm(data = dataset_sbp, obesity ~ Female + FBS,family = "binomial") %>% confint() %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1919579 0.1998134
## Female1 0.6011418 0.6115459
## FBS 1.0112972 1.0116816
Confidence intervals
confint(glm_obesity_gender_fbs)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.65047915 -1.6103711
## Female1 -0.50892448 -0.4917653
## FBS 0.01123387 0.0116139
confint(glm_obesity_gender_fbs) %>% exp
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.1919579 0.1998134
## Female1 0.6011418 0.6115459
## FBS 1.0112972 1.0116816
Effect modification
To see if there is a effect modication, anova can be used.
glm_obesity_gender_fbs_int <- glm(data = dataset_sbp, obesity ~ Female * FBS,family = "binomial")
anova(glm_obesity_gender_fbs_int, test = "F")
## Warning in anova.glm(glm_obesity_gender_fbs_int, test = "F"): using F test with
## a 'binomial' family is inappropriate
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: obesity
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev F Pr(>F)
## NULL 1e+06 1262484
## Female 1 16162.1 1e+06 1246322 16162.1 < 2.2e-16 ***
## FBS 1 15098.9 1e+06 1231223 15098.9 < 2.2e-16 ***
## Female:FBS 1 2997.4 1e+06 1228226 2997.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(glm_obesity_gender_fbs_int)
##
## Call:
## glm(formula = obesity ~ Female * FBS, family = "binomial", data = dataset_sbp)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.2189161 0.0122824 -99.24 <2e-16 ***
## Female1 -1.5924525 0.0207134 -76.88 <2e-16 ***
## FBS 0.0073635 0.0001176 62.59 <2e-16 ***
## Female1:FBS 0.0110150 0.0002043 53.90 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1262484 on 999999 degrees of freedom
## Residual deviance: 1228226 on 999996 degrees of freedom
## AIC: 1228234
##
## Number of Fisher Scoring iterations: 4
Binomial distribution
How can we caclulate the confidence interval?
Data visualization of logistic regression
For categorical varaibles
We can visualize the Odds ratio and its confidence interval using
forest plot
#install.packages("sjPlot")
library(sjPlot)
##
## Attaching package: 'sjPlot'
## The following object is masked from 'package:ggplot2':
##
## set_theme
glm(data = dataset_sbp_small, obesity ~ SBP + FBS + DIS, family = "binomial") %>%
plot_model() +
theme_classic()
plot_models(
glm(data = dataset_sbp_small, hypertension ~ FBS, family = "binomial"),
glm(data = dataset_sbp_small, Female ~ SBP + DIS, family = "binomial"),
glm(data = dataset_sbp_small, obesity ~ SBP + FBS + SEX, family = "binomial"),
grid = TRUE
) +
theme_classic()
## Ignoring unknown labels:
## • shape : "p-level"
Logistic plot for binary predictors
Continuous variables
dataset_sbp_small$FBS %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 60.0 87.0 95.0 100.1 105.0 352.0
pred1 <- data.frame(FBS = seq(from = 60,
to = 352, by = 2))
pred1$predict <- predict(glm_obesity_fbs, newdata = pred1, type = "response")
predict(glm_obesity_fbs, newdata = pred1, type = "response")
## 1 2 3 4 5 6 7 8
## 0.2284725 0.2328800 0.2373465 0.2418716 0.2464551 0.2510966 0.2557960 0.2605526
## 9 10 11 12 13 14 15 16
## 0.2653662 0.2702362 0.2751621 0.2801434 0.2851793 0.2902692 0.2954125 0.3006083
## 17 18 19 20 21 22 23 24
## 0.3058558 0.3111541 0.3165024 0.3218997 0.3273449 0.3328370 0.3383749 0.3439574
## 25 26 27 28 29 30 31 32
## 0.3495834 0.3552516 0.3609606 0.3667093 0.3724961 0.3783197 0.3841786 0.3900713
## 33 34 35 36 37 38 39 40
## 0.3959962 0.4019519 0.4079366 0.4139488 0.4199867 0.4260486 0.4321329 0.4382377
## 41 42 43 44 45 46 47 48
## 0.4443613 0.4505018 0.4566574 0.4628263 0.4690066 0.4751964 0.4813938 0.4875970
## 49 50 51 52 53 54 55 56
## 0.4938040 0.5000129 0.5062218 0.5124288 0.5186320 0.5248294 0.5310192 0.5371994
## 57 58 59 60 61 62 63 64
## 0.5433682 0.5495238 0.5556643 0.5617877 0.5678925 0.5739766 0.5800385 0.5860763
## 65 66 67 68 69 70 71 72
## 0.5920883 0.5980729 0.6040285 0.6099533 0.6158459 0.6217046 0.6275280 0.6333147
## 73 74 75 76 77 78 79 80
## 0.6390632 0.6447721 0.6504401 0.6560659 0.6616482 0.6671860 0.6726779 0.6781229
## 81 82 83 84 85 86 87 88
## 0.6835199 0.6888680 0.6941662 0.6994135 0.7046090 0.7097521 0.7148418 0.7198775
## 89 90 91 92 93 94 95 96
## 0.7248585 0.7297842 0.7346539 0.7394673 0.7442237 0.7489228 0.7535641 0.7581474
## 97 98 99 100 101 102 103 104
## 0.7626722 0.7671384 0.7715457 0.7758940 0.7801831 0.7844129 0.7885834 0.7926945
## 105 106 107 108 109 110 111 112
## 0.7967463 0.8007388 0.8046721 0.8085463 0.8123616 0.8161182 0.8198162 0.8234559
## 113 114 115 116 117 118 119 120
## 0.8270377 0.8305617 0.8340283 0.8374379 0.8407908 0.8440875 0.8473282 0.8505136
## 121 122 123 124 125 126 127 128
## 0.8536439 0.8567198 0.8597416 0.8627099 0.8656252 0.8684881 0.8712990 0.8740585
## 129 130 131 132 133 134 135 136
## 0.8767673 0.8794258 0.8820347 0.8845945 0.8871059 0.8895695 0.8919859 0.8943557
## 137 138 139 140 141 142 143 144
## 0.8966795 0.8989579 0.9011917 0.9033813 0.9055276 0.9076310 0.9096923 0.9117120
## 145 146 147
## 0.9136909 0.9156295 0.9175285
predict(glm_obesity_fbs, data.frame(FBS = 0), type = "response")
## 1
## 0.1232439
plot(dataset_sbp_small$FBS, as.numeric(dataset_sbp_small$obesity == 1),
ylab = "Obesity",
xlab = "FBS (mg/dL)") +
lines(pred1$FBS, pred1$predict)
## integer(0)
ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
geom_point() +
labs(y = "Probability of obesity", x = "FBS (mg/dL)") +
theme_classic()
Binomial distribution
Using binomial distribution, we can breifly know what the data is telling.
Let’s assume that we are flipping a coin.
#install.packages("tidydice")
library(tidydice)
flip_coin(times = 1)
## # A tibble: 1 × 5
## experiment round nr result success
## <int> <int> <int> <int> <lgl>
## 1 1 1 1 2 TRUE
We can flip the coin 1 times, and the result will be
flip_coin(times = 2) %>% .$success %>% (mean)
## [1] 0.5
flip_coin(times = 2) %>% .$success %>% (mean)
## [1] 0
Sometimes we get 5 heads by 10, but many of trials will have results of 4 heads/3heads/6heads…etc.
We can simulate 1000 round of 10-flips as below.
flip_coin(times = 10, rounds = 1000) %>% head()
## # A tibble: 6 × 5
## experiment round nr result success
## <int> <int> <int> <int> <lgl>
## 1 1 1 1 2 TRUE
## 2 1 1 2 2 TRUE
## 3 1 1 3 1 FALSE
## 4 1 1 4 2 TRUE
## 5 1 1 5 2 TRUE
## 6 1 1 6 1 FALSE
flip_coin(times = 2, rounds = 10) %>%
group_by(round) %>%
summarise(probability = mean(success))
## # A tibble: 10 × 2
## round probability
## <int> <dbl>
## 1 1 0.5
## 2 2 0.5
## 3 3 0.5
## 4 4 0
## 5 5 1
## 6 6 0
## 7 7 0.5
## 8 8 1
## 9 9 0.5
## 10 10 0.5
c(1.0, 0.5, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 1.0 ) %>% hist(main = "Probability of getting heads when flipiing a coin\n(2 times X 10 rounds)", probability = T)
flip_coin(times = 10, rounds = 100000) %>%
group_by(round) %>%
summarise(probability = mean(success)) %>%
.$probability %>%
hist(., main = "Probability of getting heads when flipiing a coin\n(10 times X 100000 rounds)",
breaks = 10, xlim = c(0,1), probability = T)
flip_coin(times = 50, rounds = 100000) %>%
group_by(round) %>%
summarise(probability = mean(success)) %>%
.$probability %>%
hist(., main = "Probability of getting heads when flipiing a coin\n(50 times X 100000 rounds)",
breaks = 25, probability = T, xlim = c(0,1))
flip_coin(times = 100, rounds = 10000) %>%
group_by(round) %>%
summarise(probability = mean(success)) %>%
.$probability %>%
hist(., main = "Probability of getting heads when flipiing a coin\n(100 times X 10000 rounds)",
breaks = 50, probability = T, xlim = c(0,1))
flip_coin(times = 1000, rounds = 10000) %>%
group_by(round) %>%
summarise(probability = mean(success)) %>%
.$probability %>%
hist(., main = "Probability of getting heads when flipiing a coin\n(1000 times X 10000 rounds)",
breaks = 20, xlim = c(0,1),
probability = T)
As continuous variable is So, we usually “Predict” the proportion of the sample.
glm_obesity_fbs_small <- glm(data = dataset_sbp, obesity ~ FBS, family = "binomial")
ilink <- family(glm_obesity_fbs_small)$linkinv
pred1 <- with(dataset_sbp_small,
data.frame(FBS = seq(min(FBS), max(FBS),
length = 100)))
pred1 <- cbind(pred1, predict(glm_obesity_fbs_small, pred1, type = "link", se.fit = TRUE)[1:2])
pred1 <- transform(pred1, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
Lower = ilink(fit - (2 * se.fit)))
ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
geom_point()
theme_classic() +
labs(y = "Probability of obesity", x = "FBS (mg/dL)")
## <theme> List of 146
## $ line : <ggplot2::element_line>
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ lineend : chr "butt"
## ..@ linejoin : chr "round"
## ..@ arrow : logi FALSE
## ..@ arrow.fill : chr "black"
## ..@ inherit.blank: logi TRUE
## $ rect : <ggplot2::element_rect>
## ..@ fill : chr "white"
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ linejoin : chr "round"
## ..@ inherit.blank: logi TRUE
## $ text : <ggplot2::element_text>
## ..@ family : chr ""
## ..@ face : chr "plain"
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : chr "black"
## ..@ size : num 11
## ..@ hjust : num 0.5
## ..@ vjust : num 0.5
## ..@ angle : num 0
## ..@ lineheight : num 0.9
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 0
## ..@ debug : logi FALSE
## ..@ inherit.blank: logi TRUE
## $ title : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ point : <ggplot2::element_point>
## ..@ colour : chr "black"
## ..@ shape : num 19
## ..@ size : num 1.5
## ..@ fill : chr "white"
## ..@ stroke : num 0.5
## ..@ inherit.blank: logi TRUE
## $ polygon : <ggplot2::element_polygon>
## ..@ fill : chr "white"
## ..@ colour : chr "black"
## ..@ linewidth : num 0.5
## ..@ linetype : num 1
## ..@ linejoin : chr "round"
## ..@ inherit.blank: logi TRUE
## $ geom : <ggplot2::element_geom>
## ..@ ink : chr "black"
## ..@ paper : chr "white"
## ..@ accent : chr "#3366FF"
## ..@ linewidth : num 0.5
## ..@ borderwidth: num 0.5
## ..@ linetype : int 1
## ..@ bordertype : int 1
## ..@ family : chr ""
## ..@ fontsize : num 3.87
## ..@ pointsize : num 1.5
## ..@ pointshape : num 19
## ..@ colour : NULL
## ..@ fill : NULL
## $ spacing : 'simpleUnit' num 5.5points
## ..- attr(*, "unit")= int 8
## $ margins : <ggplot2::margin> num [1:4] 5.5 5.5 5.5 5.5
## $ aspect.ratio : NULL
## $ axis.title : NULL
## $ axis.title.x : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 2.75 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.x.top : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 0
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 2.75 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.x.bottom : NULL
## $ axis.title.y : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : num 90
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.75 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.title.y.left : NULL
## $ axis.title.y.right : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : num -90
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 2.75
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : 'rel' num 0.8
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 1
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 2.2 0 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x.top : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : NULL
## ..@ vjust : num 0
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 2.2 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.x.bottom : NULL
## $ axis.text.y : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 1
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.2 0 0
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.y.left : NULL
## $ axis.text.y.right : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 0 0 2.2
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.text.theta : NULL
## $ axis.text.r : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0.5
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : <ggplot2::margin> num [1:4] 0 2.2 0 2.2
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.ticks : <ggplot2::element_line>
## ..@ colour : NULL
## ..@ linewidth : NULL
## ..@ linetype : NULL
## ..@ lineend : NULL
## ..@ linejoin : NULL
## ..@ arrow : logi FALSE
## ..@ arrow.fill : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.ticks.x : NULL
## $ axis.ticks.x.top : NULL
## $ axis.ticks.x.bottom : NULL
## $ axis.ticks.y : NULL
## $ axis.ticks.y.left : NULL
## $ axis.ticks.y.right : NULL
## $ axis.ticks.theta : NULL
## $ axis.ticks.r : NULL
## $ axis.minor.ticks.x.top : NULL
## $ axis.minor.ticks.x.bottom : NULL
## $ axis.minor.ticks.y.left : NULL
## $ axis.minor.ticks.y.right : NULL
## $ axis.minor.ticks.theta : NULL
## $ axis.minor.ticks.r : NULL
## $ axis.ticks.length : 'rel' num 0.5
## $ axis.ticks.length.x : NULL
## $ axis.ticks.length.x.top : NULL
## $ axis.ticks.length.x.bottom : NULL
## $ axis.ticks.length.y : NULL
## $ axis.ticks.length.y.left : NULL
## $ axis.ticks.length.y.right : NULL
## $ axis.ticks.length.theta : NULL
## $ axis.ticks.length.r : NULL
## $ axis.minor.ticks.length : 'rel' num 0.75
## $ axis.minor.ticks.length.x : NULL
## $ axis.minor.ticks.length.x.top : NULL
## $ axis.minor.ticks.length.x.bottom: NULL
## $ axis.minor.ticks.length.y : NULL
## $ axis.minor.ticks.length.y.left : NULL
## $ axis.minor.ticks.length.y.right : NULL
## $ axis.minor.ticks.length.theta : NULL
## $ axis.minor.ticks.length.r : NULL
## $ axis.line : <ggplot2::element_line>
## ..@ colour : NULL
## ..@ linewidth : NULL
## ..@ linetype : NULL
## ..@ lineend : chr "square"
## ..@ linejoin : NULL
## ..@ arrow : logi FALSE
## ..@ arrow.fill : NULL
## ..@ inherit.blank: logi TRUE
## $ axis.line.x : NULL
## $ axis.line.x.top : NULL
## $ axis.line.x.bottom : NULL
## $ axis.line.y : NULL
## $ axis.line.y.left : NULL
## $ axis.line.y.right : NULL
## $ axis.line.theta : NULL
## $ axis.line.r : NULL
## $ legend.background : <ggplot2::element_rect>
## ..@ fill : NULL
## ..@ colour : logi NA
## ..@ linewidth : NULL
## ..@ linetype : NULL
## ..@ linejoin : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.margin : NULL
## $ legend.spacing : 'rel' num 2
## $ legend.spacing.x : NULL
## $ legend.spacing.y : NULL
## $ legend.key : NULL
## $ legend.key.size : 'simpleUnit' num 1.2lines
## ..- attr(*, "unit")= int 3
## $ legend.key.height : NULL
## $ legend.key.width : NULL
## $ legend.key.spacing : NULL
## $ legend.key.spacing.x : NULL
## $ legend.key.spacing.y : NULL
## $ legend.key.justification : NULL
## $ legend.frame : NULL
## $ legend.ticks : NULL
## $ legend.ticks.length : 'rel' num 0.2
## $ legend.axis.line : NULL
## $ legend.text : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : 'rel' num 0.8
## ..@ hjust : NULL
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.text.position : NULL
## $ legend.title : <ggplot2::element_text>
## ..@ family : NULL
## ..@ face : NULL
## ..@ italic : chr NA
## ..@ fontweight : num NA
## ..@ fontwidth : num NA
## ..@ colour : NULL
## ..@ size : NULL
## ..@ hjust : num 0
## ..@ vjust : NULL
## ..@ angle : NULL
## ..@ lineheight : NULL
## ..@ margin : NULL
## ..@ debug : NULL
## ..@ inherit.blank: logi TRUE
## $ legend.title.position : NULL
## $ legend.position : chr "right"
## $ legend.position.inside : NULL
## $ legend.direction : NULL
## $ legend.byrow : NULL
## $ legend.justification : chr "center"
## $ legend.justification.top : NULL
## $ legend.justification.bottom : NULL
## $ legend.justification.left : NULL
## $ legend.justification.right : NULL
## $ legend.justification.inside : NULL
## [list output truncated]
## @ complete: logi TRUE
## @ validate: logi TRUE
ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
geom_point() +
geom_line(data = pred1, aes(y = Fitted, x = FBS)) +
geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = FBS),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
labs(y = "Probability of obesity", x = "FBS (mg/dL)")
ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
#geom_point() +
geom_line(data = pred1, aes(y = Fitted, x = FBS)) +
geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = FBS),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
labs(y = "Probability of obesity", x = "FBS (mg/dL)")
glm_hypertension_sbp <- glm(data = dataset_sbp_small, hypertension ~ SBP, family = "binomial")
ilink <- family(glm_hypertension_sbp)$linkinv
pred1 <- with(dataset_sbp_small,
data.frame(SBP = seq(min(SBP), max(SBP),
length = 100)))
pred1 <- cbind(pred1, predict(glm_hypertension_sbp, pred1, type = "link", se.fit = TRUE)[1:2])
pred1 <- transform(pred1, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
Lower = ilink(fit - (2 * se.fit)))
ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
geom_point() +
theme_classic() +
labs(y = "Hypretension", x = "SBP (mmHg)")
ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
geom_point() +
geom_line(data = pred1, aes(y = Fitted, x = SBP)) +
geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
labs(y = "Hypretension", x = "SBP (mmHg)")
ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
#geom_point() +
geom_line(data = pred1, aes(y = Fitted, x = SBP)) +
geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
labs(y = "Hypretension", x = "SBP (mmHg)")
Predicted porportion by groups
dataset_sbp_small_dis1 <- dataset_sbp_small %>%
subset(., as.numeric(.$DIS) == 1)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis1,
hypertension ~ SBP, family = "binomial")
ilink <- family(glm_hypertension_sbp)$linkinv
pred1 <- with(dataset_sbp_small_dis1,
data.frame(SBP = seq(min(SBP), max(SBP),
length = 100)))
pred1 <- cbind(pred1, predict(glm_hypertension_sbp, pred1, type = "link", se.fit = TRUE)[1:2])
pred1 <- transform(pred1, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
Lower = ilink(fit - (2 * se.fit)))
dataset_sbp_small_dis2 <- dataset_sbp_small %>%
subset(., as.numeric(.$DIS) == 2)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis2,
hypertension ~ SBP, family = "binomial")
ilink <- family(glm_hypertension_sbp)$linkinv
pred2 <- with(dataset_sbp_small_dis2,
data.frame(SBP = seq(min(SBP), max(SBP),
length = 100)))
pred2 <- cbind(pred2, predict(glm_hypertension_sbp, pred2, type = "link", se.fit = TRUE)[1:2])
pred2 <- transform(pred2, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
Lower = ilink(fit - (2 * se.fit)))
dataset_sbp_small_dis3 <- dataset_sbp_small %>%
subset(., as.numeric(.$DIS) == 3)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis3,
hypertension ~ SBP, family = "binomial")
ilink <- family(glm_hypertension_sbp)$linkinv
pred3 <- with(dataset_sbp_small_dis3,
data.frame(SBP = seq(min(SBP), max(SBP),
length = 100)))
pred3 <- cbind(pred3, predict(glm_hypertension_sbp, pred3, type = "link", se.fit = TRUE)[1:2])
pred3 <- transform(pred3, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
Lower = ilink(fit - (2 * se.fit)))
dataset_sbp_small_dis4 <- dataset_sbp_small %>%
subset(., as.numeric(.$DIS) == 4)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis4,
hypertension ~ SBP, family = "binomial")
ilink <- family(glm_hypertension_sbp)$linkinv
pred4 <- with(dataset_sbp_small_dis4,
data.frame(SBP = seq(min(SBP), max(SBP),
length = 100)))
pred4 <- cbind(pred4, predict(glm_hypertension_sbp, pred4, type = "link", se.fit = TRUE)[1:2])
pred4 <- transform(pred4, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
Lower = ilink(fit - (2 * se.fit)))
ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
#geom_point() +
geom_line(data = pred1, aes(y = Fitted, x = SBP), col = "blue") +
geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
geom_line(data = pred2, aes(y = Fitted, x = SBP), col = "red") +
geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
fill = "pink", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
geom_line(data = pred3, aes(y = Fitted, x = SBP), col = "green") +
geom_ribbon(data = pred3, aes(ymin = Lower, ymax = Upper, x = SBP),
fill = "lightgreen", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
geom_line(data = pred4, aes(y = Fitted, x = SBP), col = "brown") +
geom_ribbon(data = pred4, aes(ymin = Lower, ymax = Upper, x = SBP),
fill = "yellow", alpha = 0.2, inherit.aes = FALSE) +
theme_classic() +
labs(y = "Hypretension", x = "SBP (mmHg)")
How to use this logistic regression in prediction?
Types of errors
ggplot(dataset_sbp_small, aes(x = SBP)) +
geom_histogram() +
labs(y = "Hypertension", x = "SBP (mmHg)") +
theme_classic() +
facet_wrap(~hypertension, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
dataset_sbp_small$pred_hypertension <-
ifelse(glm(data = dataset_sbp_small, hypertension ~ SBP, family = "binomial") %>% predict(., type = "link") > 0, 1, 0)
table(dataset_sbp_small$hypertension, dataset_sbp_small$pred_hypertension)
##
## 0 1
## 0 604 81
## 1 52 263
ggplot(dataset_sbp_small, aes(x = SBP, y = hypertension, col = as.factor(pred_hypertension))) +
geom_point() +
theme_classic() +
guides(col=guide_legend(title="Prediction"))
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
pROC::roc(dataset_sbp_small$hypertension, dataset_sbp_small$pred_hypertensio, plot = TRUE,
print.auc = TRUE) +
xlim(c(1,0))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## NULL
Separating dataset into train and test sets
set.seed(1)
default_idx = sample(nrow(dataset_sbp), 5000)
default_trn = dataset_sbp[default_idx, ]
default_tst = dataset_sbp[-default_idx, ]
#install.packages("pROC")
library(pROC)
model_train <- glm(data = default_trn, hypertension ~ SBP, family = "binomial")
test_prob = predict(model_train,
newdata = default_tst,
type = "response")
test_roc = roc(default_tst$hypertension ~ test_prob,
plot = TRUE,
print.auc = TRUE) +
xlim(c(1,0))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
Cox proportional hazard model
Cox proportional hazards model (survival::lung)
Cox proportional hazards model using survival::lung - This is an example dataset from the survival package. - Outcome: time to death - Covariates: age, sex, performance status (ph.ecog)
library(survival)
# Load built-in lung cancer survival dataset
lung_data <- survival::lung
# Quick look at the data
head(lung_data)
## inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1 3 306 2 74 1 1 90 100 1175 NA
## 2 3 455 2 68 1 0 90 90 1225 15
## 3 3 1010 1 56 1 0 90 90 NA 15
## 4 5 210 2 57 1 1 90 60 1150 11
## 5 1 883 2 60 1 0 100 90 NA 0
## 6 12 1022 1 74 1 1 50 80 513 0
summary(lung_data)
## inst time status age
## Min. : 1.00 Min. : 5.0 Min. :1.000 Min. :39.00
## 1st Qu.: 3.00 1st Qu.: 166.8 1st Qu.:1.000 1st Qu.:56.00
## Median :11.00 Median : 255.5 Median :2.000 Median :63.00
## Mean :11.09 Mean : 305.2 Mean :1.724 Mean :62.45
## 3rd Qu.:16.00 3rd Qu.: 396.5 3rd Qu.:2.000 3rd Qu.:69.00
## Max. :33.00 Max. :1022.0 Max. :2.000 Max. :82.00
## NA's :1
## sex ph.ecog ph.karno pat.karno
## Min. :1.000 Min. :0.0000 Min. : 50.00 Min. : 30.00
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 75.00 1st Qu.: 70.00
## Median :1.000 Median :1.0000 Median : 80.00 Median : 80.00
## Mean :1.395 Mean :0.9515 Mean : 81.94 Mean : 79.96
## 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.: 90.00 3rd Qu.: 90.00
## Max. :2.000 Max. :3.0000 Max. :100.00 Max. :100.00
## NA's :1 NA's :1 NA's :3
## meal.cal wt.loss
## Min. : 96.0 Min. :-24.000
## 1st Qu.: 635.0 1st Qu.: 0.000
## Median : 975.0 Median : 7.000
## Mean : 928.8 Mean : 9.832
## 3rd Qu.:1150.0 3rd Qu.: 15.750
## Max. :2600.0 Max. : 68.000
## NA's :47 NA's :14
# Recode status into 0/1 for clarity (0 = censored, 1 = death)
lung_data <- lung_data %>%
mutate(
status_bin = ifelse(status == 2, 1, 0),
sex = factor(sex, levels = c(1, 2),
labels = c("Male", "Female"))
)
table(lung_data$status_bin)
##
## 0 1
## 63 165
table(lung_data$sex)
##
## Male Female
## 138 90
Fitting Cox model
# Cox proportional hazards model
cox_fit <- coxph(Surv(time, status_bin) ~ age + sex + ph.ecog,
data = lung_data)
summary(cox_fit)
## Call:
## coxph(formula = Surv(time, status_bin) ~ age + sex + ph.ecog,
## data = lung_data)
##
## n= 227, number of events= 164
## (1 observation deleted due to missingness)
##
## coef exp(coef) se(coef) z Pr(>|z|)
## age 0.011067 1.011128 0.009267 1.194 0.232416
## sexFemale -0.552612 0.575445 0.167739 -3.294 0.000986 ***
## ph.ecog 0.463728 1.589991 0.113577 4.083 4.45e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## exp(coef) exp(-coef) lower .95 upper .95
## age 1.0111 0.9890 0.9929 1.0297
## sexFemale 0.5754 1.7378 0.4142 0.7994
## ph.ecog 1.5900 0.6289 1.2727 1.9864
##
## Concordance= 0.637 (se = 0.025 )
## Likelihood ratio test= 30.5 on 3 df, p=1e-06
## Wald test = 29.93 on 3 df, p=1e-06
## Score (logrank) test = 30.5 on 3 df, p=1e-06
Kaplan–Meier curves by sex
# Kaplan–Meier survival curves stratified by sex
km_fit_sex <- survfit(Surv(time, status_bin) ~ sex,
data = lung_data)
plot(km_fit_sex,
col = c("blue", "red"),
lwd = 2,
xlab = "Time (days)",
ylab = "Survival probability",
main = "Kaplan–Meier curves by sex (lung dataset)")
legend("bottomleft",
legend = levels(lung_data$sex),
col = c("blue", "red"),
lwd = 2,
bty = "n")
Check proportional hazards assumption
# Test proportional hazards assumption
cox_zph <- cox.zph(cox_fit)
cox_zph
## chisq df p
## age 0.188 1 0.66
## sex 2.305 1 0.13
## ph.ecog 2.054 1 0.15
## GLOBAL 4.464 3 0.22
# Plot Schoenfeld residuals for visual inspection (optional)
par(mfrow = c(2, 2))
plot(cox_zph)
par(mfrow = c(1, 1))
## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.