COD_week14_2_MGK_BTE3207

Minsik Kim

2025-12-04

Analysis of variance

Before begin..

Let’s load the SBP dataset.

dataset_sbp <- read.csv(file = "Inha/5_Lectures/1_BTE3207_Biostats/2025F/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")

reactable::reactable(cbind(order = 1:1000, dataset_sbp[1:1000,]),
                     filterable = T, sortable = T)

Making a new variable hypertension

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
                                           dataset_sbp$DBP > 80,
                                   T,
                                   F)


dataset_sbp$history_of_hypertension <- ifelse(dataset_sbp$DIS == 1 |
                                           dataset_sbp$DIS == 2,
                                   T,
                                   F)

dataset_sbp$DIS <- factor(dataset_sbp$DIS, 
                          levels = c(1,2,3,4),
                          labels = c("Hypertension and diabetes",
                                     "Hypertension",
                                     "Diabetes",
                                     "No history"))

set.seed(1)

dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))

DIS groups

ggplot(dataset_sbp, aes(y = SBP, x = DIS)) +
        geom_boxplot() +
        xlab("History of hypertension or diabetes") +
        ylab("SBP (mmHg)") +
        theme_classic(base_size = 12)

T-test?

DIS group 1 vs 4

t.test(dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP,
       dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP,
       var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP
## t = 169.59, df = 59802, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  11.18067 11.44213
## sample estimates:
## mean of x mean of y 
##  130.5640  119.2526

DIS group 2 vs 4

t.test(dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP,
       dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP,
       var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP
## t = 282.46, df = 225561, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  11.22032 11.37713
## sample estimates:
## mean of x mean of y 
##  130.5513  119.2526

DIS group 3 vs 4

t.test(dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP,
       dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP,
       var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP
## t = 60.351, df = 48165, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  3.937403 4.201738
## sample estimates:
## mean of x mean of y 
##  123.3221  119.2526

DIS group 1 vs 2

t.test(dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP,
       dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP,
       var.equal = F)

## 
##  Welch Two Sample t-test
## 
## data:  dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP and dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP
## t = 0.16998, df = 90286, p-value = 0.865
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1334474  0.1587919
## sample estimates:
## mean of x mean of y 
##  130.5640  130.5513

….

Do you think it is reasonable?

Linear model?

lm(data = dataset_sbp, SBP ~ DIS) %>% 
        anova

## Analysis of Variance Table
## 
## Response: SBP
##              Df    Sum Sq Mean Sq F value    Pr(>F)    
## DIS       3e+00  21472596 7157532   37558 < 2.2e-16 ***
## Residuals 1e+06 190570477     191                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(data = dataset_sbp, SBP ~ DIS) %>% summary

##                Df    Sum Sq Mean Sq F value Pr(>F)    
## DIS         3e+00  21472596 7157532   37558 <2e-16 ***
## Residuals   1e+06 190570477     191                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

It shows how different whether the group difference estimates with beta values (as well as CIs and p-values).

But can we assess wheter the DIS is a significant factor affecting SBP?

Here comes the analysis of variance.

Analysis of variance

Let’s say we are comparing multipel groups

Making a new dataset

midterm <- data.frame(midterm = c(82, 83, 97, 83, 78, 68, 38, 59, 55),
           major = rep(c("BE", "MBE", "CHEM"), times=1, each=3))

mean_sim <- 10
std_sim <- 5

lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)

ggplot(data = data.frame(u = c(lcb, ucb)),
       mapping = aes(x = u)) +
  stat_function(mapping = aes(colour = "Group 1"),
                fun = dnorm,
                args = list(mean = 20,
                            sd = std_sim)) +
  stat_function(mapping = aes(colour = "Group 2"),
                fun = dnorm,
                args = list(mean = 40,
                            sd = (std_sim))) +
        stat_function(mapping = aes(colour = "Group 3"),
                fun = dnorm,
                args = list(mean = 15,
                            sd = (std_sim))) +
  scale_colour_manual(values = c("red", "blue", "green")) +
  labs(x = "Values",
       y = "Densities",
       col = "Groups") +
        theme_classic(base_size = 12)

Variance of group means

mean_sim <- 10
std_sim <- 5

lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)

ggplot(data = data.frame(u = c(lcb, ucb)),
       mapping = aes(x = u)) +
  stat_function(mapping = aes(colour = "Group 1"),
                fun = dnorm,
                args = list(mean = 20,
                            sd = std_sim)) +
  stat_function(mapping = aes(colour = "Group 2"),
                fun = dnorm,
                args = list(mean = 40,
                            sd = (std_sim))) +
        stat_function(mapping = aes(colour = "Group 3"),
                fun = dnorm,
                args = list(mean = 15,
                            sd = (std_sim))) +
        geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
        scale_colour_manual(values = c("grey", "grey", "grey")) +
        geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
  labs(x = "Values",
       y = "Densities",
       col = "Groups") +
        theme_classic(base_size = 12)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Uncertainty - variance within groups

with smaller variance within group

mean_sim <- 10
std_sim <- 5

lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)

ggplot(data = data.frame(u = c(lcb, ucb)),
       mapping = aes(x = u)) +
  stat_function(mapping = aes(colour = "Group 1"),
                fun = dnorm,
                args = list(mean = 20,
                            sd = 0.4*std_sim)) +
  stat_function(mapping = aes(colour = "Group 2"),
                fun = dnorm,
                args = list(mean = 40,
                            sd = 0.4*(std_sim))) +
        stat_function(mapping = aes(colour = "Group 3"),
                fun = dnorm,
                args = list(mean = 15,
                            sd = 0.4*(std_sim))) +
        geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
        scale_colour_manual(values = c("grey", "grey", "grey")) +
        geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
  labs(x = "Values",
       y = "Densities",
       col = "Groups") +
        theme_classic(base_size = 12)

Extremely smalle variance

mean_sim <- 10
std_sim <- 5

lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)

ggplot(data = data.frame(u = c(lcb, ucb)),
       mapping = aes(x = u)) +
  stat_function(mapping = aes(colour = "Group 1"),
                fun = dnorm,
                args = list(mean = 20,
                            sd = 0.01*std_sim)) +
  stat_function(mapping = aes(colour = "Group 2"),
                fun = dnorm,
                args = list(mean = 40,
                            sd = 0.01*(std_sim))) +
        stat_function(mapping = aes(colour = "Group 3"),
                fun = dnorm,
                args = list(mean = 15,
                            sd = 0.01*(std_sim))) +
        geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
        scale_colour_manual(values = c("grey", "grey", "grey")) +
        geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
  labs(x = "Values",
       y = "Densities",
       col = "Groups") +
        theme_classic(base_size = 12)

With extremly high sd

mean_sim <- 10
std_sim <- 5

lcb <- ((mean_sim - (3 * std_sim)) - 5)
ucb <- (((2 * mean_sim) + (3 * (2 * std_sim))) + 5)

ggplot(data = data.frame(u = c(lcb, ucb)),
       mapping = aes(x = u)) +
  stat_function(mapping = aes(colour = "Group 1"),
                fun = dnorm,
                args = list(mean = 20,
                            sd = 10*std_sim)) +
  stat_function(mapping = aes(colour = "Group 2"),
                fun = dnorm,
                args = list(mean = 40,
                            sd = 10*(std_sim))) +
        stat_function(mapping = aes(colour = "Group 3"),
                fun = dnorm,
                args = list(mean = 15,
                            sd = 10*(std_sim))) +
        geom_vline(xintercept = c(20, 40, 15), color= c("red", "blue", "green")) +
        scale_colour_manual(values = c("grey", "grey", "grey")) +
        geom_vline(xintercept = mean(c(15, 40, 20)), col = "Black", size = 2) +
        ylim(c(0,0.01)) +
  labs(x = "Values",
       y = "Densities",
       col = "Groups") +
        theme_classic(base_size = 12)

SBP dataset

In histogram, the data can be shown as

ggplot(data = dataset_sbp) +
  stat_function(mapping = aes(colour = "History of hypertension and diabetes"),
                fun = dnorm,
                args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP %>% mean,
                            sd = dataset_sbp %>% filter(as.numeric(DIS) == 1) %>% .$SBP %>% sd)) +
  stat_function(mapping = aes(colour = "History of hypertension"),
                fun = dnorm,
                args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP %>% mean,
                            sd = dataset_sbp %>% filter(as.numeric(DIS) == 2) %>% .$SBP %>% sd)) +
        stat_function(mapping = aes(colour = "History of diabetes"),
                fun = dnorm,
                args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP %>% mean,
                            sd = dataset_sbp %>% filter(as.numeric(DIS) == 3) %>% .$SBP %>% sd)) +
        stat_function(mapping = aes(colour = "No history"),
                fun = dnorm,
                args = list(mean = dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP %>% mean,
                            sd = dataset_sbp %>% filter(as.numeric(DIS) == 4) %>% .$SBP %>% sd)) +
  scale_color_brewer(type = "qual", palette = 2) +
        xlim(c(100,150)) +
  labs(x = "SBP (mmHg)",
       y = "Densities",
       col = "Groups") +
        theme_classic(base_size = 12)

## Warning in stat_function(mapping = aes(colour = "History of hypertension and diabetes"), : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

## Warning in stat_function(mapping = aes(colour = "History of hypertension"), : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

## Warning in stat_function(mapping = aes(colour = "History of diabetes"), : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

## Warning in stat_function(mapping = aes(colour = "No history"), fun = dnorm, : All aesthetics have length 1, but the data has 1000000 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

ANOVA in R

aov() function can calculate the result of analysis of variance

aov(midterm ~ major, data = midterm) %>%
        summary

##             Df Sum Sq Mean Sq F value  Pr(>F)   
## major        2   2124  1062.1   12.59 0.00712 **
## Residuals    6    506    84.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

aov(SBP ~ DIS, data = dataset_sbp_small) %>%
        summary

##              Df Sum Sq Mean Sq F value Pr(>F)    
## DIS           3  29932    9977   53.54 <2e-16 ***
## Residuals   996 185607     186                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Else, we can implement anova into effect modifications

aov(SBP ~ SEX + BTH_G * BMI + FBS + DIS * DBP, data = dataset_sbp_small) %>%
        summary

##              Df Sum Sq Mean Sq  F value   Pr(>F)    
## SEX           1   6475    6475   82.630  < 2e-16 ***
## BTH_G         1  23618   23618  301.394  < 2e-16 ***
## BMI           1  10826   10826  138.150  < 2e-16 ***
## FBS           1   2905    2905   37.073 1.63e-09 ***
## DIS           3   8318    2773   35.383  < 2e-16 ***
## DBP           1  85682   85682 1093.431  < 2e-16 ***
## BTH_G:BMI     1      0       0    0.000    0.985    
## DIS:DBP       3    374     125    1.593    0.190    
## Residuals   987  77342      78                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

lm(SBP ~ SEX + BTH_G * BMI + FBS + DIS * DBP, data = dataset_sbp_small) %>%
        summary

## 
## Call:
## lm(formula = SBP ~ SEX + BTH_G * BMI + FBS + DIS * DBP, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.193  -5.601  -0.520   5.510  38.301 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          49.345205  11.941720   4.132 3.90e-05 ***
## SEX                  -0.976183   0.589594  -1.656   0.0981 .  
## BTH_G                 0.256344   0.292661   0.876   0.3813    
## BMI                   0.245572   0.187394   1.310   0.1903    
## FBS                   0.019089   0.012977   1.471   0.1416    
## DISHypertension      -4.553308  12.265896  -0.371   0.7106    
## DISDiabetes          -1.391342  17.500850  -0.080   0.9366    
## DISNo history       -17.968618  11.307352  -1.589   0.1124    
## DBP                   0.859173   0.134571   6.385 2.64e-10 ***
## BTH_G:BMI             0.001867   0.012488   0.149   0.8812    
## DISHypertension:DBP   0.054381   0.150584   0.361   0.7181    
## DISDiabetes:DBP      -0.033645   0.224297  -0.150   0.8808    
## DISNo history:DBP     0.178389   0.138652   1.287   0.1985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.852 on 987 degrees of freedom
## Multiple R-squared:  0.6412, Adjusted R-squared:  0.6368 
## F-statistic:   147 on 12 and 987 DF,  p-value: < 2.2e-16

aov(SBP ~  BTH_G * SEX * DIS * DBP, data = dataset_sbp_small) %>%
        summary

##                    Df Sum Sq Mean Sq  F value  Pr(>F)    
## BTH_G               1  22804   22804  293.300 < 2e-16 ***
## SEX                 1   7289    7289   93.744 < 2e-16 ***
## DIS                 3  11307    3769   48.475 < 2e-16 ***
## DBP                 1  95484   95484 1228.103 < 2e-16 ***
## BTH_G:SEX           1    854     854   10.990 0.00095 ***
## BTH_G:DIS           3    700     233    3.001 0.02973 *  
## SEX:DIS             3    453     151    1.942 0.12116    
## BTH_G:DBP           1    157     157    2.024 0.15514    
## SEX:DBP             1     29      29    0.376 0.53987    
## DIS:DBP             3    137      46    0.588 0.62266    
## BTH_G:SEX:DIS       3    136      45    0.584 0.62536    
## BTH_G:SEX:DBP       1      0       0    0.000 0.98243    
## BTH_G:DIS:DBP       3    343     114    1.469 0.22155    
## SEX:DIS:DBP         3    364     121    1.561 0.19722    
## BTH_G:SEX:DIS:DBP   3    220      73    0.945 0.41823    
## Residuals         968  75261      78                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

anova() function accepts linear model outputs

lm(SBP ~ BTH_G * SEX * DIS * DBP, data = dataset_sbp_small) %>% 
        summary

## 
## Call:
## lm(formula = SBP ~ BTH_G * SEX * DIS * DBP, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.013  -5.341  -0.617   5.572  43.134 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                   -201.34069  205.73745  -0.979   0.3280  
## BTH_G                           14.12147    9.50847   1.485   0.1378  
## SEX                             90.26545  144.25017   0.626   0.5316  
## DISHypertension                288.35627  216.54324   1.332   0.1833  
## DISDiabetes                    178.64106  332.71491   0.537   0.5914  
## DISNo history                  229.88453  206.55369   1.113   0.2660  
## DBP                              4.02042    2.45536   1.637   0.1019  
## BTH_G:SEX                       -5.37119    6.18981  -0.868   0.3857  
## BTH_G:DISHypertension          -16.87641   10.12132  -1.667   0.0958 .
## BTH_G:DISDiabetes              -13.99269   16.88817  -0.829   0.4076  
## BTH_G:DISNo history            -12.67335    9.61234  -1.318   0.1877  
## SEX:DISHypertension           -133.96369  152.76266  -0.877   0.3807  
## SEX:DISDiabetes                -58.21450  205.47704  -0.283   0.7770  
## SEX:DISNo history              -83.80291  144.69323  -0.579   0.5626  
## BTH_G:DBP                       -0.16922    0.11451  -1.478   0.1398  
## SEX:DBP                         -1.14763    1.74774  -0.657   0.5116  
## DISHypertension:DBP             -3.59142    2.58840  -1.388   0.1656  
## DISDiabetes:DBP                 -1.93448    4.10562  -0.471   0.6376  
## DISNo history:DBP               -2.76757    2.46747  -1.122   0.2623  
## BTH_G:SEX:DISHypertension        8.28844    6.64491   1.247   0.2126  
## BTH_G:SEX:DISDiabetes            6.19111    9.77716   0.633   0.5267  
## BTH_G:SEX:DISNo history          4.53109    6.24898   0.725   0.4686  
## BTH_G:SEX:DBP                    0.06734    0.07496   0.898   0.3692  
## BTH_G:DISHypertension:DBP        0.20757    0.12216   1.699   0.0896 .
## BTH_G:DISDiabetes:DBP            0.15526    0.21062   0.737   0.4612  
## BTH_G:DISNo history:DBP          0.15002    0.11603   1.293   0.1963  
## SEX:DISHypertension:DBP          1.66182    1.85315   0.897   0.3701  
## SEX:DISDiabetes:DBP              0.60939    2.55070   0.239   0.8112  
## SEX:DISNo history:DBP            0.99992    1.75449   0.570   0.5689  
## BTH_G:SEX:DISHypertension:DBP   -0.10115    0.08066  -1.254   0.2102  
## BTH_G:SEX:DISDiabetes:DBP       -0.06943    0.12222  -0.568   0.5701  
## BTH_G:SEX:DISNo history:DBP     -0.05381    0.07584  -0.709   0.4782  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.818 on 968 degrees of freedom
## Multiple R-squared:  0.6508, Adjusted R-squared:  0.6396 
## F-statistic:  58.2 on 31 and 968 DF,  p-value: < 2.2e-16

PBD and RSM (statistical optimization)

This lecture will practice running

Plackett-Burman Design (PBD)

and

Response surface method (RSM)

Before begin..

Let’s install packages.

# install.packages("daewr")

library(daewr)

PBDes function creates the experimental design of the PBD. randomize = T will generate a matrix with mixed orders.

daewr::PBDes(nfactors = 11, nruns = 12)

##     A  B  C  D  E  F  G  H  J  K  L
## 1   1  1 -1  1  1  1 -1 -1 -1  1 -1
## 2  -1  1  1 -1  1  1  1 -1 -1 -1  1
## 3   1 -1  1  1 -1  1  1  1 -1 -1 -1
## 4  -1  1 -1  1  1 -1  1  1  1 -1 -1
## 5  -1 -1  1 -1  1  1 -1  1  1  1 -1
## 6  -1 -1 -1  1 -1  1  1 -1  1  1  1
## 7   1 -1 -1 -1  1 -1  1  1 -1  1  1
## 8   1  1 -1 -1 -1  1 -1  1  1 -1  1
## 9   1  1  1 -1 -1 -1  1 -1  1  1 -1
## 10 -1  1  1  1 -1 -1 -1  1 -1  1  1
## 11  1 -1  1  1  1 -1 -1 -1  1 -1  1
## 12 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1

# Error examples
#  daewr::PBDes(nfactors = 12, nruns = 12) 


daewr::PBDes(nfactors = 10, nruns = 12)

##     A  B  C  D  E  F  G  H  J  K
## 1   1  1 -1  1  1  1 -1 -1 -1  1
## 2  -1  1  1 -1  1  1  1 -1 -1 -1
## 3   1 -1  1  1 -1  1  1  1 -1 -1
## 4  -1  1 -1  1  1 -1  1  1  1 -1
## 5  -1 -1  1 -1  1  1 -1  1  1  1
## 6  -1 -1 -1  1 -1  1  1 -1  1  1
## 7   1 -1 -1 -1  1 -1  1  1 -1  1
## 8   1  1 -1 -1 -1  1 -1  1  1 -1
## 9   1  1  1 -1 -1 -1  1 -1  1  1
## 10 -1  1  1  1 -1 -1 -1  1 -1  1
## 11  1 -1  1  1  1 -1 -1 -1  1 -1
## 12 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1

#daewr::PBDes(nfactors = 15, nruns = 24)

After setting your higher level and lower level based on your intuition, you can run a experiment with 12 experimental groups, and get the result out of the data.

analyiss of PBD outcome

You can run a miultiple linear regression that is containing all the data!

download.file("https://raw.githubusercontent.com/minsiksudo/BTE3207_Advanced_Biostatistics/refs/heads/main/dataset/PBD_example_data.csv", "PBD_example_data.csv")

pbd_data <- read.csv("PBD_example_data.csv")

 lm(Biomass ~ Glucose + YE + KH2PO4 + K2HPO4 + MgSO4 + CaCl2 + FeSO4 + Fructose + NH4Cl + TM + Vitamin,
#<<<<<<< HEAD
    data = pbd_data) %>% summary

## 
## Call:
## lm(formula = Biomass ~ Glucose + YE + KH2PO4 + K2HPO4 + MgSO4 + 
##     CaCl2 + FeSO4 + Fructose + NH4Cl + TM + Vitamin, data = pbd_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -0.04  -0.02   0.00   0.02   0.04 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.437500   0.007022 347.134  < 2e-16 ***
## Glucose     -0.012500   0.007022  -1.780 0.100360    
## YE           0.845833   0.007022 120.458  < 2e-16 ***
## KH2PO4       0.037500   0.007022   5.341 0.000176 ***
## K2HPO4       0.572500   0.007022  81.532  < 2e-16 ***
## MgSO4        0.032500   0.007022   4.628 0.000582 ***
## CaCl2       -0.054167   0.007022  -7.714 5.45e-06 ***
## FeSO4        0.055833   0.007022   7.951 4.00e-06 ***
## Fructose     0.080833   0.007022  11.512 7.67e-08 ***
## NH4Cl       -0.017500   0.007022  -2.492 0.028315 *  
## TM           0.015833   0.007022   2.255 0.043611 *  
## Vitamin     -0.019167   0.007022  -2.730 0.018280 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0344 on 12 degrees of freedom
## Multiple R-squared:  0.9994, Adjusted R-squared:  0.9989 
## F-statistic:  1953 on 11 and 12 DF,  p-value: < 2.2e-16

This is your Plackett-Burman result.

RSM

# install.packages("rsm")
library(rsm)

using ccd function, you can generate a experimental design that can be used for RSM experiment and analysis

#2 factor experiment
ccd (2)

##    run.order std.order  x1.as.is  x2.as.is Block
## 1          1         1 -1.000000 -1.000000     1
## 2          2         6  0.000000  0.000000     1
## 3          3         2  1.000000 -1.000000     1
## 4          4         4  1.000000  1.000000     1
## 5          5         8  0.000000  0.000000     1
## 6          6         3 -1.000000  1.000000     1
## 7          7         5  0.000000  0.000000     1
## 8          8         7  0.000000  0.000000     1
## 9          1         7  0.000000  0.000000     2
## 10         2         3  0.000000 -1.414214     2
## 11         3         5  0.000000  0.000000     2
## 12         4         6  0.000000  0.000000     2
## 13         5         4  0.000000  1.414214     2
## 14         6         1 -1.414214  0.000000     2
## 15         7         2  1.414214  0.000000     2
## 16         8         8  0.000000  0.000000     2
## 
## Data are stored in coded form using these coding formulas ...
## x1 ~ x1.as.is
## x2 ~ x2.as.is

#3 factor experiment
ccd (3)

##    run.order std.order  x1.as.is  x2.as.is  x3.as.is Block
## 1          1         1 -1.000000 -1.000000 -1.000000     1
## 2          2         8  1.000000  1.000000  1.000000     1
## 3          3        12  0.000000  0.000000  0.000000     1
## 4          4        10  0.000000  0.000000  0.000000     1
## 5          5         4  1.000000  1.000000 -1.000000     1
## 6          6         5 -1.000000 -1.000000  1.000000     1
## 7          7         7 -1.000000  1.000000  1.000000     1
## 8          8         2  1.000000 -1.000000 -1.000000     1
## 9          9         6  1.000000 -1.000000  1.000000     1
## 10        10         9  0.000000  0.000000  0.000000     1
## 11        11         3 -1.000000  1.000000 -1.000000     1
## 12        12        11  0.000000  0.000000  0.000000     1
## 13         1         5  0.000000  0.000000 -1.825742     2
## 14         2         6  0.000000  0.000000  1.825742     2
## 15         3         9  0.000000  0.000000  0.000000     2
## 16         4        10  0.000000  0.000000  0.000000     2
## 17         5         2  1.825742  0.000000  0.000000     2
## 18         6         7  0.000000  0.000000  0.000000     2
## 19         7         1 -1.825742  0.000000  0.000000     2
## 20         8         4  0.000000  1.825742  0.000000     2
## 21         9         8  0.000000  0.000000  0.000000     2
## 22        10         3  0.000000 -1.825742  0.000000     2
## 
## Data are stored in coded form using these coding formulas ...
## x1 ~ x1.as.is
## x2 ~ x2.as.is
## x3 ~ x3.as.is

SOdes2 <- ccd (2, n0 = c(4,6), alpha = "rotatable", inscribed = F)

SOdes2

##    run.order std.order  x1.as.is  x2.as.is Block
## 1          1         1 -1.000000 -1.000000     1
## 2          2         7  0.000000  0.000000     1
## 3          3         2  1.000000 -1.000000     1
## 4          4         5  0.000000  0.000000     1
## 5          5         8  0.000000  0.000000     1
## 6          6         3 -1.000000  1.000000     1
## 7          7         6  0.000000  0.000000     1
## 8          8         4  1.000000  1.000000     1
## 9          1         7  0.000000  0.000000     2
## 10         2         4  0.000000  1.414214     2
## 11         3         6  0.000000  0.000000     2
## 12         4         8  0.000000  0.000000     2
## 13         5         1 -1.414214  0.000000     2
## 14         6         5  0.000000  0.000000     2
## 15         7         2  1.414214  0.000000     2
## 16         8         3  0.000000 -1.414214     2
## 17         9        10  0.000000  0.000000     2
## 18        10         9  0.000000  0.000000     2
## 
## Data are stored in coded form using these coding formulas ...
## x1 ~ x1.as.is
## x2 ~ x2.as.is

With coding argument, you can set the levels of your data into actual values

SOdes2 <- ccd (3, n0 = c(4,6), alpha = "rotatable", inscribed = F,
               coding = list (
                x1 ~ (Glucose - 50)/10, 
                x2 ~ (NaNO3 - 5)/2, 
                x3 ~ (K2HPO4 - 2)/1)
)

SOdes2

##    run.order std.order  Glucose    NaNO3    K2HPO4 Block
## 1          1         5 40.00000 3.000000 3.0000000     1
## 2          2         7 40.00000 7.000000 3.0000000     1
## 3          3        11 50.00000 5.000000 2.0000000     1
## 4          4         2 60.00000 3.000000 1.0000000     1
## 5          5         6 60.00000 3.000000 3.0000000     1
## 6          6         1 40.00000 3.000000 1.0000000     1
## 7          7         4 60.00000 7.000000 1.0000000     1
## 8          8        10 50.00000 5.000000 2.0000000     1
## 9          9         3 40.00000 7.000000 1.0000000     1
## 10        10         8 60.00000 7.000000 3.0000000     1
## 11        11        12 50.00000 5.000000 2.0000000     1
## 12        12         9 50.00000 5.000000 2.0000000     1
## 13         1         3 50.00000 1.636414 2.0000000     2
## 14         2         5 50.00000 5.000000 0.3182072     2
## 15         3         7 50.00000 5.000000 2.0000000     2
## 16         4         1 33.18207 5.000000 2.0000000     2
## 17         5         6 50.00000 5.000000 3.6817928     2
## 18         6         2 66.81793 5.000000 2.0000000     2
## 19         7         4 50.00000 8.363586 2.0000000     2
## 20         8         9 50.00000 5.000000 2.0000000     2
## 21         9         8 50.00000 5.000000 2.0000000     2
## 22        10        10 50.00000 5.000000 2.0000000     2
## 23        11        12 50.00000 5.000000 2.0000000     2
## 24        12        11 50.00000 5.000000 2.0000000     2
## 
## Data are stored in coded form using these coding formulas ...
## x1 ~ (Glucose - 50)/10
## x2 ~ (NaNO3 - 5)/2
## x3 ~ (K2HPO4 - 2)/1

Analysis of RSM

CR <- coded.data(
  ChemReact,
  x1 ~ (Time - 85) / 5,
  x2 ~ (Temp - 175) / 5
)

CR1.rsm <- rsm::rsm(Yield ~ SO(Time, Temp), data = ChemReact1)

## Warning in rsm::rsm(Yield ~ SO(Time, Temp), data = ChemReact1): Some coefficients are aliased - cannot use 'rsm' methods.
##   Returning an 'lm' object.

summary(CR1.rsm)

## 
## Call:
## rsm(formula = Yield ~ SO(Time, Temp), data = ChemReact1)
## 
## Residuals:
##          1          2          3          4          5          6          7 
## -5.663e-18 -5.516e-18 -1.410e-18 -5.944e-18 -1.667e-01  2.333e-01 -6.667e-02 
## 
## Coefficients: (1 not defined because of singularities)
##                        Estimate Std. Error t value Pr(>|t|)   
## (Intercept)          -5.117e+02  7.717e+01  -6.631  0.02199 * 
## FO(Time, Temp)Time    1.420e+01  1.304e+00  10.893  0.00832 **
## FO(Time, Temp)Temp   -3.000e-01  3.545e-01  -0.846  0.48651   
## TWI(Time, Temp)       5.000e-03  4.163e-03   1.201  0.35270   
## PQ(Time, Temp)Time^2 -8.767e-02  6.360e-03 -13.785  0.00522 **
## PQ(Time, Temp)Temp^2         NA         NA      NA       NA   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2082 on 2 degrees of freedom
## Multiple R-squared:  0.9933, Adjusted R-squared:   0.98 
## F-statistic: 74.55 on 4 and 2 DF,  p-value: 0.01328

Here, besides the interaction term, all the term is significant.

Anova here tests the “lack of fit” which tests the variance not explaned by model but by the error (from the repeated experiments). This term’s p-values should be greater than effect of others!

To draw contour, use contour function

contour(CR1.rsm, ~ Time + Temp, image = TRUE)

To draw 3d plot, use persp function

persp(CR1.rsm, 
      Time ~ Temp, #plot xs
      col=rainbow(50),
      xlabs = c(expression("Time (min)", "Temperature (°C)")), # axis labels
      theta=30, 
      phi=30, r = 120, d=120, 
      border = NULL, 
      ltheta = 0, 
      lphi = 0, 
              shade = 0.75, zlab="Yield", col.axis=37, font.lab=2, col.lab=35,
              contour=("colors"))

## Warning in title(sub = dat$labs[5], ...): "ltheta" is not a graphical parameter

## Warning in title(sub = dat$labs[5], ...): "lphi" is not a graphical parameter

## Warning in title(sub = dat$labs[5], ...): "shade" is not a graphical parameter

Changing theta, phi, and r can rotate the graph

persp(CR1.rsm, 
      Time ~ Temp, #plot xs
      col=rainbow(50),
      xlabs = c(expression("Time (min)", "Temperature (°C)")), # axis labels
      theta=50, 
      phi=3, r = 180, d=120, 
      border = NULL, 
      ltheta = 0, 
      lphi = 0, 
              shade = 0.75, zlab="Yield", col.axis=37, font.lab=2, col.lab=35,
              contour=("colors"))

## Warning in title(sub = dat$labs[5], ...): "ltheta" is not a graphical parameter

## Warning in title(sub = dat$labs[5], ...): "lphi" is not a graphical parameter

## Warning in title(sub = dat$labs[5], ...): "shade" is not a graphical parameter

Logistic regression

Binary outcome and logistic regression

dataset_sbp$BTH_G %>% hist

Where, each level of BTH_G means 1: 20 to 24 years old 2: 25 to 26 years old … 26: is 73 to 74 27: 75 years old or older.

Data manipulation

from this dataset, we can calulate

Hypertension

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
                                           dataset_sbp$DBP > 80,
                                   1,
                                   0) %>%
        as.factor

Obesity

dataset_sbp$obesity <- ifelse(dataset_sbp$BMI > 25,
                                   1,
                                   0) %>% 
        as.factor()

Female (re-level)

dataset_sbp$Female <- ifelse(dataset_sbp$SEX == 2, 
                          1,
                          0) %>%
        as.factor

set.seed(1)

dataset_sbp_small <- subset(dataset_sbp,
                        row.names(dataset_sbp)%in% sample(x = 1:1000000, size = 1000))

Generating table

table(dataset_sbp$obesity,dataset_sbp$Female)

##    
##          0      1
##   0 314235 359858
##   1 195992 129915

ggplot(dataset_sbp_small, aes(x = Female, y = obesity)) +
        geom_point() +
        ylab("Fasting blood sugar (mg/dL)") +
        theme_classic()

ggplot(dataset_sbp_small, aes(x = FBS, y = obesity)) +
        geom_point() +
        ylab("Fasting blood sugar (mg/dL)") +
        theme_classic()

male_oo <- table(dataset_sbp$obesity,dataset_sbp$SEX)[2,1]/table(dataset_sbp$obesity,dataset_sbp$SEX)[1,1]
female_oo <- table(dataset_sbp$obesity,dataset_sbp$SEX)[2,2]/table(dataset_sbp$obesity,dataset_sbp$SEX)[1,2]

odds_ratio_female_to_male <- female_oo/male_oo
odds_ratio_female_to_male

## [1] 0.5788211

Logistic regression with binary input (unadjusted)

glm_obesity_gender <- glm(data = dataset_sbp, obesity ~ Female,family = "binomial")
summary(glm_obesity_gender)

## 
## Call:
## glm(formula = obesity ~ Female, family = "binomial", data = dataset_sbp)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.472067   0.002878  -164.0   <2e-16 ***
## Female1     -0.546762   0.004331  -126.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1262484  on 999999  degrees of freedom
## Residual deviance: 1246322  on 999998  degrees of freedom
## AIC: 1246326
## 
## Number of Fisher Scoring iterations: 4

glm_obesity_gender$coefficients["Female1"][[1]]

## [1] -0.5467618

glm_obesity_gender$coefficients["Female1"][[1]] %>% exp()

## [1] 0.5788211

The odds ratio of obesity from male to female is 0.5788211.

confidence interval

confint(glm_obesity_gender)

## Waiting for profiling to be done...

##                  2.5 %     97.5 %
## (Intercept) -0.4777099 -0.4664271
## Female1     -0.5552524 -0.5382738

After exponemtial transformation…

confint(glm_obesity_gender) %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.6202021 0.6272393
## Female1     0.5739274 0.5837551

With continuous variable

glm_obesity_fbs <- glm(data = dataset_sbp, obesity ~ FBS,family = "binomial")
summary(glm_obesity_fbs)

## 
## Call:
## glm(formula = obesity ~ FBS, family = "binomial", data = dataset_sbp)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.962e+00  9.903e-03  -198.1   <2e-16 ***
## FBS          1.242e-02  9.706e-05   127.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1262484  on 999999  degrees of freedom
## Residual deviance: 1244426  on 999998  degrees of freedom
## AIC: 1244430
## 
## Number of Fisher Scoring iterations: 4

confint(glm_obesity_fbs) %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.1378634 0.1433205
## FBS         1.0123035 1.0126887

Multiple logistic regression

In case the Gender and FBS as correlation, they may adjust the effect of obese

Data visualiation

ggplot(dataset_sbp_small, aes(x = Female, y = FBS)) +
        geom_boxplot() +
        ylab("Fasting blood sugar (mg/dL)") +
        theme_classic()

Does the FBS differ by Gender?

lm(dat = dataset_sbp, FBS ~ Female) %>%
        summary

## 
## Call:
## lm(formula = FBS ~ Female, data = dataset_sbp)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.142 -11.492  -4.492   4.508 261.508 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 101.14193    0.03201  3159.9   <2e-16 ***
## Female1      -4.65011    0.04574  -101.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.86 on 999998 degrees of freedom
## Multiple R-squared:  0.01023,    Adjusted R-squared:  0.01023 
## F-statistic: 1.034e+04 on 1 and 999998 DF,  p-value: < 2.2e-16

It looks like so.

Let’s run multiple logistic regression

glm(data = dataset_sbp, obesity ~ Female,family = "binomial") %>% confint() %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.6202021 0.6272393
## Female1     0.5739274 0.5837551

glm(data = dataset_sbp, obesity ~ FBS,family = "binomial") %>% confint() %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.1378634 0.1433205
## FBS         1.0123035 1.0126887

glm_obesity_gender_fbs <- glm(data = dataset_sbp, obesity ~ Female + FBS,family = "binomial")
glm_obesity_gender_fbs %>% confint() %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.1919579 0.1998134
## Female1     0.6011418 0.6115459
## FBS         1.0112972 1.0116816

glm(data = dataset_sbp, obesity ~ Female + FBS,family = "binomial") %>% confint() %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.1919579 0.1998134
## Female1     0.6011418 0.6115459
## FBS         1.0112972 1.0116816

Confidence intervals

confint(glm_obesity_gender_fbs)

## Waiting for profiling to be done...

##                   2.5 %     97.5 %
## (Intercept) -1.65047915 -1.6103711
## Female1     -0.50892448 -0.4917653
## FBS          0.01123387  0.0116139

confint(glm_obesity_gender_fbs) %>% exp

## Waiting for profiling to be done...

##                 2.5 %    97.5 %
## (Intercept) 0.1919579 0.1998134
## Female1     0.6011418 0.6115459
## FBS         1.0112972 1.0116816

Effect modification

To see if there is a effect modication, anova can be used.

glm_obesity_gender_fbs_int <- glm(data = dataset_sbp, obesity ~ Female * FBS,family = "binomial")
anova(glm_obesity_gender_fbs_int, test = "F")

## Warning in anova.glm(glm_obesity_gender_fbs_int, test = "F"): using F test with
## a 'binomial' family is inappropriate

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: obesity
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev       F    Pr(>F)    
## NULL                       1e+06    1262484                      
## Female      1  16162.1     1e+06    1246322 16162.1 < 2.2e-16 ***
## FBS         1  15098.9     1e+06    1231223 15098.9 < 2.2e-16 ***
## Female:FBS  1   2997.4     1e+06    1228226  2997.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

summary(glm_obesity_gender_fbs_int)

## 
## Call:
## glm(formula = obesity ~ Female * FBS, family = "binomial", data = dataset_sbp)
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.2189161  0.0122824  -99.24   <2e-16 ***
## Female1     -1.5924525  0.0207134  -76.88   <2e-16 ***
## FBS          0.0073635  0.0001176   62.59   <2e-16 ***
## Female1:FBS  0.0110150  0.0002043   53.90   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1262484  on 999999  degrees of freedom
## Residual deviance: 1228226  on 999996  degrees of freedom
## AIC: 1228234
## 
## Number of Fisher Scoring iterations: 4

Binomial distribution

How can we caclulate the confidence interval?

Data visualization of logistic regression

For categorical varaibles

We can visualize the Odds ratio and its confidence interval using forest plot

#install.packages("sjPlot")
library(sjPlot)

## 
## Attaching package: 'sjPlot'

## The following object is masked from 'package:ggplot2':
## 
##     set_theme

glm(data = dataset_sbp_small, obesity ~ SBP + FBS + DIS, family = "binomial") %>% 
        plot_model() +
        theme_classic()

plot_models(
        glm(data = dataset_sbp_small, hypertension ~ FBS, family = "binomial"),
        glm(data = dataset_sbp_small, Female ~ SBP + DIS, family = "binomial"),
        glm(data = dataset_sbp_small, obesity ~ SBP + FBS + SEX, family = "binomial"),
        grid = TRUE
        ) +
        theme_classic()

## Ignoring unknown labels:
## • shape : "p-level"

Logistic plot for binary predictors

Continuous variables

dataset_sbp_small$FBS %>% summary

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    60.0    87.0    95.0   100.1   105.0   352.0

pred1 <- data.frame(FBS = seq(from = 60,
                                    to = 352, by = 2))

pred1$predict <- predict(glm_obesity_fbs, newdata = pred1, type = "response")

predict(glm_obesity_fbs, newdata = pred1, type = "response")

##         1         2         3         4         5         6         7         8 
## 0.2284725 0.2328800 0.2373465 0.2418716 0.2464551 0.2510966 0.2557960 0.2605526 
##         9        10        11        12        13        14        15        16 
## 0.2653662 0.2702362 0.2751621 0.2801434 0.2851793 0.2902692 0.2954125 0.3006083 
##        17        18        19        20        21        22        23        24 
## 0.3058558 0.3111541 0.3165024 0.3218997 0.3273449 0.3328370 0.3383749 0.3439574 
##        25        26        27        28        29        30        31        32 
## 0.3495834 0.3552516 0.3609606 0.3667093 0.3724961 0.3783197 0.3841786 0.3900713 
##        33        34        35        36        37        38        39        40 
## 0.3959962 0.4019519 0.4079366 0.4139488 0.4199867 0.4260486 0.4321329 0.4382377 
##        41        42        43        44        45        46        47        48 
## 0.4443613 0.4505018 0.4566574 0.4628263 0.4690066 0.4751964 0.4813938 0.4875970 
##        49        50        51        52        53        54        55        56 
## 0.4938040 0.5000129 0.5062218 0.5124288 0.5186320 0.5248294 0.5310192 0.5371994 
##        57        58        59        60        61        62        63        64 
## 0.5433682 0.5495238 0.5556643 0.5617877 0.5678925 0.5739766 0.5800385 0.5860763 
##        65        66        67        68        69        70        71        72 
## 0.5920883 0.5980729 0.6040285 0.6099533 0.6158459 0.6217046 0.6275280 0.6333147 
##        73        74        75        76        77        78        79        80 
## 0.6390632 0.6447721 0.6504401 0.6560659 0.6616482 0.6671860 0.6726779 0.6781229 
##        81        82        83        84        85        86        87        88 
## 0.6835199 0.6888680 0.6941662 0.6994135 0.7046090 0.7097521 0.7148418 0.7198775 
##        89        90        91        92        93        94        95        96 
## 0.7248585 0.7297842 0.7346539 0.7394673 0.7442237 0.7489228 0.7535641 0.7581474 
##        97        98        99       100       101       102       103       104 
## 0.7626722 0.7671384 0.7715457 0.7758940 0.7801831 0.7844129 0.7885834 0.7926945 
##       105       106       107       108       109       110       111       112 
## 0.7967463 0.8007388 0.8046721 0.8085463 0.8123616 0.8161182 0.8198162 0.8234559 
##       113       114       115       116       117       118       119       120 
## 0.8270377 0.8305617 0.8340283 0.8374379 0.8407908 0.8440875 0.8473282 0.8505136 
##       121       122       123       124       125       126       127       128 
## 0.8536439 0.8567198 0.8597416 0.8627099 0.8656252 0.8684881 0.8712990 0.8740585 
##       129       130       131       132       133       134       135       136 
## 0.8767673 0.8794258 0.8820347 0.8845945 0.8871059 0.8895695 0.8919859 0.8943557 
##       137       138       139       140       141       142       143       144 
## 0.8966795 0.8989579 0.9011917 0.9033813 0.9055276 0.9076310 0.9096923 0.9117120 
##       145       146       147 
## 0.9136909 0.9156295 0.9175285

predict(glm_obesity_fbs, data.frame(FBS = 0), type = "response")

##         1 
## 0.1232439

plot(dataset_sbp_small$FBS, as.numeric(dataset_sbp_small$obesity == 1),
     ylab = "Obesity",
     xlab = "FBS (mg/dL)") +
lines(pred1$FBS, pred1$predict)

## integer(0)

ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
    geom_point() +
    labs(y = "Probability of obesity", x = "FBS (mg/dL)") +
        theme_classic()

Binomial distribution

Using binomial distribution, we can breifly know what the data is telling.

Let’s assume that we are flipping a coin.

#install.packages("tidydice")
library(tidydice)

flip_coin(times = 1)

## # A tibble: 1 × 5
##   experiment round    nr result success
##        <int> <int> <int>  <int> <lgl>  
## 1          1     1     1      2 TRUE

We can flip the coin 1 times, and the result will be

flip_coin(times = 2) %>% .$success %>% (mean)

## [1] 0.5

flip_coin(times = 2) %>% .$success %>% (mean)

## [1] 0

Sometimes we get 5 heads by 10, but many of trials will have results of 4 heads/3heads/6heads…etc.

We can simulate 1000 round of 10-flips as below.

flip_coin(times = 10, rounds = 1000) %>% head()

## # A tibble: 6 × 5
##   experiment round    nr result success
##        <int> <int> <int>  <int> <lgl>  
## 1          1     1     1      2 TRUE   
## 2          1     1     2      2 TRUE   
## 3          1     1     3      1 FALSE  
## 4          1     1     4      2 TRUE   
## 5          1     1     5      2 TRUE   
## 6          1     1     6      1 FALSE

flip_coin(times = 2, rounds = 10) %>%
        group_by(round) %>%
        summarise(probability = mean(success))

## # A tibble: 10 × 2
##    round probability
##    <int>       <dbl>
##  1     1         0.5
##  2     2         0.5
##  3     3         0.5
##  4     4         0  
##  5     5         1  
##  6     6         0  
##  7     7         0.5
##  8     8         1  
##  9     9         0.5
## 10    10         0.5

c(1.0, 0.5, 0.5, 0.0, 0.5, 0.0, 0.5, 0.0, 0.5, 1.0  ) %>% hist(main = "Probability of getting heads when flipiing a coin\n(2 times X 10 rounds)", probability = T)

flip_coin(times = 10, rounds = 100000) %>%
        group_by(round) %>%
        summarise(probability = mean(success)) %>%
        .$probability %>%
        hist(., main = "Probability of getting heads when flipiing a coin\n(10 times X 100000 rounds)",
             breaks = 10, xlim = c(0,1), probability = T)

flip_coin(times = 50, rounds = 100000) %>%
        group_by(round) %>%
        summarise(probability = mean(success)) %>%
        .$probability %>%
        hist(., main = "Probability of getting heads when flipiing a coin\n(50 times X 100000 rounds)", 
             breaks = 25, probability = T, xlim = c(0,1))

flip_coin(times = 100, rounds = 10000) %>%
        group_by(round) %>%
        summarise(probability = mean(success)) %>%
        .$probability %>%
        hist(., main = "Probability of getting heads when flipiing a coin\n(100 times X 10000 rounds)", 
             breaks = 50, probability = T, xlim = c(0,1))

flip_coin(times = 1000, rounds = 10000) %>%
        group_by(round) %>%
        summarise(probability = mean(success)) %>%
        .$probability %>%
        hist(., main = "Probability of getting heads when flipiing a coin\n(1000 times X 10000 rounds)", 
             breaks = 20, xlim = c(0,1),
             probability = T)

As continuous variable is So, we usually “Predict” the proportion of the sample.

glm_obesity_fbs_small <- glm(data = dataset_sbp, obesity ~ FBS, family = "binomial")


ilink <- family(glm_obesity_fbs_small)$linkinv

pred1 <- with(dataset_sbp_small,
           data.frame(FBS = seq(min(FBS), max(FBS),
                                       length = 100)))

pred1 <- cbind(pred1, predict(glm_obesity_fbs_small, pred1, type = "link", se.fit = TRUE)[1:2])

pred1 <- transform(pred1, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
                Lower = ilink(fit - (2 * se.fit)))


ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
    geom_point()

        theme_classic() +
    labs(y = "Probability of obesity", x = "FBS (mg/dL)")

## <theme> List of 146
##  $ line                            : <ggplot2::element_line>
##   ..@ colour       : chr "black"
##   ..@ linewidth    : num 0.5
##   ..@ linetype     : num 1
##   ..@ lineend      : chr "butt"
##   ..@ linejoin     : chr "round"
##   ..@ arrow        : logi FALSE
##   ..@ arrow.fill   : chr "black"
##   ..@ inherit.blank: logi TRUE
##  $ rect                            : <ggplot2::element_rect>
##   ..@ fill         : chr "white"
##   ..@ colour       : chr "black"
##   ..@ linewidth    : num 0.5
##   ..@ linetype     : num 1
##   ..@ linejoin     : chr "round"
##   ..@ inherit.blank: logi TRUE
##  $ text                            : <ggplot2::element_text>
##   ..@ family       : chr ""
##   ..@ face         : chr "plain"
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : chr "black"
##   ..@ size         : num 11
##   ..@ hjust        : num 0.5
##   ..@ vjust        : num 0.5
##   ..@ angle        : num 0
##   ..@ lineheight   : num 0.9
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 0 0 0
##   ..@ debug        : logi FALSE
##   ..@ inherit.blank: logi TRUE
##  $ title                           : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : NULL
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ point                           : <ggplot2::element_point>
##   ..@ colour       : chr "black"
##   ..@ shape        : num 19
##   ..@ size         : num 1.5
##   ..@ fill         : chr "white"
##   ..@ stroke       : num 0.5
##   ..@ inherit.blank: logi TRUE
##  $ polygon                         : <ggplot2::element_polygon>
##   ..@ fill         : chr "white"
##   ..@ colour       : chr "black"
##   ..@ linewidth    : num 0.5
##   ..@ linetype     : num 1
##   ..@ linejoin     : chr "round"
##   ..@ inherit.blank: logi TRUE
##  $ geom                            : <ggplot2::element_geom>
##   ..@ ink        : chr "black"
##   ..@ paper      : chr "white"
##   ..@ accent     : chr "#3366FF"
##   ..@ linewidth  : num 0.5
##   ..@ borderwidth: num 0.5
##   ..@ linetype   : int 1
##   ..@ bordertype : int 1
##   ..@ family     : chr ""
##   ..@ fontsize   : num 3.87
##   ..@ pointsize  : num 1.5
##   ..@ pointshape : num 19
##   ..@ colour     : NULL
##   ..@ fill       : NULL
##  $ spacing                         : 'simpleUnit' num 5.5points
##   ..- attr(*, "unit")= int 8
##  $ margins                         : <ggplot2::margin> num [1:4] 5.5 5.5 5.5 5.5
##  $ aspect.ratio                    : NULL
##  $ axis.title                      : NULL
##  $ axis.title.x                    : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : num 1
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 2.75 0 0 0
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.title.x.top                : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : num 0
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 0 2.75 0
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.title.x.bottom             : NULL
##  $ axis.title.y                    : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : num 1
##   ..@ angle        : num 90
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 2.75 0 0
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.title.y.left               : NULL
##  $ axis.title.y.right              : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : num 1
##   ..@ angle        : num -90
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 0 0 2.75
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.text                       : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : 'rel' num 0.8
##   ..@ hjust        : NULL
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : NULL
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.text.x                     : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : num 1
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 2.2 0 0 0
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.text.x.top                 : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : NULL
##   ..@ vjust        : num 0
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 0 2.2 0
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.text.x.bottom              : NULL
##  $ axis.text.y                     : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : num 1
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 2.2 0 0
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.text.y.left                : NULL
##  $ axis.text.y.right               : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : num 0
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 0 0 2.2
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.text.theta                 : NULL
##  $ axis.text.r                     : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : num 0.5
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : <ggplot2::margin> num [1:4] 0 2.2 0 2.2
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.ticks                      : <ggplot2::element_line>
##   ..@ colour       : NULL
##   ..@ linewidth    : NULL
##   ..@ linetype     : NULL
##   ..@ lineend      : NULL
##   ..@ linejoin     : NULL
##   ..@ arrow        : logi FALSE
##   ..@ arrow.fill   : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.ticks.x                    : NULL
##  $ axis.ticks.x.top                : NULL
##  $ axis.ticks.x.bottom             : NULL
##  $ axis.ticks.y                    : NULL
##  $ axis.ticks.y.left               : NULL
##  $ axis.ticks.y.right              : NULL
##  $ axis.ticks.theta                : NULL
##  $ axis.ticks.r                    : NULL
##  $ axis.minor.ticks.x.top          : NULL
##  $ axis.minor.ticks.x.bottom       : NULL
##  $ axis.minor.ticks.y.left         : NULL
##  $ axis.minor.ticks.y.right        : NULL
##  $ axis.minor.ticks.theta          : NULL
##  $ axis.minor.ticks.r              : NULL
##  $ axis.ticks.length               : 'rel' num 0.5
##  $ axis.ticks.length.x             : NULL
##  $ axis.ticks.length.x.top         : NULL
##  $ axis.ticks.length.x.bottom      : NULL
##  $ axis.ticks.length.y             : NULL
##  $ axis.ticks.length.y.left        : NULL
##  $ axis.ticks.length.y.right       : NULL
##  $ axis.ticks.length.theta         : NULL
##  $ axis.ticks.length.r             : NULL
##  $ axis.minor.ticks.length         : 'rel' num 0.75
##  $ axis.minor.ticks.length.x       : NULL
##  $ axis.minor.ticks.length.x.top   : NULL
##  $ axis.minor.ticks.length.x.bottom: NULL
##  $ axis.minor.ticks.length.y       : NULL
##  $ axis.minor.ticks.length.y.left  : NULL
##  $ axis.minor.ticks.length.y.right : NULL
##  $ axis.minor.ticks.length.theta   : NULL
##  $ axis.minor.ticks.length.r       : NULL
##  $ axis.line                       : <ggplot2::element_line>
##   ..@ colour       : NULL
##   ..@ linewidth    : NULL
##   ..@ linetype     : NULL
##   ..@ lineend      : chr "square"
##   ..@ linejoin     : NULL
##   ..@ arrow        : logi FALSE
##   ..@ arrow.fill   : NULL
##   ..@ inherit.blank: logi TRUE
##  $ axis.line.x                     : NULL
##  $ axis.line.x.top                 : NULL
##  $ axis.line.x.bottom              : NULL
##  $ axis.line.y                     : NULL
##  $ axis.line.y.left                : NULL
##  $ axis.line.y.right               : NULL
##  $ axis.line.theta                 : NULL
##  $ axis.line.r                     : NULL
##  $ legend.background               : <ggplot2::element_rect>
##   ..@ fill         : NULL
##   ..@ colour       : logi NA
##   ..@ linewidth    : NULL
##   ..@ linetype     : NULL
##   ..@ linejoin     : NULL
##   ..@ inherit.blank: logi TRUE
##  $ legend.margin                   : NULL
##  $ legend.spacing                  : 'rel' num 2
##  $ legend.spacing.x                : NULL
##  $ legend.spacing.y                : NULL
##  $ legend.key                      : NULL
##  $ legend.key.size                 : 'simpleUnit' num 1.2lines
##   ..- attr(*, "unit")= int 3
##  $ legend.key.height               : NULL
##  $ legend.key.width                : NULL
##  $ legend.key.spacing              : NULL
##  $ legend.key.spacing.x            : NULL
##  $ legend.key.spacing.y            : NULL
##  $ legend.key.justification        : NULL
##  $ legend.frame                    : NULL
##  $ legend.ticks                    : NULL
##  $ legend.ticks.length             : 'rel' num 0.2
##  $ legend.axis.line                : NULL
##  $ legend.text                     : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : 'rel' num 0.8
##   ..@ hjust        : NULL
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : NULL
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ legend.text.position            : NULL
##  $ legend.title                    : <ggplot2::element_text>
##   ..@ family       : NULL
##   ..@ face         : NULL
##   ..@ italic       : chr NA
##   ..@ fontweight   : num NA
##   ..@ fontwidth    : num NA
##   ..@ colour       : NULL
##   ..@ size         : NULL
##   ..@ hjust        : num 0
##   ..@ vjust        : NULL
##   ..@ angle        : NULL
##   ..@ lineheight   : NULL
##   ..@ margin       : NULL
##   ..@ debug        : NULL
##   ..@ inherit.blank: logi TRUE
##  $ legend.title.position           : NULL
##  $ legend.position                 : chr "right"
##  $ legend.position.inside          : NULL
##  $ legend.direction                : NULL
##  $ legend.byrow                    : NULL
##  $ legend.justification            : chr "center"
##  $ legend.justification.top        : NULL
##  $ legend.justification.bottom     : NULL
##  $ legend.justification.left       : NULL
##  $ legend.justification.right      : NULL
##  $ legend.justification.inside     : NULL
##   [list output truncated]
##  @ complete: logi TRUE
##  @ validate: logi TRUE

ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
    geom_point() +
    geom_line(data = pred1, aes(y = Fitted, x = FBS)) + 
    geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = FBS),
    fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
    labs(y = "Probability of obesity", x = "FBS (mg/dL)")

ggplot(dataset_sbp_small, aes(x = FBS, y = as.numeric(obesity) - 1)) +
    #geom_point() +
    geom_line(data = pred1, aes(y = Fitted, x = FBS)) + 
    geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = FBS),
    fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
    labs(y = "Probability of obesity", x = "FBS (mg/dL)")

glm_hypertension_sbp <- glm(data = dataset_sbp_small, hypertension ~ SBP, family = "binomial")


ilink <- family(glm_hypertension_sbp)$linkinv

pred1 <- with(dataset_sbp_small,
           data.frame(SBP = seq(min(SBP), max(SBP),
                                       length = 100)))

pred1 <- cbind(pred1, predict(glm_hypertension_sbp, pred1, type = "link", se.fit = TRUE)[1:2])

pred1 <- transform(pred1, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
                Lower = ilink(fit - (2 * se.fit)))


ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
    geom_point() +
        theme_classic() +
    labs(y = "Hypretension", x = "SBP (mmHg)")

ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
    geom_point() +
    geom_line(data = pred1, aes(y = Fitted, x = SBP)) + 
    geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
    fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
    labs(y = "Hypretension", x = "SBP (mmHg)")

ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
    #geom_point() +
    geom_line(data = pred1, aes(y = Fitted, x = SBP)) + 
    geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
    fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
    labs(y = "Hypretension", x = "SBP (mmHg)")

Predicted porportion by groups

dataset_sbp_small_dis1 <- dataset_sbp_small %>%
                                    subset(., as.numeric(.$DIS) == 1)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis1, 
                            hypertension ~ SBP, family = "binomial")

ilink <- family(glm_hypertension_sbp)$linkinv
pred1 <- with(dataset_sbp_small_dis1,
           data.frame(SBP = seq(min(SBP), max(SBP),
                                       length = 100)))
pred1 <- cbind(pred1, predict(glm_hypertension_sbp, pred1, type = "link", se.fit = TRUE)[1:2])
pred1 <- transform(pred1, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
                Lower = ilink(fit - (2 * se.fit)))

dataset_sbp_small_dis2 <- dataset_sbp_small %>%
                                    subset(., as.numeric(.$DIS) == 2)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis2, 
                            hypertension ~ SBP, family = "binomial")

ilink <- family(glm_hypertension_sbp)$linkinv
pred2 <- with(dataset_sbp_small_dis2,
           data.frame(SBP = seq(min(SBP), max(SBP),
                                       length = 100)))
pred2 <- cbind(pred2, predict(glm_hypertension_sbp, pred2, type = "link", se.fit = TRUE)[1:2])
pred2 <- transform(pred2, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
                Lower = ilink(fit - (2 * se.fit)))


dataset_sbp_small_dis3 <- dataset_sbp_small %>%
                                    subset(., as.numeric(.$DIS) == 3)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis3, 
                            hypertension ~ SBP, family = "binomial")

ilink <- family(glm_hypertension_sbp)$linkinv
pred3 <- with(dataset_sbp_small_dis3,
           data.frame(SBP = seq(min(SBP), max(SBP),
                                       length = 100)))
pred3 <- cbind(pred3, predict(glm_hypertension_sbp, pred3, type = "link", se.fit = TRUE)[1:2])
pred3 <- transform(pred3, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
                Lower = ilink(fit - (2 * se.fit)))




dataset_sbp_small_dis4 <- dataset_sbp_small %>%
                                    subset(., as.numeric(.$DIS) == 4)
glm_hypertension_sbp <- glm(data = dataset_sbp_small_dis4, 
                            hypertension ~ SBP, family = "binomial")

ilink <- family(glm_hypertension_sbp)$linkinv
pred4 <- with(dataset_sbp_small_dis4,
           data.frame(SBP = seq(min(SBP), max(SBP),
                                       length = 100)))
pred4 <- cbind(pred4, predict(glm_hypertension_sbp, pred4, type = "link", se.fit = TRUE)[1:2])
pred4 <- transform(pred4, Fitted = ilink(fit), Upper = ilink(fit + (2 * se.fit)),
                Lower = ilink(fit - (2 * se.fit)))



ggplot(dataset_sbp_small, aes(x = SBP, y = as.numeric(hypertension)-1)) +
    #geom_point() +
    geom_line(data = pred1, aes(y = Fitted, x = SBP), col = "blue") + 
    geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
    fill = "steelblue2", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
        geom_line(data = pred2, aes(y = Fitted, x = SBP), col = "red") + 
    geom_ribbon(data = pred1, aes(ymin = Lower, ymax = Upper, x = SBP),
    fill = "pink", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
    
        geom_line(data = pred3, aes(y = Fitted, x = SBP), col = "green") + 
    geom_ribbon(data = pred3, aes(ymin = Lower, ymax = Upper, x = SBP),
    fill = "lightgreen", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic() +
    
        geom_line(data = pred4, aes(y = Fitted, x = SBP), col = "brown") + 
    geom_ribbon(data = pred4, aes(ymin = Lower, ymax = Upper, x = SBP),
    fill = "yellow", alpha = 0.2, inherit.aes = FALSE) +
        theme_classic()  + 
    labs(y = "Hypretension", x = "SBP (mmHg)")

How to use this logistic regression in prediction?

Types of errors

ggplot(dataset_sbp_small, aes(x = SBP)) +
    geom_histogram() +
    labs(y = "Hypertension", x = "SBP (mmHg)") +
    theme_classic() +
        facet_wrap(~hypertension, ncol = 1)

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

dataset_sbp_small$pred_hypertension <- 
        ifelse(glm(data = dataset_sbp_small, hypertension ~ SBP, family = "binomial") %>% predict(., type = "link") > 0, 1, 0)

table(dataset_sbp_small$hypertension, dataset_sbp_small$pred_hypertension)

##    
##       0   1
##   0 604  81
##   1  52 263

ggplot(dataset_sbp_small, aes(x = SBP, y = hypertension, col = as.factor(pred_hypertension))) +
               geom_point() +
        theme_classic() +
        guides(col=guide_legend(title="Prediction"))

library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

pROC::roc(dataset_sbp_small$hypertension,  dataset_sbp_small$pred_hypertensio, plot = TRUE,
               print.auc = TRUE) +
        xlim(c(1,0))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

## NULL

Separating dataset into train and test sets

set.seed(1)
default_idx = sample(nrow(dataset_sbp), 5000)
default_trn = dataset_sbp[default_idx, ]
default_tst = dataset_sbp[-default_idx, ]

#install.packages("pROC")
library(pROC)

model_train <- glm(data = default_trn, hypertension ~ SBP, family = "binomial")

test_prob = predict(model_train,
                    newdata = default_tst,
                    type = "response")

test_roc = roc(default_tst$hypertension ~ test_prob,
               plot = TRUE,
               print.auc = TRUE) +
        xlim(c(1,0))

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

Cox proportional hazard model

Cox proportional hazards model (survival::lung)

Cox proportional hazards model using survival::lung - This is an example dataset from the survival package. - Outcome: time to death - Covariates: age, sex, performance status (ph.ecog)

library(survival)

# Load built-in lung cancer survival dataset
lung_data <- survival::lung

# Quick look at the data
head(lung_data)

##   inst time status age sex ph.ecog ph.karno pat.karno meal.cal wt.loss
## 1    3  306      2  74   1       1       90       100     1175      NA
## 2    3  455      2  68   1       0       90        90     1225      15
## 3    3 1010      1  56   1       0       90        90       NA      15
## 4    5  210      2  57   1       1       90        60     1150      11
## 5    1  883      2  60   1       0      100        90       NA       0
## 6   12 1022      1  74   1       1       50        80      513       0

summary(lung_data)

##       inst            time            status           age       
##  Min.   : 1.00   Min.   :   5.0   Min.   :1.000   Min.   :39.00  
##  1st Qu.: 3.00   1st Qu.: 166.8   1st Qu.:1.000   1st Qu.:56.00  
##  Median :11.00   Median : 255.5   Median :2.000   Median :63.00  
##  Mean   :11.09   Mean   : 305.2   Mean   :1.724   Mean   :62.45  
##  3rd Qu.:16.00   3rd Qu.: 396.5   3rd Qu.:2.000   3rd Qu.:69.00  
##  Max.   :33.00   Max.   :1022.0   Max.   :2.000   Max.   :82.00  
##  NA's   :1                                                       
##       sex           ph.ecog          ph.karno        pat.karno     
##  Min.   :1.000   Min.   :0.0000   Min.   : 50.00   Min.   : 30.00  
##  1st Qu.:1.000   1st Qu.:0.0000   1st Qu.: 75.00   1st Qu.: 70.00  
##  Median :1.000   Median :1.0000   Median : 80.00   Median : 80.00  
##  Mean   :1.395   Mean   :0.9515   Mean   : 81.94   Mean   : 79.96  
##  3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.: 90.00   3rd Qu.: 90.00  
##  Max.   :2.000   Max.   :3.0000   Max.   :100.00   Max.   :100.00  
##                  NA's   :1        NA's   :1        NA's   :3       
##     meal.cal         wt.loss       
##  Min.   :  96.0   Min.   :-24.000  
##  1st Qu.: 635.0   1st Qu.:  0.000  
##  Median : 975.0   Median :  7.000  
##  Mean   : 928.8   Mean   :  9.832  
##  3rd Qu.:1150.0   3rd Qu.: 15.750  
##  Max.   :2600.0   Max.   : 68.000  
##  NA's   :47       NA's   :14

# Recode status into 0/1 for clarity (0 = censored, 1 = death)
lung_data <- lung_data %>%
        mutate(
                status_bin = ifelse(status == 2, 1, 0),
                sex        = factor(sex, levels = c(1, 2),
                                    labels = c("Male", "Female"))
        )

table(lung_data$status_bin)

## 
##   0   1 
##  63 165

table(lung_data$sex)

## 
##   Male Female 
##    138     90

Fitting Cox model

# Cox proportional hazards model
cox_fit <- coxph(Surv(time, status_bin) ~ age + sex + ph.ecog,
                 data = lung_data)

summary(cox_fit)

## Call:
## coxph(formula = Surv(time, status_bin) ~ age + sex + ph.ecog, 
##     data = lung_data)
## 
##   n= 227, number of events= 164 
##    (1 observation deleted due to missingness)
## 
##                coef exp(coef)  se(coef)      z Pr(>|z|)    
## age        0.011067  1.011128  0.009267  1.194 0.232416    
## sexFemale -0.552612  0.575445  0.167739 -3.294 0.000986 ***
## ph.ecog    0.463728  1.589991  0.113577  4.083 4.45e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##           exp(coef) exp(-coef) lower .95 upper .95
## age          1.0111     0.9890    0.9929    1.0297
## sexFemale    0.5754     1.7378    0.4142    0.7994
## ph.ecog      1.5900     0.6289    1.2727    1.9864
## 
## Concordance= 0.637  (se = 0.025 )
## Likelihood ratio test= 30.5  on 3 df,   p=1e-06
## Wald test            = 29.93  on 3 df,   p=1e-06
## Score (logrank) test = 30.5  on 3 df,   p=1e-06

Kaplan–Meier curves by sex

# Kaplan–Meier survival curves stratified by sex
km_fit_sex <- survfit(Surv(time, status_bin) ~ sex,
                      data = lung_data)

plot(km_fit_sex,
     col  = c("blue", "red"),
     lwd  = 2,
     xlab = "Time (days)",
     ylab = "Survival probability",
     main = "Kaplan–Meier curves by sex (lung dataset)")

legend("bottomleft",
       legend = levels(lung_data$sex),
       col    = c("blue", "red"),
       lwd    = 2,
       bty    = "n")

Check proportional hazards assumption

# Test proportional hazards assumption
cox_zph <- cox.zph(cox_fit)
cox_zph

##         chisq df    p
## age     0.188  1 0.66
## sex     2.305  1 0.13
## ph.ecog 2.054  1 0.15
## GLOBAL  4.464  3 0.22

# Plot Schoenfeld residuals for visual inspection (optional)
par(mfrow = c(2, 2))
plot(cox_zph)
par(mfrow = c(1, 1))

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.