COD_week11_2_MGK_BTE3207

Minsik Kim

2025-11-13

Linear regression

Before begin..

Let’s load the SBP dataset.

dataset_sbp <- read.csv(file = "Inha/5_Lectures/1_BTE3207_Biostats/2025F/scripts/BTE3207_Advanced_Biostatistics/dataset/sbp_dataset_korea_2013-2014.csv")



head(dataset_sbp)
##   SEX BTH_G SBP DBP FBS DIS  BMI
## 1   1     1 116  78  94   4 16.6
## 2   1     1 100  60  79   4 22.3
## 3   1     1 100  60  87   4 21.9
## 4   1     1 111  70  72   4 20.2
## 5   1     1 120  80  98   4 20.0
## 6   1     1 115  79  95   4 23.1

Making a new variable hypertension

dataset_sbp$hypertension <- ifelse(dataset_sbp$SBP > 130 |
                                           dataset_sbp$DBP > 80,
                                   T,
                                   F)


dataset_sbp$history_of_hypertension <- ifelse(dataset_sbp$DIS == 1 |
                                           dataset_sbp$DIS == 2,
                                   T,
                                   F)

set.seed(1)

dataset_sbp_small <- subset(dataset_sbp, row.names(dataset_sbp) %in% sample(x = 1:1000000, size = 1000))

dataset_sbp_small$DIS <- as.factor(dataset_sbp_small$DIS)

Simple linear regression

With lm() function, we can calculate the linear model of DBP~SBP with the least squares algorithm.

#A function that returns equation

lm_eqn <- function(m){
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(unname(coef(m)[1]), digits = 2),
              b = format(unname(coef(m)[2]), digits = 2),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}




ggplot(data = dataset_sbp_small, aes(y = SBP, x = DBP)) +
        geom_point() +
        theme_classic(base_family = "serif", base_size = 20) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        geom_smooth(method = "lm") +
        geom_label(y = 170, x = 68,
                  label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ DBP)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")
## Warning: The `label.size` argument of `geom_label()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

lm(data = dataset_sbp_small, SBP ~ DBP) %>% confint()
##                2.5 %    97.5 %
## (Intercept) 32.20308 41.168723
## DBP          1.05807  1.175343
#rnorm(100, mean = 100, sd = 1)

Calculation of residual

The result of Model's predicted value - sample is called residual. It can be shown as below.

lm_result <- 
        lm(data = dataset_sbp_small,DBP ~ SBP) #%>% summary

d <- dataset_sbp_small


#Saving predicted values 
d$predicted <- lm_result$fitted.values
d$residuals <- residuals(lm_result)


ggplot(d, aes(x = SBP, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        geom_label(x = 130, y = 115,
                  label = lm_eqn(lm(data = dataset_sbp_small,DBP ~ SBP)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

# Intercept, slope, and their units

Remember that the size of intercept and slope is DEPENDENT on the units of input or outcome.

dataset_sbp_small$SBP_pa <- 133.322 * dataset_sbp_small$SBP
dataset_sbp_small$DBP_pa <- 133.322 * dataset_sbp_small$DBP

dataset_sbp_small$SBP_psi <- 0.0193368 * dataset_sbp_small$SBP
dataset_sbp_small$DBP_psi <- 0.0193368 * dataset_sbp_small$DBP

If we chnage the units of SBP or DBP,

DBP (pascal) ~ SBP (pascal)

lm(data = dataset_sbp_small, DBP_pa ~ SBP_pa) %>% summary
## 
## Call:
## lm(formula = DBP_pa ~ SBP_pa, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4102.1  -611.7    16.4   623.6  3085.4 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.657e+03  2.277e+02   7.275 6.99e-13 ***
## SBP_pa      5.223e-01  1.398e-02  37.372  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 865 on 998 degrees of freedom
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5828 
## F-statistic:  1397 on 1 and 998 DF,  p-value: < 2.2e-16

DBP (mmHg) ~ SBP (pascal)

lm(data = dataset_sbp_small, DBP ~ SBP_pa) %>% summary
## 
## Call:
## lm(formula = DBP ~ SBP_pa, data = dataset_sbp_small)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.7682  -4.5884   0.1233   4.6775  23.1427 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.243e+01  1.708e+00   7.275 6.99e-13 ***
## SBP_pa      3.917e-03  1.048e-04  37.372  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.488 on 998 degrees of freedom
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5828 
## F-statistic:  1397 on 1 and 998 DF,  p-value: < 2.2e-16

DBP (psi) ~ SBP (psi)

lm(data = dataset_sbp_small, DBP_psi ~ SBP_psi) %>% summary
## 
## Call:
## lm(formula = DBP_psi ~ SBP_psi, data = dataset_sbp_small)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.59496 -0.08873  0.00238  0.09045  0.44751 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.24026    0.03302   7.275 6.99e-13 ***
## SBP_psi      0.52229    0.01398  37.372  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1255 on 998 degrees of freedom
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5828 
## F-statistic:  1397 on 1 and 998 DF,  p-value: < 2.2e-16

Distance from the mean value can be drawn as below.

This is the residual from mean of all value - which is X-bar - Xi. Using them we can calcualte SD of mean!

#Saving predicted values 
d$mean <- mean(dataset_sbp_small$DBP)
d$predicted <- lm_result$fitted.values
d$residuals <- residuals(lm_result)


ggplot(d, aes(x = SBP, y = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "red") +  # alpha to fade lines
        geom_point() +
        #xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "blue") +
        geom_label(x = 100, y = 95,
                  label = "Mean DBP",
                  #parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")

Distance from a regression line can be drawn as below

With the same approach as above, we can calculate SD of a linear regression!

lm_result <- dataset_sbp_small %>%
        lm(data = .,DBP ~ SBP) #%>% summary



ggplot(d, aes(x = SBP, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "red") +  # alpha to fade
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        geom_label(x = 100, y = 90,
                  label = "Linear model",
                  #parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

Two draw both,

ggplot(d, aes(x = SBP, y = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        geom_segment(aes(xend = SBP, yend = predicted), alpha = 1, linetype = "dashed", color = "blue") +  # alpha to fade
        #geom_segment(aes(xend = SBP, yend = mean), alpha = 1, linetype = "dashed", color = "purple") +  # alpha to fade
        geom_point() +
        xlab("SBP (mmHg)") +
        ylab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        geom_label(x = 100, y = 90,
                  label = "Linear model",
                  #parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue") +
        geom_hline(yintercept = mean(dataset_sbp_small$DBP), color = "purple") +
        geom_label(x = 170, y = 70,
                  label = "Mean DBP",
                  #parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "purple")
## `geom_smooth()` using formula = 'y ~ x'

Multiple linear regression

Multiple regression

Basic concepts

Deviding samples

In fact, there are two SEXs in the sample.

ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top") +
        guides(col=guide_legend(title = "History of hyper tension"))

ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") +
        facet_wrap(~history_of_hypertension) +
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top",
              plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
        guides(col=guide_legend(title = "History of hyper tension")) +
        geom_abline(intercept = 37, slope = 1.1, col = "blue") +
        ggtitle("y = 37 + 1.1 x")

lm(data = dataset_sbp_small, SBP ~ DBP) %>% summary
## 
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.723  -6.022  -1.180   5.145  46.312 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.68590    2.28442   16.06   <2e-16 ***
## DBP          1.11671    0.02988   37.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.487 on 998 degrees of freedom
## Multiple R-squared:  0.5832, Adjusted R-squared:  0.5828 
## F-statistic:  1397 on 1 and 998 DF,  p-value: < 2.2e-16
lm(data = dataset_sbp_small, SBP ~ DBP + history_of_hypertension) %>% summary
## 
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = dataset_sbp_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.590  -4.500  -0.984   5.702  40.255 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 40.40433    2.23748  18.058   <2e-16 ***
## DBP                          1.04867    0.02974  35.256   <2e-16 ***
## history_of_hypertensionTRUE  6.42067    0.71630   8.964   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared:  0.6143, Adjusted R-squared:  0.6135 
## F-statistic:   794 on 2 and 997 DF,  p-value: < 2.2e-16
palette()
## [1] "black"   "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"
ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        #geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point(aes(col = history_of_hypertension)) +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") + 
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top") +
        guides(col=guide_legend(title = "History of hyper tension")) +
        geom_abline(intercept = 40, slope = 1.04, col = "#F8766D") +
        annotate(geom="text", x=120, y=140, label="LM of HT-False",
              color="#F8766D", family = "serif", size = 8) + 
        geom_abline(intercept = 46.42, slope = 1.04, col = "#00BFC4") +
        annotate(geom="text", x=100, y=170, label="LM of HT-True",
              color="#00BFC4", family = "serif", size = 8)

Continuous variable as covariate

Deviding samples

ggplot(dataset_sbp_small, aes(y = SBP, x = DBP)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point() +
        ylab("SBP (mmHg)") +
        xlab("DBP (mmHg)") +
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top",
              plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
        guides(col=guide_legend(title = "History of hyper tension")) + 
        geom_label(y = 170, x = 68,
                  label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ DBP)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(dataset_sbp_small, aes(y = SBP, x = BTH_G)) +
        geom_smooth(method = "lm", se = FALSE, color = "blue") +  # Plot regression slope
        #geom_segment(aes(xend = SBP, yend = predicted), alpha = .2) +  # alpha to fade lines
        geom_point() +
        ylab("SBP (mmHg)") +
        xlab("Age group") +
        theme_classic( base_family = "serif", base_size = 20) +
        theme(legend.position = "top",
              plot.title = element_text(color = "Blue", hjust = 0.7, vjust = -23)) +
        guides(col=guide_legend(title = "History of hyper tension")) + 
        geom_label(y = 170, x = 18,
                  label = lm_eqn(lm(data = dataset_sbp_small, SBP ~ BTH_G)),
                  parse = TRUE,
                  family='serif',
                  size = 8, label.size = 0, color = "blue")
## `geom_smooth()` using formula = 'y ~ x'

lm(data = dataset_sbp, formula = SBP ~ DBP)
## 
## Call:
## lm(formula = SBP ~ DBP, data = dataset_sbp)
## 
## Coefficients:
## (Intercept)          DBP  
##      38.144        1.105
model_result <- lm(data = dataset_sbp, formula = SBP ~ DBP)

dataset_sbp$DIS <- as.factor(dataset_sbp$DIS)
dataset_sbp %>% summary
##       SEX           BTH_G            SBP             DBP        
##  Min.   :1.00   Min.   : 1.00   Min.   : 82.0   Min.   : 50.00  
##  1st Qu.:1.00   1st Qu.: 9.00   1st Qu.:110.0   1st Qu.: 70.00  
##  Median :1.00   Median :14.00   Median :120.0   Median : 76.00  
##  Mean   :1.49   Mean   :13.91   Mean   :121.9   Mean   : 75.79  
##  3rd Qu.:2.00   3rd Qu.:19.00   3rd Qu.:130.0   3rd Qu.: 80.00  
##  Max.   :2.00   Max.   :27.00   Max.   :190.0   Max.   :120.00  
##       FBS         DIS             BMI       hypertension   
##  Min.   : 60.00   1: 53398   Min.   :14.8   Mode :logical  
##  1st Qu.: 87.00   2:162826   1st Qu.:21.5   FALSE:683197   
##  Median : 94.00   3: 43114   Median :23.6   TRUE :316803   
##  Mean   : 98.86   4:740662   Mean   :23.8                  
##  3rd Qu.:104.00              3rd Qu.:25.8                  
##  Max.   :358.00              Max.   :40.3                  
##  history_of_hypertension
##  Mode :logical          
##  FALSE:783776           
##  TRUE :216224           
##                         
##                         
## 
dataset_sbp %>% 
        lm(formula = SBP ~ DBP + DIS + DBP * DIS) %>%
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + DIS + DBP * DIS, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -43.292  -4.998  -0.600   5.474  89.299 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  55.478249   0.325920 170.220  < 2e-16 ***
## DBP           0.958139   0.004126 232.202  < 2e-16 ***
## DIS2         -4.460697   0.375896 -11.867  < 2e-16 ***
## DIS3         -7.903914   0.499333 -15.829  < 2e-16 ***
## DIS4        -16.772084   0.337319 -49.722  < 2e-16 ***
## DBP:DIS2      0.035531   0.004735   7.504 6.17e-14 ***
## DBP:DIS3      0.042764   0.006454   6.626 3.45e-11 ***
## DBP:DIS4      0.120510   0.004285  28.124  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.415 on 999992 degrees of freedom
## Multiple R-squared:  0.5819, Adjusted R-squared:  0.5819 
## F-statistic: 1.989e+05 on 7 and 999992 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + BTH_G, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + BTH_G, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -28.618  -5.813  -0.839   5.478  44.670 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.41804    2.18271   15.77   <2e-16 ***
## DBP          1.06715    0.02881   37.05   <2e-16 ***
## BTH_G        0.43020    0.04153   10.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.019 on 997 degrees of freedom
## Multiple R-squared:  0.6237, Adjusted R-squared:  0.623 
## F-statistic: 826.4 on 2 and 997 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + SEX, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + SEX, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -27.063  -5.970  -1.219   5.211  45.634 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 39.03083    2.63553  14.809   <2e-16 ***
## DBP          1.10698    0.03035  36.480   <2e-16 ***
## SEX         -1.08426    0.60970  -1.778   0.0757 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.477 on 997 degrees of freedom
## Multiple R-squared:  0.5846, Adjusted R-squared:  0.5837 
## F-statistic: 701.4 on 2 and 997 DF,  p-value: < 2.2e-16
car::avPlots(dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .))

car::avPlots(dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .))
dataset_sbp_small %>%
        lm(SBP ~ SEX, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ SEX, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.779  -8.779   0.221  10.221  55.221 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 128.8716     1.4333   89.91  < 2e-16 ***
## SEX          -5.0921     0.9159   -5.56 3.47e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.47 on 998 degrees of freedom
## Multiple R-squared:  0.03004,    Adjusted R-squared:  0.02907 
## F-statistic: 30.91 on 1 and 998 DF,  p-value: 3.47e-08
dataset_sbp_small %>%
        lm(SBP ~ history_of_hypertension, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ history_of_hypertension, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -33.304  -8.438   0.562   9.913  51.562 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 118.4381     0.4911   241.2   <2e-16 ***
## history_of_hypertensionTRUE  12.8654     1.0376    12.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.68 on 998 degrees of freedom
## Multiple R-squared:  0.1335, Adjusted R-squared:  0.1326 
## F-statistic: 153.7 on 1 and 998 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .) %>% 
        summary
## 
## Call:
## lm(formula = SBP ~ DBP + history_of_hypertension, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.590  -4.500  -0.984   5.702  40.255 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 40.40433    2.23748  18.058   <2e-16 ***
## DBP                          1.04867    0.02974  35.256   <2e-16 ***
## history_of_hypertensionTRUE  6.42067    0.71630   8.964   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.131 on 997 degrees of freedom
## Multiple R-squared:  0.6143, Adjusted R-squared:  0.6135 
## F-statistic:   794 on 2 and 997 DF,  p-value: < 2.2e-16
dataset_sbp_small %>%
        lm(SBP ~ DBP + history_of_hypertension, data = .) %>%
        car::avPlots()

dataset_sbp_small %>%
        ggplot(., aes(y = SBP, x = history_of_hypertension)) +
        geom_jitter()

Mediation analysis

dataset_sbp$history_diabete <- ifelse(
        dataset_sbp$DIS == 1 | dataset_sbp$DIS == 2,
        1,0
) %>%
        as.factor()

set.seed(1)
dataset_sbp_1000 <- dataset_sbp[sample(1:1000000, 1000),]
# Step 1 Check direct effect
lm(history_diabete ~ BMI, data = dataset_sbp_1000)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a factor
## response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
## 
## Call:
## lm(formula = history_diabete ~ BMI, data = dataset_sbp_1000)
## 
## Coefficients:
## (Intercept)          BMI  
##     0.82860      0.01664
# Step 2 Mediation fit, adjusted by status of hypertension
med.fit <- glm(history_diabete ~ BMI + hypertension, data = dataset_sbp_1000, family = "binomial")

# Step 3 Outcome fit, adjusted by status of hypertension
out.fit <- lm(SBP ~ BMI + history_diabete + hypertension, data = dataset_sbp_1000)

# Mediation analysis, adjusted for hypertension status # (for each group with hypertension and without hypertension)
library(mediation) #  This package is required
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loading required package: mvtnorm
## Loading required package: sandwich
## mediation: Causal Mediation Analysis
## Version: 4.5.1
med.out <- mediate(med.fit, out.fit, treat = "hypertension", mediator = "history_diabete")

summary(med.out)
## 
## Causal Mediation Analysis 
## 
## Quasi-Bayesian Confidence Intervals
## 
##                 Estimate 95% CI Lower 95% CI Upper   p-value    
## ACME            1.414512     0.888585     1.984442 < 2.2e-16 ***
## ADE            19.684145    18.298861    21.231975 < 2.2e-16 ***
## Total Effect   21.098657    19.660184    22.654182 < 2.2e-16 ***
## Prop. Mediated  0.066940     0.043601     0.094207 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Sample Size Used: 1000 
## 
## 
## Simulations: 1000

Random effect dataset

https://stats.idre.ucla.edu/

https://rpubs.com/rslbliss/r_mlm_ws

https://methods101.com.au/docs/soci832_11_3_multilevel_models_week_11/

Dataset is students from 10 handpicked schools, representing a subset of students and schools from a US survey of eight-grade students at 1000 schools (800 public 200 private).

There are quite a lot of variables in the dataset, but we are going start by using just three:


 # Variable Type Len Pos Label
-----------------------------------------------------------------------------------------------
15 cstr     Num    8 112
5 homework Num    8  32 Time spent on math homework each week
11 math     Num    8  80 Math score
4 meanses  Num    8  24 Mean SES for the school
7 parented Num    8  48 Parents highest education level

10 percmin  Num    8  72 Percent minority in school
 8 public   Num    8  56 Public school: 1=public, 0=non-public
13 race     Num    8  96 race of student, 1=asian, 2=Hispanic, 3=Black, 4=White, 5=Native American
 9 ratio    Num    8  64 Student-Teacher ratio
18 region   Num    8 136
 1 schid    Num    8   0 School ID
19 schnum   Num    8 144 group(schid)
16 scsize   Num    8 120
14 sctype   Num    8 104 Type of school, 1=public, 2=catholic, 3=Private
                         other religious, 4=Private non-r
 3 ses      Num    8  16 Socioecnonomic Status
12 sex      Num    8  88 Sex: 1=male, 2=female
 2 stuid    Num    8   8 Student ID
17 urban    Num    8 128
 6 white    Num    8  40 Race: 1=white, 0=non-white

Here,

homework: number of hours of homework - Level 1 variable (associated with students) math: score in a math test - expanatory variable schnum: a number for each school - Level 2 variable (associated with school) scsize: school size - Level 2 variable public: school type - Level 2 variable

library(haven)

imm10_data <- read_dta("https://stats.idre.ucla.edu/stat/examples/imm/imm10.dta") 

data visualization

Summary data

hist(imm10_data$math)

table(imm10_data$homework)
## 
##   0   1   2   3   4   5   6   7 
##  27 105  48  27  25  25   2   1
table(imm10_data$schnum)
## 
##  1  2  3  4  5  6  7  8  9 10 
## 23 20 24 22 22 20 67 21 21 20
table(imm10_data$public)
## 
##   0   1 
##  67 193

Random intercept - two groups

imm10 <- imm10_data 
imm10
## # A tibble: 260 × 19
##    schid stuid    ses meanses homework white parented public ratio percmin  math
##    <dbl> <dbl>  <dbl>   <dbl>    <dbl> <dbl>    <dbl>  <dbl> <dbl>   <dbl> <dbl>
##  1  7472     3 -0.130  -0.483        1     1        2      1    19       0    48
##  2  7472     8 -0.390  -0.483        0     1        2      1    19       0    48
##  3  7472    13 -0.800  -0.483        0     1        2      1    19       0    53
##  4  7472    17 -0.720  -0.483        1     1        2      1    19       0    42
##  5  7472    27 -0.740  -0.483        2     1        2      1    19       0    43
##  6  7472    28 -0.580  -0.483        1     1        2      1    19       0    57
##  7  7472    30 -0.830  -0.483        5     1        2      1    19       0    33
##  8  7472    36 -0.510  -0.483        1     1        3      1    19       0    64
##  9  7472    37 -0.560  -0.483        1     1        2      1    19       0    36
## 10  7472    42  0.210  -0.483        2     1        3      1    19       0    56
## # ℹ 250 more rows
## # ℹ 8 more variables: sex <dbl>, race <dbl>, sctype <dbl>, cstr <dbl>,
## #   scsize <dbl>, urban <dbl>, region <dbl>, schnum <dbl>

Visualization of models

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() + 
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(color='School') 
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

When we look into binary variable,

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8, aes(colour = as.factor(public))) +
        theme_classic() + 
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(color='Publick school') 
## `geom_smooth()` using formula = 'y ~ x'

Store simple linear regression

model1 <- lm(math ~ homework, data = imm10)
imm10$FEPredictions <- fitted(model1)
slm_est <- coef(summary(model1))[ , "Estimate"]
slm_se <- coef(summary(model1))[ , "Std. Error"]

Multiple linear regression with binary variable

library(moderndive)

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_parallel_slopes(aes(col = factor(imm10$public)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$public))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.

Simple vs multiple (binary)

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_parallel_slopes(aes(col = factor(imm10$public)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$public))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') +
        ggtitle("Modeling with Simple linear regression")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.

Making random effect model

library(lmerTest)
## Loading required package: lme4
## 
## Attaching package: 'lmerTest'
## The following object is masked from 'package:lme4':
## 
##     lmer
## The following object is masked from 'package:stats':
## 
##     step
library(lme4)

model2 <- lme4::lmer(data = imm10, math ~ homework + (1|public))

summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ homework + (1 | public)
##    Data: imm10
## 
## REML criterion at convergence: 1851.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.46023 -0.61670 -0.01182  0.58184  2.58110 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  public   (Intercept) 74.50    8.631   
##  Residual             71.79    8.473   
## Number of obs: 260, groups:  public, 2
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  50.3850     6.2050   8.120
## homework      1.9051     0.3882   4.908
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.152
model2 <- lmerTest::lmer(data = imm10, math ~ homework + (1|public))

summary(model2)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: math ~ homework + (1 | public)
##    Data: imm10
## 
## REML criterion at convergence: 1851.2
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.46023 -0.61670 -0.01182  0.58184  2.58110 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  public   (Intercept) 74.50    8.631   
##  Residual             71.79    8.473   
## Number of obs: 260, groups:  public, 2
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  50.3850     6.2050   1.0415   8.120   0.0721 .  
## homework      1.9051     0.3882 257.9447   4.908 1.64e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.152

Mixed effect model vs multiple (binary)

imm10$FEPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]


gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1) +
        geom_parallel_slopes(aes(col = factor(imm10$public)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$public))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') +
        ggtitle("Modeling with randeom effect (multi level analysis)")

        
print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.

With effect modification…

Multiple linear regression with effect modification (binary)

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = "lm", aes(col = factor(imm10$public)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$public))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## `geom_smooth()` using formula = 'y ~ x'

# With facetting

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = "lm", aes(col = factor(imm10$public)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$public))) +
        theme_classic() +
        facet_wrap(~public) +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## `geom_smooth()` using formula = 'y ~ x'

Simple LM + interaction term

Simple vs Multiple linear regression with effect modification (binary)

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_smooth(method = "lm", aes(col = factor(imm10$public)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$public))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## `geom_smooth()` using formula = 'y ~ x'

Random effect

Calculation of random slope

model2 <- lme4::lmer(data = imm10, math ~ homework + (homework|public)) 
## boundary (singular) fit: see help('isSingular')
summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ homework + (homework | public)
##    Data: imm10
## 
## REML criterion at convergence: 1848.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.69605 -0.63601 -0.02561  0.63863  2.56869 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  public   (Intercept) 122.7580 11.0796       
##           homework      0.8922  0.9446  -1.00
##  Residual              70.9827  8.4251       
## Number of obs: 260, groups:  public, 2
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  51.2419     7.9209   6.469
## homework      1.7881     0.7738   2.311
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.917
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
lmerTest::lmer(data = imm10, math ~ homework + (homework|public)) %>% summary
## boundary (singular) fit: see help('isSingular')
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: math ~ homework + (homework | public)
##    Data: imm10
## 
## REML criterion at convergence: 1848.3
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.69605 -0.63601 -0.02561  0.63863  2.56869 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  public   (Intercept) 122.7580 11.0796       
##           homework      0.8922  0.9446  -1.00
##  Residual              70.9827  8.4251       
## Number of obs: 260, groups:  public, 2
## 
## Fixed effects:
##             Estimate Std. Error      df t value Pr(>|t|)  
## (Intercept)  51.2419     7.9209  0.9955   6.469   0.0984 .
## homework      1.7881     0.7738  1.0612   2.311   0.2483  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.917
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')

Mixed model with random slope vs Multiple linear regression with effect modification (binary)

imm10$REPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]
ml_est
## (Intercept)    homework 
##   51.241945    1.788118
gg <- ggplot(imm10, aes(y = math, x = homework, col=factor(imm10$public))) +
        geom_smooth(method = "lm", se = F) +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1) +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 
        

print(gg)
## Warning: Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## Use of `imm10$public` is discouraged.
## ℹ Use `public` instead.
## `geom_smooth()` using formula = 'y ~ x'

The sample analysis can be done for a categorical variable

Data filtering

imm10 <- imm10_data %>% filter(schnum == 2 | schnum == 3 | schnum == 4 )

Visualization of models

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() + 
        ylab("Math test result") +
        ylab("Hours spent on homework") +
        labs(color='School') 
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8, aes(colour = as.factor(schnum))) +
        theme_classic() + 
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(color='School') 
## `geom_smooth()` using formula = 'y ~ x'

model1 <- lm(math ~ homework, data = imm10)
imm10$FEPredictions <- fitted(model1)
slm_est <- coef(summary(model1))[ , "Estimate"]
slm_se <- coef(summary(model1))[ , "Std. Error"]

One panel

library(moderndive)

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.

facetted

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        facet_wrap(~schnum) +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Warning: `geom_parallel_slopes()` didn't recieve a grouping variable with more
## than one unique value. Make sure you supply one. Basic model is fitted.
## Warning: `geom_parallel_slopes()` didn't recieve a grouping variable with more
## than one unique value. Make sure you supply one. Basic model is fitted.
## Warning: `geom_parallel_slopes()` didn't recieve a grouping variable with more
## than one unique value. Make sure you supply one. Basic model is fitted.

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') +
        ggtitle("Modeling with Simple linear regression")

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.

library(lmerTest)
library(lme4)

model2 <- lmer(data = imm10, math ~ homework + (1|schnum))

imm10$FEPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]


gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') +
        ggtitle("Modeling with randeom effect (multi level analysis)")

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.

With effect modification…

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = "lm", aes(col = factor(imm10$schnum)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Simple LM + interaction term

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_smooth(method = "lm", aes(col = factor(imm10$schnum)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Random effect

imm10$schnum <- as.factor(imm10$schnum)

model2 <- lme4::lmer(data = imm10, math ~ homework + (homework|schnum)) 

summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ homework + (homework | schnum)
##    Data: imm10
## 
## REML criterion at convergence: 448.4
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.93689 -0.52976 -0.02434  0.54695  2.45935 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  schnum   (Intercept) 49.48    7.035         
##           homework    31.06    5.573    -0.88
##  Residual             48.05    6.932         
## Number of obs: 66, groups:  schnum, 3
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   40.509      4.358   9.296
## homework       3.603      3.287   1.096
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.867
imm10$REPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]
ml_est
## (Intercept)    homework 
##   40.508687    3.602677
gg <- ggplot(imm10, aes(y = math, x = homework, col=factor(imm10$schnum))) +
        geom_smooth(method = "lm", se = F) +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1) +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 
        

print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Categorical variable with more factors

Data filtering

imm10 <- imm10_data
imm10
## # A tibble: 260 × 19
##    schid stuid    ses meanses homework white parented public ratio percmin  math
##    <dbl> <dbl>  <dbl>   <dbl>    <dbl> <dbl>    <dbl>  <dbl> <dbl>   <dbl> <dbl>
##  1  7472     3 -0.130  -0.483        1     1        2      1    19       0    48
##  2  7472     8 -0.390  -0.483        0     1        2      1    19       0    48
##  3  7472    13 -0.800  -0.483        0     1        2      1    19       0    53
##  4  7472    17 -0.720  -0.483        1     1        2      1    19       0    42
##  5  7472    27 -0.740  -0.483        2     1        2      1    19       0    43
##  6  7472    28 -0.580  -0.483        1     1        2      1    19       0    57
##  7  7472    30 -0.830  -0.483        5     1        2      1    19       0    33
##  8  7472    36 -0.510  -0.483        1     1        3      1    19       0    64
##  9  7472    37 -0.560  -0.483        1     1        2      1    19       0    36
## 10  7472    42  0.210  -0.483        2     1        3      1    19       0    56
## # ℹ 250 more rows
## # ℹ 8 more variables: sex <dbl>, race <dbl>, sctype <dbl>, cstr <dbl>,
## #   scsize <dbl>, urban <dbl>, region <dbl>, schnum <dbl>

Visualization of models

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() + 
        ylab("Math test result") +
        ylab("Hours spent on homework") +
        labs(color='School') 
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8, aes(colour = as.factor(schnum))) +
        theme_classic() + 
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(color='School') 
## `geom_smooth()` using formula = 'y ~ x'

model1 <- lm(math ~ homework, data = imm10)
imm10$FEPredictions <- fitted(model1)
slm_est <- coef(summary(model1))[ , "Estimate"]
slm_se <- coef(summary(model1))[ , "Std. Error"]
library(moderndive)

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') +
        ggtitle("Modeling with Simple linear regression")

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.

library(lmerTest)
library(lme4)

model2 <- lmer(data = imm10, math ~ homework + (1|schnum))

imm10$FEPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]


gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1) +
        geom_parallel_slopes(aes(col = factor(imm10$schnum)), se =F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') +
        ggtitle("Modeling with randeom effect (multi level analysis)")

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.

With effect modification…

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = "lm", aes(col = factor(imm10$schnum)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Simple LM + interaction term

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_smooth(method = "lm", aes(col = factor(imm10$schnum)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Random effect

imm10$schnum <- as.factor(imm10$schnum)

model2 <- lme4::lmer(data = imm10, math ~ homework + (homework|schnum)) 

summary(model2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: math ~ homework + (homework | schnum)
##    Data: imm10
## 
## REML criterion at convergence: 1764
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -2.5110 -0.5357  0.0175  0.6121  2.5708 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  schnum   (Intercept) 69.31    8.325         
##           homework    22.45    4.738    -0.81
##  Residual             43.07    6.563         
## Number of obs: 260, groups:  schnum, 10
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)   44.771      2.744  16.318
## homework       2.040      1.554   1.313
## 
## Correlation of Fixed Effects:
##          (Intr)
## homework -0.804
imm10$REPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]
ml_est
## (Intercept)    homework 
##   44.770593    2.040465
gg <- ggplot(imm10, aes(y = math, x = homework, col=factor(imm10$schnum))) +
        geom_smooth(method = "lm", se = F) +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1) +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 
        

print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Simple LM + interaction term

gg <- ggplot(imm10, aes(y = math, x = homework)) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1) +
        geom_smooth(method = "lm", aes(col = factor(imm10$schnum)), se = F) +
        geom_point(size = 1.5, alpha = 0.8, aes(color=factor(imm10$schnum))) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 

        
print(gg)
## Warning: Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## Use of `imm10$schnum` is discouraged.
## ℹ Use `schnum` instead.
## `geom_smooth()` using formula = 'y ~ x'

Continuous random effect - scool size

Data filtering

imm10 <- imm10_data
imm10
## # A tibble: 260 × 19
##    schid stuid    ses meanses homework white parented public ratio percmin  math
##    <dbl> <dbl>  <dbl>   <dbl>    <dbl> <dbl>    <dbl>  <dbl> <dbl>   <dbl> <dbl>
##  1  7472     3 -0.130  -0.483        1     1        2      1    19       0    48
##  2  7472     8 -0.390  -0.483        0     1        2      1    19       0    48
##  3  7472    13 -0.800  -0.483        0     1        2      1    19       0    53
##  4  7472    17 -0.720  -0.483        1     1        2      1    19       0    42
##  5  7472    27 -0.740  -0.483        2     1        2      1    19       0    43
##  6  7472    28 -0.580  -0.483        1     1        2      1    19       0    57
##  7  7472    30 -0.830  -0.483        5     1        2      1    19       0    33
##  8  7472    36 -0.510  -0.483        1     1        3      1    19       0    64
##  9  7472    37 -0.560  -0.483        1     1        2      1    19       0    36
## 10  7472    42  0.210  -0.483        2     1        3      1    19       0    56
## # ℹ 250 more rows
## # ℹ 8 more variables: sex <dbl>, race <dbl>, sctype <dbl>, cstr <dbl>,
## #   scsize <dbl>, urban <dbl>, region <dbl>, schnum <dbl>

The same analysis but with a continuous variable

Visualization of models

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() + 
        ylab("Math test result") +
        ylab("Hours spent on homework") +
        labs(color='School') 
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

ggplot(imm10, aes(y = math, x = homework)) +
        geom_smooth(method = lm, color = "black") +
        geom_point(size = 1.5, alpha = 0.8, aes(colour = meanses)) +
        theme_classic() +
        scale_color_viridis_c() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(color='Mean socio economic status (by schoool)') 
## `geom_smooth()` using formula = 'y ~ x'

model1 <- lm(math ~ homework + meanses, data = imm10)
imm10$FEPredictions <- fitted(model1)
slm_est <- coef(summary(model1))[ , "Estimate"]
slm_se <- coef(summary(model1))[ , "Std. Error"]
gg <- ggplot(imm10, aes(y = math, x = homework)) +
        facet_wrap(~meanses) + 
        geom_smooth(method = "lm", se =F) +
        geom_point(size = 1.5, alpha = 0.8) +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 
print(gg)
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

Grey is regular simple linear regression, and red is regular mixed linear regression

model2 <- lmer(data = imm10, math ~ homework + (1|meanses))

imm10$FEPredictions <- fitted(model2)
ml_est <- coef(summary(model2))[ , "Estimate"]
ml_se <- coef(summary(model2))[ , "Std. Error"]



gg <- ggplot(imm10, aes(y = math, x = homework)) +
        facet_wrap(~meanses) + 
        geom_smooth(method = "lm", se =F) +
        geom_point(size = 1.5, alpha = 0.8) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1, col = "grey") +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1, col = "red") +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 
print(gg)
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

Ultimate model

imm10 <- imm10_data 
model5 <- lmer(math ~ homework + ses + meanses + (homework|schnum) + (meanses|region), REML=FALSE, data = imm10)
## boundary (singular) fit: see help('isSingular')
summary(model5)
## Linear mixed model fit by maximum likelihood . t-tests use Satterthwaite's
##   method [lmerModLmerTest]
## Formula: math ~ homework + ses + meanses + (homework | schnum) + (meanses |  
##     region)
##    Data: imm10
## 
##       AIC       BIC    logLik -2*log(L)  df.resid 
##    1758.0    1797.2    -868.0    1736.0       249 
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -2.65746 -0.67499  0.03456  0.63597  2.65138 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev. Corr 
##  schnum   (Intercept) 4.920e+01 7.014486      
##           homework    1.708e+01 4.132667 -0.99
##  region   (Intercept) 0.000e+00 0.000000      
##           meanses     1.666e-06 0.001291  NaN 
##  Residual             4.132e+01 6.428340      
## Number of obs: 260, groups:  schnum, 10; region, 3
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  48.0222     2.3525   9.9707  20.413 1.83e-09 ***
## homework      1.8024     1.3600   9.6920   1.325 0.215476    
## ses           2.3765     0.6359 239.2027   3.737 0.000233 ***
## meanses       6.2304     1.0196  11.6230   6.110 6.02e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) homwrk ses   
## homework -0.973              
## ses       0.042 -0.037       
## meanses   0.066 -0.009 -0.611
## optimizer (nloptwrap) convergence code: 0 (OK)
## boundary (singular) fit: see help('isSingular')
imm10$REPredictions <- fitted(model5)
ml_est <- coef(summary(model5))[ , "Estimate"]
ml_se <- coef(summary(model5))[ , "Std. Error"]
ml_est
## (Intercept)    homework         ses     meanses 
##   48.022201    1.802384    2.376458    6.230416
gg <- ggplot(imm10, aes(y = math, x = homework)) +
        facet_wrap(~meanses) + 
        geom_smooth(method = "lm", se =F) +
        geom_point(size = 1.5, alpha = 0.8) +
        geom_abline(slope = slm_est[2], intercept = slm_est[1], size=1, col = "grey") +
        geom_abline(slope = ml_est[2], intercept = ml_est[1], size=1, col = "red") +
        theme_classic() +
        ylab("Math test result") +
        xlab("Hours spent on homework") +
        labs(col='School') 
print(gg)
## Ignoring unknown labels:
## • colour : "School"
## `geom_smooth()` using formula = 'y ~ x'

Rather then homework, the socio economic status is more important.

## Computing. R Foundation for Statistical Computing, Vienna, Austria. <https://www.R-project.org/>. We have invested a lot of time and effort in creating R, please cite it when using it for data analysis. See also 'citation("pkgname")' for citing R packages.
## Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V, Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to the tidyverse." Journal of Open Source Software_, *4*(43), 1686. doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## R. version 0.5.0. Buffalo, New York. http://github.com/trinker/pacman
## J, reikoch, Beasley W, O'Connor B, Warnes GR, Quinn M, Kamvar ZN, Gao C (2024). yaml: Methods to Convert R Data to YAML and Back_. R package version 2.3.10, <https://CRAN.R-project.org/package=yaml>. ATTENTION: This citation information has been auto-generated from the package DESCRIPTION file and may need manual editing, see 'help("citation")'.
## version 0.4.4, <https://CRAN.R-project.org/package=reactable>.