Logistic Regression — Part 3: Three Predictors

# First time only: remove the # and run, then put the # back.
# install.packages(c("tidyverse", "broom", "gtsummary", "ggeffects"))
library(tidyverse)
library(broom)
library(gtsummary)
library(ggeffects)

data <- read_csv2("data2.csv")
glimpse(data)   # <chr> = category variable, <dbl>/<int> = numeric variable

## Rows: 150
## Columns: 7
## $ verb                   <chr> "give", "give", "give", "give", "give", "give",…
## $ sentence               <chr> "crucial part of the marketing for many compani…
## $ RealizationOfRecipient <chr> "NP NP", "NP PP", "NP PP", "NP PP", "NP PP", "N…
## $ RecipientLength        <dbl> 1, 2, 1, 2, 2, 3, 2, 3, 5, 1, 1, 1, 2, 1, 1, 1,…
## $ RecipientAnimacy       <chr> "animate", "animate", "inanimate", "inanimate",…
## $ Modality               <chr> "written", "written", "written", "written", "wr…
## $ ...7                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…

selected_vars <- c("Modality", "RecipientLength", "RecipientAnimacy")

variable_labels <- c(
  Modality          = "Modality (spoken vs. written)",
  RecipientLength   = "Recipient Length (words)",
  RecipientAnimacy  = "Recipient Animacy (animate vs. inanimate)"
)

# Keep outcome + chosen predictors, drop rows with missing values,
# and make sure R treats text columns as categorical factors.
data <- data %>%
  select(RealizationOfRecipient, all_of(selected_vars)) %>%
  drop_na() %>%
  mutate(across(where(is.character), as.factor))

# The SECOND name in this output is the outcome being modelled (here: PP).
# Keep this in mind when interpreting every number in the results.
levels(data$RealizationOfRecipient)

## [1] "NP NP" "NP PP"

# How many observations per outcome category?
data %>% count(RealizationOfRecipient)

# Descriptive summary per predictor:
#   - numeric predictor  → mean and SD split by outcome
#   - categorical predictor → cross-tabulation with outcome
for (var in selected_vars) {
  cat("\n=====", var, "=====\n")
  if (is.numeric(data[[var]])) {
    print(
      data %>%
        group_by(RealizationOfRecipient) %>%
        summarise(Mean = round(mean(.data[[var]]), 2),
                  SD   = round(sd(.data[[var]]),   2),
                  .groups = "drop")
    )
  } else {
    print(table(data[[var]], data$RealizationOfRecipient))
  }
}

## 
## ===== Modality =====
##          
##           NP NP NP PP
##   spoken     23     7
##   written    75    45
## 
## ===== RecipientLength =====
## # A tibble: 2 × 3
##   RealizationOfRecipient  Mean    SD
##   <fct>                  <dbl> <dbl>
## 1 NP NP                   1.23  0.69
## 2 NP PP                   2.27  1.32
## 
## ===== RecipientAnimacy =====
##            
##             NP NP NP PP
##   animate      90    28
##   inanimate     8    24

model_full <- glm(RealizationOfRecipient ~ ., data = data, family = binomial)

# Main results table: odds ratios (OR) with 95% confidence intervals.
# This is the table you will interpret row by row in the next section.
broom::tidy(model_full, exponentiate = TRUE, conf.int = TRUE) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  rename(Predictor = term,
         OR        = estimate,
         SE        = std.error,
         z         = statistic,
         p         = p.value,
         CI_low    = conf.low,
         CI_high   = conf.high)

# The same results in a clean, report-ready layout.
tbl_regression(model_full, exponentiate = TRUE)

Characteristic	OR	95% CI	p-value
Modality
spoken	—	—
written	0.79	0.29, 2.30	0.6
RecipientLength	2.46	1.56, 4.29	<0.001
RecipientAnimacy
animate	—	—
inanimate	5.21	1.91, 14.9	0.001
Abbreviations: CI = Confidence Interval, OR = Odds Ratio

pred1 <- selected_vars[1]   # first predictor,  e.g. "Modality"
pred2 <- selected_vars[2]   # second predictor, e.g. "LengthOfRecipient"

# Build all four models on the same dataset.
model_null  <- glm(RealizationOfRecipient ~ 1,          data = data, family = binomial)
model_pred1 <- glm(RealizationOfRecipient ~ get(pred1), data = data, family = binomial)
model_pred2 <- glm(RealizationOfRecipient ~ get(pred2), data = data, family = binomial)
# model_full was already fitted in section 5 above.

# ── Test 1: does pred2 add value beyond pred1? ────────────────────────────
# Read the SECOND chi-square row: null→pred1 (does pred1 help on its own?),
# then pred1→full (does pred2 add anything on top of pred1?).
cat("=== Does", pred2, "add value beyond", pred1, "? ===\n")

## === Does RecipientLength add value beyond Modality ? ===

anova(model_null, model_pred1, model_full, test = "Chisq")

# ── Test 2: does pred1 add value beyond pred2? ────────────────────────────
# Same logic, order reversed: read the SECOND row for pred1's unique contribution.
cat("\n=== Does", pred1, "add value beyond", pred2, "? ===\n")

## 
## === Does Modality add value beyond RecipientLength ? ===

anova(model_null, model_pred2, model_full, test = "Chisq")

for (var in selected_vars) {
  eff   <- ggpredict(model_full, terms = var)
  label <- if (var %in% names(variable_labels)) variable_labels[[var]] else var
  print(plot(eff) + ggtitle(paste("Predicted probability of PP by", label)))
}

Logistic Regression — Part 3: Three Predictors

Exploring the Dative Alternation in English

K. Osolin, M. Smith, C. Weichselbaum

2026-06-26