# First time only: remove the # and run, then put the # back.
# install.packages(c("tidyverse", "broom", "gtsummary", "ggeffects"))
library(tidyverse)
library(broom)
library(gtsummary)
library(ggeffects)
data <- read_csv2("data2.csv")
glimpse(data) # <chr> = category variable, <dbl>/<int> = numeric variable
## Rows: 150
## Columns: 7
## $ verb <chr> "give", "give", "give", "give", "give", "give",…
## $ sentence <chr> "crucial part of the marketing for many compani…
## $ RealizationOfRecipient <chr> "NP NP", "NP PP", "NP PP", "NP PP", "NP PP", "N…
## $ RecipientLength <dbl> 1, 2, 1, 2, 2, 3, 2, 3, 5, 1, 1, 1, 2, 1, 1, 1,…
## $ RecipientAnimacy <chr> "animate", "animate", "inanimate", "inanimate",…
## $ Modality <chr> "written", "written", "written", "written", "wr…
## $ ...7 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
selected_vars <- c("Modality", "RecipientLength", "RecipientAnimacy")
variable_labels <- c(
Modality = "Modality (spoken vs. written)",
RecipientLength = "Recipient Length (words)",
RecipientAnimacy = "Recipient Animacy (animate vs. inanimate)"
)
# Keep outcome + chosen predictors, drop rows with missing values,
# and make sure R treats text columns as categorical factors.
data <- data %>%
select(RealizationOfRecipient, all_of(selected_vars)) %>%
drop_na() %>%
mutate(across(where(is.character), as.factor))
# The SECOND name in this output is the outcome being modelled (here: PP).
# Keep this in mind when interpreting every number in the results.
levels(data$RealizationOfRecipient)
## [1] "NP NP" "NP PP"
# How many observations per outcome category?
data %>% count(RealizationOfRecipient)
# Descriptive summary per predictor:
# - numeric predictor → mean and SD split by outcome
# - categorical predictor → cross-tabulation with outcome
for (var in selected_vars) {
cat("\n=====", var, "=====\n")
if (is.numeric(data[[var]])) {
print(
data %>%
group_by(RealizationOfRecipient) %>%
summarise(Mean = round(mean(.data[[var]]), 2),
SD = round(sd(.data[[var]]), 2),
.groups = "drop")
)
} else {
print(table(data[[var]], data$RealizationOfRecipient))
}
}
##
## ===== Modality =====
##
## NP NP NP PP
## spoken 23 7
## written 75 45
##
## ===== RecipientLength =====
## # A tibble: 2 × 3
## RealizationOfRecipient Mean SD
## <fct> <dbl> <dbl>
## 1 NP NP 1.23 0.69
## 2 NP PP 2.27 1.32
##
## ===== RecipientAnimacy =====
##
## NP NP NP PP
## animate 90 28
## inanimate 8 24
model_full <- glm(RealizationOfRecipient ~ ., data = data, family = binomial)
# Main results table: odds ratios (OR) with 95% confidence intervals.
# This is the table you will interpret row by row in the next section.
broom::tidy(model_full, exponentiate = TRUE, conf.int = TRUE) %>%
mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
rename(Predictor = term,
OR = estimate,
SE = std.error,
z = statistic,
p = p.value,
CI_low = conf.low,
CI_high = conf.high)
# The same results in a clean, report-ready layout.
tbl_regression(model_full, exponentiate = TRUE)
| Characteristic |
OR |
95% CI |
p-value |
| Modality |
|
|
|
| spoken |
— |
— |
|
| written |
0.79 |
0.29, 2.30 |
0.6 |
| RecipientLength |
2.46 |
1.56, 4.29 |
<0.001 |
| RecipientAnimacy |
|
|
|
| animate |
— |
— |
|
| inanimate |
5.21 |
1.91, 14.9 |
0.001 |
| Abbreviations: CI = Confidence Interval, OR = Odds Ratio |
pred1 <- selected_vars[1] # first predictor, e.g. "Modality"
pred2 <- selected_vars[2] # second predictor, e.g. "LengthOfRecipient"
# Build all four models on the same dataset.
model_null <- glm(RealizationOfRecipient ~ 1, data = data, family = binomial)
model_pred1 <- glm(RealizationOfRecipient ~ get(pred1), data = data, family = binomial)
model_pred2 <- glm(RealizationOfRecipient ~ get(pred2), data = data, family = binomial)
# model_full was already fitted in section 5 above.
# ── Test 1: does pred2 add value beyond pred1? ────────────────────────────
# Read the SECOND chi-square row: null→pred1 (does pred1 help on its own?),
# then pred1→full (does pred2 add anything on top of pred1?).
cat("=== Does", pred2, "add value beyond", pred1, "? ===\n")
## === Does RecipientLength add value beyond Modality ? ===
anova(model_null, model_pred1, model_full, test = "Chisq")
# ── Test 2: does pred1 add value beyond pred2? ────────────────────────────
# Same logic, order reversed: read the SECOND row for pred1's unique contribution.
cat("\n=== Does", pred1, "add value beyond", pred2, "? ===\n")
##
## === Does Modality add value beyond RecipientLength ? ===
anova(model_null, model_pred2, model_full, test = "Chisq")
for (var in selected_vars) {
eff <- ggpredict(model_full, terms = var)
label <- if (var %in% names(variable_labels)) variable_labels[[var]] else var
print(plot(eff) + ggtitle(paste("Predicted probability of PP by", label)))
}


