Load and Explore Data
# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the transcription measurements from the project's data folder
data <- read.csv(file = "./data/Transcription_data.csv")
Define Analysis Function
#' Compare one measure between AI-initiated and user-initiated conversations
#'
#' Keeps the rows of `data` whose `Variable` is `variable_name` or
#' "AI Initiated", spreads them to one column per variable, runs a two-sample
#' t-test (R's default Welch variant) of the measure by initiation type,
#' prints the test, builds a box plot with jittered points annotated with the
#' p-value and degrees of freedom, and saves the plot as a PNG under `plots/`.
#'
#' Side effects: prints the t-test result and writes
#' `plots/Transcription_<variable_name with spaces as underscores>.png`
#' relative to the working directory (the directory is created if missing).
#'
#' @param data Data frame in long format with at least the columns `Variable`
#'   and `Value`. Any other columns are left to `tidyr::pivot_wider()` to
#'   identify rows.
#' @param variable_name Single string: the value of `Variable` to analyse.
#'   A trailing " chars" unit in its values is stripped before conversion to
#'   numeric.
#' @return A list with elements `plot` (the ggplot object) and `t_test`
#'   (the `htest` object returned by `t.test()`).
analyze_variable <- function(data, variable_name) {
  # Packages are referenced via `pkg::` so calling this function does not
  # alter the caller's search path.

  # Fail early with a clear message instead of a cryptic downstream error.
  if (!is.character(variable_name) || length(variable_name) != 1L ||
        is.na(variable_name)) {
    stop("`variable_name` must be a single, non-missing string.",
         call. = FALSE)
  }
  if (!all(c("Variable", "Value") %in% names(data))) {
    stop("`data` must contain the columns `Variable` and `Value`.",
         call. = FALSE)
  }
  missing_vars <- setdiff(c(variable_name, "AI Initiated"), data$Variable)
  if (length(missing_vars) > 0) {
    stop("No rows found in `data$Variable` for: ",
         paste(missing_vars, collapse = ", "),
         call. = FALSE)
  }

  # Filter and reshape data: one column per selected variable
  selected_rows <- dplyr::filter(
    data,
    Variable %in% c(variable_name, "AI Initiated")
  )
  # NOTE: the name `data_filtered` is kept on purpose; it appears verbatim in
  # the "data:" line of the printed t-test output.
  data_filtered <- tidyr::pivot_wider(
    selected_rows,
    names_from = Variable,
    values_from = Value
  )

  # Clean numeric variable (remove the " chars" unit suffix)
  data_filtered[[variable_name]] <- as.numeric(
    gsub(" chars", "", data_filtered[[variable_name]])
  )

  # Convert AI Initiated to a factor with readable labels; the level order
  # fixes both the t-test group order and the x-axis order.
  data_filtered$InitiationType <- factor(
    ifelse(
      data_filtered[["AI Initiated"]] == "True",
      "AI initiated",
      "User initiated"
    ),
    levels = c("User initiated", "AI initiated")
  )

  # Perform t-test
  t_test_result <- stats::t.test(
    data_filtered[[variable_name]] ~ data_filtered$InitiationType
  )
  print(t_test_result)

  # Format p-value and degrees of freedom for display
  p_val <- signif(t_test_result$p.value, 3)
  t_df <- round(t_test_result$parameter, 2)

  # Create the plot
  y_max <- max(data_filtered[[variable_name]], na.rm = TRUE)
  p <- ggplot2::ggplot(
    data_filtered,
    ggplot2::aes(
      x = InitiationType,
      y = .data[[variable_name]],
      fill = InitiationType
    )
  ) +
    ggplot2::geom_boxplot() +
    ggplot2::geom_jitter(width = 0.2, alpha = 0.5) +
    ggplot2::annotate(
      "text",
      x = 1.5,
      y = y_max * 1.05, # above the data, below the upper axis limit
      label = paste0("p = ", p_val, "\n(df = ", t_df, ")"),
      size = 5, hjust = 0.5, vjust = 1
    ) +
    ggplot2::labs(
      title = paste("Comparison of", variable_name, "by AI Initiation"),
      x = "Initiation Type",
      y = variable_name
    ) +
    ggplot2::scale_fill_manual(
      values = c("AI initiated" = "steelblue", "User initiated" = "tomato")
    ) +
    ggplot2::scale_y_continuous(
      labels = scales::comma,
      limits = c(-1, y_max * 1.15) # higher than data and annotation
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = "none")

  # Save the plot; make sure the output directory exists so ggsave() does not
  # fail on a fresh checkout.
  out_dir <- "plots"
  if (!dir.exists(out_dir)) {
    dir.create(out_dir, recursive = TRUE)
  }
  ggplot2::ggsave(
    filename = file.path(
      out_dir,
      paste0("Transcription_", gsub(" ", "_", variable_name), ".png")
    ),
    plot = p, width = 10, height = 6, dpi = 300, bg = "white"
  )

  list(plot = p, t_test = t_test_result)
}
Sentence Length
# Compare "Average Sentence Length of User" between initiation types
result_Sentence <- analyze_variable(
  data = data,
  variable_name = "Average Sentence Length of User"
)
##
## Welch Two Sample t-test
##
## data: data_filtered[[variable_name]] by data_filtered$InitiationType
## t = 0.54789, df = 16.348, p-value = 0.5912
## alternative hypothesis: true difference in means between group User initiated and group AI initiated is not equal to 0
## 95 percent confidence interval:
## -23.63183 40.14277
## sample estimates:
## mean in group User initiated mean in group AI initiated
## 40.02778 31.77231
# Show the box plot returned by analyze_variable()
result_Sentence$plot

Time of Conversation
# Compare "Difference in Seconds" (time of conversation) between initiation types
result_Time <- analyze_variable(
  data = data,
  variable_name = "Difference in Seconds"
)
##
## Welch Two Sample t-test
##
## data: data_filtered[[variable_name]] by data_filtered$InitiationType
## t = 1.0087, df = 14.456, p-value = 0.3297
## alternative hypothesis: true difference in means between group User initiated and group AI initiated is not equal to 0
## 95 percent confidence interval:
## -47.57736 132.53462
## sample estimates:
## mean in group User initiated mean in group AI initiated
## 153.5556 111.0769
# Show the box plot returned by analyze_variable()
result_Time$plot

Total Turns
# Compare "Total Turns" between initiation types
result_Turns <- analyze_variable(
  data = data,
  variable_name = "Total Turns"
)
##
## Welch Two Sample t-test
##
## data: data_filtered[[variable_name]] by data_filtered$InitiationType
## t = 0.23878, df = 19.611, p-value = 0.8138
## alternative hypothesis: true difference in means between group User initiated and group AI initiated is not equal to 0
## 95 percent confidence interval:
## -6.091806 7.664456
## sample estimates:
## mean in group User initiated mean in group AI initiated
## 11.55556 10.76923
# Show the box plot returned by analyze_variable()
result_Turns$plot
