Load and Explore Data

# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the transcription CSV into a data frame
data <- read.csv(file = "./data/Transcription_data.csv")

Define Analysis Function

#' Compare one metric between AI-initiated and user-initiated records
#'
#' Keeps the rows of `data` whose `Variable` is `variable_name` or
#' "AI Initiated", pivots them to one column per variable, runs a Welch
#' two-sample t-test of the metric by initiation type (printing the result),
#' draws an annotated box plot, and saves that plot as a PNG.
#'
#' Packages are referenced with `pkg::` instead of being attached with
#' `library()` so that calling the function does not alter the search path.
#'
#' @param data Long-format data frame with at least the columns `Variable`
#'   and `Value`. The remaining columns are used by `tidyr::pivot_wider()`
#'   as row identifiers (its default behaviour).
#' @param variable_name Single string naming the metric to analyse. It must
#'   occur in `data$Variable` and must not be "AI Initiated". A literal
#'   " chars" suffix on the values is stripped before numeric conversion.
#' @param output_dir Directory the PNG is written to; it is created when it
#'   does not exist. Defaults to "plots".
#' @return A list with elements `plot` (the ggplot object) and `t_test`
#'   (the `htest` object returned by `t.test()`).
analyze_variable <- function(data, variable_name, output_dir = "plots") {
  # Validate inputs up front so failures are explicit rather than surfacing
  # later as an obscure column-assignment error.
  stopifnot(
    is.data.frame(data),
    is.character(variable_name),
    length(variable_name) == 1L,
    !is.na(variable_name),
    variable_name != "AI Initiated",
    is.character(output_dir),
    length(output_dir) == 1L
  )
  missing_cols <- setdiff(c("Variable", "Value"), names(data))
  if (length(missing_cols) > 0) {
    stop(
      "`data` is missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }
  wanted <- c(variable_name, "AI Initiated")
  absent <- setdiff(wanted, data[["Variable"]])
  if (length(absent) > 0) {
    stop(
      "`data$Variable` contains no rows for: ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  # Filter and reshape data: one column per variable
  data_filtered <- tidyr::pivot_wider(
    dplyr::filter(data, Variable %in% wanted),
    names_from = Variable,
    values_from = Value
  )

  # Clean numeric variable (strip the " chars" suffix, then convert)
  data_filtered[[variable_name]] <- as.numeric(
    gsub(" chars", "", data_filtered[[variable_name]], fixed = TRUE)
  )

  # Convert AI Initiated to a factor with clear labels; "User initiated" is
  # the first level, so it is the first group reported by the t-test.
  data_filtered$InitiationType <- factor(
    ifelse(
      data_filtered[["AI Initiated"]] == "True",
      "AI initiated",
      "User initiated"
    ),
    levels = c("User initiated", "AI initiated")
  )

  # Perform t-test (t.test() defaults to the Welch unequal-variance test).
  # The formula is kept verbatim so the printed "data:" line is unchanged.
  t_test_result <- t.test(data_filtered[[variable_name]] ~ data_filtered$InitiationType)
  print(t_test_result)

  # Format p-value and degrees of freedom for display
  p_val <- signif(t_test_result$p.value, 3)
  welch_df <- round(t_test_result$parameter, 2)

  # Axis range: headroom above the maximum leaves space for the annotation.
  # abs() keeps the headroom positive for negative maxima, and the lower
  # limit follows the data when it goes below the default of -1, so no
  # observations are dropped by the scale limits.
  y_values <- data_filtered[[variable_name]]
  y_max <- max(y_values, na.rm = TRUE)
  y_min <- min(y_values, na.rm = TRUE)
  headroom <- abs(y_max)
  y_limits <- c(min(-1, y_min), y_max + 0.15 * headroom)

  # Create the plot
  p <- ggplot2::ggplot(
    data_filtered,
    ggplot2::aes(
      x = InitiationType,
      y = .data[[variable_name]],
      fill = InitiationType
    )
  ) +
    ggplot2::geom_boxplot() +
    ggplot2::geom_jitter(width = 0.2, alpha = 0.5) +
    ggplot2::annotate(
      "text",
      x = 1.5,                        # midway between the two boxes
      y = y_max + 0.05 * headroom,    # above the data, below the upper limit
      label = paste0("p = ", p_val, "\n(df = ", welch_df, ")"),
      size = 5, hjust = 0.5, vjust = 1
    ) +
    ggplot2::labs(
      title = paste("Comparison of", variable_name, "by AI Initiation"),
      x = "Initiation Type",
      y = variable_name
    ) +
    ggplot2::scale_fill_manual(
      values = c("AI initiated" = "steelblue", "User initiated" = "tomato")
    ) +
    ggplot2::scale_y_continuous(
      labels = scales::comma,
      limits = y_limits
    ) +
    ggplot2::theme_minimal() +
    ggplot2::theme(legend.position = "none")

  # Save the plot; create the target directory first so ggsave() does not
  # fail on a fresh checkout.
  if (!dir.exists(output_dir)) {
    dir.create(output_dir, recursive = TRUE)
  }
  ggplot2::ggsave(
    filename = file.path(
      output_dir,
      paste0("Transcription_", gsub(" ", "_", variable_name), ".png")
    ),
    plot = p, width = 10, height = 6, dpi = 300, bg = "white"
  )

  list(plot = p, t_test = t_test_result)
}

Sentence Length

# Compare the user's average sentence length across initiation types
result_Sentence <- analyze_variable(
  data = data,
  variable_name = "Average Sentence Length of User"
)
## 
##  Welch Two Sample t-test
## 
## data:  data_filtered[[variable_name]] by data_filtered$InitiationType
## t = 0.54789, df = 16.348, p-value = 0.5912
## alternative hypothesis: true difference in means between group User initiated and group AI initiated is not equal to 0
## 95 percent confidence interval:
##  -23.63183  40.14277
## sample estimates:
## mean in group User initiated   mean in group AI initiated 
##                     40.02778                     31.77231
# Display the box plot returned by the analysis
result_Sentence[["plot"]]

Time of conversation

# Analyze "Difference in Seconds"
result_Time <- analyze_variable(
  data = data,
  variable_name = "Difference in Seconds"
)
## 
##  Welch Two Sample t-test
## 
## data:  data_filtered[[variable_name]] by data_filtered$InitiationType
## t = 1.0087, df = 14.456, p-value = 0.3297
## alternative hypothesis: true difference in means between group User initiated and group AI initiated is not equal to 0
## 95 percent confidence interval:
##  -47.57736 132.53462
## sample estimates:
## mean in group User initiated   mean in group AI initiated 
##                     153.5556                     111.0769
result_Time[["plot"]]

Total Turns

# Analyze "Total Turns"
result_Turns <- analyze_variable(
  data = data,
  variable_name = "Total Turns"
)
## 
##  Welch Two Sample t-test
## 
## data:  data_filtered[[variable_name]] by data_filtered$InitiationType
## t = 0.23878, df = 19.611, p-value = 0.8138
## alternative hypothesis: true difference in means between group User initiated and group AI initiated is not equal to 0
## 95 percent confidence interval:
##  -6.091806  7.664456
## sample estimates:
## mean in group User initiated   mean in group AI initiated 
##                     11.55556                     10.76923
result_Turns[["plot"]]