Success failure experiment

Data loading

We load the data files from the participants actions folder.

The data is in the format of a csv file with the following columns: - Timestamp: The time the action was taken - Session: The session (participant number) - AI Initiates: Whether the AI initiated the action - User Initiates: Whether the user initiated the action - Success: Whether the interaction was successful - Failure: Whether the interaction failed - Leaves: Whether the user left the session before satisfactory interaction - Asks for Help: Whether the user asked facilitators for help

library(ggplot2)
library(tidyr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Locate every CSV export in the participants' actions folder.
file_list <- list.files(
  path = "../Participants_Actions/",
  pattern = "\\.csv$",
  full.names = TRUE
)

# Read each file with its header text untouched and strings kept as character,
# then stack the per-file tables (in file order) into one data frame.
data_list <- lapply(
  file_list,
  read.csv,
  header = TRUE, check.names = FALSE, stringsAsFactors = FALSE
)
all_data <- do.call(rbind, data_list)

# Clean up the data: spaces in column names become underscores
colnames(all_data) <- gsub(" ", "_", colnames(all_data))

## Ensure logicals are treated properly
flag_columns <- c(
  "AI_Initiates", "User_Initiates", "Success",
  "Failure", "Leaves", "Asks_for_Help"
)
all_data[flag_columns] <- lapply(all_data[flag_columns], as.logical)

## Convert Timestamp to POSIXct
all_data$Timestamp <- as.POSIXct(all_data$Timestamp)

## Show the structure of the data
str(all_data)

## 'data.frame':    185 obs. of  8 variables:
##  $ Timestamp     : POSIXct, format: "2025-05-02 13:42:11" "2025-05-02 13:45:55" ...
##  $ Session       : int  2 6 6 7 10 11 2 3 4 5 ...
##  $ AI_Initiates  : logi  TRUE TRUE TRUE FALSE TRUE FALSE ...
##  $ User_Initiates: logi  FALSE FALSE FALSE TRUE FALSE TRUE ...
##  $ Success       : logi  FALSE TRUE FALSE FALSE TRUE FALSE ...
##  $ Failure       : logi  TRUE FALSE FALSE TRUE FALSE TRUE ...
##  $ Leaves        : logi  FALSE FALSE TRUE FALSE FALSE FALSE ...
##  $ Asks_for_Help : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

colnames(all_data)

## [1] "Timestamp"      "Session"        "AI_Initiates"   "User_Initiates"
## [5] "Success"        "Failure"        "Leaves"         "Asks_for_Help"

Plan

The experiment was conducted over multiple sessions. The experimental setup and the criteria for participation changed slightly between sessions, as both were refined over time; the details can be read in the report.

# Number of sessions: one entry in `data_list` per loaded CSV file.
sessions <- length(data_list)

# Rows recorded in each file. vapply() guarantees an integer vector even when
# `data_list` is empty (sapply() would return an empty list instead).
session_sizes <- vapply(data_list, nrow, integer(1))

# Duration of each session in hours, measured from the earliest to the latest
# timestamp in the file, so the result does not depend on the row order.
# A file without rows yields NA.
session_hours <- vapply(data_list, function(df) {
  if (nrow(df) == 0) {
    return(NA_real_)
  }
  span <- range(as.POSIXct(df$Timestamp), na.rm = TRUE)
  as.numeric(difftime(span[2], span[1], units = "hours"))
}, numeric(1))
session_hours <- round(session_hours, 2)


# Create a data frame with one row per session.
# seq_len() avoids the c(1, 0) sequence that 1:sessions gives for 0 sessions.
session_data <- data.frame(
  Session = seq_len(sessions),
  Session_size = session_sizes,
  Session_hours = session_hours
)

session_data

##   Session Session_size Session_hours
## 1       1            6          0.56
## 2       2           54          3.83
## 3       3            5          1.43
## 4       4           21          4.36
## 5       5            8          1.54
## 6       6           91          5.12

How many people per hour?

# Participants per hour: `participants` divided by `hours`.
# Both arguments may be vectors; the division is element-wise.
dateovertime <- function(hours, participants) {
  participants / hours
}

# Collapse the six sessions into four setups:
# sessions 1+2 -> setup 1, session 3 -> setup 2,
# sessions 4+5 -> setup 3, session 6 -> setup 4.
# Sizes and hours are summed per setup; people_pr_hour is their ratio.
sessions_in_setup <- list(1:2, 3, 4:5, 6)
setup_size <- unlist(lapply(sessions_in_setup, function(idx) sum(session_sizes[idx])))
setup_hours <- unlist(lapply(sessions_in_setup, function(idx) sum(session_hours[idx])))

session_data_combined <- data.frame(
  Setup = c(1, 2, 3, 4),
  size = setup_size,
  hours = setup_hours,
  people_pr_hour = setup_size / setup_hours
)
session_data_combined

##   Setup size hours people_pr_hour
## 1     1   60  4.39      13.667426
## 2     2    5  1.43       3.496503
## 3     3   29  5.90       4.915254
## 4     4   91  5.12      17.773438

# Tag every event row with its setup: each setup ID is repeated once per row
# that belongs to it. This is purely positional, so it relies on the rows of
# all_data still being in the order in which the files were stacked.
setup_ids <- rep(session_data_combined$Setup, times = session_data_combined$size)
all_data$Setup <- setup_ids


library(dplyr)

# Number of successes and failures per setup (missing values ignored).
outcome_summary <- summarise(
  group_by(all_data, Setup),
  Successes = sum(Success, na.rm = TRUE),
  Failures = sum(Failure, na.rm = TRUE)
)
outcome_summary

## # A tibble: 4 × 3
##   Setup Successes Failures
##   <dbl>     <int>    <int>
## 1     1         6       53
## 2     2         4        0
## 3     3        10       18
## 4     4        10       76

# Attach the outcome counts to the per-setup table; a setup without any
# recorded outcome gets 0 rather than NA.
final_data <- left_join(session_data_combined, outcome_summary, by = "Setup")
final_data <- replace_na(final_data, list(Successes = 0, Failures = 0))
final_data

##   Setup size hours people_pr_hour Successes Failures
## 1     1   60  4.39      13.667426         6       53
## 2     2    5  1.43       3.496503         4        0
## 3     3   29  5.90       4.915254        10       18
## 4     4   91  5.12      17.773438        10       76

# Convert the outcome counts into hourly rates for each setup.
final_data$successes_per_hour <- final_data$Successes / final_data$hours
final_data$failures_per_hour <- final_data$Failures / final_data$hours
final_data

##   Setup size hours people_pr_hour Successes Failures successes_per_hour
## 1     1   60  4.39      13.667426         6       53           1.366743
## 2     2    5  1.43       3.496503         4        0           2.797203
## 3     3   29  5.90       4.915254        10       18           1.694915
## 4     4   91  5.12      17.773438        10       76           1.953125
##   failures_per_hour
## 1         12.072893
## 2          0.000000
## 3          3.050847
## 4         14.843750

library(ggplot2)
library(tidyr)

# Long format: one row per (setup, outcome rate) so both rates share one plot.
plot_data <- pivot_longer(
  select(final_data, Setup, successes_per_hour, failures_per_hour),
  cols = c(successes_per_hour, failures_per_hour),
  names_to = "Outcome",
  values_to = "Per_Hour"
)

# Fill colour for each rate column.
outcome_colours <- c(
  "successes_per_hour" = "steelblue",
  "failures_per_hour" = "tomato"
)

# Side-by-side bars of the hourly success and failure rate per setup.
ggplot(plot_data, aes(x = factor(Setup), y = Per_Hour, fill = Outcome)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = outcome_colours) +
  labs(
    x = "Setup",
    y = "Per Hour",
    title = "Successes and Failures per Hour by Setup"
  ) +
  theme_minimal()

# Count outcomes per setup, split by who initiated the interaction.
outcome_split <- summarise(
  group_by(all_data, Setup),
  Success_AI = sum(Success & AI_Initiates, na.rm = TRUE),
  Success_User = sum(Success & User_Initiates, na.rm = TRUE),
  Failure_AI = sum(Failure & AI_Initiates, na.rm = TRUE),
  Failure_User = sum(Failure & User_Initiates, na.rm = TRUE),
  .groups = "drop"
)

# Attach the split counts to the per-setup table and turn them into rates.
final_data <- left_join(final_data, outcome_split, by = "Setup")
final_data <- mutate(
  final_data,
  success_ai_per_hour = Success_AI / hours,
  success_user_per_hour = Success_User / hours,
  failure_ai_per_hour = Failure_AI / hours,
  failure_user_per_hour = Failure_User / hours
)

# Legend text for each rate column.
outcome_labels <- c(
  "success_ai_per_hour" = "Success (AI Initiated)",
  "success_user_per_hour" = "Success (User Initiated)",
  "failure_ai_per_hour" = "Failure (AI Initiated)",
  "failure_user_per_hour" = "Failure (User Initiated)"
)

# Long format with the column names swapped for their legend text.
plot_data <- pivot_longer(
  select(
    final_data, Setup,
    success_ai_per_hour, success_user_per_hour,
    failure_ai_per_hour, failure_user_per_hour
  ),
  cols = -Setup,
  names_to = "Outcome",
  values_to = "Per_Hour"
)
plot_data$Outcome <- unname(outcome_labels[plot_data$Outcome])


# Plot
outcome_colours <- c(
  "Success (AI Initiated)" = "cyan",
  "Success (User Initiated)" = "steelblue",
  "Failure (AI Initiated)" = "brown",
  "Failure (User Initiated)" = "tomato"
)
ggplot(plot_data, aes(x = factor(Setup), y = Per_Hour, fill = Outcome)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = outcome_colours) +
  labs(
    x = "Setup",
    y = "Participants Per Hour",
    title = "AI/User Initiated Successes and Failures per Hour by Setup"
  ) +
  theme_minimal()

# No plot is passed, so ggsave() writes the most recently displayed plot.
ggsave("plots/Successes_and_Failures_per_Hour_by_Setup.png", width = 8, height = 4, dpi = 300, bg = "white")

Plots

The raw data for each session can be seen below

Session success over time

# Calculate the start index of each session: the first session starts at
# event 1 and each later one starts right after the cumulative row count of
# the sessions before it.
# NOTE(review): these offsets follow the order of `session_sizes`, while
# `plot_data` below is ordered by Timestamp; presumably the files are already
# in chronological order. Confirm.
session_starts <- c(1, cumsum(session_sizes) + 1)
session_starts <- session_starts[-length(session_starts)]  # Remove the last over-count

# Session labels. paste0() with explicit separators avoids the doubled spaces
# that paste() inserted around each piece, and seq_along() avoids the c(1, 0)
# sequence that 1:length() yields for an empty vector.
session_labels <- paste0("Session - ", seq_along(session_sizes), " - ", session_hours, " Hours")

# One row per event in time order, classified by outcome and initiator.
# Rows matching none of the four combinations are labelled "Unknown".
plot_data <- all_data %>%
  arrange(Timestamp) %>%
  mutate(
    EventIndex = row_number(),
    Outcome = case_when(
      Success == 1 & User_Initiates == 1 ~ "Success User",
      Success == 1 & AI_Initiates == 1 ~ "Success AI",
      Failure == 1 & User_Initiates == 1 ~ "Failure User",
      Failure == 1 & AI_Initiates == 1 ~ "Failure AI",
      TRUE ~ "Unknown"
    )
  )

# Every event is a point on a single horizontal line; dashed vertical lines
# and rotated labels mark where each session starts.
p <- ggplot(plot_data, aes(x = EventIndex, y = 1, color = Outcome)) +
  geom_point(size = 3) +
  geom_vline(xintercept = session_starts, linetype = "dashed", color = "black", alpha = 0.5) +
  geom_text(data = data.frame(x = session_starts, y = 1.08, label = session_labels),
            aes(x = x, y = y, label = label),
            inherit.aes = FALSE, angle = 90, vjust = 1.2, size = 3) +  # changed vjust for better centering
  scale_color_manual(values = c("Success User" = "steelblue", "Success AI" = "cyan",
                                "Failure User" = "tomato", "Failure AI" = "brown",
                                "Unknown" = "grey")) +
  labs(
    title = "Session Outcomes in Sequence",
    x = "Session Index",
    y = "",
    color = "Outcome"
  ) +
  scale_x_continuous(breaks = seq(0, max(plot_data$EventIndex), by = 10)) +
  expand_limits(y = c(0.95, 1.15)) +  # Expand y limits slightly
  theme_minimal() +
  theme(axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        panel.grid.major.y = element_blank())

ggsave("plots/Session_Outcomes_with_Session_Markers.png", plot = p, width = 8, height = 3, dpi = 300, bg = "white")

p

#### Bar graph

The data is a little misleading, as the facilitators introduced an error into the data. Whenever users were presented with the “User initiated” condition, they were often not sure how to interact with the agent. They therefore came to the facilitators, who told them “Just say hi”. This was not ideal, but it was done to gain more qualitative data on the experience of initiating or not initiating a conversation. However, these people could have been either successes or failures, and their true outcome could no longer be recorded, because all of these participants simply became successes.

# Count successes and failures per initiator and help-request status, and
# build the combined "outcome + help" category used for the fill colour.
plot_data <- all_data %>%
  pivot_longer(cols = c(Success, Failure), names_to = "Outcome", values_to = "Count") %>%
  group_by(AI_Initiates, Outcome, Asks_for_Help) %>%
  # na.rm = TRUE matches the other per-setup summaries in this report.
  summarise(Count = sum(Count, na.rm = TRUE), .groups = "drop") %>%
  mutate(
    Combined = case_when(
      Outcome == "Success" & !Asks_for_Help ~ "Success (No Help)",
      Outcome == "Success" & Asks_for_Help ~ "Success (Asked Help)",
      Outcome == "Failure" & !Asks_for_Help ~ "Failure (No Help)",
      Outcome == "Failure" & Asks_for_Help ~ "Failure (Asked Help)"
    ),
    # AI_Initiates is logical, so the levels must be FALSE/TRUE. factor()
    # matches levels against as.character(x); with levels c(0, 1) nothing
    # matches "TRUE"/"FALSE" and every label becomes NA.
    AI_Initiates_Label = factor(
      AI_Initiates,
      levels = c(FALSE, TRUE),
      labels = c("User Initiated", "AI Initiated")
    )
  )

# Plot stacked bars by Combined, one facet per outcome
p <- ggplot(plot_data, aes(x = AI_Initiates_Label, y = Count, fill = Combined)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_wrap(~Outcome) +
  scale_fill_manual(values = c(
    "Success (No Help)" = "steelblue",
    "Success (Asked Help)" = '#A3717C',
    "Failure (No Help)" = "tomato",
    "Failure (Asked Help)" = "gold"  # Add if exists
  )) +
  labs(
    title = "Success/Failure by Initiator and Help Request (Colored by Help + Outcome)",
    x = "Who Initiated",
    y = "Participants",
    fill = "Outcome + Help"
  ) +
  scale_y_continuous(breaks = seq(0, max(plot_data$Count), by = 5)) +
  theme_minimal()

ggsave("plots/SuccessFailure_Stacked_by_Help_and_Outcome_Labelled.png", plot = p, width = 8, height = 4, dpi = 300, bg = "white")

p

Statistical analysis

Understanding whether the condition changed user behavior is done by finding the ratio of success to failure in both conditions as well as their dispersion.

# Success proportion with a normal-approximation 95% interval.
#
# successes, failures: counts for one group.
# Returns a list with
#   success_rate: successes / (successes + failures)
#   sd:           sqrt(rate * (1 - rate) / n)
#   ci_95:        c(lower, upper) = rate -/+ 1.96 * sd
# The interval is not clipped, so it can extend below 0 or above 1.
analyze <- function(successes, failures) {
  total <- successes + failures
  rate <- successes / total
  spread <- sqrt(rate * (1 - rate) / total)
  half_width <- 1.96 * spread
  list(
    success_rate = rate,
    sd = spread,
    ci_95 = c(rate - half_width, rate + half_width)
  )
}

# Success and failure counts per condition:
# element 1 = AI initiated, element 2 = user initiated.
ai_rows <- all_data$AI_Initiates == TRUE
user_rows <- all_data$AI_Initiates == FALSE

successes <- c(sum(all_data$Success[ai_rows]), sum(all_data$Success[user_rows]))
failures <- c(sum(all_data$Failure[ai_rows]), sum(all_data$Failure[user_rows]))

ai_result <- analyze(successes[1], failures[1])
user_result <- analyze(successes[2], failures[2])

# One row per condition: counts, success rate, its spread and 95% interval.
per_group <- list(ai_result, user_result)
results_df <- data.frame(
  Group = c("AI Initiated", "User Initiated"),
  Successes = successes,
  Failures = failures,
  Success_Rate = vapply(per_group, function(g) g$success_rate, numeric(1)),
  SD = vapply(per_group, function(g) g$sd, numeric(1)),
  CI_Low = vapply(per_group, function(g) g$ci_95[1], numeric(1)),
  CI_High = vapply(per_group, function(g) g$ci_95[2], numeric(1))
)
# Print the results
print(results_df)

##            Group Successes Failures Success_Rate         SD     CI_Low
## 1   AI Initiated        16       73    0.1797753 0.04070394 0.09999557
## 2 User Initiated        14       74    0.1590909 0.03899024 0.08267005
##     CI_High
## 1 0.2595550
## 2 0.2355118

# Fisher's test input: 2x2 table of outcome counts per condition.
# Keep only the two groups being compared.
contingency_data <- results_df %>%
  filter(Group %in% c("AI Initiated", "User Initiated")) %>%
  select(Group, Successes, Failures)

# Rows are the conditions and columns are the outcomes. The previous dimnames
# had these swapped, so the printed table showed the AI group's failures under
# "User Initiated". The test result is unaffected by the relabelling.
contingency_table <- as.matrix(contingency_data[, c("Successes", "Failures")])
dimnames(contingency_table) <- list(
  c("AI Initiated", "User Initiated"),
  c("Successes", "Failures")
)

print(contingency_table)

##           AI Initiated User Initiated
## Successes           16             73
## Failures            14             74

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, alternative = "two.sided")
print(fisher_test_result)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table
## p-value = 0.8416
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4894725 2.7653626
## sample estimates:
## odds ratio 
##   1.157549

# Plot the success rate of each condition with its 95% confidence interval.
# The subtitle names the comparison actually tested here (AI initiated vs
# user initiated); it previously read "AI vs AI no ask".
p <- ggplot(results_df, aes(x = Group, y = Success_Rate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = CI_Low, ymax = CI_High), width = 0.15) +
  scale_y_continuous(limits = c(-0.5, 1)) +
  labs(
    subtitle = paste0("Fisher's Exact Test (AI vs User) p = ", signif(fisher_test_result$p.value, 3)),
    title = "Success Rate by AI Initiation",
    x = "Condition",
    y = "Success Rate"
  ) +
  theme_minimal()
p

ggsave("plots/Success_Rate_by_AI_Initiation.png", plot = p, width = 8, height = 3, dpi = 300, bg = "white")

# Success proportion with a normal-approximation 95% interval.
#
# successes, failures: counts for one group.
# Returns a list with
#   success_rate: successes / (successes + failures)
#   sd:           sqrt(rate * (1 - rate) / n)
#   ci_95:        c(lower, upper) = rate -/+ 1.96 * sd
# The interval is not clipped, so it can extend below 0 or above 1.
analyze <- function(successes, failures) {
  total <- successes + failures
  rate <- successes / total
  spread <- sqrt(rate * (1 - rate) / total)
  half_width <- 1.96 * spread
  list(
    success_rate = rate,
    sd = spread,
    ci_95 = c(rate - half_width, rate + half_width)
  )
}

# Success and failure counts: element 1 = AI initiated, element 2 = user
# initiated interactions where the user did not ask the facilitators for help.
ai_rows <- all_data$AI_Initiates == TRUE
user_no_help_rows <- all_data$AI_Initiates == FALSE & all_data$Asks_for_Help == FALSE

successes <- c(sum(all_data$Success[ai_rows]), sum(all_data$Success[user_no_help_rows]))
failures <- c(sum(all_data$Failure[ai_rows]), sum(all_data$Failure[user_no_help_rows]))

ai_result <- analyze(successes[1], failures[1])
noASK <- analyze(successes[2], failures[2])

# One row per group: counts, success rate, its spread and 95% interval.
per_group <- list(ai_result, noASK)
results_df <- data.frame(
  Group = c("AI Initiated", "User Initiated without asking for help"),
  Successes = successes,
  Failures = failures,
  Success_Rate = vapply(per_group, function(g) g$success_rate, numeric(1)),
  SD = vapply(per_group, function(g) g$sd, numeric(1)),
  CI_Low = vapply(per_group, function(g) g$ci_95[1], numeric(1)),
  CI_High = vapply(per_group, function(g) g$ci_95[2], numeric(1))
)
# Print the results
print(results_df)

##                                    Group Successes Failures Success_Rate
## 1                           AI Initiated        16       73   0.17977528
## 2 User Initiated without asking for help         7       74   0.08641975
##           SD     CI_Low   CI_High
## 1 0.04070394 0.09999557 0.2595550
## 2 0.03122032 0.02522793 0.1476116

# Fisher's test input: 2x2 table of outcome counts per group.
# Keep only the two groups being compared.
contingency_data <- results_df %>%
  filter(Group %in% c("AI Initiated", "User Initiated without asking for help")) %>%
  select(Group, Successes, Failures)

# Rows are the groups and columns are the outcomes. The previous dimnames had
# these swapped, so the printed table was mislabelled. The test result is
# unaffected by the relabelling.
contingency_table <- as.matrix(contingency_data[, c("Successes", "Failures")])
dimnames(contingency_table) <- list(
  c("AI Initiated", "User Initiated w/o Help"),
  c("Successes", "Failures")
)

print(contingency_table)

##           AI Initiated User Initiated w/o Help
## Successes           16                      73
## Failures             7                      74

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, alternative = "two.sided")
print(fisher_test_result)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table
## p-value = 0.1147
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.8374566 7.0356439
## sample estimates:
## odds ratio 
##   2.305987

# Success rate per group with its 95% interval; the Fisher p-value goes in
# the subtitle.
fisher_subtitle <- paste0(
  "Fisher's Exact Test (AI vs User no ask) p = ",
  signif(fisher_test_result$p.value, 3)
)
p <- ggplot(results_df, aes(x = Group, y = Success_Rate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = CI_Low, ymax = CI_High), width = 0.15) +
  scale_y_continuous(limits = c(-0.5, 1)) +
  theme_minimal() +
  labs(
    title = "Success Rate by AI Initiation -w/o asking facilitators",
    subtitle = fisher_subtitle,
    x = "Condition",
    y = "Success Rate"
  )
p

ggsave("plots/Success_Rate_by_AI_Initiation_-_no_asked.png", plot = p, width = 8, height = 3, dpi = 300, bg = "white")

Grasping at straws

What if we assume that everyone who asked for help would otherwise have given up on Rosie and walked away defeated?

# Counts for the scenario where asking the facilitators for help is treated
# as a failure. Element 1 = AI initiated, element 2 = user initiated.
ai_rows <- all_data$AI_Initiates == TRUE
user_rows <- all_data$AI_Initiates == FALSE
asked_rows <- all_data$Asks_for_Help == TRUE
no_help_rows <- all_data$Asks_for_Help == FALSE

# Successes: user-initiated successes only count when no help was asked for.
successes <- c(
  sum(all_data$Success[ai_rows]),
  sum(all_data$Success[user_rows & no_help_rows])
)

# Failures: user-initiated failures without help, plus the user-initiated
# successes that involved asking for help (now re-counted as failures).
# Only user-initiated rows are moved; previously the help-askers of both
# conditions were added here, which inflated the user-initiated group with
# AI-initiated participants.
failures <- c(
  sum(all_data$Failure[ai_rows]),
  sum(all_data$Failure[user_rows & no_help_rows]) +
    sum(all_data$Success[user_rows & asked_rows])
)

# Create analyzed results
ai_result <- analyze(successes[1], failures[1])
noASK_result <- analyze(successes[2], failures[2])

# Check the revised numbers
print(successes)

## [1] 16  7

print(failures)

## [1] 73 82

# Create a data frame for the results.
# The user-initiated row uses `noASK_result`, the analysis computed from this
# section's counts. It previously used `noASK` from the earlier "without
# asking for help" section, so the rate, SD and interval did not match the
# Successes/Failures shown in the same row.
results_df <- data.frame(
  Group = c("AI Initiated", "User Initiated without asked for help"),
  Successes = successes,
  Failures = failures,
  Success_Rate = c(ai_result$success_rate, noASK_result$success_rate),
  SD = c(ai_result$sd, noASK_result$sd),
  CI_Low  =  c(ai_result$ci_95[1], noASK_result$ci_95[1]),
  CI_High =  c(ai_result$ci_95[2], noASK_result$ci_95[2])
)
results_df

##                                   Group Successes Failures Success_Rate
## 1                          AI Initiated        16       73   0.17977528
## 2 User Initiated without asked for help         7       82   0.08641975
##           SD     CI_Low   CI_High
## 1 0.04070394 0.09999557 0.2595550
## 2 0.03122032 0.02522793 0.1476116

# Fisher's test input: 2x2 table of outcome counts per group.
# Keep only the two groups being compared.
contingency_data <- results_df %>%
  filter(Group %in% c("AI Initiated", "User Initiated without asked for help")) %>%
  select(Group, Successes, Failures)

# Rows are the groups and columns are the outcomes. The previous dimnames had
# these swapped, so the printed table was mislabelled. The test result is
# unaffected by the relabelling.
contingency_table <- as.matrix(contingency_data[, c("Successes", "Failures")])
dimnames(contingency_table) <- list(
  c("AI Initiated", "User Initiated w/o Help"),
  c("Successes", "Failures")
)

print(contingency_table)

##           AI Initiated User Initiated w/o Help
## Successes           16                      73
## Failures             7                      82

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_table, alternative = "two.sided")
print(fisher_test_result)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_table
## p-value = 0.07209
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.9307952 7.7712041
## sample estimates:
## odds ratio 
##   2.554301

print(fisher_test_result$p.value)

## [1] 0.07208567

# Print the results
print(results_df)

##                                   Group Successes Failures Success_Rate
## 1                          AI Initiated        16       73   0.17977528
## 2 User Initiated without asked for help         7       82   0.08641975
##           SD     CI_Low   CI_High
## 1 0.04070394 0.09999557 0.2595550
## 2 0.03122032 0.02522793 0.1476116

# Success rate per group with its 95% interval; the Fisher p-value goes in
# the subtitle.
fisher_subtitle <- paste0(
  "Fisher's Exact Test AI vs user no ask where asking is failure p = ",
  signif(fisher_test_result$p.value, 3)
)
p <- ggplot(results_df, aes(x = Group, y = Success_Rate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = CI_Low, ymax = CI_High), width = 0.15) +
  scale_y_continuous(limits = c(-0.5, 1)) +
  theme_minimal() +
  labs(
    title = "Success Rate by AI Initiation where asking is failing",
    subtitle = fisher_subtitle,
    x = "Condition",
    y = "Success Rate"
  )
p

ggsave("plots/Success_Rate_by_AI_Initiation_-_asking_is_failing.png", plot = p, width = 8, height = 3, dpi = 300, bg = "white")

Higher likelihood of asking if user initiated?

# Among successful interactions, count how many involved asking the
# facilitators for help and how many did not, per condition
# (element 1 = AI initiated, element 2 = user initiated).
# NOTE(review): only rows with Success are counted, so the rate below is the
# share of *successes* that asked for help, not the share of all participants
# in the condition. Confirm this is the intended denominator.
ai_rows <- all_data$AI_Initiates == TRUE
user_rows <- all_data$AI_Initiates == FALSE
asked_rows <- all_data$Asks_for_Help == TRUE
no_help_rows <- all_data$Asks_for_Help == FALSE

Asked <- c(
  sum(all_data$Success[ai_rows & asked_rows]),
  sum(all_data$Success[user_rows & asked_rows])
)
DidNotAsk <- c(
  sum(all_data$Success[ai_rows & no_help_rows]),
  sum(all_data$Success[user_rows & no_help_rows])
)

# Create analyzed results: "asked" plays the role of the success count.
ai_result <- analyze(Asked[1], DidNotAsk[1])
user_result <- analyze(Asked[2], DidNotAsk[2])


# One row per condition. The rate column keeps the name Success_Rate but
# holds Asked / (Asked + DidNotAsk).
per_group <- list(ai_result, user_result)
results_df <- data.frame(
  Group = c("AI Initiated", "User Initiated"),
  Asked = Asked,
  DidNotAsk = DidNotAsk,
  Success_Rate = vapply(per_group, function(g) g$success_rate, numeric(1)),
  SD = vapply(per_group, function(g) g$sd, numeric(1)),
  CI_Low = vapply(per_group, function(g) g$ci_95[1], numeric(1)),
  CI_High = vapply(per_group, function(g) g$ci_95[2], numeric(1))
)
# Print the results
print(results_df)

##            Group Asked DidNotAsk Success_Rate         SD      CI_Low   CI_High
## 1   AI Initiated     1        15       0.0625 0.06051536 -0.05611011 0.1811101
## 2 User Initiated     7         7       0.5000 0.13363062  0.23808398 0.7619160

# Filter only the two groups
contingency_data <- results_df %>%
  filter(Group %in% c("AI Initiated", "User Initiated")) %>%
  select(Group, Asked, DidNotAsk)
contingency_data

##            Group Asked DidNotAsk
## 1   AI Initiated     1        15
## 2 User Initiated     7         7

# Drop the first (Group) column so that only the two count columns remain,
# then label the rows by condition and the columns by help status.
contingency_data <- contingency_data[, -1]
dimnames(contingency_data) <- list(
  c("AI Initiated", "User Initiated"),
  c("Asked", "DidNotAsk")
)


contingency_data

##                Asked DidNotAsk
## AI Initiated       1        15
## User Initiated     7         7

# Perform Fisher's exact test
fisher_test_result <- fisher.test(contingency_data, alternative = "two.sided")
print(fisher_test_result)

## 
##  Fisher's Exact Test for Count Data
## 
## data:  contingency_data
## p-value = 0.01209
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.001395919 0.741585602
## sample estimates:
## odds ratio 
## 0.07342368

print(fisher_test_result$p.value)

## [1] 0.01209395

# Asking rate per condition with its 95% interval; the Fisher p-value goes in
# the subtitle.
fisher_subtitle <- paste0(
  "Fisher's Exact Test Asking vs not asking vs AI vs user p = ",
  signif(fisher_test_result$p.value, 3)
)
p <- ggplot(results_df, aes(x = Group, y = Success_Rate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = CI_Low, ymax = CI_High), width = 0.15) +
  scale_y_continuous(limits = c(-0.5, 1)) +
  theme_minimal() +
  labs(
    title = "Askin Rate by AI Initiation",
    subtitle = fisher_subtitle,
    x = "Condition",
    y = "Asked Rate"
  )
p

ggsave("plots/Askin_Rate_by_AI_Initiation.png", plot = p, width = 8, height = 3, dpi = 300, bg = "white")

Combined plots

# Success proportion with a normal-approximation 95% interval.
#
# successes, failures: counts for one group.
# Returns a list with
#   success_rate: successes / (successes + failures)
#   sd:           sqrt(rate * (1 - rate) / n)
#   ci_95:        c(lower, upper) = rate -/+ 1.96 * sd
# The interval is not clipped, so it can extend below 0 or above 1.
analyze <- function(successes, failures) {
  total <- successes + failures
  rate <- successes / total
  spread <- sqrt(rate * (1 - rate) / total)
  half_width <- 1.96 * spread
  list(
    success_rate = rate,
    sd = spread,
    ci_95 = c(rate - half_width, rate + half_width)
  )
}

library(Hmisc)

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:dplyr':
## 
##     src, summarize

## The following objects are masked from 'package:base':
## 
##     format.pval, units

# Success proportion with a Wilson score interval.
#
# successes, failures: counts for one group.
# Returns a list with
#   success_rate: successes / (successes + failures)
#   sd:           sqrt(rate * (1 - rate) / n)
#   ci_lower, ci_upper: 2nd and 3rd element of the binconf() result
#                       requested with method = "wilson"
analyse_wilson_Confidence <- function(successes, failures) {
  total <- successes + failures
  rate <- successes / total
  wilson <- binconf(x = successes, n = total, method = "wilson")
  list(
    success_rate = rate,
    sd = sqrt(rate * (1 - rate) / total),
    ci_lower = wilson[2],
    ci_upper = wilson[3]
  )
}




# Row masks for the two conditions and for help requests.
ai_rows <- all_data$AI_Initiates == TRUE
user_rows <- all_data$AI_Initiates == FALSE
asked_rows <- all_data$Asks_for_Help == TRUE
no_help_rows <- all_data$Asks_for_Help == FALSE

# Success counts for the four groups:
# 1 = AI initiated, 2 = user initiated, 3 = user initiated without help,
# 4 = user initiated with help-askers re-counted as failures.
successes <- c(
  sum(all_data$Success[ai_rows]),
  sum(all_data$Success[user_rows]),
  sum(all_data$Success[user_rows & no_help_rows]),
  sum(all_data$Success[user_rows]) - sum(all_data$Success[user_rows & asked_rows])
)

# Failure counts for the same four groups. For group 4 only user-initiated
# help-askers are moved from success to failure; previously the help-askers
# of both conditions were added, which mixed AI-initiated participants into
# the user-initiated group.
failures <- c(
  sum(all_data$Failure[ai_rows]),
  sum(all_data$Failure[user_rows]),
  sum(all_data$Failure[user_rows & no_help_rows]),
  sum(all_data$Failure[user_rows]) +
    sum(all_data$Success[user_rows & asked_rows])
)

# Create analyzed results
ai_result <- analyse_wilson_Confidence(successes[1], failures[1])
user_result <- analyse_wilson_Confidence(successes[2], failures[2])
noASK_result <- analyse_wilson_Confidence(successes[3], failures[3])
ask_failing_result <- analyse_wilson_Confidence(successes[4], failures[4])


# One row per group: success rate and Wilson interval bounds.
results_df <- data.frame(
  Group = c("AI Initiated", "User Initiated", "w/o Asked for Help", "Asking is failing"),
  Success_Rate = c(ai_result$success_rate, user_result$success_rate,
                   noASK_result$success_rate, ask_failing_result$success_rate),
  CI_Low = c(ai_result$ci_lower, user_result$ci_lower,
             noASK_result$ci_lower, ask_failing_result$ci_lower),
  CI_High = c(ai_result$ci_upper, user_result$ci_upper,
              noASK_result$ci_upper, ask_failing_result$ci_upper)
)


# Run Fisher's exact tests: the AI-initiated group (element 1) against each
# of the other three groups (elements 2 to 4), keeping only the p-values.
pvals <- vapply(2:4, function(i) {
  counts <- matrix(c(successes[1], failures[1], successes[i], failures[i]), nrow = 2)
  fisher.test(counts)$p.value
}, numeric(1))
# Full test output for AI initiated vs user initiated.
fisher.test(matrix(c(successes[1], failures[1], successes[2], failures[2]), nrow = 2))

## 
##  Fisher's Exact Test for Count Data
## 
## data:  matrix(c(successes[1], failures[1], successes[2], failures[2]), nrow = 2)
## p-value = 0.8416
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  0.4894725 2.7653626
## sample estimates:
## odds ratio 
##   1.157549

# Format p-values for plotting: the first group (the reference everything is
# compared against) gets an empty label, the others "p = <3 significant digits>".
p_labels <- c("", paste0("p = ", signif(pvals, 3)))

results_df$p_label <- p_labels

results_df

##                Group Success_Rate     CI_Low   CI_High    p_label
## 1       AI Initiated   0.17977528 0.11379893 0.2722512           
## 2     User Initiated   0.15909091 0.09719902 0.2495012  p = 0.842
## 3 w/o Asked for Help   0.08641975 0.04249258 0.1677992  p = 0.115
## 4  Asking is failing   0.07865169 0.03861982 0.1535514 p = 0.0721

# Row order is kept as 1, 2, 3, 4 (the indexing below leaves it unchanged);
# edit the index vector to reorder the groups on the x axis.
results_df <- results_df[c(1, 2, 3, 4), ]
# Fix the factor levels to the row order so ggplot does not sort the groups
# alphabetically.
results_df$Group <- factor(results_df$Group, levels = results_df$Group)



# Success rate per group with its Wilson interval, the Fisher p-value printed
# above each point, and a red reference line at a 50% success rate.
p <- ggplot(results_df, aes(x = Group, y = Success_Rate)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = CI_Low, ymax = CI_High), width = 0.15) +
  geom_text(aes(label = p_label, y = Success_Rate + 0.05), size = 3, vjust = -4) +
  # `linewidth` replaces `size` for lines, which is deprecated since
  # ggplot2 3.4.0.
  geom_hline(yintercept = 0.5, linetype = "solid", color = "red", linewidth = 1.2) +
  scale_y_continuous(limits = c(0, 1)) +
  labs(
    title = "Success Rate by AI Initiation",
    subtitle = "Fisher's Exact Test p-values using Wilson score interval",
    x = "Condition",
    y = "Success Rate"
  ) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave("plots/Success_Rate_by_AI_Initiation_combined.png", plot = p, width = 8, height = 7, dpi = 300, bg = "white")

# Fisher's exact test for two groups, summarised as a one-row data frame.
#
# success_a, failure_a: counts for the first group (first matrix column).
# success_b, failure_b: counts for the second group (second matrix column).
# Returns a data frame with
#   Comparison: "AI vs " followed by the expression text passed as success_b
#   odds_ratio: estimated odds ratio
#   CI_Low, CI_High: bounds of the test's confidence interval
#   p_value: p-value of the test
get_fisher_ci <- function(success_a, failure_a, success_b, failure_b) {
  # Capture the caller's expression for the second group to use as its label.
  group_b_text <- deparse(substitute(success_b))
  counts <- matrix(c(success_a, failure_a, success_b, failure_b), nrow = 2)
  result <- fisher.test(counts)
  data.frame(
    Comparison = paste0("AI vs ", group_b_text),
    odds_ratio = result$estimate,
    CI_Low = result$conf.int[1],
    CI_High = result$conf.int[2],
    p_value = result$p.value
  )
}


# Row masks for the two conditions and for help requests.
ai_rows <- all_data$AI_Initiates == TRUE
user_rows <- all_data$AI_Initiates == FALSE
asked_rows <- all_data$Asks_for_Help == TRUE
no_help_rows <- all_data$Asks_for_Help == FALSE

# Success counts for the four groups:
# 1 = AI initiated, 2 = user initiated, 3 = user initiated without help,
# 4 = user initiated with help-askers re-counted as failures.
successes <- c(
  sum(all_data$Success[ai_rows]),
  sum(all_data$Success[user_rows]),
  sum(all_data$Success[user_rows & no_help_rows]),
  sum(all_data$Success[user_rows]) - sum(all_data$Success[user_rows & asked_rows])
)

# Failure counts for the same four groups. For group 4 only user-initiated
# help-askers are moved from success to failure; previously the help-askers
# of both conditions were added, which mixed AI-initiated participants into
# the user-initiated group.
failures <- c(
  sum(all_data$Failure[ai_rows]),
  sum(all_data$Failure[user_rows]),
  sum(all_data$Failure[user_rows & no_help_rows]),
  sum(all_data$Failure[user_rows]) +
    sum(all_data$Success[user_rows & asked_rows])
)

# Run Fisher's test for comparisons with AI Initiated
labels <- c("User Initiated", "User Initiated (No Help)", "User Initiated (Help Fails)")

# One row per comparison: odds ratio, its confidence interval and p-value for
# the AI-initiated group (element 1) against group i.
fisher_results <- do.call(rbind, lapply(2:4, function(i) {
  test <- fisher.test(matrix(c(successes[1], failures[1], successes[i], failures[i]), nrow = 2))
  data.frame(
    Group = labels[i - 1],
    odds_ratio = test$estimate,
    CI_Low = test$conf.int[1],
    CI_High = test$conf.int[2],
    p_value = test$p.value
  )
}))

ggplot(fisher_results, aes(x = Group, y = odds_ratio)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = CI_Low, ymax = CI_High), width = 0.2) +
  geom_text(aes(label = paste0("p = ", signif(p_value, 3))), vjust = -1.5) +
  scale_y_log10() +  # Odds ratios should be plotted on log scale
  labs(
    title = "Fisher's Exact Test: Odds Ratios vs AI Initiated",
    x = "Compared Group",
    y = "Odds Ratio (log scale)"
  ) +
  theme_minimal()

SuccessFailure

Jgivsk, Skth, Rdha

2025-05-23