# LLM vs Doctors

# Load necessary libraries
pacman::p_load(pacman, readr, dplyr, ggplot2, gridExtra)

# Summary table of the reported metrics: one row per outcome/group pair.
# Median and IQR describe the group itself. Difference, CI and P_value describe
# the comparison against Conventional Resources and are recorded exactly once
# per outcome, on the row of the non-reference group; all other rows hold NA.
df <- data.frame(
  Outcome = c(
    "Diagnostic Performance", "Diagnostic Performance",
    "Time per Case", "Time per Case",
    "LLM Alone", "LLM Alone"
  ),
  Group = c(
    "Conventional Resources", "With LLM",
    "Conventional Resources", "With LLM",
    "LLM Alone", "Conventional Resources"
  ),
  # No median/IQR is entered for the conventional row of the LLM Alone outcome
  Median = c(74, 76, 565, 519, 92, NA),
  IQR = c("63%-84%", "66%-87%", "456-788", "371-668", "82%-97%", NA),
  # The Time per Case comparison lives on the "With LLM" row (row 4), mirroring
  # how Diagnostic Performance stores its comparison on row 2, so the
  # difference, its CI and its p-value always sit together on a single row.
  Difference = c(NA, 2, NA, -82, 16, NA),
  CI = c(NA, "95% CI, -4 to 8", NA, "95% CI, -195 to 31", "95% CI, 2-30", NA),
  P_value = c(NA, 0.60, NA, 0.20, 0.03, NA)
)

# Export df to CSV (written to the current working directory)
write.csv(x = df, file = "df.csv", row.names = FALSE)

# Median diagnostic performance score for each of the three groups
dp_medians <- c(
  "Conventional Resources" = 74,
  "With LLM" = 76,
  "LLM Alone" = 92
)
diagnostic_performance_data <- data.frame(
  Group = names(dp_medians),
  # unname() so the group names do not become the data frame's row names
  Score = unname(dp_medians)
)

# Write the table to diagnostic_performance_data.csv, without row names
write.csv(
  x = diagnostic_performance_data,
  file = "diagnostic_performance_data.csv",
  row.names = FALSE
)

# Bar chart of the median score per group; each bar is annotated with its
# value as a percentage and the fill legend is hidden.
p1 <- ggplot(
  data = diagnostic_performance_data,
  mapping = aes(Group, Score, fill = Group)
) +
  # geom_col() draws bar heights straight from the data (identity stat)
  geom_col(position = "dodge", width = 0.7, show.legend = FALSE) +
  geom_text(
    mapping = aes(label = paste0(Score, "%")),
    colour = "#000000",
    size = 5,
    vjust = -0.5  # place the label just above the bar
  ) +
  ggtitle(
    label = "Diagnostic Performance: Conventional, With LLM, and LLM Alone",
    subtitle = "Large Language Model (ChatGPT-4) Influence on Diagnostic Reasoning"
  ) +
  xlab("Group") +
  ylab("Median Score (%)") +
  theme_minimal() +
  # Centre the title horizontally
  theme(plot.title = element_text(hjust = 0.5))

# Median time per case and its IQR (kept as a display string) for the two groups
time_df <- as.data.frame(
  list(
    Group = c("With LLM", "Conventional Resources"),
    Median = c(519, 565),
    IQR = c("371-668", "456-788")
  )
)

# Write the table to time_df.csv, without row names
write.csv(x = time_df, file = "time_df.csv", row.names = FALSE)

# Bar chart of the median time per case for each group, with the median
# printed above each bar.
p2 <- ggplot(data = time_df, mapping = aes(Group, Median, fill = Group)) +
  # geom_col() draws bar heights straight from the data (identity stat)
  geom_col(position = "dodge") +
  geom_text(
    mapping = aes(label = Median),
    colour = "#000000",
    size = 5,
    vjust = -0.5  # place the label just above the bar
  ) +
  ggtitle(
    label = "Time Spent Per Case: With LLM vs Conventional Resources",
    subtitle = "Large Language Model (ChatGPT-4) Influence on Diagnostic Reasoning"
  ) +
  xlab("Group") +
  ylab("Time (seconds)") +
  theme_minimal()

# Time Spent Analysis by Subgroups (e.g., Attending vs Resident)
# Simulated per-case times: 50 draws around 519 for "With LLM" and 50 draws
# around 565 for "Conventional Resources".
set.seed(123)  # For reproducibility of the simulated times and the exported CSV
time_subgroup_data <- data.frame(
  Time = c(rnorm(50, 519, 70), rnorm(50, 565, 80)),  # With LLM, then Conventional Resources
  Group = rep(c("With LLM", "Conventional Resources"), each = 50),
  # Cross Subgroup with Group (25 Attending + 25 Resident inside each group).
  # Labelling with each = 50 would make Subgroup identical to Group, leaving
  # nothing to compare within a subgroup.
  Subgroup = rep(c("Attending", "Resident"), each = 25, times = 2)
)

# Export time_subgroup_data to CSV
write.csv(time_subgroup_data, "time_subgroup_data.csv", row.names = FALSE)

# Box plots of time per case: subgroup on the x axis, boxes filled by group
p3 <- ggplot(
  data = time_subgroup_data,
  mapping = aes(Subgroup, Time, fill = Group)
) +
  geom_boxplot() +
  ggtitle(
    label = "Time Spent Per Case by Subgroup: With LLM vs Conventional",
    subtitle = "Large Language Model (ChatGPT-4) Influence on Diagnostic Reasoning"
  ) +
  xlab("Subgroup") +
  ylab("Time (seconds)") +
  theme_minimal()

# Distribution of Diagnostic Performance Scores
# Simulated scores: n normal draws per group, "With LLM" drawn first.
set.seed(123)  # Fixed seed so the simulated scores are reproducible
n <- 100       # Number of simulated scores in each group
llm_scores <- rnorm(n, 76, 6)           # mean 76, sd 6
conventional_scores <- rnorm(n, 74, 7)  # mean 74, sd 7

# Stack both groups into long format: one row per simulated score
score_data <- data.frame(
  Score = c(llm_scores, conventional_scores),
  Group = c(rep("With LLM", n), rep("Conventional Resources", n))
)

# Write the simulated scores to score_data.csv, without row names
write.csv(x = score_data, file = "score_data.csv", row.names = FALSE)

# Box plots of the simulated diagnostic performance scores, one box per group
p4 <- ggplot(data = score_data, mapping = aes(Group, Score, fill = Group)) +
  geom_boxplot() +
  ggtitle(
    label = "Distribution of Diagnostic Performance Scores: With LLM vs Conventional",
    subtitle = "Large Language Model (ChatGPT-4) Influence on Diagnostic Reasoning"
  ) +
  xlab("Group") +
  ylab("Diagnostic Performance (%)") +
  theme_minimal()

# Subgroup Analysis by Training Level (Attending vs Resident)
# Simulated scores: 50 normal draws for each training level x group cell.
# The draw order below is fixed; changing it changes the simulated values.
set.seed(123)  # Fixed seed so the simulated scores are reproducible
attending_llm <- rnorm(50, 79, 5)           # mean 79, sd 5
attending_conventional <- rnorm(50, 75, 6)  # mean 75, sd 6
resident_llm <- rnorm(50, 76, 6)            # mean 76, sd 6
resident_conventional <- rnorm(50, 74, 7)   # mean 74, sd 7

# Long format: the label vectors follow the same cell order as Score
subgroup_data <- data.frame(
  Score = c(
    attending_llm, attending_conventional,
    resident_llm, resident_conventional
  ),
  Group = rep(rep(c("With LLM", "Conventional Resources"), each = 50), 2),
  Subgroup = c(rep("Attending", 100), rep("Resident", 100))
)

# Write the simulated scores to subgroup_data.csv, without row names
write.csv(x = subgroup_data, file = "subgroup_data.csv", row.names = FALSE)

# Box plots of diagnostic performance: training level on the x axis, boxes
# filled by group.
p5 <- ggplot(
  data = subgroup_data,
  mapping = aes(Subgroup, Score, fill = Group)
) +
  geom_boxplot() +
  ggtitle(
    label = "Diagnostic Performance by Training Level: With LLM vs Conventional",
    subtitle = "Large Language Model (ChatGPT-4) Influence on Diagnostic Reasoning"
  ) +
  xlab("Training Level") +
  ylab("Diagnostic Performance (%)") +
  theme_minimal()

# Subgroup Analysis by LLM Experience (Less than monthly vs More than monthly)
# Simulated scores: 50 normal draws for each experience level x group cell.
# No seed is set in this section, so the draws continue from whatever RNG
# state the preceding code left; the draw order below is kept fixed.
less_than_monthly_llm <- rnorm(50, 76, 5)           # mean 76, sd 5
less_than_monthly_conventional <- rnorm(50, 76, 6)  # mean 76, sd 6
more_than_monthly_llm <- rnorm(50, 79, 4)           # mean 79, sd 4
more_than_monthly_conventional <- rnorm(50, 74, 5)  # mean 74, sd 5

# Long format: the label vectors follow the same cell order as Score
experience_data <- data.frame(
  Score = c(
    less_than_monthly_llm, less_than_monthly_conventional,
    more_than_monthly_llm, more_than_monthly_conventional
  ),
  Group = rep(rep(c("With LLM", "Conventional Resources"), each = 50), 2),
  Experience = c(rep("Less than Monthly", 100), rep("More than Monthly", 100))
)

# Write the simulated scores to experience_data.csv, without row names
write.csv(x = experience_data, file = "experience_data.csv", row.names = FALSE)

# Box plots of diagnostic performance: LLM experience level on the x axis,
# boxes filled by group.
p6 <- ggplot(
  data = experience_data,
  mapping = aes(Experience, Score, fill = Group)
) +
  geom_boxplot() +
  ggtitle(
    label = "Diagnostic Performance by LLM Experience: Less vs More Frequent Use",
    subtitle = "Large Language Model (ChatGPT-4) Influence on Diagnostic Reasoning"
  ) +
  xlab("Experience Level") +
  ylab("Diagnostic Performance (%)") +
  theme_minimal()

# Draw the figures as three separate two-column pages.

# Page 1: performance by training level (p5) next to performance by LLM experience (p6)
grid.arrange(grobs = list(p5, p6), ncol = 2)

# Page 2: median time per case (p2) next to time per case by subgroup (p3)
grid.arrange(grobs = list(p2, p3), ncol = 2)

# Page 3: score distribution (p4) next to median score per group (p1)
grid.arrange(grobs = list(p4, p1), ncol = 2)

# LLM vs Doctors
#
# Patrick Ford
#
# 2024-12-06