Roche Adverse Events Automated tagging

library(ellmer)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
# NOTE(review): removes all non-hidden objects from the current (global)
# environment. Attached packages are not affected. Clearing the workspace
# inside a script is generally discouraged; consider running the document
# in a fresh R session instead -- TODO confirm nothing relies on this.
rm(list = ls())

Adverse events data

Here we use adverse event data from the CDISC pilot e-submission, published by the PhUSE CS Working Group 5.

# Body systems (AEBODSYS, Adverse Event Body System) kept for this exercise;
# the same vector later serves as the set of allowed classification labels.
vct_SYS <- c(
  "CARDIAC DISORDERS",
  "GASTROINTESTINAL DISORDERS",
  "PSYCHIATRIC DISORDERS"
)

# Read the adverse-events dataset from the PhUSE repository, keep only the
# body systems listed in vct_SYS, and reduce it to one row per
# (lowest level term, body system) pair, sorted by term.
df_ADAE <- haven::read_xpt(
  "https://raw.githubusercontent.com/phuse-org/phuse-scripts/master/data/adam/cdiscpilot01/adae.xpt"
) %>%
  filter(AEBODSYS %in% vct_SYS) %>%
  # AELLT is the lowest level term
  distinct(AELLT, AEBODSYS) %>%
  arrange(AELLT)

df_ADAE
# A tibble: 74 × 2
   AELLT                   AEBODSYS                  
   <chr>                   <chr>                     
 1 ABDOMINAL CRAMPS        GASTROINTESTINAL DISORDERS
 2 ABDOMINAL DISCOMFORT    GASTROINTESTINAL DISORDERS
 3 ABDOMINAL PAIN          GASTROINTESTINAL DISORDERS
 4 AGITATION               PSYCHIATRIC DISORDERS     
 5 ANTEROSEPTAL INFARCTION CARDIAC DISORDERS         
 6 ANXIETY                 PSYCHIATRIC DISORDERS     
 7 ANXIETY AGGRAVATED      PSYCHIATRIC DISORDERS     
 8 ANXIETY ATTACK          PSYCHIATRIC DISORDERS     
 9 ATRIAL FIBRILLATION     CARDIAC DISORDERS         
10 ATRIAL FLUTTER          CARDIAC DISORDERS         
# ℹ 64 more rows

Automated tagging with the mistral-large-latest LLM

The task here is to automatically assign the MedDRA System Organ Class label (i.e. the Adverse Event Body System, AEBODSYS) to each AELLT (lowest level term) using an LLM.

# Serialise the first column of df_ADAE (the AELLT terms) as comma-separated
# text, header line included and without row names, and collapse the lines
# into a single newline-delimited string to use as the user prompt.
# local() keeps the intermediate objects out of the global environment.
text_df <- local({
  term_column <- df_ADAE[, 1, drop = FALSE]
  csv_lines <- capture.output(
    write.table(term_column, row.names = FALSE, sep = ",")
  )
  paste(csv_lines, collapse = "\n")
})


# Schema for a single classified term returned by the LLM.
#   name       - the term that was classified. The text sent to the model only
#                carries the AELLT column and the system prompt also refers to
#                AELLT, so the description names AELLT (it previously said
#                AEDECOD, a column the model never sees).
#   type       - predicted body system, restricted to the values in vct_SYS.
#   confidence - free-text confidence, expected to be high, medium, or low.
type_named_entity <- type_object(
  name = type_string("A classified MedDRA symptom in column AELLT"),
  type = type_enum(
    description = "Classification label",
    values = vct_SYS
  ),
  confidence = type_string("Confidence in classification: high, medium, or low")
)


# Array schema: the structured response is a list of type_named_entity objects.
# `items` is passed by name because the position of this argument in
# type_array() differs between ellmer releases (a positional call would be
# interpreted as the description in older versions).
type_named_entities <- type_array(items = type_named_entity)
# ------------

# Chat client for the Mistral model; the system prompt frames the model as a
# MedDRA classifier that assigns each AELLT term to exactly one category.
chat <- chat_mistral(
  system_prompt = "You are a medical text classifier expert in MedDRA terminology.
    The user will give you a data.frame, and you will classify the given symptom in the column AELLT into exactly one of the provided categories",
  model = "mistral-large-latest"
)

# Send the serialised terms to the model and request a response that conforms
# to the array-of-entities schema; then display the result.
df_classified <- chat$chat_structured(text_df, type = type_named_entities)

df_classified
# A tibble: 74 × 3
   name                    type                       confidence
   <chr>                   <fct>                      <chr>     
 1 ABDOMINAL CRAMPS        GASTROINTESTINAL DISORDERS high      
 2 ABDOMINAL DISCOMFORT    GASTROINTESTINAL DISORDERS high      
 3 ABDOMINAL PAIN          GASTROINTESTINAL DISORDERS high      
 4 AGITATION               PSYCHIATRIC DISORDERS      high      
 5 ANTEROSEPTAL INFARCTION CARDIAC DISORDERS          high      
 6 ANXIETY                 PSYCHIATRIC DISORDERS      high      
 7 ANXIETY AGGRAVATED      PSYCHIATRIC DISORDERS      high      
 8 ANXIETY ATTACK          PSYCHIATRIC DISORDERS      high      
 9 ATRIAL FIBRILLATION     CARDIAC DISORDERS          high      
10 ATRIAL FLUTTER          CARDIAC DISORDERS          high      
# ℹ 64 more rows

Multiclass classification performance

# Attach the reference body system to each predicted term (matching the
# model's `name` to AELLT), put prediction and truth on the same factor
# levels, and compute yardstick's default metrics for the pair.
df_classified %>%
  left_join(df_ADAE, by = c("name" = "AELLT")) %>%
  mutate(
    across(
      c(type, AEBODSYS),
      function(values) factor(values, levels = vct_SYS)
    )
  ) %>%
  yardstick::metrics(truth = AEBODSYS, estimate = type)
# A tibble: 2 × 3
  .metric  .estimator .estimate
  <chr>    <chr>          <dbl>
1 accuracy multiclass         1
2 kap      multiclass         1