Background

Currently we are using three different X-ray tube voltages (10 kV, 30 kV, sometimes 50 kV) and three corresponding models. Some of the elements in the models seem very noisy, some of the elements are listed in more than one models. While the idea behind listing elements in different voltage-dependent models is to take advantage of a different continuum/background spectrum (due to different filters), this information is never used in our case. Additionally, some elements that are included may give information about X-ray tube conditions or measurement conditions, but these data are never utilised.

This report compares export data from bAXILbatch and shows statistics on noisyness of the element/voltages employed in the models.

Initialising R environment and functions

installPackages <- function() {
  listOfPackages = c("readr",
                     "tidyr",
                     "dplyr",
                     "stringr",
                     "purrr",
                     "tibble",
                     "ggplot2",
                     "viridis")
  newPackages <-
    listOfPackages[!(listOfPackages %in% installed.packages()[, "Package"])]
  if (length(newPackages) > 0) {
    install.packages(newPackages)
  }
  lapply(listOfPackages, require, character.only = TRUE)
}

parsebAXILfile <- function(fileName) {
  tmp <- read_csv(fileName, quote = "") %>%
    distinct()
  if (!str_detect((names(tmp)[1]), "Spectrum"))
    stop("No valid Avaatech XRF baxil batch file (csv)") # check structure of file
  names(tmp) <- names(tmp) %>%
    str_replace_all('\\"', "") %>% # remove unnecessary quotes
    str_trim() # trim whitespace
  
  tmp2 <- str_split_fixed(tmp$Spectrum, "\\!", 17) %>%
    as_tibble() # Split spectrum field to gain additional information
  names(tmp2) <-
    c(
      "CoreID",
      "unknown1",
      "unknown2",
      "Depth",
      "Date",
      "Time",
      "Duration",
      "Voltage",
      "Current",
      "unknown3",
      "unknown4",
      "Filter",
      "SlitDown",
      "SlitCross",
      "Run",
      "Rep",
      "unknown5"
    ) # naming new variables
  
  tmp3 <- bind_cols(tmp2, tmp) %>%
    select(-starts_with("unknown"),
           -Spectrum,
           -`Live time`,
           -`Real time`,
           -Sample,
           -User) %>%
    mutate_at(
      vars(
        Depth,
        Voltage,
        Current,
        SlitDown,
        SlitCross,
        Duration,
        Throughput
      ),
      as.numeric
    ) %>%
    # gather wide-spread counts/element data, turning into long form
    gather(-(CoreID:Throughput), key = "Measure", value = "Value")
  tmp3
}

batchparsebAXIL <- function(dir) {
  # this function calls the specific functions to process data from each section in the raw file.
  installPackages()
  
  files <- dir(dir, pattern = ".+\\.csv", full.names = TRUE)
  if (is_empty(files))
    stop("No csv files found in directory")

    computedhash <- paste0(tools::md5sum(files), collapse = "")
  storedhash <- try(readRDS("xrfinputhash.rds"), silent = TRUE)
  
  
  if (computedhash == storedhash & file.exists("xrfrawresults.rds")) {
      res <- readRDS("xrfrawresults.rds")
  } else {
    res <- map(files, parsebAXILfile) %>%
    bind_rows() %>% # bind dataframes together (row-wise)
    distinct()
    saveRDS(res, "xrfrawresults.rds")
    saveRDS(computedhash, file = "xrfinputhash.rds")
  }
  
  res
}

Parsing bAXIL export data

Reading and parsing all data in subfolder ./xrfdiagdata (execute this chunk to read in new data). Change the path (inside the quotation marks) in this command in the code:

xrfraw <- batchparsebAXIL("./xrfdiagdata/")

NB: The function checks if files have been changed, added, or removed and will only re-read all files if so.

xrfraw <- batchparsebAXIL("./xrfdiagdata/")
glimpse(xrfraw)
## Rows: 10,414,415
## Columns: 16
## $ CoreID     <chr> "BIC B 2-3m", "BIC B 2-3m", "BIC B 2-3m", "BIC B 2-3m", "B…
## $ Depth      <dbl> 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291…
## $ Date       <chr> "2020-05-07", "2020-05-07", "2020-05-07", "2020-05-07", "2…
## $ Time       <chr> "15-16-57", "15-17-29", "15-18-02", "15-18-34", "15-19-05"…
## $ Duration   <dbl> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15…
## $ Voltage    <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
## $ Current    <dbl> 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500…
## $ Filter     <chr> "No-Filter", "No-Filter", "No-Filter", "No-Filter", "No-Fi…
## $ SlitDown   <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ SlitCross  <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12…
## $ Run        <chr> "Run1", "Run1", "Run1", "Run1", "Run1", "Run1", "Run1", "R…
## $ Rep        <chr> "Rep0", "Rep0", "Rep0", "Rep0", "Rep0", "Rep0", "Rep0", "R…
## $ Excitation <chr> "XRay Tube Generator", "XRay Tube Generator", "XRay Tube G…
## $ Throughput <dbl> 171000, 173000, 173000, 173000, 169000, 164000, 161000, 16…
## $ Measure    <chr> "Mg-Ka Chi2", "Mg-Ka Chi2", "Mg-Ka Chi2", "Mg-Ka Chi2", "M…
## $ Value      <dbl> 0.5, 1.0, 0.7, 0.5, 0.5, 0.6, 0.4, 0.6, 0.6, 0.5, 0.6, 0.7…

Next we tidy up the parsed raw data:

xrftidy <- xrfraw %>% 
  mutate(Measure = str_replace_all(Measure, " -", "-"), Stat = str_extract(Measure, "\\w+$"), Element = str_extract(Measure, "^\\w+"), AbsLine = str_extract(Measure, "(?<=-)\\w+"), Scatter = str_extract(Measure, "Coh|Inc")) %>% 
  select(-Measure) %>% 
  pivot_wider(names_from = Stat, values_from = Value) %>% 
  mutate(relsd = cpsStd/cps*100, Scatter = replace_na(Scatter, ""), ElementLine = paste0(Element, "-", AbsLine, " ", Scatter), Voltage = as.factor(Voltage), ElementLineVoltage = paste0(ElementLine," @ ", Voltage, " kV"))
glimpse(xrftidy)
## Rows: 2,082,883
## Columns: 25
## $ CoreID             <chr> "BIC B 2-3m", "BIC B 2-3m", "BIC B 2-3m", "BIC B 2…
## $ Depth              <dbl> 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, …
## $ Date               <chr> "2020-05-07", "2020-05-07", "2020-05-07", "2020-05…
## $ Time               <chr> "15-16-57", "15-17-29", "15-18-02", "15-18-34", "1…
## $ Duration           <dbl> 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15…
## $ Voltage            <fct> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
## $ Current            <dbl> 1500, 1500, 1500, 1500, 1500, 1500, 1500, 1500, 15…
## $ Filter             <chr> "No-Filter", "No-Filter", "No-Filter", "No-Filter"…
## $ SlitDown           <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ SlitCross          <dbl> 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12…
## $ Run                <chr> "Run1", "Run1", "Run1", "Run1", "Run1", "Run1", "R…
## $ Rep                <chr> "Rep0", "Rep0", "Rep0", "Rep0", "Rep0", "Rep0", "R…
## $ Excitation         <chr> "XRay Tube Generator", "XRay Tube Generator", "XRa…
## $ Throughput         <dbl> 171000, 173000, 173000, 173000, 169000, 164000, 16…
## $ Element            <chr> "Mg", "Mg", "Mg", "Mg", "Mg", "Mg", "Mg", "Mg", "M…
## $ AbsLine            <chr> "Ka", "Ka", "Ka", "Ka", "Ka", "Ka", "Ka", "Ka", "K…
## $ Scatter            <chr> "", "", "", "", "", "", "", "", "", "", "", "", ""…
## $ Chi2               <dbl> 0.5, 1.0, 0.7, 0.5, 0.5, 0.6, 0.4, 0.6, 0.6, 0.5, …
## $ Area               <dbl> 642, 742, 750, 801, 721, 679, 501, 472, 747, 793, …
## $ AreaStd            <dbl> 49, 50, 50, 52, 51, 51, 47, 47, 50, 52, 52, 53, 53…
## $ cps                <dbl> 42.8, 49.5, 50.0, 53.4, 48.1, 45.3, 33.4, 31.5, 49…
## $ cpsStd             <dbl> 3.3, 3.3, 3.4, 3.4, 3.4, 3.4, 3.1, 3.1, 3.3, 3.4, …
## $ relsd              <dbl> 7.710280, 6.666667, 6.800000, 6.367041, 7.068607, …
## $ ElementLine        <chr> "Mg-Ka ", "Mg-Ka ", "Mg-Ka ", "Mg-Ka ", "Mg-Ka ", …
## $ ElementLineVoltage <chr> "Mg-Ka  @ 10 kV", "Mg-Ka  @ 10 kV", "Mg-Ka  @ 10 k…

Calculating statistics and criteria

And we calculate some summary/descriptive statistics about the goodness of our data, specifically the relative standard deviation relsd (Error/Noise of the spectrum in percent). Additionally we compute the median relsd, median goodness of fit (medchi2), sum of Chi2 values (totalchi2) and totalcounts resp. cps for each spectrum (Each core with each element/voltage/Absorption Line combination). Ideally, this will tell us which elements we should exclude from our models and of those, that are measured more than once (e.g. at 10 kV and 30 kV), which one we should keep.

The bAXIL files are not checked/corrected for bad data! Reasons for bad data include:

To ignore outlier data, we just use the median for most measures (see above) and to visually see differences, we plot log values. This occasionally produces NaN (“Not a Number”; mostly due to taking the log of a negative number) and is shown as a warning and in plots as grey dots. This behaviour is expected.

xrfstats <- xrftidy %>% 
  select(CoreID, Depth, Voltage, Element:ElementLineVoltage) %>% 
  group_by(CoreID, Voltage, Element, AbsLine) %>% 
  summarise(medsd = median(relsd), medchi2 = median(Chi2), totalchi2 = sum(Chi2), totalcounts = sum(cps), ElementLineVoltage = first(ElementLineVoltage)) %>% 
  ungroup()
glimpse(xrfstats)
## Rows: 3,439
## Columns: 9
## $ CoreID             <chr> "BIC B 2-3m", "BIC B 2-3m", "BIC B 2-3m", "BIC B 2…
## $ Voltage            <fct> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10…
## $ Element            <chr> "Al", "Ar", "Ba", "Ca", "Cl", "Co", "Cr", "Cu", "F…
## $ AbsLine            <chr> "Ka", "Ka", "La", "Ka", "Ka", "Ka", "Ka", "Ka", "K…
## $ medsd              <dbl> 1.0655965, 8.5101890, 17.3398795, 0.1386253, 4.578…
## $ medchi2            <dbl> 1.8, 1.4, 1.0, 19.5, 2.5, 11.2, 1.2, 0.9, 3.2, 10.…
## $ totalchi2          <dbl> 1206.8, 956.0, 600.0, 12427.1, 1596.6, 6566.5, 748…
## $ totalcounts        <dbl> 412519.85, 140665.54, 48308.82, 21193406.41, 22698…
## $ ElementLineVoltage <chr> "Al-Ka  @ 10 kV", "Ar-Ka  @ 10 kV", "Ba-La  @ 10 k…

We also want to look more specifically at all the elements that occur more than once in our 10 kV, 30 kV, 50 kV models to see which Absorption Line performs best. For this comparison we exclude the scatter (incoheren/coherent) XRF spectra of Rh and Ag - these data do not reflect measured elements in the sample, but the state of our Xray tube. Currently we do not make use of these data. These elements are the following (see code):

multivoltelements <- c("Ba", "Br", "Ca", "Co", "Cr", "Cu", "Fe", "K", "Mn", "Mo", "Ni", "Rb", "Sr", "Ti", "V", "Y", "Zr")

xrftidy_multielem <- xrftidy %>% 
  filter(Element %in% multivoltelements)

xrfstats_multielem <- xrfstats %>% 
  filter(Element %in% multivoltelements)

Distribution plots

First we want to see the distribution of noise (relsd) and goodness of fit (Chi2) for all cores together, per Element/Line/Voltage combination.

Distribution of noise:

ggplot(xrftidy, aes(x = ElementLine, y = relsd, fill = Voltage)) + geom_boxplot() + scale_y_log10() + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

Distribution of goodness of fit:

ggplot(xrftidy, aes(x = ElementLine, y = Chi2, fill = Voltage)) + geom_boxplot() + scale_y_log10() + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

We want to see the same as well just for elements that we measure at different voltages (see above) Distribution of noise:

ggplot(xrftidy_multielem, aes(x = ElementLine, y = relsd, fill = Voltage)) + geom_boxplot() + scale_y_log10() + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

Distribution of goodness of fit:

ggplot(xrftidy_multielem, aes(x = ElementLine, y = Chi2, fill = Voltage)) + geom_boxplot() + scale_y_log10() + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

# Criteria overview plots - per core

Additionally we want to look at the situation for every core to uncover systematic differences:

We start with all elements again and with the logarithm of the median of the noise (relsd) - the lower the better:

ggplot(xrfstats, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = log(medsd))) + scale_fill_viridis()+ theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

NB: We’re not showing the total relative noise since some of it is negative (introduced through negative counts).

The same but for the log of the median goodness of fit (the lower the better):

ggplot(xrfstats, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = log(medchi2))) + scale_fill_viridis() + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

The same for the summed goodness of fit per core per element/voltage/line (the lower the better):

ggplot(xrfstats, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = log(totalchi2))) + scale_fill_viridis()  + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

And finally the log of the total cps/counts - the higher the better:

ggplot(xrfstats, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = log(totalcounts))) + scale_fill_viridis()  + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

Consensus criteria and consensus plots

If we want to see what the results are of an automated selection of the “better” element/line, we need to calculate the best of two possibilities for every metric of concern:

xrf_by_medsd <- xrfstats_multielem %>% 
  select(CoreID, Element, AbsLine, Voltage, medsd) %>% 
  group_by(CoreID, Element, AbsLine) %>% 
  arrange(CoreID, Element, AbsLine, medsd) %>% 
  top_n(1, medsd) %>% 
  mutate(by_medsd = TRUE)

xrf_by_medchi2 <- xrfstats_multielem %>% 
  select(CoreID, Element, AbsLine, Voltage, medchi2) %>% 
  group_by(CoreID, Element, AbsLine) %>% 
  arrange(CoreID, Element, AbsLine, medchi2) %>% 
  top_n(1, medchi2) %>% 
  mutate(by_medchi2 = TRUE)

xrf_by_totalchi2 <- xrfstats_multielem %>% 
  select(CoreID, Element, AbsLine, Voltage, totalchi2) %>% 
  group_by(CoreID, Element, AbsLine) %>% 
  arrange(CoreID, Element, AbsLine, totalchi2) %>% 
  top_n(1, totalchi2) %>% 
  mutate(by_totalchi2 = TRUE)

xrf_by_totalcounts <- xrfstats_multielem %>% 
  select(CoreID, Element, AbsLine, Voltage, totalcounts) %>% 
  group_by(CoreID, Element, AbsLine) %>% 
  arrange(CoreID, Element, AbsLine, desc(totalcounts)) %>% 
  top_n(1, totalcounts) %>% 
  mutate(by_totalcounts = TRUE)

Then we join these data back together with our previous statistics:

xrfstats_autoelements <- list(xrfstats_multielem, xrf_by_medsd, xrf_by_medchi2, xrf_by_totalchi2, xrf_by_totalcounts) %>% 
  reduce(left_join) %>% 
  mutate_at(vars(by_medsd:by_totalcounts), ~ replace_na(., FALSE))

If we use the median noise as a metric, we can see, that there is a consensus for many elements except for Mn, Ni, V. This may be due to very low contents of these elements in the resp. sediment cores.

ggplot(xrfstats_autoelements, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = by_medsd)) + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

If we use the median goodness of fit, there is a unclear picture for some elements (NB: Check why e.g. Cu are not exactly opposite matching)

ggplot(xrfstats_autoelements, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = by_medchi2)) + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

If we use the total goodness of fit over the spectrum, there is a more clear picture.

ggplot(xrfstats_autoelements, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = by_totalchi2)) + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

If we use total counts, we can see that independently of the core, the same line would be used/maximised.

ggplot(xrfstats_autoelements, aes(x = ElementLineVoltage, y = CoreID)) + geom_tile(aes(fill = by_totalcounts)) + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

If we want to know what the consensus is, we should calculate the fraction of the approval by the different metrics over all cores. For this we remove all 50 kv data, since they are usually not recorded. This only leaves the following elements:

Ca, Co, Cr, Fe, K, Mn, Ni, Ti, V

ncores <- length(unique(xrfstats_autoelements$CoreID))
multielem_10_30 <- c("Ca", "Co", "Cr", "Fe", "K", "Mn", "Ni", "Ti", "V")

xrfstats_consensus <- xrfstats_autoelements %>%
  filter(Voltage != 50, Element %in% multielem_10_30) %>% 
  mutate(Voltage = droplevels(Voltage)) %>% 
  select(Voltage, Element, AbsLine, ElementLineVoltage, by_medsd:by_totalcounts) %>% 
  group_by(Element, Voltage, AbsLine, ElementLineVoltage) %>% 
  summarise(frac_by_medsd = sum(by_medsd)/ncores, frac_by_medchi2 = sum(by_medchi2)/ncores, frac_by_totalchi2 = sum(by_totalchi2)/ncores, frac_by_totalcounts = sum(by_totalcounts)/ncores) %>% 
  pivot_longer(frac_by_medsd:frac_by_totalcounts, names_to = "Criterion", values_to = "Fraction")

Then we can compare the four metrics (continuous scale, fraction = 1 means complete consensus/approval of element/line):

ggplot(xrfstats_consensus, aes(x = ElementLineVoltage, y = Criterion)) + geom_tile(aes(fill = Fraction)) + scale_fill_viridis() + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

If we sum up the criteria, we get the following. It is evident, that in some cases, it is difficult to choose the better voltage.

xrfstats_sumconsensus <- xrfstats_consensus %>% 
  group_by(Element, Voltage, AbsLine, ElementLineVoltage) %>% 
  summarise(sumconsensus = sum(Fraction))

ggplot(xrfstats_sumconsensus, aes(x = ElementLineVoltage, y = sumconsensus, fill = Element, group = Element)) + geom_col() + scale_fill_viridis(discrete = TRUE) + theme(axis.text.x = element_text(angle = 90, hjust=1, vjust = 0.5))

If we strictly go by the numbers, we get the following choices:

multielem_choices <- xrfstats_sumconsensus %>% 
  group_by(Element) %>% 
  arrange(desc(sumconsensus)) %>% 
  slice(1) %>% 
  select(Element, AbsLine, Voltage)

knitr::kable(multielem_choices)
Element AbsLine Voltage
Ca Ka 10
Co Ka 30
Cr Ka 10
Fe Ka 30
K Ka 10
Mn Ka 30
Ni Ka 10
Ti Ka 10
V Ka 30