Day 1 - Advanced AMR Data Visualization in R

Task

Data pre-processing:

Download the “dirty” dataset.
Data Cleaning:
Clean the species column to harmonize values like “E. Coli”, “e.coli”, “Salmonella sp.”, etc.
Handle missing or empty resistance values (“”, NA), and recode:
- “R” → 1
- “S” → 0

Matrix Preparation:

Create a resistance matrix with
- rows = isolate_id,
- columns = antibiotics.
Make sure row names are unique.

Heatmap Visualization:

Create a basic heatmap with no clustering or annotations.
Then add annotations for:
- species (color-coded)
- country (color-coded)
Optionally, arrange the heatmap by country or species.

Data Cleaning

#Import libraries 
library(conflicted)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4

library(janitor)
library(pheatmap)
library(patchwork)

# Read the raw ("dirty") AMR phenotype table from the session 1 data folder
raw_data <- read_csv(file = "data/session 1/amr_phenotypes_dirty.csv")

## Rows: 20 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): isolate_id, species, country, source, host, ciprofloxacin, ampicill...
## dbl (1): year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the imported table to inspect the raw values
raw_data

# Summarise every column: missing counts, completeness, string lengths and
# number of unique values for character columns, distribution for numeric ones
skimr::skim(raw_data)

Data summary
Name	raw_data
Number of rows	20
Number of columns	10
_______________________
Column type frequency:
character	9
numeric	1
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	n_unique
isolate_id	0	1.00	7	7	20
species	0	1.00	6	14	5
country	0	1.00	5	7	4
source	0	1.00	5	5	3
host	0	1.00	5	6	2
ciprofloxacin	1	0.95	1	3	3
ampicillin	2	0.90	1	3	3
tetracycline	2	0.90	1	3	3
colistin	0	1.00	1	1	2

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
year	0	1	2021	0.86	2020	2020	2021	2022	2022	▇▁▇▁▇

# List the distinct spellings present in the species column
raw_data %>%
  pull(species) %>%
  unique()

## [1] "salmonella sp." "Klebsiella"     "Salmonella sp." "E. Coli"       
## [5] "e.coli"

# Clean our data:
#  - standardise the column names,
#  - collapse the spelling variants of each species onto one canonical label,
#  - recode resistance calls to numeric (R = 1, S = 0, anything else = NA),
#  - keep only isolates with a complete resistance profile.
clean_data <- raw_data %>%
  clean_names() %>%
  mutate(
    # Normalise case and whitespace first so one pattern covers every variant.
    species = str_squish(str_to_lower(species)),
    # Title-casing and then recoding only "E.coli" left "E. Coli" as a
    # separate label, so map each known organism to its canonical name.
    species = case_when(
      str_detect(species, "^e\\.?\\s*coli$") ~ "E. coli",
      str_detect(species, "^salmonella( sp\\.?)?$") ~ "Salmonella sp.",
      .default = str_to_sentence(species)
    ),
    country = str_to_title(country),
    host = str_to_title(host),
    # Only explicit "R"/"S" calls are trusted; "NaN", empty strings, and any
    # unexpected code become NA instead of being silently counted as
    # susceptible.
    across(
      ciprofloxacin:colistin,
      ~ case_match(
        str_to_upper(str_trim(.x)),
        "R" ~ 1,
        "S" ~ 0,
        .default = NA_real_
      )
    )
  ) %>%
  # Drop isolates that lack a result for any of the antibiotics.
  drop_na(ciprofloxacin:colistin)

clean_data

Matrix Preparation

# Guarantee unique isolate IDs (duplicates get a numeric suffix) so they can
# later be used as row names
clean_data_unique <- clean_data %>%
  mutate(
    isolate_id = isolate_id %>%
      as.character() %>%
      make.unique()
  )

clean_data_unique

Heatmap Visualization

# Build the resistance matrix: rows = isolates, columns = antibiotics
phenotype_matrix <- clean_data_unique %>%
  column_to_rownames(var = "isolate_id") %>%
  select(ciprofloxacin:colistin) %>%
  as.matrix()

phenotype_matrix

##         ciprofloxacin ampicillin tetracycline colistin
## ISO_001             0          0            1        0
## ISO_002             1          0            0        1
## ISO_003             1          0            0        1
## ISO_004             1          1            1        1
## ISO_006             0          1            0        0
## ISO_009             1          0            0        1
## ISO_010             1          1            1        1
## ISO_011             0          1            1        1
## ISO_012             0          1            0        1
## ISO_013             0          0            1        0
## ISO_018             0          0            1        0
## ISO_020             1          0            1        1

# Build the row-annotation table (one row per isolate, keyed by row name);
# columns are renamed while selecting so the legend titles are capitalised
annotation_matrix <- clean_data_unique %>%
  select(
    isolate_id,
    Country = country,
    Year = year,
    Species = species
  ) %>%
  column_to_rownames("isolate_id")

annotation_matrix

# Sort the annotation rows by country, then by year within each country
annotation_matrix <- arrange(annotation_matrix, Country, Year)

annotation_matrix

# Reorder the phenotype matrix rows to follow the sorted annotation table.
# drop = FALSE keeps the result a matrix even if only one isolate (or one
# antibiotic) remains; otherwise R would collapse it to a plain vector.
phenotype_matrix <- phenotype_matrix[
  rownames(annotation_matrix), ,
  drop = FALSE
]

phenotype_matrix

##         ciprofloxacin ampicillin tetracycline colistin
## ISO_018             0          0            1        0
## ISO_009             1          0            0        1
## ISO_020             1          0            1        1
## ISO_003             1          0            0        1
## ISO_006             0          1            0        0
## ISO_002             1          0            0        1
## ISO_010             1          1            1        1
## ISO_001             0          0            1        0
## ISO_004             1          1            1        1
## ISO_011             0          1            1        1
## ISO_012             0          1            0        1
## ISO_013             0          0            1        0

# Draw the annotated heatmap.
# Rows keep the country/year ordering prepared above (no row clustering);
# antibiotic columns are clustered using a binary distance, which suits the
# 0/1 resistance coding. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned.
pheatmap(
  phenotype_matrix,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  annotation_row = annotation_matrix,
  display_numbers = FALSE,
  clustering_distance_cols = "binary",
  color = colorRampPalette(c("white", "tomato"))(100),
  main = "AMR Heatmap with Species, Countries and Antibiotics"
)

Day 1 - Advanced AMR Data Visualization in R - Homework

Task

Data Cleaning

Matrix Preparation

Heatmap Visualization