Download the “dirty” dataset.
Data Cleaning:
Clean the species column to harmonize values like “E. Coli”, “e.coli”, “Salmonella sp.”, etc.
Handle missing or empty resistance values ("", NA), and recode:
“R” → 1
“S” → 0
Create a resistance matrix with
rows = isolate_id,
columns = antibiotics.
Make sure row names are unique.
#Import libraries
library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
library(janitor)
library(pheatmap)
library(patchwork)
# Import the raw ("dirty") AMR phenotype table; readr guesses the column types.
raw_data <- read_csv(
  file = "data/session 1/amr_phenotypes_dirty.csv"
)
## Rows: 20 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): isolate_id, species, country, source, host, ciprofloxacin, ampicill...
## dbl (1): year
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported table to eyeball the raw values before any cleaning.
raw_data
# Summarise every column (missing counts, number of unique values, string
# lengths) to spot gaps and inconsistent categories.
skimr::skim(raw_data)
| Name | raw_data |
| Number of rows | 20 |
| Number of columns | 10 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| isolate_id | 0 | 1.00 | 7 | 7 | 0 | 20 | 0 |
| species | 0 | 1.00 | 6 | 14 | 0 | 5 | 0 |
| country | 0 | 1.00 | 5 | 7 | 0 | 4 | 0 |
| source | 0 | 1.00 | 5 | 5 | 0 | 3 | 0 |
| host | 0 | 1.00 | 5 | 6 | 0 | 2 | 0 |
| ciprofloxacin | 1 | 0.95 | 1 | 3 | 0 | 3 | 0 |
| ampicillin | 2 | 0.90 | 1 | 3 | 0 | 3 | 0 |
| tetracycline | 2 | 0.90 | 1 | 3 | 0 | 3 | 0 |
| colistin | 0 | 1.00 | 1 | 1 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| year | 0 | 1 | 2021 | 0.86 | 2020 | 2020 | 2021 | 2022 | 2022 | ▇▁▇▁▇ |
# List the distinct species spellings present in the raw data.
raw_data %>%
  pull(species) %>%
  unique()
## [1] "salmonella sp." "Klebsiella" "Salmonella sp." "E. Coli"
## [5] "e.coli"
# Clean the raw phenotype table:
#   * standardise the column names,
#   * harmonise species spellings to one canonical label per organism,
#   * title-case country and host,
#   * recode antibiotic results to numeric (R = 1, S = 0, anything else = NA),
#   * keep only isolates with a complete resistance profile.
clean_data <- raw_data %>%
  clean_names() %>%
  mutate(
    # Collapse stray whitespace first so the patterns below match reliably.
    species = str_squish(species),
    # Match case-insensitively rather than title-casing: str_to_title() leaves
    # "E. Coli" and "e.coli" as two different labels and turns
    # "salmonella sp." into "Salmonella Sp.".
    species = case_when(
      str_detect(
        species,
        regex("^e\\.?\\s*coli$", ignore_case = TRUE)
      ) ~ "E. coli",
      str_detect(
        species,
        regex("^salmonella(\\s+sp\\.?)?$", ignore_case = TRUE)
      ) ~ "Salmonella sp.",
      # Any other organism: capitalise the first letter only (genus style).
      .default = str_to_sentence(species)
    ),
    country = str_to_title(country),
    host = str_to_title(host),
    # Only an explicit "R" or "S" is trusted. Missing values, "NaN" strings and
    # any unexpected code become NA instead of being silently counted as
    # susceptible (0).
    across(
      ciprofloxacin:colistin,
      ~ case_match(
        str_to_upper(str_trim(.x)),
        "R" ~ 1,
        "S" ~ 0,
        .default = NA_real_
      )
    )
  ) %>%
  # Drop isolates that lack a result for at least one antibiotic.
  drop_na(ciprofloxacin:colistin)
clean_data
# Guarantee unique isolate IDs so they can serve as row names later on;
# make.unique() appends a numeric suffix to any repeated ID.
clean_data_unique <- clean_data
clean_data_unique$isolate_id <- make.unique(
  as.character(clean_data$isolate_id)
)
clean_data_unique
# Build the resistance matrix: one row per isolate, one column per antibiotic.
phenotype_matrix <- clean_data_unique %>%
  select(ciprofloxacin:colistin) %>%
  as.matrix() %>%
  `rownames<-`(clean_data_unique$isolate_id)
phenotype_matrix
## ciprofloxacin ampicillin tetracycline colistin
## ISO_001 0 0 1 0
## ISO_002 1 0 0 1
## ISO_003 1 0 0 1
## ISO_004 1 1 1 1
## ISO_006 0 1 0 0
## ISO_009 1 0 0 1
## ISO_010 1 1 1 1
## ISO_011 0 1 1 1
## ISO_012 0 1 0 1
## ISO_013 0 0 1 0
## ISO_018 0 0 1 0
## ISO_020 1 0 1 1
# Row annotations for the heatmap: a data frame keyed by isolate ID holding
# Country, Year and Species (renamed on selection for nicer legend titles).
annotation_matrix <- clean_data_unique %>%
  select(
    isolate_id,
    Country = country,
    Year = year,
    Species = species
  ) %>%
  column_to_rownames(var = "isolate_id")
annotation_matrix
# Sort the annotation rows by country, then by year within each country.
annotation_matrix <- arrange(annotation_matrix, Country, Year)
annotation_matrix
# Put the phenotype matrix rows in the same order as the annotation rows.
# drop = FALSE keeps the result a matrix even if only one isolate remains;
# without it R would collapse a single row into a plain vector.
phenotype_matrix <- phenotype_matrix[
  rownames(annotation_matrix), ,
  drop = FALSE
]
phenotype_matrix
## ciprofloxacin ampicillin tetracycline colistin
## ISO_018 0 0 1 0
## ISO_009 1 0 0 1
## ISO_020 1 0 1 1
## ISO_003 1 0 0 1
## ISO_006 0 1 0 0
## ISO_002 1 0 0 1
## ISO_010 1 1 1 1
## ISO_001 0 0 1 0
## ISO_004 1 1 1 1
## ISO_011 0 1 1 1
## ISO_012 0 1 0 1
## ISO_013 0 0 1 0
# Draw the resistance heatmap (0 = white, 1 = tomato).
# Rows keep the order set above (no row clustering); antibiotics (columns) are
# clustered using the "binary" distance. Country, Year and Species are shown
# as row annotations.
pheatmap(
  phenotype_matrix,
  # Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  annotation_row = annotation_matrix,
  display_numbers = FALSE,
  clustering_distance_cols = "binary",
  color = colorRampPalette(c("white", "tomato"))(100),
  main = "AMR Heatmap with Species, Countries and Antibiotics"
)