Task

  1. Data pre-processing
  2. Matrix Preparation
  3. Heatmap Visualization

Data Cleaning

#Import libraries 
library(conflicted)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
library(janitor)
library(pheatmap)
library(patchwork)
# Read the raw (uncleaned) AMR phenotype table from disk
raw_data <- read_csv(file = "data/session 1/amr_phenotypes_dirty.csv")
## Rows: 20 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): isolate_id, species, country, source, host, ciprofloxacin, ampicill...
## dbl (1): year
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the imported table for a first visual check
raw_data
# Summarise our data: per-column type, missing values and basic statistics.
# raw_data is passed directly (not piped) so the summary reports its name.
skimr::skim(raw_data)
Data summary
Name raw_data
Number of rows 20
Number of columns 10
_______________________
Column type frequency:
character 9
numeric 1
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
isolate_id 0 1.00 7 7 0 20 0
species 0 1.00 6 14 0 5 0
country 0 1.00 5 7 0 4 0
source 0 1.00 5 5 0 3 0
host 0 1.00 5 6 0 2 0
ciprofloxacin 1 0.95 1 3 0 3 0
ampicillin 2 0.90 1 3 0 3 0
tetracycline 2 0.90 1 3 0 3 0
colistin 0 1.00 1 1 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
year 0 1 2021 0.86 2020 2020 2021 2022 2022 ▇▁▇▁▇
# List the distinct species spellings found in the raw data
raw_data %>%
  pull(species) %>%
  unique()
## [1] "salmonella sp." "Klebsiella"     "Salmonella sp." "E. Coli"       
## [5] "e.coli"
# Clean our data ----
# Steps: standardise column names, normalise text case, harmonise species
# spellings, recode susceptibility results to numeric (R = 1, anything else = 0,
# missing / "NaN" text = NA) and keep only isolates with a complete profile.
clean_data <- raw_data %>%
  clean_names() %>%
  mutate(
    species = str_to_title(species),
    # Title-casing still leaves two spellings of E. coli ("E.coli" and
    # "E. Coli") and capitalises the "sp." abbreviation; map each variant to a
    # single canonical label so one organism is not split into two groups.
    species = case_match(
      species,
      c("E.coli", "E. Coli") ~ "E. coli",
      "Salmonella Sp." ~ "Salmonella sp.",
      .default = species
    ),
    country = str_to_title(country),
    host = str_to_title(host),
    # Upper-case first so "r"/"R" and "nan"/"NaN" are treated alike
    across(ciprofloxacin:colistin, ~ str_to_upper(.x)),
    across(
      ciprofloxacin:colistin,
      ~ case_when(
        is.na(.x) | .x == "NAN" ~ NA_real_,
        .x == "R" ~ 1,
        .default = 0
      )
    )
  ) %>%
  # Drop isolates lacking a result for any of the antibiotics
  drop_na(ciprofloxacin:colistin)

clean_data

Matrix Preparation

# Guarantee unique isolate IDs: make.unique() appends a numeric suffix to any
# repeated ID, so the IDs can later be used as row names
clean_data_unique <- clean_data %>%
  mutate(
    isolate_id = isolate_id %>%
      as.character() %>%
      make.unique()
  )

clean_data_unique

Heatmap Visualization

# Build the resistance matrix: one row per isolate (ID as row name),
# one column per antibiotic
phenotype_matrix <- as.matrix(
  column_to_rownames(
    select(clean_data_unique, isolate_id, ciprofloxacin:colistin),
    var = "isolate_id"
  )
)

phenotype_matrix
##         ciprofloxacin ampicillin tetracycline colistin
## ISO_001             0          0            1        0
## ISO_002             1          0            0        1
## ISO_003             1          0            0        1
## ISO_004             1          1            1        1
## ISO_006             0          1            0        0
## ISO_009             1          0            0        1
## ISO_010             1          1            1        1
## ISO_011             0          1            1        1
## ISO_012             0          1            0        1
## ISO_013             0          0            1        0
## ISO_018             0          0            1        0
## ISO_020             1          0            1        1
# Build the row-annotation data frame (Country, Year, Species) keyed by isolate
# ID; columns are renamed to capitalised labels while being selected
annotation_matrix <- clean_data_unique %>%
  select(
    isolate_id,
    Country = country,
    Year = year,
    Species = species
  ) %>%
  column_to_rownames("isolate_id")

annotation_matrix
# Sort the annotations by country, then by year within each country
annotation_matrix <- arrange(annotation_matrix, Country, Year)

annotation_matrix
# Reorder the phenotype matrix rows to match the annotation row order.
# drop = FALSE keeps the result a matrix even when only one isolate remains;
# without it a single row would collapse to a plain vector.
phenotype_matrix <- phenotype_matrix[rownames(annotation_matrix), , drop = FALSE]

phenotype_matrix
##         ciprofloxacin ampicillin tetracycline colistin
## ISO_018             0          0            1        0
## ISO_009             1          0            0        1
## ISO_020             1          0            1        1
## ISO_003             1          0            0        1
## ISO_006             0          1            0        0
## ISO_002             1          0            0        1
## ISO_010             1          1            1        1
## ISO_001             0          0            1        0
## ISO_004             1          1            1        1
## ISO_011             0          1            1        1
## ISO_012             0          1            0        1
## ISO_013             0          0            1        0
# Create a heatmap ----
# Rows (isolates) are not clustered, so they keep the order of phenotype_matrix;
# antibiotic columns are clustered using the binary distance on the 0/1 values.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
pheatmap(
  phenotype_matrix,
  cluster_rows = FALSE,
  cluster_cols = TRUE,
  annotation_row = annotation_matrix,
  display_numbers = FALSE,
  clustering_distance_cols = "binary",
  color = colorRampPalette(c("white", "tomato"))(100),
  main = "AMR Heatmap with Species, Countries and Antibiotics"
)