Load packages

pacman::p_load(
  rio,          # File import
  here,         # File locator
  tidyverse)    # data management + ggplot2 graphics

Wide to long

Import data

# wide to long
count_data <- import(here("data", "malaria_facility_count_data.rds"))
head(count_data)
## # A tibble: 6 × 10
##   location_name data_date  submitted_date Province District `malaria_rdt_0-4`
##   <chr>         <date>     <date>         <chr>    <chr>                <int>
## 1 Facility 1    2020-08-11 2020-08-12     North    Spring                  11
## 2 Facility 2    2020-08-11 2020-08-12     North    Bolo                    11
## 3 Facility 3    2020-08-11 2020-08-12     North    Dingo                    8
## 4 Facility 4    2020-08-11 2020-08-12     North    Bolo                    16
## 5 Facility 5    2020-08-11 2020-08-12     North    Bolo                     9
## 6 Facility 6    2020-08-11 2020-08-12     North    Dingo                    3
## # ℹ 4 more variables: `malaria_rdt_5-14` <int>, malaria_rdt_15 <int>,
## #   malaria_tot <int>, newid <int>

Each observation in this dataset refers to the malaria counts at one of 65 facilities on a given date, ranging from count_data$data_date %>% min() to count_data$data_date %>% max(). These facilities are located in one Province (North) and four Districts (Spring, Bolo, Dingo, and Barnard). The dataset provides the overall counts of malaria, as well as age-specific counts in each of three age groups - <4 years, 5-14 years, and 15 years and older.

Data with columns with same class

ggplot(count_data) +
  geom_col(aes(x = data_date, y = malaria_tot), width = 1)

Using pivot_longer()

# Option 1: manually select columns 
df_long <- count_data %>% 
  pivot_longer(
    cols = c(`malaria_rdt_0-4`, `malaria_rdt_5-14`, malaria_rdt_15, malaria_tot)  # select 4 age group columns 
  )

df_long
## # A tibble: 12,152 × 8
##    location_name data_date  submitted_date Province District newid name    value
##    <chr>         <date>     <date>         <chr>    <chr>    <int> <chr>   <int>
##  1 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    11
##  2 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    12
##  3 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    23
##  4 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    46
##  5 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    11
##  6 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    10
##  7 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…     5
##  8 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    26
##  9 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     8
## 10 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     5
## # ℹ 12,142 more rows
# Option 2: The columns we want to select starting with "malaria_" so we can use starts_with() to select colums
df_long2 <- count_data %>% 
  pivot_longer(
    cols = starts_with("malaria_"))  # select 4 age group columns 

df_long2
## # A tibble: 12,152 × 8
##    location_name data_date  submitted_date Province District newid name    value
##    <chr>         <date>     <date>         <chr>    <chr>    <int> <chr>   <int>
##  1 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    11
##  2 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    12
##  3 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    23
##  4 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    46
##  5 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    11
##  6 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    10
##  7 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…     5
##  8 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    26
##  9 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     8
## 10 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     5
## # ℹ 12,142 more rows
# Option 3: Select colums by position
df_long3 <- count_data %>% 
  pivot_longer(
    cols = 6:9)  # select 4 age group columns 

df_long3
## # A tibble: 12,152 × 8
##    location_name data_date  submitted_date Province District newid name    value
##    <chr>         <date>     <date>         <chr>    <chr>    <int> <chr>   <int>
##  1 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    11
##  2 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    12
##  3 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    23
##  4 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    46
##  5 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    11
##  6 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    10
##  7 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…     5
##  8 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    26
##  9 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     8
## 10 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     5
## # ℹ 12,142 more rows
# Option 4: Select colums by named range (provide range of consecutive columns)

df_long4 <- count_data %>% 
  pivot_longer(
    cols = `malaria_rdt_0-4`:malaria_tot)  # select 4 age group columns 

df_long4
## # A tibble: 12,152 × 8
##    location_name data_date  submitted_date Province District newid name    value
##    <chr>         <date>     <date>         <chr>    <chr>    <int> <chr>   <int>
##  1 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    11
##  2 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    12
##  3 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    23
##  4 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malari…    46
##  5 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    11
##  6 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    10
##  7 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…     5
##  8 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malari…    26
##  9 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     8
## 10 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malari…     5
## # ℹ 12,142 more rows

These two new columns are given the default names of name and value, but we can override these defaults to provide more meaningful names:

df_long <- 
  count_data %>% 
  pivot_longer(
    cols = starts_with("malaria_"),
    names_to = "age_group",
    values_to = "counts"
  )

df_long
## # A tibble: 12,152 × 8
##    location_name data_date  submitted_date Province District newid age_group    
##    <chr>         <date>     <date>         <chr>    <chr>    <int> <chr>        
##  1 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malaria_rdt_…
##  2 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malaria_rdt_…
##  3 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malaria_rdt_…
##  4 Facility 1    2020-08-11 2020-08-12     North    Spring       1 malaria_tot  
##  5 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malaria_rdt_…
##  6 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malaria_rdt_…
##  7 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malaria_rdt_…
##  8 Facility 2    2020-08-11 2020-08-12     North    Bolo         2 malaria_tot  
##  9 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malaria_rdt_…
## 10 Facility 3    2020-08-11 2020-08-12     North    Dingo        3 malaria_rdt_…
## # ℹ 12,142 more rows
## # ℹ 1 more variable: counts <int>
df_long %>% filter(age_group != "malaria_tot") %>% 
ggplot() +
  geom_col(
    mapping = aes(x    = data_date,
                  y    = counts,
                  fill = age_group),
    width = 1
  )
## Warning: Removed 2058 rows containing missing values (`position_stack()`).

Data with multiple classes

Each columns has different classes (messy)

df %>% 
  pivot_longer(
    cols = -id,
    names_to = c("observation")
  )

## # A tibble: 18 × 3
##    id    observation value     
##    <chr> <chr>       <chr>     
##  1 A     obs1_date   2021-04-23
##  2 A     obs1_status Healthy   
##  3 A     obs2_date   2021-04-24
##  4 A     obs2_status Healthy   
##  5 A     obs3_date   2021-04-25
##  6 A     obs3_status Unwell    
##  7 B     obs1_date   2021-04-23
##  8 B     obs1_status Healthy   
##  9 B     obs2_date   2021-04-24
## 10 B     obs2_status Healthy   
## 11 B     obs3_date   2021-04-25
## 12 B     obs3_status Healthy   
## 13 C     obs1_date   2021-04-23
## 14 C     obs1_status Missing   
## 15 C     obs2_date   2021-04-24
## 16 C     obs2_status Healthy   
## 17 C     obs3_date   2021-04-25
## 18 C     obs3_status Healthy
df_long <- 
  df %>% 
  pivot_longer(
    cols = -id,
    names_to = c("observation", ".value"),
    names_sep = "_"
  )

df_long

## # A tibble: 9 × 4
##   id    observation date       status 
##   <chr> <chr>       <chr>      <chr>  
## 1 A     obs1        2021-04-23 Healthy
## 2 A     obs2        2021-04-24 Healthy
## 3 A     obs3        2021-04-25 Unwell 
## 4 B     obs1        2021-04-23 Healthy
## 5 B     obs2        2021-04-24 Healthy
## 6 B     obs3        2021-04-25 Healthy
## 7 C     obs1        2021-04-23 Missing
## 8 C     obs2        2021-04-24 Healthy
## 9 C     obs3        2021-04-25 Healthy

Note that the date column is currently in character class - we can easily convert this into it’s proper date class using the mutate() and as_date() functions.

We may also want to convert the observation column to a numeric format by dropping the “obs” prefix and converting to numeric. We cando this with str_remove_all() from the stringr package.

df_long <- 
  df_long %>% 
  mutate(
    date = date %>% lubridate::as_date(),
    observation =
      observation %>% 
      str_remove_all("obs") %>% 
      as.numeric()
  )

df_long 

Long to wide

Import data

linelist <- import(here("data", "linelist_cleaned.xlsx"))
head(linelist)
##   case_id generation date_infection date_onset date_hospitalisation
## 1  5fe599          4     2014-05-08 2014-05-13           2014-05-15
## 2  8689b7          4           <NA> 2014-05-13           2014-05-14
## 3  11f8ea          2           <NA> 2014-05-16           2014-05-18
## 4  b8812a          3     2014-05-04 2014-05-18           2014-05-20
## 5  893f25          3     2014-05-18 2014-05-21           2014-05-22
## 6  be99c8          3     2014-05-03 2014-05-22           2014-05-23
##   date_outcome outcome gender age age_unit age_years age_cat age_cat5
## 1         <NA>    <NA>      m   2    years         2     0-4      0-4
## 2   2014-05-18 Recover      f   3    years         3     0-4      0-4
## 3   2014-05-30 Recover      m  56    years        56   50-69    55-59
## 4         <NA>    <NA>      f  18    years        18   15-19    15-19
## 5   2014-05-29 Recover      m   3    years         3     0-4      0-4
## 6   2014-05-24 Recover      f  16    years        16   15-19    15-19
##                               hospital       lon      lat infector source wt_kg
## 1                                Other -13.21574 8.468973   f547d6  other    27
## 2                              Missing -13.21523 8.451719     <NA>   <NA>    25
## 3 St. Mark's Maternity Hospital (SMMH) -13.21291 8.464817     <NA>   <NA>    91
## 4                        Port Hospital -13.23637 8.475476   f90f5f  other    41
## 5                    Military Hospital -13.22286 8.460824   11f8ea  other    36
## 6                        Port Hospital -13.22263 8.461831   aec8ec  other    56
##   ht_cm ct_blood fever chills cough aches vomit temp time_admission       bmi
## 1    48       22    no     no   yes    no   yes 36.8           <NA> 117.18750
## 2    59       22  <NA>   <NA>  <NA>  <NA>  <NA> 36.9          09:36  71.81844
## 3   238       21  <NA>   <NA>  <NA>  <NA>  <NA> 36.9          16:48  16.06525
## 4   135       23    no     no    no    no    no 36.8          11:22  22.49657
## 5    71       23    no     no   yes    no   yes 36.9          12:60  71.41440
## 6   116       21    no     no   yes    no   yes 37.6          14:13  41.61712
##   days_onset_hosp
## 1               2
## 2               1
## 3               2
## 4               2
## 5               1
## 6               1
df_wide <- 
  linelist %>% 
  count(age_cat, gender)

df_wide
##    age_cat gender   n
## 1      0-4      f 640
## 2      0-4      m 416
## 3      0-4   <NA>  39
## 4    10-14      f 518
## 5    10-14      m 383
## 6    10-14   <NA>  40
## 7    15-19      f 359
## 8    15-19      m 364
## 9    15-19   <NA>  20
## 10   20-29      f 468
## 11   20-29      m 575
## 12   20-29   <NA>  30
## 13   30-49      f 179
## 14   30-49      m 557
## 15   30-49   <NA>  18
## 16     5-9      f 641
## 17     5-9      m 412
## 18     5-9   <NA>  42
## 19   50-69      f   2
## 20   50-69      m  91
## 21   50-69   <NA>   2
## 22     70+      m   5
## 23     70+   <NA>   1
## 24    <NA>   <NA>  86
ggplot(df_wide) +
  geom_col(aes(x = age_cat, y= n, fill = gender))

table_wide <-
  df_wide %>% 
  pivot_wider(
    id_cols = age_cat,
    names_from = gender,
    values_from = n
  )

table_wide
## # A tibble: 9 × 4
##   age_cat     f     m  `NA`
##   <chr>   <int> <int> <int>
## 1 0-4       640   416    39
## 2 10-14     518   383    40
## 3 15-19     359   364    20
## 4 20-29     468   575    30
## 5 30-49     179   557    18
## 6 5-9       641   412    42
## 7 50-69       2    91     2
## 8 70+        NA     5     1
## 9 <NA>       NA    NA    86

Filter

df1 <- 
  tibble::tribble(
       ~Measurement, ~Facility, ~Cases,
                  1,  "Hosp 1",     66,
                  2,  "Hosp 1",     26,
                  3,  "Hosp 1",      8,
                  1,  "Hosp 2",     71,
                  2,  "Hosp 2",     62,
                  3,  "Hosp 2",     70,
                  1,  "Hosp 3",     47,
                  2,  "Hosp 3",     70,
                  3,  "Hosp 3",     38,
       )

df1 
## # A tibble: 9 × 3
##   Measurement Facility Cases
##         <dbl> <chr>    <dbl>
## 1           1 Hosp 1      66
## 2           2 Hosp 1      26
## 3           3 Hosp 1       8
## 4           1 Hosp 2      71
## 5           2 Hosp 2      62
## 6           3 Hosp 2      70
## 7           1 Hosp 3      47
## 8           2 Hosp 3      70
## 9           3 Hosp 3      38
df2 <- 
  tibble::tribble(
    ~Year, ~Measurement, ~Facility, ~Cases,
     2000,            1,  "Hosp 4",     82,
     2001,            2,  "Hosp 4",     87,
     2002,            3,  "Hosp 4",     46
  )

df2
## # A tibble: 3 × 4
##    Year Measurement Facility Cases
##   <dbl>       <dbl> <chr>    <dbl>
## 1  2000           1 Hosp 4      82
## 2  2001           2 Hosp 4      87
## 3  2002           3 Hosp 4      46
df_combined <- 
  bind_rows(df1, df2) %>% 
  arrange(Measurement, Facility)

df_combined
## # A tibble: 12 × 4
##    Measurement Facility Cases  Year
##          <dbl> <chr>    <dbl> <dbl>
##  1           1 Hosp 1      66    NA
##  2           1 Hosp 2      71    NA
##  3           1 Hosp 3      47    NA
##  4           1 Hosp 4      82  2000
##  5           2 Hosp 1      26    NA
##  6           2 Hosp 2      62    NA
##  7           2 Hosp 3      70    NA
##  8           2 Hosp 4      87  2001
##  9           3 Hosp 1       8    NA
## 10           3 Hosp 2      70    NA
## 11           3 Hosp 3      38    NA
## 12           3 Hosp 4      46  2002
df_combined %>% fill(Year, .direction = "up")
## # A tibble: 12 × 4
##    Measurement Facility Cases  Year
##          <dbl> <chr>    <dbl> <dbl>
##  1           1 Hosp 1      66  2000
##  2           1 Hosp 2      71  2000
##  3           1 Hosp 3      47  2000
##  4           1 Hosp 4      82  2000
##  5           2 Hosp 1      26  2001
##  6           2 Hosp 2      62  2001
##  7           2 Hosp 3      70  2001
##  8           2 Hosp 4      87  2001
##  9           3 Hosp 1       8  2002
## 10           3 Hosp 2      70  2002
## 11           3 Hosp 3      38  2002
## 12           3 Hosp 4      46  2002
df_combined <- 
  df_combined %>% 
  arrange(Measurement, desc(Facility)) 

df_combined
## # A tibble: 12 × 4
##    Measurement Facility Cases  Year
##          <dbl> <chr>    <dbl> <dbl>
##  1           1 Hosp 4      82  2000
##  2           1 Hosp 3      47    NA
##  3           1 Hosp 2      71    NA
##  4           1 Hosp 1      66    NA
##  5           2 Hosp 4      87  2001
##  6           2 Hosp 3      70    NA
##  7           2 Hosp 2      62    NA
##  8           2 Hosp 1      26    NA
##  9           3 Hosp 4      46  2002
## 10           3 Hosp 3      38    NA
## 11           3 Hosp 2      70    NA
## 12           3 Hosp 1       8    NA
df_combined <- 
  df_combined %>% 
  fill(Year, .direction = "down")

df_combined
## # A tibble: 12 × 4
##    Measurement Facility Cases  Year
##          <dbl> <chr>    <dbl> <dbl>
##  1           1 Hosp 4      82  2000
##  2           1 Hosp 3      47  2000
##  3           1 Hosp 2      71  2000
##  4           1 Hosp 1      66  2000
##  5           2 Hosp 4      87  2001
##  6           2 Hosp 3      70  2001
##  7           2 Hosp 2      62  2001
##  8           2 Hosp 1      26  2001
##  9           3 Hosp 4      46  2002
## 10           3 Hosp 3      38  2002
## 11           3 Hosp 2      70  2002
## 12           3 Hosp 1       8  2002
ggplot(df_combined) +
  aes(Year, Cases, fill = Facility) +
  geom_col()

References

  1. https://epirhandbook.com/en/pivoting-data.html;