pacman::p_load(
rio, # File import
here, # File locator
tidyverse) # data management + ggplot2 graphics
# wide to long
count_data <- import(here("data", "malaria_facility_count_data.rds"))
head(count_data)
## # A tibble: 6 × 10
## location_name data_date submitted_date Province District `malaria_rdt_0-4`
## <chr> <date> <date> <chr> <chr> <int>
## 1 Facility 1 2020-08-11 2020-08-12 North Spring 11
## 2 Facility 2 2020-08-11 2020-08-12 North Bolo 11
## 3 Facility 3 2020-08-11 2020-08-12 North Dingo 8
## 4 Facility 4 2020-08-11 2020-08-12 North Bolo 16
## 5 Facility 5 2020-08-11 2020-08-12 North Bolo 9
## 6 Facility 6 2020-08-11 2020-08-12 North Dingo 3
## # ℹ 4 more variables: `malaria_rdt_5-14` <int>, malaria_rdt_15 <int>,
## # malaria_tot <int>, newid <int>
Each observation in this dataset refers to the malaria counts at one
of 65 facilities on a given date, ranging from
count_data$data_date %>% min()
to
count_data$data_date %>% max()
. These facilities are
located in one Province (North)
and four
Districts
(Spring, Bolo, Dingo, and Barnard). The dataset
provides the overall counts of malaria, as well as
age-specific
counts in each of three age groups
- <4 years
, 5-14 years
, and
15 years and older
.
ggplot(count_data) +
geom_col(aes(x = data_date, y = malaria_tot), width = 1)
Using pivot_longer()
# Option 1: manually select columns
df_long <- count_data %>%
pivot_longer(
cols = c(`malaria_rdt_0-4`, `malaria_rdt_5-14`, malaria_rdt_15, malaria_tot) # select 4 age group columns
)
df_long
## # A tibble: 12,152 × 8
## location_name data_date submitted_date Province District newid name value
## <chr> <date> <date> <chr> <chr> <int> <chr> <int>
## 1 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 11
## 2 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 12
## 3 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 23
## 4 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 46
## 5 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 11
## 6 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 10
## 7 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 5
## 8 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 26
## 9 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 8
## 10 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 5
## # ℹ 12,142 more rows
# Option 2: The columns we want to select starting with "malaria_" so we can use starts_with() to select colums
df_long2 <- count_data %>%
pivot_longer(
cols = starts_with("malaria_")) # select 4 age group columns
df_long2
## # A tibble: 12,152 × 8
## location_name data_date submitted_date Province District newid name value
## <chr> <date> <date> <chr> <chr> <int> <chr> <int>
## 1 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 11
## 2 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 12
## 3 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 23
## 4 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 46
## 5 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 11
## 6 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 10
## 7 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 5
## 8 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 26
## 9 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 8
## 10 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 5
## # ℹ 12,142 more rows
# Option 3: Select colums by position
df_long3 <- count_data %>%
pivot_longer(
cols = 6:9) # select 4 age group columns
df_long3
## # A tibble: 12,152 × 8
## location_name data_date submitted_date Province District newid name value
## <chr> <date> <date> <chr> <chr> <int> <chr> <int>
## 1 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 11
## 2 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 12
## 3 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 23
## 4 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 46
## 5 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 11
## 6 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 10
## 7 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 5
## 8 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 26
## 9 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 8
## 10 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 5
## # ℹ 12,142 more rows
# Option 4: Select colums by named range (provide range of consecutive columns)
df_long4 <- count_data %>%
pivot_longer(
cols = `malaria_rdt_0-4`:malaria_tot) # select 4 age group columns
df_long4
## # A tibble: 12,152 × 8
## location_name data_date submitted_date Province District newid name value
## <chr> <date> <date> <chr> <chr> <int> <chr> <int>
## 1 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 11
## 2 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 12
## 3 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 23
## 4 Facility 1 2020-08-11 2020-08-12 North Spring 1 malari… 46
## 5 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 11
## 6 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 10
## 7 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 5
## 8 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malari… 26
## 9 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 8
## 10 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malari… 5
## # ℹ 12,142 more rows
These two new columns are given the default names of name and value, but we can override these defaults to provide more meaningful names:
df_long <-
count_data %>%
pivot_longer(
cols = starts_with("malaria_"),
names_to = "age_group",
values_to = "counts"
)
df_long
## # A tibble: 12,152 × 8
## location_name data_date submitted_date Province District newid age_group
## <chr> <date> <date> <chr> <chr> <int> <chr>
## 1 Facility 1 2020-08-11 2020-08-12 North Spring 1 malaria_rdt_…
## 2 Facility 1 2020-08-11 2020-08-12 North Spring 1 malaria_rdt_…
## 3 Facility 1 2020-08-11 2020-08-12 North Spring 1 malaria_rdt_…
## 4 Facility 1 2020-08-11 2020-08-12 North Spring 1 malaria_tot
## 5 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malaria_rdt_…
## 6 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malaria_rdt_…
## 7 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malaria_rdt_…
## 8 Facility 2 2020-08-11 2020-08-12 North Bolo 2 malaria_tot
## 9 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malaria_rdt_…
## 10 Facility 3 2020-08-11 2020-08-12 North Dingo 3 malaria_rdt_…
## # ℹ 12,142 more rows
## # ℹ 1 more variable: counts <int>
df_long %>% filter(age_group != "malaria_tot") %>%
ggplot() +
geom_col(
mapping = aes(x = data_date,
y = counts,
fill = age_group),
width = 1
)
## Warning: Removed 2058 rows containing missing values (`position_stack()`).
Each columns has different classes (messy)
df %>%
pivot_longer(
cols = -id,
names_to = c("observation")
)
## # A tibble: 18 × 3
## id observation value
## <chr> <chr> <chr>
## 1 A obs1_date 2021-04-23
## 2 A obs1_status Healthy
## 3 A obs2_date 2021-04-24
## 4 A obs2_status Healthy
## 5 A obs3_date 2021-04-25
## 6 A obs3_status Unwell
## 7 B obs1_date 2021-04-23
## 8 B obs1_status Healthy
## 9 B obs2_date 2021-04-24
## 10 B obs2_status Healthy
## 11 B obs3_date 2021-04-25
## 12 B obs3_status Healthy
## 13 C obs1_date 2021-04-23
## 14 C obs1_status Missing
## 15 C obs2_date 2021-04-24
## 16 C obs2_status Healthy
## 17 C obs3_date 2021-04-25
## 18 C obs3_status Healthy
df_long <-
df %>%
pivot_longer(
cols = -id,
names_to = c("observation", ".value"),
names_sep = "_"
)
df_long
## # A tibble: 9 × 4
## id observation date status
## <chr> <chr> <chr> <chr>
## 1 A obs1 2021-04-23 Healthy
## 2 A obs2 2021-04-24 Healthy
## 3 A obs3 2021-04-25 Unwell
## 4 B obs1 2021-04-23 Healthy
## 5 B obs2 2021-04-24 Healthy
## 6 B obs3 2021-04-25 Healthy
## 7 C obs1 2021-04-23 Missing
## 8 C obs2 2021-04-24 Healthy
## 9 C obs3 2021-04-25 Healthy
Note that the date column is currently in character
class - we can easily convert this into it’s proper date class using the
mutate() and as_date()
functions.
We may also want to convert the observation
column to a
numeric format by dropping the “obs” prefix and converting to numeric.
We cando this with str_remove_all() from the stringr package.
df_long <-
df_long %>%
mutate(
date = date %>% lubridate::as_date(),
observation =
observation %>%
str_remove_all("obs") %>%
as.numeric()
)
df_long
linelist <- import(here("data", "linelist_cleaned.xlsx"))
head(linelist)
## case_id generation date_infection date_onset date_hospitalisation
## 1 5fe599 4 2014-05-08 2014-05-13 2014-05-15
## 2 8689b7 4 <NA> 2014-05-13 2014-05-14
## 3 11f8ea 2 <NA> 2014-05-16 2014-05-18
## 4 b8812a 3 2014-05-04 2014-05-18 2014-05-20
## 5 893f25 3 2014-05-18 2014-05-21 2014-05-22
## 6 be99c8 3 2014-05-03 2014-05-22 2014-05-23
## date_outcome outcome gender age age_unit age_years age_cat age_cat5
## 1 <NA> <NA> m 2 years 2 0-4 0-4
## 2 2014-05-18 Recover f 3 years 3 0-4 0-4
## 3 2014-05-30 Recover m 56 years 56 50-69 55-59
## 4 <NA> <NA> f 18 years 18 15-19 15-19
## 5 2014-05-29 Recover m 3 years 3 0-4 0-4
## 6 2014-05-24 Recover f 16 years 16 15-19 15-19
## hospital lon lat infector source wt_kg
## 1 Other -13.21574 8.468973 f547d6 other 27
## 2 Missing -13.21523 8.451719 <NA> <NA> 25
## 3 St. Mark's Maternity Hospital (SMMH) -13.21291 8.464817 <NA> <NA> 91
## 4 Port Hospital -13.23637 8.475476 f90f5f other 41
## 5 Military Hospital -13.22286 8.460824 11f8ea other 36
## 6 Port Hospital -13.22263 8.461831 aec8ec other 56
## ht_cm ct_blood fever chills cough aches vomit temp time_admission bmi
## 1 48 22 no no yes no yes 36.8 <NA> 117.18750
## 2 59 22 <NA> <NA> <NA> <NA> <NA> 36.9 09:36 71.81844
## 3 238 21 <NA> <NA> <NA> <NA> <NA> 36.9 16:48 16.06525
## 4 135 23 no no no no no 36.8 11:22 22.49657
## 5 71 23 no no yes no yes 36.9 12:60 71.41440
## 6 116 21 no no yes no yes 37.6 14:13 41.61712
## days_onset_hosp
## 1 2
## 2 1
## 3 2
## 4 2
## 5 1
## 6 1
df_wide <-
linelist %>%
count(age_cat, gender)
df_wide
## age_cat gender n
## 1 0-4 f 640
## 2 0-4 m 416
## 3 0-4 <NA> 39
## 4 10-14 f 518
## 5 10-14 m 383
## 6 10-14 <NA> 40
## 7 15-19 f 359
## 8 15-19 m 364
## 9 15-19 <NA> 20
## 10 20-29 f 468
## 11 20-29 m 575
## 12 20-29 <NA> 30
## 13 30-49 f 179
## 14 30-49 m 557
## 15 30-49 <NA> 18
## 16 5-9 f 641
## 17 5-9 m 412
## 18 5-9 <NA> 42
## 19 50-69 f 2
## 20 50-69 m 91
## 21 50-69 <NA> 2
## 22 70+ m 5
## 23 70+ <NA> 1
## 24 <NA> <NA> 86
ggplot(df_wide) +
geom_col(aes(x = age_cat, y= n, fill = gender))
table_wide <-
df_wide %>%
pivot_wider(
id_cols = age_cat,
names_from = gender,
values_from = n
)
table_wide
## # A tibble: 9 × 4
## age_cat f m `NA`
## <chr> <int> <int> <int>
## 1 0-4 640 416 39
## 2 10-14 518 383 40
## 3 15-19 359 364 20
## 4 20-29 468 575 30
## 5 30-49 179 557 18
## 6 5-9 641 412 42
## 7 50-69 2 91 2
## 8 70+ NA 5 1
## 9 <NA> NA NA 86
df1 <-
tibble::tribble(
~Measurement, ~Facility, ~Cases,
1, "Hosp 1", 66,
2, "Hosp 1", 26,
3, "Hosp 1", 8,
1, "Hosp 2", 71,
2, "Hosp 2", 62,
3, "Hosp 2", 70,
1, "Hosp 3", 47,
2, "Hosp 3", 70,
3, "Hosp 3", 38,
)
df1
## # A tibble: 9 × 3
## Measurement Facility Cases
## <dbl> <chr> <dbl>
## 1 1 Hosp 1 66
## 2 2 Hosp 1 26
## 3 3 Hosp 1 8
## 4 1 Hosp 2 71
## 5 2 Hosp 2 62
## 6 3 Hosp 2 70
## 7 1 Hosp 3 47
## 8 2 Hosp 3 70
## 9 3 Hosp 3 38
df2 <-
tibble::tribble(
~Year, ~Measurement, ~Facility, ~Cases,
2000, 1, "Hosp 4", 82,
2001, 2, "Hosp 4", 87,
2002, 3, "Hosp 4", 46
)
df2
## # A tibble: 3 × 4
## Year Measurement Facility Cases
## <dbl> <dbl> <chr> <dbl>
## 1 2000 1 Hosp 4 82
## 2 2001 2 Hosp 4 87
## 3 2002 3 Hosp 4 46
df_combined <-
bind_rows(df1, df2) %>%
arrange(Measurement, Facility)
df_combined
## # A tibble: 12 × 4
## Measurement Facility Cases Year
## <dbl> <chr> <dbl> <dbl>
## 1 1 Hosp 1 66 NA
## 2 1 Hosp 2 71 NA
## 3 1 Hosp 3 47 NA
## 4 1 Hosp 4 82 2000
## 5 2 Hosp 1 26 NA
## 6 2 Hosp 2 62 NA
## 7 2 Hosp 3 70 NA
## 8 2 Hosp 4 87 2001
## 9 3 Hosp 1 8 NA
## 10 3 Hosp 2 70 NA
## 11 3 Hosp 3 38 NA
## 12 3 Hosp 4 46 2002
df_combined %>% fill(Year, .direction = "up")
## # A tibble: 12 × 4
## Measurement Facility Cases Year
## <dbl> <chr> <dbl> <dbl>
## 1 1 Hosp 1 66 2000
## 2 1 Hosp 2 71 2000
## 3 1 Hosp 3 47 2000
## 4 1 Hosp 4 82 2000
## 5 2 Hosp 1 26 2001
## 6 2 Hosp 2 62 2001
## 7 2 Hosp 3 70 2001
## 8 2 Hosp 4 87 2001
## 9 3 Hosp 1 8 2002
## 10 3 Hosp 2 70 2002
## 11 3 Hosp 3 38 2002
## 12 3 Hosp 4 46 2002
df_combined <-
df_combined %>%
arrange(Measurement, desc(Facility))
df_combined
## # A tibble: 12 × 4
## Measurement Facility Cases Year
## <dbl> <chr> <dbl> <dbl>
## 1 1 Hosp 4 82 2000
## 2 1 Hosp 3 47 NA
## 3 1 Hosp 2 71 NA
## 4 1 Hosp 1 66 NA
## 5 2 Hosp 4 87 2001
## 6 2 Hosp 3 70 NA
## 7 2 Hosp 2 62 NA
## 8 2 Hosp 1 26 NA
## 9 3 Hosp 4 46 2002
## 10 3 Hosp 3 38 NA
## 11 3 Hosp 2 70 NA
## 12 3 Hosp 1 8 NA
df_combined <-
df_combined %>%
fill(Year, .direction = "down")
df_combined
## # A tibble: 12 × 4
## Measurement Facility Cases Year
## <dbl> <chr> <dbl> <dbl>
## 1 1 Hosp 4 82 2000
## 2 1 Hosp 3 47 2000
## 3 1 Hosp 2 71 2000
## 4 1 Hosp 1 66 2000
## 5 2 Hosp 4 87 2001
## 6 2 Hosp 3 70 2001
## 7 2 Hosp 2 62 2001
## 8 2 Hosp 1 26 2001
## 9 3 Hosp 4 46 2002
## 10 3 Hosp 3 38 2002
## 11 3 Hosp 2 70 2002
## 12 3 Hosp 1 8 2002
ggplot(df_combined) +
aes(Year, Cases, fill = Facility) +
geom_col()