First, I’ll create some fake datasets that hopefully work to stand in for your dataset.
# first, these are remediation types for the "by date" dataset you have
remediation_types1 <- c("replace", "clean", "chemical", "drip")
# next, these are remediation types for the "by outlet" dataset you have --
# "DEC" is our "target" that we want to find and replace eventually
remediation_types2 <- c("replace", "DEC", "clean", "chemical", "drip")
set.seed(3.14) # set.seed so random samples are reproducible
# create a fake "by date" dataset
df_dummy <- tibble(
outlet_id = sample(seq(10), 50, replace = TRUE),
date = sample(seq(as.Date('2019/01/01'), as.Date('2021/01/01'), by="day"), 50),
remediation1 = sample(remediation_types1, 50, replace = TRUE)
)
head(df_dummy, 50)
## # A tibble: 50 x 3
## outlet_id date remediation1
## <int> <date> <chr>
## 1 5 2019-07-16 chemical
## 2 10 2020-10-01 replace
## 3 7 2020-12-10 clean
## 4 4 2019-06-20 replace
## 5 10 2020-05-04 drip
## 6 8 2020-07-01 clean
## 7 8 2019-11-30 drip
## 8 4 2019-03-14 drip
## 9 10 2019-07-14 replace
## 10 7 2019-07-23 replace
## # … with 40 more rows
# create a fake "by outlet" dataset
df_intermediate <- tibble(
outlet_id = sample(seq(10)),
date = sample(seq(as.Date('2019/01/01'), as.Date('2021/01/01'), by="day"), 10),
remediation2 = sample(remediation_types2, 10, replace = TRUE)
)
head(df_intermediate, 10)
## # A tibble: 10 x 3
## outlet_id date remediation2
## <int> <date> <chr>
## 1 1 2020-10-24 chemical
## 2 4 2020-05-23 clean
## 3 2 2019-04-27 DEC
## 4 8 2019-04-12 clean
## 5 3 2020-09-28 drip
## 6 10 2020-05-30 replace
## 7 7 2019-09-04 drip
## 8 9 2019-09-15 clean
## 9 6 2019-10-23 replace
## 10 5 2020-11-06 clean
Your goal, as I understand, is to find any place in df_intermediate (i.e. your “by outlet”) where the value of remediation2 is “DEC” (i.e. your target value), and then replace that value with the latest value of remediation1 from the df_dummy (i.e. “by date”) data frame for that outlet_id.
First, we need to find the latest (latest date) value of remediation1 for each outlet_id.
latest_remediation <- df_dummy %>%
group_by(outlet_id) %>%
filter(date == max(date)) %>%
select(outlet_id, remediation1)
head(latest_remediation, 10)
## # A tibble: 10 x 2
## # Groups: outlet_id [10]
## outlet_id remediation1
## <int> <chr>
## 1 10 replace
## 2 7 clean
## 3 8 chemical
## 4 5 replace
## 5 2 replace
## 6 6 clean
## 7 3 chemical
## 8 1 replace
## 9 4 drip
## 10 9 replace
We now only have ONE value for each outlet_id in latest_remediation.
We can join df_intermediate with latest_remediation.
df_intermediate <- df_intermediate %>%
left_join(latest_remediation, by = "outlet_id")
head(df_intermediate, 10)
## # A tibble: 10 x 4
## outlet_id date remediation2 remediation1
## <int> <date> <chr> <chr>
## 1 1 2020-10-24 chemical replace
## 2 4 2020-05-23 clean drip
## 3 2 2019-04-27 DEC replace
## 4 8 2019-04-12 clean chemical
## 5 3 2020-09-28 drip chemical
## 6 10 2020-05-30 replace replace
## 7 7 2019-09-04 drip clean
## 8 9 2019-09-15 clean replace
## 9 6 2019-10-23 replace clean
## 10 5 2020-11-06 clean replace
Now we can do our mutate() to replace the values where necessary.
df_final <- df_intermediate %>%
group_by(outlet_id) %>%
mutate(remediation_prime = case_when(
remediation2 == "DEC" ~ remediation1,
TRUE ~ as.character(remediation2)
))
head(df_final, 10)
## # A tibble: 10 x 5
## # Groups: outlet_id [10]
## outlet_id date remediation2 remediation1 remediation_prime
## <int> <date> <chr> <chr> <chr>
## 1 1 2020-10-24 chemical replace chemical
## 2 4 2020-05-23 clean drip clean
## 3 2 2019-04-27 DEC replace replace
## 4 8 2019-04-12 clean chemical clean
## 5 3 2020-09-28 drip chemical drip
## 6 10 2020-05-30 replace replace replace
## 7 7 2019-09-04 drip clean drip
## 8 9 2019-09-15 clean replace clean
## 9 6 2019-10-23 replace clean replace
## 10 5 2020-11-06 clean replace clean
Or, if you want to get rid of the now “extra” remediation columns:
df_final <- df_final %>%
select(-remediation1, -remediation2) # if you want to get rid of the old columns
head(df_final, 10)
## # A tibble: 10 x 3
## # Groups: outlet_id [10]
## outlet_id date remediation_prime
## <int> <date> <chr>
## 1 1 2020-10-24 chemical
## 2 4 2020-05-23 clean
## 3 2 2019-04-27 replace
## 4 8 2019-04-12 clean
## 5 3 2020-09-28 drip
## 6 10 2020-05-30 replace
## 7 7 2019-09-04 drip
## 8 9 2019-09-15 clean
## 9 6 2019-10-23 replace
## 10 5 2020-11-06 clean