# Load libraries
library(tidyverse)
First we’ll create some random data
# Create df1
df1 <- data.frame(id = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4),
date = c(1, 2, 4, 5, 2, 3, 6, 2, 4, 6, 7, 8, 2, 6, 7),
water = round(rnorm(n = 15, mean = .5, sd = .1), 2))
# Create Df2
df2 <- data.frame(id = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4),
date = c(1, 4, 5, 8, 1, 2, 7, 1, 2, 3, 5, 8, 2, 3, 7),
compliance = round(rnorm(n = 15, mean = .5, sd = .1), 2))
Here are the results:
knitr::kable(df1)
| id | date | water |
|---|---|---|
| 1 | 1 | 0.54 |
| 1 | 2 | 0.48 |
| 1 | 4 | 0.54 |
| 1 | 5 | 0.42 |
| 2 | 2 | 0.47 |
| 2 | 3 | 0.33 |
| 2 | 6 | 0.38 |
| 3 | 2 | 0.51 |
| 3 | 4 | 0.52 |
| 3 | 6 | 0.59 |
| 3 | 7 | 0.61 |
| 3 | 8 | 0.50 |
| 4 | 2 | 0.48 |
| 4 | 6 | 0.58 |
| 4 | 7 | 0.48 |
knitr::kable(df2)
| id | date | compliance |
|---|---|---|
| 1 | 1 | 0.61 |
| 1 | 4 | 0.37 |
| 1 | 5 | 0.37 |
| 1 | 8 | 0.35 |
| 2 | 1 | 0.43 |
| 2 | 2 | 0.69 |
| 2 | 7 | 0.47 |
| 3 | 1 | 0.54 |
| 3 | 2 | 0.57 |
| 3 | 3 | 0.58 |
| 3 | 5 | 0.62 |
| 3 | 8 | 0.59 |
| 4 | 2 | 0.39 |
| 4 | 3 | 0.42 |
| 4 | 7 | 0.61 |
Our goal is to add matching values of compliance in df2 to df1 using a ‘fuzzy’ matching rule.
First, we’ll add some dummy columns to df1 and df2.
# First, add compliance = NA column to df 1
df1 <- df1 %>%
mutate(
compliance = NA,
compliance_matched = 0,
compliance_datediff = NA
)
# Now add matched = 0 column to df2
df2 <- df2 %>%
mutate(
compliance_matched = 0
)
Here’s what we’ll do
Loop 1 – Look for exact matches on id and date
match_i in df2), add df2$compliance[match_i]$ to df 1 (df$compliance[i]])Run loop 1
for(i in 1:nrow(df1)) {
# Get id of current row
id.i <- df1$id[i]
# Get date of current row
date.i <- df1$date[i]
# Look for exact match in df2 (and where there hasn't been a match)
df2$match <- with(df2, id == id.i & date == date.i & compliance_matched == 0)
# If there are any matches...
if(any(df2$match)) {
# Stop if there are more than 1 matches (should never happen)
if(sum(any(df2$match)) > 1) {
stop("More than 1 identical match!")}
# Get the row number (match_i) of the match in df2
match_i <- which(df2$match)
# Get compliance
compliance.i <- df2$compliance[match_i]
# Add compliance and date_diff data to df1
df1$compliance[i] <- compliance.i
df1$compliance_datediff[i] <- 0 # 0 because there is a perfect match
# Indicate that both have been matched
df1$compliance_matched[i] <- 1
df2$compliance_matched[match_i] <- 1
}
}
# remove df2$match from df2
df2 <- df2 %>% select(-match)
Here is the result after running loop 1
knitr::kable(df1)
| id | date | water | compliance | compliance_matched | compliance_datediff |
|---|---|---|---|---|---|
| 1 | 1 | 0.54 | 0.61 | 1 | 0 |
| 1 | 2 | 0.48 | NA | 0 | NA |
| 1 | 4 | 0.54 | 0.37 | 1 | 0 |
| 1 | 5 | 0.42 | 0.37 | 1 | 0 |
| 2 | 2 | 0.47 | 0.69 | 1 | 0 |
| 2 | 3 | 0.33 | NA | 0 | NA |
| 2 | 6 | 0.38 | NA | 0 | NA |
| 3 | 2 | 0.51 | 0.57 | 1 | 0 |
| 3 | 4 | 0.52 | NA | 0 | NA |
| 3 | 6 | 0.59 | NA | 0 | NA |
| 3 | 7 | 0.61 | NA | 0 | NA |
| 3 | 8 | 0.50 | 0.59 | 1 | 0 |
| 4 | 2 | 0.48 | 0.39 | 1 | 0 |
| 4 | 6 | 0.58 | NA | 0 | NA |
| 4 | 7 | 0.48 | 0.61 | 1 | 0 |
Loop 2 – Look for exact matches on id and ‘close’ matches on date
match_i in df 2 (df2$compliance[match_i]) to df1$compliance[i]. Additionally, record the specific date difference between rows i and j.# Loop over rows in df1
for(i in 1:nrow(df1)) {
# Get id of current row
id.i <- df1$id[i]
# Get date of current row
date.i <- df1$date[i]
# Only do the following if the row has not already been matched
if(df1$compliance_matched[i] == 0) {
# Add difference between date.i and date in df2
df2 <- df2 %>% mutate(date_dev = date - date.i)
# Look for fuzzy match in df 2
# id is the same, but abs(date_dev) must be less than 7
df2$match <- with(df2, id == id.i & abs(date_dev) <= 7 & compliance_matched == 0)
# If there are any matches...
if(any(df2$match)) {
# Which row has a match AND has the lowest date deviation of all matches
match_i <- which(with(df2, match & abs(date_dev) == min(abs(date_dev[match]))))
# In case there are 2 matches, take the first
match_i <- match_i[1]
# Get compliance
compliance.i <- df2$compliance[match_i]
# Get date difference for match
datediff.i <- abs(df2$date_dev[match_i])
# Add compliance and date difference data to df1
df1$compliance[i] <- compliance.i
df1$compliance_datediff[i] <- abs(df2$date_dev[match_i])
# Indicate that both have been matched
df1$compliance_matched[i] <- 1
df2$compliance_matched[match_i] <- 1
}
}
}
# remove compliance matched from df1
df1 <- select(df1, -compliance_matched)
Here is the final result!
knitr::kable(df1)
| id | date | water | compliance | compliance_datediff |
|---|---|---|---|---|
| 1 | 1 | 0.54 | 0.61 | 0 |
| 1 | 2 | 0.48 | 0.35 | 6 |
| 1 | 4 | 0.54 | 0.37 | 0 |
| 1 | 5 | 0.42 | 0.37 | 0 |
| 2 | 2 | 0.47 | 0.69 | 0 |
| 2 | 3 | 0.33 | 0.43 | 2 |
| 2 | 6 | 0.38 | 0.47 | 1 |
| 3 | 2 | 0.51 | 0.57 | 0 |
| 3 | 4 | 0.52 | 0.58 | 1 |
| 3 | 6 | 0.59 | 0.62 | 1 |
| 3 | 7 | 0.61 | 0.54 | 6 |
| 3 | 8 | 0.50 | 0.59 | 0 |
| 4 | 2 | 0.48 | 0.39 | 0 |
| 4 | 6 | 0.58 | 0.42 | 3 |
| 4 | 7 | 0.48 | 0.61 | 0 |