Serena Consulting

# Load libraries

library(tidyverse)

First we’ll create some random data

# Create df1
df1 <- data.frame(id = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4),
                  date = c(1, 2, 4, 5, 2, 3, 6, 2, 4, 6, 7, 8, 2, 6, 7),
                  water = round(rnorm(n = 15, mean = .5, sd = .1), 2))

# Create Df2
df2 <- data.frame(id = c(1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4),
                  date = c(1, 4, 5, 8, 1, 2, 7, 1, 2, 3, 5, 8, 2, 3, 7),
                  compliance = round(rnorm(n = 15, mean = .5, sd = .1), 2))

Here are the results:

knitr::kable(df1)

id	date	water
1	1	0.54
1	2	0.48
1	4	0.54
1	5	0.42
2	2	0.47
2	3	0.33
2	6	0.38
3	2	0.51
3	4	0.52
3	6	0.59
3	7	0.61
3	8	0.50
4	2	0.48
4	6	0.58
4	7	0.48

knitr::kable(df2)

id	date	compliance
1	1	0.61
1	4	0.37
1	5	0.37
1	8	0.35
2	1	0.43
2	2	0.69
2	7	0.47
3	1	0.54
3	2	0.57
3	3	0.58
3	5	0.62
3	8	0.59
4	2	0.39
4	3	0.42
4	7	0.61

Our goal is to add matching values of compliance in df2 to df1 using a ‘fuzzy’ matching rule.

First, we’ll add some dummy columns to df1 and df2.

# First, add compliance = NA column to df 1

df1 <- df1 %>%
  mutate(
    compliance = NA,
    compliance_matched = 0,
    compliance_datediff = NA
  )

# Now add matched = 0 column to df2 

df2 <- df2 %>%
  mutate(
    compliance_matched = 0
  )

Here’s what we’ll do

Loop 1 – Look for exact matches on id and date

Loop over all rows in df1
For each row i in df1, search for a row in df2 that is an exact match on both id and date.
- If an exact match is found (call it row match_i in df2), add df2$compliance[match_i]$ to df 1 (df$compliance[i]])
- Indicate that row i in df1 and row j in df2 have both been matched.

Run loop 1

for(i in 1:nrow(df1)) {
  
  # Get id of current row
  id.i <- df1$id[i]
  
  # Get date of current row
  date.i <- df1$date[i]
  
  # Look for exact match in df2 (and where there hasn't been a match)
  df2$match <- with(df2, id == id.i & date == date.i & compliance_matched == 0)
  
  # If there are any matches...
  if(any(df2$match)) {
    
    # Stop if there are more than 1 matches (should never happen)
    if(sum(any(df2$match)) > 1) {
      
      stop("More than 1 identical match!")}
    
    # Get the row number (match_i) of the match in df2
    match_i <- which(df2$match)
    
    # Get compliance
    compliance.i <- df2$compliance[match_i]
    
    # Add compliance and date_diff data to df1
    df1$compliance[i] <- compliance.i
    df1$compliance_datediff[i] <- 0 # 0 because there is a perfect match

    # Indicate that both have been matched
    
    df1$compliance_matched[i] <- 1
    df2$compliance_matched[match_i] <- 1

  }
}

# remove df2$match from df2
df2 <- df2 %>% select(-match)

Here is the result after running loop 1

knitr::kable(df1)

id	date	water	compliance	compliance_matched	compliance_datediff
1	1	0.54	0.61	1	0
1	2	0.48	NA	0	NA
1	4	0.54	0.37	1	0
1	5	0.42	0.37	1	0
2	2	0.47	0.69	1	0
2	3	0.33	NA	0	NA
2	6	0.38	NA	0	NA
3	2	0.51	0.57	1	0
3	4	0.52	NA	0	NA
3	6	0.59	NA	0	NA
3	7	0.61	NA	0	NA
3	8	0.50	0.59	1	0
4	2	0.48	0.39	1	0
4	6	0.58	NA	0	NA
4	7	0.48	0.61	1	0

Loop 2 – Look for exact matches on id and ‘close’ matches on date

Loop over all rows in df1.
For each row i in df 1 (df[i,]) that have not already been matched, look for a row in df2 that has the same id, and has a date within 7 days of the date in row i of df 1.
- If a match is found, copy the compliance data from row match_i in df 2 (df2$compliance[match_i]) to df1$compliance[i]. Additionally, record the specific date difference between rows i and j.

# Loop over rows in df1
for(i in 1:nrow(df1)) {
  
  # Get id of current row
  id.i <- df1$id[i]
  
  # Get date of current row
  date.i <- df1$date[i]
  
  # Only do the following if the row has not already been matched
  if(df1$compliance_matched[i] == 0) {
  
  # Add difference between date.i and date in df2
  df2 <- df2 %>% mutate(date_dev = date - date.i)
  
  # Look for fuzzy match in df 2
  #  id is the same, but abs(date_dev) must be less than 7
  df2$match <- with(df2, id == id.i & abs(date_dev) <= 7 & compliance_matched == 0)
  
  # If there are any matches...
  if(any(df2$match)) {
    
    # Which row has a match AND has the lowest date deviation of all matches
    match_i <- which(with(df2, match & abs(date_dev) == min(abs(date_dev[match]))))
    
    # In case there are 2 matches, take the first
    match_i <- match_i[1]
  
    # Get compliance
    compliance.i <- df2$compliance[match_i]
    
    # Get date difference for match
    datediff.i <- abs(df2$date_dev[match_i])
    
    # Add compliance and date difference data to df1
    df1$compliance[i] <- compliance.i
    df1$compliance_datediff[i] <- abs(df2$date_dev[match_i])
    
    # Indicate that both have been matched
    df1$compliance_matched[i] <- 1
    df2$compliance_matched[match_i] <- 1
    
  }
  }
}

# remove compliance matched from df1
df1 <- select(df1, -compliance_matched)

Here is the final result!

knitr::kable(df1)

id	date	water	compliance	compliance_datediff
1	1	0.54	0.61	0
1	2	0.48	0.35	6
1	4	0.54	0.37	0
1	5	0.42	0.37	0
2	2	0.47	0.69	0
2	3	0.33	0.43	2
2	6	0.38	0.47	1
3	2	0.51	0.57	0
3	4	0.52	0.58	1
3	6	0.59	0.62	1
3	7	0.61	0.54	6
3	8	0.50	0.59	0
4	2	0.48	0.39	0
4	6	0.58	0.42	3
4	7	0.48	0.61	0

Serena Consulting

Nathaniel Phillips

1/9/2018