1 Reconciliation Adjustment

We first correct the definition of reconciliation. It is now based on the sums rather than cumsum defined in Reconciliation Analysis I [1].

PDF <- PDF |> group_by(user_id, project_id)

PDF$is_reconcilable <- NULL

sums <- PDF |>
  summarise(sum_total_redemption_amount = sum(total_redemption_amount),
            sum_credit_given_in_usd = sum(credit_given_in_usd),
            .groups = 'drop')

sums$is_reconcilable <- 
  ifelse(sums$sum_total_redemption_amount <= sums$sum_credit_given_in_usd, 1, 0)

PDF <- PDF |> left_join(sums, by = c("user_id", "project_id")) |>
  select(-sum_total_redemption_amount, -sum_credit_given_in_usd)

PDF <- PDF[, c(1:13, 76, 14:75)]

2 Time-Series Cross-Sectional (TSCS) Data Transformation

Two important timing indexes, biweek_i and biweek_i_indiv, are created here for our TSCS data visualization.

PDF$created_at <- as.Date(PDF$created_at)

start_date <- as.Date(PDF$created_at) |> min()

PDF$biweek_i <- as.numeric(PDF$created_at - start_date) %/% 14 + 1

PDF$user_project_pair <- paste(PDF$user_id, PDF$project_id, sep = "-")

PDF <- PDF[, c(1:2, 78, 3, 77, 4:76)]

PDF <- arrange(PDF, user_project_pair, created_at)

PDF <- PDF |>
  group_by(user_project_pair) |> 
  mutate(biweek_i_indiv = 
           as.integer(as.numeric(difftime(created_at, min(created_at), units = "weeks")) %/% 2 + 1))

PDF <- PDF[, c(1:5, 79, 6:78)]

3 Panel Transformation of Time Aggregation

In this section, PDF_matrix is extracted from PDF first, and then we perform equivalence merging to combine any covariates that happen within the same timing index of biweek_i. A function binary_max is created here for combining binary variables correctly.

PDF_matrix <- PDF[, c(3, 5:6, 12, 14, 13, 16, 17:19, 22:23, 24:61, 76, 79)]

setDT(PDF_matrix)

binary_max <- function(x) {
  
  if(sum(x) > 1) {
    
    return(1.0)  
    
  } 
  
  else {
    
    return(as.double(sum(x)))
    
  }
  
}

cols_to_sum <- names(PDF_matrix)[c(4:7, 10)]

binary_cols <- names(PDF_matrix)[c(8:9, 11:50)]

PDF_numeric <- 
  PDF_matrix[, lapply(.SD, sum), by = .(user_project_pair, biweek_i, account_created_at, user_app_platform), .SDcols = cols_to_sum]

PDF_binary <- 
  PDF_matrix[, lapply(.SD, binary_max), by = .(user_project_pair, biweek_i, account_created_at, user_app_platform), .SDcols = binary_cols]

PDF_biweek <- full_join(PDF_numeric, PDF_binary, by = c("user_project_pair", "biweek_i", "account_created_at", "user_app_platform"))

PDF_biweek <- PDF_biweek[, c(1:2, 5:51, 3:4)]

4 Naturalistic Time-Series Plot

The following sections are based upon Mou, Liu, Xu (2023)’s Panel Data Visualization in R (panelView) and Stata (panelview) [2]. We will examine their approaches on our inKind data with some adjustments.

The first time-series plot we would like to visualize is based on naturalistic timing index, biweek_i.

setDT(PDF_biweek)

all_combinations <- 
  CJ(user_project_pair = unique(PDF_biweek$user_project_pair), biweek_i = 1:55)

imputed_PDF_biweek <- 
  merge(all_combinations, PDF_biweek, by = c("user_project_pair", "biweek_i"), all.x = TRUE)

cols_to_fill <- names(imputed_PDF_biweek)[3:51]

imputed_PDF_biweek[, 
                   (cols_to_fill) := lapply(.SD, function(x) ifelse(is.na(x), NA, x)), 
                   .SDcols = cols_to_fill]

imputed_PDF_biweek[, observed := ifelse(rowSums(is.na(.SD)) > 0, 0, 1), .SDcols = cols_to_fill]

replace_func <- function(x) {
  
    first_one_index <- which(x == 1)[1]
    
    if (!is.na(first_one_index)) {
      
        x[1:(first_one_index-1)][x[1:(first_one_index-1)] == 0] <- -1
        
    }
    
    return(x)
    
}

imputed_PDF_biweek$observed <- 
  ave(imputed_PDF_biweek$observed, 
      imputed_PDF_biweek$user_project_pair, 
      FUN = replace_func)

5 Self-Pacing Time-Series Plot

The second time-series plot we would like to visualize is based on each customer\(\times\)project’s self-pacing timing index, biweek_i_indiv.

imputed_PDF_biweek <- imputed_PDF_biweek[order(imputed_PDF_biweek$user_project_pair, imputed_PDF_biweek$biweek_i),]

first_observed <- aggregate(biweek_i ~ user_project_pair, imputed_PDF_biweek[imputed_PDF_biweek$observed == 1,], min)

names(first_observed)[names(first_observed) == "biweek_i"] <- "first_observed_biweek_i"

imputed_PDF_biweek <- merge(imputed_PDF_biweek, first_observed, by = "user_project_pair", all.x = TRUE)

imputed_PDF_biweek$biweek_i_indiv <- imputed_PDF_biweek$biweek_i - imputed_PDF_biweek$first_observed_biweek_i

imputed_PDF_biweek <- imputed_PDF_biweek[, c(1, 54, 2, 3:53)]

PDF_biweek_indiv <- filter(imputed_PDF_biweek, biweek_i_indiv >= 0)

setDT(PDF_biweek_indiv)

all_combinations_indiv <- 
  CJ(user_project_pair = unique(PDF_biweek_indiv$user_project_pair), biweek_i_indiv = 0:54)

imputed_PDF_biweek_indiv <- 
  merge(all_combinations_indiv, PDF_biweek_indiv, by = c("user_project_pair", "biweek_i_indiv"), all.x = TRUE)

imputed_PDF_biweek_indiv$observed[is.na(imputed_PDF_biweek_indiv$observed)] <- -1

6 Complete Self-Pacing Time-Series Plot

The third time-series plot we would like to visualize is based on each customer\(\times\)project’s self-pacing timing index, biweek_i_indiv, and it is reflecting the complete history of each customer\(\times\)project.

PDF_biweek_T <- imputed_PDF_biweek[, c(1:3, 53)]

PDF_biweek_T$observed[PDF_biweek_T$observed == -1] <- 0

setDT(PDF_biweek_T)

all_combinations_T <- 
  CJ(user_project_pair = unique(PDF_biweek_T$user_project_pair), biweek_i_indiv = -55:54)

imputed_PDF_biweek_T <-
  merge(all_combinations_T, PDF_biweek_T, by = c("user_project_pair", "biweek_i_indiv"), all.x = TRUE)

imputed_PDF_biweek_T$observed[is.na(imputed_PDF_biweek_T$observed)] <- -1

7 Causal Panel Analysis - Visualization

Notations:

1: obs_TR (observed data with transaction/redemption recorded)
0: obs_nTR (observed data with transaction/redemption NOT recorded)
-1: mis (missing data, mainly for counterfactual inference if it hasn’t happened yet)

7.1 Naturalistical Time-Series Plot - Visualization

panelview(D = "observed", 
          data = imputed_PDF_biweek, 
          index = c("user_project_pair", "biweek_i"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Biweek Index", ylab = "User Project Pair", gridOff = TRUE,
          axis.lab.gap = c(2,10), main = "Naturalistic Transaction & Redemption History")

## 3 treatment levels.

7.2 Naturalistic Time-Series Plot [Cleaned] - Treatment Visualization

Notice that the sub-section above has shown that some customer\(\times\)projects are completely missing (shown by 0). Therefore, we aim to discover them and temporarily remove them for the analyese below. The removed version is ended with _clean for each panel data frame.

pair_nTR <- 
  imputed_PDF_biweek[, c(1:2, 53)] |>
  group_by(user_project_pair) |>
  summarise(all_zeros = all(observed == 0)) |>
  filter(all_zeros) |>
  pull(user_project_pair)

length(pair_nTR)

## [1] 10658

imputed_PDF_biweek_clean <- imputed_PDF_biweek[!imputed_PDF_biweek$user_project_pair %in% pair_nTR, ]

panelview(D = "observed", 
          data = imputed_PDF_biweek_clean, 
          index = c("user_project_pair", "biweek_i"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Biweek Index", ylab = "User Project Pair", gridOff = TRUE,
          axis.lab.gap = c(2,10), main = "Naturalistic Transaction & Redemption History")

## 3 treatment levels.

7.3 Self-Pacing Time-Series Plot [Cleaned] - Treatment Visualization

imputed_PDF_biweek_indiv_clean <- 
  imputed_PDF_biweek_indiv[!imputed_PDF_biweek_indiv$user_project_pair %in% pair_nTR, ]

panelview(D = "observed", 
          data = imputed_PDF_biweek_indiv_clean, 
          index = c("user_project_pair", "biweek_i_indiv"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Biweek Index", ylab = "User Project Pair",
          axis.lab.gap = c(2,10), main = "Self-Pacing Transaction & Redemption History")

## If the number of units is more than 300, we set "gridOff = TRUE".

## 3 treatment levels.

7.4 Self-Pacing Time-Series Plot [Cleaned] - Binary Treatment Visualization

As one may realize, the above self-pacing times-series plot can be further cast into 2 binary states - control (observed data) and treatment (missing counterfactual).

imputed_PDF_biweek_indiv_binary_clean <- imputed_PDF_biweek_indiv_clean[, c(1:3, 53)]

imputed_PDF_biweek_indiv_binary_clean$observed[imputed_PDF_biweek_indiv_binary_clean$observed != -1] <- 0

imputed_PDF_biweek_indiv_binary_clean$observed[imputed_PDF_biweek_indiv_binary_clean$observed == -1] <- 1

panelview(D = "observed", 
          data = imputed_PDF_biweek_indiv_binary_clean, 
          index = c("user_project_pair", "biweek_i_indiv"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Biweek Index", ylab = "User Project Pair",
          axis.lab.gap = c(2,10))

## If the number of units is more than 300, we set "gridOff = TRUE".

7.5 Complete Self-Pacing Time-Series Plot [Cleaned] - Treatment Visualization

imputed_PDF_biweek_T_clean <- 
  imputed_PDF_biweek_T[!imputed_PDF_biweek_T$user_project_pair %in% pair_nTR, ]

panelview(D = "observed", 
          data = imputed_PDF_biweek_T_clean, 
          index = c("user_project_pair", "biweek_i_indiv"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Biweek Index", ylab = "User Project Pair",
          axis.lab.gap = c(2,10), main = "Complete Self-Pacing Naturalistic Transaction & Redemption History")

## If the number of units is more than 300, we set "gridOff = TRUE".

## 3 treatment levels.

8 References

[1] Z. Jiang, “RPubs - Reconciliation Analysis I - Modeling and Extracting Emblematic Non-reconcilable Dynamics,” rpubs.com, Jul. 13, 2023. https://rpubs.com/jiangzm/1063534 (accessed Aug 5, 2023).

[2] H. Mou, L. Liu, and Y. Xu, “panelView: Panel Data Visualization in R and Stata,” SSRN Electronic Journal, 2022, doi: https://doi.org/10.2139/ssrn.4202154.

Statistical Modeling II - Visualization of Causal Panel Analysis on inKind Data

Zhongming Jiang

2023-08-04