We first correct the definition of reconciliation.
It is now based on the sums rather than cumsum
defined in Reconciliation Analysis I [1].
PDF <- PDF |> group_by(user_id, project_id)
PDF$is_reconcilable <- NULL
sums <- PDF |>
summarise(sum_total_redemption_amount = sum(total_redemption_amount),
sum_credit_given_in_usd = sum(credit_given_in_usd),
.groups = 'drop')
sums$is_reconcilable <-
ifelse(sums$sum_total_redemption_amount <= sums$sum_credit_given_in_usd, 1, 0)
PDF <- PDF |> left_join(sums, by = c("user_id", "project_id")) |>
select(-sum_total_redemption_amount, -sum_credit_given_in_usd)
PDF <- PDF[, c(1:13, 76, 14:75)]
Two important timing indexes, biweek_i and
biweek_i_indiv, are created here for our TSCS data
visualization.
PDF$created_at <- as.Date(PDF$created_at)
start_date <- as.Date(PDF$created_at) |> min()
PDF$biweek_i <- as.numeric(PDF$created_at - start_date) %/% 14 + 1
PDF$user_project_pair <- paste(PDF$user_id, PDF$project_id, sep = "-")
PDF <- PDF[, c(1:2, 78, 3, 77, 4:76)]
PDF <- arrange(PDF, user_project_pair, created_at)
PDF <- PDF |>
group_by(user_project_pair) |>
mutate(biweek_i_indiv =
as.integer(as.numeric(difftime(created_at, min(created_at), units = "weeks")) %/% 2 + 1))
PDF <- PDF[, c(1:5, 79, 6:78)]
In this section, PDF_matrix is extracted from
PDF first, and then we perform equivalence
merging to combine any covariates that happen within the same
timing index of biweek_i. A function
binary_max is created here for combining binary variables
correctly.
PDF_matrix <- PDF[, c(3, 5:6, 12, 14, 13, 16, 17:19, 22:23, 24:61, 76, 79)]
setDT(PDF_matrix)
binary_max <- function(x) {
if(sum(x) > 1) {
return(1.0)
}
else {
return(as.double(sum(x)))
}
}
cols_to_sum <- names(PDF_matrix)[c(4:7, 10)]
binary_cols <- names(PDF_matrix)[c(8:9, 11:50)]
PDF_numeric <-
PDF_matrix[, lapply(.SD, sum), by = .(user_project_pair, biweek_i, account_created_at, user_app_platform), .SDcols = cols_to_sum]
PDF_binary <-
PDF_matrix[, lapply(.SD, binary_max), by = .(user_project_pair, biweek_i, account_created_at, user_app_platform), .SDcols = binary_cols]
PDF_biweek <- full_join(PDF_numeric, PDF_binary, by = c("user_project_pair", "biweek_i", "account_created_at", "user_app_platform"))
PDF_biweek <- PDF_biweek[, c(1:2, 5:51, 3:4)]
The following sections are based upon Mou, Liu, Xu (2023)’s Panel Data Visualization in R (panelView) and Stata (panelview) [2]. We will examine their approaches on our inKind data with some adjustments.
The first time-series plot we would like to visualize is based on
naturalistic timing index, biweek_i.
setDT(PDF_biweek)
all_combinations <-
CJ(user_project_pair = unique(PDF_biweek$user_project_pair), biweek_i = 1:55)
imputed_PDF_biweek <-
merge(all_combinations, PDF_biweek, by = c("user_project_pair", "biweek_i"), all.x = TRUE)
cols_to_fill <- names(imputed_PDF_biweek)[3:51]
imputed_PDF_biweek[,
(cols_to_fill) := lapply(.SD, function(x) ifelse(is.na(x), NA, x)),
.SDcols = cols_to_fill]
imputed_PDF_biweek[, observed := ifelse(rowSums(is.na(.SD)) > 0, 0, 1), .SDcols = cols_to_fill]
replace_func <- function(x) {
first_one_index <- which(x == 1)[1]
if (!is.na(first_one_index)) {
x[1:(first_one_index-1)][x[1:(first_one_index-1)] == 0] <- -1
}
return(x)
}
imputed_PDF_biweek$observed <-
ave(imputed_PDF_biweek$observed,
imputed_PDF_biweek$user_project_pair,
FUN = replace_func)
The second time-series plot we would like to visualize is based on
each customer\(\times\)project’s
self-pacing timing index, biweek_i_indiv.
imputed_PDF_biweek <- imputed_PDF_biweek[order(imputed_PDF_biweek$user_project_pair, imputed_PDF_biweek$biweek_i),]
first_observed <- aggregate(biweek_i ~ user_project_pair, imputed_PDF_biweek[imputed_PDF_biweek$observed == 1,], min)
names(first_observed)[names(first_observed) == "biweek_i"] <- "first_observed_biweek_i"
imputed_PDF_biweek <- merge(imputed_PDF_biweek, first_observed, by = "user_project_pair", all.x = TRUE)
imputed_PDF_biweek$biweek_i_indiv <- imputed_PDF_biweek$biweek_i - imputed_PDF_biweek$first_observed_biweek_i
imputed_PDF_biweek <- imputed_PDF_biweek[, c(1, 54, 2, 3:53)]
PDF_biweek_indiv <- filter(imputed_PDF_biweek, biweek_i_indiv >= 0)
setDT(PDF_biweek_indiv)
all_combinations_indiv <-
CJ(user_project_pair = unique(PDF_biweek_indiv$user_project_pair), biweek_i_indiv = 0:54)
imputed_PDF_biweek_indiv <-
merge(all_combinations_indiv, PDF_biweek_indiv, by = c("user_project_pair", "biweek_i_indiv"), all.x = TRUE)
imputed_PDF_biweek_indiv$observed[is.na(imputed_PDF_biweek_indiv$observed)] <- -1
The third time-series plot we would like to visualize is based on
each customer\(\times\)project’s
self-pacing timing index, biweek_i_indiv, and it is
reflecting the complete history of each customer\(\times\)project.
PDF_biweek_T <- imputed_PDF_biweek[, c(1:3, 53)]
PDF_biweek_T$observed[PDF_biweek_T$observed == -1] <- 0
setDT(PDF_biweek_T)
all_combinations_T <-
CJ(user_project_pair = unique(PDF_biweek_T$user_project_pair), biweek_i_indiv = -55:54)
imputed_PDF_biweek_T <-
merge(all_combinations_T, PDF_biweek_T, by = c("user_project_pair", "biweek_i_indiv"), all.x = TRUE)
imputed_PDF_biweek_T$observed[is.na(imputed_PDF_biweek_T$observed)] <- -1
Notations:
1: obs_TR (observed data with
transaction/redemption recorded)0: obs_nTR (observed data with
transaction/redemption NOT recorded)-1: mis (missing data, mainly for
counterfactual inference if it hasn’t happened yet)panelview(D = "observed",
data = imputed_PDF_biweek,
index = c("user_project_pair", "biweek_i"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Biweek Index", ylab = "User Project Pair", gridOff = TRUE,
axis.lab.gap = c(2,10), main = "Naturalistic Transaction & Redemption History")
## 3 treatment levels.
Notice that the sub-section above has shown that some customer\(\times\)projects are completely
missing (shown by 0). Therefore, we aim to
discover them and temporarily remove them for the analyese below. The
removed version is ended with _clean for each panel data
frame.
pair_nTR <-
imputed_PDF_biweek[, c(1:2, 53)] |>
group_by(user_project_pair) |>
summarise(all_zeros = all(observed == 0)) |>
filter(all_zeros) |>
pull(user_project_pair)
length(pair_nTR)
## [1] 10658
imputed_PDF_biweek_clean <- imputed_PDF_biweek[!imputed_PDF_biweek$user_project_pair %in% pair_nTR, ]
panelview(D = "observed",
data = imputed_PDF_biweek_clean,
index = c("user_project_pair", "biweek_i"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Biweek Index", ylab = "User Project Pair", gridOff = TRUE,
axis.lab.gap = c(2,10), main = "Naturalistic Transaction & Redemption History")
## 3 treatment levels.
imputed_PDF_biweek_indiv_clean <-
imputed_PDF_biweek_indiv[!imputed_PDF_biweek_indiv$user_project_pair %in% pair_nTR, ]
panelview(D = "observed",
data = imputed_PDF_biweek_indiv_clean,
index = c("user_project_pair", "biweek_i_indiv"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Biweek Index", ylab = "User Project Pair",
axis.lab.gap = c(2,10), main = "Self-Pacing Transaction & Redemption History")
## If the number of units is more than 300, we set "gridOff = TRUE".
## 3 treatment levels.
As one may realize, the above self-pacing times-series plot can be
further cast into 2 binary states - control (observed data)
and treatment (missing counterfactual).
imputed_PDF_biweek_indiv_binary_clean <- imputed_PDF_biweek_indiv_clean[, c(1:3, 53)]
imputed_PDF_biweek_indiv_binary_clean$observed[imputed_PDF_biweek_indiv_binary_clean$observed != -1] <- 0
imputed_PDF_biweek_indiv_binary_clean$observed[imputed_PDF_biweek_indiv_binary_clean$observed == -1] <- 1
panelview(D = "observed",
data = imputed_PDF_biweek_indiv_binary_clean,
index = c("user_project_pair", "biweek_i_indiv"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Biweek Index", ylab = "User Project Pair",
axis.lab.gap = c(2,10))
## If the number of units is more than 300, we set "gridOff = TRUE".
imputed_PDF_biweek_T_clean <-
imputed_PDF_biweek_T[!imputed_PDF_biweek_T$user_project_pair %in% pair_nTR, ]
panelview(D = "observed",
data = imputed_PDF_biweek_T_clean,
index = c("user_project_pair", "biweek_i_indiv"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Biweek Index", ylab = "User Project Pair",
axis.lab.gap = c(2,10), main = "Complete Self-Pacing Naturalistic Transaction & Redemption History")
## If the number of units is more than 300, we set "gridOff = TRUE".
## 3 treatment levels.
[1] Z. Jiang, “RPubs - Reconciliation Analysis I - Modeling and Extracting Emblematic Non-reconcilable Dynamics,” rpubs.com, Jul. 13, 2023. https://rpubs.com/jiangzm/1063534 (accessed Aug 5, 2023).
[2] H. Mou, L. Liu, and Y. Xu, “panelView: Panel Data Visualization in R and Stata,” SSRN Electronic Journal, 2022, doi: https://doi.org/10.2139/ssrn.4202154.