knitr::opts_chunk$set(echo = TRUE)
Package Invoking
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(panelView)
## ## See bit.ly/panelview4r for more info.
## ## Report bugs -> yiqingxu@stanford.edu.
library(bpCausal)
Data Loading
PDF_JA <- read.csv("/Users/apple/Quantitative\ Marketing\ Research/Statistical\ Modeling\ V/Statistical\ Modeling\ V\ Data/PDF_JA.csv")
Multiple Outcome
Time-Series Cross-Sectional (TSCS) Data Maniputation
TSCS <-
PDF_JA[, c(1:3, 8:9, 14, 16, 4:5, 12, 17, 6:7, 13, 18, 10:11, 15, 19:67)]
TSCS$D <- 1 - TSCS$D
TSCS_Y <- TSCS[, -c(4, 8, 12, 16, 19:67)]
colnames(TSCS_Y)[c(1:14)] <-
c("id", "T", "D",
"F_trans", "M_trans", "R_trans",
"F_redem", "M_redem", "R_redem",
"F_charg", "M_charg", "R_charg",
"F_tip", "M_tip")
split_pairs <- strsplit(as.character(TSCS_Y$id), "-")
TSCS_Y$user <- sapply(split_pairs, "[[", 1)
TSCS_Y$proj <- sapply(split_pairs, "[[", 2)
TSCS_Y$user <- as.integer(TSCS_Y$user)
TSCS_Y$proj <- as.integer(TSCS_Y$proj)
unique_pairs <- unique(paste(TSCS_Y$user, TSCS_Y$proj, sep="-"))
pair_to_int <- setNames(seq_along(unique_pairs), unique_pairs)
TSCS_Y$id <- pair_to_int[paste(TSCS_Y$user, TSCS_Y$proj, sep="-")]
TSCS_Y$D <- as.integer(TSCS_Y$D)
TSCS_Y$user <- as.factor(TSCS_Y$user)
TSCS_Y$proj <- as.factor(TSCS_Y$proj)
Matrix Expansion
complete_set <- expand_grid(id = unique(TSCS_Y$id), T = 1:55)
TSCS_Y <- complete_set |>
left_join(TSCS_Y, by = c("id", "T")) |>
group_by(id) |>
fill(user, proj, .direction = "downup")
TSCS_Y <- TSCS_Y |> ungroup()
Impute
D
impute_D <- function(sub_df) {
first_one_idx <- which(sub_df$D == 1)[1]
if (!is.na(first_one_idx)) {
sub_df[1:(first_one_idx - 1), "D"] <- 0
sub_df[first_one_idx:nrow(sub_df), "D"] <- 1
}
else {
sub_df[, "D"] <- 0
}
return(sub_df)
}
TSCS_Y <- TSCS_Y |> group_by(id) |> group_modify(~impute_D(.x)) |> ungroup()
Impute Multiple
Ys
Fix First Row
TSCS_Y <- TSCS_Y |> arrange(id) |> group_by(id) |>
mutate(across(starts_with("R_"), ~ifelse(row_number() == 1 & is.na(.), 999, .))) |>
mutate(across(-starts_with("R_"), ~ifelse(row_number() == 1 & is.na(.), 0, .))) |>
ungroup()
Impute Frequency
Metrics
TSCS_Y <- TSCS_Y |> arrange(id, T)
f_columns <- grep("^F_", names(TSCS_Y), value = TRUE)
impute_freq <- function(group_df) {
for (col in f_columns) {
group_df[[col]] <- ifelse(is.na(group_df[[col]]), 0, group_df[[col]])
group_df[[col]] <- cumsum(group_df[[col]])
}
return(group_df)
}
TSCS_Y <- TSCS_Y |> group_by(id) |>
group_modify(~ impute_freq(.)) |> ungroup()
Impute Monetary
Metrics
TSCS_Y <- as.data.table(TSCS_Y)
m_columns <- grep("^M_", names(TSCS_Y), value = TRUE)
TSCS_Y[, (m_columns) := lapply(.SD, function(x) {
x <- nafill(x, type = "locf")
ifelse(is.na(x), 0, x)
}), by = id, .SDcols = m_columns]
Impute Recency
Metrics
r_columns <- grep("^R_", names(TSCS_Y), value = TRUE)
TSCS_Y[, (r_columns) := lapply(.SD, function(x) {
x <- nafill(x, type = "locf")
ifelse(is.na(x), 999, x)
}), by = id, .SDcols = r_columns]
TSCS_Y$D <- as.integer(TSCS_Y$D)
Panel Data
Overview
Panel View of Top 100
Pair IDs
TSCS_Y_top100 <- TSCS_Y[1:5500, ]
panelview(D = "D",
data = TSCS_Y_top100,
index = c("id", "T"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Time (Biweek)", ylab = "Pair ID",
axis.lab.gap = c(2,10), main = "Treatment Status (Top 100 Pair IDs)")

Panel View of All
Pair IDs
panelview(D = "D",
data = TSCS_Y,
index = c("id", "T"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Time (Biweek)", ylab = "Pair ID",
axis.lab.gap = c(2,10), main = "Treatment Status")
## If the number of units is more than 300, we set "gridOff = TRUE".

Panel View of All
Pair IDs Excluding One-And-Gone Customers
TSCS_Y_D0 <- TSCS_Y[D == 0]
D0_id_count <- TSCS_Y_D0[, .(count = .N), by = id]
subset_ids <- D0_id_count[count == 1, .(id)]
TSCS_Y_length1 <- TSCS_Y[id %in% subset_ids$id]
TSCS_Y_filter <- TSCS_Y[!(id %in% subset_ids$id)]
panelview(D = "D",
data = TSCS_Y_filter,
index = c("id", "T"),
display.all = TRUE, type = "treat", by.timing = TRUE,
xlab = "Time (Biweek)", ylab = "Pair ID",
axis.lab.gap = c(2,10), main = "Treatment Status (Exlcuding One-And-Gone Customers)")
## If the number of units is more than 300, we set "gridOff = TRUE".

Large N,
Small T, Multiple P
N <- as.integer(length(TSCS_Y_filter$id)/55)
TT <- as.integer(length(unique(TSCS_Y_filter$T)))
P <- as.integer(length(colnames(TSCS_Y_filter)) - 5)
data.frame(N, TT, P)
## N TT P
## 1 22347 55 11