knitr::opts_chunk$set(echo = TRUE)

1 Package Invoking

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
library(panelView)
## ## See bit.ly/panelview4r for more info.
## ## Report bugs -> yiqingxu@stanford.edu.
library(bpCausal)

2 Data Loading

PDF_JA <- read.csv("/Users/apple/Quantitative\ Marketing\ Research/Statistical\ Modeling\ V/Statistical\ Modeling\ V\ Data/PDF_JA.csv") 

3 Multiple Outcome Time-Series Cross-Sectional (TSCS) Data Maniputation

TSCS <- 
  PDF_JA[, c(1:3, 8:9, 14, 16, 4:5, 12, 17, 6:7, 13, 18, 10:11, 15, 19:67)]

TSCS$D <- 1 - TSCS$D

TSCS_Y <- TSCS[, -c(4, 8, 12, 16, 19:67)]

colnames(TSCS_Y)[c(1:14)] <- 
  c("id", "T", "D",
    "F_trans", "M_trans", "R_trans", 
    "F_redem", "M_redem", "R_redem",
    "F_charg", "M_charg", "R_charg",
    "F_tip", "M_tip")

split_pairs <- strsplit(as.character(TSCS_Y$id), "-")

TSCS_Y$user <- sapply(split_pairs, "[[", 1)

TSCS_Y$proj <- sapply(split_pairs, "[[", 2)

TSCS_Y$user <- as.integer(TSCS_Y$user)

TSCS_Y$proj <- as.integer(TSCS_Y$proj)

unique_pairs <- unique(paste(TSCS_Y$user, TSCS_Y$proj, sep="-"))

pair_to_int <- setNames(seq_along(unique_pairs), unique_pairs)

TSCS_Y$id <- pair_to_int[paste(TSCS_Y$user, TSCS_Y$proj, sep="-")]

TSCS_Y$D <- as.integer(TSCS_Y$D)

TSCS_Y$user <- as.factor(TSCS_Y$user)

TSCS_Y$proj <- as.factor(TSCS_Y$proj)

4 Matrix Expansion

complete_set <- expand_grid(id = unique(TSCS_Y$id), T = 1:55)

TSCS_Y <- complete_set |>
  left_join(TSCS_Y, by = c("id", "T")) |>
  group_by(id) |>
  fill(user, proj, .direction = "downup")

TSCS_Y <- TSCS_Y |> ungroup()

5 Impute D

impute_D <- function(sub_df) {

  first_one_idx <- which(sub_df$D == 1)[1]
  
  if (!is.na(first_one_idx)) {
    
    sub_df[1:(first_one_idx - 1), "D"] <- 0
    
    sub_df[first_one_idx:nrow(sub_df), "D"] <- 1
    
  } 
  
  else {

    sub_df[, "D"] <- 0
    
  }
  
  return(sub_df)
  
}

TSCS_Y <- TSCS_Y |> group_by(id) |> group_modify(~impute_D(.x)) |> ungroup()

6 Impute Multiple Ys

6.1 Fix First Row

TSCS_Y <- TSCS_Y |> arrange(id) |> group_by(id) |>
  mutate(across(starts_with("R_"), ~ifelse(row_number() == 1 & is.na(.), 999, .))) |>
  mutate(across(-starts_with("R_"), ~ifelse(row_number() == 1 & is.na(.), 0, .))) |>
  ungroup()

6.2 Impute Frequency Metrics

TSCS_Y <- TSCS_Y |> arrange(id, T)

f_columns <- grep("^F_", names(TSCS_Y), value = TRUE)

impute_freq <- function(group_df) {
  
  for (col in f_columns) {
    
    group_df[[col]] <- ifelse(is.na(group_df[[col]]), 0, group_df[[col]])
    
    group_df[[col]] <- cumsum(group_df[[col]])
    
  }
  
  return(group_df)
  
}

TSCS_Y <- TSCS_Y |> group_by(id) |> 
  group_modify(~ impute_freq(.)) |> ungroup()

6.3 Impute Monetary Metrics

TSCS_Y <- as.data.table(TSCS_Y)

m_columns <- grep("^M_", names(TSCS_Y), value = TRUE)

TSCS_Y[, (m_columns) := lapply(.SD, function(x) {
                         x <- nafill(x, type = "locf")
                         ifelse(is.na(x), 0, x)
                       }), by = id, .SDcols = m_columns]

6.4 Impute Recency Metrics

r_columns <- grep("^R_", names(TSCS_Y), value = TRUE)

TSCS_Y[, (r_columns) := lapply(.SD, function(x) {
                         x <- nafill(x, type = "locf")
                         ifelse(is.na(x), 999, x)
                       }), by = id, .SDcols = r_columns]
TSCS_Y$D <- as.integer(TSCS_Y$D)

7 Panel Data Overview

7.1 Panel View of Top 100 Pair IDs

TSCS_Y_top100 <- TSCS_Y[1:5500, ]

panelview(D = "D", 
          data = TSCS_Y_top100, 
          index = c("id", "T"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Time (Biweek)", ylab = "Pair ID",
          axis.lab.gap = c(2,10), main = "Treatment Status (Top 100 Pair IDs)")

7.2 Panel View of All Pair IDs

panelview(D = "D", 
          data = TSCS_Y, 
          index = c("id", "T"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Time (Biweek)", ylab = "Pair ID",
          axis.lab.gap = c(2,10), main = "Treatment Status")
## If the number of units is more than 300, we set "gridOff = TRUE".

7.3 Panel View of All Pair IDs Excluding One-And-Gone Customers

TSCS_Y_D0 <- TSCS_Y[D == 0]

D0_id_count <- TSCS_Y_D0[, .(count = .N), by = id]

subset_ids <- D0_id_count[count == 1, .(id)]

TSCS_Y_length1 <- TSCS_Y[id %in% subset_ids$id]

TSCS_Y_filter <- TSCS_Y[!(id %in% subset_ids$id)]
panelview(D = "D", 
          data = TSCS_Y_filter, 
          index = c("id", "T"), 
          display.all = TRUE, type = "treat", by.timing = TRUE,
          xlab = "Time (Biweek)", ylab = "Pair ID",
          axis.lab.gap = c(2,10), main = "Treatment Status (Exlcuding One-And-Gone Customers)")
## If the number of units is more than 300, we set "gridOff = TRUE".

7.4 Large N, Small T, Multiple P

N <- as.integer(length(TSCS_Y_filter$id)/55)

TT <- as.integer(length(unique(TSCS_Y_filter$T)))

P <- as.integer(length(colnames(TSCS_Y_filter)) - 5)

data.frame(N, TT, P) 
##       N TT  P
## 1 22347 55 11