library("purrr")
library("plm")
library("rjson")
library("DT")
library("data.table")
##
## Attaching package: 'data.table'
## The following object is masked from 'package:plm':
##
## between
## The following object is masked from 'package:purrr':
##
## transpose
library("tidyverse")
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ dplyr 1.1.2
## ✔ tibble 3.2.1 ✔ stringr 1.4.1
## ✔ tidyr 1.2.0 ✔ forcats 0.5.2
## ✔ readr 2.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between(), plm::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks plm::lag(), stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ dplyr::lead() masks plm::lead()
## ✖ data.table::transpose() masks purrr::transpose()
library("usmap")
library("ggplot2")
library("maptools")
## Loading required package: sp
## Checking rgeos availability: FALSE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
library("mapview")
library("devtools")
## Loading required package: usethis
library("vcd")
## Loading required package: grid
library("gridExtra")
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library("shiny")
##
## Attaching package: 'shiny'
##
## The following objects are masked from 'package:DT':
##
## dataTableOutput, renderDataTable
library("MASS")
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library("faraway")
library("rgdal")
## Please note that rgdal will be retired during 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## See https://r-spatial.org/r/2022/04/12/evolution.html and https://github.com/r-spatial/evolution
## rgdal: version: 1.6-6, (SVN revision 1201)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.6-0
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
library("tigris")
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library("sf")
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library("car")
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following objects are masked from 'package:faraway':
##
## logit, vif
##
## The following object is masked from 'package:maptools':
##
## pointLabel
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library("GGally")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
##
## The following object is masked from 'package:faraway':
##
## happy
library("patchwork")
##
## Attaching package: 'patchwork'
##
## The following object is masked from 'package:MASS':
##
## area
library("tibble")
library("corrplot")
## corrplot 0.92 loaded
library("Hmisc")
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:maptools':
##
## label
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library("ggcorrplot")
library("lubridate")
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("date")
library("fastDummies")
## Thank you for using fastDummies!
## To acknowledge our work, please cite the package:
## Kaplan, J. & Schlegel, B. (2023). fastDummies: Fast Creation of Dummy (Binary) Columns and Rows from Categorical Variables. Version 1.7.1. URL: https://github.com/jacobkap/fastDummies, https://jacobkap.github.io/fastDummies/.
library("plotly")
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:Hmisc':
##
## subplot
##
## The following object is masked from 'package:MASS':
##
## select
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library("scales")
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:readr':
##
## col_factor
##
## The following object is masked from 'package:purrr':
##
## discard
library("readr")
library("cowplot")
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
##
## The following object is masked from 'package:patchwork':
##
## align_plots
library("ggpmisc")
## Loading required package: ggpp
##
## Attaching package: 'ggpp'
##
## The following object is masked from 'package:ggplot2':
##
## annotate
library("scales")
library("broom")
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library("pscl")
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
PDF_daily_impute_char_cumulated <-
  read.csv("/Users/apple/Quantitative Marketing Research/Reconciliation Analysis I/Reconciliation Analysis I Data/PDF_daily_impute_char_cumulated.csv")
PDF_indiv_reconciliation_non <-
  read.csv("/Users/apple/Quantitative Marketing Research/Reconciliation Analysis II/Reconciliation Analysis II Data/PDF_indiv_reconciliation_non.csv")
PDF_indiv_reconciliation_non_dynamic <-
  read.csv("/Users/apple/Quantitative Marketing Research/Reconciliation Analysis I/Reconciliation Analysis I Data/PDF_indiv_reconciliation_non_dynamic.csv")
PDF_indiv_reconciliation <-
  read.csv("/Users/apple/Quantitative Marketing Research/Reconciliation Analysis I/Reconciliation Analysis I Data/PDF_indiv_reconciliation.csv")
count3to8 <-
  read.csv("/Users/apple/Quantitative Marketing Research/Reconciliation Analysis II/Reconciliation Analysis II Data/count3to8.csv")
new_discrep_df <-
  read.csv("/Users/apple/Quantitative Marketing Research/Reconciliation Analysis II/Reconciliation Analysis II Data/new_discrep_df.csv")
Continuing from last time, we notice that there are 284 customers in
total whose discrepancy_amount exceeds 1,000 USD
(specifically, 5 customers above 10,000 USD and 279 customers
between 1,000 and 10,000 USD). Discrepancies of this magnitude are
highly suspicious, so we take a deeper look at these
user_ids in the original transformed data frame
(PDF_daily_impute_char_cumulated).
new_discrep_df
## discrepancy_amount observation_count cumulative_observation_count user_count
## 1 <= $50 26718 26718 25404
## 2 <= $100 11491 38209 11243
## 3 <= $200 4830 43039 4680
## 4 <= $300 1612 44651 1570
## 5 <= $500 1103 45754 1084
## 6 <= $1000 646 46400 627
## 7 <= $10000 294 46694 279
## 8 <= $50000 6 46700 5
## 9 > $50000 0 46700 0
## cumulative_user_count project_count cumulative_project_count
## 1 25404 143 143
## 2 36647 138 281
## 3 41327 135 416
## 4 42897 104 520
## 5 43981 89 609
## 6 44608 74 683
## 7 44887 37 720
## 8 44892 4 724
## 9 44892 0 724
discrepancy_amount > 10,000 USD
We have stored the 5 customers with discrepancy_amount
> 10,000 USD in the vector
user_id_large_discrep_gt10000. Each customer’s history is
presented below with selected columns
(total_redemption_amount and credit_given_in_usd).
user_id_large_discrep_gt10000 <-
unique(PDF_indiv_reconciliation_non$user_id
[PDF_indiv_reconciliation_non$discrep_summary_non > 10000])
PDF_discrep_gt10000 <-
filter(PDF_daily_impute_char_cumulated, user_id %in% user_id_large_discrep_gt10000)
for (i in 1:5) {
print(filter(PDF_daily_impute_char_cumulated,
user_id == user_id_large_discrep_gt10000[i])[, c(1:4, 6)])
}
## user_id project_id created_at total_redemption_amount credit_given_in_usd
## 1 126211 715 2022-12-19 11704.31 0
## 2 126211 726 2022-12-27 3981.50 0
## 3 126211 726 2022-12-27 9306.72 0
## 4 126211 726 2023-01-30 15377.32 0
## user_id project_id created_at total_redemption_amount credit_given_in_usd
## 1 141174 697 2022-10-19 5000.00 0
## 2 141174 697 2022-11-02 5610.81 0
## 3 141174 697 2022-11-16 67.05 0
## 4 141174 697 2022-12-16 144.71 0
## 5 141174 708 2022-12-01 103.93 0
## user_id project_id created_at total_redemption_amount credit_given_in_usd
## 1 193280 697 2022-12-08 31250 0
## user_id project_id created_at total_redemption_amount credit_given_in_usd
## 1 225744 697 2023-01-23 15000 0
## user_id project_id created_at total_redemption_amount credit_given_in_usd
## 1 61487 716 2021-10-26 0.00 3625
## 2 61487 716 2021-10-29 636.92 0
## 3 61487 716 2021-11-03 1255.33 0
## 4 61487 716 2021-11-04 595.55 0
## 5 61487 716 2021-11-18 2463.84 0
## 6 61487 716 2021-11-18 0.00 6500
## 7 61487 716 2021-12-03 1536.23 0
## 8 61487 716 2021-12-03 93.63 0
## 9 61487 716 2021-12-09 1210.69 0
## 10 61487 716 2021-12-10 219.93 0
## 11 61487 716 2021-12-15 596.64 0
## 12 61487 716 2022-01-14 1655.99 0
## 13 61487 716 2022-01-15 0.00 6500
## 14 61487 716 2022-01-17 542.19 0
## 15 61487 716 2022-01-18 839.43 0
## 16 61487 716 2022-01-19 799.14 0
## 17 61487 716 2022-01-28 678.29 0
## 18 61487 716 2022-02-03 436.59 0
## 19 61487 716 2022-02-11 843.79 0
## 20 61487 716 2022-03-31 462.72 0
## 21 61487 716 2022-04-27 753.42 0
## 22 61487 716 2022-04-27 26.13 0
## 23 61487 716 2022-05-06 972.25 0
## 24 61487 716 2022-05-06 55.51 0
## 25 61487 716 2022-05-06 0.00 6500
## 26 61487 716 2022-05-09 1212.89 0
## 27 61487 716 2022-09-05 1523.17 0
## 28 61487 716 2022-09-08 639.11 0
## 29 61487 716 2022-09-09 1883.55 0
## 30 61487 716 2022-09-26 930.88 0
## 31 61487 716 2022-10-25 777.39 0
## 32 61487 716 2022-10-25 26.13 0
## 33 61487 716 2022-11-01 236.81 0
## 34 61487 716 2022-11-02 1813.86 0
## 35 61487 716 2022-11-10 483.42 0
## 36 61487 716 2022-11-11 1651.65 0
## 37 61487 716 2022-11-22 1937.98 0
## 38 61487 716 2022-11-29 1642.94 0
## 39 61487 716 2022-11-30 3501.43 0
## 40 61487 716 2023-01-19 517.16 0
## 41 61487 716 2023-01-20 462.72 0
## 42 61487 716 2023-01-31 1433.36 0
We find that, apart from the last user_id
(61487), the first four customers only redeem but never
purchase, and the redemption amounts are substantial. The last
user_id, despite purchasing a total of four times, still
redeems more than he/she purchases. We may suspect that the
purchases go through other mediums, or that the project/application is
offering some special promotion (e.g. winning a “lottery”). Each of
these 5 customers’ histories is written to a csv file below.
# The loop variable must not share the column name `user_id`; otherwise
# `user_id == user_id` compares the column with itself and filters nothing.
for (uid in user_id_large_discrep_gt10000) {
  PDF_user <- filter(PDF_daily_impute_char_cumulated, user_id == uid)
  write_csv(PDF_user, paste0("PDF_", uid, ".csv"))
}
Later in section 8, we will model several examples from here into dynamic visualizations.
discrepancy_amount > 1,000 USD
Similarly, we can extract customers whose
discrepancy_amount is greater than 1,000 USD, and they are
stored in PDF_discrep_gt1000.
user_id_large_discrep_gt1000 <-
unique(PDF_indiv_reconciliation_non$user_id
[PDF_indiv_reconciliation_non$discrep_summary_non > 1000])
PDF_discrep_gt1000 <-
filter(PDF_daily_impute_char_cumulated, user_id %in% user_id_large_discrep_gt1000)
dim(PDF_discrep_gt1000)
## [1] 7109 69
This data frame consists of 7,109 (dynamic)
observations across a total of 284 unique user_ids.
The dynamic daily panel data frame for non-reconcilable accounts has
already been built in Reconciliation Analysis I [1] and can be
invoked via PDF_indiv_reconciliation_non_dynamic. We
can reproduce that result here from the most comprehensive panel data
frame, PDF_daily_impute_char_cumulated. However, the dynamic
version of the reconcilable accounts has not been established yet. In this
section, we create the equivalent dynamic history of the
reconcilable accounts so that we can analyze their customer lifetime value
(CLV) later.
unique_pairs_recon <-
PDF_indiv_reconciliation |> distinct(user_id, project_id)
PDF_indiv_reconciliation_dynamic <-
PDF_daily_impute_char_cumulated |>
inner_join(unique_pairs_recon, by = c("user_id", "project_id"))
PDF_indiv_reconciliation_dynamic <-
PDF_indiv_reconciliation_dynamic |>
arrange(user_id, project_id, created_at)
dim(PDF_indiv_reconciliation_dynamic)
## [1] 157638 69
PDF_indiv_reconciliation_dynamic$user_id |> unique() |> length()
## [1] 48876
unique_pairs_non <-
PDF_indiv_reconciliation_non |> distinct(user_id, project_id)
PDF_indiv_reconciliation_non_dynamic <-
PDF_daily_impute_char_cumulated |>
inner_join(unique_pairs_non, by = c("user_id", "project_id"))
PDF_indiv_reconciliation_non_dynamic <-
PDF_indiv_reconciliation_non_dynamic |>
arrange(user_id, project_id, created_at)
dim(PDF_indiv_reconciliation_non_dynamic)
## [1] 126477 69
PDF_indiv_reconciliation_non_dynamic$user_id |> unique() |> length()
## [1] 43217
It is rather inefficient to process multiple panel data frames, so we
aggregate and integrate them. The resulting daily panel
data frame will be directly applicable to our future statistical
modeling, so we name it simply PDF.
is_reconcilable and is_last
In particular, the above two frames will be merged after adding a manual
categorizing column, is_reconcilable, with two
factors: 0 (non-reconcilable) and 1
(reconcilable). To access the last created_at easily,
another new column called is_last will be mutated as well,
again with two factors: 0 (not the last observation of this
unique pair) and 1 (the last observation of this unique
pair).
PDF_indiv_reconciliation_dynamic <-
mutate(PDF_indiv_reconciliation_dynamic, is_reconcilable = 1)
PDF_indiv_reconciliation_non_dynamic <-
mutate(PDF_indiv_reconciliation_non_dynamic, is_reconcilable = 0)
PDF <-
rbind(PDF_indiv_reconciliation_dynamic, PDF_indiv_reconciliation_non_dynamic)
PDF <- PDF |>
arrange(user_id, project_id, created_at) |>
group_by(user_id, project_id) |>
mutate(is_last = ifelse(created_at == max(created_at), 1, 0))
head(PDF[, c(1:3, 70:71)], 10)
## # A tibble: 10 × 5
## # Groups: user_id, project_id [4]
## user_id project_id created_at is_reconcilable is_last
## <int> <int> <chr> <dbl> <dbl>
## 1 10017 185 2022-08-04 1 0
## 2 10017 185 2022-08-05 1 0
## 3 10017 185 2023-01-01 1 0
## 4 10017 185 2023-01-18 1 1
## 5 10048 200 2022-06-23 0 0
## 6 10048 200 2022-06-30 0 0
## 7 10048 200 2022-10-19 0 1
## 8 10050 200 2022-12-22 0 1
## 9 10074 185 2021-01-29 0 0
## 10 10074 185 2021-05-28 0 0
discrep_amount and discrep_category
Our previous analyses added the labels
discrep_summary_non and discrep_category to
PDF_indiv_reconciliation_non. This information is still
important, so we will manually implement these labels in our giant
PDF. To re-emphasize, discrep_amount is
defined as cumsum_total_redemption_amount -
cumsum_credit_given_in_usd.
PDF <-
mutate(PDF, discrep_amount = cumsum_total_redemption_amount - cumsum_credit_given_in_usd)
head(PDF[, c(1:3, 70:72)], 10)
## # A tibble: 10 × 6
## # Groups: user_id, project_id [4]
## user_id project_id created_at is_reconcilable is_last discrep_amount
## <int> <int> <chr> <dbl> <dbl> <dbl>
## 1 10017 185 2022-08-04 1 0 234.
## 2 10017 185 2022-08-05 1 0 -1016.
## 3 10017 185 2023-01-01 1 0 -758.
## 4 10017 185 2023-01-18 1 1 -643.
## 5 10048 200 2022-06-23 0 0 116.
## 6 10048 200 2022-06-30 0 0 180.
## 7 10048 200 2022-10-19 0 1 218.
## 8 10050 200 2022-12-22 0 1 104.
## 9 10074 185 2021-01-29 0 0 -330
## 10 10074 185 2021-05-28 0 0 -660
Negative values of discrep_amount may seem a little tricky,
since we have never seen them before. That is because we previously
studied this quantity only in
PDF_indiv_reconciliation_non; that is, only
non-reconcilable, non-dynamic customers were presented there. In fact,
of is_reconcilable, is_last, and
discrep_amount, only two columns are strictly needed,
because reconcilability can be determined as follows.
is_last == 1 (cumulative row) +
discrep_amount <= 0 (non-positive discrepancy amount)
\(\rightarrow\)
is_reconcilable == 1 (reconcilable account)
is_last == 1 (cumulative row) +
discrep_amount > 0 (positive discrepancy amount) \(\rightarrow\)
is_reconcilable == 0 (non-reconcilable account)
Other combinations follow by analogy. For clarity, we still keep all
of these columns in PDF, but bear in mind that
we have a legitimate reason to remove any one of them.
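This rule can be sanity-checked directly on PDF. The following is a minimal sketch, assuming the rule holds exactly on every is_last == 1 row; it should return TRUE if so.
PDF |>
  filter(is_last == 1) |>
  ungroup() |>
  # every last (cumulative) row: reconcilable exactly when discrepancy is non-positive
  summarise(rule_holds = all((is_reconcilable == 1) == (discrep_amount <= 0)))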
Then, we add a binning column called
discrep_category. The logic of this mutation is
adapted from the code in Reconciliation Analysis I [1].
break_points <- c(-Inf, 0, 50, 100, 200, 500, 1000, 10000, 50000, Inf)
labels <- c("$0-", "$0 ~ $50", "$50 ~ $100", "$100 ~ $200", "$200 ~ $500",
"$500 ~ $1000", "$1000 ~ $10000", "$10000 ~ $50000", "$50000+")
PDF$discrep_category <-
cut(PDF$discrep_amount, breaks = break_points, labels = labels, include.lowest = TRUE)
head(PDF[, c(1:3, 72:73)], 10)
## # A tibble: 10 × 5
## # Groups: user_id, project_id [4]
## user_id project_id created_at discrep_amount discrep_category
## <int> <int> <chr> <dbl> <fct>
## 1 10017 185 2022-08-04 234. $200 ~ $500
## 2 10017 185 2022-08-05 -1016. $0-
## 3 10017 185 2023-01-01 -758. $0-
## 4 10017 185 2023-01-18 -643. $0-
## 5 10048 200 2022-06-23 116. $100 ~ $200
## 6 10048 200 2022-06-30 180. $100 ~ $200
## 7 10048 200 2022-10-19 218. $200 ~ $500
## 8 10050 200 2022-12-22 104. $100 ~ $200
## 9 10074 185 2021-01-29 -330 $0-
## 10 10074 185 2021-05-28 -660 $0-
The suspicious promotion column implemented in
Reconciliation Analysis II [2] is a meaningful and powerful categorizing
column. We first briefly review what we did last time.
The following code chunk is adapted from Reconciliation Analysis II [2]. This procedure generally returns accurate categorizations, and we will incorporate it into our more advanced algorithms.
PDF_indiv_reconciliation_non_seg1 <-
PDF_indiv_reconciliation_non[PDF_indiv_reconciliation_non$discrep_summary_non <= 300, ]
density_non_seg1 <- density(PDF_indiv_reconciliation_non_seg1$discrep_summary_non)
peaks_x_seg1 <- density_non_seg1$x[which(diff(sign(diff(density_non_seg1$y))) < 0)]
peaks_y_seg1 <- density_non_seg1$y[which(diff(sign(diff(density_non_seg1$y))) < 0)]
peaks_pdf_seg1 <- data.frame(x = peaks_x_seg1, y = peaks_y_seg1)
peak_list <- c(24.32, 48.89, 74.1, 99.3, 123.25, 149.08,
174.28, 198.86, 222.8, 249.27, 273.21, 298.41)
multiples_of_25 <- c(25, 50, 75, 100, 125, 150,
175, 200, 225, 250, 275, 300)
midpoints <- (peak_list + multiples_of_25) / 2
min_interval_width <- 3 # may vary based on personal judgment
interval_list <- list()
for(i in 1:length(midpoints)) {
dist_to_peak <- abs(midpoints[i] - peak_list[i])
dist_to_multiple <- abs(midpoints[i] - multiples_of_25[i])
half_width <- max(dist_to_peak, dist_to_multiple, min_interval_width / 2)
lower_bound <- midpoints[i] - half_width
upper_bound <- midpoints[i] + half_width
interval_list[[i]] <-
c("mid" = midpoints[i], "lowerCI" = lower_bound, "upperCI" = upper_bound)
}
interval_df <- do.call(rbind, interval_list)
interval_df <- as.data.frame(interval_df)
interval_df$peak_list <- peak_list
interval_df$multiples_of_25 <- multiples_of_25
interval_df <- interval_df[, c("peak_list", "multiples_of_25",
"mid", "lowerCI", "upperCI")]
density_estimate <-
density(PDF_indiv_reconciliation_non_seg1$discrep_summary_non)
peak_density <-
approx(x = density_estimate$x, y = density_estimate$y, xout = peak_list)$y
interval_df$peak_density <- peak_density
compute_density_values <- function(lower, upper) {
x_values <- seq(lower, upper, length.out = 100)
y_values <- approx(x = density_estimate$x, y = density_estimate$y, xout = x_values)$y
data.frame(x = x_values, y = y_values)
}
interval_densities <- lapply(1:nrow(interval_df), function(i) {
compute_density_values(interval_df$lowerCI[i], interval_df$upperCI[i])
})
interval_densities_df <- do.call(rbind, interval_densities)
interval_densities_df$group <- rep(1:nrow(interval_df), each = 100)
ggplot(PDF_indiv_reconciliation_non_seg1, aes(x = discrep_summary_non)) +
geom_density(fill = "#FF6666", alpha = 0.5) +
geom_text(data = peaks_pdf_seg1, aes(x = x, y = y, label = round(x, 2)),
vjust = -1, color = "black", size = 3) +
geom_segment(data = interval_df,
aes(x = lowerCI, xend = lowerCI, y = 0, yend = peak_density),
color = "black", linetype = "dashed", linewidth = 0.7) +
geom_segment(data = interval_df,
aes(x = upperCI, xend = upperCI, y = 0, yend = peak_density),
color = "black", linetype = "dashed", linewidth = 0.7) +
geom_ribbon(data = interval_densities_df, aes(x = x, ymin = 0, ymax = y, fill = factor(group)),
alpha = 0.9) +
xlim(c(0, 305)) +
labs(x = "Discrepancy Amount ($)", y = "Density",
title = "Density Plot of Discrepancy Amount (<= $305)",
fill = "Group") +
scale_fill_manual(values = c("1" = "orchid", "2" = "darkorchid4", "3" = "orchid", "4" = "orchid",
"5" = "plum", "6" = "plum", "7" = "plum", "8" = "plum",
"9" = "plum", "10" = "plum", "11" = "plum", "12" = "plum"),
labels = c("1" = "[23.160, 26.160]", "2" = "[47.945, 50.945]", "3" = "[73.050, 76.050]",
"4" = "[98.150, 101.150]", "5" = "[122.625, 125.625]", "6" = "[148.040, 151.040]",
"7" = "[173.140, 176.140]", "8" = "[197.930, 200.930]", "9" = "[222.400, 225.400]",
"10" = "[248.135, 251.135]", "11" = "[272.605, 275.605]", "12" = "[297.705, 300.705]")) + theme_minimal()
interval_df <- interval_df |>
mutate(suspicious_promo = case_when(row_number() == 2 ~ 3,
row_number() %in% c(1, 3:4) ~ 2,
TRUE ~ 1))
interval_df
## peak_list multiples_of_25 mid lowerCI upperCI peak_density
## 1 24.32 25 24.660 23.160 26.160 0.0204172075
## 2 48.89 50 49.445 47.945 50.945 0.0294787060
## 3 74.10 75 74.550 73.050 76.050 0.0125878868
## 4 99.30 100 99.650 98.150 101.150 0.0061677817
## 5 123.25 125 124.125 122.625 125.625 0.0018325601
## 6 149.08 150 149.540 148.040 151.040 0.0013604500
## 7 174.28 175 174.640 173.140 176.140 0.0006942789
## 8 198.86 200 199.430 197.930 200.930 0.0010070916
## 9 222.80 225 223.900 222.400 225.400 0.0004112768
## 10 249.27 250 249.635 248.135 251.135 0.0006718587
## 11 273.21 275 274.105 272.605 275.605 0.0003076106
## 12 298.41 300 299.205 297.705 300.705 0.0004244653
## suspicious_promo
## 1 2
## 2 3
## 3 2
## 4 2
## 5 1
## 6 1
## 7 1
## 8 1
## 9 1
## 10 1
## 11 1
## 12 1
find_interval <- function(x, intervals) {
for(i in 1:nrow(intervals)) {
if(x >= intervals$lowerCI[i] & x <= intervals$upperCI[i]) {
return(intervals$suspicious_promo[i])
}
}
return(0)
}
PDF <-
PDF |> mutate(suspicious_promo = sapply(discrep_amount, find_interval, intervals = interval_df))
PDF <-
PDF |> mutate(suspicious_promo = ifelse(is_last == 0, -1, suspicious_promo))
head(PDF[, c(1:3, 71:72, 74)], 10)
## # A tibble: 10 × 6
## # Groups: user_id, project_id [4]
## user_id project_id created_at is_last discrep_amount suspicious_promo
## <int> <int> <chr> <dbl> <dbl> <dbl>
## 1 10017 185 2022-08-04 0 234. -1
## 2 10017 185 2022-08-05 0 -1016. -1
## 3 10017 185 2023-01-01 0 -758. -1
## 4 10017 185 2023-01-18 1 -643. 0
## 5 10048 200 2022-06-23 0 116. -1
## 6 10048 200 2022-06-30 0 180. -1
## 7 10048 200 2022-10-19 1 218. 0
## 8 10050 200 2022-12-22 1 104. 0
## 9 10074 185 2021-01-29 0 -330 -1
## 10 10074 185 2021-05-28 0 -660 -1
PDF[, c(1:3, 71:72, 74)] |> filter(is_last == 1) |> head(10)
## # A tibble: 10 × 6
## # Groups: user_id, project_id [9]
## user_id project_id created_at is_last discrep_amount suspicious_promo
## <int> <int> <chr> <dbl> <dbl> <dbl>
## 1 10017 185 2023-01-18 1 -643. 0
## 2 10048 200 2022-10-19 1 218. 0
## 3 10050 200 2022-12-22 1 104. 0
## 4 10074 185 2023-01-07 1 622. 0
## 5 10084 184 2021-09-18 1 484. 0
## 6 10085 184 2022-10-25 1 50.0 3
## 7 10105 184 2022-06-23 1 234. 0
## 8 10105 184 2022-06-23 1 233. 0
## 9 10108 184 2022-01-29 1 13.0 0
## 10 10117 200 2022-02-07 1 149. 1
Since our PDF is the dynamic version, only rows with
is_last == 1 are valuable for analyzing
suspicious_promo. To avoid confusing ourselves, we
label the non-last rows with suspicious_promo == -1, so
a reader immediately knows that this factor level
carries no analytical meaning.
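A quick tabulation (a sketch, not part of the original pipeline) confirms the labeling: every is_last == 0 row should carry the -1 placeholder.
PDF |>
  ungroup() |>
  count(is_last, suspicious_promo)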
Algorithm 1 should capture the majority of “suspicious”
promotions. However, one case stands out as odd: some
discrep_amount values are perfectly round positive integers
(e.g. 15.00, 130.00, etc.). None of these round integers fall
within any of our “confidence intervals”, yet we are highly skeptical
and inclined to treat them as “suspicious” promotions too. How
suspicious they are is an interesting question. Our
subjective choice is to treat these round integers as new
classes, i.e. 1.5 or 2.5. Recall the previous
definition in Reconciliation Analysis II [2]:
0 (very unlikely of being offered
promotions)
1 (unlikely of being offered
promotions)
2 (likely of being offered
promotions)
3 (very likely of being offered
promotions)
This is the extended definition:
1.5 (somewhat likely of being offered
promotions)
2.5 (highly likely of being offered
promotions)
-1 (meaningless label indicating the
observation in progress)
The reason we place 1.5 between 1 (unlikely)
and 2 (likely) can also be seen from the density plot:
round integers tend to have higher densities, and those peak heights
fall roughly between the peak heights of class 2 and
class 1. 2.5 covers the more special cases. For
example, besides the multiples of 25, other multiples of, say, 5 are
strongly suggestive of the promotion category.
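To make this comparison inspectable, here is a minimal sketch reusing density_estimate and multiples_of_25 from above; round_x and round_density are illustrative names, not part of the pipeline.
# density at the remaining multiples of 5 (those not already intervalized)
round_x <- setdiff(seq(5, 300, by = 5), multiples_of_25)
round_density <- approx(density_estimate$x, density_estimate$y, xout = round_x)$y
summary(round_density)
# compare against the existing class peak densities
interval_df[, c("peak_list", "peak_density", "suspicious_promo")]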
However, we can’t blindly update every round integer to
1.5 or 2.5. Our domain reasoning here must
abide by the following criteria.
discrep_amount > 0 (positivity)
is.integer(discrep_amount) == TRUE
(wholeness)
discrep_amount %% 5 == 0 (periodicity) [if TRUE,
then assign 2.5, else assign 1.5]
Note that this is an extension of Algorithm 1, so
we only consider updating 0s into either
1.5 or 2.5. Do not recast 3,
2, 1, or -1 into 1.5
or 2.5, since the density-based approach dominates
given the peak heights.
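Before mutating PDF itself, the three criteria above can be expressed as a standalone sketch; classify_round is a hypothetical helper used only for illustration here.
classify_round <- function(amount) {
  # positivity + wholeness first; periodicity then decides 2.5 versus 1.5
  if (amount > 0 && amount %% 1 == 0) {
    if (amount %% 5 == 0) 2.5 else 1.5
  } else {
    0
  }
}
sapply(c(15, 130, 7, 12.5, -20), classify_round)
## [1] 2.5 2.5 1.5 0.0 0.0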
PDF <- mutate(PDF,
              suspicious_promo = ifelse(is_last == 0, -1, suspicious_promo),
              # positive, whole, and not a multiple of 5 -> somewhat likely (1.5)
              suspicious_promo = ifelse(discrep_amount > 0 & discrep_amount %% 1 == 0 &
                                          discrep_amount %% 5 != 0 & suspicious_promo == 0,
                                        1.5, suspicious_promo),
              # positive, whole, and a multiple of 5 -> highly likely (2.5)
              suspicious_promo = ifelse(discrep_amount > 0 & discrep_amount %% 1 == 0 &
                                          discrep_amount %% 5 == 0 & suspicious_promo == 0,
                                        2.5, suspicious_promo))
PDF <- transform(PDF, suspicious_promo = as.factor(suspicious_promo))
PDF[, c(1:3, 70:72, 74)] |> filter(is_last == 1, is_reconcilable == 0) |> head(20)
## user_id project_id created_at is_reconcilable is_last discrep_amount
## 1 10048 200 2022-10-19 0 1 218.03
## 2 10050 200 2022-12-22 0 1 104.36
## 3 10074 185 2023-01-07 0 1 622.16
## 4 10084 184 2021-09-18 0 1 483.92
## 5 10085 184 2022-10-25 0 1 50.04
## 6 10105 184 2022-06-23 0 1 233.79
## 7 10105 184 2022-06-23 0 1 232.92
## 8 10108 184 2022-01-29 0 1 12.98
## 9 10117 200 2022-02-07 0 1 149.38
## 10 10119 208 2022-06-19 0 1 55.00
## 11 10137 184 2022-05-26 0 1 248.68
## 12 10137 184 2022-05-26 0 1 196.57
## 13 10138 184 2021-10-13 0 1 95.34
## 14 10138 184 2021-10-13 0 1 92.39
## 15 10140 184 2022-01-13 0 1 102.98
## 16 10152 700 2022-12-16 0 1 118.54
## 17 10202 185 2021-10-31 0 1 53.10
## 18 10212 224 2022-10-19 0 1 285.80
## 19 10212 277 2022-11-02 0 1 266.87
## 20 10212 302 2022-08-02 0 1 1.00
## suspicious_promo
## 1 0
## 2 0
## 3 0
## 4 0
## 5 3
## 6 0
## 7 0
## 8 0
## 9 1
## 10 2.5
## 11 1
## 12 0
## 13 0
## 14 0
## 15 0
## 16 0
## 17 0
## 18 0
## 19 0
## 20 1.5
In this section, we compare our suspicious_promo
accuracy against the real promotional columns:
utm_campaign, utm_medium,
utm_content, and utm_source. Before
proceeding with this correlational study, however, we should correct the
-1 label in PDF. It does indicate
an observation in progress, but to compare with the
utm-related columns we must cast all factors into a
binary form (1 or 0). This means every
in-progress observation will be assigned the same value as its
cumulative comparison result. To clarify, we mutate a new column
called suspicious_promo_binary, with only 2
factor outcomes: 1 or 0.
Take the unique pair user_id == 10017 and
project_id == 185 as an example. Looking at its
is_last == 1 row, we find
suspicious_promo == 0. This translates into
suspicious_promo_binary == 0 for the entire observation
(i.e. rows 1 to 4 all get suspicious_promo_binary == 0). If
suspicious_promo is any factor other than
0 at is_last == 1, which can be
1, 1.5, 2, 2.5, or
3, then we mark the entire observation with
suspicious_promo_binary == 1.
PDF[, c(1:3, 70:72, 74)] |> head(4)
## user_id project_id created_at is_reconcilable is_last discrep_amount
## 1 10017 185 2022-08-04 1 0 234.31
## 2 10017 185 2022-08-05 1 0 -1015.69
## 3 10017 185 2023-01-01 1 0 -757.98
## 4 10017 185 2023-01-18 1 1 -642.98
## suspicious_promo
## 1 -1
## 2 -1
## 3 -1
## 4 0
However, we should be particularly careful when we encounter multiple
is_last == 1 rows for a user_id under a
project_id. This happens when certain
customers purchase or redeem multiple times on their last day;
removing the detailed HH-MM-SS information to satisfy the
daily-panel requirement causes this special situation. In this
case, if any of these is_last == 1 rows has
suspicious_promo not equal to 0, we should
mark the entire observation for this user_id under this
project_id with
suspicious_promo_binary == 1.
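How often this multi-row situation occurs can be counted directly; a minimal sketch:
PDF |>
  group_by(user_id, project_id) |>
  summarise(n_last = sum(is_last == 1), .groups = "drop") |>
  count(n_last)  # pairs with n_last > 1 are the cases needing care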
suspicious_promo_binary
Hence, the above algorithms can be summarized in 4 steps:
Identify the unique pairs of user_id and
project_id.
For each pair, if is_last == 1 and
suspicious_promo != 0, set
suspicious_promo_binary = 1 for all rows of that
pair.
If is_last == 1 and
suspicious_promo == 0, set
suspicious_promo_binary = 0 for all rows of that
pair.
Be careful when there are multiple is_last == 1 rows
for a pair: if any of them have suspicious_promo != 0, then
all rows for that pair should be marked as
suspicious_promo_binary = 1.
set_suspicious_promo_binary <- function(is_last, suspicious_promo) {
if (any((is_last == 1) & (suspicious_promo != 0))) {
return(1)
}
else {
return(0)
}
}
PDF <-
PDF |> group_by(user_id, project_id) |>
mutate(suspicious_promo_binary = set_suspicious_promo_binary(is_last, suspicious_promo))
PDF[83:200, c(1:3, 71, 74:75)]
## # A tibble: 118 × 6
## # Groups: user_id, project_id [38]
## user_id project_id created_at is_last suspicious_promo suspicious_promo_bin…¹
## <int> <int> <chr> <dbl> <fct> <dbl>
## 1 10212 224 2022-10-19 1 0 0
## 2 10212 255 2022-02-11 0 -1 0
## 3 10212 255 2022-02-12 0 -1 0
## 4 10212 255 2022-02-13 1 0 0
## 5 10212 277 2021-04-11 0 -1 0
## 6 10212 277 2021-04-11 0 -1 0
## 7 10212 277 2021-07-01 0 -1 0
## 8 10212 277 2022-01-31 0 -1 0
## 9 10212 277 2022-02-24 0 -1 0
## 10 10212 277 2022-02-28 0 -1 0
## # ℹ 108 more rows
## # ℹ abbreviated name: ¹suspicious_promo_binary
utm_binary
In this sub-section, we mutate a generic column called
utm_binary that summarizes all utm-related
columns (i.e. utm_campaign, utm_medium,
utm_content, and utm_source). It has only two
outcomes, either 1 or 0. Before we proceed,
let's first take a look at how much data is missing in
PDF among these 4 variables.
missing_values <-
sapply(PDF[c("utm_campaign", "utm_medium", "utm_content", "utm_source")],
function(x) mean(is.na(x)) * 100)
missing_values_df <- data.frame(Variable = names(missing_values),
MissingPercentage = missing_values)
ggplot(missing_values_df, aes(x = Variable, y = MissingPercentage)) +
geom_bar(stat = "identity", fill = "steelblue", color = "black") + ylim(c(0, 100)) +
geom_text(aes(label = sprintf("%.2f%%", MissingPercentage)), vjust = -0.3, color = "black") +
labs(x = "Variable", y = "Percentage of Missing Data") +
theme_minimal()
All four variables are fairly sparse: around 14% of the values in each are missing (NA). For simplicity, we treat customers with missing data in these 4 variables as customers who do not receive promotions for the given projects.
The mutation of utm_binary abides by the following
criteria:
Mutate a generic column called utm_binary with
all values equal to 0.
Search through each row and inspect all
utm-related columns (i.e. utm_campaign,
utm_medium, utm_content, and
utm_source). If any one of these 4 columns is neither
NA nor empty [in other words, some
information exists], then update the 0 to
1.
If for any user_id under a project_id
(a unique pair) there exists at least one row with
utm_binary == 1, we set
utm_binary == 1 for all rows of that unique
pair.
check_not_na_or_empty <- function(x) {
return(!is.na(x) & x != "")
}
PDF$utm_binary <-
ifelse(rowSums(sapply(PDF[, c("utm_campaign", "utm_medium", "utm_content", "utm_source")],
check_not_na_or_empty)) > 0, 1, 0)
PDF <- PDF |> group_by(user_id, project_id) |>
mutate(utm_binary = ifelse(any(utm_binary == 1), 1, utm_binary)) |> ungroup()
PDF[101:110, c(1:3, 61:64)]
## # A tibble: 10 × 7
## user_id project_id created_at utm_campaign utm_medium utm_content utm_source
## <int> <int> <chr> <chr> <chr> <chr> <chr>
## 1 10212 277 2022-09-28 <NA> <NA> <NA> <NA>
## 2 10212 277 2022-10-05 <NA> <NA> <NA> <NA>
## 3 10212 277 2022-10-27 <NA> <NA> <NA> <NA>
## 4 10212 277 2022-11-02 <NA> <NA> <NA> <NA>
## 5 10212 302 2022-08-02 <NA> <NA> <NA> <NA>
## 6 10212 302 2022-08-02 <NA> <NA> <NA> <NA>
## 7 10212 306 2022-10-27 <NA> <NA> <NA> <NA>
## 8 10212 310 2022-04-15 welcome email "" modal
## 9 10212 311 2021-03-10 <NA> <NA> <NA> <NA>
## 10 10212 311 2021-03-10 <NA> <NA> <NA> <NA>
PDF[101:110, c(1:3, 76, 75, 72)]
## # A tibble: 10 × 6
## user_id project_id created_at utm_binary suspicious_promo_binary
## <int> <int> <chr> <dbl> <dbl>
## 1 10212 277 2022-09-28 0 0
## 2 10212 277 2022-10-05 0 0
## 3 10212 277 2022-10-27 0 0
## 4 10212 277 2022-11-02 0 0
## 5 10212 302 2022-08-02 0 1
## 6 10212 302 2022-08-02 0 1
## 7 10212 306 2022-10-27 0 1
## 8 10212 310 2022-04-15 1 0
## 9 10212 311 2021-03-10 0 1
## 10 10212 311 2021-03-10 0 1
## # ℹ 1 more variable: discrep_amount <dbl>
In this section, we model the relationship between
suspicious_promo_binary and utm_binary with
binary logistic regression.
First, let's take a look at the pair plot of these two variables.
The correlation surprises us: -0.338, a
moderate negative correlation between
suspicious_promo_binary and utm_binary!
ggpairs(PDF[, c("suspicious_promo_binary", "utm_binary")])
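The quoted correlation can be cross-checked without ggpairs; a one-line sketch:
cor(PDF$suspicious_promo_binary, PDF$utm_binary)  # about -0.338, as reported above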
Then we construct the binary logistic regression model with
glm(). The model is stored as corr_model,
and its summary statistics are shown below. The likelihood plot for
the model is also presented.
corr_model <-
glm(utm_binary ~ suspicious_promo_binary, data = PDF, family = binomial)
options(scipen = 999)
summary(corr_model)
##
## Call:
## glm(formula = utm_binary ~ suspicious_promo_binary, family = binomial,
## data = PDF)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1457 -1.1457 -0.4684 1.2094 2.1281
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.074969 0.004352 -17.23 <0.0000000000000002 ***
## suspicious_promo_binary -2.079702 0.012916 -161.01 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 378651 on 284114 degrees of freedom
## Residual deviance: 341343 on 284113 degrees of freedom
## AIC: 341347
##
## Number of Fisher Scoring iterations: 4
x_values <-
seq(min(PDF$suspicious_promo_binary), max(PDF$suspicious_promo_binary),
length.out = 500)
y_values <-
predict(corr_model, newdata = data.frame(suspicious_promo_binary = x_values),
type = "response")
ggplot() + geom_line(aes(x = x_values, y = y_values), color = "steelblue") +
labs(title = "Likelihood Plot for Binary Logistic Regression Model",
x = "suspicious_promo_binary",
y = "Predicted Probability of utm_binary == 1") +
theme_minimal()
From the binary logistic regression plot above, we can read off more
detailed behavior than a single correlation value conveys. In particular, when
suspicious_promo_binary == 1, the predicted probability of
utm_binary == 1 is very low, around 0.1. On the other hand,
when suspicious_promo_binary == 0, the predicted
probability of utm_binary == 1 is close to even,
around 0.5.
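These two probabilities follow directly from the fitted coefficients; a quick hand-check with the inverse logit plogis():
plogis(-0.074969)              # suspicious_promo_binary == 0 -> about 0.48
plogis(-0.074969 - 2.079702)   # suspicious_promo_binary == 1 -> about 0.10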
Intuitively speaking, if a customer under a specific project has been marked as a “suspicious” promotion receiver because their discrepancy amount either sits on a density peak or is a round multiple, that customer is indeed quite unlikely to have received a real promotion. On the other hand, if the customer is not marked as a “suspicious” promotion receiver, the chance of receiving a real promotion is roughly half-and-half.
Lastly, we can plot the Receiver Operating Characteristic (ROC) curve, which traces the true positive rate against the false positive rate as the prediction probability cutoff is lowered from 1 to 0. The higher the AUC (area under the curve), the more accurately our model is able to predict outcomes.
predicted_probs <- predict(corr_model, type = "response")
roc_obj <- roc(PDF$utm_binary, predicted_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_val <- auc(roc_obj)
plot(roc_obj, main="Receiver Operating Characteristic (ROC) Curve Plot")
text(0.6, 0.2, paste("AUC =", round(auc_val, 2)))
We can see that the AUC is 0.65, which is
unfortunately only middling. This indicates that our model
corr_model performs relatively poorly at distinguishing
between the positive and negative classes.
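A natural follow-up is to extract the probability cutoff that maximizes Youden's J statistic; a sketch using pROC's coords() (with its default "youden" best method):
coords(roc_obj, "best", ret = c("threshold", "sensitivity", "specificity"))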
In this section, we visualize the dynamic reconciliation for
some representative user_ids, chosen in
advance in section 2.
user_id == 61487 (only across a homogeneous project, non-reconcilable account)
PDF_61487_cumulative <-
filter(PDF[, c(1:3, 9, 11, 70)], user_id %in% 61487)
PDF_61487_cumulative$created_at <- as.Date(PDF_61487_cumulative$created_at)
PDF_61487_transf <- data.frame()
for(project in unique(PDF_61487_cumulative$project_id)) {
PDF_61487_sub <- PDF_61487_cumulative[PDF_61487_cumulative$project_id == project, ]
PDF_61487_sub_long <-
reshape2::melt(PDF_61487_sub, id.vars = c("user_id", "project_id", "created_at"),
measure.vars = c("cumsum_total_redemption_amount", "cumsum_credit_given_in_usd"),
variable.name = "Measure", value.name = "Value")
PDF_61487_transf <- rbind(PDF_61487_transf, PDF_61487_sub_long)
}
ggplot(PDF_61487_transf,
aes(x = created_at, y = Value, color = as.factor(project_id), linetype = Measure)) +
geom_line() +
labs(x = "Time", y = "Cumulative Value ($)", color = "Project ID", linetype = "Measure",
title = "Transaction and Redemption for Homogeneous Dynamic Non-reconcilable Account 61487") +
theme_minimal() +
# theme() tweaks must come after theme_minimal(), which would otherwise reset them
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right")
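As a preview of the dynamic visualizations planned for section 8, the same plot can be made interactive; a minimal sketch wrapping a stored ggplot object with plotly::ggplotly() (p_61487 is an illustrative name):
p_61487 <- ggplot(PDF_61487_transf,
                  aes(x = created_at, y = Value,
                      color = as.factor(project_id), linetype = Measure)) +
  geom_line() +
  theme_minimal()
ggplotly(p_61487)  # adds hover tooltips, zooming, and panning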
user_id == 10212 (across heterogeneous projects, non-reconcilable account)
PDF_10212_cumulative <-
filter(filter(PDF[, c(1:3, 9, 11, 70)], is_reconcilable == 0), user_id %in% 10212)
PDF_10212_cumulative$created_at <- as.Date(PDF_10212_cumulative$created_at)
PDF_10212_transf <- data.frame()
for(project in unique(PDF_10212_cumulative$project_id)) {
PDF_10212_sub <- PDF_10212_cumulative[PDF_10212_cumulative$project_id == project, ]
PDF_10212_sub_long <-
reshape2::melt(PDF_10212_sub, id.vars = c("user_id", "project_id", "created_at"),
measure.vars = c("cumsum_total_redemption_amount", "cumsum_credit_given_in_usd"),
variable.name = "Measure", value.name = "Value")
PDF_10212_transf <- rbind(PDF_10212_transf, PDF_10212_sub_long)
}
ggplot(PDF_10212_transf,
aes(x = created_at, y = Value, color = as.factor(project_id), linetype = Measure)) +
geom_line() +
labs(x = "Time", y = "Cumulative Value ($)", color = "Project ID", linetype = "Measure",
title = "Transaction and Redemption for Heterogeneous Dynamic Non-reconcilable Account 10212") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right")
user_id == 10212 (across heterogeneous projects, reconcilable account)
PDF_10212_cumulative_recon <-
filter(filter(PDF[, c(1:3, 9, 11, 70)], is_reconcilable == 1), user_id %in% 10212)
PDF_10212_cumulative_recon$created_at <- as.Date(PDF_10212_cumulative_recon$created_at)
PDF_10212_transf_recon <- data.frame()
for(project in unique(PDF_10212_cumulative_recon$project_id)) {
PDF_10212_sub_recon <- PDF_10212_cumulative_recon[PDF_10212_cumulative_recon$project_id == project, ]
PDF_10212_sub_long_recon <-
reshape2::melt(PDF_10212_sub_recon, id.vars = c("user_id", "project_id", "created_at"),
measure.vars = c("cumsum_total_redemption_amount", "cumsum_credit_given_in_usd"),
variable.name = "Measure", value.name = "Value")
PDF_10212_transf_recon <- rbind(PDF_10212_transf_recon, PDF_10212_sub_long_recon)
}
ggplot(PDF_10212_transf_recon,
aes(x = created_at, y = Value, color = as.factor(project_id), linetype = Measure)) +
geom_line() + ylim(c(0, 12000)) +
labs(x = "Time", y = "Cumulative Value ($)", color = "Project ID", linetype = "Measure",
title = "Transaction and Redemption for Heterogeneous Dynamic Reconcilable Account 10212") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right")
## Warning: Removed 1 row containing missing values (`geom_line()`).
Lastly, we continue from Reconciliation Analysis II [2] and directly utilize our previous pseudo-stochastic selections.
PDF
We reorder the newly mutated columns after the numerical columns but before the expanded factor columns.
PDF <- PDF[, c(1:13, 70:76, 14:69)]
user_ids under each condition
The following 6 vectors are generated by our pseudo-stochastic selection. For a detailed explanation, please consult Reconciliation Analysis II [2].
CP3_user_id <- c(32227, 205555, 25174, 120196, 122843, 111576)
CP4_user_id <- c(185744, 133067, 182610, 129394)
CP5_user_id <- c(26826, 25855, 190610)
CP6_user_id <- c(40585, 31986)
CP7_user_id <- c(10800, 26018, 89038)
CP8_user_id <- c(129377, 146344)
PDF_CP3 <- filter(PDF, user_id %in% CP3_user_id)
PDF_CP4 <- filter(PDF, user_id %in% CP4_user_id)
PDF_CP5 <- filter(PDF, user_id %in% CP5_user_id)
PDF_CP6 <- filter(PDF, user_id %in% CP6_user_id)
PDF_CP7 <- filter(PDF, user_id %in% CP7_user_id)
PDF_CP8 <- filter(PDF, user_id %in% CP8_user_id)
write.csv(PDF_CP3, "PDF_CP3.csv", row.names = FALSE)
write.csv(PDF_CP4, "PDF_CP4.csv", row.names = FALSE)
write.csv(PDF_CP5, "PDF_CP5.csv", row.names = FALSE)
write.csv(PDF_CP6, "PDF_CP6.csv", row.names = FALSE)
write.csv(PDF_CP7, "PDF_CP7.csv", row.names = FALSE)
write.csv(PDF_CP8, "PDF_CP8.csv", row.names = FALSE)
[1] Z. Jiang, “RPubs - Reconciliation Analysis I - Modeling and Extracting Emblematic Non-reconcilable Dynamics,” rpubs.com, Jul. 13, 2023. https://rpubs.com/jiangzm/1063534 (accessed Jul. 21, 2023).
[2] Z. Jiang, “RPubs - Reconciliation Analysis II - Density-based Estimation of Suspicious Promotions and Pseudo-stochastic Selection of Emblematic Non-reconcilable Dynamics,” rpubs.com, Jul. 18, 2023. https://rpubs.com/jiangzm/1064494 (accessed Jul. 24, 2023).