library("plm")
library("rjson")
library("DT")
library("data.table")
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:plm':
## 
##     between
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0     ✔ purrr   0.3.4
## ✔ tibble  3.2.1     ✔ dplyr   1.1.2
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between()   masks data.table::between(), plm::between()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::first()     masks data.table::first()
## ✖ dplyr::lag()       masks plm::lag(), stats::lag()
## ✖ dplyr::last()      masks data.table::last()
## ✖ dplyr::lead()      masks plm::lead()
## ✖ purrr::transpose() masks data.table::transpose()
library("usmap")
library("ggplot2")
library("maptools")
## Loading required package: sp
## Checking rgeos availability: FALSE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
##      Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
##      which has a restricted licence. It is disabled by default;
##      to enable gpclib, type gpclibPermit()
library("mapview")
library("devtools")
## Loading required package: usethis
library("vcd")
## Loading required package: grid
library("gridExtra")
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
library("shiny")
## 
## Attaching package: 'shiny'
## 
## The following objects are masked from 'package:DT':
## 
##     dataTableOutput, renderDataTable
library("MASS")
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library("faraway")
library("rgdal")
## Please note that rgdal will be retired during 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## See https://r-spatial.org/r/2022/04/12/evolution.html and https://github.com/r-spatial/evolution
## rgdal: version: 1.6-6, (SVN revision 1201)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE 
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.6-0
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
library("tigris")
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library("sf")
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library("car")
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following objects are masked from 'package:faraway':
## 
##     logit, vif
## 
## The following object is masked from 'package:maptools':
## 
##     pointLabel
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library("GGally")
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## 
## The following object is masked from 'package:faraway':
## 
##     happy
library("patchwork")
## 
## Attaching package: 'patchwork'
## 
## The following object is masked from 'package:MASS':
## 
##     area
library("tibble")
library("corrplot")
## corrplot 0.92 loaded
library("Hmisc")
## 
## Attaching package: 'Hmisc'
## 
## The following object is masked from 'package:maptools':
## 
##     label
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library("ggcorrplot")
library("lubridate")
## 
## Attaching package: 'lubridate'
## 
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library("date")

Loading the Example ETL Data

example_ETL1 <- read.csv("/Users/apple/Desktop/380101014883.csv")

example_ETL2 <- read.csv("/Users/apple/Desktop/380100299547.csv")

example_ETL1
example_ETL2

Both example_ETL1 and example_ETL2 are sample panel (longitudinal) data frames. Our goal in EDA VII is to convert our earlier data frames into this format for later modeling purposes.
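For orientation, here is a minimal sketch of the target long-form structure, using made-up user IDs, dates, and spend values (not taken from the ETL files): one row per customer-period pair, which pdata.frame() then indexes by individual and time.

# Toy long-form panel: one row per (user_id, day) pair
toy_panel <- data.frame(
  user_id = c("10001", "10001", "10002", "10002"),
  day     = as.Date(c("2021-01-04", "2021-01-05", "2021-01-04", "2021-01-05")),
  spend   = c(25, 40, 10, 0)
)

# Declare the individual (user_id) and time (day) indexes
toy_pdf <- pdata.frame(toy_panel, index = c("user_id", "day"))

head(toy_pdf)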

Importing Related Data Frames from EDA I, II, III, IV

redemptions_2021 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/redemptions_2021.json")

redemptions_2022 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/redemptions_2022.json")

redemptions_2023_Jan <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/redemptions_2023_Jan.json")

revenue_view_2021 <-
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/revenue_view_2021.json")

revenue_view_2022 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/revenue_view_2022.json")

revenue_view_2023_Jan <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/revenue_view_2023_Jan.json")

redemptions_2021 <- lapply(redemptions_2021, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_redemptions_2021 <- as.data.frame(do.call("cbind", redemptions_2021)) |> t()

redemptions_2022 <- lapply(redemptions_2022, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_redemptions_2022 <- as.data.frame(do.call("cbind", redemptions_2022)) |> t()

redemptions_2023_Jan <- lapply(redemptions_2023_Jan, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_redemptions_2023_Jan <- as.data.frame(do.call("cbind", redemptions_2023_Jan)) |> t()

revenue_view_2021 <- lapply(revenue_view_2021, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_revenue_view_2021 <- as.data.frame(do.call("cbind", revenue_view_2021)) |> t()

revenue_view_2022 <- lapply(revenue_view_2022, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_revenue_view_2022 <- as.data.frame(do.call("cbind", revenue_view_2022)) |> t()

revenue_view_2023_Jan <- lapply(revenue_view_2023_Jan, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_revenue_view_2023_Jan <- as.data.frame(do.call("cbind", revenue_view_2023_Jan)) |> t()

df_redemptions <- 
  rbind(df_redemptions_2021, df_redemptions_2022, df_redemptions_2023_Jan)

df_revenue_view <- 
  rbind(df_revenue_view_2021, df_revenue_view_2022, df_revenue_view_2023_Jan)

df_redemptions <- data.frame(df_redemptions)

df_redemptions <- transform(df_redemptions, 
                  total_redemption_amount = as.numeric(total_redemption_amount),
                  tip = as.numeric(tip),
                  Venue.Type...Detail = as.factor(Venue.Type...Detail), 
                  Check.Average = as.factor(Check.Average),
                  Service.Type = as.factor(Service.Type))

df_revenue_view <- data.frame(df_revenue_view)

df_revenue_view <- transform(df_revenue_view, 
                  amount_charged_in_usd = as.numeric(amount_charged_in_usd),
                  credit_given_in_usd = as.numeric(credit_given_in_usd), 
                  is_app_purchase = as.factor(is_app_purchase),
                  transaction_type = as.factor(transaction_type),
                  credit_type = as.factor(credit_type),
                  is_excess = as.factor(is_excess),
                  stripe_brand = as.factor(stripe_brand))
brand_tags <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/brand_tags.json")

projects <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/projects.json")

users_thru_jan23 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/users_thru_jan23.json")

brand_tags <- lapply(brand_tags, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_brand_tags <- as.data.frame(do.call("cbind", brand_tags)) |> t()

projects <- lapply(projects, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_projects <- as.data.frame(do.call("cbind", projects)) |> t()

users_thru_jan23 <- lapply(users_thru_jan23, function(x) {
  x[sapply(x, is.null)] <- NA
  unlist(x)
})

df_users_thru_jan23 <- as.data.frame(do.call("cbind", users_thru_jan23)) |> t()

df_brand_tags <- data.frame(df_brand_tags)

df_brand_tags <- transform(df_brand_tags, 
                             is_featured = as.numeric(is_featured),
                             longitude = as.numeric(longitude),
                             latitude = as.numeric(latitude))

df_projects <- data.frame(df_projects)

df_users_thru_jan23 <- data.frame(df_users_thru_jan23)

Data Wrangling

We have reloaded all the data frames from EDA I through IV. Recall that our goal is to set up a long-form panel data frame. Potential covariates will be picked from these five data frames based on our prior judgement. Note that neither df_brand_tags nor df_projects includes the common key user_id, so both data frames are discarded in this EDA. Neither table is directly described in the major components of the matrix factorization (MF) model according to Koren, Bell, and Volinsky [1]. We will merge the remaining three data frames and then filter down to the most important covariates in the panel data. Some redundant columns are dropped immediately to keep the size manageable.

DF1 <- full_join(df_redemptions, df_revenue_view, 
                 by = c("user_id", "created_at", "project_id"))

DF <- full_join(DF1, df_users_thru_jan23, by = "user_id")

DF <- 
  mutate(DF, created_at = as.Date(DF$created_at, tz = "UTC"), 
         account_created_at = as.Date(DF$account_created_at, tz = "UTC"))

DF

Notice that we have reprocessed the timestamps created_at and account_created_at into the standard YYYY-MM-DD format.
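A quick sanity check (a sketch; output not shown here) confirms the parsed class and the observed date ranges:

class(DF$created_at)
range(DF$created_at, na.rm = TRUE)
range(DF$account_created_at, na.rm = TRUE)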

Excluding Confounding Customers

The variable account_created_at from df_users_thru_jan23 is a great resource for excluding confounding customers. Confounding customers are those who either 1) have odd or unusual customer IDs user_id, such as IDs with very few digits, or 2) registered extremely early compared with the official release date of the application; account_created_at helps us judge whether they should be tossed out.

We researched the official release date of the application online, but an exact timeline could not be found. One tactic is to look at the application’s earliest customer reviews on the Apple App Store and Google Play. The earliest review we can retrieve from Google Play [2] is dated April 17, 2020, whereas the earliest review on the Apple App Store [3] is roughly six years old (from 2017). To ensure that we do not delete real customers, we pick a safe cutoff date of 2016-12-31: real customers should have created their accounts no earlier than early 2017, and their transaction history should roughly follow that date as well. Hence, two date cutoffs are strictly enforced. Customers who either created their accounts or purchased/redeemed on the app before the end of 2016 are considered zombie users and are tossed out.

DF_fil <-
  filter(DF, (account_created_at >= as.Date("2016-12-31")) & 
              created_at >= as.Date("2016-12-31"))

DF_fil <- filter(DF_fil, !is.na(user_id))

dim(DF)
## [1] 416996     34
dim(DF_fil)
## [1] 325064     34
nrow(DF) - nrow(DF_fil)
## [1] 91932

The first exclusion, based on customers’ timing, shrinks the data frame by roughly 22%. The second exclusion, based on odd or unusual customer IDs user_id, is harder to achieve in a few lines of code. To keep it simple and safe, we first create a new column called digit_count that records the number of digits in each user_id.

DF_fil$user_id <- as.integer(DF_fil$user_id)

DF_fil$user_id <- as.character(DF_fil$user_id)

DF_fil <- DF_fil |> mutate(digit_count = nchar(user_id))

digit_counts <- DF_fil |> count(digit_count) |> arrange(digit_count)

digit_counts
2512/416996
## [1] 0.006024039

The table digit_counts above shows that there are 2,513 observations with 4-digit user_id values, 161,738 with 5 digits, and 160,813 with 6 digits. The 4-digit IDs look suspicious: these customers may be real people rather than computer test data, but it is quite possible that they were created by the firm itself. Also, the 4-digit group is small, amounting to less than 1% of the data, so excluding it is harmless. For customers with 5-digit or 6-digit IDs, given the large populations there, we treat both groups as real customers.

DF_fil <- DF_fil |> filter(nchar(user_id) != 4)

DF_fil <- DF_fil[, 1:34]

dim(DF_fil)
## [1] 322551     34

After both exclusions of potential fake customers, the final N is 322,551, which still qualifies as part of the “large N, small T, many P” challenge.
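As a rough check on that framing (a sketch; the counts themselves are not reproduced here), we can look at N, T, and P directly:

# N: number of distinct customers
n_distinct(DF_fil$user_id)

# T: span of observed transaction dates
range(DF_fil$created_at, na.rm = TRUE)

# P: number of candidate covariates
ncol(DF_fil)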

Discarding Unnecessary Covariates

In each subsection, we present a table of the latent factors that we believe should be tossed out.

Removing unique identifiers

Before turning the data frame into a panel data frame, we notice that some unimportant variables can be discarded because they are unlikely to play any informative role. The definition of “unimportant” can vary, but a basic principle here is that we treat IDs (all IDs except user_id) as uninformative. They serve only as unique identifiers, which are unlikely to provide useful signal for a model. If included, they could lead to overfitting, because the model might learn patterns specific to individual IDs that do not generalize.

DF_fil[, c(3:8, 9, 15:18, 30)]
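As a cross-check on the positional selection above, a small sketch (assuming the identifier columns follow an _id or UID naming convention) lists the ID-like columns by name instead:

# ID-like columns by name pattern; user_id is kept as the panel index
id_like <- grep("(_id$|^UID$)", names(DF_fil), value = TRUE, ignore.case = TRUE)
setdiff(id_like, "user_id")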

Removing latent variables with too many missing values

If a variable has many missing values, it might not provide much information for the model. By checking the number of missing values in each column, we consider excluding those with a high proportion of missing values.

summary(is.na(DF_fil))
##   user_id        created_at      project_id      project.name   
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:322551    FALSE:322551    FALSE:322551    FALSE:180038   
##                                                  TRUE :142513   
##   Location       project_location_id    city           state        
##  Mode :logical   Mode :logical       Mode :logical   Mode :logical  
##  FALSE:180038    FALSE:180038        FALSE:180038    FALSE:180038   
##  TRUE :142513    TRUE :142513        TRUE :142513    TRUE :142513   
##  credit_transaction_id total_redemption_amount    tip         
##  Mode :logical         Mode :logical           Mode :logical  
##  FALSE:180038          FALSE:180038            FALSE:135596   
##  TRUE :142513          TRUE :142513            TRUE :186955   
##  Venue.Type...Detail Check.Average   Service.Type       UID         
##  Mode :logical       Mode :logical   Mode :logical   Mode :logical  
##  FALSE:180038        FALSE:180038    FALSE:180038    FALSE:142513   
##  TRUE :142513        TRUE :142513    TRUE :142513    TRUE :180038   
##     name           gcp_id          ict_id        amount_charged_in_usd
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical        
##  FALSE:142513    FALSE:104077    FALSE:38436     FALSE:142513         
##  TRUE :180038    TRUE :218474    TRUE :284115    TRUE :180038         
##  utm_campaign    utm_medium      utm_content     utm_source     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:72723     FALSE:72723     FALSE:71232     FALSE:72707    
##  TRUE :249828    TRUE :249828    TRUE :251319    TRUE :249844   
##  is_app_purchase transaction_type credit_type     credit_given_in_usd
##  Mode :logical   Mode :logical    Mode :logical   Mode :logical      
##  FALSE:142513    FALSE:142513     FALSE:142513    FALSE:142513       
##  TRUE :180038    TRUE :180038     TRUE :180038    TRUE :180038       
##  is_excess       stripe_brand    option_id       account_created_at
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical     
##  FALSE:39471     FALSE:136620    FALSE:141918    FALSE:322551      
##  TRUE :283080    TRUE :185931    TRUE :180633                      
##   zip_code       user_app_version user_app_platform
##  Mode :logical   Mode :logical    Mode :logical    
##  FALSE:6208      FALSE:308101     FALSE:308101     
##  TRUE :316343    TRUE :14450      TRUE :14450

The is.na() function creates a logical matrix in which TRUE represents a missing value (NA) and FALSE a non-missing value. For better readability, we build a new table to store this information, together with the percentage of missing values per column, to better decide whether each latent factor should be removed.

summary <- DF_fil |> summarise(across(everything(), 
                     list(Missing = ~sum(is.na(.)), 
                          NonMissing = ~sum(!is.na(.)), 
                          PercentageMissing = ~round(sum(is.na(.))/length(.), 2))))

summary <- summary |> pivot_longer(cols = everything(), 
                                   names_to = c("Column", ".value"), 
                                   names_pattern = "(.*)_(.*)")

summary

The summary table above displays each candidate variable’s missing values, non-missing values, and percentage of missing values. We visualize these percentages in the plot below.

ggplot(summary, 
       aes(x = reorder(Column, PercentageMissing), y = PercentageMissing)) + 
  geom_bar(stat = "identity", fill = "steelblue", col = "white") +
  geom_text(aes(label = paste0(PercentageMissing*100, "%")), hjust = -0.1) +
  geom_hline(aes(yintercept = 0.75), linetype="dashed", color = "red") +
  coord_flip() +
  labs(x = "Column", y = "Percentage Missing", 
       title = "Percentage of Missing Values For Each Variable")

A red dashed line at 75% marks an admittedly arbitrary personal cutoff: I intend to discard any covariate that is missing more than 75% of its entries, which is a fairly safe choice. Hence, we exclude the columns zip_code, is_excess, ict_id, utm_content, utm_source, utm_medium, and utm_campaign. Notice that several covariates share identical percentages; this is expected, since those variables come from the same source data frame prior to merging.
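The same 75% cutoff can also be applied programmatically against the summary table built above (a sketch; with the rounding used there, it should reproduce the list of columns just named):

cols_to_drop <- summary |> 
  filter(PercentageMissing > 0.75) |> 
  pull(Column)

cols_to_drop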

Some controversial exclusions

Location-related information could also be dropped. However, doing so this early can be controversial and risky. According to Koren, Bell, and Volinsky [1], a fourth, optional input source, known as user attributes, \(\sum_{a \in A(u)} y_a\), may be included in the MF model. This is recommended when facing a “cold start” problem, since we have a limited amount of information on customers in pre-treatment periods. Adding an extra input source such as user attributes lets us exploit this implicit information and helps guard against problems like overfitting. For simplicity, however, we exclude the demographic information in EDA VII; we may include it in future EDAs for better interpretability.
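For context, the prediction rule in [1] that incorporates this attribute term takes (up to notation) the form

\[
\hat{r}_{ui} = \mu + b_i + b_u + q_i^{\top}\left[\, p_u + |N(u)|^{-0.5} \sum_{j \in N(u)} x_j + \sum_{a \in A(u)} y_a \right],
\]

where \(\mu\) is the overall average rating, \(b_u\) and \(b_i\) are the user and item biases, \(p_u\) and \(q_i\) are the latent factor vectors, \(N(u)\) is the set of items for which user \(u\) expressed an implicit preference (with factor vectors \(x_j\)), and \(A(u)\) is the set of attributes of user \(u\) (with factor vectors \(y_a\)).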

DF_fil[, 31:34]

Now we can begin the exclusion process. The information above tells us which variables should be tossed out. The data frame after shrinkage is shown below.

DF_final <- DF_fil[, -c(3:8, 9, 15:18, 30, 31:34, 28, 20:23)]

DF_final

The final data frame, before transitioning to a panel data frame, has only 13 latent variables and more than 300K observations. In view of such a large N, our P is comparatively small, so I am not sure whether this data frame fits the “large N, small T, many P” framing well. Hence, the following transformation may be important in dealing with this situation.

Panel Data Transformation

Before transforming DF_final into a panel data frame, we need to be careful about the level of time granularity. A basic approach is to build three versions of the data frame at daily, weekly, and monthly granularity.

First, the daily granularity records every single observation: each customer’s purchase or redemption is captured on the day it occurred. This is the most detailed data frame.

Second, the weekly granularity is a balance between the daily and monthly granularity. However, the concept of a week is more complex, since several competing definitions exist and none is dominantly accepted. For simplicity, I define a week according to the ISO 8601 standard, under which a week starts on Monday and ends on Sunday.

Third, the monthly granularity is created to reveal seasonal patterns. This data frame is less detailed than the daily and weekly versions, but it may expose unexpected seasonal patterns.

We will perform the date parsing below and assign each granularity in a separate column.

start_date <- as.Date("2021-01-04")

DF_final$created_at <- as.Date(DF_final$created_at, format = "%Y-%m-%d")

DF_final$day <- DF_final$created_at

DF_final$week <- as.Date(cut(DF_final$created_at, "week"), format="%Y-%m-%d")

DF_final$week_index <- as.integer(format(DF_final$created_at, "%V")) # ISO 8601 week of the year

DF_final$week_index[DF_final$created_at < start_date] <- 0

DF_final$month <- as.Date(format(DF_final$created_at, "%Y-%m-01"), format="%Y-%m-%d")
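A quick spot check of the week labeling around the New Year boundary (a sketch; %V is the ISO 8601 week number documented in ?strptime, and output is not shown here):

# 2021-01-01 falls on a Friday, so ISO 8601 assigns it to week 53 of 2020
weekdays(as.Date("2021-01-01"))
format(as.Date("2021-01-01"), "%V")

# 2021-01-04 is the Monday that starts ISO week 01 of 2021
format(as.Date("2021-01-04"), "%V")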

DF_final[, c(1:2, 14:17)]

Notice that, according to the ISO 8601 standard, the first week of the year (week 01) is the week that contains January 4. This means that if January 1, 2, or 3 falls on a Monday, Tuesday, Wednesday, or Thursday, it is part of week 01; if it falls on a Friday, Saturday, or Sunday, it belongs to the last week of the previous year. The calendar shows that 2021-01-01 is a Friday, so it is part of the last week of 2020. Hence, the week of 2020-12-28 is labeled week 0, the week of 2021-01-04 is labeled week 1, the week of 2021-01-11 is labeled week 2, and so on. The table above shows the added time granularity for the panel data transformation. Since day and created_at are duplicates, we remove created_at. The following data frames are stored for the daily, weekly, and monthly granularity, respectively.

DF_final <- DF_final[, -2]

DF_daily <- DF_final[, -c(14:16)]

DF_weekly <- DF_final[, -c(13, 16)]

DF_monthly <- DF_final[, -c(13:15)]
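Since selecting by position is fragile if columns are later reordered, an equivalent name-based version (a sketch using the granularity columns created above; dplyr::select is written out in full because MASS also exports select) would be:

DF_daily   <- DF_final |> dplyr::select(-week, -week_index, -month)
DF_weekly  <- DF_final |> dplyr::select(-day, -month)
DF_monthly <- DF_final |> dplyr::select(-day, -week, -week_index)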

Panel Data Frame - Daily Granularity

The transformation into a panel data frame at daily granularity is straightforward: we can apply pdata.frame() directly.

PDF_daily <- pdata.frame(DF_daily, index = c("user_id", "day"))
## Warning in pdata.frame(DF_daily, index = c("user_id", "day")): duplicate couples (id-time) in resulting pdata.frame
##  to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
PDF_daily
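The warning flags customers with more than one transaction on the same day. A small sketch (not run here) to quantify how many (user_id, day) couples are duplicated before deciding how to aggregate them:

# Count (user_id, day) couples that occur more than once
dup_couples <- DF_daily |>
  count(user_id, day) |>
  filter(n > 1)

nrow(dup_couples)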

Panel Data Frame - Weekly Granularity

However, the transformation for weekly and monthly granularity is harder, since we need to group customers within a common week or month. The chunks below for weekly and monthly granularity are shown for display purposes only: the group_by and summarise calls keep running without reporting any errors. I waited at least 45 minutes for R to respond, but it kept loading and never returned. I will find another way to tackle this problem in EDA VIII; one candidate approach is sketched after the monthly chunk below.

DF_weekly <- DF_weekly |>
  group_by(user_id, week) |>
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
            across(!where(is.numeric), ~ names(which.max(table(.x)))))

PDF_weekly <- pdata.frame(DF_weekly, index = c("user_id", "week", "week_index"))

Panel Data Frame - Monthly Granularity

DF_monthly <- DF_monthly |>
  group_by(user_id, month) |>
  summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
            across(!where(is.numeric), ~ names(which.max(table(.x)))))

PDF_monthly <- pdata.frame(DF_monthly, index = c("user_id", "month"))
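One candidate workaround for EDA VIII (a sketch, not benchmarked here; data.table is already loaded) is to push the grouped aggregation down to data.table, which typically handles grouped summaries on data of this size much faster than dplyr. The helper mode_or_na() is a hypothetical name introduced for illustration, and the sketch assumes DF_weekly still holds the un-aggregated weekly frame built earlier.

# Most frequent non-missing value, or NA if a group is entirely missing (hypothetical helper)
mode_or_na <- function(x) {
  x <- x[!is.na(x)]
  if (length(x) == 0) return(NA_character_)
  names(which.max(table(x)))
}

DT <- as.data.table(DF_weekly)

num_cols <- names(DT)[sapply(DT, is.numeric)]
chr_cols <- setdiff(names(DT), c(num_cols, "user_id", "week"))

# Average the numeric columns and take the modal value of the remaining columns
agg_num <- DT[, lapply(.SD, mean, na.rm = TRUE), by = .(user_id, week), .SDcols = num_cols]
agg_chr <- DT[, lapply(.SD, mode_or_na), by = .(user_id, week), .SDcols = chr_cols]

DT_weekly <- merge(agg_num, agg_chr, by = c("user_id", "week"))

The result could then be passed to pdata.frame() in the same way as the daily data frame above; the same pattern, grouped by month, would cover the monthly granularity.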

References

[1] Y. Koren, R. Bell, and C. Volinsky, “Matrix Factorization Techniques for Recommender Systems,” Computer, vol. 42, no. 8, pp. 30–37, Aug. 2009, doi: https://doi.org/10.1109/mc.2009.263 [accessed Jul. 09, 2023].

[2] Google Play, “inKind - Apps on Google Play,” play.google.com, 2020. https://play.google.com/store/apps/details?id=com.inkind.inkind&hl=en_US&gl=US [accessed Jul. 07, 2023].

[3] Apple App Store, “inKind,” App Store, 2023. https://apps.apple.com/us/app/inkind/id1153771822 [accessed Jul. 07, 2023].