library("plm")
library("rjson")
library("DT")
library("data.table")
##
## Attaching package: 'data.table'
## The following object is masked from 'package:plm':
##
## between
library("tidyverse")
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.4
## ✔ tibble 3.2.1 ✔ dplyr 1.1.2
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::between() masks data.table::between(), plm::between()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks data.table::first()
## ✖ dplyr::lag() masks plm::lag(), stats::lag()
## ✖ dplyr::last() masks data.table::last()
## ✖ dplyr::lead() masks plm::lead()
## ✖ purrr::transpose() masks data.table::transpose()
library("usmap")
library("ggplot2")
library("maptools")
## Loading required package: sp
## Checking rgeos availability: FALSE
## Please note that 'maptools' will be retired during 2023,
## plan transition at your earliest convenience;
## some functionality will be moved to 'sp'.
## Note: when rgeos is not available, polygon geometry computations in maptools depend on gpclib,
## which has a restricted licence. It is disabled by default;
## to enable gpclib, type gpclibPermit()
library("mapview")
library("devtools")
## Loading required package: usethis
library("vcd")
## Loading required package: grid
library("gridExtra")
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library("shiny")
##
## Attaching package: 'shiny'
##
## The following objects are masked from 'package:DT':
##
## dataTableOutput, renderDataTable
library("MASS")
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library("faraway")
library("rgdal")
## Please note that rgdal will be retired during 2023,
## plan transition to sf/stars/terra functions using GDAL and PROJ
## at your earliest convenience.
## See https://r-spatial.org/r/2022/04/12/evolution.html and https://github.com/r-spatial/evolution
## rgdal: version: 1.6-6, (SVN revision 1201)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.4.2, released 2022/03/08
## Path to GDAL shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/gdal
## GDAL binary built with GEOS: FALSE
## Loaded PROJ runtime: Rel. 8.2.1, January 1st, 2022, [PJ_VERSION: 821]
## Path to PROJ shared files: /Library/Frameworks/R.framework/Versions/4.2/Resources/library/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.6-0
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading sp or rgdal.
library("tigris")
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library("sf")
## Linking to GEOS 3.10.2, GDAL 3.4.2, PROJ 8.2.1; sf_use_s2() is TRUE
library("car")
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following objects are masked from 'package:faraway':
##
## logit, vif
##
## The following object is masked from 'package:maptools':
##
## pointLabel
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library("GGally")
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
##
## The following object is masked from 'package:faraway':
##
## happy
library("patchwork")
##
## Attaching package: 'patchwork'
##
## The following object is masked from 'package:MASS':
##
## area
library("tibble")
library("corrplot")
## corrplot 0.92 loaded
library("Hmisc")
##
## Attaching package: 'Hmisc'
##
## The following object is masked from 'package:maptools':
##
## label
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
library("ggcorrplot")
library("lubridate")
##
## Attaching package: 'lubridate'
##
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("date")
Loading the Example ETL Data
example_ETL1 <- read.csv("/Users/apple/Desktop/380101014883.csv")
example_ETL2 <- read.csv("/Users/apple/Desktop/380100299547.csv")
example_ETL1
example_ETL2
Both example_ETL1 and example_ETL2 are
sample panel (longitudinal) data frames. Our goal in EDA VII is to
convert our previous data frames into this format for later modeling
purposes.
Importing Related Data Frames from EDA I, II, III, IV
redemptions_2021 <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/redemptions_2021.json")
redemptions_2022 <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/redemptions_2022.json")
redemptions_2023_Jan <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/redemptions_2023_Jan.json")
revenue_view_2021 <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/revenue_view_2021.json")
revenue_view_2022 <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/revenue_view_2022.json")
revenue_view_2023_Jan <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/revenue_view_2023_Jan.json")
redemptions_2021 <- lapply(redemptions_2021, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_redemptions_2021 <- as.data.frame(do.call("cbind", redemptions_2021)) |> t()
redemptions_2022 <- lapply(redemptions_2022, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_redemptions_2022 <- as.data.frame(do.call("cbind", redemptions_2022)) |> t()
redemptions_2023_Jan <- lapply(redemptions_2023_Jan, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_redemptions_2023_Jan <- as.data.frame(do.call("cbind", redemptions_2023_Jan)) |> t()
revenue_view_2021 <- lapply(revenue_view_2021, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_revenue_view_2021 <- as.data.frame(do.call("cbind", revenue_view_2021)) |> t()
revenue_view_2022 <- lapply(revenue_view_2022, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_revenue_view_2022 <- as.data.frame(do.call("cbind", revenue_view_2022)) |> t()
revenue_view_2023_Jan <- lapply(revenue_view_2023_Jan, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_revenue_view_2023_Jan <- as.data.frame(do.call("cbind", revenue_view_2023_Jan)) |> t()
df_redemptions <-
rbind(df_redemptions_2021, df_redemptions_2022, df_redemptions_2023_Jan)
df_revenue_view <-
rbind(df_revenue_view_2021, df_revenue_view_2022, df_revenue_view_2023_Jan)
df_redemptions <- data.frame(df_redemptions)
df_redemptions <- transform(df_redemptions,
total_redemption_amount = as.numeric(total_redemption_amount),
tip = as.numeric(tip),
Venue.Type...Detail = as.factor(Venue.Type...Detail),
Check.Average = as.factor(Check.Average),
Service.Type = as.factor(Service.Type))
df_revenue_view <- data.frame(df_revenue_view)
df_revenue_view <- transform(df_revenue_view,
amount_charged_in_usd = as.numeric(amount_charged_in_usd),
credit_given_in_usd = as.numeric(credit_given_in_usd),
is_app_purchase = as.factor(is_app_purchase),
transaction_type = as.factor(transaction_type),
credit_type = as.factor(credit_type),
is_excess = as.factor(is_excess),
stripe_brand = as.factor(stripe_brand))
brand_tags <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/brand_tags.json")
projects <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/projects.json")
users_thru_jan23 <-
fromJSON(file = "/Users/apple/Desktop/2023\ Feb\ Transfer/users_thru_jan23.json")
brand_tags <- lapply(brand_tags, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_brand_tags <- as.data.frame(do.call("cbind", brand_tags)) |> t()
projects <- lapply(projects, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_projects <- as.data.frame(do.call("cbind", projects)) |> t()
users_thru_jan23 <- lapply(users_thru_jan23, function(x) {
x[sapply(x, is.null)] <- NA
unlist(x)
})
df_users_thru_jan23 <- as.data.frame(do.call("cbind", users_thru_jan23)) |> t()
df_brand_tags <- data.frame(df_brand_tags)
df_brand_tags <- transform(df_brand_tags,
is_featured = as.numeric(is_featured),
longitude = as.numeric(longitude),
latitude = as.numeric(latitude))
df_projects <- data.frame(df_projects)
df_users_thru_jan23 <- data.frame(df_users_thru_jan23)
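As an aside, the null-to-NA / unlist / cbind / transpose pattern above is repeated for every JSON file. It could be wrapped in a small helper such as the sketch below (an optional refactor that simply mirrors the repeated pattern; it is not used in the rest of this document).
# Optional helper: convert an rjson list-of-records into a data frame,
# replacing NULL fields with NA (mirrors the repeated pattern above).
json_to_df <- function(json_list) {
  cleaned <- lapply(json_list, function(x) {
    x[sapply(x, is.null)] <- NA
    unlist(x)
  })
  as.data.frame(do.call("cbind", cleaned)) |> t() |> data.frame()
}
# e.g., df_brand_tags <- json_to_df(brand_tags)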
Data Wrangling
We have reloaded all the data frames from EDA I through IV.
Recall that our goal is to set up a long-form panel data frame. Potential
covariates will be picked from these five data frames based on our prior
judgement. Both df_brand_tags and df_projects lack the common column
user_id, so both data frames will be discarded in this EDA. Moreover,
neither table maps directly onto the major components of the matrix
factorization (MF) model described by Koren, Bell, and Volinsky [1]. We
will merge the remaining three data frames and then filter down to the
most important covariates in the panel data. Some redundant columns are
dropped right away to keep the size of the data frame manageable.
DF1 <- full_join(df_redemptions, df_revenue_view,
by = c("user_id", "created_at", "project_id"))
DF <- full_join(DF1, df_users_thru_jan23, by = "user_id")
DF <-
mutate(DF, created_at = as.Date(DF$created_at, tz = "UTC"),
account_created_at = as.Date(DF$account_created_at, tz = "UTC"))
DF
Notice that we have re-processed the time stamps
created_at and account_created_at into
standard YYYY-MM-DD format.
Excluding Confounding Customers
The variable account_created_at from
df_users_thru_jan23 is a great source for excluding
confounding customers. Confounding customers are those who
either (1) have weird or unusual customer IDs (user_id), such as IDs
with very few digits, or (2) registered extremely early compared to the
official release date of the application; account_created_at helps us
judge whether to toss them out.
We researched the official release date of the application online.
However, an exact timeline cannot be found explicitly. One tactic is to
look at the application’s earliest customer reviews on the Apple App
Store and Google Play. The earliest comment we can retrieve from Google
Play [2] is dated April 17, 2020, whereas the earliest comment we can
retrieve from the Apple App Store [3] is roughly six years old (from
2017). To ensure that we do not delete real customers, we pick a safe
cutoff date of 2016-12-31. Real customers should have created their
accounts no earlier than early 2017, and their transaction history
should roughly follow that date as well. Hence, two date cutoffs are
strictly enforced: customers who either created their accounts or
purchased/redeemed on the app before the end of 2016 are considered
zombie users and are tossed out.
DF_fil <-
filter(DF, (account_created_at >= as.Date("2016-12-31")) &
created_at >= as.Date("2016-12-31"))
DF_fil <- filter(DF_fil, !is.na(user_id))
dim(DF)
## [1] 416996 34
dim(DF_fil)
## [1] 325064 34
nrow(DF) - nrow(DF_fil)
## [1] 91932
The first exclusion, based on customers’ timing (plus dropping rows
with a missing user_id), shrinks the data frame by roughly 22%
(91,932 rows). The second exclusion, based on weird or unusual customer
IDs (user_id), is harder to achieve in a few lines of code. To keep it
simple and safe, we first create a new column called digit_count that
records the number of digits in each user_id.
DF_fil$user_id <- as.integer(DF_fil$user_id)
DF_fil$user_id <- as.character(DF_fil$user_id)
DF_fil <- DF_fil |> mutate(digit_count = nchar(user_id))
digit_counts <- DF_fil |> count(digit_count) |> arrange(digit_count)
digit_counts
2512/416996
## [1] 0.006024039
The digit_count table above shows 2,513 observations with a 4-digit
user_id, 161,738 with 5 digits, and 160,813 with 6 digits. The 4-digit
IDs look like weird customer IDs. These customers may be real people
(not computer testing data), but it is quite possible that they were
created by the firm itself. The 4-digit group is also quite small,
roughly 0.6% of the original data frame, so excluding it is not
harmful. For customers with 5 or 6 digits, given how large those groups
are, we treat both as real customers.
DF_fil <- DF_fil |> filter(nchar(user_id) != 4)
DF_fil <- DF_fil[, 1:34]
dim(DF_fil)
## [1] 322551 34
After both exclusions of potentially fake customers, the final N is 322,551, which still fits the “large N, small T, many P” setting.
Discarding Unnecessary Covariates
In each subsection, we will present a table of latent factors that we think should be tossed out.
Removing unique identifiers
Before turning the data frame into a panel data frame, we notice that
some unimportant variables can be discarded, since they are unlikely to
play any informative role here. The definition of an unimportant
variable can vary, but a basic principle is that we treat every ID
column other than user_id as unimportant. Those columns are merely
unique identifiers and are unlikely to provide useful information for a
model. If included, they could lead to overfitting, because the model
might learn patterns specific to individual IDs that do not
generalize.
DF_fil[, c(3:8, 9, 15:18, 30)]
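The columns above are selected by position. A more robust, name-based alternative is sketched below; it picks the ID-style columns (rather than the exact positional set above), and the dplyr:: prefix avoids the MASS::select masking noted when the packages were attached.
# Name-based sketch: ID-style columns other than user_id, plus UID.
# dplyr:: prefix is needed because MASS also exports select().
DF_fil |> dplyr::select((ends_with("_id") & !matches("^user_id$")) | matches("^UID$"))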
Removing latent variables with too many missing values
If a variable has many missing values, it might not provide much information for the model. By checking the number of missing values in each column, we consider excluding those with a high proportion of missing values.
summary(is.na(DF_fil))
## user_id created_at project_id project.name
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:322551 FALSE:322551 FALSE:322551 FALSE:180038
## TRUE :142513
## Location project_location_id city state
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:180038 FALSE:180038 FALSE:180038 FALSE:180038
## TRUE :142513 TRUE :142513 TRUE :142513 TRUE :142513
## credit_transaction_id total_redemption_amount tip
## Mode :logical Mode :logical Mode :logical
## FALSE:180038 FALSE:180038 FALSE:135596
## TRUE :142513 TRUE :142513 TRUE :186955
## Venue.Type...Detail Check.Average Service.Type UID
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:180038 FALSE:180038 FALSE:180038 FALSE:142513
## TRUE :142513 TRUE :142513 TRUE :142513 TRUE :180038
## name gcp_id ict_id amount_charged_in_usd
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:142513 FALSE:104077 FALSE:38436 FALSE:142513
## TRUE :180038 TRUE :218474 TRUE :284115 TRUE :180038
## utm_campaign utm_medium utm_content utm_source
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:72723 FALSE:72723 FALSE:71232 FALSE:72707
## TRUE :249828 TRUE :249828 TRUE :251319 TRUE :249844
## is_app_purchase transaction_type credit_type credit_given_in_usd
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:142513 FALSE:142513 FALSE:142513 FALSE:142513
## TRUE :180038 TRUE :180038 TRUE :180038 TRUE :180038
## is_excess stripe_brand option_id account_created_at
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:39471 FALSE:136620 FALSE:141918 FALSE:322551
## TRUE :283080 TRUE :185931 TRUE :180633
## zip_code user_app_version user_app_platform
## Mode :logical Mode :logical Mode :logical
## FALSE:6208 FALSE:308101 FALSE:308101
## TRUE :316343 TRUE :14450 TRUE :14450
The is.na() function creates a logical matrix in which
TRUE marks a missing value (NA) and FALSE marks a non-missing value.
For better readability, we build a new table to store this
information, including a percentage calculation to better decide
whether a given latent factor should be removed.
summary <- DF_fil |> summarise(across(everything(),
list(Missing = ~sum(is.na(.)),
NonMissing = ~sum(!is.na(.)),
PercentageMissing = ~round(sum(is.na(.))/length(.), 2))))
summary <- summary |> pivot_longer(cols = everything(),
names_to = c("Column", ".value"),
names_pattern = "(.*)_(.*)")
summary
The summary table above displays, for each potential
variable, the number of missing values, the number of non-missing
values, and the percentage missing. We visualize these percentages in
the plot below.
ggplot(summary,
aes(x = reorder(Column, PercentageMissing), y = PercentageMissing)) +
geom_bar(stat = "identity", fill = "steelblue", col = "white") +
geom_text(aes(label = paste0(PercentageMissing*100, "%")), hjust = -0.1) +
geom_hline(aes(yintercept = 0.75), linetype="dashed", color = "red") +
coord_flip() +
labs(x = "Column", y = "Percentage Missing",
title = "Percentage of Missing Values For Each Variable")
A red dashed line at 75% marks my personal, admittedly arbitrary,
cutoff: I intend to discard any covariate that is missing more than 75%
of its entries, which is a rather safe choice. Hence, we will exclude
the columns zip_code, is_excess, ict_id, utm_content, utm_source,
utm_medium, and utm_campaign. Notice that several covariates share
identical missing percentages. This is expected, since those variables
come from the same source data frame before the merge.
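Rather than listing these columns by hand, the same 75% rule can be applied programmatically to the summary table built above (a small sketch; it should return the same seven columns).
# Columns whose missing share exceeds the 75% cutoff, from the summary table above
cols_to_drop <- summary |> filter(PercentageMissing > 0.75) |> pull(Column)
cols_to_drop
# They could then be removed by name rather than by position:
# DF_fil |> dplyr::select(-all_of(cols_to_drop))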
Some cautionary notes
Location-related information may also be dropped. However, doing so this early can be controversial and even dangerous. According to Koren, Bell, and Volinsky [1], an optional additional input source, the user-attribute term \(\sum_{a \in A(u)} y_a\), may be included in the MF model. This is suggested when facing a “cold start” problem, since we have a limited amount of information on customers in pre-treatment periods. By adding an extra input source such as user attributes, we can exploit this implicit information and mitigate problems like overfitting. For simplicity, however, we exclude the demographic information in EDA VII; we may include it in future EDAs for better interpretability.
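For reference, the extended prediction rule in [1] that carries this user-attribute term is, as I read the paper (notation as in [1]):
\[
\hat{r}_{ui} = \mu + b_i + b_u + q_i^{\top}\left( p_u + |N(u)|^{-\frac{1}{2}} \sum_{i \in N(u)} x_i + \sum_{a \in A(u)} y_a \right),
\]
where \(N(u)\) denotes the items for which user \(u\) expressed an implicit preference and \(A(u)\) the set of user \(u\)’s attributes.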
DF_fil[, 31:34]
Now we can begin the exclusion process. The information above tells us which variables should be tossed out. The data frame after shrinkage is shown below.
DF_final <- DF_fil[, -c(3:8, 9, 15:18, 30, 31:34, 28, 20:23)]
DF_final
The final data frame, before transitioning to a panel data frame, has only 13 latent variables and more than 300K observations. In view of such a large N, our P is comparatively small, so I am not sure this data frame fits the “large N, small T, many P” setting well. Hence, the following transformation may be important in dealing with this situation.
Panel Data Transformation
Before transforming DF_final into a panel data frame,
we need to be careful about the level of time granularity. A basic
approach is to build three versions of the data frame: daily, weekly,
and monthly granularity.
First, the daily granularity records every single observation: each customer’s purchase or redemption is recorded on the day it occurs. This is the most detailed data frame.
Second, the weekly granularity is a balance between the daily and monthly granularity. However, weekly data are trickier to define, since several competing conventions exist and none is universally accepted. For simplicity, I define a week according to the ISO 8601 standard, in which a week starts on Monday and ends on Sunday.
Third, the monthly granularity is created to reveal seasonal patterns. It is less detailed than the daily and weekly granularity, but it may expose unexpected seasonal patterns.
We will perform the date parsing below and assign each granularity in a separate column.
start_date <- as.Date("2021-01-04")  # Monday of ISO week 1, 2021
DF_final$created_at <- as.Date(DF_final$created_at, format = "%Y-%m-%d")
DF_final$day <- DF_final$created_at
# Monday of the week containing each date (base R's cut() starts weeks on Monday)
DF_final$week <- as.Date(cut(DF_final$created_at, "week"), format = "%Y-%m-%d")
# "%V" gives the ISO 8601 week number ("%U" would count Sunday-started weeks instead;
# "%V" is platform-dependent on some older systems)
DF_final$week_index <- as.integer(format(DF_final$created_at, "%V"))
DF_final$week_index[DF_final$created_at < start_date] <- 0
# First day of the month containing each date
DF_final$month <- as.Date(format(DF_final$created_at, "%Y-%m-01"), format = "%Y-%m-%d")
DF_final[, c(1:2, 14:17)]
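As a small illustration of what the new columns contain, consider an arbitrary example date (a sketch, not part of the pipeline above):
d <- as.Date("2021-03-10")       # a Wednesday, chosen arbitrarily
as.Date(cut(d, "week"))          # 2021-03-08: the Monday starting that week
as.integer(format(d, "%V"))      # 10: the ISO 8601 week number
as.Date(format(d, "%Y-%m-01"))   # 2021-03-01: the first day of that month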
Notice that, according to the ISO 8601 standard, the first week of the
year (week 01) is the week that contains January 4. Equivalently,
January 1, 2, or 3 belong to week 01 only when they fall in the same
Monday-to-Sunday week as January 4; in particular, if January 1 falls on
a Friday, Saturday, or Sunday, it belongs to the last week of the
previous year. The calendar shows that 2021-01-01 is a Friday, so it
is part of the last week of 2020. Hence, the week of
2020-12-28 is labeled week 0, the week of
2021-01-04 is labeled week 1, the week of
2021-01-11 is labeled week 2, and so on. The table above
shows the added time granularity columns for the panel data transformation.
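As a quick sanity check of this boundary rule, lubridate (loaded earlier) gives the same labels:
lubridate::isoweek(as.Date("2021-01-01"))   # 53: last ISO week of 2020
lubridate::isoweek(as.Date("2021-01-04"))   # 1: first ISO week of 2021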
Since day and created_at are duplicates, we
remove created_at. The following data frames are
stored for daily, weekly, and monthly granularity, respectively.
DF_final <- DF_final[, -2]
DF_daily <- DF_final[, -c(14:16)]
DF_weekly <- DF_final[, -c(13, 16)]
DF_monthly <- DF_final[, -c(13:15)]
Panel Data Frame - Daily Granularity
The transformation into a panel data frame at daily granularity is
easy: we can apply pdata.frame() directly.
PDF_daily <- pdata.frame(DF_daily, index = c("user_id", "day"))
## Warning in pdata.frame(DF_daily, index = c("user_id", "day")): duplicate couples (id-time) in resulting pdata.frame
## to find out which, use, e.g., table(index(your_pdataframe), useNA = "ifany")
PDF_daily
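The warning indicates that some customer-day pairs occur more than once, which is expected since a customer can transact several times in one day. One quick way to inspect those duplicates (a small optional check) is:
# Customer-day pairs appearing more than once in the daily data
DF_daily |> count(user_id, day) |> filter(n > 1) |> arrange(desc(n))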
Panel Data Frame - Weekly Granularity
However, the transformation for weekly and monthly granularity is harder,
since we need to group each customer’s records within a common week or
month. The chunks below for the weekly and monthly granularity are shown
for display purposes only: the group_by and summarise
calls keep running without reporting any errors. I have waited at least
45 minutes for R to respond, and it still keeps loading and fails to
return anything. I will find another way to tackle this problem in EDA
VIII; one possible direction is sketched after the weekly chunk below.
DF_weekly <- DF_weekly |>
group_by(user_id, week) |>
summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
across(!where(is.numeric), ~ names(which.max(table(.x)))),
.groups = "drop")
PDF_weekly <- pdata.frame(DF_weekly, index = c("user_id", "week", "week_index"))
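One possible way forward, sketched here but not run, is to perform the same aggregation with data.table (already loaded above), whose grouped operations are typically much faster at this scale. The helper for the non-numeric columns also guards against groups whose values are all missing, which would otherwise break names(which.max(table(x))).
# A data.table sketch of the same weekly aggregation (not run here)
safe_mode <- function(x) {            # most frequent value; NA if the group is all-missing
  tb <- table(x)
  if (length(tb) == 0) NA_character_ else names(which.max(tb))
}
dt_weekly <- as.data.table(DF_weekly)
num_cols  <- names(dt_weekly)[sapply(dt_weekly, is.numeric)]
oth_cols  <- setdiff(names(dt_weekly), c(num_cols, "user_id", "week"))
agg_num   <- dt_weekly[, lapply(.SD, mean, na.rm = TRUE),
                       by = .(user_id, week), .SDcols = num_cols]
agg_oth   <- dt_weekly[, lapply(.SD, safe_mode),
                       by = .(user_id, week), .SDcols = oth_cols]
DT_weekly <- merge(agg_num, agg_oth, by = c("user_id", "week"))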
Panel Data Frame - Monthly Granularity
DF_monthly <- DF_monthly |>
group_by(user_id, month) |>
summarise(across(where(is.numeric), ~ mean(.x, na.rm = TRUE)),
across(!where(is.numeric), ~ names(which.max(table(.x)))),
.groups = "drop")
PDF_monthly <- pdata.frame(DF_monthly, index = c("user_id", "month"))
References
[1] Y. Koren, R. Bell, and C. Volinsky, “Matrix Factorization Techniques for Recommender Systems,” Computer, vol. 42, no. 8, pp. 30–37, Aug. 2009, doi: https://doi.org/10.1109/mc.2009.263 [accessed Jul. 09, 2023].
[2] Google Play, “inKind - Apps on Google Play,” play.google.com, 2020. https://play.google.com/store/apps/details?id=com.inkind.inkind&hl=en_US&gl=US [accessed Jul. 07, 2023].
[3] Apple App Store, “inKind,” App Store, 2023. https://apps.apple.com/us/app/inkind/id1153771822 [accessed Jul. 07, 2023].