Code from previous EDAs

library(rjson)  # fromJSON(file = ...) is the rjson interface

redemptions_2021 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/redemptions_2021.json")

redemptions_2022 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/redemptions_2022.json")

redemptions_2023_Jan <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/redemptions_2023_Jan.json")

revenue_view_2021 <-
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/revenue_view_2021.json")

revenue_view_2022 <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/revenue_view_2022.json")

revenue_view_2023_Jan <- 
  fromJSON(file = "/Users/apple/Desktop/2023 Feb Transfer/revenue_view_2023_Jan.json")

# Each parsed JSON record is a list that may contain NULLs. Replace the NULLs
# with NA, flatten each record with unlist(), and bind the records into a
# matrix with one row per record (transposed from the cbind result).
flatten_records <- function(records) {
  records <- lapply(records, function(x) {
    x[sapply(x, is.null)] <- NA
    unlist(x)
  })
  as.data.frame(do.call("cbind", records)) |> t()
}

df_redemptions_2021     <- flatten_records(redemptions_2021)
df_redemptions_2022     <- flatten_records(redemptions_2022)
df_redemptions_2023_Jan <- flatten_records(redemptions_2023_Jan)

df_revenue_view_2021     <- flatten_records(revenue_view_2021)
df_revenue_view_2022     <- flatten_records(revenue_view_2022)
df_revenue_view_2023_Jan <- flatten_records(revenue_view_2023_Jan)
df_redemptions <- 
  rbind(df_redemptions_2021, df_redemptions_2022, df_redemptions_2023_Jan)

df_revenue_view <- 
  rbind(df_revenue_view_2021, df_revenue_view_2022, df_revenue_view_2023_Jan)

df_redemptions <- data.frame(df_redemptions)

df_redemptions <- transform(df_redemptions, 
                  total_redemption_amount = as.numeric(total_redemption_amount),
                  tip = as.numeric(tip),
                  Venue.Type...Detail = as.factor(Venue.Type...Detail), 
                  Check.Average = as.factor(Check.Average),
                  Service.Type = as.factor(Service.Type))

df_revenue_view <- data.frame(df_revenue_view)

df_revenue_view <- transform(df_revenue_view, 
                  amount_charged_in_usd = as.numeric(amount_charged_in_usd),
                  credit_given_in_usd = as.numeric(credit_given_in_usd), 
                  is_app_purchase = as.factor(is_app_purchase),
                  transaction_type = as.factor(transaction_type),
                  credit_type = as.factor(credit_type),
                  is_excess = as.factor(is_excess),
                  stripe_brand = as.factor(stripe_brand))

New Work Starts Here - EDA on Redemptions

How many redemptions do customers take? How quickly and frequently do customers redeem? In addition to the population average values, could you also plot these by quantiles, or if it’s visually sensical, continuously?

library("dplyr", warn.conflicts = FALSE)

unique(df_redemptions$user_id) |> length()
## [1] 73089
nrow(df_redemptions)
## [1] 183687
df_redemptions_by_user <- group_by(df_redemptions, user_id)

summary(df_redemptions_by_user)
##    user_id           created_at         project_id        project.name      
##  Length:183687      Length:183687      Length:183687      Length:183687     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Location         project_location_id     city              state          
##  Length:183687      Length:183687       Length:183687      Length:183687     
##  Class :character   Class :character    Class :character   Class :character  
##  Mode  :character   Mode  :character    Mode  :character   Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##                                                                              
##  credit_transaction_id total_redemption_amount      tip          
##  Length:183687         Min.   :    0.00        Min.   :    0.00  
##  Class :character      1st Qu.:   26.76        1st Qu.:    6.00  
##  Mode  :character      Median :   62.00        Median :   14.01  
##                        Mean   :  118.56        Mean   :   25.54  
##                        3rd Qu.:  129.57        3rd Qu.:   28.46  
##                        Max.   :31250.00        Max.   :11002.00  
##                                                NA's   :45316     
##          Venue.Type...Detail   Check.Average          Service.Type   
##  Casual Dining     :58965    High     :43150   Fast Casual  : 18836  
##  Casual Fine Dining:57593    Low      :13523   Full Service :131572  
##  Fine Dining       :19526    Mid      :95303   Ghost Kitchen:   650  
##  Coffee Shop       :14688    Very High:15501   QSR          : 18811  
##  Unknown           :13620    Very Low :16210   Unknown      : 13818  
##  Fast Casual       :12943                                            
##  (Other)           : 6352

A preliminary look at all customers shows a total of 183,687 redemptions made by 73,089 customers in our redemptions dataset; the average total redemption amount is approximately $118.56. The average tip, after excluding 45,316 NA entries, is approximately $25.54.
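
The prompt also asks how many redemptions customers take. A minimal sketch of the per-customer view, counting rows of df_redemptions per user_id and summarizing the quantiles of those counts:

# Redemptions per customer, then the distribution of those counts
redemptions_per_user <- df_redemptions |> count(user_id, name = "n_redemptions")

mean(redemptions_per_user$n_redemptions)
quantile(redemptions_per_user$n_redemptions, probs = c(0.25, 0.50, 0.75, 0.90, 0.99))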

qqnorm(df_redemptions_by_user$total_redemption_amount, pch = 1, frame = FALSE)
qqline(df_redemptions_by_user$total_redemption_amount, col = "steelblue", lwd = 2)

# car::qqPlot() draws the same plot with a confidence envelope and
# reports the indices of the most extreme observations
qqPlot(df_redemptions_by_user$total_redemption_amount)

## [1] 146676 151277

Unfortunately, the quantile-quantile (QQ) plot shows that total redemption amount does not follow the theoretical normal distribution: there is a very heavy right tail, curving far above the QQ-line.

ggplot(df_redemptions_by_user, 
       aes(sample = total_redemption_amount, color = Service.Type)) +
  stat_qq() +
  stat_qq_line()

ggplot(df_redemptions_by_user, 
       aes(sample = total_redemption_amount, color = Check.Average)) +
  stat_qq() +
  stat_qq_line()

ggplot(df_redemptions_by_user, 
       aes(sample = total_redemption_amount, color = Venue.Type...Detail)) +
  stat_qq() +
  stat_qq_line()

However, a closer look at these grouped QQ-plots shows that some categories do follow the theoretical QQ-line while others deviate from it sharply. A naive reading is that full-service restaurants, and high redemption amounts in general, exceed the normal QQ-line by a wide margin, whereas fast-food venues and low redemption amounts come close to satisfying the normality assumption. This is an intriguing finding that may help us decide which distributions to use later.
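
Given the heavy right tail, one quick hedged check is whether a log transform brings the amounts closer to normality, i.e., whether a log-normal assumption is more plausible; log1p() is used here because the minimum amount is 0:

# QQ-plot of log-transformed redemption amounts
qqnorm(log1p(df_redemptions_by_user$total_redemption_amount), pch = 1, frame = FALSE)
qqline(log1p(df_redemptions_by_user$total_redemption_amount), col = "steelblue", lwd = 2)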

OK, back to the prompt. Let us extract all redemption amounts from the data frame and plot their quantiles, from a coarse to a nearly continuous grid.

redem <- hist(df_redemptions_by_user$total_redemption_amount, breaks = 10, 
              col = 'skyblue', ylim = c(0, 200000),
              xlab = "Amount of total redemption ($)", 
              main = "Histogram of amount of total redemption")
text(redem$mids, redem$counts, labels = redem$counts, adj = c(0.5, -0.5))

redem <- hist(df_redemptions_by_user$total_redemption_amount, breaks = 20, 
              col = 'skyblue', ylim = c(0, 200000),
              xlab = "Amount of total redemption ($)", 
              main = "Histogram of amount of total redemption")
text(redem$mids, redem$counts, labels = redem$counts, adj = c(0.5, -0.5))

The histogram is not very informative, since almost all of the data are piled below $2,500 in total redemption amount. A log-scaled histogram (sketched next) is one quick alternative view, and we can also visualize the quantiles by computing each one explicitly and plotting them discretely. After that, a boxplot gives a sense of the distribution.
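
A minimal log-scale sketch; zero amounts are dropped first, since a log axis cannot display them:

ggplot(filter(df_redemptions_by_user, total_redemption_amount > 0),
       aes(x = total_redemption_amount)) +
  geom_histogram(bins = 50, fill = "skyblue", color = "grey60") +
  scale_x_log10() +
  xlab("Amount of total redemption ($, log scale)")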

ggplot(df_redemptions_by_user) + 
  geom_boxplot(aes(x = total_redemption_amount)) + 
  xlab("Amount of total redemption ($)")

The boxplot above is also what we expected, given the heavy right tail.

q100 <- quantile(df_redemptions_by_user$total_redemption_amount, 
               probs = seq(0, 1, 1/100))

q2000 <- quantile(df_redemptions_by_user$total_redemption_amount, 
               probs = seq(0, 1, 1/2000))

df100 <- data.frame(q100)
df100 <- rename_at(df100, "q100", ~ "total redemption amount ($)")
df100
tail(df100)
df2000 <- data.frame(q2000)
df2000 <- rename_at(df2000, "q2000", ~ "total redemption amount ($)")
tail(df2000, n = 20)
plot(q100, col = "orange", xlab = "Percentile index (100-point grid)", 
     ylab = "Total redemption amount ($)")

plot(q2000, col = "pink1", xlab = "Quantile index (2,000-point grid)", 
     ylab = "Total redemption amount ($)")

We numerically compute the quantiles on a 100-point grid and, for more detail, on a 2,000-point grid. Both tables and plots tell the same story: the curve stays comparatively flat until roughly the 99th percentile and then shoots up toward the maximum of $31,250. We need to be careful with these extremely high redemption amounts; although only a few are present in the dataset, they are enough to prevent us from building a theoretical normality assumption.
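
To probe the tail without scanning the full tables, a minimal sketch that asks directly where the top percentiles begin:

# Where do the top 1%, 0.1%, and 0.01% of redemption amounts begin?
quantile(df_redemptions_by_user$total_redemption_amount,
         probs = c(0.50, 0.90, 0.99, 0.999, 0.9999))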

Preparation for CLV Computation

For both the redemption and purchase data, for each customer, please perform the following summary statistics calculations: (1) average value per transaction in dollar amounts, (2) the recency of the most recent transaction, relative to the end date of the dataset, and (3) the average weekly frequency of transactions, from the date of a customer’s own first transaction to the end date of the dataset.

Merging Tabular Data

df_revenue_view_by_user <- group_by(df_revenue_view, user_id)

df_rev_redem <- full_join(df_revenue_view_by_user, df_redemptions_by_user, 
                          by = "user_id")
## Warning in full_join(df_revenue_view_by_user, df_redemptions_by_user, by = "user_id"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 2 of `x` matches multiple rows in `y`.
## ℹ Row 7 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
dim(df_rev_redem)
## [1] 698708     32
dim(df_revenue_view_by_user)
## [1] 143806     19
dim(df_redemptions_by_user)
## [1] 183687     14

Notice that after merging, we have 32 columns (19 + 14 - 1, counting the shared key user_id only once). The row count, however, is not simply the sum of the two data frames' nrow(): because the join is many-to-many, each of a user's transactions is paired with each of that user's redemptions, so the result is far larger than the sum. Some rows also contain NAs (users present in only one table), so we should pay attention.
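
A toy example (a hypothetical user with two purchases and three redemptions) makes the expansion concrete: a many-to-many full_join() pairs every x-row with every matching y-row.

# Hypothetical toy tables: one user with 2 purchases and 3 redemptions
x <- data.frame(user_id = c(1, 1), purchase = c("a", "b"))
y <- data.frame(user_id = c(1, 1, 1), redemption = c("p", "q", "r"))
full_join(x, y, by = "user_id", relationship = "many-to-many") |> nrow()
# 6 rows: 2 x 3 pairs, not 2 + 3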

For each customer, compute the average value per transaction / redemption in dollar amounts

unique(df_rev_redem$user_id) |> length()
## [1] 106635

We have a total of 106,635 customers, though not all of them are present in both tables. Even though df_rev_redem is already grouped by user_id, we group it once more and apply an aggregating function to find each customer's mean transaction and redemption amounts.

df_rev_redem_avg <- df_rev_redem |> group_by(user_id) |> 
  summarize_at(vars(amount_charged_in_usd, total_redemption_amount), list(avg = mean))

df_rev_redem_avg
cor(df_rev_redem_avg$amount_charged_in_usd_avg, 
    df_rev_redem_avg$total_redemption_amount_avg, 
    use="pairwise.complete.obs")
## [1] 0.4408879
ggpairs(df_rev_redem_avg[, 2:3])
## Warning: Removed 11217 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 44763 rows containing missing values
## Warning: Removed 44763 rows containing missing values (`geom_point()`).
## Warning: Removed 33546 rows containing non-finite values (`stat_density()`).

We have formed the table of per-customer average transaction and redemption amounts. We are interested in their correlation after excluding all NA entries: the correlation between the two means is approximately 0.44, neither very high nor very low. In the pairs plot, the density plots on the diagonal are almost identical in shape, the lower-left panel is the scatter-plot between the two averages, and the upper-right panel displays their Pearson correlation coefficient.

For each customer, find the recency of the most recent transaction, relative to the end date of the dataset

lastD1 <- max(as.Date(df_rev_redem$created_at.x), na.rm = TRUE)
lastD2 <- max(as.Date(df_rev_redem$created_at.y), na.rm = TRUE)
lastD <- max(lastD1, lastD2)
lastD
## [1] "2023-01-31"

The end date of data recording is 2023-01-31. Now we compute each customer's most recent transaction and redemption dates; the approach is similar to the above.

df_rev_redem_recency <- df_rev_redem |> group_by(user_id) |> 
  summarize_at(vars(created_at.x, created_at.y), list(recency = max))

df_rev_redem_recency <- rename_at(df_rev_redem_recency, 
                        2, ~ "transaction_recency")

df_rev_redem_recency <- rename_at(df_rev_redem_recency, 
                        3, ~ "redemption_recency")

df_rev_redem_recency <- mutate(df_rev_redem_recency, 
transaction_recency_diff = lastD - as.Date(df_rev_redem_recency$transaction_recency), 
redemption_recency_diff = lastD - as.Date(df_rev_redem_recency$redemption_recency))

df_rev_redem_recency <- transform(df_rev_redem_recency, 
              transaction_recency_diff = as.numeric(transaction_recency_diff),
              redemption_recency_diff = as.numeric(redemption_recency_diff))

df_rev_redem_recency
df_rev_redem_recency |> arrange(desc(as.numeric(transaction_recency_diff)))
filter(df_rev_redem_recency, transaction_recency_diff == 760) |> nrow()
## [1] 39
df_rev_redem_recency |> arrange(as.numeric(transaction_recency_diff))
filter(df_rev_redem_recency, transaction_recency_diff == 0) |> nrow()
## [1] 855
df_rev_redem_recency |> arrange(desc(as.numeric(redemption_recency_diff)))
filter(df_rev_redem_recency, redemption_recency_diff == 760) |> nrow()
## [1] 22
df_rev_redem_recency |> arrange(as.numeric(redemption_recency_diff))
filter(df_rev_redem_recency, redemption_recency_diff == 0) |> nrow()
## [1] 446
as.Date("2023-01-31") - 760
## [1] "2021-01-01"

From the tables above, we find that recency in both the revenue and the redemption data has a maximum of 760 days (about 2.08 years before the last date in the dataset) and a minimum of 0 days. In other words, on Jan 1, 2021, which is exactly 760 days earlier and the first date of record, 39 customers made their last purchase through our application and 22 customers made their last redemption. On Jan 31, 2023, the last recorded date, 855 customers made a transaction and 446 customers made a redemption on that very day. It would be interesting to study the distribution of recency for both transactions and redemptions, since purchasing and redeeming behavior is potentially a significant factor in the CLV model.

ggpairs(df_rev_redem_recency[, 4:5])
## Warning: Removed 11217 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 44763 rows containing missing values
## Warning: Removed 44763 rows containing missing values (`geom_point()`).
## Warning: Removed 33546 rows containing non-finite values (`stat_density()`).

Again, on the diagonal we observe the frequency plots (shown as densities). The shapes for transactions and redemptions are very similar, each with two clear peaks: transactions peak globally at around 50 days with a smaller peak near 250 days, and redemptions peak globally at around 40 days with another visible peak near 250 days. The peak-trough pattern over time is also strikingly similar, and the upper-right panel shows a high Pearson correlation of about 0.882. The scatter-plot is hard to read because of overplotting, but the strong association is clear.

This raises a question: why do we observe such a pattern over time? Was there a particular event around the peak dates that led customers to stop purchasing and redeeming, or that made them more inclined to use our platform? The dataset itself cannot tell us; we would want the firm to share its record of historical events and check whether they line up with our peaks and troughs. Such knowledge could be deliberately exploited to increase CLV. Although a retention rate is not defined here, these plots offer a quasi-retention view: we cannot tell whether a customer has stopped for good, but if we find predictive variables that accurately judge whether a customer has terminated their use of the platform or will resume purchasing or redeeming later, we have in effect estimated a retention rate, an indispensable ingredient of a CLV model in a subscription setting.
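
As a first step toward matching peaks to calendar events, one hedged sketch is to bucket each customer's most recent transaction by month and count:

# Customers by the month of their most recent transaction
# (transaction_recency holds the character date computed above)
df_rev_redem_recency |>
  mutate(last_trans_month = format(as.Date(transaction_recency), "%Y-%m")) |>
  count(last_trans_month) |>
  arrange(desc(n))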

trans_p <- ggplot(df_rev_redem_recency, 
                  aes(x = transaction_recency_diff, y = after_stat(density))) + 
  geom_histogram(color = "grey60", fill = "#FFCC99") + 
  geom_density(color = "red2") + 
  xlab("Recency diff btw customer's last day of transaction and Jan 31, 2023")

redem_p <- ggplot(df_rev_redem_recency, 
                  aes(x = redemption_recency_diff, y = after_stat(density))) + 
  geom_histogram(color = "grey", fill = "steelblue") + 
  geom_density(color = "purple3") + 
  xlab("Recency diff btw customer's last day of redemption and Jan 31, 2023")

trans_p + redem_p
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 11217 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 11217 rows containing non-finite values (`stat_density()`).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 33546 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 33546 rows containing non-finite values (`stat_density()`).

The histograms and density plots are displayed side by side, and the pattern is nearly identical: a large group of customers last purchased or redeemed about 50 days before the end of the data. This is a worrying signal, since we would expect the firm to attract more and more customers purchasing and redeeming through our platform. Why is there such a sharp spike about 50 days before Jan 31, 2023 (around Dec 12, 2022)? Did usage really fall over the Christmas and New Year holidays, and why is the peak so short-lived rather than a long, prosperous plateau? All of these questions are worth investigating.
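
To locate the dip more precisely than the default 30 bins allow, a day-resolution zoom over the most recent 100 days is a natural follow-up sketch:

# Day-by-day histogram of transaction recency within the last 100 days
ggplot(filter(df_rev_redem_recency, transaction_recency_diff <= 100),
       aes(x = transaction_recency_diff)) +
  geom_histogram(binwidth = 1, fill = "#FFCC99", color = "grey60") +
  xlab("Days since last transaction (0 to 100)")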

For each customer, compute the average weekly frequency of transaction and redemption from the date of a customer’s own first transaction to the end date of the dataset

Well, this question is similar to the first task, each customer's overall average, except that the prompt now asks for each customer's weekly average. The task sounds hard at first because every customer has a different lifetime of platform usage (my proposed definition: \(UL := \Delta T = T_{\text{end}} - T_{\text{start}}\), where \(\Delta T\) is the number of days between a customer's first and last recorded activity, and \(UL\) abbreviates usage lifetime). However, borrowing the idea from our recency table, we can compute \(UL\) for each customer as a new column, after which the average weekly transaction and redemption frequencies follow easily.
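
A worked example with invented numbers: suppose a customer's first transaction is on 2022-01-01 and the last on 2022-06-15, with 10 transactions in between. Then \(UL = 165\) days \(\approx 23.57\) weeks, and the average weekly frequency is \(10 / 23.57 \approx 0.42\) transactions per week. In R:

# Hypothetical customer: usage lifetime in weeks, then weekly frequency
ul_days  <- as.numeric(as.Date("2022-06-15") - as.Date("2022-01-01"))  # 165 days
ul_weeks <- ul_days / 7                                                # ~23.57 weeks
10 / ul_weeks                                                          # ~0.42 transactions per week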

df_rev_redem_UL <- df_rev_redem |> group_by(user_id) |> 
  summarize_at(vars(created_at.x, created_at.y), 
               list(end = max, start = min))

df_rev_redem_UL <- rename_at(df_rev_redem_UL, 
                        2, ~ "transaction_end")

df_rev_redem_UL <- rename_at(df_rev_redem_UL, 
                        3, ~ "redemption_end")

df_rev_redem_UL <- rename_at(df_rev_redem_UL, 
                        4, ~ "transaction_start")

df_rev_redem_UL <- rename_at(df_rev_redem_UL, 
                        5, ~ "redemption_start")

df_rev_redem_UL <- mutate(df_rev_redem_UL, 
UL_trans = as.Date(df_rev_redem_UL$transaction_end) - 
  as.Date(df_rev_redem_UL$transaction_start), 
UL_redem = as.Date(df_rev_redem_UL$redemption_end) - 
  as.Date(df_rev_redem_UL$redemption_start))

df_rev_redem_UL <- transform(df_rev_redem_UL, 
              UL_trans = as.numeric(UL_trans),
              UL_redem = as.numeric(UL_redem))

df_rev_redem_UL <- 
  mutate(df_rev_redem_UL, UL_trans = UL_trans/7, UL_redem = UL_redem/7)

df_rev_redem_UL

The code above creates four new columns: transaction_end, redemption_end, transaction_start, and redemption_start. Based on our definition of usage lifetime (UL), we then compute UL_trans and UL_redem as the difference between transaction_end and transaction_start, and between redemption_end and redemption_start. The last step converts the differences from days into weeks.

df_rev_freq <- df_revenue_view_by_user |> group_by(user_id) |> count()

df_rev_freq <- rename_at(df_rev_freq, 2, ~ "total_transaction")

df_redem_freq <- df_redemptions_by_user |> group_by(user_id) |> count()

df_redem_freq <- rename_at(df_redem_freq, 2, ~ "total_redemption")

df_rev_redem_freq <- full_join(df_rev_freq, df_redem_freq, by = "user_id")

df_rev_redem_freq

We have now built another data frame with total_transaction and total_redemption for each user_id. Since df_rev_redem_UL and df_rev_redem_freq share the key user_id, we full-join them.

df_rev_redem_weekly <- full_join(df_rev_redem_UL, df_rev_redem_freq, by = "user_id")

df_rev_redem_weekly_eff <- df_rev_redem_weekly[, c(1, 6:9)]

df_rev_redem_weekly_eff <- 
  mutate(df_rev_redem_weekly_eff, weekly_trans = total_transaction/UL_trans, 
                                weekly_redem = total_redemption/UL_redem)

df_rev_redem_weekly_eff[sapply(df_rev_redem_weekly_eff, is.infinite)] <- NA

df_rev_redem_weekly_eff_sub <- df_rev_redem_weekly_eff[, c(1, 6:7)] |> 
  filter(!is.na(weekly_trans) | !is.na(weekly_redem))

df_rev_redem_weekly_eff_sub

After joining the two data frames, we compute weekly_trans and weekly_redem. A tricky part is that when UL_trans == 0 or UL_redem == 0, the division produces an infinite value (Inf)! To avoid this mathematical error, we replace every Inf entry with NA. We also drop rows where both weekly_trans and weekly_redem are NA, since they add nothing to the analysis. Still, some customers have a non-NA entry for only one of the two, so we split the data frame into two parts: one with columns user_id and weekly_trans, the other with user_id and weekly_redem.
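
A two-line toy illustration of the Inf trap and the same fix (values invented):

x <- c(3, 5) / c(0, 7)   # division by zero produces Inf, not NA
x[is.infinite(x)] <- NA  # the same replacement applied to the data frame above
x                        # NA 0.7142857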

df_rev_redem_weekly_trans <- df_rev_redem_weekly_eff_sub[, c(1, 2)]

df_rev_redem_weekly_redem <- df_rev_redem_weekly_eff_sub[, c(1, 3)]

df_rev_redem_weekly_trans <-
  df_rev_redem_weekly_trans |> 
  filter(!is.na(weekly_trans)) |> 
  arrange(desc(weekly_trans))

df_rev_redem_weekly_redem <-
  df_rev_redem_weekly_redem |>
  filter(!is.na(weekly_redem)) |> 
  arrange(desc(weekly_redem))

df_rev_redem_weekly_trans
df_rev_redem_weekly_redem

The split data frames are df_rev_redem_weekly_trans and df_rev_redem_weekly_redem; we again drop the NA entries and sort the weekly transaction and redemption counts in decreasing order. We find that the customer with user_id == 229778 averaged 308 transactions per week and the customer with user_id == 206397 averaged 70 redemptions per week! Let us visualize them.
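
Weekly rates this extreme usually reflect a tiny denominator: a customer active for less than one week divides a handful of transactions by a small fraction of a week. A quick sketch to check this (column names as constructed above):

# Customers whose transaction window is shorter than one week:
# weekly_trans = total_transaction / UL_trans is inflated for them
df_rev_redem_weekly |>
  filter(UL_trans > 0, UL_trans < 1) |>
  select(user_id, UL_trans, total_transaction) |>
  head()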

ggpairs(df_rev_redem_weekly_eff_sub[, 2:3])
## Warning: Removed 11172 rows containing non-finite values (`stat_density()`).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 22200 rows containing missing values
## Warning: Removed 22200 rows containing missing values (`geom_point()`).
## Warning: Removed 11028 rows containing non-finite values (`stat_density()`).

It is interesting that the weekly transaction counts and weekly redemption counts have similar density shapes on the diagonal. Recalling the diagonal plots for average transaction amount and average redemption amount, all four shapes are in fact quite alike! This may hint at strong correlations among them, so let us take a closer look at the correlation plot below.

df_corr_avg_weekly_times <- 
  full_join(df_rev_redem_avg[, 1:3], 
            df_rev_redem_weekly_eff[, c(1, 6:7)], 
            by = "user_id")

df_corr_avg_weekly_times_nNA <- na.omit(df_corr_avg_weekly_times)

cor(df_corr_avg_weekly_times_nNA[, 2:5]) |> ggcorrplot()

The correlation plot shows a strong correlation between weekly redemption counts and weekly transaction counts, and a strong correlation between average redemption amount and average transaction amount, in agreement with our earlier analysis. The weekly purchase and redemption counts, however, are only weakly correlated with the average transaction and redemption amounts; their similarly shaped distributions therefore carry no additional information.

Visualizations or tables on "how many different projects" customers purchase from - This could once again be a frequency table, with the number of different projects on the X-axis and the count of customers on the Y-axis. This frequency table would only need to be applied to customers with two or more purchases because, by definition, a single purchase can only come from a single project.

df_revenue_view_by_user_by_proj_id <- df_revenue_view_by_user |> 
  group_by(user_id, project_id, name) |> count() 

df_revenue_view_by_user_by_proj_id <- 
  mutate(df_revenue_view_by_user_by_proj_id, unique = 1)

tbl1 <- aggregate(df_revenue_view_by_user_by_proj_id$unique, 
                  list(df_revenue_view_by_user_by_proj_id$user_id), FUN = sum)

tbl2 <- aggregate(df_revenue_view_by_user_by_proj_id$n, 
                  list(df_revenue_view_by_user_by_proj_id$user_id), FUN = sum)

tbl1 <- rename_at(tbl1, 1, ~ "user_id")

tbl1 <- rename_at(tbl1, 2, ~ "number_diff_projects")

tbl2 <- rename_at(tbl2, 1, ~ "user_id")

tbl2 <- rename_at(tbl2, 2, ~ "number_total_projects")

We first group the revenue data frame by user_id, project_id, and name, and assign a new column unique with all entries equal to 1. Using aggregate(), we then sum each customer's unique column to count how many different projects that customer has purchased from. Similarly, the n column is the number of purchases a customer made at a given project; we aggregate() it by user_id as well. It is not used in the later analysis, but I include it in case we need it in the future. After aggregation, tbl1 lists the number of different projects for each user_id and tbl2 the total number of purchases. We will now merge the two tables.
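
For reference, dplyr can produce the same two summaries in one step with n_distinct() and n(); a sketch of the equivalent route:

# Equivalent aggregation: distinct projects and total purchases per customer
df_revenue_view |> 
  group_by(user_id) |> 
  summarize(number_diff_projects = n_distinct(project_id),
            number_total_projects = n())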

df_revenue_view_by_user_diff_total <- full_join(tbl1, tbl2, by = "user_id")

df_revenue_view_by_user_diff_count <- df_revenue_view_by_user_diff_total |> 
  group_by(number_diff_projects) |> count()

df_revenue_view_by_user_diff_count <- 
  rename_at(df_revenue_view_by_user_diff_count, 2, ~ "count_users")

df_revenue_view_by_user_diff_count_geq2 <- df_revenue_view_by_user_diff_count |> 
  filter(number_diff_projects >= 2)

df_revenue_view_by_user_diff_count_geq2

After merging, we re-group the data frame by number_diff_projects and count the customers in each group. As stated in the prompt, we are not interested in one-and-done customers: with a single purchase, the number of different projects is necessarily 1. Hence we filter() to number_diff_projects >= 2. The resulting data frame is shown above, and its visualization is plotted below.

ggplot(df_revenue_view_by_user_diff_count_geq2, 
       aes(x = number_diff_projects, y = count_users)) + geom_point() + 
  stat_smooth(method = loess, span = 0.4, se = TRUE, color = "steelblue2") + 
  stat_smooth(method = lm, se = FALSE, color = "pink2") +
  xlab("Number of different projects") + ylab("Count of customers") +
  annotate("text", x = 3, y = 9200, label = "(2, 8,648)") +
  annotate("text", x = 37, y = 500, label = "(35, 1)") 
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

The scatter-plot above shows the relationship between the count of customers and the number of different projects. We fit it two ways: the pink line is an OLS regression and the steel-blue curve is a loess regression with its 95% confidence band. An interesting fact comes to our attention: even after excluding single-project purchasers, quite a lot of customers, around 8,648, purchased from exactly two different projects. Beyond two different projects the customer counts shrink sharply, as shown by the nearly flat run of points along the x-axis.
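
Since the customer counts span four orders of magnitude (from 8,648 down to 1), a log-scaled y-axis may expose structure that the flat-looking tail hides; a closing sketch:

ggplot(df_revenue_view_by_user_diff_count_geq2, 
       aes(x = number_diff_projects, y = count_users)) + 
  geom_point() + 
  scale_y_log10() +
  xlab("Number of different projects") + 
  ylab("Count of customers (log scale)")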