1 Library Loading

library(readxl)
library(tidyverse)
library(readxl)
library(knitr)
library(data.table)
library(dplyr)
library(ggplot2)
library(tidyr)
library(writexl)
library(pastecs)
library(fpc)
library(magrittr)

2 Data Uploading

data <- read_excel("Transaction_data.xlsx", sheet = "raw")
data$userId <- NULL

3 Data Wrangling

3.1 Remove missing value

df <- data %>% mutate(var4 = rowSums(!is.na(select(., -Cellphone))))

3.2 Number of login

data <- read_excel("transaction_data.xlsx")
data$userId <- NULL
data$Cellphone[data$Cellphone==""] <- "000"
data$Cellphone[data$Cellphone=="NA"] <- "000"

3.3 Split string ;

data$u <- sapply(strsplit(as.character(data$`2020-05-16`), ";"), length)

3.4 rowSum

u <- data %>% mutate(var4 = rowSums(!is.na(select(., -Cellphone))))

3.5 Convert from wide to long format

a <- melt(data, id.vars=c("Cellphone"))
b <- a
c <- b[complete.cases(b), ]

d <- c %>% 
  group_by(Cellphone) %>% 
  summarize(freq = n())

3.6 Caculation mean value for number of login

mean(d$freq)
## [1] 6.388889
median(d$freq)
## [1] 5
max(d$freq)
## [1] 25
quantile(d$freq, probs = c(0, 0.25, 0.5, 0.75, 1)) # quartile
##   0%  25%  50%  75% 100% 
##    2    3    5    8   25

3.7 Split string

e <- strsplit(c$value, "\\;") # 1 column to multiple column 
n.obs <- sapply(e, length)
seq.max <- seq_len(max(n.obs))
f <- t(sapply(e, "[", i = seq.max))
class(f)
## [1] "matrix"

3.8 Mapping (Key) to define the user

g <- f %>% as.data.frame() %>% 
  mutate(Class= c$Cellphone)

g <- g %>% as.data.frame() %>% 
  mutate(Date= c$variable)

g <- g %>% rename(Cellphone = Class)

str(g$Class)
##  NULL
h <- merge(d,data, by = "Cellphone")

max(h$freq)
## [1] 25

3.9 Calculation for 1st week

c$variable <- as.character(c$variable)
d1 <- c %>% filter(variable >= "2020-05-15" & variable <= "2020-05-30")
d2 <- d1 %>% 
  group_by(Cellphone) %>% 
  summarize(freq = n())

mean(d2$freq)
## [1] 3.807692
median(d2$freq)
## [1] 3
max(d2$freq)
## [1] 13

3.10 Mapping failed transaction

df1 <- read_excel("fail_transaction.xlsx", sheet = "report")
df2 <- read_excel("fail_transaction_log.xlsx")

df1$FAILED <- ifelse(is.na(match(paste0(df1$TransId), 
                                        paste0(df2$TransId))),"FALSE", "TRUE")

x <- df1 %>% filter(FAILED == "TRUE")
table(df1$FAILED)
## 
## FALSE  TRUE 
##   391   182