library(readxl)
library(tidyverse)
library(readxl)
library(knitr)
library(data.table)
library(dplyr)
library(ggplot2)
library(tidyr)
library(writexl)
library(pastecs)
library(fpc)
library(magrittr)data <- read_excel("Transaction_data.xlsx", sheet = "raw")
data$userId <- NULLdf <- data %>% mutate(var4 = rowSums(!is.na(select(., -Cellphone))))data <- read_excel("transaction_data.xlsx")
data$userId <- NULL
data$Cellphone[data$Cellphone==""] <- "000"
data$Cellphone[data$Cellphone=="NA"] <- "000"data$u <- sapply(strsplit(as.character(data$`2020-05-16`), ";"), length)u <- data %>% mutate(var4 = rowSums(!is.na(select(., -Cellphone))))a <- melt(data, id.vars=c("Cellphone"))
b <- a
c <- b[complete.cases(b), ]
d <- c %>%
group_by(Cellphone) %>%
summarize(freq = n())mean(d$freq)## [1] 6.388889
median(d$freq)## [1] 5
max(d$freq)## [1] 25
quantile(d$freq, probs = c(0, 0.25, 0.5, 0.75, 1)) # quartile## 0% 25% 50% 75% 100%
## 2 3 5 8 25
e <- strsplit(c$value, "\\;") # 1 column to multiple column
n.obs <- sapply(e, length)
seq.max <- seq_len(max(n.obs))
f <- t(sapply(e, "[", i = seq.max))
class(f)## [1] "matrix"
g <- f %>% as.data.frame() %>%
mutate(Class= c$Cellphone)
g <- g %>% as.data.frame() %>%
mutate(Date= c$variable)
g <- g %>% rename(Cellphone = Class)
str(g$Class)## NULL
h <- merge(d,data, by = "Cellphone")
max(h$freq)## [1] 25
c$variable <- as.character(c$variable)
d1 <- c %>% filter(variable >= "2020-05-15" & variable <= "2020-05-30")
d2 <- d1 %>%
group_by(Cellphone) %>%
summarize(freq = n())
mean(d2$freq)## [1] 3.807692
median(d2$freq)## [1] 3
max(d2$freq)## [1] 13
df1 <- read_excel("fail_transaction.xlsx", sheet = "report")
df2 <- read_excel("fail_transaction_log.xlsx")
df1$FAILED <- ifelse(is.na(match(paste0(df1$TransId),
paste0(df2$TransId))),"FALSE", "TRUE")
x <- df1 %>% filter(FAILED == "TRUE")
table(df1$FAILED)##
## FALSE TRUE
## 391 182