library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(repurrrsive)
files <- list.files(pattern = "^Month-.*\\.csv$")
df <- map_dfr(files, read_csv, show_col_types = FALSE)
glimpse(df)
## Rows: 698,159
## Columns: 10
## $ Account_ID <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
map(df, class)
## $Account_ID
## [1] "numeric"
##
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt"
##
## $Factor_A
## [1] "numeric"
##
## $Factor_B
## [1] "numeric"
##
## $Factor_C
## [1] "character"
##
## $Factor_D
## [1] "numeric"
##
## $Factor_E
## [1] "character"
##
## $Response
## [1] "numeric"
##
## $Transaction_Status
## [1] "character"
##
## $Month
## [1] "character"
map_int(df, ~ sum(is.na(.x)))
## Account_ID Transaction_Timestamp Factor_A
## 0 0 0
## Factor_B Factor_C Factor_D
## 0 0 0
## Factor_E Response Transaction_Status
## 0 0 0
## Month
## 0
dim(df)
## [1] 698159 10
summary(df)
## Account_ID Transaction_Timestamp Factor_A
## Min. : 1 Min. :2009-01-01 00:02:57 Min. : 0.000
## 1st Qu.:116300 1st Qu.:2009-04-14 12:57:32 1st Qu.: 2.000
## Median :235929 Median :2009-07-16 00:12:39 Median : 2.000
## Mean :238645 Mean :2009-07-06 01:14:46 Mean : 4.804
## 3rd Qu.:358446 3rd Qu.:2009-10-01 00:11:31 3rd Qu.: 7.000
## Max. :495150 Max. :2009-11-30 23:59:32 Max. :16.000
## Factor_B Factor_C Factor_D Factor_E
## Min. : 0.00 Length:698159 Min. :10.00 Length:698159
## 1st Qu.: 6.00 Class :character 1st Qu.:20.00 Class :character
## Median : 6.00 Mode :character Median :20.00 Mode :character
## Mean :10.23 Mean :22.07
## 3rd Qu.:15.00 3rd Qu.:20.00
## Max. :25.00 Max. :90.00
## Response Transaction_Status Month
## Min. :1020 Length:698159 Length:698159
## 1st Qu.:1020 Class :character Class :character
## Median :1020 Mode :character Mode :character
## Mean :1068
## 3rd Qu.:1020
## Max. :1913
map_int(df, n_distinct)
## Account_ID Transaction_Timestamp Factor_A
## 475413 686538 7
## Factor_B Factor_C Factor_D
## 6 4 15
## Factor_E Response Transaction_Status
## 63 42 2
## Month
## 11
df <- df %>%
mutate(Factor_D = recode(Factor_D,
`26` = 25))
sum(df$Factor_D == 25)
## [1] 41021
df_clean <- df %>%
filter(Factor_C != "NULL",
Factor_E != "NULL")
nrow(df_clean)
## [1] 489537
df_clean <- df_clean %>%
mutate(across(-Transaction_Timestamp, as.factor))
map(df_clean, class)
## $Account_ID
## [1] "factor"
##
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt"
##
## $Factor_A
## [1] "factor"
##
## $Factor_B
## [1] "factor"
##
## $Factor_C
## [1] "factor"
##
## $Factor_D
## [1] "factor"
##
## $Factor_E
## [1] "factor"
##
## $Response
## [1] "factor"
##
## $Transaction_Status
## [1] "factor"
##
## $Month
## [1] "factor"
df_clean %>%
group_by(Transaction_Status) %>%
summarise(across(c(Account_ID, Factor_A, Response, Factor_D, Month),
n_distinct))
## # A tibble: 2 × 6
## Transaction_Status Account_ID Factor_A Response Factor_D Month
## <fct> <int> <int> <int> <int> <int>
## 1 Approved 316172 2 2 11 11
## 2 Declined 14066 2 28 11 11
df_clean %>%
mutate(Quarter = lubridate::quarter(Transaction_Timestamp)) %>%
count(Quarter)
## # A tibble: 4 × 2
## Quarter n
## <int> <int>
## 1 1 85588
## 2 2 100227
## 3 3 161071
## 4 4 142651
sw_people %>%
map_int(~length(.x$films)) %>%
table()
## .
## 1 2 3 4 5 6 7
## 46 18 13 2 5 2 1