library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(repurrrsive)
files <- list.files(pattern = "^Month-.*\\.csv$")
df <- map_dfr(files, read_csv, show_col_types = FALSE)
glimpse(df)
## Rows: 698,159
## Columns: 10
## $ Account_ID            <dbl> 5, 16, 28, 40, 62, 64, 69, 69, 70, 79, 88, 90, 9…
## $ Transaction_Timestamp <dttm> 2009-01-08 00:16:41, 2009-01-20 22:40:08, 2009-…
## $ Factor_A              <dbl> 2, 2, 2, 2, 2, 7, 2, 2, 2, 7, 8, 10, 10, 2, 2, 2…
## $ Factor_B              <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 18, 6, 6, 6, 6, 6,…
## $ Factor_C              <chr> "VI", "VI", "VI", "VI", "VI", "MC", "VI", "VI", …
## $ Factor_D              <dbl> 20, 20, 21, 20, 20, 20, 20, 20, 20, 20, 20, 20, …
## $ Factor_E              <chr> "A", "H", "NULL", "H", "B", "NULL", "H", "H", "B…
## $ Response              <dbl> 1020, 1020, 1020, 1020, 1020, 1020, 1020, 1020, …
## $ Transaction_Status    <chr> "Approved", "Approved", "Approved", "Approved", …
## $ Month                 <chr> "Jan", "Jan", "Jan", "Jan", "Jan", "Jan", "Jan",…
map(df, class)
## $Account_ID
## [1] "numeric"
## 
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt" 
## 
## $Factor_A
## [1] "numeric"
## 
## $Factor_B
## [1] "numeric"
## 
## $Factor_C
## [1] "character"
## 
## $Factor_D
## [1] "numeric"
## 
## $Factor_E
## [1] "character"
## 
## $Response
## [1] "numeric"
## 
## $Transaction_Status
## [1] "character"
## 
## $Month
## [1] "character"
map_int(df, ~ sum(is.na(.x)))
##            Account_ID Transaction_Timestamp              Factor_A 
##                     0                     0                     0 
##              Factor_B              Factor_C              Factor_D 
##                     0                     0                     0 
##              Factor_E              Response    Transaction_Status 
##                     0                     0                     0 
##                 Month 
##                     0
dim(df)
## [1] 698159     10
summary(df)
##    Account_ID     Transaction_Timestamp            Factor_A     
##  Min.   :     1   Min.   :2009-01-01 00:02:57   Min.   : 0.000  
##  1st Qu.:116300   1st Qu.:2009-04-14 12:57:32   1st Qu.: 2.000  
##  Median :235929   Median :2009-07-16 00:12:39   Median : 2.000  
##  Mean   :238645   Mean   :2009-07-06 01:14:46   Mean   : 4.804  
##  3rd Qu.:358446   3rd Qu.:2009-10-01 00:11:31   3rd Qu.: 7.000  
##  Max.   :495150   Max.   :2009-11-30 23:59:32   Max.   :16.000  
##     Factor_B       Factor_C            Factor_D       Factor_E        
##  Min.   : 0.00   Length:698159      Min.   :10.00   Length:698159     
##  1st Qu.: 6.00   Class :character   1st Qu.:20.00   Class :character  
##  Median : 6.00   Mode  :character   Median :20.00   Mode  :character  
##  Mean   :10.23                      Mean   :22.07                     
##  3rd Qu.:15.00                      3rd Qu.:20.00                     
##  Max.   :25.00                      Max.   :90.00                     
##     Response    Transaction_Status    Month          
##  Min.   :1020   Length:698159      Length:698159     
##  1st Qu.:1020   Class :character   Class :character  
##  Median :1020   Mode  :character   Mode  :character  
##  Mean   :1068                                        
##  3rd Qu.:1020                                        
##  Max.   :1913
map_int(df, n_distinct)
##            Account_ID Transaction_Timestamp              Factor_A 
##                475413                686538                     7 
##              Factor_B              Factor_C              Factor_D 
##                     6                     4                    15 
##              Factor_E              Response    Transaction_Status 
##                    63                    42                     2 
##                 Month 
##                    11
df <- df %>%
  mutate(Factor_D = recode(Factor_D,
                           `26` = 25))
sum(df$Factor_D == 25)
## [1] 41021
df_clean <- df %>%
  filter(Factor_C != "NULL",
         Factor_E != "NULL")
nrow(df_clean)
## [1] 489537
df_clean <- df_clean %>%
  mutate(across(-Transaction_Timestamp, as.factor))
map(df_clean, class)
## $Account_ID
## [1] "factor"
## 
## $Transaction_Timestamp
## [1] "POSIXct" "POSIXt" 
## 
## $Factor_A
## [1] "factor"
## 
## $Factor_B
## [1] "factor"
## 
## $Factor_C
## [1] "factor"
## 
## $Factor_D
## [1] "factor"
## 
## $Factor_E
## [1] "factor"
## 
## $Response
## [1] "factor"
## 
## $Transaction_Status
## [1] "factor"
## 
## $Month
## [1] "factor"
df_clean %>%
  group_by(Transaction_Status) %>%
  summarise(across(c(Account_ID, Factor_A, Response, Factor_D, Month),
                   n_distinct))
## # A tibble: 2 × 6
##   Transaction_Status Account_ID Factor_A Response Factor_D Month
##   <fct>                   <int>    <int>    <int>    <int> <int>
## 1 Approved               316172        2        2       11    11
## 2 Declined                14066        2       28       11    11
df_clean %>%
  mutate(Quarter = lubridate::quarter(Transaction_Timestamp)) %>%
  count(Quarter)
## # A tibble: 4 × 2
##   Quarter      n
##     <int>  <int>
## 1       1  85588
## 2       2 100227
## 3       3 161071
## 4       4 142651
sw_people %>%
  map_int(~length(.x$films)) %>%
  table()
## .
##  1  2  3  4  5  6  7 
## 46 18 13  2  5  2  1