1. Data Exploratory Analysis

library(readxl)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
df_treasury <- read_xlsx("C:/Users/Anhuynh/Desktop/Work Projects/Treasury Data Exercise.xlsx", sheet = "Dataset_Manipulated")
# 1.1. Check missing data
mis_dt <- function(x) {sum(is.na(x))/length(x)*100}
apply(df_treasury, 2, mis_dt)
##                     Flow         Transaction date                  Account 
##                0.0000000                0.0000000                0.0000000 
##               Value date                Flow cur.              Flow amount 
##                0.0000000                0.0000000                0.0000000 
## Rate against U.S. Dollar                Acct cur.           Account amount 
##                0.0000000                0.0000000                0.0000000 
##              Flow status              Budget code       Budget description 
##                0.0000000                0.4904518                0.4904518 
##              Description                Reference                   Number 
##                0.0000000                0.0000000                0.0000000 
##               Fee amount          Accounting date              Update date 
##                0.0000000                0.0000000                0.0000000 
##              Update time                   Origin      Account description 
##                0.0000000                0.0000000                0.0000000 
##         Flow description 
##                0.0000000
options(scipen = 999) # turn off scientific notation for numbers
# Recalculate Flow/Account Amount
df_treasury$Flow_amt_usd <- df_treasury$`Rate against U.S. Dollar` * df_treasury$`Flow amount`
# 1.2. Check Outliers using Percentiles method
lower_bound <- quantile(df_treasury$Flow_amt_usd, 0.025)

upper_bound <- quantile(df_treasury$Flow_amt_usd, 0.975)

outlier_ind <- which(df_treasury$Flow_amt_usd < lower_bound | df_treasury$Flow_amt_usd > upper_bound)

## Print Outliers
df_treasury[outlier_ind, ]
## # A tibble: 917 × 23
##    Flow    `Transaction date`  Account `Value date`        `Flow cur.`
##    <chr>   <dttm>                <dbl> <dttm>              <chr>      
##  1 -NOCASH 2020-02-06 00:00:00       2 2020-02-06 00:00:00 EUR        
##  2 +195    2020-02-18 00:00:00       4 2020-02-18 00:00:00 MXN        
##  3 +195    2020-04-14 00:00:00       4 2020-04-14 00:00:00 MXN        
##  4 -698    2020-01-20 00:00:00       4 2020-01-20 00:00:00 MXN        
##  5 -698    2020-02-20 00:00:00       4 2020-02-20 00:00:00 MXN        
##  6 -698    2020-03-20 00:00:00       4 2020-03-20 00:00:00 MXN        
##  7 -698    2020-04-20 00:00:00       4 2020-04-20 00:00:00 MXN        
##  8 -698    2020-05-20 00:00:00       4 2020-05-20 00:00:00 MXN        
##  9 -699    2020-01-20 00:00:00       4 2020-01-20 00:00:00 MXN        
## 10 -699    2020-02-20 00:00:00       4 2020-02-20 00:00:00 MXN        
## # ℹ 907 more rows
## # ℹ 18 more variables: `Flow amount` <dbl>, `Rate against U.S. Dollar` <dbl>,
## #   `Acct cur.` <chr>, `Account amount` <dbl>, `Flow status` <chr>,
## #   `Budget code` <chr>, `Budget description` <chr>, Description <chr>,
## #   Reference <dbl>, Number <dbl>, `Fee amount` <dbl>,
## #   `Accounting date` <dttm>, `Update date` <dttm>, `Update time` <dttm>,
## #   Origin <chr>, `Account description` <chr>, `Flow description` <chr>, …
## Outlier Visualization
### Box plot
require(ggplot2)
ggplot(df_treasury) +
  aes(x = "", y = Flow_amt_usd) +
  geom_boxplot(fill = "#0c4c8a") +
  theme_minimal()

### Histogram
hist(df_treasury$Flow_amt_usd, col = "green")
rug(df_treasury$Flow_amt_usd)
abline(v = median(df_treasury$Flow_amt_usd), col = "magenta", lwd = 4)

# 1.3. Standardization
# standardize the data using scale() function
library(dplyr)
df_treasury_stan <- df_treasury %>%
  mutate(Flow_amt_usd_sta = scale(df_treasury$Flow_amt_usd))
# 1.4. Feature Engineering
## Create a "Time_lag" feature of time difference between Transaction date & Value date

df_treasury_stan <- df_treasury_stan %>%
  mutate(Time_lag = difftime(df_treasury_stan$`Value date`, df_treasury_stan$`Transaction date`, units = "days")) 


## Data Viz
df_timelag <- data.frame(Time_Lag = c("Same day", "Before or after 01 day", "Before or after more than 02 days"), Transaction_Volume = c(18819, 235, 112))
ggplot(data=df_timelag, aes(x = Time_Lag, y = Transaction_Volume, fill = Time_Lag)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_timelag$Transaction_Volume, label= df_timelag$Transaction_Volume), vjust=1, color="black", size=3) +
labs(title="Time Lag (Days) Between Value Date & Transaction Date") +
        theme(legend.position="none")
## Warning: Use of `df_timelag$Transaction_Volume` is discouraged.
## ℹ Use `Transaction_Volume` instead.
## Use of `df_timelag$Transaction_Volume` is discouraged.
## ℹ Use `Transaction_Volume` instead.

# 1.5. Correlation Of Variables
## data preparation
df_treasury_stan$`Flow status` <- as.factor(df_treasury_stan$`Flow status`)
df_treasury_stan$`Budget code` <- as.factor(df_treasury_stan$`Budget code`)

## examining correlation with the most popular Budget codes
df_treasury_stan_top_budget <- df_treasury_stan[which(df_treasury_stan$`Budget code` %in% c("+COLLECT", "-AP_VEND", "-BANKFEE", "-FX", "-INTERCO", "-MISC", "-PAYROL", "-TAXES", "-TRLEXP", "-ZBA", "-ZNOCODE", "+INTERCO", "+STOCK", "+ZBA")),]

df_treasury_top_cor <- df_treasury_stan_top_budget[, c(10,11,24,25)]

library(ggcorrplot) ## apply to multiple data types
model.matrix(~0+., data = df_treasury_top_cor) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, type="lower", lab=TRUE, lab_size=2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero

## examining correlation with the least popular Budget codes
df_treasury_stan_bot_budget <- df_treasury_stan[which(df_treasury_stan$`Budget code` %in% c("-AP_RENT", "-BENEFIT", "-DEBT", "-ESPP", "-IC_LOAN", "-ICD", "-INT_ICL", "-INT_INC", "-INVEST", "-PAYFEE", "-PAYFID", "-PAYTAX", "-POOL", "-SALETAX", "-SWEEP", "-VDI", "+ESPP", "+FX", "+IC_LOAN", "+ICD", "+INT_ICL", "+INT_INC", "+MISC", "+SWEEP", "+TAXES", "+ZNOCODE")),]

df_treasury_bot_cor <- df_treasury_stan_bot_budget[, c(10,11,24,25)]

library(ggcorrplot) ## apply to multiple data types
model.matrix(~0+., data = df_treasury_bot_cor) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, type="lower", lab=TRUE, lab_size=2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero

df_treasury_stan[which(df_treasury_stan$`Budget code`=="-SWEEP"),]
## # A tibble: 6 × 25
##   Flow  `Transaction date`  Account `Value date`        `Flow cur.`
##   <chr> <dttm>                <dbl> <dttm>              <chr>      
## 1 -MISC 2019-02-26 00:00:00      60 2020-02-21 00:00:00 BRL        
## 2 -MISC 2020-02-11 00:00:00      60 2020-02-10 00:00:00 BRL        
## 3 -MISC 2020-03-17 00:00:00      60 2020-03-16 00:00:00 BRL        
## 4 -MISC 2020-03-20 00:00:00      60 2020-03-19 00:00:00 BRL        
## 5 -MISC 2020-03-27 00:00:00      60 2020-03-26 00:00:00 BRL        
## 6 -MISC 2020-05-19 00:00:00      60 2020-05-18 00:00:00 BRL        
## # ℹ 20 more variables: `Flow amount` <dbl>, `Rate against U.S. Dollar` <dbl>,
## #   `Acct cur.` <chr>, `Account amount` <dbl>, `Flow status` <fct>,
## #   `Budget code` <fct>, `Budget description` <chr>, Description <chr>,
## #   Reference <dbl>, Number <dbl>, `Fee amount` <dbl>,
## #   `Accounting date` <dttm>, `Update date` <dttm>, `Update time` <dttm>,
## #   Origin <chr>, `Account description` <chr>, `Flow description` <chr>,
## #   Flow_amt_usd <dbl>, Flow_amt_usd_sta <dbl[,1]>, Time_lag <drtn>
# 1.6. Investigate Relationship between Budget codes & Flow amounts
## as observed from overall correlation matrix, choosing the most affected budget codes
df_treasury_stan_chosen_budget <- df_treasury_stan[which(df_treasury_stan$`Budget code` %in% c("-AP_VEND", "-ZBA", "+ZBA", "+ICD", "-ICD") & df_treasury_stan$Flow %in% c("+277", "+275", "+206", "+195", "-577", "-575", "-495")),]

df_treasury_chosen_cor <- df_treasury_stan_chosen_budget[, c(1,10,11,24)]

library(ggcorrplot) ## apply to multiple data types
model.matrix(~0+., data = df_treasury_chosen_cor) %>% 
  cor(use="pairwise.complete.obs") %>% 
  ggcorrplot(show.diag=FALSE, type="lower", lab=TRUE, lab_size=2)
## Warning in cor(., use = "pairwise.complete.obs"): the standard deviation is
## zero

# Relationship of Flow Amount ~ Budget Code
par(mfrow = c(2,2), mar = c(5,4,2,1))

ggplot(data=df_treasury_stan, aes(x = `Budget code`, y=Flow_amt_usd, fill=`Budget code`)) +
geom_bar(stat="identity", width=0.8) +
geom_text(aes(y=df_treasury_stan$Flow_amt_usd, label=""), vjust=1, color="black", size=3) +
labs(title="Change of Flow/Account Amount Over Budget Codes") +
theme(legend.title = element_blank(), axis.text.x=element_text(angle=45,hjust=1,vjust=1)) +
        theme(legend.position="none")
## Warning: Use of `df_treasury_stan$Flow_amt_usd` is discouraged.
## ℹ Use `Flow_amt_usd` instead.

df_treasury_stan[which(df_treasury_stan$`Budget code`%in% c("-AP_VEND", "-ZBA", "+ZBA", "+ICD", "-ICD") & df_treasury_stan$Flow %in% c("+277", "+275", "+206", "+195", "-577", "-575", "-495")),]
## # A tibble: 4,732 × 25
##    Flow  `Transaction date`  Account `Value date`        `Flow cur.`
##    <chr> <dttm>                <dbl> <dttm>              <chr>      
##  1 -495  2020-02-04 00:00:00       1 2020-02-04 00:00:00 CAD        
##  2 -495  2020-03-10 00:00:00       1 2020-03-10 00:00:00 CAD        
##  3 -495  2020-03-26 00:00:00       1 2020-03-26 00:00:00 CAD        
##  4 -495  2020-04-15 00:00:00       1 2020-04-15 00:00:00 CAD        
##  5 -495  2020-05-06 00:00:00       1 2020-05-06 00:00:00 CAD        
##  6 -495  2020-05-06 00:00:00       1 2020-05-06 00:00:00 CAD        
##  7 -495  2020-05-06 00:00:00       1 2020-05-06 00:00:00 CAD        
##  8 -495  2020-05-13 00:00:00       1 2020-05-13 00:00:00 CAD        
##  9 -495  2020-05-22 00:00:00       1 2020-05-22 00:00:00 CAD        
## 10 -495  2020-01-15 00:00:00       2 2020-01-15 00:00:00 EUR        
## # ℹ 4,722 more rows
## # ℹ 20 more variables: `Flow amount` <dbl>, `Rate against U.S. Dollar` <dbl>,
## #   `Acct cur.` <chr>, `Account amount` <dbl>, `Flow status` <fct>,
## #   `Budget code` <fct>, `Budget description` <chr>, Description <chr>,
## #   Reference <dbl>, Number <dbl>, `Fee amount` <dbl>,
## #   `Accounting date` <dttm>, `Update date` <dttm>, `Update time` <dttm>,
## #   Origin <chr>, `Account description` <chr>, `Flow description` <chr>, …
fit1 <- lm(Flow_amt_usd_sta ~ as.factor(`Budget code`) *as.factor(Flow) -1, df_treasury_stan_chosen_budget) # With Flow
summary(fit1)$coef
##                                     Estimate Std. Error     t value
## as.factor(`Budget code`)-AP_VEND -0.11466412 0.02762923 -4.15010150
## as.factor(`Budget code`)-ICD      3.20659363 0.19627214 16.33748717
## as.factor(`Budget code`)-ZBA      0.64585365 0.11675242  5.53182221
## as.factor(`Budget code`)+ICD      4.05413985 0.42753174  9.48266402
## as.factor(`Budget code`)+ZBA     -0.13109628 1.63035887 -0.08040946
## as.factor(Flow)-575               0.42936120 0.14693339  2.92214853
## as.factor(Flow)+195               0.04066614 0.34107424  0.11922959
## as.factor(Flow)+206              -2.72757708 0.66976891 -4.07241518
## as.factor(Flow)+275               1.08909665 1.63248035  0.66714227
## as.factor(Flow)+277               0.78095483 1.63455543  0.47777813
##                                                                                           Pr(>|t|)
## as.factor(`Budget code`)-AP_VEND 0.000033817854686727118044588247247617118773632682859897613525391
## as.factor(`Budget code`)-ICD     0.000000000000000000000000000000000000000000000000000000000208163
## as.factor(`Budget code`)-ZBA     0.000000033402144618755744696122350623213037579262163490056991577
## as.factor(`Budget code`)+ICD     0.000000000000000000003818508387144790880082800832440170779591426
## as.factor(`Budget code`)+ZBA     0.935915007282081878869917090923991054296493530273437500000000000
## as.factor(Flow)-575              0.003492749204348178856949225234984623966738581657409667968750000
## as.factor(Flow)+195              0.905098538921346174035420517611782997846603393554687500000000000
## as.factor(Flow)+206              0.000047290203063972310578225166644017463113414123654365539550781
## as.factor(Flow)+275              0.504713858726509423391348718723747879266738891601562500000000000
## as.factor(Flow)+277              0.632830245313695360920291932416148483753204345703125000000000000
fit2 <- lm(Flow_amt_usd_sta ~ as.factor(`Budget code`) -1, df_treasury_stan_chosen_budget) # without Flow
summary(fit2)$coef
##                                    Estimate Std. Error   t value
## as.factor(`Budget code`)-AP_VEND -0.1143973 0.02763028 -4.140286
## as.factor(`Budget code`)-ICD      3.2065936 0.19692680 16.283175
## as.factor(`Budget code`)-ZBA      0.9169437 0.07112160 12.892619
## as.factor(`Budget code`)+ICD      3.5411573 0.23133661 15.307380
## as.factor(`Budget code`)+ZBA      0.8528732 0.06798140 12.545686
##                                                                                            Pr(>|t|)
## as.factor(`Budget code`)-AP_VEND 0.0000352919015092657145388904682548059099644888192415237426757812
## as.factor(`Budget code`)-ICD     0.0000000000000000000000000000000000000000000000000000000004812368
## as.factor(`Budget code`)-ZBA     0.0000000000000000000000000000000000002101283889512836462526088055
## as.factor(`Budget code`)+ICD     0.0000000000000000000000000000000000000000000000000011628865146983
## as.factor(`Budget code`)+ZBA     0.0000000000000000000000000000000000153737318024414360041751681507
# evaluation with ANOVA
anova(fit1, fit2)
## Analysis of Variance Table
## 
## Model 1: Flow_amt_usd_sta ~ as.factor(`Budget code`) * as.factor(Flow) - 
##     1
## Model 2: Flow_amt_usd_sta ~ as.factor(`Budget code`) - 1
##   Res.Df   RSS Df Sum of Sq      F       Pr(>F)    
## 1   4722 12551                                     
## 2   4727 12649 -5   -97.248 7.3172 0.0000007696 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1.7. Update Patterns
library(dplyr)
library(ggplot2)
library(lubridate)
library(timetk)

## prepare Update Date data
df_treasury_stan_time <- df_treasury_stan[which(df_treasury_stan$`Flow cur.` %in% c("USD", "EUR", "BRL", "AUD", "GBP","ILS", "JPY", "CAD","INR", "SGD") & df_treasury_stan$`Acct cur.` %in% c("USD", "EUR", "BRL", "AUD", "GBP","ILS", "JPY", "CAD","INR", "SGD")),c(3,5,8,18,19,23)]

df_treasury_stan_time$Account <- as.factor(df_treasury_stan_time$Account)

df_treasury_stan_time$month <-  lubridate::month(df_treasury_stan_time$`Update date`)
df_treasury_stan_time$year <-  lubridate::year(df_treasury_stan_time$`Update date`)

# line chart by month - Full years
df_treasury_stan_time %>% 
  # count by date
  group_by(`Update date`) %>% 
  mutate(Number_account = n()) %>% 
        
  ggplot() +
  geom_line(aes(x =`Update date`, y = Number_account, colour = factor(`Acct cur.`))) +

  labs(title = "Variation in Number of Accounts Of Most Popular Currencies", colour = "Account Currency") +
  theme(legend.position="bottom")

# line chart by month - Year 2020
df_treasury_stan_time[which(df_treasury_stan_time$year == 2020),] %>% 
  # count by date
  group_by(`Update date`) %>% 
  mutate(Number_account = n()) %>% 
        
  ggplot() +
  geom_line(aes(x =`Update date`, y = Number_account, colour = factor(`Acct cur.`))) +

  labs(title = "Variation in Number of Accounts Of Most Popular Currencies", subtitle = "Year: 2020", colour = "Account Currency") +
  theme(legend.position="bottom")

# line chart by month - Full years
df_treasury_stan_time %>% 
  # count by date
  group_by(`Update date`) %>% 
  mutate(Transaction_Volume = sum(Flow_amt_usd)) %>% 
        
  ggplot() +
  geom_line(aes(x =`Update date`, y = Transaction_Volume, colour = factor(`Flow cur.`))) +

  labs(title = "Variation in Transaction Volume Of Most Popular Currencies", subtitle = "Conversion in U.S Dollars", colour = "Flow Currency") +
  theme(legend.position="bottom")

# line chart by month - Year 2020
df_treasury_stan_time[which(df_treasury_stan_time$year == 2020),] %>% 
  # count by date
  group_by(`Update date`) %>% 
  mutate(Transaction_Volume = sum(Flow_amt_usd)) %>% 
        
  ggplot() +
  geom_line(aes(x =`Update date`, y = Transaction_Volume, colour = factor(`Flow cur.`))) + 

  labs(title = "Variation in Transaction Volume Of Most Popular Currencies", subtitle = "Conversion in U.S Dollars, Year: 2020", colour = "Flow Currency") +
  theme(legend.position="bottom")

## Count on Accounts and Total Transaction Volume Per Currency
df_treasury_stan_time_cnt <- df_treasury_stan_time %>% 
  # count by date
  group_by(`Update date`) %>% 
  mutate(Number_account = n())

aggregate(df_treasury_stan_time_cnt$Number_account, list( df_treasury_stan_time_cnt$`Acct cur.`), FUN=sum)
##    Group.1       x
## 1      AUD  165120
## 2      BRL  264939
## 3      CAD   84810
## 4      EUR  233267
## 5      GBP  150648
## 6      ILS   98515
## 7      INR   79937
## 8      JPY   99584
## 9      SGD   63354
## 10     USD 1942736
aggregate(df_treasury_stan_time$Flow_amt_usd, list(df_treasury_stan_time$`Flow cur.`), FUN=sum)
##    Group.1          x
## 1      AUD   50089718
## 2      BRL   36035919
## 3      CAD   41246313
## 4      EUR  303389517
## 5      GBP  158949365
## 6      ILS   31670313
## 7      INR   32388721
## 8      JPY   42861570
## 9      SGD   11502069
## 10     USD 5080602999
# Update Time for full dataset
## Extract Hour:Minute and timezone AM/PM indicator from datetime
df_treasury_stan$Update_time_HM <- format(as.POSIXct(df_treasury_stan$`Update time`), format = "%H:%M %p")

df_treasury_stan$month <-  lubridate::month(df_treasury_stan$`Update date`)
df_treasury_stan$year <-  lubridate::year(df_treasury_stan$`Update date`)

extract_am_pm <- function(time_string) {
  am_pm <- substr(time_string, nchar(time_string) - 1, nchar(time_string))
  return(am_pm)
}

df_treasury_stan$tz_cat <- ifelse(extract_am_pm(df_treasury_stan$Update_time_HM) == "AM", "Morning Hours" , "Afternoon Hours")


## data viz
df_treasury_stan[which(df_treasury_stan$year == 2020),] %>%
  group_by(`Update date`) %>% 
  mutate(Number_account = n()) %>% 
        
  ggplot(aes(x = `Update date`, y = Number_account)) + 
  geom_line(aes(color = tz_cat, linetype = tz_cat)) + 
  labs(title="Changes in Account Numbers at Different Update Times of the Day", subtitle = "Year: 2020") +
  scale_color_manual(values = c("#de2d26", "green", "#de2d26", "#de2d26", "#de2d26", "#de2d26", "#de2d26")) + 
  geom_smooth(method = "loess") +
  theme(legend.position="bottom", plot.title = element_text(size=13))
## `geom_smooth()` using formula = 'y ~ x'

df_treasury_stan[which(df_treasury_stan$year == 2020),] %>%
  group_by(`Update date`) %>% 
  mutate(Transaction_Volumn = sum(Flow_amt_usd)) %>%  
        
  ggplot(aes(x = `Update date`, y = Transaction_Volumn)) + 
  geom_line(aes(color = tz_cat, linetype = tz_cat)) + 
  labs(title="Changes in Transaction Volume at Different Update Times of the Day",subtitle = "Conversion in U.S Dollars, Year: 2020") +
  scale_color_manual(values = c("#de2d26", "green", "#de2d26", "#de2d26", "#de2d26", "#de2d26", "#de2d26")) + 
  geom_smooth(method = "loess") +
  theme(legend.position="bottom", plot.title = element_text(size=13))
## `geom_smooth()` using formula = 'y ~ x'

## Account
aggregate(df_treasury_stan$Flow_amt_usd, list( df_treasury_stan$Account), FUN=sum)
##    Group.1              x
## 1        1   20108194.441
## 2        2   12787807.780
## 3        3     288681.306
## 4        4     327540.638
## 5        5 2285190074.000
## 6        6 1122038408.000
## 7        7  594024995.000
## 8        8   28290008.000
## 9        9  656694703.685
## 10      10   17360007.000
## 11      11   21138117.534
## 12      12   24480027.764
## 13      13    2426150.585
## 14      14    4711681.341
## 15      15     627291.639
## 16      16   66777623.310
## 17      17     170889.010
## 18      18     353171.772
## 19      19   46260030.000
## 20      20    5520529.517
## 21      21  158334016.892
## 22      22   68090033.000
## 23      23   23455843.141
## 24      24     340003.000
## 25      25    2250153.403
## 26      26   18074139.820
## 27      27   10853009.203
## 28      28     230004.000
## 29      29   12281210.079
## 30      30          2.312
## 31      31   31670310.292
## 32      32      60011.000
## 33      33    4519566.569
## 34      34    2784725.446
## 35      35    2007566.528
## 36      36     654921.090
## 37      37    1562759.566
## 38      38     441102.973
## 39      39     777024.420
## 40      40  165781371.692
## 41      41          5.232
## 42      42  224750015.000
## 43      43   11572517.823
## 44      44     490003.000
## 45      45   50089717.211
## 46      46    4673100.884
## 47      47    3220028.000
## 48      48    4348373.480
## 49      49   37354270.119
## 50      50    3520023.000
## 51      51    5617200.140
## 52      52    1405720.000
## 53      53    3380005.000
## 54      54     110600.000
## 55      55   31658900.630
## 56      56    2000000.000
## 57      57     975697.565
## 58      58     729820.364
## 59      59   34386684.147
## 60      60    1730476.147
# Flow status
aggregate(df_treasury_stan$Flow_amt_usd, list( df_treasury_stan$`Flow status`), FUN=sum)
##              Group.1          x
## 1             Actual 5725803101
## 2 Confirmed forecast  109953764
# Budget code
aggregate(df_treasury_stan$Flow_amt_usd, list(df_treasury_stan$`Budget code`), FUN=sum)
##     Group.1             x
## 1  -AP_RENT     459301.31
## 2  -AP_VEND  269032493.82
## 3  -BANKFEE     211571.36
## 4  -BENEFIT    2068713.85
## 5     -DEBT   20000000.00
## 6     -ESPP   16113436.57
## 7       -FX   82045378.98
## 8  -IC_LOAN   11100000.00
## 9      -ICD  542875008.00
## 10 -INT_ICL     382101.11
## 11 -INT_INC      11164.71
## 12 -INTERCO  172465101.41
## 13  -INVEST    8323390.48
## 14    -MISC    1612131.83
## 15  -PAYFEE      60009.00
## 16  -PAYFID   28020007.00
## 17  -PAYROL  159021132.69
## 18  -PAYTAX  187325487.41
## 19    -POOL    2953600.03
## 20 -SALETAX    2757003.00
## 21   -SWEEP     318200.00
## 22   -TAXES   20289916.85
## 23  -TRLEXP    9154248.52
## 24     -VDI    1810000.00
## 25     -ZBA 1250682142.84
## 26 -ZNOCODE     540048.57
## 27 +COLLECT  854498045.85
## 28    +ESPP   16250007.11
## 29      +FX   13811461.89
## 30 +IC_LOAN   14470000.00
## 31     +ICD  412953760.00
## 32 +INT_ICL      32271.11
## 33 +INT_INC      40738.13
## 34 +INTERCO  274570801.40
## 35    +MISC   17099390.00
## 36   +STOCK   36827856.33
## 37   +SWEEP    5744811.92
## 38   +TAXES     376880.74
## 39     +ZBA 1285662588.93
## 40 +ZNOCODE    3832898.20
## Count on Flow 
df_treasury_stan <- df_treasury_stan %>% 
  # count by date
  group_by(Flow) %>% 
  mutate(flow_cnt = n())

# Flow Description
aggregate(df_treasury_stan$flow_cnt, list(df_treasury_stan$`Flow description`), FUN=sum)
##                                         Group.1        x
## 1                      ACH Concentration Credit        4
## 2                           ACH Credit Received   359862
## 3                            ACH Debit Received   543528
## 4                               ACH Return Item       25
## 5      ACH Return Item or Adjustment Settlement       49
## 6                                ACH Settlement  1639714
## 7                                     Bank fees    97969
## 8                         Bank Originated Debit    10260
## 9                          Book Transfer Credit     9702
## 10                          Book Transfer Debit    70225
## 11 Cash Flows NOT impacting balance calculation     5066
## 12                                Check Deposit     1600
## 13               Credit interests on investment        1
## 14                              Credit Reversal       81
## 15                  Currency and Coin Deposited        1
## 16                             Debit Adjustment        4
## 17                               Debit Reversal      100
## 18                      Deposited Item Returned       64
## 19                    Domestic Letter of Credit        1
## 20                            Domestic transfer   116084
## 21                             Float Adjustment       16
## 22                             Foreign exchange        9
## 23                       Foreign Exchange Debit     8649
## 24                   Foreign Exchange of Credit      121
## 25                    Foreign Remittance Credit     3422
## 26                    Interest Adjustment Debit        9
## 27                              Interest Credit      361
## 28                               Interest Debit     3969
## 29                            Interest Received        1
## 30                      Internal Money Transfer      225
## 31                 International Money Transfer      361
## 32          International Money Transfer Debits      676
## 33                               Issued Cheques        4
## 34                             Letter of Credit        9
## 35                               Loan Interests       16
## 36                              Lockbox Deposit    15376
## 37                     Miscellaneous ACH Credit   321789
## 38                      Miscellaneous ACH Debit  2689340
## 39                     Miscellaneous collection     5184
## 40                         Miscellaneous Credit       16
## 41                          Miscellaneous Debit     2401
## 42                   Miscellaneous disbursement     7744
## 43                     Miscellaneous Fee Refund        9
## 44                           Miscellaneous Fees    27556
## 45                 Miscellaneous Security Debit     1024
## 46                                Other Deposit       16
## 47             Outgoing Internal Money Transfer   184665
## 48                                   Paid Check   217863
## 49                     Preauthorized ACH Credit  8465719
## 50                      Preauthorized ACH Debit   343396
## 51                                  Return Item      289
## 52       summary total automatic transfer debit     9409
## 53                          Sweep Principal Buy     2116
## 54                         Sweep Principal Sell     2601
## 55                             Total ACh Debits      196
## 56                                Wire Transfer 25511983
## 57                                   ZBA Credit   148710
## 58                          ZBA Credit Transfer    42408
## 59                                    ZBA Debit   111556
## 60                           ZBA Debit Transfer    37830