Project Data Analysis for Finance: Performa Cabang

Data used

df_loan <- read.csv('https://storage.googleapis.com/dqlab-dataset/loan_disbursement.csv', stringsAsFactors = F)
dplyr::glimpse(df_loan)

## Rows: 9,754
## Columns: 5
## $ loan_id      <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ tanggal_cair <chr> "2020-01-06", "2020-01-06", "2020-01-06", "2020-01-06", "…
## $ cabang       <chr> "AA", "AA", "AA", "AA", "AA", "AA", "AB", "AB", "AB", "AB…
## $ agen         <chr> "AA-1", "AA-1", "AA-1", "AA-2", "AA-2", "AA-2", "AB-1", "…
## $ amount       <int> 320000, 440000, 200000, 430000, 360000, 220000, 320000, 4…

Filter data for May 2020, and sum data per branch

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.2.2

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

df_loan_mei <- df_loan %>% 
  filter(tanggal_cair >= '2020-05-01', tanggal_cair <= '2020-05-3') %>% 
  group_by(cabang) %>% 
  summarise(total_amount = sum(amount))
df_loan_mei

## # A tibble: 22 × 2
##    cabang total_amount
##    <chr>         <int>
##  1 AA         75710000
##  2 AB         81440000
##  3 AC         83990000
##  4 AD         76080000
##  5 AE         54200000
##  6 AF         68040000
##  7 AG         74080000
##  8 AH         73840000
##  9 AI         46640000
## 10 AJ         43580000
## # … with 12 more rows

Show data on 5 branches with the largest total amount

library(dplyr)
library(scales)

## Warning: package 'scales' was built under R version 4.2.2

df_loan_mei %>% 
  arrange(desc(total_amount)) %>% 
  mutate(total_amount = comma(total_amount)) %>% 
  head(5)

## # A tibble: 5 × 2
##   cabang total_amount
##   <chr>  <chr>       
## 1 AC     83,990,000  
## 2 AB     81,440,000  
## 3 AD     76,080,000  
## 4 AA     75,710,000  
## 5 AG     74,080,000

Show data on 5 branches with the smallest total amount

library(dplyr)
library(scales)
df_loan_mei %>% 
  arrange(total_amount) %>% 
  mutate(total_amount = comma(total_amount)) %>% 
  head(5)

## # A tibble: 5 × 2
##   cabang total_amount
##   <chr>  <chr>       
## 1 AV     30,280,000  
## 2 AS     31,740,000  
## 3 AT     34,840,000  
## 4 AU     35,610,000  
## 5 AO     39,120,000

Calculating the age of the branches (in months)

library(dplyr)
 df_cabang_umur <- df_loan %>%
  group_by(cabang) %>% 
  summarise(pertama_cair = min(tanggal_cair)) %>% 
  mutate(umur = as.numeric(as.Date('2020-05-15') - as.Date( pertama_cair))%/%30)
df_cabang_umur

## # A tibble: 22 × 3
##    cabang pertama_cair  umur
##    <chr>  <chr>        <dbl>
##  1 AA     2020-01-06       4
##  2 AB     2020-01-06       4
##  3 AC     2020-01-06       4
##  4 AD     2020-01-06       4
##  5 AE     2020-02-03       3
##  6 AF     2020-02-03       3
##  7 AG     2020-02-03       3
##  8 AH     2020-02-03       3
##  9 AI     2020-03-02       2
## 10 AJ     2020-03-02       2
## # … with 12 more rows

Combine age and May performance data

library(dplyr)
df_loan_mei_umur <- df_cabang_umur %>%
  inner_join(df_loan_mei, by = 'cabang')
df_loan_mei_umur

## # A tibble: 22 × 4
##    cabang pertama_cair  umur total_amount
##    <chr>  <chr>        <dbl>        <int>
##  1 AA     2020-01-06       4     75710000
##  2 AB     2020-01-06       4     81440000
##  3 AC     2020-01-06       4     83990000
##  4 AD     2020-01-06       4     76080000
##  5 AE     2020-02-03       3     54200000
##  6 AF     2020-02-03       3     68040000
##  7 AG     2020-02-03       3     74080000
##  8 AH     2020-02-03       3     73840000
##  9 AI     2020-03-02       2     46640000
## 10 AJ     2020-03-02       2     43580000
## # … with 12 more rows

Plot of age and performance relationships in May

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.2.2

ggplot(df_loan_mei_umur, aes(x = umur, y = total_amount)) +
  geom_point() +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Semakin berumur, perfoma cabang akan semakin baik",
       x = "Umur (bulan)",
       y = "Total Amount")

Looking for branches with low performance for every age

library(dplyr)
library(scales)
df_loan_mei_flag <- df_loan_mei_umur %>% 
  group_by(umur) %>% 
  mutate(Q1 = quantile(total_amount, 0.25),
       Q3 = quantile(total_amount, 0.75),
      IQR = (Q3-Q1)) %>%
  mutate(flag = ifelse(total_amount < (Q1 - IQR), 'rendah','baik'))

df_loan_mei_flag %>% 
  filter(flag == 'rendah') %>% 
  mutate_if(is.numeric, funs(comma))

## `mutate_if()` ignored the following grouping variables:
## • Column `umur`

## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))

## # A tibble: 2 × 8
## # Groups:   umur [2]
##   cabang pertama_cair  umur total_amount Q1         Q3         IQR       flag  
##   <chr>  <chr>        <dbl> <chr>        <chr>      <chr>      <chr>     <chr> 
## 1 AE     2020-02-03       3 54,200,000   64,580,000 73,900,000 9,320,000 rendah
## 2 AL     2020-03-02       2 40,650,000   43,580,000 44,590,000 1,010,000 rendah

Create another Scatterplot and give the low branches a red color

library(ggplot2)

ggplot(df_loan_mei_flag, aes(x = umur, y = total_amount)) +
  geom_point(aes(color = flag)) +
  scale_color_manual(breaks = c("baik", "rendah"),
                        values=c("blue", "red")) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Ada cabang berpeforma rendah padahal tidak termasuk bottom 5 nasional",
       color = "",
       x = "Umur (bulan)",
       y = "Total Amount")

See how the performance of branches of the same age compares

library(dplyr)
library(scales)
df_loan_mei_flag %>% 
  filter(umur == 3) %>% 
  inner_join(df_loan, by = 'cabang') %>% 
  filter(tanggal_cair >= '2020-05-01', tanggal_cair <= '2020-05-31') %>% 
  group_by(cabang, flag) %>% 
  summarise(jumlah_hari = n_distinct(tanggal_cair),
            agen_aktif = n_distinct(agen),
            total_loan_cair = n_distinct(loan_id),
            avg_amount = mean(amount), 
            total_amount = sum(amount)) %>% 
  arrange(total_amount) %>% 
  mutate_if(is.numeric, funs(comma))

## `summarise()` has grouped output by 'cabang'. You can override using the
## `.groups` argument.
## `mutate_if()` ignored the following grouping variables:

## # A tibble: 4 × 7
## # Groups:   cabang [4]
##   cabang flag   jumlah_hari agen_aktif total_loan_cair avg_amount total_amount
##   <chr>  <chr>  <chr>       <chr>      <chr>           <chr>      <chr>       
## 1 AE     rendah 21          3          175             309,714    54,200,000  
## 2 AF     baik   21          3          225             302,400    68,040,000  
## 3 AH     baik   21          3          241             306,390    73,840,000  
## 4 AG     baik   21          3          241             307,386    74,080,000

See a comparison of agent performance on low branches

library(dplyr)
library(scales)

df_loan_mei_flag %>% 
  filter(umur == 3, flag == 'rendah') %>% 
  inner_join(df_loan, by = 'cabang') %>% 
  filter(tanggal_cair >= '2020-05-01', tanggal_cair <= '2020-05-31') %>% 
  group_by(cabang, agen) %>% 
  summarise(jumlah_hari = n_distinct(tanggal_cair),
            total_loan_cair = n_distinct(loan_id),
            avg_amount = mean(amount), 
            total_amount = sum(amount)) %>% 
  arrange(total_amount) %>% 
  mutate_if(is.numeric, funs(comma))

## `summarise()` has grouped output by 'cabang'. You can override using the
## `.groups` argument.
## `mutate_if()` ignored the following grouping variables:

## # A tibble: 3 × 6
## # Groups:   cabang [1]
##   cabang agen  jumlah_hari total_loan_cair avg_amount total_amount
##   <chr>  <chr> <chr>       <chr>           <chr>      <chr>       
## 1 AE     AE-3  4           16              310,625    4,970,000   
## 2 AE     AE-2  18          73              320,274    23,380,000  
## 3 AE     AE-1  21          86              300,581    25,850,000

See a comparison of agent performance on the branch that is best at 3 months old

library(dplyr)
library(scales)
df_loan %>% 
  filter(cabang == 'AH') %>% 
  filter(tanggal_cair >= '2020-05-01', tanggal_cair <= '2020-05-31') %>% 
  group_by(cabang, agen) %>% 
  summarise(jumlah_hari = n_distinct(tanggal_cair),
            total_loan_cair = n_distinct(loan_id),
            avg_amount = mean(amount), 
            total_amount = sum(amount)) %>% 
  arrange(total_amount) %>% 
  mutate_if(is.numeric, funs(comma))

## `summarise()` has grouped output by 'cabang'. You can override using the
## `.groups` argument.
## `mutate_if()` ignored the following grouping variables:

## # A tibble: 3 × 6
## # Groups:   cabang [1]
##   cabang agen  jumlah_hari total_loan_cair avg_amount total_amount
##   <chr>  <chr> <chr>       <chr>           <chr>      <chr>       
## 1 AH     AH-3  19          74              303,649    22,470,000  
## 2 AH     AH-1  21          81              301,358    24,410,000  
## 3 AH     AH-2  21          86              313,488    26,960,000

Conclusion :

Based on the analysis, it can be concluded that the low performance of the AE branch is caused by one of the agents disbursing only 4 days in 1 month, even though other agents can be active for 21 days. This makes the total number of agents only 20% compared to other agents. While in the AH branch, the performance is very good because the three agents disburse almost / always every working day. 2 full people 21 days 1 person 19 days. So that the performance is well maintained. It is also worth noting that in order to compare the performance of branches, it must first be grouped according to the same characteristics. All indirect branches compared without knowing its characteristics.