ETL in IT Audit

Nickyta SM

2023-11-03

library(readr)
library(dplyr)
library(lubridate)

1. Extract data from csv files

transactions <- read_csv("D:/EY/transactions.csv")
## Rows: 9 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Transaction_ID, Transaction_Type, Status
## dbl  (2): Account_Number, Amount
## date (1): Transaction_Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
accounts <- read_csv("D:/EY/accounts.csv")
## Rows: 5 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Account_Holder, Account_Type
## dbl  (1): Account_Number
## date (1): Opening_Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
audit_logs <- read_csv("D:/EY/audit_logs.csv")
## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): Log_ID, Action
## dbl  (1): Account_Number
## date (1): Log_Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Display the extracted data

print(transactions)
## # A tibble: 9 × 6
##   Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status 
##   <chr>                   <dbl> <date>           <chr>             <dbl> <chr>  
## 1 TXN001                 123456 2024-01-01       Credit             1000 Cleared
## 2 TXN002                 789012 2024-01-02       Debit               500 Cleared
## 3 TXN003                 345678 2024-01-03       Credit               NA Pending
## 4 TXN004                 901234 2024-01-04       Debit               200 Cleared
## 5 TXN005                 567890 2024-01-05       Credit             3000 Cleared
## 6 TXN006                 567890 2024-01-06       Credit             5700 Cleared
## 7 TXN007                 901234 2024-01-07       Debit               500 Cleared
## 8 TXN008                 901236 2024-01-07       Debit               600 Cleared
## 9 TXN008                 901236 2024-01-07       Debit               600 Cleared
print(accounts)
## # A tibble: 5 × 4
##   Account_Number Account_Holder  Account_Type Opening_Date
##            <dbl> <chr>           <chr>        <date>      
## 1         123456 John Doe        Savings      2020-01-01  
## 2         789012 Jane Smith      Checking     2019-05-15  
## 3         345678 Emily Davis     Savings      2021-07-21  
## 4         901234 Michael Brown   Checking     2018-11-30  
## 5         567890 Jessica Johnson Savings      2022-03-22
print(audit_logs)
## # A tibble: 7 × 4
##   Log_ID Account_Number Log_Date   Action    
##   <chr>           <dbl> <date>     <chr>     
## 1 L001           123456 2024-01-01 Login     
## 2 L002           789012 2024-01-02 Transfer  
## 3 L003           345678 2024-01-03 Withdrawal
## 4 L004           901234 2024-01-04 Deposit   
## 5 L005           567890 2024-01-05 Login     
## 6 L006           345678 2024-01-06 Login     
## 7 L007           123456 2024-01-07 Transfer

2. Transform Data

a. Remove duplicates

transactions <- transactions %>% distinct()
accounts <- accounts %>% distinct()
audit_logs <- audit_logs %>% distinct()

b. Check and Handle missing values (filling with mean)

check_missing_values <- function(df, df_name) {
  missing_counts <- sapply(df, function(x) sum(is.na(x)))
  if (any(missing_counts > 0)) {
    cat("Dataframe", df_name, "has missing values in the following columns:\n")
    print(missing_counts[missing_counts > 0])
  } else {
    cat("Dataframe", df_name, "has no missing values.\n")
  }
}

check_missing_values(transactions, "Transactions")
## Dataframe Transactions has missing values in the following columns:
## Amount 
##      1
check_missing_values(accounts, "Accounts")
## Dataframe Accounts has no missing values.
check_missing_values(audit_logs, "Audit Logs")
## Dataframe Audit Logs has no missing values.
transactions$Amount[is.na(transactions$Amount)] <- mean(transactions$Amount, na.rm = TRUE)

c. Standardize date formats

transactions$Transaction_Date <- as.Date(transactions$Transaction_Date, format = "%Y-%m-%d")
accounts$Opening_Date <- as.Date(accounts$Opening_Date, format = "%Y-%m-%d")
audit_logs$Log_Date <- as.Date(audit_logs$Log_Date, format = "%Y-%m-%d")

d. Create valid_transaction using filter: only transactions with account numbers that exist in the accounts dataframe are included.

valid_transactions <- transactions %>%
  filter(Account_Number %in% accounts$Account_Number)
print(valid_transactions)
## # A tibble: 7 × 6
##   Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status 
##   <chr>                   <dbl> <date>           <chr>             <dbl> <chr>  
## 1 TXN001                 123456 2024-01-01       Credit            1000  Cleared
## 2 TXN002                 789012 2024-01-02       Debit              500  Cleared
## 3 TXN003                 345678 2024-01-03       Credit            1643. Pending
## 4 TXN004                 901234 2024-01-04       Debit              200  Cleared
## 5 TXN005                 567890 2024-01-05       Credit            3000  Cleared
## 6 TXN006                 567890 2024-01-06       Credit            5700  Cleared
## 7 TXN007                 901234 2024-01-07       Debit              500  Cleared

e. Create a new calculated field: Transaction Month

valid_transactions <- valid_transactions %>%
  mutate(Transaction_Month = floor_date(Transaction_Date, "month"))
print(valid_transactions)
## # A tibble: 7 × 7
##   Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status 
##   <chr>                   <dbl> <date>           <chr>             <dbl> <chr>  
## 1 TXN001                 123456 2024-01-01       Credit            1000  Cleared
## 2 TXN002                 789012 2024-01-02       Debit              500  Cleared
## 3 TXN003                 345678 2024-01-03       Credit            1643. Pending
## 4 TXN004                 901234 2024-01-04       Debit              200  Cleared
## 5 TXN005                 567890 2024-01-05       Credit            3000  Cleared
## 6 TXN006                 567890 2024-01-06       Credit            5700  Cleared
## 7 TXN007                 901234 2024-01-07       Debit              500  Cleared
## # … with 1 more variable: Transaction_Month <date>

f. Aggregate data: Total transaction amount per account and per month

aggregated_data <- valid_transactions %>%
  group_by(Account_Number, Transaction_Month) %>%
  summarise(Total_Amount = sum(Amount), .groups = 'drop')
print(aggregated_data)
## # A tibble: 5 × 3
##   Account_Number Transaction_Month Total_Amount
##            <dbl> <date>                   <dbl>
## 1         123456 2024-01-01               1000 
## 2         345678 2024-01-01               1643.
## 3         567890 2024-01-01               8700 
## 4         789012 2024-01-01                500 
## 5         901234 2024-01-01                700

g. Join transactions with account details

transformed_data <- valid_transactions %>%
  left_join(accounts, by = "Account_Number")
print(transformed_data)
## # A tibble: 7 × 10
##   Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status 
##   <chr>                   <dbl> <date>           <chr>             <dbl> <chr>  
## 1 TXN001                 123456 2024-01-01       Credit            1000  Cleared
## 2 TXN002                 789012 2024-01-02       Debit              500  Cleared
## 3 TXN003                 345678 2024-01-03       Credit            1643. Pending
## 4 TXN004                 901234 2024-01-04       Debit              200  Cleared
## 5 TXN005                 567890 2024-01-05       Credit            3000  Cleared
## 6 TXN006                 567890 2024-01-06       Credit            5700  Cleared
## 7 TXN007                 901234 2024-01-07       Debit              500  Cleared
## # … with 4 more variables: Transaction_Month <date>, Account_Holder <chr>,
## #   Account_Type <chr>, Opening_Date <date>

h. Conditional transformations: Flag large transactions

transformed_data <- transformed_data %>%
  mutate(Large_Transaction = ifelse(Amount > 1000, TRUE, FALSE))
print(transformed_data)
## # A tibble: 7 × 11
##   Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status 
##   <chr>                   <dbl> <date>           <chr>             <dbl> <chr>  
## 1 TXN001                 123456 2024-01-01       Credit            1000  Cleared
## 2 TXN002                 789012 2024-01-02       Debit              500  Cleared
## 3 TXN003                 345678 2024-01-03       Credit            1643. Pending
## 4 TXN004                 901234 2024-01-04       Debit              200  Cleared
## 5 TXN005                 567890 2024-01-05       Credit            3000  Cleared
## 6 TXN006                 567890 2024-01-06       Credit            5700  Cleared
## 7 TXN007                 901234 2024-01-07       Debit              500  Cleared
## # … with 5 more variables: Transaction_Month <date>, Account_Holder <chr>,
## #   Account_Type <chr>, Opening_Date <date>, Large_Transaction <lgl>

3. Load transformed data into new csv files

write_csv(transformed_data, "D:/EY/transformed_data.csv")
write_csv(aggregated_data, "D:/EY/aggregated_data.csv")