library(readr)
library(dplyr)
library(lubridate)
1. Extract data from csv files
transactions <- read_csv("D:/EY/transactions.csv")
## Rows: 9 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Transaction_ID, Transaction_Type, Status
## dbl (2): Account_Number, Amount
## date (1): Transaction_Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
accounts <- read_csv("D:/EY/accounts.csv")
## Rows: 5 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Account_Holder, Account_Type
## dbl (1): Account_Number
## date (1): Opening_Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
audit_logs <- read_csv("D:/EY/audit_logs.csv")
## Rows: 7 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Log_ID, Action
## dbl (1): Account_Number
## date (1): Log_Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Display the extracted data
print(transactions)
## # A tibble: 9 × 6
## Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status
## <chr> <dbl> <date> <chr> <dbl> <chr>
## 1 TXN001 123456 2024-01-01 Credit 1000 Cleared
## 2 TXN002 789012 2024-01-02 Debit 500 Cleared
## 3 TXN003 345678 2024-01-03 Credit NA Pending
## 4 TXN004 901234 2024-01-04 Debit 200 Cleared
## 5 TXN005 567890 2024-01-05 Credit 3000 Cleared
## 6 TXN006 567890 2024-01-06 Credit 5700 Cleared
## 7 TXN007 901234 2024-01-07 Debit 500 Cleared
## 8 TXN008 901236 2024-01-07 Debit 600 Cleared
## 9 TXN008 901236 2024-01-07 Debit 600 Cleared
print(accounts)
## # A tibble: 5 × 4
## Account_Number Account_Holder Account_Type Opening_Date
## <dbl> <chr> <chr> <date>
## 1 123456 John Doe Savings 2020-01-01
## 2 789012 Jane Smith Checking 2019-05-15
## 3 345678 Emily Davis Savings 2021-07-21
## 4 901234 Michael Brown Checking 2018-11-30
## 5 567890 Jessica Johnson Savings 2022-03-22
print(audit_logs)
## # A tibble: 7 × 4
## Log_ID Account_Number Log_Date Action
## <chr> <dbl> <date> <chr>
## 1 L001 123456 2024-01-01 Login
## 2 L002 789012 2024-01-02 Transfer
## 3 L003 345678 2024-01-03 Withdrawal
## 4 L004 901234 2024-01-04 Deposit
## 5 L005 567890 2024-01-05 Login
## 6 L006 345678 2024-01-06 Login
## 7 L007 123456 2024-01-07 Transfer
2. Transform Data
a. Remove duplicates
transactions <- transactions %>% distinct()
accounts <- accounts %>% distinct()
audit_logs <- audit_logs %>% distinct()
b. Check and Handle missing values (filling with mean)
check_missing_values <- function(df, df_name) {
missing_counts <- sapply(df, function(x) sum(is.na(x)))
if (any(missing_counts > 0)) {
cat("Dataframe", df_name, "has missing values in the following columns:\n")
print(missing_counts[missing_counts > 0])
} else {
cat("Dataframe", df_name, "has no missing values.\n")
}
}
check_missing_values(transactions, "Transactions")
## Dataframe Transactions has missing values in the following columns:
## Amount
## 1
check_missing_values(accounts, "Accounts")
## Dataframe Accounts has no missing values.
check_missing_values(audit_logs, "Audit Logs")
## Dataframe Audit Logs has no missing values.
transactions$Amount[is.na(transactions$Amount)] <- mean(transactions$Amount, na.rm = TRUE)
c. Standardize date formats
transactions$Transaction_Date <- as.Date(transactions$Transaction_Date, format = "%Y-%m-%d")
accounts$Opening_Date <- as.Date(accounts$Opening_Date, format = "%Y-%m-%d")
audit_logs$Log_Date <- as.Date(audit_logs$Log_Date, format = "%Y-%m-%d")
d. Create valid_transaction using filter: only transactions with account numbers that exist in the accounts dataframe are included.
valid_transactions <- transactions %>%
filter(Account_Number %in% accounts$Account_Number)
print(valid_transactions)
## # A tibble: 7 × 6
## Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status
## <chr> <dbl> <date> <chr> <dbl> <chr>
## 1 TXN001 123456 2024-01-01 Credit 1000 Cleared
## 2 TXN002 789012 2024-01-02 Debit 500 Cleared
## 3 TXN003 345678 2024-01-03 Credit 1643. Pending
## 4 TXN004 901234 2024-01-04 Debit 200 Cleared
## 5 TXN005 567890 2024-01-05 Credit 3000 Cleared
## 6 TXN006 567890 2024-01-06 Credit 5700 Cleared
## 7 TXN007 901234 2024-01-07 Debit 500 Cleared
e. Create a new calculated field: Transaction Month
valid_transactions <- valid_transactions %>%
mutate(Transaction_Month = floor_date(Transaction_Date, "month"))
print(valid_transactions)
## # A tibble: 7 × 7
## Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status
## <chr> <dbl> <date> <chr> <dbl> <chr>
## 1 TXN001 123456 2024-01-01 Credit 1000 Cleared
## 2 TXN002 789012 2024-01-02 Debit 500 Cleared
## 3 TXN003 345678 2024-01-03 Credit 1643. Pending
## 4 TXN004 901234 2024-01-04 Debit 200 Cleared
## 5 TXN005 567890 2024-01-05 Credit 3000 Cleared
## 6 TXN006 567890 2024-01-06 Credit 5700 Cleared
## 7 TXN007 901234 2024-01-07 Debit 500 Cleared
## # … with 1 more variable: Transaction_Month <date>
f. Aggregate data: Total transaction amount per account and per month
aggregated_data <- valid_transactions %>%
group_by(Account_Number, Transaction_Month) %>%
summarise(Total_Amount = sum(Amount), .groups = 'drop')
print(aggregated_data)
## # A tibble: 5 × 3
## Account_Number Transaction_Month Total_Amount
## <dbl> <date> <dbl>
## 1 123456 2024-01-01 1000
## 2 345678 2024-01-01 1643.
## 3 567890 2024-01-01 8700
## 4 789012 2024-01-01 500
## 5 901234 2024-01-01 700
g. Join transactions with account details
transformed_data <- valid_transactions %>%
left_join(accounts, by = "Account_Number")
print(transformed_data)
## # A tibble: 7 × 10
## Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status
## <chr> <dbl> <date> <chr> <dbl> <chr>
## 1 TXN001 123456 2024-01-01 Credit 1000 Cleared
## 2 TXN002 789012 2024-01-02 Debit 500 Cleared
## 3 TXN003 345678 2024-01-03 Credit 1643. Pending
## 4 TXN004 901234 2024-01-04 Debit 200 Cleared
## 5 TXN005 567890 2024-01-05 Credit 3000 Cleared
## 6 TXN006 567890 2024-01-06 Credit 5700 Cleared
## 7 TXN007 901234 2024-01-07 Debit 500 Cleared
## # … with 4 more variables: Transaction_Month <date>, Account_Holder <chr>,
## # Account_Type <chr>, Opening_Date <date>
h. Conditional transformations: Flag large transactions
transformed_data <- transformed_data %>%
mutate(Large_Transaction = ifelse(Amount > 1000, TRUE, FALSE))
print(transformed_data)
## # A tibble: 7 × 11
## Transaction_ID Account_Number Transaction_Date Transaction_Type Amount Status
## <chr> <dbl> <date> <chr> <dbl> <chr>
## 1 TXN001 123456 2024-01-01 Credit 1000 Cleared
## 2 TXN002 789012 2024-01-02 Debit 500 Cleared
## 3 TXN003 345678 2024-01-03 Credit 1643. Pending
## 4 TXN004 901234 2024-01-04 Debit 200 Cleared
## 5 TXN005 567890 2024-01-05 Credit 3000 Cleared
## 6 TXN006 567890 2024-01-06 Credit 5700 Cleared
## 7 TXN007 901234 2024-01-07 Debit 500 Cleared
## # … with 5 more variables: Transaction_Month <date>, Account_Holder <chr>,
## # Account_Type <chr>, Opening_Date <date>, Large_Transaction <lgl>
3. Load transformed data into new csv files
write_csv(transformed_data, "D:/EY/transformed_data.csv")
write_csv(aggregated_data, "D:/EY/aggregated_data.csv")