Title: “RStudio ETL, EDA, and Control Tests Notebook” Author: Jessen Hobson ************************************************************************

Load packages

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate)

ETL - Extract, Transform, and Load the data

Load data

dfa <- read_csv('C:/Users/ebene/Desktop/RStudio files accmod2/pcard_data_part1_v01.csv')
## Rows: 63038 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Agency.Description, Cardholder.Name, Item.Description, Merchant, M...
## dbl  (2): Agency.Number, Amount
## lgl  (1): Project
## date (2): Transaction.Date, Posting.Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfb <- read_csv('C:/Users/ebene/Desktop/RStudio files accmod2/pcard_data_part2_v01.csv')
## Rows: 63039 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Agency.Description, Cardholder.Name, Item.Description, Merchant, MCC
## dbl  (2): Agency.Number, Amount
## lgl  (1): Project
## date (2): Transaction.Date, Posting.Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Join the data together before fixing the column headings

dftemp <- bind_rows(dfa, dfb)

Rename the MCC.Description column in dfa and join correctly

# Rename
dfa <- dfa %>% rename(MCC = MCC.Description)

# join
df <- bind_rows(dfa, dfb)

Filter to keep only OSU

df <- df %>% filter(Agency.Number == 1000)

Modify all character columns to make the strings all uppercase

df <- df %>% mutate(across(where(is.character), str_to_upper))

1) Eliminate “Project”; 2) Convert columns that should be numbers; 3) Convert columns that should be dates

# Eliminate `Project`
df <- df %>% select(-Project)

# numeric and date columns are already correct
str(df)
## tibble [83,126 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Agency.Number     : num [1:83126] 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ Agency.Description: chr [1:83126] "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" ...
##  $ Cardholder.Name   : chr [1:83126] "ROUNTREE, W" "USCANGA, J" "PRATER, J" "BLACKSHIRE, P" ...
##  $ Item.Description  : chr [1:83126] "SHIPPING CHARGES" "GENERAL PURCHASE" "L-110 X 3/4 STD 3/16 X 3/3 EAC|L-110 X 1-7/16 STD" "GRIMSLEYS INC EACH" ...
##  $ Amount            : num [1:83126] 7.87 45.92 174.23 350.32 207.25 ...
##  $ Merchant          : chr [1:83126] "UPS 000000Y059Y3390" "WAL-MART #0137" "MOTION INDUSTRIES OK01" "GRIMSLEYS INC" ...
##  $ Transaction.Date  : Date[1:83126], format: "2020-09-28" "2020-02-12" ...
##  $ Posting.Date      : Date[1:83126], format: "2020-09-29" "2020-02-13" ...
##  $ MCC               : chr [1:83126] "COURIER SERVICES-AIR OR GROUND,FREIGHT F" "GROCERY STORES SUPERMARKETS" "INDUSTRIAL SUPPLIES NOT ESLEWHERE CLASSI" "INDUSTRIAL SUPPLIES NOT ESLEWHERE CLASSI" ...

Add three date columns

df <- df %>% mutate(Month = month(Transaction.Date))
df <- df %>% mutate(Weekday = wday(Transaction.Date))
df <- df %>% mutate(Day.Month = mday(Transaction.Date))

Exploratory Data Analysis

Distribution statistics for Amount

summary(df$Amount)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -12395.38     27.37     87.66    258.47    267.32  23700.00

Box plot for Amount

df %>% ggplot(aes(y=Amount)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(-1000, 1000))

Sum of Amount by month

des2 <- df %>% group_by(Month) %>%  
  summarize(sum_amount = sum(Amount))

des2 %>% ggplot(aes(y=sum_amount, x=as.factor(Month), fill=as.factor(Month))) + 
  geom_col()

Sum of Amount by day of the week

df %>% 
  group_by(Weekday) %>%  
  summarize(sum_amount = sum(Amount)) %>% 
  ggplot(aes(y=sum_amount, x=as.factor(Weekday), fill=as.factor(Weekday))) + 
  geom_col()

Sum of Amount by day of the month

df %>% group_by(Day.Month) %>%  summarize(sum_amount = sum(Amount)) %>% 
  ggplot(aes(y=sum_amount, x=as.factor(Day.Month), fill=as.factor(Day.Month))) + 
  geom_col()

Tests of controls

Single Transaction Limit, descending by Amount

## All transactions
df %>% filter(Amount>5000) %>% 
  arrange(desc(Amount))

Single transaction limit, ascending by Cardholder.Name

## All transactions
df %>% filter(Amount>5000) %>% 
  arrange(Cardholder.Name)

Single transaction limit, ascending by Item.Description

## All transactions
df %>% filter(Amount>5000) %>% 
  arrange(Item.Description)

Single transaction limit, by person

df %>% filter(Amount>5000) %>% 
  group_by(Cardholder.Name) %>% 
  summarize(count = n(), sum_amount=sum(Amount), avg_amount=mean(Amount)) %>% 
  arrange(desc(count))

Yearly spending limit

df %>% group_by(Cardholder.Name) %>% 
  summarize(sum_amount = sum(Amount), count = n()) %>% 
  filter(sum_amount >50000) %>% 
  arrange(desc(sum_amount))
df %>% filter(Amount > 5000) %>%group_by(Cardholder.Name) %>%summarize(count = n(), sum_amount=sum(Amount)) %>%filter(count >= 5)

Monthly spending limit violations by person

df %>% group_by(Cardholder.Name, Month) %>% 
  summarize(sum_amount = sum(Amount)) %>% 
  filter(sum_amount >10000) %>% 
  summarize(count_months_over = n(), total_spent = sum(sum_amount), average_per_month=mean(sum_amount)) %>% 
  arrange(desc(count_months_over))
## `summarise()` has grouped output by 'Cardholder.Name'. You can override using
## the `.groups` argument.