Title: “RStudio ETL, EDA, and Control Tests Notebook” Author: Jessen Hobson ************************************************************************

Load packages

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

library(lubridate)

ETL - Extract, Transform, and Load the data

Load data

dfa <- read_csv('C:/Users/ebene/Desktop/RStudio files accmod2/pcard_data_part1_v01.csv')

## Rows: 63038 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Agency.Description, Cardholder.Name, Item.Description, Merchant, M...
## dbl  (2): Agency.Number, Amount
## lgl  (1): Project
## date (2): Transaction.Date, Posting.Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

dfb <- read_csv('C:/Users/ebene/Desktop/RStudio files accmod2/pcard_data_part2_v01.csv')

## Rows: 63039 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): Agency.Description, Cardholder.Name, Item.Description, Merchant, MCC
## dbl  (2): Agency.Number, Amount
## lgl  (1): Project
## date (2): Transaction.Date, Posting.Date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Join the data together before fixing the column headings

dftemp <- bind_rows(dfa, dfb)

Rename the `MCC.Description` column in dfa and join correctly

# Rename
dfa <- dfa %>% rename(MCC = MCC.Description)

# join
df <- bind_rows(dfa, dfb)

Filter to keep only OSU

df <- df %>% filter(Agency.Number == 1000)

Modify all character columns to make the strings all uppercase

df <- df %>% mutate(across(where(is.character), str_to_upper))

1) Eliminate “Project”; 2) Convert columns that should be numbers; 3) Convert columns that should be dates

# Eliminate `Project`
df <- df %>% select(-Project)

# numeric and date columns are already correct
str(df)

## tibble [83,126 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Agency.Number     : num [1:83126] 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
##  $ Agency.Description: chr [1:83126] "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" ...
##  $ Cardholder.Name   : chr [1:83126] "ROUNTREE, W" "USCANGA, J" "PRATER, J" "BLACKSHIRE, P" ...
##  $ Item.Description  : chr [1:83126] "SHIPPING CHARGES" "GENERAL PURCHASE" "L-110 X 3/4 STD 3/16 X 3/3 EAC|L-110 X 1-7/16 STD" "GRIMSLEYS INC EACH" ...
##  $ Amount            : num [1:83126] 7.87 45.92 174.23 350.32 207.25 ...
##  $ Merchant          : chr [1:83126] "UPS 000000Y059Y3390" "WAL-MART #0137" "MOTION INDUSTRIES OK01" "GRIMSLEYS INC" ...
##  $ Transaction.Date  : Date[1:83126], format: "2020-09-28" "2020-02-12" ...
##  $ Posting.Date      : Date[1:83126], format: "2020-09-29" "2020-02-13" ...
##  $ MCC               : chr [1:83126] "COURIER SERVICES-AIR OR GROUND,FREIGHT F" "GROCERY STORES SUPERMARKETS" "INDUSTRIAL SUPPLIES NOT ESLEWHERE CLASSI" "INDUSTRIAL SUPPLIES NOT ESLEWHERE CLASSI" ...

Add three date columns

df <- df %>% mutate(Month = month(Transaction.Date))
df <- df %>% mutate(Weekday = wday(Transaction.Date))
df <- df %>% mutate(Day.Month = mday(Transaction.Date))

Exploratory Data Analysis

Distribution statistics for `Amount`

summary(df$Amount)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -12395.38     27.37     87.66    258.47    267.32  23700.00

Box plot for `Amount`

df %>% ggplot(aes(y=Amount)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(-1000, 1000))

Sum of `Amount` by month

des2 <- df %>% group_by(Month) %>%  
  summarize(sum_amount = sum(Amount))

des2 %>% ggplot(aes(y=sum_amount, x=as.factor(Month), fill=as.factor(Month))) + 
  geom_col()

Sum of `Amount` by day of the week

df %>% 
  group_by(Weekday) %>%  
  summarize(sum_amount = sum(Amount)) %>% 
  ggplot(aes(y=sum_amount, x=as.factor(Weekday), fill=as.factor(Weekday))) + 
  geom_col()

Sum of `Amount` by day of the month

df %>% group_by(Day.Month) %>%  summarize(sum_amount = sum(Amount)) %>% 
  ggplot(aes(y=sum_amount, x=as.factor(Day.Month), fill=as.factor(Day.Month))) + 
  geom_col()

Tests of controls

Single Transaction Limit, descending by `Amount`

## All transactions
df %>% filter(Amount>5000) %>% 
  arrange(desc(Amount))

Single transaction limit, ascending by `Cardholder.Name`

## All transactions
df %>% filter(Amount>5000) %>% 
  arrange(Cardholder.Name)

Single transaction limit, ascending by `Item.Description`

## All transactions
df %>% filter(Amount>5000) %>% 
  arrange(Item.Description)

Single transaction limit, by person

df %>% filter(Amount>5000) %>% 
  group_by(Cardholder.Name) %>% 
  summarize(count = n(), sum_amount=sum(Amount), avg_amount=mean(Amount)) %>% 
  arrange(desc(count))

Yearly spending limit

df %>% group_by(Cardholder.Name) %>% 
  summarize(sum_amount = sum(Amount), count = n()) %>% 
  filter(sum_amount >50000) %>% 
  arrange(desc(sum_amount))

df %>% filter(Amount > 5000) %>%group_by(Cardholder.Name) %>%summarize(count = n(), sum_amount=sum(Amount)) %>%filter(count >= 5)

Monthly spending limit violations by person

df %>% group_by(Cardholder.Name, Month) %>% 
  summarize(sum_amount = sum(Amount)) %>% 
  filter(sum_amount >10000) %>% 
  summarize(count_months_over = n(), total_spent = sum(sum_amount), average_per_month=mean(sum_amount)) %>% 
  arrange(desc(count_months_over))

## `summarise()` has grouped output by 'Cardholder.Name'. You can override using
## the `.groups` argument.

RStudio ETL, EDA, and Control Tests Notebook

Load packages

ETL - Extract, Transform, and Load the data

Load data

Join the data together before fixing the column headings

Rename the MCC.Description column in dfa and join correctly

Filter to keep only OSU

Modify all character columns to make the strings all uppercase

1) Eliminate “Project”; 2) Convert columns that should be numbers; 3) Convert columns that should be dates

Add three date columns

Exploratory Data Analysis

Distribution statistics for Amount

Box plot for Amount

Sum of Amount by month

Sum of Amount by day of the week

Sum of Amount by day of the month

Tests of controls

Single Transaction Limit, descending by Amount

Single transaction limit, ascending by Cardholder.Name

Single transaction limit, ascending by Item.Description

Single transaction limit, by person

Yearly spending limit

Monthly spending limit violations by person

Rename the `MCC.Description` column in dfa and join correctly

Distribution statistics for `Amount`

Box plot for `Amount`

Sum of `Amount` by month

Sum of `Amount` by day of the week

Sum of `Amount` by day of the month

Single Transaction Limit, descending by `Amount`

Single transaction limit, ascending by `Cardholder.Name`

Single transaction limit, ascending by `Item.Description`