Title: “RStudio ETL, EDA, and Control Tests Notebook” Author: Jessen Hobson ************************************************************************
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(lubridate)
dfa <- read_csv('C:/Users/ebene/Desktop/RStudio files accmod2/pcard_data_part1_v01.csv')
## Rows: 63038 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Agency.Description, Cardholder.Name, Item.Description, Merchant, M...
## dbl (2): Agency.Number, Amount
## lgl (1): Project
## date (2): Transaction.Date, Posting.Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dfb <- read_csv('C:/Users/ebene/Desktop/RStudio files accmod2/pcard_data_part2_v01.csv')
## Rows: 63039 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Agency.Description, Cardholder.Name, Item.Description, Merchant, MCC
## dbl (2): Agency.Number, Amount
## lgl (1): Project
## date (2): Transaction.Date, Posting.Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dftemp <- bind_rows(dfa, dfb)
MCC.Description column in dfa and join
correctly# Rename
dfa <- dfa %>% rename(MCC = MCC.Description)
# join
df <- bind_rows(dfa, dfb)
df <- df %>% filter(Agency.Number == 1000)
df <- df %>% mutate(across(where(is.character), str_to_upper))
# Eliminate `Project`
df <- df %>% select(-Project)
# numeric and date columns are already correct
str(df)
## tibble [83,126 × 9] (S3: tbl_df/tbl/data.frame)
## $ Agency.Number : num [1:83126] 1000 1000 1000 1000 1000 1000 1000 1000 1000 1000 ...
## $ Agency.Description: chr [1:83126] "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" "OKLAHOMA STATE UNIVERSITY" ...
## $ Cardholder.Name : chr [1:83126] "ROUNTREE, W" "USCANGA, J" "PRATER, J" "BLACKSHIRE, P" ...
## $ Item.Description : chr [1:83126] "SHIPPING CHARGES" "GENERAL PURCHASE" "L-110 X 3/4 STD 3/16 X 3/3 EAC|L-110 X 1-7/16 STD" "GRIMSLEYS INC EACH" ...
## $ Amount : num [1:83126] 7.87 45.92 174.23 350.32 207.25 ...
## $ Merchant : chr [1:83126] "UPS 000000Y059Y3390" "WAL-MART #0137" "MOTION INDUSTRIES OK01" "GRIMSLEYS INC" ...
## $ Transaction.Date : Date[1:83126], format: "2020-09-28" "2020-02-12" ...
## $ Posting.Date : Date[1:83126], format: "2020-09-29" "2020-02-13" ...
## $ MCC : chr [1:83126] "COURIER SERVICES-AIR OR GROUND,FREIGHT F" "GROCERY STORES SUPERMARKETS" "INDUSTRIAL SUPPLIES NOT ESLEWHERE CLASSI" "INDUSTRIAL SUPPLIES NOT ESLEWHERE CLASSI" ...
df <- df %>% mutate(Month = month(Transaction.Date))
df <- df %>% mutate(Weekday = wday(Transaction.Date))
df <- df %>% mutate(Day.Month = mday(Transaction.Date))
Amountsummary(df$Amount)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -12395.38 27.37 87.66 258.47 267.32 23700.00
Amountdf %>% ggplot(aes(y=Amount)) +
geom_boxplot() +
coord_cartesian(ylim = c(-1000, 1000))
Amount by monthdes2 <- df %>% group_by(Month) %>%
summarize(sum_amount = sum(Amount))
des2 %>% ggplot(aes(y=sum_amount, x=as.factor(Month), fill=as.factor(Month))) +
geom_col()
Amount by day of the weekdf %>%
group_by(Weekday) %>%
summarize(sum_amount = sum(Amount)) %>%
ggplot(aes(y=sum_amount, x=as.factor(Weekday), fill=as.factor(Weekday))) +
geom_col()
Amount by day of the monthdf %>% group_by(Day.Month) %>% summarize(sum_amount = sum(Amount)) %>%
ggplot(aes(y=sum_amount, x=as.factor(Day.Month), fill=as.factor(Day.Month))) +
geom_col()
Amount## All transactions
df %>% filter(Amount>5000) %>%
arrange(desc(Amount))
Cardholder.Name## All transactions
df %>% filter(Amount>5000) %>%
arrange(Cardholder.Name)
Item.Description## All transactions
df %>% filter(Amount>5000) %>%
arrange(Item.Description)
df %>% filter(Amount>5000) %>%
group_by(Cardholder.Name) %>%
summarize(count = n(), sum_amount=sum(Amount), avg_amount=mean(Amount)) %>%
arrange(desc(count))
df %>% group_by(Cardholder.Name) %>%
summarize(sum_amount = sum(Amount), count = n()) %>%
filter(sum_amount >50000) %>%
arrange(desc(sum_amount))
df %>% filter(Amount > 5000) %>%group_by(Cardholder.Name) %>%summarize(count = n(), sum_amount=sum(Amount)) %>%filter(count >= 5)
df %>% group_by(Cardholder.Name, Month) %>%
summarize(sum_amount = sum(Amount)) %>%
filter(sum_amount >10000) %>%
summarize(count_months_over = n(), total_spent = sum(sum_amount), average_per_month=mean(sum_amount)) %>%
arrange(desc(count_months_over))
## `summarise()` has grouped output by 'Cardholder.Name'. You can override using
## the `.groups` argument.