Part A: Data Pull

I made these csvs and uploaded them to github for reproducability purposes

partA_data <- read.csv("https://raw.githubusercontent.com/jonburns2454/DATA-624/refs/heads/main/Project%201%20Data/ATM624Data.csv")

EDA - ATM

Necessary libraries

library(tidyverse)
library(naniar)
library(readxl)
library(fpp3)
library(forecast)
library(ggfortify)
library(gridExtra)

# Convert DATE to Date type
partA_data$DATE <- mdy_hms(partA_data$DATE)

# Summarize the data
summary(partA_data)

##       DATE                            ATM                 Cash        
##  Min.   :2009-05-01 00:00:00.00   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01 00:00:00.00   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01 00:00:00.00   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31 19:11:48.27                      Mean   :  155.6  
##  3rd Qu.:2010-02-01 00:00:00.00                      3rd Qu.:  114.0  
##  Max.   :2010-05-14 00:00:00.00                      Max.   :10920.0  
##                                                      NA's   :19

# Plot the data
# 1. Cash distribution by ATM
ggplot(partA_data, aes(x = ATM, y = Cash)) +
  geom_boxplot() +
  labs(title = "Cash Distribution by ATM", x = "ATM", y = "Cash")

## Warning: Removed 19 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

# 2. Cash over time
ggplot(partA_data, aes(x = DATE, y = Cash, color = ATM)) +
  geom_line() +
  labs(title = "Cash Over Time", x = "Date", y = "Cash")

## Warning: Removed 14 rows containing missing values or values outside the scale range
## (`geom_line()`).

# 3. Total cash per ATM
total_cash <- partA_data %>%
  group_by(ATM) %>%
  summarize(Total_Cash = sum(Cash))

ggplot(total_cash, aes(x = ATM, y = Total_Cash, fill = ATM)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Cash per ATM", x = "ATM", y = "Total Cash")

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_bar()`).

### Missingness

The amount of missingness across actual categorized atms (ATM1-ATM4) is minimal as you can see from the table and visualization, a majority of the missingness comes from the unlabeled atm variables which make up 74% of the missingness. The cash column has the only instance of missingness

gg_miss_var(partA_data) +
  labs(title = "Missing Data ATM dataset")

gg_miss_var(partA_data, facet = ATM) +
  labs(title = "Missing Data by ATM Category")

missing_by_atm <- partA_data %>%
  group_by(ATM) %>%
  summarise(across(everything(), ~ sum(is.na(.)), .names = "missing_{col}"))

total_missing <- colSums(is.na(partA_data))
missing_by_atm <- missing_by_atm %>%
  mutate(across(starts_with("missing_"), ~ . / total_missing[match(sub("missing_", "", cur_column()), names(total_missing))] * 100, .names = "percent_{col}"))
missing_by_atm

## # A tibble: 5 × 5
##   ATM    missing_DATE missing_Cash percent_missing_DATE percent_missing_Cash
##   <chr>         <int>        <int>                <dbl>                <dbl>
## 1 ""                0           14                  NaN                 73.7
## 2 "ATM1"            0            3                  NaN                 15.8
## 3 "ATM2"            0            2                  NaN                 10.5
## 4 "ATM3"            0            0                  NaN                  0  
## 5 "ATM4"            0            0                  NaN                  0

Data Preparation:

To utilize the autoplot function you need to ensure that the data is formatted correctly.

atm_data <- partA_data %>% 
    mutate(DATE = as_date(DATE), Cash = as.integer(Cash))

Dealing with missingness

To combat the missingness I will filter the dataset for any date prior to 2010-05-01 because this data is completely empty after that data. Next I will use mean imputation (grouped by atm) to fill in the remaining 5 missing cash data. Lastly I will convert this data.frame object to a tsibble and create a key and index.

atm_data <- atm_data %>% 
    filter(DATE < "2010-05-01") %>% 
    group_by(ATM) %>%
    mutate(Cash = ifelse(is.na(Cash), mean(Cash, na.rm = TRUE), Cash)) %>% 
    ungroup()
atm_data <- as_tsibble(atm_data, index = DATE, key = ATM)

Evaluate autoplots at each ATM

ATM1:

atm1_auto <-atm_data %>% 
    filter(ATM == "ATM1") %>% 
    autoplot(Cash) +
    ggtitle("Pre-Transformation Autoplot: ATM1")

atm1_stl <- atm_data %>% 
    filter(ATM == "ATM1") %>%
    model(STL(Cash ~ season(window = "periodic"), robust = TRUE)) %>%
    components() %>%
    autoplot() +
    labs(title = "STL Decomposition for ATM1")
grid.arrange(atm1_auto, atm1_stl, ncol = 2)

### ATM2:

atm2_auto <- atm_data %>% 
    filter(ATM == "ATM2") %>% 
    autoplot(Cash) +
    ggtitle("Pre-Transformation Autoplot: ATM2")

atm2_stl <- atm_data %>% 
    filter(ATM == "ATM2") %>%
    model(STL(Cash ~ season(window = "periodic"), robust = TRUE)) %>%
    components() %>%
    autoplot() +
    labs(title = "STL Decomposition for ATM2")
grid.arrange(atm2_auto, atm2_stl, ncol = 2)

ATM3:

atm3_auto <- atm_data %>% 
    filter(ATM == "ATM3") %>% 
    autoplot(Cash) +
    ggtitle("Pre-Transformation Autoplot: ATM3")

atm3_stl <- atm_data %>% 
    filter(ATM == "ATM3") %>%
    model(STL(Cash ~ season(window = "periodic"), robust = TRUE)) %>%
    components() %>%
    autoplot() +
    labs(title = "STL Decomposition for ATM3")
grid.arrange(atm3_auto, atm3_stl, ncol = 2)

ATM4:

atm4_auto <- atm_data %>% 
    filter(ATM == "ATM4") %>% 
    autoplot(Cash) +
    ggtitle("Pre-Transformation Autoplot: ATM4")

atm4_stl <- atm_data %>% 
    filter(ATM == "ATM4") %>%
    model(STL(Cash ~ season(window = "periodic"), robust = TRUE)) %>%
    components() %>%
    autoplot() +
    labs(title = "STL Decomposition for ATM4")
grid.arrange(atm4_auto, atm4_stl, ncol = 2)

Project 1 JBurns DATA624

Jonathan Burns

2024-10-26

Project 1 Part A Overview:

Part A: Deliverables