Tidyverse_Candace

library(readxl)       # Import Excel files



library(dplyr)        # Data manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)        # Data tidying
library(lubridate)    # Date/time handling

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(yardstick)    # Tidy model metrics

# Read Excel file

power_data <- read_excel("Power_Forecast_2014.xlsx")

# Display structure

glimpse(power_data)

## Rows: 12
## Columns: 6
## $ Month        <dttm> 2014-01-01, 2014-02-01, 2014-03-01, 2014-04-01, 2014-05-…
## $ Forecast_KWH <dbl> 9714361, 8181495, 6737578, 5957336, 5727983, 7534009, 792…
## $ Lower_80     <dbl> 8607037, 7041708, 5597792, 4817549, 4588197, 6394222, 678…
## $ Upper_80     <dbl> 10821686, 9321281, 7877365, 7097122, 6867770, 8673795, 90…
## $ Lower_95     <dbl> 8020854, 6438342, 4994426, 4214183, 3984831, 5790856, 617…
## $ Upper_95     <dbl> 11407868, 9924648, 8480731, 7700488, 7471136, 9277162, 96…

# Read with explicit date handling
atm_data <- read_excel("ATM624Data.xlsx") %>%
  mutate(
    # Force proper date conversion from Excel numeric format
    DATE = as.Date(as.numeric(DATE), origin = "1899-12-30")
  )

# Check result
cat("Date range:", as.character(min(atm_data$DATE, na.rm = TRUE)), "to", 
    as.character(max(atm_data$DATE, na.rm = TRUE)), "\n")

## Date range: 2009-05-01 to 2010-05-14

# Column names
cat("Column Names:\n")

## Column Names:

print(names(atm_data))

## [1] "DATE" "ATM"  "Cash"

cat("\n")

head(atm_data, 10)

## # A tibble: 10 × 3
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-01 ATM2    107
##  3 2009-05-02 ATM1     82
##  4 2009-05-02 ATM2     89
##  5 2009-05-03 ATM1     85
##  6 2009-05-03 ATM2     90
##  7 2009-05-04 ATM1     90
##  8 2009-05-04 ATM2     55
##  9 2009-05-05 ATM1     99
## 10 2009-05-05 ATM2     79

sum(is.na(atm_data$Cash))

## [1] 19

# Dataset Overview
cat("Overview of the Dataset:\n")

## Overview of the Dataset:

glimpse(atm_data, 10)

## Rows: 1,474
## Columns: 3
## $ DATE <date> …
## $ ATM  <chr> …
## $ Cash <dbl> …

#Missing data assessment
# Unique ATM identifiers
unique_atms <- unique(atm_data$ATM)
print(unique_atms)

## [1] "ATM1" "ATM2" NA     "ATM3" "ATM4"

cat("\nNumber of ATM machines:", length(unique_atms[!is.na(unique_atms)]), "\n\n")

## 
## Number of ATM machines: 4

# Missing values by ATM
cat("Missing Cash values by ATM:\n")

## Missing Cash values by ATM:

missing_by_atm <- atm_data %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Missing_Cash = sum(is.na(Cash)),
    Missing_Percentage = round(sum(is.na(Cash)) / n() * 100, 2)
  )
print(missing_by_atm)

## # A tibble: 5 × 4
##   ATM   Total_Records Missing_Cash Missing_Percentage
##   <chr>         <int>        <int>              <dbl>
## 1 ATM1            365            3               0.82
## 2 ATM2            365            2               0.55
## 3 ATM3            365            0               0   
## 4 ATM4            365            0               0   
## 5 <NA>             14           14             100

cat("\n")

# Missing values by ATM - sorted by most missing to least
missing_by_atm <- atm_data %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Missing_Cash = sum(is.na(Cash)),
    Missing_Pct = round(sum(is.na(Cash)) / n() * 100, 2)
  ) %>%
  arrange(desc(Missing_Cash))  # ← Highest missing first


# Check where missing values actually are

cat("Missing Values by Column:\n")

## Missing Values by Column:

cat("Cash:", sum(is.na(atm_data$Cash)), "\n")

## Cash: 19

cat("ATM:", sum(is.na(atm_data$ATM)), "\n")

## ATM: 14

cat("DATE:", sum(is.na(atm_data$DATE)), "\n\n")

## DATE: 0

# Cross-check: rows with missing ATM but present Cash
atm_data %>%
  filter(is.na(ATM)) %>%
  summarise(
    Count = n(),
    Has_Cash = sum(!is.na(Cash)),
    Has_Date = sum(!is.na(DATE))
  )

## # A tibble: 1 × 3
##   Count Has_Cash Has_Date
##   <int>    <int>    <int>
## 1    14        0       14

# Check for Duplicates 
duplicates <- atm_data %>%
  group_by(DATE, ATM) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  filter(Count > 1)

cat("Duplicate DATE-ATM combinations:", nrow(duplicates), "\n")

## Duplicate DATE-ATM combinations: 0

if(nrow(duplicates) > 0) {
  cat("\nDuplicate records found:\n")
  print(duplicates)
  cat("\n")
} else {
  cat("No duplicate DATE-ATM combinations found.\n\n")
}

## No duplicate DATE-ATM combinations found.

# Date range by ATM
cat("Date Range by ATM:\n")

## Date Range by ATM:

date_range_by_atm <- atm_data %>%
  filter(!is.na(ATM)) %>%
  group_by(ATM) %>%
  summarise(
    Start_Date = min(DATE, na.rm = TRUE),
    End_Date = max(DATE, na.rm = TRUE),
    Days_Span = as.numeric(difftime(max(DATE, na.rm = TRUE), 
                                    min(DATE, na.rm = TRUE), 
                                    units = "days"))
  )
print(date_range_by_atm)

## # A tibble: 4 × 4
##   ATM   Start_Date End_Date   Days_Span
##   <chr> <date>     <date>         <dbl>
## 1 ATM1  2009-05-01 2010-04-30       364
## 2 ATM2  2009-05-01 2010-04-30       364
## 3 ATM3  2009-05-01 2010-04-30       364
## 4 ATM4  2009-05-01 2010-04-30       364

# --- Sample Size per ATM ---

sample_size <- atm_data %>%
  filter(!is.na(ATM)) %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Non_Missing_Cash = sum(!is.na(Cash)),
    Missing_Cash = sum(is.na(Cash))
  )
print(sample_size)

## # A tibble: 4 × 4
##   ATM   Total_Records Non_Missing_Cash Missing_Cash
##   <chr>         <int>            <int>        <int>
## 1 ATM1            365              362            3
## 2 ATM2            365              363            2
## 3 ATM3            365              365            0
## 4 ATM4            365              365            0

cat("\n")

# --- Summary Statistics for Each ATM ---


summary_stats <- atm_data %>%
  filter(!is.na(ATM) & !is.na(Cash)) %>%
  group_by(ATM) %>%
  summarise(
    Count = n(),
    Mean = round(mean(Cash, na.rm = TRUE), 2),
    Median = round(median(Cash, na.rm = TRUE), 2),
    Std_Dev = round(sd(Cash, na.rm = TRUE), 2),
    Min = round(min(Cash, na.rm = TRUE), 2),
    Max = round(max(Cash, na.rm = TRUE), 2),
    Q1 = round(quantile(Cash, 0.25, na.rm = TRUE), 2),
    Q3 = round(quantile(Cash, 0.75, na.rm = TRUE), 2)
  )
print(summary_stats)

## # A tibble: 4 × 9
##   ATM   Count   Mean Median Std_Dev   Min    Max    Q1    Q3
##   <chr> <int>  <dbl>  <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1 ATM1    362  83.9     91    36.7   1      180   73    108 
## 2 ATM2    363  62.6     67    38.9   0      147   25.5   93 
## 3 ATM3    365   0.72     0     7.94  0       96    0      0 
## 4 ATM4    365 474.     404.  651.    1.56 10920. 124.   705.

cat("\n")

#is.na(atm_data)

```

Tidyverse_Candace

Candace Grant

2025-10-24