library(readxl) # Import Excel files
library(dplyr) # Data manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) # Data tidying
library(lubridate) # Date/time handling
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(yardstick) # Tidy model metrics
# Read Excel file
power_data <- read_excel("Power_Forecast_2014.xlsx")
# Display structure
glimpse(power_data)
## Rows: 12
## Columns: 6
## $ Month <dttm> 2014-01-01, 2014-02-01, 2014-03-01, 2014-04-01, 2014-05-…
## $ Forecast_KWH <dbl> 9714361, 8181495, 6737578, 5957336, 5727983, 7534009, 792…
## $ Lower_80 <dbl> 8607037, 7041708, 5597792, 4817549, 4588197, 6394222, 678…
## $ Upper_80 <dbl> 10821686, 9321281, 7877365, 7097122, 6867770, 8673795, 90…
## $ Lower_95 <dbl> 8020854, 6438342, 4994426, 4214183, 3984831, 5790856, 617…
## $ Upper_95 <dbl> 11407868, 9924648, 8480731, 7700488, 7471136, 9277162, 96…
# Read with explicit date handling
atm_data <- read_excel("ATM624Data.xlsx") %>%
mutate(
# Force proper date conversion from Excel numeric format
DATE = as.Date(as.numeric(DATE), origin = "1899-12-30")
)
# Check result
cat("Date range:", as.character(min(atm_data$DATE, na.rm = TRUE)), "to",
as.character(max(atm_data$DATE, na.rm = TRUE)), "\n")
## Date range: 2009-05-01 to 2010-05-14
# Column names
cat("Column Names:\n")
## Column Names:
print(names(atm_data))
## [1] "DATE" "ATM" "Cash"
cat("\n")
head(atm_data, 10)
## # A tibble: 10 × 3
## DATE ATM Cash
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
## 7 2009-05-04 ATM1 90
## 8 2009-05-04 ATM2 55
## 9 2009-05-05 ATM1 99
## 10 2009-05-05 ATM2 79
sum(is.na(atm_data$Cash))
## [1] 19
# Dataset Overview
cat("Overview of the Dataset:\n")
## Overview of the Dataset:
glimpse(atm_data, 10)
## Rows: 1,474
## Columns: 3
## $ DATE <date> …
## $ ATM <chr> …
## $ Cash <dbl> …
#Missing data assessment
# Unique ATM identifiers
unique_atms <- unique(atm_data$ATM)
print(unique_atms)
## [1] "ATM1" "ATM2" NA "ATM3" "ATM4"
cat("\nNumber of ATM machines:", length(unique_atms[!is.na(unique_atms)]), "\n\n")
##
## Number of ATM machines: 4
# Missing values by ATM
cat("Missing Cash values by ATM:\n")
## Missing Cash values by ATM:
missing_by_atm <- atm_data %>%
group_by(ATM) %>%
summarise(
Total_Records = n(),
Missing_Cash = sum(is.na(Cash)),
Missing_Percentage = round(sum(is.na(Cash)) / n() * 100, 2)
)
print(missing_by_atm)
## # A tibble: 5 × 4
## ATM Total_Records Missing_Cash Missing_Percentage
## <chr> <int> <int> <dbl>
## 1 ATM1 365 3 0.82
## 2 ATM2 365 2 0.55
## 3 ATM3 365 0 0
## 4 ATM4 365 0 0
## 5 <NA> 14 14 100
cat("\n")
# Missing values by ATM - sorted by most missing to least
missing_by_atm <- atm_data %>%
group_by(ATM) %>%
summarise(
Total_Records = n(),
Missing_Cash = sum(is.na(Cash)),
Missing_Pct = round(sum(is.na(Cash)) / n() * 100, 2)
) %>%
arrange(desc(Missing_Cash)) # ← Highest missing first
# Check where missing values actually are
cat("Missing Values by Column:\n")
## Missing Values by Column:
cat("Cash:", sum(is.na(atm_data$Cash)), "\n")
## Cash: 19
cat("ATM:", sum(is.na(atm_data$ATM)), "\n")
## ATM: 14
cat("DATE:", sum(is.na(atm_data$DATE)), "\n\n")
## DATE: 0
# Cross-check: rows with missing ATM but present Cash
atm_data %>%
filter(is.na(ATM)) %>%
summarise(
Count = n(),
Has_Cash = sum(!is.na(Cash)),
Has_Date = sum(!is.na(DATE))
)
## # A tibble: 1 × 3
## Count Has_Cash Has_Date
## <int> <int> <int>
## 1 14 0 14
# Check for Duplicates
duplicates <- atm_data %>%
group_by(DATE, ATM) %>%
summarise(Count = n(), .groups = 'drop') %>%
filter(Count > 1)
cat("Duplicate DATE-ATM combinations:", nrow(duplicates), "\n")
## Duplicate DATE-ATM combinations: 0
if(nrow(duplicates) > 0) {
cat("\nDuplicate records found:\n")
print(duplicates)
cat("\n")
} else {
cat("No duplicate DATE-ATM combinations found.\n\n")
}
## No duplicate DATE-ATM combinations found.
# Date range by ATM
cat("Date Range by ATM:\n")
## Date Range by ATM:
date_range_by_atm <- atm_data %>%
filter(!is.na(ATM)) %>%
group_by(ATM) %>%
summarise(
Start_Date = min(DATE, na.rm = TRUE),
End_Date = max(DATE, na.rm = TRUE),
Days_Span = as.numeric(difftime(max(DATE, na.rm = TRUE),
min(DATE, na.rm = TRUE),
units = "days"))
)
print(date_range_by_atm)
## # A tibble: 4 × 4
## ATM Start_Date End_Date Days_Span
## <chr> <date> <date> <dbl>
## 1 ATM1 2009-05-01 2010-04-30 364
## 2 ATM2 2009-05-01 2010-04-30 364
## 3 ATM3 2009-05-01 2010-04-30 364
## 4 ATM4 2009-05-01 2010-04-30 364
# --- Sample Size per ATM ---
sample_size <- atm_data %>%
filter(!is.na(ATM)) %>%
group_by(ATM) %>%
summarise(
Total_Records = n(),
Non_Missing_Cash = sum(!is.na(Cash)),
Missing_Cash = sum(is.na(Cash))
)
print(sample_size)
## # A tibble: 4 × 4
## ATM Total_Records Non_Missing_Cash Missing_Cash
## <chr> <int> <int> <int>
## 1 ATM1 365 362 3
## 2 ATM2 365 363 2
## 3 ATM3 365 365 0
## 4 ATM4 365 365 0
cat("\n")
# --- Summary Statistics for Each ATM ---
summary_stats <- atm_data %>%
filter(!is.na(ATM) & !is.na(Cash)) %>%
group_by(ATM) %>%
summarise(
Count = n(),
Mean = round(mean(Cash, na.rm = TRUE), 2),
Median = round(median(Cash, na.rm = TRUE), 2),
Std_Dev = round(sd(Cash, na.rm = TRUE), 2),
Min = round(min(Cash, na.rm = TRUE), 2),
Max = round(max(Cash, na.rm = TRUE), 2),
Q1 = round(quantile(Cash, 0.25, na.rm = TRUE), 2),
Q3 = round(quantile(Cash, 0.75, na.rm = TRUE), 2)
)
print(summary_stats)
## # A tibble: 4 × 9
## ATM Count Mean Median Std_Dev Min Max Q1 Q3
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1 362 83.9 91 36.7 1 180 73 108
## 2 ATM2 363 62.6 67 38.9 0 147 25.5 93
## 3 ATM3 365 0.72 0 7.94 0 96 0 0
## 4 ATM4 365 474. 404. 651. 1.56 10920. 124. 705.
cat("\n")
#is.na(atm_data)
```