This is an R Markdown document for DATA607/Assignment 3B, by Mehreen Ali Gillani. Window Functions
Find (or ask an LLM to generate!) a dataset that includes time series for two or more separate items. For example, you could use end of day stock or cryptocurrency prices since Jan 1, 2022 for several instruments. Use window functions (in SQL or dplyr) to calculate the year-to-date average and the six-day moving averages for each item. I have downloaded US natural disasters dataset from kaggle. This dataset is available on github https://raw.githubusercontent.com/mehreengillani/Data607-Assignment3/refs/heads/main/us_disasters_m5.csv
Step 1: Load and Explore the Data
url='https://raw.githubusercontent.com/mehreengillani/Data607-Assignment3/refs/heads/main/us_disasters_m5.csv'
data<-read.csv(url)
head(data)
## disaster_number state declaration_type declaration_date incident_type
## 1 2862 TX FM 2011-02-28T03:12:00Z Fire
## 2 2863 TX FM 2011-02-28T03:25:00Z Fire
## 3 2864 TX FM 2011-02-28T03:37:00Z Fire
## 4 2865 TX FM 2011-02-28T05:55:00Z Fire
## 5 2867 TX FM 2011-03-12T00:32:00Z Fire
## 6 2870 TX FM 2011-03-12T06:19:00Z Fire
## declaration_title ih_program_declared ia_program_declared
## 1 Willow Creek South Fire Complex 0 0
## 2 Matador Fire 0 0
## 3 Tanglewood Fire Complex 0 0
## 4 Mitchell Fire Complex 0 0
## 5 Enmin Fire 0 0
## 6 Big Trickle Ranch Fire 0 0
## pa_program_declared hm_program_declared incident_begin_date
## 1 1 0 2011-02-27T05:00:00Z
## 2 1 0 2011-02-27T05:00:00Z
## 3 1 0 2011-02-27T05:00:00Z
## 4 1 0 2011-02-27T05:00:00Z
## 5 1 0 2011-03-11T05:00:00Z
## 6 1 0 2011-03-11T05:00:00Z
## incident_end_date fips designated_area
## 1 <NA> 48375 Potter (County)
## 2 <NA> 48345 Motley (County)
## 3 <NA> 48381 Randall (County)
## 4 <NA> 48335 Mitchell (County)
## 5 2011-03-17T04:00:00Z 48237 Jack (County)
## 6 2011-03-17T04:00:00Z 48035 Bosque (County)
Lets print data summary:
summary(data)
## disaster_number state declaration_type declaration_date
## Min. :1966 Length:627 Length:627 Length:627
## 1st Qu.:2889 Class :character Class :character Class :character
## Median :4223 Mode :character Mode :character Mode :character
## Mean :3617
## 3rd Qu.:4255
## Max. :5116
## incident_type declaration_title ih_program_declared ia_program_declared
## Length:627 Length:627 Min. :0.0000 Min. :0
## Class :character Class :character 1st Qu.:0.0000 1st Qu.:0
## Mode :character Mode :character Median :0.0000 Median :0
## Mean :0.2297 Mean :0
## 3rd Qu.:0.0000 3rd Qu.:0
## Max. :1.0000 Max. :0
## pa_program_declared hm_program_declared incident_begin_date incident_end_date
## Min. :0.0000 Min. :0.0000 Length:627 Length:627
## 1st Qu.:1.0000 1st Qu.:1.0000 Class :character Class :character
## Median :1.0000 Median :1.0000 Mode :character Mode :character
## Mean :0.9761 Mean :0.7959
## 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000
## fips designated_area
## Min. : 6000 Length:627
## 1st Qu.:48098 Class :character
## Median :48241 Mode :character
## Mean :44875
## 3rd Qu.:48392
## Max. :55131
Lets print data dimesion:
dim(data)
## [1] 627 14
Lets print data variables:
colnames(data)
## [1] "disaster_number" "state" "declaration_type"
## [4] "declaration_date" "incident_type" "declaration_title"
## [7] "ih_program_declared" "ia_program_declared" "pa_program_declared"
## [10] "hm_program_declared" "incident_begin_date" "incident_end_date"
## [13] "fips" "designated_area"
Lets print Data structure
str(data)
## 'data.frame': 627 obs. of 14 variables:
## $ disaster_number : int 2862 2863 2864 2865 2867 2870 2870 2881 2881 2882 ...
## $ state : chr "TX" "TX" "TX" "TX" ...
## $ declaration_type : chr "FM" "FM" "FM" "FM" ...
## $ declaration_date : chr "2011-02-28T03:12:00Z" "2011-02-28T03:25:00Z" "2011-02-28T03:37:00Z" "2011-02-28T05:55:00Z" ...
## $ incident_type : chr "Fire" "Fire" "Fire" "Fire" ...
## $ declaration_title : chr "Willow Creek South Fire Complex" "Matador Fire" "Tanglewood Fire Complex" "Mitchell Fire Complex" ...
## $ ih_program_declared: int 0 0 0 0 0 0 0 0 0 0 ...
## $ ia_program_declared: int 0 0 0 0 0 0 0 0 0 0 ...
## $ pa_program_declared: int 1 1 1 1 1 1 1 1 1 1 ...
## $ hm_program_declared: int 0 0 0 0 0 0 0 0 0 0 ...
## $ incident_begin_date: chr "2011-02-27T05:00:00Z" "2011-02-27T05:00:00Z" "2011-02-27T05:00:00Z" "2011-02-27T05:00:00Z" ...
## $ incident_end_date : chr NA NA NA NA ...
## $ fips : int 48375 48345 48381 48335 48237 48035 48425 48329 48135 48479 ...
## $ designated_area : chr "Potter (County)" "Motley (County)" "Randall (County)" "Mitchell (County)" ...
Step 2: Data Cleaning Check for missing values:
missing_summary <- sapply(data, function(x) sum(is.na(x)))
print("Missing values summary:")
## [1] "Missing values summary:"
print(missing_summary)
## disaster_number state declaration_type declaration_date
## 0 0 0 0
## incident_type declaration_title ih_program_declared ia_program_declared
## 0 0 0 0
## pa_program_declared hm_program_declared incident_begin_date incident_end_date
## 0 0 0 11
## fips designated_area
## 0 0
#Print total number of NA
sum(is.na(data))
## [1] 11
There is no missing value except in incident_end_date
Let check Number of duplicate rows:
# Count total duplicate rows
sum(duplicated(data))
## [1] 0
Print unique values for incident type
unique(data$incident_type)
## [1] "Fire" "Snow" "Tsunami" "Severe Storm(s)"
## [5] "Other" "Earthquake" "Flood"
Format and extract date from datetime columns
# Function to extract date from datetime columns
extract_dates <- function(data) {
# Identify columns that might be datetime (contain 'T' and 'Z')
datetime_cols <- names(data)[sapply(data, function(x) {
any(grepl("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$", x))
})]
# Extract date for each datetime column
for(col in datetime_cols) {
new_col_name <- paste0(col, "_only")
data[[new_col_name]] <- as.Date(sub("T.*", "", data[[col]]))
}
return(data)
}
# Apply to your data frame
df <- extract_dates(data)
Lets check the updated date columns by printing data head
head(df)
## disaster_number state declaration_type declaration_date incident_type
## 1 2862 TX FM 2011-02-28T03:12:00Z Fire
## 2 2863 TX FM 2011-02-28T03:25:00Z Fire
## 3 2864 TX FM 2011-02-28T03:37:00Z Fire
## 4 2865 TX FM 2011-02-28T05:55:00Z Fire
## 5 2867 TX FM 2011-03-12T00:32:00Z Fire
## 6 2870 TX FM 2011-03-12T06:19:00Z Fire
## declaration_title ih_program_declared ia_program_declared
## 1 Willow Creek South Fire Complex 0 0
## 2 Matador Fire 0 0
## 3 Tanglewood Fire Complex 0 0
## 4 Mitchell Fire Complex 0 0
## 5 Enmin Fire 0 0
## 6 Big Trickle Ranch Fire 0 0
## pa_program_declared hm_program_declared incident_begin_date
## 1 1 0 2011-02-27T05:00:00Z
## 2 1 0 2011-02-27T05:00:00Z
## 3 1 0 2011-02-27T05:00:00Z
## 4 1 0 2011-02-27T05:00:00Z
## 5 1 0 2011-03-11T05:00:00Z
## 6 1 0 2011-03-11T05:00:00Z
## incident_end_date fips designated_area declaration_date_only
## 1 <NA> 48375 Potter (County) 2011-02-28
## 2 <NA> 48345 Motley (County) 2011-02-28
## 3 <NA> 48381 Randall (County) 2011-02-28
## 4 <NA> 48335 Mitchell (County) 2011-02-28
## 5 2011-03-17T04:00:00Z 48237 Jack (County) 2011-03-12
## 6 2011-03-17T04:00:00Z 48035 Bosque (County) 2011-03-12
## incident_begin_date_only incident_end_date_only
## 1 2011-02-27 <NA>
## 2 2011-02-27 <NA>
## 3 2011-02-27 <NA>
## 4 2011-02-27 <NA>
## 5 2011-03-11 2011-03-17
## 6 2011-03-11 2011-03-17
Step 3: Apply Window Functions Find the state with Highest Incident Type
# Count incidents by state and type
incident_counts <- df %>%
group_by(state, incident_type) %>%
summarise(count = n(), .groups = 'drop') %>%
arrange(desc(count))
# View the top incidents
head(incident_counts, 10)
## # A tibble: 10 × 3
## state incident_type count
## <chr> <chr> <int>
## 1 TX Fire 264
## 2 TX Severe Storm(s) 190
## 3 TX Flood 94
## 4 CA Fire 48
## 5 WI Severe Storm(s) 12
## 6 WI Snow 11
## 7 CA Tsunami 3
## 8 CA Earthquake 2
## 9 TX Other 2
## 10 CA Flood 1
# Find the state with the highest count for each incident type
top_states_per_incident <- incident_counts %>%
group_by(incident_type) %>%
slice_max(count, n = 1) %>%
arrange(desc(count))
# View the results
print(top_states_per_incident)
## # A tibble: 7 × 3
## # Groups: incident_type [7]
## state incident_type count
## <chr> <chr> <int>
## 1 TX Fire 264
## 2 TX Severe Storm(s) 190
## 3 TX Flood 94
## 4 WI Snow 11
## 5 CA Tsunami 3
## 6 CA Earthquake 2
## 7 TX Other 2
So based on data Texas has highest incident count. Whereas in Wisconsin highest incident type is snow.
# Create a visualization
ggplot(top_states_per_incident, aes(x = reorder(incident_type, -count), y = count, fill = state)) +
geom_bar(stat = "identity") +
labs(title = "States with Highest Counts for Each Incident Type",
x = "Incident Type",
y = "Count") +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_viridis_d()
Count ih_program_declared_incident_type Binary flag indicating whether the “Individuals and Households program” was declared for this disaster.
# Count ih_program_declared values by incident_type
ih_program_counts <- df %>%
group_by(incident_type, ih_program_declared) %>%
summarise(count = n(), .groups = 'drop') %>%
pivot_wider(names_from = ih_program_declared,
values_from = count,
names_prefix = "ih_",
values_fill = 0)
# View the results
print(ih_program_counts)
## # A tibble: 7 × 3
## incident_type ih_1 ih_0
## <chr> <int> <int>
## 1 Earthquake 2 0
## 2 Fire 25 287
## 3 Flood 53 42
## 4 Other 1 1
## 5 Severe Storm(s) 63 139
## 6 Snow 0 11
## 7 Tsunami 0 3
hm_program_declared Binary flag indicating whether the “Hazard Mitigation program” was declared for this disaster. Declaration Type One of “DR” (= major disaster), “EM” (= emergency management), or “FM” (= “fire management”)
# Count hm_program_declared values by incident_type
hm_program_counts <- df %>%
group_by(incident_type, declaration_type, hm_program_declared) %>%
summarise(count = n(), .groups = 'drop') %>%
pivot_wider(names_from = hm_program_declared,
values_from = count,
names_prefix = "hm_",
values_fill = 0)
# View the results
print(hm_program_counts)
## # A tibble: 9 × 4
## incident_type declaration_type hm_1 hm_0
## <chr> <chr> <int> <int>
## 1 Earthquake DR 2 0
## 2 Fire DR 179 12
## 3 Fire FM 11 110
## 4 Flood DR 92 3
## 5 Other DR 1 0
## 6 Other EM 0 1
## 7 Severe Storm(s) DR 201 1
## 8 Snow DR 10 1
## 9 Tsunami DR 3 0
incident_end_date: Date the incident itself ended. The M5 constraint is that the disaster must have begun before the start of the evaluation time period on 2016-05-23 and ended on or after the begin of the training data range on 2011-01-29. This feature has about 14% NA entries, which were considered under the assumption that start data = end date. I will replace N/A with incident begin date.
df <- df %>%
mutate(incident_end_date_only = if_else(is.na(incident_end_date_only), incident_begin_date_only, incident_end_date_only))
check incident end dates
sum(is.na(df$incident_end_date_only))
## [1] 0
calculate year-to-date (YTD) averages and 6-day moving averages for each incident type based on begin and end dates:
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# First, create a complete date sequence for the analysis period
date_range <- seq(min(df$incident_begin_date_only, na.rm = TRUE),
max(df$incident_end_date_only, na.rm = TRUE),
by = "day")
# Create a function to calculate incident duration in days
calculate_duration <- function(begin_date, end_date) {
as.numeric(difftime(end_date, begin_date, units = "days")) + 1
}
# Prepare the data with duration and year information
disaster_daily <- df %>%
mutate(
duration_days = calculate_duration(incident_begin_date_only, incident_end_date_only),
year = year(incident_begin_date_only)
) %>%
# Create a row for each day of each incident
tidyr::uncount(duration_days, .id = "day_within_incident") %>%
mutate(
incident_date = incident_begin_date_only + days(day_within_incident - 1)
) %>%
select(incident_type, incident_date, year)
# Create a complete grid of all dates and incident types
complete_grid <- expand.grid(
incident_date = date_range,
incident_type = unique(disaster_daily$incident_type),
stringsAsFactors = FALSE
) %>%
mutate(year = year(incident_date))
# Count incidents by type and date
daily_incident_counts <- complete_grid %>%
left_join(
disaster_daily %>%
group_by(incident_date, incident_type) %>%
summarise(daily_count = n(), .groups = "drop"),
by = c("incident_date", "incident_type")
) %>%
mutate(daily_count = ifelse(is.na(daily_count), 0, daily_count))
# Calculate YTD average and 6-day moving average for each incident type
incident_avgs <- daily_incident_counts %>%
group_by(incident_type, year) %>%
arrange(incident_date) %>%
mutate(
# Year-to-date average
ytd_count = cumsum(daily_count),
ytd_days = as.numeric(difftime(incident_date, ymd(paste0(year, "-01-01")), units = "days")) + 1,
ytd_avg = ytd_count / ytd_days,
# 6-day moving average
ma_6day = rollmean(daily_count, k = 6, fill = NA, align = "right")
) %>%
ungroup()
# View the results
head(incident_avgs)
## # A tibble: 6 × 8
## incident_date incident_type year daily_count ytd_count ytd_days ytd_avg
## <date> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2011-02-01 Fire 2011 0 0 32 0
## 2 2011-02-01 Snow 2011 11 11 32 0.344
## 3 2011-02-01 Tsunami 2011 0 0 32 0
## 4 2011-02-01 Severe Storm(s) 2011 0 0 32 0
## 5 2011-02-01 Other 2011 0 0 32 0
## 6 2011-02-01 Earthquake 2011 0 0 32 0
## # ℹ 1 more variable: ma_6day <dbl>
# Create a visualization of the moving averages
library(ggplot2)
ggplot(incident_avgs %>% filter(incident_type %in% unique(incident_type)[1:4]),
aes(x = incident_date, y = ma_6day, color = incident_type)) +
geom_line() +
labs(title = "6-Day Moving Average of Incidents by Type",
x = "Date", y = "6-Day Moving Average") +
facet_wrap(~incident_type, scales = "free_y") +
theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Create a visualization of YTD averages
ggplot(incident_avgs %>% filter(incident_type %in% unique(incident_type)[1:4]),
aes(x = incident_date, y = ytd_avg, color = incident_type)) +
geom_line() +
labs(title = "Year-to-Date Average of Incidents by Type",
x = "Date", y = "YTD Average") +
facet_wrap(~incident_type, scales = "free_y") +
theme_minimal()
# Create a summary table
incident_summary <- incident_avgs %>%
group_by(incident_type, year) %>%
summarise(
total_incidents = sum(daily_count),
avg_ytd = mean(ytd_avg, na.rm = TRUE),
avg_6day_ma = mean(ma_6day, na.rm = TRUE),
.groups = "drop"
)
# View the summary
print(incident_summary)
## # A tibble: 42 × 5
## incident_type year total_incidents avg_ytd avg_6day_ma
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Earthquake 2011 0 0 0
## 2 Earthquake 2012 0 0 0
## 3 Earthquake 2013 0 0 0
## 4 Earthquake 2014 32 0.0358 0.0889
## 5 Earthquake 2015 0 0 0
## 6 Earthquake 2016 0 0 0
## 7 Fire 2011 26902 50.7 81.3
## 8 Fire 2012 75 0.119 0.208
## 9 Fire 2013 163 0.215 0.453
## 10 Fire 2014 162 0.271 0.45
## # ℹ 32 more rows