R Markdown

This is an R Markdown document for DATA607/Assignment 3B, by Mehreen Ali Gillani.

Window Functions

Find (or ask an LLM to generate!) a dataset that includes time series for two or more separate items. For example, you could use end-of-day stock or cryptocurrency prices since Jan 1, 2022 for several instruments. Use window functions (in SQL or dplyr) to calculate the year-to-date average and the six-day moving averages for each item.

I have downloaded a US natural disasters dataset from Kaggle. The dataset is available on GitHub: https://raw.githubusercontent.com/mehreengillani/Data607-Assignment3/refs/heads/main/us_disasters_m5.csv
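
The analysis below uses dplyr, tidyr, and ggplot2 pipelines; a minimal setup chunk (assuming these packages are installed) is:

library(dplyr)    # pipes, group_by(), mutate(), summarise()
library(tidyr)    # pivot_wider(), uncount()
library(ggplot2)  # plots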

Step 1: Load and Explore the Data

url <- "https://raw.githubusercontent.com/mehreengillani/Data607-Assignment3/refs/heads/main/us_disasters_m5.csv"
data <- read.csv(url)
head(data)
##   disaster_number state declaration_type     declaration_date incident_type
## 1            2862    TX               FM 2011-02-28T03:12:00Z          Fire
## 2            2863    TX               FM 2011-02-28T03:25:00Z          Fire
## 3            2864    TX               FM 2011-02-28T03:37:00Z          Fire
## 4            2865    TX               FM 2011-02-28T05:55:00Z          Fire
## 5            2867    TX               FM 2011-03-12T00:32:00Z          Fire
## 6            2870    TX               FM 2011-03-12T06:19:00Z          Fire
##                 declaration_title ih_program_declared ia_program_declared
## 1 Willow Creek South Fire Complex                   0                   0
## 2                    Matador Fire                   0                   0
## 3         Tanglewood Fire Complex                   0                   0
## 4           Mitchell Fire Complex                   0                   0
## 5                      Enmin Fire                   0                   0
## 6          Big Trickle Ranch Fire                   0                   0
##   pa_program_declared hm_program_declared  incident_begin_date
## 1                   1                   0 2011-02-27T05:00:00Z
## 2                   1                   0 2011-02-27T05:00:00Z
## 3                   1                   0 2011-02-27T05:00:00Z
## 4                   1                   0 2011-02-27T05:00:00Z
## 5                   1                   0 2011-03-11T05:00:00Z
## 6                   1                   0 2011-03-11T05:00:00Z
##      incident_end_date  fips   designated_area
## 1                 <NA> 48375   Potter (County)
## 2                 <NA> 48345   Motley (County)
## 3                 <NA> 48381  Randall (County)
## 4                 <NA> 48335 Mitchell (County)
## 5 2011-03-17T04:00:00Z 48237     Jack (County)
## 6 2011-03-17T04:00:00Z 48035   Bosque (County)

Let's print the data summary:

summary(data)
##  disaster_number    state           declaration_type   declaration_date  
##  Min.   :1966    Length:627         Length:627         Length:627        
##  1st Qu.:2889    Class :character   Class :character   Class :character  
##  Median :4223    Mode  :character   Mode  :character   Mode  :character  
##  Mean   :3617                                                            
##  3rd Qu.:4255                                                            
##  Max.   :5116                                                            
##  incident_type      declaration_title  ih_program_declared ia_program_declared
##  Length:627         Length:627         Min.   :0.0000      Min.   :0          
##  Class :character   Class :character   1st Qu.:0.0000      1st Qu.:0          
##  Mode  :character   Mode  :character   Median :0.0000      Median :0          
##                                        Mean   :0.2297      Mean   :0          
##                                        3rd Qu.:0.0000      3rd Qu.:0          
##                                        Max.   :1.0000      Max.   :0          
##  pa_program_declared hm_program_declared incident_begin_date incident_end_date 
##  Min.   :0.0000      Min.   :0.0000      Length:627          Length:627        
##  1st Qu.:1.0000      1st Qu.:1.0000      Class :character    Class :character  
##  Median :1.0000      Median :1.0000      Mode  :character    Mode  :character  
##  Mean   :0.9761      Mean   :0.7959                                            
##  3rd Qu.:1.0000      3rd Qu.:1.0000                                            
##  Max.   :1.0000      Max.   :1.0000                                            
##       fips       designated_area   
##  Min.   : 6000   Length:627        
##  1st Qu.:48098   Class :character  
##  Median :48241   Mode  :character  
##  Mean   :44875                     
##  3rd Qu.:48392                     
##  Max.   :55131

Let's print the data dimensions:

dim(data)
## [1] 627  14

Let's print the data variables:

colnames(data)
##  [1] "disaster_number"     "state"               "declaration_type"   
##  [4] "declaration_date"    "incident_type"       "declaration_title"  
##  [7] "ih_program_declared" "ia_program_declared" "pa_program_declared"
## [10] "hm_program_declared" "incident_begin_date" "incident_end_date"  
## [13] "fips"                "designated_area"

Let's print the data structure:

str(data)
## 'data.frame':    627 obs. of  14 variables:
##  $ disaster_number    : int  2862 2863 2864 2865 2867 2870 2870 2881 2881 2882 ...
##  $ state              : chr  "TX" "TX" "TX" "TX" ...
##  $ declaration_type   : chr  "FM" "FM" "FM" "FM" ...
##  $ declaration_date   : chr  "2011-02-28T03:12:00Z" "2011-02-28T03:25:00Z" "2011-02-28T03:37:00Z" "2011-02-28T05:55:00Z" ...
##  $ incident_type      : chr  "Fire" "Fire" "Fire" "Fire" ...
##  $ declaration_title  : chr  "Willow Creek South Fire Complex" "Matador Fire" "Tanglewood Fire Complex" "Mitchell Fire Complex" ...
##  $ ih_program_declared: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ia_program_declared: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pa_program_declared: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ hm_program_declared: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ incident_begin_date: chr  "2011-02-27T05:00:00Z" "2011-02-27T05:00:00Z" "2011-02-27T05:00:00Z" "2011-02-27T05:00:00Z" ...
##  $ incident_end_date  : chr  NA NA NA NA ...
##  $ fips               : int  48375 48345 48381 48335 48237 48035 48425 48329 48135 48479 ...
##  $ designated_area    : chr  "Potter (County)" "Motley (County)" "Randall (County)" "Mitchell (County)" ...

Step 2: Data Cleaning

Check for missing values:

missing_summary <- sapply(data, function(x) sum(is.na(x)))
print("Missing values summary:")
## [1] "Missing values summary:"
print(missing_summary)
##     disaster_number               state    declaration_type    declaration_date 
##                   0                   0                   0                   0 
##       incident_type   declaration_title ih_program_declared ia_program_declared 
##                   0                   0                   0                   0 
## pa_program_declared hm_program_declared incident_begin_date   incident_end_date 
##                   0                   0                   0                  11 
##                fips     designated_area 
##                   0                   0
# Print the total number of NA values
sum(is.na(data))
## [1] 11

There are no missing values except in incident_end_date.
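
To see which records are affected (a sketch), filter on the missing end dates:

data %>%
  filter(is.na(incident_end_date)) %>%
  select(disaster_number, incident_type, incident_begin_date)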

Let's check the number of duplicate rows:

# Count total duplicate rows
sum(duplicated(data))
## [1] 0

Print the unique values of incident_type:

unique(data$incident_type)
## [1] "Fire"            "Snow"            "Tsunami"         "Severe Storm(s)"
## [5] "Other"           "Earthquake"      "Flood"

Format the datetime columns and extract the date component:

# Function to extract date from datetime columns
extract_dates <- function(data) {
  # Identify columns that might be datetime (contain 'T' and 'Z')
  datetime_cols <- names(data)[sapply(data, function(x) {
    any(grepl("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$", x))
  })]
  
  # Extract date for each datetime column
  for(col in datetime_cols) {
    new_col_name <- paste0(col, "_only")
    data[[new_col_name]] <- as.Date(sub("T.*", "", data[[col]]))
  }
  
  return(data)
}
# Apply to the data frame
df <- extract_dates(data)
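
For comparison, the same date columns could be derived with lubridate's ISO 8601 parser (a sketch, assuming the three datetime columns named above and the same "_only" naming convention):

df_alt <- data %>%
  mutate(across(
    c(declaration_date, incident_begin_date, incident_end_date),
    ~ as.Date(lubridate::ymd_hms(.x)),   # parse "YYYY-MM-DDTHH:MM:SSZ" and drop the time; NA stays NA
    .names = "{.col}_only"
  ))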

Let's check the new date columns by printing the head of the updated data frame:

head(df)
##   disaster_number state declaration_type     declaration_date incident_type
## 1            2862    TX               FM 2011-02-28T03:12:00Z          Fire
## 2            2863    TX               FM 2011-02-28T03:25:00Z          Fire
## 3            2864    TX               FM 2011-02-28T03:37:00Z          Fire
## 4            2865    TX               FM 2011-02-28T05:55:00Z          Fire
## 5            2867    TX               FM 2011-03-12T00:32:00Z          Fire
## 6            2870    TX               FM 2011-03-12T06:19:00Z          Fire
##                 declaration_title ih_program_declared ia_program_declared
## 1 Willow Creek South Fire Complex                   0                   0
## 2                    Matador Fire                   0                   0
## 3         Tanglewood Fire Complex                   0                   0
## 4           Mitchell Fire Complex                   0                   0
## 5                      Enmin Fire                   0                   0
## 6          Big Trickle Ranch Fire                   0                   0
##   pa_program_declared hm_program_declared  incident_begin_date
## 1                   1                   0 2011-02-27T05:00:00Z
## 2                   1                   0 2011-02-27T05:00:00Z
## 3                   1                   0 2011-02-27T05:00:00Z
## 4                   1                   0 2011-02-27T05:00:00Z
## 5                   1                   0 2011-03-11T05:00:00Z
## 6                   1                   0 2011-03-11T05:00:00Z
##      incident_end_date  fips   designated_area declaration_date_only
## 1                 <NA> 48375   Potter (County)            2011-02-28
## 2                 <NA> 48345   Motley (County)            2011-02-28
## 3                 <NA> 48381  Randall (County)            2011-02-28
## 4                 <NA> 48335 Mitchell (County)            2011-02-28
## 5 2011-03-17T04:00:00Z 48237     Jack (County)            2011-03-12
## 6 2011-03-17T04:00:00Z 48035   Bosque (County)            2011-03-12
##   incident_begin_date_only incident_end_date_only
## 1               2011-02-27                   <NA>
## 2               2011-02-27                   <NA>
## 3               2011-02-27                   <NA>
## 4               2011-02-27                   <NA>
## 5               2011-03-11             2011-03-17
## 6               2011-03-11             2011-03-17

Step 3: Apply Window Functions

Find the state with the highest count for each incident type:

# Count incidents by state and type
incident_counts <- df %>%
  group_by(state, incident_type) %>%
  summarise(count = n(), .groups = 'drop') %>%
  arrange(desc(count))

# View the top incidents
head(incident_counts, 10)
## # A tibble: 10 × 3
##    state incident_type   count
##    <chr> <chr>           <int>
##  1 TX    Fire              264
##  2 TX    Severe Storm(s)   190
##  3 TX    Flood              94
##  4 CA    Fire               48
##  5 WI    Severe Storm(s)    12
##  6 WI    Snow               11
##  7 CA    Tsunami             3
##  8 CA    Earthquake          2
##  9 TX    Other               2
## 10 CA    Flood               1
# Find the state with the highest count for each incident type
top_states_per_incident <- incident_counts %>%
  group_by(incident_type) %>%
  slice_max(count, n = 1) %>%
  arrange(desc(count))

# View the results
print(top_states_per_incident)
## # A tibble: 7 × 3
## # Groups:   incident_type [7]
##   state incident_type   count
##   <chr> <chr>           <int>
## 1 TX    Fire              264
## 2 TX    Severe Storm(s)   190
## 3 TX    Flood              94
## 4 WI    Snow               11
## 5 CA    Tsunami             3
## 6 CA    Earthquake          2
## 7 TX    Other               2

Based on the data, Texas has the highest counts for fire, severe storms, floods, and the "Other" category, while Wisconsin leads for snow and California for tsunamis and earthquakes.
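
The same "top state per incident type" result can be expressed with an explicit ranking window function (a sketch using dplyr's min_rank()):

top_states_ranked <- incident_counts %>%
  group_by(incident_type) %>%
  mutate(rank_in_type = min_rank(desc(count))) %>%  # window function: rank states within each incident type
  filter(rank_in_type == 1) %>%
  ungroup() %>%
  arrange(desc(count))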

# Create a visualization
ggplot(top_states_per_incident, aes(x = reorder(incident_type, -count), y = count, fill = state)) +
  geom_bar(stat = "identity") +
  labs(title = "States with Highest Counts for Each Incident Type",
       x = "Incident Type",
       y = "Count") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_viridis_d()

Count ih_program_declared by incident_type. ih_program_declared is a binary flag indicating whether the "Individuals and Households" program was declared for the disaster.

# Count ih_program_declared values by incident_type
ih_program_counts <- df %>%
  group_by(incident_type, ih_program_declared) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = ih_program_declared, 
              values_from = count, 
              names_prefix = "ih_",
              values_fill = 0)

# View the results
print(ih_program_counts)
## # A tibble: 7 × 3
##   incident_type    ih_1  ih_0
##   <chr>           <int> <int>
## 1 Earthquake          2     0
## 2 Fire               25   287
## 3 Flood              53    42
## 4 Other               1     1
## 5 Severe Storm(s)    63   139
## 6 Snow                0    11
## 7 Tsunami             0     3
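
The same flag can also be summarised as a share of declarations per incident type (a sketch):

ih_program_share <- df %>%
  group_by(incident_type) %>%
  summarise(share_ih_declared = mean(ih_program_declared), .groups = "drop")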

hm_program_declared is a binary flag indicating whether the "Hazard Mitigation" program was declared for the disaster. declaration_type is one of "DR" (major disaster), "EM" (emergency management), or "FM" (fire management).
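
A quick tally of the declaration types themselves gives context for the table below (a sketch):

table(df$declaration_type)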

# Count hm_program_declared values by incident_type
hm_program_counts <- df %>%
  group_by(incident_type, declaration_type, hm_program_declared) %>%
  summarise(count = n(), .groups = 'drop') %>%
  pivot_wider(names_from = hm_program_declared, 
              values_from = count, 
              names_prefix = "hm_",
              values_fill = 0)

# View the results
print(hm_program_counts)
## # A tibble: 9 × 4
##   incident_type   declaration_type  hm_1  hm_0
##   <chr>           <chr>            <int> <int>
## 1 Earthquake      DR                   2     0
## 2 Fire            DR                 179    12
## 3 Fire            FM                  11   110
## 4 Flood           DR                  92     3
## 5 Other           DR                   1     0
## 6 Other           EM                   0     1
## 7 Severe Storm(s) DR                 201     1
## 8 Snow            DR                  10     1
## 9 Tsunami         DR                   3     0

incident_end_date is the date the incident itself ended. The M5 constraint is that the disaster must have begun before the start of the evaluation period on 2016-05-23 and ended on or after the beginning of the training data range on 2011-01-29. This feature has about 14% NA entries, which were treated under the assumption that start date = end date. I will replace the NA values with the incident begin date.

df <- df %>%
  mutate(incident_end_date_only = if_else(is.na(incident_end_date_only), incident_begin_date_only, incident_end_date_only))

Check that no incident end dates remain missing:

sum(is.na(df$incident_end_date_only))
## [1] 0

Now calculate the year-to-date (YTD) averages and six-day moving averages for each incident type, based on the incident begin and end dates:

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# First, create a complete date sequence for the analysis period
date_range <- seq(min(df$incident_begin_date_only, na.rm = TRUE), 
                 max(df$incident_end_date_only, na.rm = TRUE), 
                 by = "day")

# Create a function to calculate incident duration in days
calculate_duration <- function(begin_date, end_date) {
  as.numeric(difftime(end_date, begin_date, units = "days")) + 1
  
}
# Prepare the data with duration and year information
disaster_daily <- df %>%
  mutate(
    duration_days = calculate_duration(incident_begin_date_only, incident_end_date_only),
    year = year(incident_begin_date_only)
  ) %>%
  # Create a row for each day of each incident
  tidyr::uncount(duration_days, .id = "day_within_incident") %>%
  mutate(
    incident_date = incident_begin_date_only + days(day_within_incident - 1)
  ) %>%
  select(incident_type, incident_date, year)

# Create a complete grid of all dates and incident types
complete_grid <- expand.grid(
  incident_date = date_range,
  incident_type = unique(disaster_daily$incident_type),
  stringsAsFactors = FALSE
) %>%
  mutate(year = year(incident_date))

# Count incidents by type and date
daily_incident_counts <- complete_grid %>%
  left_join(
    disaster_daily %>%
      group_by(incident_date, incident_type) %>%
      summarise(daily_count = n(), .groups = "drop"),
    by = c("incident_date", "incident_type")
  ) %>%
  mutate(daily_count = ifelse(is.na(daily_count), 0, daily_count))

# Calculate YTD average and 6-day moving average for each incident type
incident_avgs <- daily_incident_counts %>%
  group_by(incident_type, year) %>%
  arrange(incident_date) %>%
  mutate(
    # Year-to-date average
    ytd_count = cumsum(daily_count),
    ytd_days = as.numeric(difftime(incident_date, ymd(paste0(year, "-01-01")), units = "days")) + 1,
    ytd_avg = ytd_count / ytd_days,
    
    # 6-day moving average
    ma_6day = rollmean(daily_count, k = 6, fill = NA, align = "right")
  ) %>%
  ungroup()

# View the results
head(incident_avgs)
## # A tibble: 6 × 8
##   incident_date incident_type    year daily_count ytd_count ytd_days ytd_avg
##   <date>        <chr>           <dbl>       <dbl>     <dbl>    <dbl>   <dbl>
## 1 2011-02-01    Fire             2011           0         0       32   0    
## 2 2011-02-01    Snow             2011          11        11       32   0.344
## 3 2011-02-01    Tsunami          2011           0         0       32   0    
## 4 2011-02-01    Severe Storm(s)  2011           0         0       32   0    
## 5 2011-02-01    Other            2011           0         0       32   0    
## 6 2011-02-01    Earthquake       2011           0         0       32   0    
## # ℹ 1 more variable: ma_6day <dbl>
# Create a visualization of the moving averages
library(ggplot2)

ggplot(incident_avgs %>% filter(incident_type %in% unique(incident_type)[1:4]), 
       aes(x = incident_date, y = ma_6day, color = incident_type)) +
  geom_line() +
  labs(title = "6-Day Moving Average of Incidents by Type",
       x = "Date", y = "6-Day Moving Average") +
  facet_wrap(~incident_type, scales = "free_y") +
  theme_minimal()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

# Create a visualization of YTD averages
ggplot(incident_avgs %>% filter(incident_type %in% unique(incident_type)[1:4]), 
       aes(x = incident_date, y = ytd_avg, color = incident_type)) +
  geom_line() +
  labs(title = "Year-to-Date Average of Incidents by Type",
       x = "Date", y = "YTD Average") +
  facet_wrap(~incident_type, scales = "free_y") +
  theme_minimal()

# Create a summary table
incident_summary <- incident_avgs %>%
  group_by(incident_type, year) %>%
  summarise(
    total_incidents = sum(daily_count),
    avg_ytd = mean(ytd_avg, na.rm = TRUE),
    avg_6day_ma = mean(ma_6day, na.rm = TRUE),
    .groups = "drop"
  )

# View the summary
print(incident_summary)
## # A tibble: 42 × 5
##    incident_type  year total_incidents avg_ytd avg_6day_ma
##    <chr>         <dbl>           <dbl>   <dbl>       <dbl>
##  1 Earthquake     2011               0  0           0     
##  2 Earthquake     2012               0  0           0     
##  3 Earthquake     2013               0  0           0     
##  4 Earthquake     2014              32  0.0358      0.0889
##  5 Earthquake     2015               0  0           0     
##  6 Earthquake     2016               0  0           0     
##  7 Fire           2011           26902 50.7        81.3   
##  8 Fire           2012              75  0.119       0.208 
##  9 Fire           2013             163  0.215       0.453 
## 10 Fire           2014             162  0.271       0.45  
## # ℹ 32 more rows
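
Note that ytd_avg above divides the cumulative count by the number of calendar days elapsed since January 1, so for 2011 the days before the first date in the grid (2011-02-01) still enter the denominator. A row-based alternative that takes the cumulative mean of the observed daily counts (a sketch using dplyr's cummean() window function) would be:

incident_avgs_alt <- daily_incident_counts %>%
  group_by(incident_type, year) %>%
  arrange(incident_date, .by_group = TRUE) %>%
  mutate(ytd_avg_row = cummean(daily_count)) %>%  # cumulative mean over the days seen so far
  ungroup()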