Research Question: After quantifying the success of response time dependent on size for U.S. wildfires, can this maitenance rating be predicted by an array of other factors pertaining to the wildfire?

The data used to address this research question comes from the U.S. Department of Agriculture’s Forest Service. A full link to descriptions of the variables and other information can be found at the conclusion of this report.

Libraries

library(readr)
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ purrr     1.0.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(mice)

## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(dplyr)
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

wildfire_data <- read_csv("~/Desktop/Stats Learning/Final Project/National_Interagency_Fire_Occurrence_Sixth_Edition_1992-2020_(Feature_Layer).csv")

## Rows: 2303566 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (30): FPA_ID, SOURCE_SYSTEM_TYPE, SOURCE_SYSTEM, NWCG_REPORTING_AGENCY, ...
## dbl (10): X, Y, OBJECTID, FOD_ID, FIRE_YEAR, DISCOVERY_DOY, CONT_DOY, FIRE_S...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Unnecessary variable removal
fire_data_clean <- wildfire_data |>
  dplyr::select(OBJECTID, SOURCE_SYSTEM_TYPE, NWCG_REPORTING_AGENCY, NWCG_REPORTING_UNIT_NAME, SOURCE_REPORTING_UNIT, SOURCE_REPORTING_UNIT_NAME, LOCAL_FIRE_REPORT_ID, FIRE_NAME, FIRE_YEAR, DISCOVERY_DOY, DISCOVERY_TIME, NWCG_CAUSE_CLASSIFICATION, NWCG_GENERAL_CAUSE, NWCG_CAUSE_AGE_CATEGORY, CONT_DOY, CONT_TIME, FIRE_SIZE, FIRE_SIZE_CLASS, LATITUDE, LONGITUDE, OWNER_DESCR, STATE, COUNTY)

fire_data_times <- fire_data_clean |>
  filter(is.na(CONT_DOY)==FALSE) |>
  filter(is.na(DISCOVERY_DOY)==FALSE)

# No fires are just under a year long but have same-day endpoints
fire_data_clean |>
  filter(DISCOVERY_DOY==CONT_DOY) |>
  filter(DISCOVERY_TIME>CONT_TIME)

## # A tibble: 0 × 23
## # ℹ 23 variables: OBJECTID <dbl>, SOURCE_SYSTEM_TYPE <chr>,
## #   NWCG_REPORTING_AGENCY <chr>, NWCG_REPORTING_UNIT_NAME <chr>,
## #   SOURCE_REPORTING_UNIT <chr>, SOURCE_REPORTING_UNIT_NAME <chr>,
## #   LOCAL_FIRE_REPORT_ID <chr>, FIRE_NAME <chr>, FIRE_YEAR <dbl>,
## #   DISCOVERY_DOY <dbl>, DISCOVERY_TIME <chr>, NWCG_CAUSE_CLASSIFICATION <chr>,
## #   NWCG_GENERAL_CAUSE <chr>, NWCG_CAUSE_AGE_CATEGORY <chr>, CONT_DOY <dbl>,
## #   CONT_TIME <chr>, FIRE_SIZE <dbl>, FIRE_SIZE_CLASS <chr>, LATITUDE <dbl>, …

# Fires with a length of "0" days simply means less than one day
fire_data_times <- fire_data_times |>
  mutate(fire_duration = case_when(DISCOVERY_DOY==CONT_DOY ~ 0,
                                   CONT_DOY>DISCOVERY_DOY ~ CONT_DOY-DISCOVERY_DOY,
                                   FIRE_YEAR%%4 == 0 ~ 366+CONT_DOY-DISCOVERY_DOY,
                                   DISCOVERY_DOY>CONT_DOY ~ 365+CONT_DOY-DISCOVERY_DOY))

# Many fires were approximated to 0.1, 1, 0.5 acres (in that order)
fire_data_times |>
  ggplot() +
  geom_density(aes(x = log(FIRE_SIZE))) +
  labs(title = "Wildfire Sizes in the U.S. 1992-2020", x = "Size of Fire (Acres)", y = "Frequency", caption = "Data: USFS")

# This makes distinct lines in the comparison scatterplot between fire size and duration
fire_data_times |>
  sample_n(10000) |>
  ggplot() +
  geom_point(aes(x = log(FIRE_SIZE), y = fire_duration, )) +
  labs(title = "Wildfire Sizes and Timeframes in the U.S. 1992-2020", x = "Size of Fire (Acres)", y = "Duration of Fire (Days)", caption = "Data: USFS")

# To see the general trend of fire size and duration relationship, we can group the size variable and average it across all groups
fire_distinct_sizes <- fire_data_times |>
  group_by(FIRE_SIZE) |>
  mutate(avg_duration = mean(fire_duration)) |>
  ungroup()
fire_distinct_sizes |>
  sample_n(10000) |>
  ggplot() +
  geom_point(aes(x = log(FIRE_SIZE), y = avg_duration)) +
  labs(title = "Average Duration of U.S. Wildifres by Size 1992-2020", x = "Size of Fire (Acres)", y = "Mean Duration of Fire (Days)", caption = "Data: USFS")

# Using regression to find the expected duration of a fire based on size
wildfire_regression <- fire_data_times |>
  mutate(log_fire = log(FIRE_SIZE)) |>
  dplyr::select(log_fire, fire_duration)
wildfire_lm <- lm(fire_duration ~ poly(log_fire, 4), wildfire_regression)
# Our residuals now convey the "success" of a wildfire's maintenance:
# Positive residual = took more time than average/expected for that size
# Negative residual = more efficient than average/expected for that size
fire_data_times <- fire_data_times |>
  mutate(resid = wildfire_lm$residuals)
# Polynomial regression of degree four was discerned as best via plotting the respective degrees' residuals
# Although admittedly not ideal (there are more higher positive than lower negative residuals) this model was the most effective in capturing the relationship between fire size and duration
wildfire_lm$residuals |>
 plot() |>
 labs(title = "Quantified Wildfire Response via Residuals", x = "Case of Wildfire (By Indices)", y = "Regression Residual", caption = "Data: USFS")

## [[1]]
## NULL
## 
## $x
## [1] "Case of Wildfire (By Indices)"
## 
## $y
## [1] "Regression Residual"
## 
## $title
## [1] "Quantified Wildfire Response via Residuals"
## 
## $caption
## [1] "Data: USFS"
## 
## attr(,"class")
## [1] "labels"

set.seed(101)
ggplot(wildfire_regression |> sample_n(10000) , aes(x = log_fire, y = fire_duration)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 4),
                                        se = TRUE) +
  labs(title = "Quantified Wildfire Response Regression", x = "Size of Fire (acres)", y = "Duration of Fire (Days)", caption = "Data: USFS")

Using these residual values as a quantifier for fire management success, we can attempt to predict these numerical representations with other variables. In this manner, discerning what variables outside of a fire’s direct size and duration contribute to the efficiency in combating it can be analyzed. The following variables will be assessed on their relationship with successful fire management:

LATITUDE & LONGITUDE: coordinates corresponding to where the wildfire was observed (two separate variables, both in decimal degrees). Does location (or either specific aspect of location) permit or hinder firefighting services?
OWNER_DESCR: Name of primary owner or entity responsible for managing the land at the point of origin of the fire at the time of the incident. Does the one performing the land maintenance signify response adequacy?
LOCAL_FIRE_REPORT_ID: Number or code that uniquely identifies an incident report for a particular reporting unit and a particular calendar year. Do fires happening later in a local area’s year mean more seasonal experience?
FIRE_YEAR: Calendar year in which the fire was discovered or confirmed to exist. Do firefighting abilities increase/decrease overtime?
NWCG_GENERAL_CAUSE: Event or circumstance that started a fire or set the stage for the occurrence. Are fires caused by certain circumstances harder to fight?

Linear Discriminant Analysis

Use of LDA is applicable due to the presence of two categorical variables in our dataset: cause of fire and land-owning management organization. It is also a comparably fast method that will work more efficiently with this expansive dataset.

wildfire_lda <- fire_data_times |>
  dplyr::select(LATITUDE, LONGITUDE, OWNER_DESCR, LOCAL_FIRE_REPORT_ID, FIRE_YEAR, NWCG_GENERAL_CAUSE, resid)

# After observing which variables have frequent "NA" values, I decided to remove the local fire report ID as a variables in the LDA dataset. The question it contributes to answering (does a certain area's ability to fight wildfires shift with yearly experience) can be answered in conglomeration of latitude-longitude and calendar year data pairings.
md.pattern(wildfire_lda)

##        LATITUDE LONGITUDE OWNER_DESCR FIRE_YEAR NWCG_GENERAL_CAUSE resid
## 459390        1         1           1         1                  1     1
## 949363        1         1           1         1                  1     1
##               0         0           0         0                  0     0
##        LOCAL_FIRE_REPORT_ID       
## 459390                    1      0
## 949363                    0      1
##                      949363 949363

wildfire_lda <- wildfire_lda |>
  dplyr::select(-LOCAL_FIRE_REPORT_ID)

# We can then fill-in for the missing values through the mice function as all variables now have at most ~10% of their data missing
wildfire_lda_full <- mice(wildfire_lda,
                         method = "lda",
                         seed = 101)

## 
##  iter imp variable
##   1   1
##   1   2
##   1   3
##   1   4
##   1   5
##   2   1
##   2   2
##   2   3
##   2   4
##   2   5
##   3   1
##   3   2
##   3   3
##   3   4
##   3   5
##   4   1
##   4   2
##   4   3
##   4   4
##   4   5
##   5   1
##   5   2
##   5   3
##   5   4
##   5   5

## Warning: Number of logged events: 2

wildfire_lda_final <- complete(wildfire_lda_full)

# The following takes exceedingly long to run, as well as is over the maximum storage size
# lda_model <- lda(residuals ~ ., data = wildfire_lda_final)

set.seed(101)
training_data_rows <- sample(1:nrow(wildfire_lda_final), size = nrow(wildfire_lda_final)/2)

training_fire <- wildfire_lda_final[training_data_rows, ] |>
  sample_n(10000)
testing_fire <- wildfire_lda_final[-training_data_rows, ] |>
  sample_n(10000)

fire_lda_model <- lda(resid ~ ., data = training_fire)
fire_preds <- predict(fire_lda_model, testing_fire)

# negative diff: thought the fire's maintenance would have been better than it actually was
# positive diff: thought the fire's maintenance would have been worse than it actually was
results <- as.data.frame(testing_fire$resid)
results <- results |>
  mutate(predictions = as.numeric(as.character(fire_preds$class))) |>
  mutate(diff = `testing_fire$resid`-predictions)

# Pretty accurate predictions!! But why...?
results |>
  ggplot() +
  geom_density(aes(x = diff)) +
  labs(title = "Wildfire Regression Accuracy", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")

# We can make lda models for each variable and see which ones have the lowest overall (absolute value) of difference
# Including all variables: 3.836
results |>
  summarise(mean(abs(diff)))

##   mean(abs(diff))
## 1        3.835569

# Latitude: 1.165
latitude_lda_model <- lda(resid ~ LATITUDE, data = training_fire)
lat_preds <- predict(latitude_lda_model, testing_fire)
lat_results <- as.data.frame(testing_fire$resid)
lat_results <- lat_results |>
  mutate(predictions = as.numeric(as.character(lat_preds$class))) |>
  mutate(diff = `testing_fire$resid`-predictions)
lat_results |>
  ggplot() +
  geom_density(aes(x = diff)) +
  labs(title = "Wildfire Regression Accuracy (Latitude)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")

lat_results |>
  summarise(mean(abs(diff)))

##   mean(abs(diff))
## 1        1.164935

# Longitude: 1.102, right-skewed (underestimating response capabilities)
longitude_lda_model <- lda(resid ~ LONGITUDE, data = training_fire)
long_preds <- predict(longitude_lda_model, testing_fire)
long_results <- as.data.frame(testing_fire$resid)
long_results <- long_results |>
  mutate(predictions = as.numeric(as.character(long_preds$class))) |>
  mutate(diff = `testing_fire$resid`-predictions)
long_results |>
  ggplot() +
  geom_density(aes(x = diff)) +
  labs(title = "Wildfire Regression Accuracy (Longitude)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")

long_results |>
  summarise(mean(abs(diff)))

##   mean(abs(diff))
## 1        1.101878

# Land Owner: 2.069
owner_lda_model <- lda(resid ~ OWNER_DESCR, data = training_fire)
owner_preds <- predict(owner_lda_model, testing_fire)
owner_results <- as.data.frame(testing_fire$resid)
owner_results <- owner_results |>
  mutate(predictions = as.numeric(as.character(owner_preds$class))) |>
  mutate(diff = `testing_fire$resid`-predictions)
owner_results |>
  ggplot() +
  geom_density(aes(x = diff)) +
  labs(title = "Wildfire Regression Accuracy (Land Owner)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")

owner_results |>
  summarise(mean(abs(diff)))

##   mean(abs(diff))
## 1        2.069422

# Fire Year: 1.103, right-skewed (underestimating response capabilities)
year_lda_model <- lda(resid ~ FIRE_YEAR, data = training_fire)
year_preds <- predict(year_lda_model, testing_fire)
year_results <- as.data.frame(testing_fire$resid)
year_results <- year_results |>
  mutate(predictions = as.numeric(as.character(year_preds$class))) |>
  mutate(diff = `testing_fire$resid`-predictions)
year_results |>
  ggplot() +
  geom_density(aes(x = diff)) +
  labs(title = "Wildfire Regression Accuracy (Fire Year)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")

year_results |>
  summarise(mean(abs(diff)))

##   mean(abs(diff))
## 1        1.102678

# Fire Cause: 1.527, right-skewed (underestimating response capabilities)
cause_lda_model <- lda(resid ~ NWCG_GENERAL_CAUSE, data = training_fire)
cause_preds <- predict(cause_lda_model, testing_fire)
cause_results <- as.data.frame(testing_fire$resid)
cause_results <- cause_results |>
  mutate(predictions = as.numeric(as.character(cause_preds$class))) |>
  mutate(diff = `testing_fire$resid`-predictions)
cause_results |>
  ggplot() +
  geom_density(aes(x = diff)) +
  labs(title = "Wildfire Regression Accuracy (Fire Cause)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")

cause_results |>
  summarise(mean(abs(diff)))

##   mean(abs(diff))
## 1        1.536508

variable_analysis <- as.data.frame(fire_lda_model$scaling)
variable_analysis <- variable_analysis |>
  mutate(contributions = (LD1+LD2+LD3+LD4+LD5+LD6+LD7+LD8+LD9+LD10+LD11+LD12+LD13+LD14+LD15+LD16+LD17+LD18+LD19+LD20+LD21+LD22+LD23+LD24+LD25+LD26+LD27+LD28)/28)
variable_analysis |>
  arrange(contributions)

Summary of Variable Importance:

Numerical: Longitude is the most telling, then year of the fire, then latitude. It is important to recognize how close all these three variables are to predicting the success of fire response when put into an LDA model by themselves. However, fire year and longitude both are skewed right in their difference variables, indicating that there is a pattern of underestimating response capabilities if just relying on these variables individually. It is also important to note how many smaller fires were reported in disproportionate comparison to larger fires reported. This could make a model who is effective at predicting response capabilities for smaller fires one that is overall marketable as an “effective model”.

Categorical:

Signifies likelihood of higher efficiency: (all in decreasing order of importance) - Land Owner: County, undefined federal, state/private, other federal, and state organizations are the ones in charge of the maintenance. Basically, if a governmental organization is the one doing the upkeep, the area is set up for better fire maintenance prospects. - Fire Cause: Recreation & ceremony, equipment & vehicle use, and fireworks. If there are humans present doing some activity with purposeful use (that is maybe unique in its time or with a lot of others present OR that those individuals are capable of handling) then the area is more likely to have a quick response. There also doesn’t necessary need to be a similarity across all the fire causes that lead to quicker cleanups, and this analysis could simply be pointing out different avenues that all conclude with effective wildfire responses.

Signifies likelihood of lower efficiency: (all in decreasing importance order, again) - Fire Cause: Firearms & explosive use, “other causes”, railroad operations & maintenance, misuse of fire by a minor, and power generation/transmission/distribution. These categories are either more destructive, in the absence of human oversight, or are unclassified aspects potentially inflicted by under-prepared parties (minors). If the reason for a fire being lit is absent of individuals there to report on it then it makes sense the reponse will be time-wise not ideal. - Land Owner: Fish & Wildlife Service and Tribal organizations. These two organizations may not have policies in line of what to do when a wildfire arises (FWS) may not have direct access to the same resources (Tribal organizations) and help that governmental organizations would defaultedly have.

This research question could also be evaluated using the following model:

New Supervised Learning Method: Support Vector Regression

Support Vector Regression is a Support Vector Machine based approach utilized to craft a function that aims to best predict a continuous, numerical output value. This boils down to receiving a dataset, recognizing patterns in it, and then communicating a formula to predict for a new case potentially in the dataset (variable-wise) what the prediction of the looking-to-be-solved-for type is. In this case the SVR model would hope to output a function which could determine the quantified basis of wildfire response time dependent on the set of factors outlined above in the LDA section.

In typical SVM supervised learning methods, a hyperplane (a multi-dimensional plane) is optimized to best encapsulate the divide between classes. This entails finding the maximum distance between points via margins to this hyperplane that are lengthened by recognition of patterns in the variable data. The naming of this algorithm comes from these vectors that span the margins, the “support vectors”. Contextually put, this would translate to finding a reason of combined set of reasons that equate to why wildfires are more quickly or slowly cleaned up. Further analysis of the wildfire dataset could reveal more ways to expand on this hyperplane divider.

Since variables of diverse types aren’t inherently “combinable”, they need to be set into a higher-dimensional space that can encapsulate their differences. When vectors aren’t able to be mapped into this higher-dimensional space naturally, kernel transformations are utilized. This is at large a conglomeration approach to bring the data to the same level for discussion. This occurs when relationships in the data aren’t linearly separable, meaning that a line, plane, or hyperplane can’t adequately classify all the datapoints correctly (there is some overlap of classes within the variable ranges). We would see this in our data when a few wildfires managed by the same land ownership organization were started by a variety of causes and also occurred in varying years (One example of many). There is no concrete relationship, so kernel transformations are necessary to compute vector-products that maps these interactions into a multi-dimensional space where division of patterns is possible.

SVR holds many similarities to SVM, it is simply geared toward maximizing an encompassing regression model for the given data as opposed to a hyperplane-like divider. The independent variables here still serve as the factors potentially able to predict the continuous, numerical quantifier of success in containing a wildfire.

To do this in r, one should use the function SVR() and indicate that the kernel attribute is either linear, polynomial, or of the radial basis function (RBF) type. This is located in the package “e1071”. RBF is another strategy for mapping non-linear relationships between variables for more complex data. We would likely use RBF for the wildfire data as its complexity is quite high (both in size and number of variables).

The contextualized steps for SVR are as follows (variables are seen below within the example code that are synonymous to the wildfire data used):

Create the model (r will do everything for you after installing and loading in the library): svr_model = SVR(kernel = ‘linear OR polynomial OR RBF’)
Train the model (using wildfire variables as examples, this is like the training data we used for LDA): svr.fit(training_data, FIRE_YEAR, LONGITUDE, etc.) You’ll notice that there are specific functions with SVR that train and test the data. svr.fit() = train, svr.predict = test.
Make predictions with the SVR model (adjacent to the testing data that we used before): predictions = svr.predict(testing_data)
Optional - Plot the predictions (for fire maintenance) (will be a continuous line/polynomial due to r’s inability to display multi-dimensional graphs!) against the true values (residuals of fire maintenance): done using any ggplot2 or similar function

A regression function will be produced much like any other regression!

##Other Questions:##

These are a couple questions that crossed my mind while analyzing this data that I could see a graph-able start on. I recognize the intricacies in this data and the confounding variables that were not addressed in this report.

Is source_reporting_unit associated with generalizations of fire size? Are the source reporting units with the highest rates of wildfire coverage generalizing to make up for the amount they have to report on?

# Starting graph for this question:
fire_data_clean |>
  count(SOURCE_REPORTING_UNIT_NAME) |>
  arrange(-n)

## # A tibble: 5,202 × 2
##    SOURCE_REPORTING_UNIT_NAME                                             n
##    <chr>                                                              <int>
##  1 Georgia Forestry Commission                                       113791
##  2 Fire Department of New York                                        93024
##  3 Texas Forest Service                                               78755
##  4 South Carolina Forestry Commission                                 58612
##  5 Headquarters, California Department of Forestry & Fire Protection  53986
##  6 Mississippi Forestry Commission                                    51328
##  7 Florida Forest Service                                             50202
##  8 North Carolina Division of Forest Resources                        39870
##  9 Minnesota Department of Natural Resources                          33153
## 10 Alabama Forestry Commission                                        29196
## # ℹ 5,192 more rows

fire_data_clean |>
  sample_n(1000) |>
  filter(FIRE_SIZE == 0.1 | FIRE_SIZE == 1 | FIRE_SIZE == 0.5) |>
  filter(SOURCE_REPORTING_UNIT_NAME == "Georgia Forestry Commission" | SOURCE_REPORTING_UNIT_NAME == "Fire Department of New York" | SOURCE_REPORTING_UNIT_NAME == "Texas Forest Service" | SOURCE_REPORTING_UNIT_NAME == "South Carolina Forestry Commission" | SOURCE_REPORTING_UNIT_NAME == "Headquarters, California Department of Forestry & Fire Protection") |>
  ggplot() +
  geom_bar(aes(x = FIRE_SIZE, color = SOURCE_REPORTING_UNIT_NAME)) +
  labs(title = "Wildfire Size by Reporting Agency", x = "Wildfire Size (Acres)", caption = "Data: USFS")

Are there more fires reported in recent years due to climate change?

# Starting graph for this question:
fire_data_clean |>
  sample_n(10000) |>
  ggplot() +
  geom_bar(aes(x = FIRE_YEAR)) +
  labs(title = "Wildfire Frequency by Year", x = "Fire Year", y = "Frequency", caption = "Data: USFS")

Sources

Non-Linear Regression: https://www.geeksforgeeks.org/non-linear-regression-in-r/

Dataset Information: https://www.fs.usda.gov/rds/archive/products/RDS-2013-0009.6/_metadata_RDS-2013-0009.6.html#Metadata_Reference_Information

Stat218_Final_Project

Lily Larsen

2025-05-08

Linear Discriminant Analysis

Summary of Variable Importance:

New Supervised Learning Method: Support Vector Regression

Sources