Research Question: After quantifying the success of response time dependent on size for U.S. wildfires, can this maitenance rating be predicted by an array of other factors pertaining to the wildfire?
The data used to address this research question comes from the U.S. Department of Agriculture’s Forest Service. A full link to descriptions of the variables and other information can be found at the conclusion of this report.
Libraries
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(dplyr)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
wildfire_data <- read_csv("~/Desktop/Stats Learning/Final Project/National_Interagency_Fire_Occurrence_Sixth_Edition_1992-2020_(Feature_Layer).csv")
## Rows: 2303566 Columns: 40
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (30): FPA_ID, SOURCE_SYSTEM_TYPE, SOURCE_SYSTEM, NWCG_REPORTING_AGENCY, ...
## dbl (10): X, Y, OBJECTID, FOD_ID, FIRE_YEAR, DISCOVERY_DOY, CONT_DOY, FIRE_S...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Unnecessary variable removal
fire_data_clean <- wildfire_data |>
dplyr::select(OBJECTID, SOURCE_SYSTEM_TYPE, NWCG_REPORTING_AGENCY, NWCG_REPORTING_UNIT_NAME, SOURCE_REPORTING_UNIT, SOURCE_REPORTING_UNIT_NAME, LOCAL_FIRE_REPORT_ID, FIRE_NAME, FIRE_YEAR, DISCOVERY_DOY, DISCOVERY_TIME, NWCG_CAUSE_CLASSIFICATION, NWCG_GENERAL_CAUSE, NWCG_CAUSE_AGE_CATEGORY, CONT_DOY, CONT_TIME, FIRE_SIZE, FIRE_SIZE_CLASS, LATITUDE, LONGITUDE, OWNER_DESCR, STATE, COUNTY)
fire_data_times <- fire_data_clean |>
filter(is.na(CONT_DOY)==FALSE) |>
filter(is.na(DISCOVERY_DOY)==FALSE)
# No fires are just under a year long but have same-day endpoints
fire_data_clean |>
filter(DISCOVERY_DOY==CONT_DOY) |>
filter(DISCOVERY_TIME>CONT_TIME)
## # A tibble: 0 × 23
## # ℹ 23 variables: OBJECTID <dbl>, SOURCE_SYSTEM_TYPE <chr>,
## # NWCG_REPORTING_AGENCY <chr>, NWCG_REPORTING_UNIT_NAME <chr>,
## # SOURCE_REPORTING_UNIT <chr>, SOURCE_REPORTING_UNIT_NAME <chr>,
## # LOCAL_FIRE_REPORT_ID <chr>, FIRE_NAME <chr>, FIRE_YEAR <dbl>,
## # DISCOVERY_DOY <dbl>, DISCOVERY_TIME <chr>, NWCG_CAUSE_CLASSIFICATION <chr>,
## # NWCG_GENERAL_CAUSE <chr>, NWCG_CAUSE_AGE_CATEGORY <chr>, CONT_DOY <dbl>,
## # CONT_TIME <chr>, FIRE_SIZE <dbl>, FIRE_SIZE_CLASS <chr>, LATITUDE <dbl>, …
# Fires with a length of "0" days simply means less than one day
fire_data_times <- fire_data_times |>
mutate(fire_duration = case_when(DISCOVERY_DOY==CONT_DOY ~ 0,
CONT_DOY>DISCOVERY_DOY ~ CONT_DOY-DISCOVERY_DOY,
FIRE_YEAR%%4 == 0 ~ 366+CONT_DOY-DISCOVERY_DOY,
DISCOVERY_DOY>CONT_DOY ~ 365+CONT_DOY-DISCOVERY_DOY))
# Many fires were approximated to 0.1, 1, 0.5 acres (in that order)
fire_data_times |>
ggplot() +
geom_density(aes(x = log(FIRE_SIZE))) +
labs(title = "Wildfire Sizes in the U.S. 1992-2020", x = "Size of Fire (Acres)", y = "Frequency", caption = "Data: USFS")
# This makes distinct lines in the comparison scatterplot between fire size and duration
fire_data_times |>
sample_n(10000) |>
ggplot() +
geom_point(aes(x = log(FIRE_SIZE), y = fire_duration, )) +
labs(title = "Wildfire Sizes and Timeframes in the U.S. 1992-2020", x = "Size of Fire (Acres)", y = "Duration of Fire (Days)", caption = "Data: USFS")
# To see the general trend of fire size and duration relationship, we can group the size variable and average it across all groups
fire_distinct_sizes <- fire_data_times |>
group_by(FIRE_SIZE) |>
mutate(avg_duration = mean(fire_duration)) |>
ungroup()
fire_distinct_sizes |>
sample_n(10000) |>
ggplot() +
geom_point(aes(x = log(FIRE_SIZE), y = avg_duration)) +
labs(title = "Average Duration of U.S. Wildifres by Size 1992-2020", x = "Size of Fire (Acres)", y = "Mean Duration of Fire (Days)", caption = "Data: USFS")
# Using regression to find the expected duration of a fire based on size
wildfire_regression <- fire_data_times |>
mutate(log_fire = log(FIRE_SIZE)) |>
dplyr::select(log_fire, fire_duration)
wildfire_lm <- lm(fire_duration ~ poly(log_fire, 4), wildfire_regression)
# Our residuals now convey the "success" of a wildfire's maintenance:
# Positive residual = took more time than average/expected for that size
# Negative residual = more efficient than average/expected for that size
fire_data_times <- fire_data_times |>
mutate(resid = wildfire_lm$residuals)
# Polynomial regression of degree four was discerned as best via plotting the respective degrees' residuals
# Although admittedly not ideal (there are more higher positive than lower negative residuals) this model was the most effective in capturing the relationship between fire size and duration
wildfire_lm$residuals |>
plot() |>
labs(title = "Quantified Wildfire Response via Residuals", x = "Case of Wildfire (By Indices)", y = "Regression Residual", caption = "Data: USFS")
## [[1]]
## NULL
##
## $x
## [1] "Case of Wildfire (By Indices)"
##
## $y
## [1] "Regression Residual"
##
## $title
## [1] "Quantified Wildfire Response via Residuals"
##
## $caption
## [1] "Data: USFS"
##
## attr(,"class")
## [1] "labels"
set.seed(101)
ggplot(wildfire_regression |> sample_n(10000) , aes(x = log_fire, y = fire_duration)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ poly(x, 4),
se = TRUE) +
labs(title = "Quantified Wildfire Response Regression", x = "Size of Fire (acres)", y = "Duration of Fire (Days)", caption = "Data: USFS")
Using these residual values as a quantifier for fire management success, we can attempt to predict these numerical representations with other variables. In this manner, discerning what variables outside of a fire’s direct size and duration contribute to the efficiency in combating it can be analyzed. The following variables will be assessed on their relationship with successful fire management:
Use of LDA is applicable due to the presence of two categorical variables in our dataset: cause of fire and land-owning management organization. It is also a comparably fast method that will work more efficiently with this expansive dataset.
wildfire_lda <- fire_data_times |>
dplyr::select(LATITUDE, LONGITUDE, OWNER_DESCR, LOCAL_FIRE_REPORT_ID, FIRE_YEAR, NWCG_GENERAL_CAUSE, resid)
# After observing which variables have frequent "NA" values, I decided to remove the local fire report ID as a variables in the LDA dataset. The question it contributes to answering (does a certain area's ability to fight wildfires shift with yearly experience) can be answered in conglomeration of latitude-longitude and calendar year data pairings.
md.pattern(wildfire_lda)
## LATITUDE LONGITUDE OWNER_DESCR FIRE_YEAR NWCG_GENERAL_CAUSE resid
## 459390 1 1 1 1 1 1
## 949363 1 1 1 1 1 1
## 0 0 0 0 0 0
## LOCAL_FIRE_REPORT_ID
## 459390 1 0
## 949363 0 1
## 949363 949363
wildfire_lda <- wildfire_lda |>
dplyr::select(-LOCAL_FIRE_REPORT_ID)
# We can then fill-in for the missing values through the mice function as all variables now have at most ~10% of their data missing
wildfire_lda_full <- mice(wildfire_lda,
method = "lda",
seed = 101)
##
## iter imp variable
## 1 1
## 1 2
## 1 3
## 1 4
## 1 5
## 2 1
## 2 2
## 2 3
## 2 4
## 2 5
## 3 1
## 3 2
## 3 3
## 3 4
## 3 5
## 4 1
## 4 2
## 4 3
## 4 4
## 4 5
## 5 1
## 5 2
## 5 3
## 5 4
## 5 5
## Warning: Number of logged events: 2
wildfire_lda_final <- complete(wildfire_lda_full)
# The following takes exceedingly long to run, as well as is over the maximum storage size
# lda_model <- lda(residuals ~ ., data = wildfire_lda_final)
set.seed(101)
training_data_rows <- sample(1:nrow(wildfire_lda_final), size = nrow(wildfire_lda_final)/2)
training_fire <- wildfire_lda_final[training_data_rows, ] |>
sample_n(10000)
testing_fire <- wildfire_lda_final[-training_data_rows, ] |>
sample_n(10000)
fire_lda_model <- lda(resid ~ ., data = training_fire)
fire_preds <- predict(fire_lda_model, testing_fire)
# negative diff: thought the fire's maintenance would have been better than it actually was
# positive diff: thought the fire's maintenance would have been worse than it actually was
results <- as.data.frame(testing_fire$resid)
results <- results |>
mutate(predictions = as.numeric(as.character(fire_preds$class))) |>
mutate(diff = `testing_fire$resid`-predictions)
# Pretty accurate predictions!! But why...?
results |>
ggplot() +
geom_density(aes(x = diff)) +
labs(title = "Wildfire Regression Accuracy", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")
# We can make lda models for each variable and see which ones have the lowest overall (absolute value) of difference
# Including all variables: 3.836
results |>
summarise(mean(abs(diff)))
## mean(abs(diff))
## 1 3.835569
# Latitude: 1.165
latitude_lda_model <- lda(resid ~ LATITUDE, data = training_fire)
lat_preds <- predict(latitude_lda_model, testing_fire)
lat_results <- as.data.frame(testing_fire$resid)
lat_results <- lat_results |>
mutate(predictions = as.numeric(as.character(lat_preds$class))) |>
mutate(diff = `testing_fire$resid`-predictions)
lat_results |>
ggplot() +
geom_density(aes(x = diff)) +
labs(title = "Wildfire Regression Accuracy (Latitude)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")
lat_results |>
summarise(mean(abs(diff)))
## mean(abs(diff))
## 1 1.164935
# Longitude: 1.102, right-skewed (underestimating response capabilities)
longitude_lda_model <- lda(resid ~ LONGITUDE, data = training_fire)
long_preds <- predict(longitude_lda_model, testing_fire)
long_results <- as.data.frame(testing_fire$resid)
long_results <- long_results |>
mutate(predictions = as.numeric(as.character(long_preds$class))) |>
mutate(diff = `testing_fire$resid`-predictions)
long_results |>
ggplot() +
geom_density(aes(x = diff)) +
labs(title = "Wildfire Regression Accuracy (Longitude)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")
long_results |>
summarise(mean(abs(diff)))
## mean(abs(diff))
## 1 1.101878
# Land Owner: 2.069
owner_lda_model <- lda(resid ~ OWNER_DESCR, data = training_fire)
owner_preds <- predict(owner_lda_model, testing_fire)
owner_results <- as.data.frame(testing_fire$resid)
owner_results <- owner_results |>
mutate(predictions = as.numeric(as.character(owner_preds$class))) |>
mutate(diff = `testing_fire$resid`-predictions)
owner_results |>
ggplot() +
geom_density(aes(x = diff)) +
labs(title = "Wildfire Regression Accuracy (Land Owner)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")
owner_results |>
summarise(mean(abs(diff)))
## mean(abs(diff))
## 1 2.069422
# Fire Year: 1.103, right-skewed (underestimating response capabilities)
year_lda_model <- lda(resid ~ FIRE_YEAR, data = training_fire)
year_preds <- predict(year_lda_model, testing_fire)
year_results <- as.data.frame(testing_fire$resid)
year_results <- year_results |>
mutate(predictions = as.numeric(as.character(year_preds$class))) |>
mutate(diff = `testing_fire$resid`-predictions)
year_results |>
ggplot() +
geom_density(aes(x = diff)) +
labs(title = "Wildfire Regression Accuracy (Fire Year)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")
year_results |>
summarise(mean(abs(diff)))
## mean(abs(diff))
## 1 1.102678
# Fire Cause: 1.527, right-skewed (underestimating response capabilities)
cause_lda_model <- lda(resid ~ NWCG_GENERAL_CAUSE, data = training_fire)
cause_preds <- predict(cause_lda_model, testing_fire)
cause_results <- as.data.frame(testing_fire$resid)
cause_results <- cause_results |>
mutate(predictions = as.numeric(as.character(cause_preds$class))) |>
mutate(diff = `testing_fire$resid`-predictions)
cause_results |>
ggplot() +
geom_density(aes(x = diff)) +
labs(title = "Wildfire Regression Accuracy (Fire Cause)", x = "Difference Residuals (Prediction - Actual)", y = "Frequency", caption = "Data: USFS")
cause_results |>
summarise(mean(abs(diff)))
## mean(abs(diff))
## 1 1.536508
variable_analysis <- as.data.frame(fire_lda_model$scaling)
variable_analysis <- variable_analysis |>
mutate(contributions = (LD1+LD2+LD3+LD4+LD5+LD6+LD7+LD8+LD9+LD10+LD11+LD12+LD13+LD14+LD15+LD16+LD17+LD18+LD19+LD20+LD21+LD22+LD23+LD24+LD25+LD26+LD27+LD28)/28)
variable_analysis |>
arrange(contributions)
Numerical: Longitude is the most telling, then year of the fire, then latitude. It is important to recognize how close all these three variables are to predicting the success of fire response when put into an LDA model by themselves. However, fire year and longitude both are skewed right in their difference variables, indicating that there is a pattern of underestimating response capabilities if just relying on these variables individually. It is also important to note how many smaller fires were reported in disproportionate comparison to larger fires reported. This could make a model who is effective at predicting response capabilities for smaller fires one that is overall marketable as an “effective model”.
Categorical:
Signifies likelihood of higher efficiency: (all in decreasing order of importance) - Land Owner: County, undefined federal, state/private, other federal, and state organizations are the ones in charge of the maintenance. Basically, if a governmental organization is the one doing the upkeep, the area is set up for better fire maintenance prospects. - Fire Cause: Recreation & ceremony, equipment & vehicle use, and fireworks. If there are humans present doing some activity with purposeful use (that is maybe unique in its time or with a lot of others present OR that those individuals are capable of handling) then the area is more likely to have a quick response. There also doesn’t necessary need to be a similarity across all the fire causes that lead to quicker cleanups, and this analysis could simply be pointing out different avenues that all conclude with effective wildfire responses.
Signifies likelihood of lower efficiency: (all in decreasing importance order, again) - Fire Cause: Firearms & explosive use, “other causes”, railroad operations & maintenance, misuse of fire by a minor, and power generation/transmission/distribution. These categories are either more destructive, in the absence of human oversight, or are unclassified aspects potentially inflicted by under-prepared parties (minors). If the reason for a fire being lit is absent of individuals there to report on it then it makes sense the reponse will be time-wise not ideal. - Land Owner: Fish & Wildlife Service and Tribal organizations. These two organizations may not have policies in line of what to do when a wildfire arises (FWS) may not have direct access to the same resources (Tribal organizations) and help that governmental organizations would defaultedly have.
This research question could also be evaluated using the following model:
Support Vector Regression is a Support Vector Machine based approach utilized to craft a function that aims to best predict a continuous, numerical output value. This boils down to receiving a dataset, recognizing patterns in it, and then communicating a formula to predict for a new case potentially in the dataset (variable-wise) what the prediction of the looking-to-be-solved-for type is. In this case the SVR model would hope to output a function which could determine the quantified basis of wildfire response time dependent on the set of factors outlined above in the LDA section.
In typical SVM supervised learning methods, a hyperplane (a multi-dimensional plane) is optimized to best encapsulate the divide between classes. This entails finding the maximum distance between points via margins to this hyperplane that are lengthened by recognition of patterns in the variable data. The naming of this algorithm comes from these vectors that span the margins, the “support vectors”. Contextually put, this would translate to finding a reason of combined set of reasons that equate to why wildfires are more quickly or slowly cleaned up. Further analysis of the wildfire dataset could reveal more ways to expand on this hyperplane divider.
Since variables of diverse types aren’t inherently “combinable”, they need to be set into a higher-dimensional space that can encapsulate their differences. When vectors aren’t able to be mapped into this higher-dimensional space naturally, kernel transformations are utilized. This is at large a conglomeration approach to bring the data to the same level for discussion. This occurs when relationships in the data aren’t linearly separable, meaning that a line, plane, or hyperplane can’t adequately classify all the datapoints correctly (there is some overlap of classes within the variable ranges). We would see this in our data when a few wildfires managed by the same land ownership organization were started by a variety of causes and also occurred in varying years (One example of many). There is no concrete relationship, so kernel transformations are necessary to compute vector-products that maps these interactions into a multi-dimensional space where division of patterns is possible.
SVR holds many similarities to SVM, it is simply geared toward maximizing an encompassing regression model for the given data as opposed to a hyperplane-like divider. The independent variables here still serve as the factors potentially able to predict the continuous, numerical quantifier of success in containing a wildfire.
To do this in r, one should use the function SVR() and indicate that the kernel attribute is either linear, polynomial, or of the radial basis function (RBF) type. This is located in the package “e1071”. RBF is another strategy for mapping non-linear relationships between variables for more complex data. We would likely use RBF for the wildfire data as its complexity is quite high (both in size and number of variables).
The contextualized steps for SVR are as follows (variables are seen below within the example code that are synonymous to the wildfire data used):
A regression function will be produced much like any other regression!
##Other Questions:##
These are a couple questions that crossed my mind while analyzing this data that I could see a graph-able start on. I recognize the intricacies in this data and the confounding variables that were not addressed in this report.
# Starting graph for this question:
fire_data_clean |>
count(SOURCE_REPORTING_UNIT_NAME) |>
arrange(-n)
## # A tibble: 5,202 × 2
## SOURCE_REPORTING_UNIT_NAME n
## <chr> <int>
## 1 Georgia Forestry Commission 113791
## 2 Fire Department of New York 93024
## 3 Texas Forest Service 78755
## 4 South Carolina Forestry Commission 58612
## 5 Headquarters, California Department of Forestry & Fire Protection 53986
## 6 Mississippi Forestry Commission 51328
## 7 Florida Forest Service 50202
## 8 North Carolina Division of Forest Resources 39870
## 9 Minnesota Department of Natural Resources 33153
## 10 Alabama Forestry Commission 29196
## # ℹ 5,192 more rows
fire_data_clean |>
sample_n(1000) |>
filter(FIRE_SIZE == 0.1 | FIRE_SIZE == 1 | FIRE_SIZE == 0.5) |>
filter(SOURCE_REPORTING_UNIT_NAME == "Georgia Forestry Commission" | SOURCE_REPORTING_UNIT_NAME == "Fire Department of New York" | SOURCE_REPORTING_UNIT_NAME == "Texas Forest Service" | SOURCE_REPORTING_UNIT_NAME == "South Carolina Forestry Commission" | SOURCE_REPORTING_UNIT_NAME == "Headquarters, California Department of Forestry & Fire Protection") |>
ggplot() +
geom_bar(aes(x = FIRE_SIZE, color = SOURCE_REPORTING_UNIT_NAME)) +
labs(title = "Wildfire Size by Reporting Agency", x = "Wildfire Size (Acres)", caption = "Data: USFS")
# Starting graph for this question:
fire_data_clean |>
sample_n(10000) |>
ggplot() +
geom_bar(aes(x = FIRE_YEAR)) +
labs(title = "Wildfire Frequency by Year", x = "Fire Year", y = "Frequency", caption = "Data: USFS")
Non-Linear Regression: https://www.geeksforgeeks.org/non-linear-regression-in-r/
Dataset Information: https://www.fs.usda.gov/rds/archive/products/RDS-2013-0009.6/_metadata_RDS-2013-0009.6.html#Metadata_Reference_Information