Introduction

This analysis aims to predict forest fire occurrences and cluster risk zones using Logistic Regression and k-means clustering.

Dataset Description

The Forest Fires dataset includes 517 instances with attributes related to meteorological conditions. Features include: - Temperature - Relative Humidity - Wind Speed - Rainfall

The target variable is the burned area (area), transformed for Logistic Regression.

# Set CRAN mirror
if (is.null(getOption("repos"))) {
  options(repos = c(CRAN = "https://cran.rstudio.com/"))
}

# Load libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(writexl)
## Warning: package 'writexl' was built under R version 4.4.2
# Load dataset
data <- read_excel("forestfires.xlsx")
head(data)
## # A tibble: 6 × 13
##       X     Y month day    FFMC   DMC    DC   ISI  temp    RH  wind  rain  area
##   <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1     7     5 mar   fri    86.2  26.2  94.3   5.1   8.2    51   6.7   0       0
## 2     7     4 oct   tue    90.6  35.4 669.    6.7  18      33   0.9   0       0
## 3     7     4 oct   sat    90.6  43.7 687.    6.7  14.6    33   1.3   0       0
## 4     8     6 mar   fri    91.7  33.3  77.5   9     8.3    97   4     0.2     0
## 5     8     6 mar   sun    89.3  51.3 102.    9.6  11.4    99   1.8   0       0
## 6     8     6 aug   sun    92.3  85.3 488    14.7  22.2    29   5.4   0       0

Data Preprocessing

  1. Normalize continuous features (e.g., temperature, humidity).
  2. Encode categorical variables as factors.
  3. Remove or impute missing values.
# Handle missing values (impute with column mean)
data <- data %>% mutate_all(~ifelse(is.na(.), mean(., na.rm = TRUE), .))

# Normalize numerical features
data$temp <- scale(data$temp)
data$RH <- scale(data$RH)
data$wind <- scale(data$wind)
data$rain <- scale(data$rain)
numerical_cols <- c("temp", "RH", "wind", "rain", "area")
data[numerical_cols] <- scale(data[numerical_cols])

# Convert categorical variables to factors
data$month <- as.factor(data$month)
data$day <- as.factor(data$day)

# Boxplots for detecting outliers in the 'area' feature
ggplot(data, aes(y = area)) + 
  geom_boxplot() + 
  labs(title = "Boxplot for Burned Area", y = "Burned Area")

Logistic Regression

Logistic Regression is used to predict the binary outcome: whether the burned area exceeds a threshold.

# Create a binary target variable
data$large_fire <- ifelse(data$area > median(data$area), 1, 0)
data$large_fire <- as.factor(data$large_fire)

# Split data into training and testing sets
set.seed(4326)
train_index <- createDataPartition(data$large_fire, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Fit Logistic Regression model
log_model <- glm(large_fire ~ temp + RH + wind + rain, data = train_data, family = binomial)

# Predictions
pred_probs <- predict(log_model, test_data, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)

# Evaluate model
conf_matrix <- confusionMatrix(as.factor(pred_classes), test_data$large_fire)

K-means Clustering

K-means clustering identifies groups of regions with similar fire risk levels.

# Select relevant features for clustering
clustering_data <- data %>% select(temp, RH, wind, rain)

# Determine optimal clusters using the Elbow Method
set.seed(1234)
wss <- sapply(1:10, function(k) {
  kmeans(clustering_data, centers = k, nstart = 10)$tot.withinss
})
plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares")

# Apply K-means with 3 clusters
kmeans_model <- kmeans(clustering_data, centers = 3, nstart = 10)
clustering_data$cluster <- as.factor(kmeans_model$cluster)

# Visualize clusters
ggplot(clustering_data, aes(x = temp, y = RH, color = cluster)) +
  geom_point(size = 2) +
  labs(title = "K-means Clustering of Fire Risk Zones", x = "Temperature", y = "Relative Humidity")

## ## Results Comparison

# Logistic Regression Results
logistic_accuracy <- conf_matrix$overall['Accuracy']
logistic_precision <- conf_matrix$byClass['Precision']
logistic_recall <- conf_matrix$byClass['Recall']

cat("Logistic Regression Performance:\n")
## Logistic Regression Performance:
cat("Accuracy:", logistic_accuracy, "\n")
## Accuracy: 0.5324675
cat("Precision:", logistic_precision, "\n")
## Precision: 0.5342466
cat("Recall:", logistic_recall, "\n")
## Recall: 0.5064935
# K-means Clustering Summary
cat("\nk-means Clustering Summary:\n")
## 
## k-means Clustering Summary:
cat("Cluster Centers:\n")
## Cluster Centers:
print(kmeans_model$centers)
##         temp         RH       wind        rain
## 1 -0.9457857  1.5019645  0.2608509  0.26169696
## 2 -0.2685653 -0.3898071  0.9649054 -0.07319742
## 3  0.5411120 -0.4358218 -0.6114026 -0.07319742

Save Processed Dataset

## # A tibble: 6 × 14
##       X     Y month day    FFMC   DMC    DC   ISI   temp     RH     wind    rain
##   <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl>  <dbl>  <dbl>    <dbl>   <dbl>
## 1     7     5 mar   fri    86.2  26.2  94.3   5.1 -1.84   0.411  1.50    -0.0732
## 2     7     4 oct   tue    90.6  35.4 669.    6.7 -0.153 -0.692 -1.74    -0.0732
## 3     7     4 oct   sat    90.6  43.7 687.    6.7 -0.739 -0.692 -1.52    -0.0732
## 4     8     6 mar   fri    91.7  33.3  77.5   9   -1.82   3.23  -0.00982  0.603 
## 5     8     6 mar   sun    89.3  51.3 102.    9.6 -1.29   3.35  -1.24    -0.0732
## 6     8     6 aug   sun    92.3  85.3 488    14.7  0.570 -0.937  0.772   -0.0732
## # ℹ 2 more variables: area <dbl>, large_fire <fct>