This analysis aims to predict forest fire occurrences and cluster risk zones using Logistic Regression and k-means clustering.
The Forest Fires dataset includes 517 instances with attributes related to meteorological conditions. Features include: - Temperature - Relative Humidity - Wind Speed - Rainfall
The target variable is the burned area (area),
transformed for Logistic Regression.
# Set CRAN mirror
if (is.null(getOption("repos"))) {
options(repos = c(CRAN = "https://cran.rstudio.com/"))
}
# Load libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(caret)
## Warning: package 'caret' was built under R version 4.4.2
## Loading required package: lattice
library(cluster)
## Warning: package 'cluster' was built under R version 4.4.2
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.4.2
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.2
library(writexl)
## Warning: package 'writexl' was built under R version 4.4.2
# Load dataset
data <- read_excel("forestfires.xlsx")
head(data)
## # A tibble: 6 × 13
## X Y month day FFMC DMC DC ISI temp RH wind rain area
## <dbl> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 5 mar fri 86.2 26.2 94.3 5.1 8.2 51 6.7 0 0
## 2 7 4 oct tue 90.6 35.4 669. 6.7 18 33 0.9 0 0
## 3 7 4 oct sat 90.6 43.7 687. 6.7 14.6 33 1.3 0 0
## 4 8 6 mar fri 91.7 33.3 77.5 9 8.3 97 4 0.2 0
## 5 8 6 mar sun 89.3 51.3 102. 9.6 11.4 99 1.8 0 0
## 6 8 6 aug sun 92.3 85.3 488 14.7 22.2 29 5.4 0 0
# Handle missing values (impute with column mean)
data <- data %>% mutate_all(~ifelse(is.na(.), mean(., na.rm = TRUE), .))
# Normalize numerical features
data$temp <- scale(data$temp)
data$RH <- scale(data$RH)
data$wind <- scale(data$wind)
data$rain <- scale(data$rain)
numerical_cols <- c("temp", "RH", "wind", "rain", "area")
data[numerical_cols] <- scale(data[numerical_cols])
# Convert categorical variables to factors
data$month <- as.factor(data$month)
data$day <- as.factor(data$day)
# Boxplots for detecting outliers in the 'area' feature
ggplot(data, aes(y = area)) +
geom_boxplot() +
labs(title = "Boxplot for Burned Area", y = "Burned Area")
Logistic Regression is used to predict the binary outcome: whether the burned area exceeds a threshold.
# Create a binary target variable
data$large_fire <- ifelse(data$area > median(data$area), 1, 0)
data$large_fire <- as.factor(data$large_fire)
# Split data into training and testing sets
set.seed(4326)
train_index <- createDataPartition(data$large_fire, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Fit Logistic Regression model
log_model <- glm(large_fire ~ temp + RH + wind + rain, data = train_data, family = binomial)
# Predictions
pred_probs <- predict(log_model, test_data, type = "response")
pred_classes <- ifelse(pred_probs > 0.5, 1, 0)
# Evaluate model
conf_matrix <- confusionMatrix(as.factor(pred_classes), test_data$large_fire)
K-means clustering identifies groups of regions with similar fire risk levels.
# Select relevant features for clustering
clustering_data <- data %>% select(temp, RH, wind, rain)
# Determine optimal clusters using the Elbow Method
set.seed(1234)
wss <- sapply(1:10, function(k) {
kmeans(clustering_data, centers = k, nstart = 10)$tot.withinss
})
plot(1:10, wss, type = "b", xlab = "Number of Clusters", ylab = "Within Sum of Squares")
# Apply K-means with 3 clusters
kmeans_model <- kmeans(clustering_data, centers = 3, nstart = 10)
clustering_data$cluster <- as.factor(kmeans_model$cluster)
# Visualize clusters
ggplot(clustering_data, aes(x = temp, y = RH, color = cluster)) +
geom_point(size = 2) +
labs(title = "K-means Clustering of Fire Risk Zones", x = "Temperature", y = "Relative Humidity")
## ## Results Comparison
# Logistic Regression Results
logistic_accuracy <- conf_matrix$overall['Accuracy']
logistic_precision <- conf_matrix$byClass['Precision']
logistic_recall <- conf_matrix$byClass['Recall']
cat("Logistic Regression Performance:\n")
## Logistic Regression Performance:
cat("Accuracy:", logistic_accuracy, "\n")
## Accuracy: 0.5324675
cat("Precision:", logistic_precision, "\n")
## Precision: 0.5342466
cat("Recall:", logistic_recall, "\n")
## Recall: 0.5064935
# K-means Clustering Summary
cat("\nk-means Clustering Summary:\n")
##
## k-means Clustering Summary:
cat("Cluster Centers:\n")
## Cluster Centers:
print(kmeans_model$centers)
## temp RH wind rain
## 1 -0.9457857 1.5019645 0.2608509 0.26169696
## 2 -0.2685653 -0.3898071 0.9649054 -0.07319742
## 3 0.5411120 -0.4358218 -0.6114026 -0.07319742
## # A tibble: 6 × 14
## X Y month day FFMC DMC DC ISI temp RH wind rain
## <dbl> <dbl> <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 7 5 mar fri 86.2 26.2 94.3 5.1 -1.84 0.411 1.50 -0.0732
## 2 7 4 oct tue 90.6 35.4 669. 6.7 -0.153 -0.692 -1.74 -0.0732
## 3 7 4 oct sat 90.6 43.7 687. 6.7 -0.739 -0.692 -1.52 -0.0732
## 4 8 6 mar fri 91.7 33.3 77.5 9 -1.82 3.23 -0.00982 0.603
## 5 8 6 mar sun 89.3 51.3 102. 9.6 -1.29 3.35 -1.24 -0.0732
## 6 8 6 aug sun 92.3 85.3 488 14.7 0.570 -0.937 0.772 -0.0732
## # ℹ 2 more variables: area <dbl>, large_fire <fct>